1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
19#include "AArch64Subtarget.h"
22#include "llvm/ADT/APFloat.h"
23#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/ArrayRef.h"
25#include "llvm/ADT/STLExtras.h"
26#include "llvm/ADT/SmallSet.h"
28#include "llvm/ADT/Statistic.h"
29#include "llvm/ADT/StringRef.h"
30#include "llvm/ADT/Twine.h"
59#include "llvm/IR/Attributes.h"
60#include "llvm/IR/Constants.h"
61#include "llvm/IR/DataLayout.h"
62#include "llvm/IR/DebugLoc.h"
64#include "llvm/IR/Function.h"
66#include "llvm/IR/GlobalValue.h"
67#include "llvm/IR/IRBuilder.h"
68#include "llvm/IR/Instruction.h"
71#include "llvm/IR/Intrinsics.h"
72#include "llvm/IR/IntrinsicsAArch64.h"
73#include "llvm/IR/Module.h"
75#include "llvm/IR/Type.h"
76#include "llvm/IR/Use.h"
77#include "llvm/IR/Value.h"
83#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <bitset>
95#include <cassert>
96#include <cctype>
97#include <cstdint>
98#include <cstdlib>
99#include <iterator>
100#include <limits>
101#include <optional>
102#include <tuple>
103#include <utility>
104#include <vector>
105
106using namespace llvm;
107using namespace llvm::PatternMatch;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumShiftInserts, "Number of vector shift inserts");
113STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
114
115// FIXME: The necessary dtprel relocations don't seem to be supported
116// well in the GNU bfd and gold linkers at the moment. Therefore, by
117// default, for now, fall back to GeneralDynamic code generation.
119 "aarch64-elf-ldtls-generation", cl::Hidden,
120 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
121 cl::init(false));
122
123static cl::opt<bool>
124EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
125 cl::desc("Enable AArch64 logical imm instruction "
126 "optimization"),
127 cl::init(true));
128
129// Temporary option added for the purpose of testing functionality added
130// to DAGCombiner.cpp in D92230. It is expected that this can be removed
131// in the future once both implementations are based off MGATHER rather
132// than the GLD1 nodes added for the SVE gather load intrinsics.
133static cl::opt<bool>
134EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
135 cl::desc("Combine extends of AArch64 masked "
136 "gather intrinsics"),
137 cl::init(true));
138
139static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
140 cl::desc("Combine ext and trunc to TBL"),
141 cl::init(true));
142
143// XOR, OR and CMP all use ALU ports, and the data dependency becomes the
144// bottleneck after this transform on high-end CPUs. This maximum leaf-node
145// limit is a guard so that the cmp+ccmp transform remains profitable.
146static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
147 cl::desc("Maximum of xors"));
148
149// By turning this on, we will not fall back to DAG ISel when encountering
150// scalable vector types for any instruction, even if SVE is not yet supported
151// for some instructions.
152// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
154 "aarch64-enable-gisel-sve", cl::Hidden,
155 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
156 cl::init(false));
157
158/// Value type used for condition codes.
159static const MVT MVT_CC = MVT::i32;
160
161static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
162 AArch64::X3, AArch64::X4, AArch64::X5,
163 AArch64::X6, AArch64::X7};
164static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
165 AArch64::Q3, AArch64::Q4, AArch64::Q5,
166 AArch64::Q6, AArch64::Q7};
167
169
171
172static inline EVT getPackedSVEVectorVT(EVT VT) {
173 switch (VT.getSimpleVT().SimpleTy) {
174 default:
175 llvm_unreachable("unexpected element type for vector");
176 case MVT::i8:
177 return MVT::nxv16i8;
178 case MVT::i16:
179 return MVT::nxv8i16;
180 case MVT::i32:
181 return MVT::nxv4i32;
182 case MVT::i64:
183 return MVT::nxv2i64;
184 case MVT::f16:
185 return MVT::nxv8f16;
186 case MVT::f32:
187 return MVT::nxv4f32;
188 case MVT::f64:
189 return MVT::nxv2f64;
190 case MVT::bf16:
191 return MVT::nxv8bf16;
192 }
193}
194
195// NOTE: Currently there's only a need to return integer vector types. If this
196// changes then just add an extra "type" parameter.
198 switch (EC.getKnownMinValue()) {
199 default:
200 llvm_unreachable("unexpected element count for vector");
201 case 16:
202 return MVT::nxv16i8;
203 case 8:
204 return MVT::nxv8i16;
205 case 4:
206 return MVT::nxv4i32;
207 case 2:
208 return MVT::nxv2i64;
209 }
210}
211
213 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
214 "Expected scalable predicate vector type!");
215 switch (VT.getVectorMinNumElements()) {
216 default:
217 llvm_unreachable("unexpected element count for vector");
218 case 2:
219 return MVT::nxv2i64;
220 case 4:
221 return MVT::nxv4i32;
222 case 8:
223 return MVT::nxv8i16;
224 case 16:
225 return MVT::nxv16i8;
226 }
227}
228
229/// Returns true if VT's elements occupy the lowest bit positions of its
230/// associated register class without any intervening space.
231///
232/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
233/// same register class, but only nxv8f16 can be treated as a packed vector.
234static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
236 "Expected legal vector type!");
237 return VT.isFixedLengthVector() ||
239}
240
241// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
242// predicate and end with a passthru value matching the result type.
243static bool isMergePassthruOpcode(unsigned Opc) {
244 switch (Opc) {
245 default:
246 return false;
276 return true;
277 }
278}
279
280// Returns true if inactive lanes are known to be zeroed by construction.
282 switch (Op.getOpcode()) {
283 default:
284 return false;
285 // We guarantee i1 splat_vectors to zero the other lanes
289 return true;
291 switch (Op.getConstantOperandVal(0)) {
292 default:
293 return false;
294 case Intrinsic::aarch64_sve_ptrue:
295 case Intrinsic::aarch64_sve_pnext:
296 case Intrinsic::aarch64_sve_cmpeq:
297 case Intrinsic::aarch64_sve_cmpne:
298 case Intrinsic::aarch64_sve_cmpge:
299 case Intrinsic::aarch64_sve_cmpgt:
300 case Intrinsic::aarch64_sve_cmphs:
301 case Intrinsic::aarch64_sve_cmphi:
302 case Intrinsic::aarch64_sve_cmpeq_wide:
303 case Intrinsic::aarch64_sve_cmpne_wide:
304 case Intrinsic::aarch64_sve_cmpge_wide:
305 case Intrinsic::aarch64_sve_cmpgt_wide:
306 case Intrinsic::aarch64_sve_cmplt_wide:
307 case Intrinsic::aarch64_sve_cmple_wide:
308 case Intrinsic::aarch64_sve_cmphs_wide:
309 case Intrinsic::aarch64_sve_cmphi_wide:
310 case Intrinsic::aarch64_sve_cmplo_wide:
311 case Intrinsic::aarch64_sve_cmpls_wide:
312 case Intrinsic::aarch64_sve_fcmpeq:
313 case Intrinsic::aarch64_sve_fcmpne:
314 case Intrinsic::aarch64_sve_fcmpge:
315 case Intrinsic::aarch64_sve_fcmpgt:
316 case Intrinsic::aarch64_sve_fcmpuo:
317 case Intrinsic::aarch64_sve_facgt:
318 case Intrinsic::aarch64_sve_facge:
319 case Intrinsic::aarch64_sve_whilege:
320 case Intrinsic::aarch64_sve_whilegt:
321 case Intrinsic::aarch64_sve_whilehi:
322 case Intrinsic::aarch64_sve_whilehs:
323 case Intrinsic::aarch64_sve_whilele:
324 case Intrinsic::aarch64_sve_whilelo:
325 case Intrinsic::aarch64_sve_whilels:
326 case Intrinsic::aarch64_sve_whilelt:
327 case Intrinsic::aarch64_sve_match:
328 case Intrinsic::aarch64_sve_nmatch:
329 case Intrinsic::aarch64_sve_whilege_x2:
330 case Intrinsic::aarch64_sve_whilegt_x2:
331 case Intrinsic::aarch64_sve_whilehi_x2:
332 case Intrinsic::aarch64_sve_whilehs_x2:
333 case Intrinsic::aarch64_sve_whilele_x2:
334 case Intrinsic::aarch64_sve_whilelo_x2:
335 case Intrinsic::aarch64_sve_whilels_x2:
336 case Intrinsic::aarch64_sve_whilelt_x2:
337 return true;
338 }
339 }
340}
341
342static std::tuple<SDValue, SDValue>
344 SDLoc DL(Disc);
345 SDValue AddrDisc;
346 SDValue ConstDisc;
347
348 // If this is a blend, remember the constant and address discriminators.
349 // Otherwise, it's either a constant discriminator, or a non-blended
350 // address discriminator.
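  // For instance, a discriminator built as @llvm.ptrauth.blend(%addrdisc, 42)
  // splits into AddrDisc = %addrdisc and ConstDisc = 42, whereas a plain
  // constant discriminator of 42 leaves AddrDisc empty.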
351 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
352 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
353 AddrDisc = Disc->getOperand(1);
354 ConstDisc = Disc->getOperand(2);
355 } else {
356 ConstDisc = Disc;
357 }
358
359 // If the constant discriminator (either the blend RHS, or the entire
360 // discriminator value) isn't a 16-bit constant, bail out, and let the
361 // discriminator be computed separately.
362 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
363 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
364 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
365
366 // If there's no address discriminator, use NoRegister, which we'll later
367 // replace with XZR, or directly use a Z variant of the inst. when available.
368 if (!AddrDisc)
369 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
370
371 return std::make_tuple(
372 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
373 AddrDisc);
374}
375
377 const AArch64Subtarget &STI)
378 : TargetLowering(TM), Subtarget(&STI) {
379 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
380 // we have to make something up. Arbitrarily, choose ZeroOrOne.
382 // When comparing vectors the result sets the different elements in the
383 // vector to all-one or all-zero.
385
386 // Set up the register classes.
387 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
388 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
389
390 if (Subtarget->hasLS64()) {
391 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
392 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
394 }
395
396 if (Subtarget->hasFPARMv8()) {
397 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
398 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
399 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
400 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
401 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
402 }
403
404 if (Subtarget->hasNEON()) {
405 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
406 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
407
408 addDRType(MVT::v2f32);
409 addDRType(MVT::v8i8);
410 addDRType(MVT::v4i16);
411 addDRType(MVT::v2i32);
412 addDRType(MVT::v1i64);
413 addDRType(MVT::v1f64);
414 addDRType(MVT::v4f16);
415 addDRType(MVT::v4bf16);
416
417 addQRType(MVT::v4f32);
418 addQRType(MVT::v2f64);
419 addQRType(MVT::v16i8);
420 addQRType(MVT::v8i16);
421 addQRType(MVT::v4i32);
422 addQRType(MVT::v2i64);
423 addQRType(MVT::v8f16);
424 addQRType(MVT::v8bf16);
425 }
426
427 if (Subtarget->isSVEorStreamingSVEAvailable()) {
428 // Add legal sve predicate types
429 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
430 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
431 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
432 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
433 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
434
435 // Add legal sve data types
436 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
437 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
438 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
439 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
440
441 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
442 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
443 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
444 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
445 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
446 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
447
448 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
449 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
450 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
451
452 if (Subtarget->useSVEForFixedLengthVectors()) {
455 addRegisterClass(VT, &AArch64::ZPRRegClass);
456
459 addRegisterClass(VT, &AArch64::ZPRRegClass);
460 }
461 }
462
463 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
464 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
465 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
466 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
467
468 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
469 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
470 }
471
472 // Compute derived properties from the register classes
474
475 // Provide all sorts of operation actions
515
517
521
525
527
528 // Custom lowering hooks are needed for XOR
529 // to fold it into CSINC/CSINV.
532
533 // Virtually no operation on f128 is legal, but LLVM can't expand them when
534 // there's a valid register class, so we need custom operations in most cases.
559 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
560 // aren't handled.
561
562 // Lowering for many of the conversions is actually specified by the non-f128
563 // type. The LowerXXX function will be trivial when f128 isn't involved.
588 if (Subtarget->hasFPARMv8()) {
591 }
594 if (Subtarget->hasFPARMv8()) {
597 }
600
605
606 // Variable arguments.
611
612 // Variable-sized objects.
615
616 // Lowering Funnel Shifts to EXTR
621
623
624 // Constant pool entries
626
627 // BlockAddress
629
630 // AArch64 lacks both left-rotate and popcount instructions.
636 }
637
638 // AArch64 doesn't have i32 MULH{S|U}.
641
642 // AArch64 doesn't have {U|S}MUL_LOHI.
647
648 if (Subtarget->hasCSSC()) {
652
654
658
661
666
671 } else {
675
678
681 }
682
688 }
695
696 // Custom lower Add/Sub/Mul with overflow.
709
718
727 if (Subtarget->hasFullFP16()) {
730 } else {
733 }
734
735 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
747 setOperationAction(Op, MVT::f16, Promote);
748 setOperationAction(Op, MVT::v4f16, Expand);
749 setOperationAction(Op, MVT::v8f16, Expand);
750 setOperationAction(Op, MVT::bf16, Promote);
751 setOperationAction(Op, MVT::v4bf16, Expand);
752 setOperationAction(Op, MVT::v8bf16, Expand);
753 }
754
755 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
756 for (auto Op : {
760 ISD::FADD,
761 ISD::FSUB,
762 ISD::FMUL,
763 ISD::FDIV,
764 ISD::FMA,
794 })
795 setOperationAction(Op, ScalarVT, Promote);
796
797 for (auto Op : {ISD::FNEG, ISD::FABS})
798 setOperationAction(Op, ScalarVT, Legal);
799
800 // Round-to-integer operations need custom lowering for fp16, as Promote
801 // doesn't work because the result type is integer.
805 setOperationAction(Op, ScalarVT, Custom);
806
807 // Promote v4f16 to v4f32 when that is known to be safe.
808 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
809 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
810 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
811 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
812 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
813 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
814 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
815 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
816 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
817 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
818 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
819 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
820
830
831 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
853 };
854
855 if (!Subtarget->hasFullFP16()) {
856 LegalizeNarrowFP(MVT::f16);
857 }
858 LegalizeNarrowFP(MVT::bf16);
861
862 // AArch64 has implementations of a lot of rounding-like FP operations.
863 for (auto Op :
874 for (MVT Ty : {MVT::f32, MVT::f64})
876 if (Subtarget->hasFullFP16())
877 setOperationAction(Op, MVT::f16, Legal);
878 }
879
880 // Basic strict FP operations are legal
883 for (MVT Ty : {MVT::f32, MVT::f64})
885 if (Subtarget->hasFullFP16())
886 setOperationAction(Op, MVT::f16, Legal);
887 }
888
889 // Strict conversion to a larger type is legal
890 for (auto VT : {MVT::f32, MVT::f64})
892
894
900
902 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
905 } else {
908 }
911
912 // Generate outline atomics library calls only if LSE was not specified for
913 // the subtarget.
914 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
940#define LCALLNAMES(A, B, N) \
941 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
942 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
943 setLibcallName(A##N##_REL, #B #N "_rel"); \
944 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
945#define LCALLNAME4(A, B) \
946 LCALLNAMES(A, B, 1) \
947 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
948#define LCALLNAME5(A, B) \
949 LCALLNAMES(A, B, 1) \
950 LCALLNAMES(A, B, 2) \
951 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
952 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
953 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
954 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
955 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
956 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
957 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
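  // For example, LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) registers
  // the __aarch64_casN_<order> helpers (N = 1, 2, 4, 8, 16; order = relax, acq,
  // rel, acq_rel) provided by the compiler runtime.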
958#undef LCALLNAMES
959#undef LCALLNAME4
960#undef LCALLNAME5
961 }
962
963 if (Subtarget->hasLSE128()) {
964 // Custom lowering because i128 is not legal. Must be replaced by 2x64
965 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
969 }
970
971 // 128-bit loads and stores can be done without expanding
974
975 // Aligned 128-bit loads and stores are single-copy atomic according to the
976 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
977 if (Subtarget->hasLSE2()) {
980 }
981
982 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
983 // custom lowering, as there are no un-paired non-temporal stores and
984 // legalization will break up 256 bit inputs.
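  // For example, a single non-temporal store of an MVT::v8i32 value can be
  // emitted as one STNP of two 128-bit halves instead of being split into
  // separate stores.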
986 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
987 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
988 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
993
994 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
995 // custom lowering, as there are no un-paired non-temporal loads and
996 // legalization will break up 256 bit inputs.
997 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
998 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
999 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1000 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1001 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1002 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1003 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1004 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1005
1006 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1008
1009 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1010 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1011 // Issue __sincos_stret if available.
1014 } else {
1017 }
1018
1019 // Make floating-point constants legal for the large code model, so they don't
1020 // become loads from the constant pool.
1021 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1024 }
1025
1026 // AArch64 does not have floating-point extending loads, i1 sign-extending
1027 // loads, floating-point truncating stores, or v2i32->v2i16 truncating stores.
1028 for (MVT VT : MVT::fp_valuetypes()) {
1029 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1030 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1031 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1032 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1033 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1034 }
1035 for (MVT VT : MVT::integer_valuetypes())
1036 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1037
1038 for (MVT WideVT : MVT::fp_valuetypes()) {
1039 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1040 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1041 setTruncStoreAction(WideVT, NarrowVT, Expand);
1042 }
1043 }
1044 }
1045
1046 if (Subtarget->hasFPARMv8()) {
1050 }
1051
1052 // Indexed loads and stores are supported.
1053 for (unsigned im = (unsigned)ISD::PRE_INC;
1055 setIndexedLoadAction(im, MVT::i8, Legal);
1056 setIndexedLoadAction(im, MVT::i16, Legal);
1057 setIndexedLoadAction(im, MVT::i32, Legal);
1058 setIndexedLoadAction(im, MVT::i64, Legal);
1059 setIndexedLoadAction(im, MVT::f64, Legal);
1060 setIndexedLoadAction(im, MVT::f32, Legal);
1061 setIndexedLoadAction(im, MVT::f16, Legal);
1062 setIndexedLoadAction(im, MVT::bf16, Legal);
1063 setIndexedStoreAction(im, MVT::i8, Legal);
1064 setIndexedStoreAction(im, MVT::i16, Legal);
1065 setIndexedStoreAction(im, MVT::i32, Legal);
1066 setIndexedStoreAction(im, MVT::i64, Legal);
1067 setIndexedStoreAction(im, MVT::f64, Legal);
1068 setIndexedStoreAction(im, MVT::f32, Legal);
1069 setIndexedStoreAction(im, MVT::f16, Legal);
1070 setIndexedStoreAction(im, MVT::bf16, Legal);
1071 }
1072
1073 // Trap.
1074 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1077
1078 // We combine OR nodes for bitfield operations.
1080 // Try to create BICs for vector ANDs.
1082
1083 // llvm.init.trampoline and llvm.adjust.trampoline
1086
1087 // Vector add and sub nodes may conceal a high-half opportunity.
1088 // Also, try to fold ADD into CSINC/CSINV..
1091
1094
1095 // Try and combine setcc with csel
1097
1099
1106
1108
1110
1112
1116
1118
1120
1122
1124
1128
1130
1131 // In case of strict alignment, avoid an excessive number of byte wide stores.
1134 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1135
1139 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1140
1143
1146 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1147
1149
1151
1152 EnableExtLdPromotion = true;
1153
1154 // Set required alignment.
1156 // Set preferred alignments.
1157
1158 // Don't align loops on Windows. The SEH unwind info generation needs to
1159 // know the exact length of functions before the alignments have been
1160 // expanded.
1161 if (!Subtarget->isTargetWindows())
1165
1166 // Only change the limit for entries in a jump table if specified by
1167 // the subtarget, but not at the command line.
1168 unsigned MaxJT = STI.getMaximumJumpTableSize();
1169 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1171
1173
1175
1177
1178 if (Subtarget->isNeonAvailable()) {
1179 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1180 // silliness like this:
1181 // clang-format off
1182 for (auto Op :
1202 setOperationAction(Op, MVT::v1f64, Expand);
1203 // clang-format on
1204 for (auto Op :
1209 setOperationAction(Op, MVT::v1i64, Expand);
1210
1211 // AArch64 doesn't have direct vector->f32 conversion instructions for
1212 // elements smaller than i32, so promote the input to i32 first.
1213 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1214 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1215
1216 // Similarly, there is no direct i32 -> f64 vector conversion instruction,
1217 // nor a direct i32 -> f16 one. Set these to Custom, so the conversion
1218 // happens in two steps: v4i32 -> v4f32 -> v4f16
1221 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1223
1224 if (Subtarget->hasFullFP16()) {
1227
1236 } else {
1237 // When AArch64 doesn't have full fp16 support, promote the input
1238 // to i32 first.
1239 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1240 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1241 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1242 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1243 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1244 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1245 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1246 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1247 }
1248
1249 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1250 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1257 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1262 }
1263
1264 // Custom handling for some quad-vector types to detect MULL.
1265 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1266 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1267 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1268 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1269 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1270 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1271
1272 // Saturates
1273 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1274 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1279 }
1280
1281 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1282 MVT::v4i32}) {
1289 }
1290
1291 // Vector reductions
1292 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1293 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1294 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1299
1301 }
1302 }
1303 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1304 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1313 }
1318
1320 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1321 // Likewise, narrowing and extending vector loads/stores aren't handled
1322 // directly.
1325
1326 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1329 } else {
1332 }
1335
1338
1339 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1340 setTruncStoreAction(VT, InnerVT, Expand);
1341 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1342 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1343 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1344 }
1345 }
1346
1347 // AArch64 has implementations of a lot of rounding-like FP operations.
1348 for (auto Op :
1353 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1355 if (Subtarget->hasFullFP16())
1356 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1358 }
1359
1360 // LRINT and LLRINT.
1361 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1362 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1364 if (Subtarget->hasFullFP16())
1365 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1367 }
1368
1369 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1370
1375
1379
1380 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1381 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1382 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1383 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1384 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1385 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1386
1387 // ADDP custom lowering
1388 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1390 // FADDP custom lowering
1391 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1393 } else /* !isNeonAvailable */ {
1395 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1397
1398 if (VT.is128BitVector() || VT.is64BitVector()) {
1402 Subtarget->isLittleEndian() ? Legal : Expand);
1403 }
1404 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1405 setTruncStoreAction(VT, InnerVT, Expand);
1406 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1407 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1408 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1409 }
1410 }
1411 }
1412
1413 if (Subtarget->hasSME()) {
1415 }
1416
1417 // FIXME: Move lowering for more nodes here if those are common between
1418 // SVE and SME.
1419 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1420 for (auto VT :
1421 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1426 }
1427 }
1428
1429 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1430 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1471
1477
1486
1491
1492 if (!Subtarget->isLittleEndian())
1494
1495 if (Subtarget->hasSVE2() ||
1496 (Subtarget->hasSME() && Subtarget->isStreaming()))
1497 // For SLI/SRI.
1499 }
1500
1501 // Illegal unpacked integer vector types.
1502 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1505 }
1506
1507 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1508 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1509 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1511
1512 for (auto VT :
1513 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1514 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1516
1517 for (auto VT :
1518 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1526
1530
1531 // There are no legal MVT::nxv16f## based types.
1532 if (VT != MVT::nxv16i1) {
1535 }
1536 }
1537
1538 // NEON doesn't support masked loads/stores, but SME and SVE do.
1539 for (auto VT :
1540 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1541 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1542 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1545 }
1546
1547 // First, exclude all scalable vector extending loads and truncating stores,
1548 // covering both integer and floating-point scalable vectors.
1550 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1551 setTruncStoreAction(VT, InnerVT, Expand);
1552 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1553 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1554 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1555 }
1556 }
1557
1558 // Then, selectively enable those which we directly support.
1559 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1560 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1561 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1562 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1563 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1564 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1565 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1566 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1567 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1568 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1569 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1570 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1571 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1572 }
1573
1574 // SVE supports truncating stores of 64 and 128-bit vectors
1575 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1576 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1577 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1578 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1579 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1580
1581 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1582 MVT::nxv4f32, MVT::nxv2f64}) {
1621
1642
1654
1655 if (!Subtarget->isLittleEndian())
1657 }
1658
1659 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1665
1666 if (!Subtarget->isLittleEndian())
1668 }
1669
1672
1673 // NEON doesn't support integer divides, but SVE does
1674 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1675 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1678 }
1679
1680 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1681 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1682 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1683
1684 // NOTE: Currently this has to happen after computeRegisterProperties rather
1685 // than the preferred option of combining it with the addRegisterClass call.
1686 if (Subtarget->useSVEForFixedLengthVectors()) {
1689 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1690 addTypeForFixedLengthSVE(VT);
1691 }
1694 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1695 addTypeForFixedLengthSVE(VT);
1696 }
1697
1698 // 64-bit results can mean an input wider than NEON supports.
1699 for (auto VT : {MVT::v8i8, MVT::v4i16})
1702
1703 // 128-bit results imply an input wider than NEON supports.
1704 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1706 for (auto VT : {MVT::v8f16, MVT::v4f32})
1708
1709 // These operations are not supported on NEON but SVE can do them.
1711 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1712 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1713 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1714 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1715 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1716 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1717 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1718 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1719 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1720 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1721 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1722 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1723 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1724 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1725 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1730
1731 // Int operations with no NEON support.
1732 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1733 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1741 }
1742
1743 // Use SVE for vectors with more than 2 elements.
1744 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1746 }
1747
1748 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1749 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1750 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1751 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1752
1754
1755 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1757 }
1758
1759 // Handle operations that are only available in non-streaming SVE mode.
1760 if (Subtarget->isSVEAvailable()) {
1761 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1762 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1763 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1764 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1765 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1766 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1767 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1770 }
1771
1772 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1773 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1774 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1776
1777 // Histcnt is SVE2 only
1778 if (Subtarget->hasSVE2())
1780 Custom);
1781 }
1782
1783
1784 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1785 // Only required for llvm.aarch64.mops.memset.tag
1787 }
1788
1790
1791 if (Subtarget->hasSVE()) {
1796 }
1797
1798 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1799
1800 IsStrictFPEnabled = true;
1802
1803 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1804 // it, but it's just a wrapper around ldexp.
1805 if (Subtarget->isTargetWindows()) {
1807 if (isOperationExpand(Op, MVT::f32))
1808 setOperationAction(Op, MVT::f32, Promote);
1809 }
1810
1811 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1812 // isn't legal.
1814 if (isOperationExpand(Op, MVT::f16))
1815 setOperationAction(Op, MVT::f16, Promote);
1816
1817 if (Subtarget->isWindowsArm64EC()) {
1818 // FIXME: are there intrinsics we need to exclude from this?
1819 for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
1820 auto code = static_cast<RTLIB::Libcall>(i);
1821 auto libcallName = getLibcallName(code);
1822 if ((libcallName != nullptr) && (libcallName[0] != '#')) {
1823 setLibcallName(code, Saver.save(Twine("#") + libcallName).data());
1824 }
1825 }
1826 }
1827}
1828
1829void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1830 assert(VT.isVector() && "VT should be a vector type");
1831
1832 if (VT.isFloatingPoint()) {
1834 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1835 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1836 }
1837
1838 // Mark vector float intrinsics as expand.
1839 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1856 }
1857
1858 // But we do support custom-lowering for FCOPYSIGN.
1859 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1860 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
1861 VT == MVT::v8f16) &&
1862 Subtarget->hasFullFP16()))
1864
1877
1881 for (MVT InnerVT : MVT::all_valuetypes())
1882 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1883
1884 // CNT only supports the B element size, so use UADDLP to widen the result
// for larger element types.
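  // For example, CTPOP of a v4i16 can be computed as CNT over the bytes
  // followed by UADDLP to add adjacent byte counts back into 16-bit lanes.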
1885 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1887
1893
1894 for (unsigned Opcode :
1897 setOperationAction(Opcode, VT, Custom);
1898
1899 if (!VT.isFloatingPoint())
1901
1902 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1903 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1904 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1905 setOperationAction(Opcode, VT, Legal);
1906
1907 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1908 // NEON types.
1909 if (VT.isFloatingPoint() &&
1910 VT.getVectorElementType() != MVT::bf16 &&
1911 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1912 for (unsigned Opcode :
1918 setOperationAction(Opcode, VT, Legal);
1919
1920 // Strict fp extend and trunc are legal
1921 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1923 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1925
1926 // FIXME: We could potentially make use of the vector comparison instructions
1927 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1928 // complications:
1929 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1930 // so we would need to expand when the condition code doesn't match the
1931 // kind of comparison.
1932 // * Some kinds of comparison require more than one FCMXY instruction so
1933 // would need to be expanded instead.
1934 // * The lowering of the non-strict versions involves target-specific ISD
1935 // nodes so we would likely need to add strict versions of all of them and
1936 // handle them appropriately.
1939
1940 if (Subtarget->isLittleEndian()) {
1941 for (unsigned im = (unsigned)ISD::PRE_INC;
1945 }
1946 }
1947
1948 if (Subtarget->hasD128()) {
1951 }
1952}
1953
1955 EVT OpVT) const {
1956 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1957 if (!Subtarget->hasSVE())
1958 return true;
1959
1960 // We can only support legal predicate result types. We can use the SVE
1961 // whilelo instruction for generating fixed-width predicates too.
1962 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1963 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1964 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1965 return true;
1966
1967 // The whilelo instruction only works with i32 or i64 scalar inputs.
1968 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1969 return true;
1970
1971 return false;
1972}
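// For instance, with SVE available a call such as
// llvm.get.active.lane.mask.nxv4i1.i64(%base, %limit) is not expanded and can
// be selected directly to a "whilelo p0.s, x0, x1" instruction.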
1973
1975 if (!Subtarget->isSVEorStreamingSVEAvailable())
1976 return true;
1977
1978 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
1979 // also support fixed-width predicates.
1980 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
1981 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
1982 VT != MVT::v4i1 && VT != MVT::v2i1;
1983}
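// For instance, counting the trailing zero elements of an nxv16i1 predicate can
// be done with BRKB, which produces a predicate covering the lanes before the
// first active one, followed by CNTP to count those lanes.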
1984
1985void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1986 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1987
1988 // By default everything must be expanded.
1989 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1991
1992 if (VT.isFloatingPoint()) {
2002 }
2003
2005 VT == MVT::v1f64 ? Expand : Custom;
2006
2007 // Mark integer truncating stores/extending loads as having custom lowering
2008 if (VT.isInteger()) {
2009 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2010 while (InnerVT != VT) {
2011 setTruncStoreAction(VT, InnerVT, Default);
2012 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2013 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2014 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2015 InnerVT = InnerVT.changeVectorElementType(
2016 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2017 }
2018 }
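  // For a fixed-length type such as v4i32 the loop above visits v4i8 and
  // v4i16, marking their truncating stores and extending loads with the
  // Default action chosen above.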
2019
2020 // Mark floating-point truncating stores/extending loads as having custom
2021 // lowering
2022 if (VT.isFloatingPoint()) {
2023 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2024 while (InnerVT != VT) {
2025 setTruncStoreAction(VT, InnerVT, Custom);
2026 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2027 InnerVT = InnerVT.changeVectorElementType(
2029 }
2030 }
2031
2032 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2033 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2034
2035 // Lower fixed length vector operations to scalable equivalents.
2040 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2077 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2078 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2080 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2099 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2125}
2126
2127void AArch64TargetLowering::addDRType(MVT VT) {
2128 addRegisterClass(VT, &AArch64::FPR64RegClass);
2129 if (Subtarget->isNeonAvailable())
2130 addTypeForNEON(VT);
2131}
2132
2133void AArch64TargetLowering::addQRType(MVT VT) {
2134 addRegisterClass(VT, &AArch64::FPR128RegClass);
2135 if (Subtarget->isNeonAvailable())
2136 addTypeForNEON(VT);
2137}
2138
2140 LLVMContext &C, EVT VT) const {
2141 if (!VT.isVector())
2142 return MVT::i32;
2143 if (VT.isScalableVector())
2144 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2146}
2147
2148// isIntImmediate - This method tests to see if the node is a constant
2149// operand. If so, Imm will receive the value.
2150static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2151 if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
2152 Imm = C->getZExtValue();
2153 return true;
2154 }
2155 return false;
2156}
2157
2158// isOpcWithIntImmediate - This method tests to see if the node is a specific
2159// opcode and that it has an immediate integer right operand.
2160// If so, Imm will receive the value.
2161static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2162 uint64_t &Imm) {
2163 return N->getOpcode() == Opc &&
2164 isIntImmediate(N->getOperand(1).getNode(), Imm);
2165}
2166
2167static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2168 const APInt &Demanded,
2170 unsigned NewOpc) {
2171 uint64_t OldImm = Imm, NewImm, Enc;
2172 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2173
2174 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2175 // bimm64.
2176 if (Imm == 0 || Imm == Mask ||
2178 return false;
2179
2180 unsigned EltSize = Size;
2181 uint64_t DemandedBits = Demanded.getZExtValue();
2182
2183 // Clear bits that are not demanded.
2184 Imm &= DemandedBits;
2185
2186 while (true) {
2187 // The goal here is to set the non-demanded bits in a way that minimizes
2188 // the number of switching between 0 and 1. In order to achieve this goal,
2189 // we set the non-demanded bits to the value of the preceding demanded bits.
2190 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2191 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2192 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2193 // The final result is 0b11000011.
2194 uint64_t NonDemandedBits = ~DemandedBits;
2195 uint64_t InvertedImm = ~Imm & DemandedBits;
2196 uint64_t RotatedImm =
2197 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2198 NonDemandedBits;
2199 uint64_t Sum = RotatedImm + NonDemandedBits;
2200 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2201 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2202 NewImm = (Imm | Ones) & Mask;
2203
2204 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2205 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2206 // we halve the element size and continue the search.
2207 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2208 break;
2209
2210 // We cannot shrink the element size any further if it is 2-bits.
2211 if (EltSize == 2)
2212 return false;
2213
2214 EltSize /= 2;
2215 Mask >>= EltSize;
2216 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2217
2218 // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
2219 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2220 return false;
2221
2222 // Merge the upper and lower halves of Imm and DemandedBits.
2223 Imm |= Hi;
2224 DemandedBits |= DemandedBitsHi;
2225 }
2226
2227 ++NumOptimizedImms;
2228
2229 // Replicate the element across the register width.
2230 while (EltSize < Size) {
2231 NewImm |= NewImm << EltSize;
2232 EltSize *= 2;
2233 }
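  // For example, with Size == 64 and a 16-bit element pattern of 0x00ff this
  // loop produces the replicated immediate 0x00ff00ff00ff00ff.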
2234
2235 (void)OldImm;
2236 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2237 "demanded bits should never be altered");
2238 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2239
2240 // Create the new constant immediate node.
2241 EVT VT = Op.getValueType();
2242 SDLoc DL(Op);
2243 SDValue New;
2244
2245 // If the new constant immediate is all-zeros or all-ones, let the target
2246 // independent DAG combine optimize this node.
2247 if (NewImm == 0 || NewImm == OrigMask) {
2248 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2249 TLO.DAG.getConstant(NewImm, DL, VT));
2250 // Otherwise, create a machine node so that target independent DAG combine
2251 // doesn't undo this optimization.
2252 } else {
2254 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2255 New = SDValue(
2256 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2257 }
2258
2259 return TLO.CombineTo(Op, New);
2260}
2261
2263 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2264 TargetLoweringOpt &TLO) const {
2265 // Delay this optimization to as late as possible.
2266 if (!TLO.LegalOps)
2267 return false;
2268
2270 return false;
2271
2272 EVT VT = Op.getValueType();
2273 if (VT.isVector())
2274 return false;
2275
2276 unsigned Size = VT.getSizeInBits();
2277 assert((Size == 32 || Size == 64) &&
2278 "i32 or i64 is expected after legalization.");
2279
2280 // Exit early if we demand all bits.
2281 if (DemandedBits.popcount() == Size)
2282 return false;
2283
2284 unsigned NewOpc;
2285 switch (Op.getOpcode()) {
2286 default:
2287 return false;
2288 case ISD::AND:
2289 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2290 break;
2291 case ISD::OR:
2292 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2293 break;
2294 case ISD::XOR:
2295 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2296 break;
2297 }
2298 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2299 if (!C)
2300 return false;
2301 uint64_t Imm = C->getZExtValue();
2302 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2303}
2304
2305/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2306/// Mask are known to be either zero or one and return them in Known.
2308 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2309 const SelectionDAG &DAG, unsigned Depth) const {
2310 switch (Op.getOpcode()) {
2311 default:
2312 break;
2313 case AArch64ISD::DUP: {
2314 SDValue SrcOp = Op.getOperand(0);
2315 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2316 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2317 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2318 "Expected DUP implicit truncation");
2319 Known = Known.trunc(Op.getScalarValueSizeInBits());
2320 }
2321 break;
2322 }
2323 case AArch64ISD::CSEL: {
2324 KnownBits Known2;
2325 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2326 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2327 Known = Known.intersectWith(Known2);
2328 break;
2329 }
2330 case AArch64ISD::BICi: {
2331 // Compute the bit cleared value.
2332 uint64_t Mask =
2333 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
2334 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2335 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
2336 break;
2337 }
2338 case AArch64ISD::VLSHR: {
2339 KnownBits Known2;
2340 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2341 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2342 Known = KnownBits::lshr(Known, Known2);
2343 break;
2344 }
2345 case AArch64ISD::VASHR: {
2346 KnownBits Known2;
2347 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2348 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2349 Known = KnownBits::ashr(Known, Known2);
2350 break;
2351 }
2352 case AArch64ISD::VSHL: {
2353 KnownBits Known2;
2354 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2355 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2356 Known = KnownBits::shl(Known, Known2);
2357 break;
2358 }
2359 case AArch64ISD::MOVI: {
2361 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2362 break;
2363 }
2365 case AArch64ISD::ADDlow: {
2366 if (!Subtarget->isTargetILP32())
2367 break;
2368 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2369 Known.Zero = APInt::getHighBitsSet(64, 32);
2370 break;
2371 }
2373 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2374 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2375 break;
2376 }
2378 Intrinsic::ID IntID =
2379 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2380 switch (IntID) {
2381 default: return;
2382 case Intrinsic::aarch64_ldaxr:
2383 case Intrinsic::aarch64_ldxr: {
2384 unsigned BitWidth = Known.getBitWidth();
2385 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2386 unsigned MemBits = VT.getScalarSizeInBits();
2387 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2388 return;
2389 }
2390 }
2391 break;
2392 }
2394 case ISD::INTRINSIC_VOID: {
2395 unsigned IntNo = Op.getConstantOperandVal(0);
2396 switch (IntNo) {
2397 default:
2398 break;
2399 case Intrinsic::aarch64_neon_uaddlv: {
2400 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2401 unsigned BitWidth = Known.getBitWidth();
2402 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2403 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2404 assert(BitWidth >= Bound && "Unexpected width!");
2406 Known.Zero |= Mask;
2407 }
2408 break;
2409 }
2410 case Intrinsic::aarch64_neon_umaxv:
2411 case Intrinsic::aarch64_neon_uminv: {
2412 // Figure out the datatype of the vector operand. The UMINV instruction
2413 // will zero extend the result, so we can mark as known zero all the
2414 // bits larger than the element datatype. 32-bit or larger elements don't
2415 // need this, as those are legal types and will be handled by isel directly.
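      // For example, for a umaxv of a v8i8 vector the result fits in 8 bits,
      // so bits 8 and above of the i32 result are known to be zero.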
2416 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2417 unsigned BitWidth = Known.getBitWidth();
2418 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2419 assert(BitWidth >= 8 && "Unexpected width!");
2421 Known.Zero |= Mask;
2422 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2423 assert(BitWidth >= 16 && "Unexpected width!");
2425 Known.Zero |= Mask;
2426 }
2427 break;
2428 } break;
2429 }
2430 }
2431 }
2432}
2433
2435 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2436 unsigned Depth) const {
2437 EVT VT = Op.getValueType();
2438 unsigned VTBits = VT.getScalarSizeInBits();
2439 unsigned Opcode = Op.getOpcode();
2440 switch (Opcode) {
2441 case AArch64ISD::CMEQ:
2442 case AArch64ISD::CMGE:
2443 case AArch64ISD::CMGT:
2444 case AArch64ISD::CMHI:
2445 case AArch64ISD::CMHS:
2446 case AArch64ISD::FCMEQ:
2447 case AArch64ISD::FCMGE:
2448 case AArch64ISD::FCMGT:
2449 case AArch64ISD::CMEQz:
2450 case AArch64ISD::CMGEz:
2451 case AArch64ISD::CMGTz:
2452 case AArch64ISD::CMLEz:
2453 case AArch64ISD::CMLTz:
2454 case AArch64ISD::FCMEQz:
2455 case AArch64ISD::FCMGEz:
2456 case AArch64ISD::FCMGTz:
2457 case AArch64ISD::FCMLEz:
2458 case AArch64ISD::FCMLTz:
2459 // Compares return either 0 or all-ones
2460 return VTBits;
2461 }
2462
2463 return 1;
2464}
2465
2467 EVT) const {
2468 return MVT::i64;
2469}
2470
2472 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2473 unsigned *Fast) const {
2474 if (Subtarget->requiresStrictAlign())
2475 return false;
2476
2477 if (Fast) {
2478 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2479 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2480 // See comments in performSTORECombine() for more details about
2481 // these conditions.
2482
2483 // Code that uses clang vector extensions can mark that it
2484 // wants unaligned accesses to be treated as fast by
2485 // underspecifying alignment to be 1 or 2.
2486 Alignment <= 2 ||
2487
2488 // Disregard v2i64. Memcpy lowering produces those and splitting
2489 // them regresses performance on micro-benchmarks and olden/bh.
2490 VT == MVT::v2i64;
2491 }
2492 return true;
2493}
2494
2495// Same as above but handling LLTs instead.
2497 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2498 unsigned *Fast) const {
2499 if (Subtarget->requiresStrictAlign())
2500 return false;
2501
2502 if (Fast) {
2503 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2504 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2505 Ty.getSizeInBytes() != 16 ||
2506 // See comments in performSTORECombine() for more details about
2507 // these conditions.
2508
2509 // Code that uses clang vector extensions can mark that it
2510 // wants unaligned accesses to be treated as fast by
2511 // underspecifying alignment to be 1 or 2.
2512 Alignment <= 2 ||
2513
2514 // Disregard v2i64. Memcpy lowering produces those and splitting
2515 // them regresses performance on micro-benchmarks and olden/bh.
2516 Ty == LLT::fixed_vector(2, 64);
2517 }
2518 return true;
2519}
2520
2521FastISel *
2523 const TargetLibraryInfo *libInfo) const {
2524 return AArch64::createFastISel(funcInfo, libInfo);
2525}
2526
2527const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2528#define MAKE_CASE(V) \
2529 case V: \
2530 return #V;
2531 switch ((AArch64ISD::NodeType)Opcode) {
2533 break;
2857 }
2858#undef MAKE_CASE
2859 return nullptr;
2860}
2861
2864 MachineBasicBlock *MBB) const {
2865 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2866 // phi node:
2867
2868 // OrigBB:
2869 // [... previous instrs leading to comparison ...]
2870 // b.ne TrueBB
2871 // b EndBB
2872 // TrueBB:
2873 // ; Fallthrough
2874 // EndBB:
2875 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2876
2877 MachineFunction *MF = MBB->getParent();
2878 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2879 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2880 DebugLoc DL = MI.getDebugLoc();
2882
2883 Register DestReg = MI.getOperand(0).getReg();
2884 Register IfTrueReg = MI.getOperand(1).getReg();
2885 Register IfFalseReg = MI.getOperand(2).getReg();
2886 unsigned CondCode = MI.getOperand(3).getImm();
2887 bool NZCVKilled = MI.getOperand(4).isKill();
2888
2889 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2890 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2891 MF->insert(It, TrueBB);
2892 MF->insert(It, EndBB);
2893
2894 // Transfer rest of current basic-block to EndBB
2895 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2896 MBB->end());
2898
2899 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2900 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2901 MBB->addSuccessor(TrueBB);
2902 MBB->addSuccessor(EndBB);
2903
2904 // TrueBB falls through to the end.
2905 TrueBB->addSuccessor(EndBB);
2906
2907 if (!NZCVKilled) {
2908 TrueBB->addLiveIn(AArch64::NZCV);
2909 EndBB->addLiveIn(AArch64::NZCV);
2910 }
2911
2912 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2913 .addReg(IfTrueReg)
2914 .addMBB(TrueBB)
2915 .addReg(IfFalseReg)
2916 .addMBB(MBB);
2917
2918 MI.eraseFromParent();
2919 return EndBB;
2920}
2921
2923 MachineInstr &MI, MachineBasicBlock *BB) const {
2924 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2925 BB->getParent()->getFunction().getPersonalityFn())) &&
2926 "SEH does not use catchret!");
2927 return BB;
2928}
2929
2932 MachineBasicBlock *MBB) const {
2933 MachineFunction &MF = *MBB->getParent();
2934 MachineBasicBlock::iterator MBBI = MI.getIterator();
2936 const AArch64InstrInfo &TII =
2937 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2938 Register TargetReg = MI.getOperand(0).getReg();
2939 MachineBasicBlock::iterator NextInst =
2940 TII.probedStackAlloc(MBBI, TargetReg, false);
2941
2942 MI.eraseFromParent();
2943 return NextInst->getParent();
2944}
2945
2947AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2949 MachineBasicBlock *BB) const {
2950 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2951 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2952
2953 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2954 MIB.add(MI.getOperand(1)); // slice index register
2955 MIB.add(MI.getOperand(2)); // slice index offset
2956 MIB.add(MI.getOperand(3)); // pg
2957 MIB.add(MI.getOperand(4)); // base
2958 MIB.add(MI.getOperand(5)); // offset
2959
2960 MI.eraseFromParent(); // The pseudo is gone now.
2961 return BB;
2962}
2963
2966 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2967 MachineInstrBuilder MIB =
2968 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2969
2970 MIB.addReg(AArch64::ZA, RegState::Define);
2971 MIB.add(MI.getOperand(0)); // Vector select register
2972 MIB.add(MI.getOperand(1)); // Vector select offset
2973 MIB.add(MI.getOperand(2)); // Base
2974 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2975
2976 MI.eraseFromParent(); // The pseudo is gone now.
2977 return BB;
2978}
2979
2982 unsigned Opcode,
2983 bool Op0IsDef) const {
2984 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2985 MachineInstrBuilder MIB;
2986
2987 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
2988 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
2989 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
2990 MIB.add(MI.getOperand(I));
2991
2992 MI.eraseFromParent(); // The pseudo is gone now.
2993 return BB;
2994}
2995
2997AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
2999 MachineBasicBlock *BB) const {
3000 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3001 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3002 unsigned StartIdx = 0;
3003
3004 bool HasTile = BaseReg != AArch64::ZA;
3005 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3006 if (HasZPROut) {
3007 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3008 ++StartIdx;
3009 }
3010 if (HasTile) {
3011 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3012 RegState::Define); // Output ZA Tile
3013 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3014 StartIdx++;
3015 } else {
3016 // Avoid instructions addressed as za.<sz>[Reg, Imm, ...], whose leading operands describe the ZA slice rather than an output ZPR.
3017 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3018 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3019 ++StartIdx;
3020 }
3021 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3022 }
3023 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3024 MIB.add(MI.getOperand(I));
3025
3026 MI.eraseFromParent(); // The pseudo is gone now.
3027 return BB;
3028}
3029
3032 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3033 MachineInstrBuilder MIB =
3034 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3035 MIB.add(MI.getOperand(0)); // Mask
3036
3037 unsigned Mask = MI.getOperand(0).getImm();
3038 for (unsigned I = 0; I < 8; I++) {
3039 if (Mask & (1 << I))
3040 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3041 }
3042
3043 MI.eraseFromParent(); // The pseudo is gone now.
3044 return BB;
3045}
3046
3049 MachineBasicBlock *BB) const {
3050 MachineFunction *MF = BB->getParent();
3051 MachineFrameInfo &MFI = MF->getFrameInfo();
3052 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3053 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3054 if (TPIDR2.Uses > 0) {
3055 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3056 // Store the buffer pointer to the TPIDR2 stack object.
3057 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3058 .addReg(MI.getOperand(0).getReg())
3059 .addFrameIndex(TPIDR2.FrameIndex)
3060 .addImm(0);
3061 // Set the reserved bytes (10-15) to zero
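// (STRHHui with immediate 5 stores a halfword at byte offset 10 and STRWui
// with immediate 3 stores a word at byte offset 12; the unsigned offsets are
// scaled by the access size, so together the two stores zero bytes 10-15.)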
3062 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3063 .addReg(AArch64::WZR)
3064 .addFrameIndex(TPIDR2.FrameIndex)
3065 .addImm(5);
3066 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3067 .addReg(AArch64::WZR)
3068 .addFrameIndex(TPIDR2.FrameIndex)
3069 .addImm(3);
3070 } else
3071 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3072
3073 BB->remove_instr(&MI);
3074 return BB;
3075}
3076
3079 MachineBasicBlock *BB) const {
3080 MachineFunction *MF = BB->getParent();
3081 MachineFrameInfo &MFI = MF->getFrameInfo();
3082 AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3083 // TODO: This function grows the stack with a subtraction, which doesn't work
3084 // on Windows. Some refactoring to share the functionality in
3085 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3086 // supports SME.
3087 assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3088 "Lazy ZA save is not yet supported on Windows");
3089
3090 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3091
3092 if (TPIDR2.Uses > 0) {
3093 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3094 MachineRegisterInfo &MRI = MF->getRegInfo();
3095
3096 // The MSUBXrrr below won't always be emitted in a form that accepts SP
3097 // directly.
3098 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3099 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3100 .addReg(AArch64::SP);
3101
3102 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3103 auto Size = MI.getOperand(1).getReg();
3104 auto Dest = MI.getOperand(0).getReg();
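// MSUBXrrr computes Dest = SP - Size * Size, reserving Size^2 bytes below
// the old stack pointer for the lazy-save buffer.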
3105 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3106 .addReg(Size)
3107 .addReg(Size)
3108 .addReg(SP);
3109 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3110 AArch64::SP)
3111 .addReg(Dest);
3112
3113 // We have just allocated a variable sized object, tell this to PEI.
3114 MFI.CreateVariableSizedObject(Align(16), nullptr);
3115 }
3116
3117 BB->remove_instr(&MI);
3118 return BB;
3119}
3120
3122 MachineInstr &MI, MachineBasicBlock *BB) const {
3123
3124 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3125 if (SMEOrigInstr != -1) {
3126 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3127 uint64_t SMEMatrixType =
3128 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3129 switch (SMEMatrixType) {
3130 case (AArch64::SMEMatrixArray):
3131 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3132 case (AArch64::SMEMatrixTileB):
3133 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3134 case (AArch64::SMEMatrixTileH):
3135 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3136 case (AArch64::SMEMatrixTileS):
3137 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3138 case (AArch64::SMEMatrixTileD):
3139 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3140 case (AArch64::SMEMatrixTileQ):
3141 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3142 }
3143 }
3144
3145 switch (MI.getOpcode()) {
3146 default:
3147#ifndef NDEBUG
3148 MI.dump();
3149#endif
3150 llvm_unreachable("Unexpected instruction for custom inserter!");
3151 case AArch64::InitTPIDR2Obj:
3152 return EmitInitTPIDR2Object(MI, BB);
3153 case AArch64::AllocateZABuffer:
3154 return EmitAllocateZABuffer(MI, BB);
3155 case AArch64::F128CSEL:
3156 return EmitF128CSEL(MI, BB);
3157 case TargetOpcode::STATEPOINT:
3158 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3159 // while the BL call instruction (to which the statepoint is lowered at the
3160 // end) has an implicit def. This def is early-clobber as it will be set at
3161 // the moment of the call and earlier than any use is read.
3162 // Add this implicit dead def here as a workaround.
3163 MI.addOperand(*MI.getMF(),
3164 MachineOperand::CreateReg(
3165 AArch64::LR, /*isDef*/ true,
3166 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3167 /*isUndef*/ false, /*isEarlyClobber*/ true));
3168 [[fallthrough]];
3169 case TargetOpcode::STACKMAP:
3170 case TargetOpcode::PATCHPOINT:
3171 return emitPatchPoint(MI, BB);
3172
3173 case TargetOpcode::PATCHABLE_EVENT_CALL:
3174 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3175 return BB;
3176
3177 case AArch64::CATCHRET:
3178 return EmitLoweredCatchRet(MI, BB);
3179
3180 case AArch64::PROBED_STACKALLOC_DYN:
3181 return EmitDynamicProbedAlloc(MI, BB);
3182
3183 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3184 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3185 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3186 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3187 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3188 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3189 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3190 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3191 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3192 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3193 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3194 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3195 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3196 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3197 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3198 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3199 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3200 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3201 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3202 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3203 case AArch64::LDR_ZA_PSEUDO:
3204 return EmitFill(MI, BB);
3205 case AArch64::LDR_TX_PSEUDO:
3206 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3207 case AArch64::STR_TX_PSEUDO:
3208 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3209 case AArch64::ZERO_M_PSEUDO:
3210 return EmitZero(MI, BB);
3211 case AArch64::ZERO_T_PSEUDO:
3212 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3213 }
3214}
3215
3216//===----------------------------------------------------------------------===//
3217// AArch64 Lowering private implementation.
3218//===----------------------------------------------------------------------===//
3219
3220//===----------------------------------------------------------------------===//
3221// Lowering Code
3222//===----------------------------------------------------------------------===//
3223
3224// Forward declarations of SVE fixed length lowering helpers
3229 SelectionDAG &DAG);
3232 EVT VT);
3233
3234/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3235static bool isZerosVector(const SDNode *N) {
3236 // Look through a bit convert.
3237 while (N->getOpcode() == ISD::BITCAST)
3238 N = N->getOperand(0).getNode();
3239
3240 if (ISD::isConstantSplatVectorAllZeros(N))
3241 return true;
3242
3243 if (N->getOpcode() != AArch64ISD::DUP)
3244 return false;
3245
3246 auto Opnd0 = N->getOperand(0);
3247 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3248}
3249
3250/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3251/// CC
3253 switch (CC) {
3254 default:
3255 llvm_unreachable("Unknown condition code!");
3256 case ISD::SETNE:
3257 return AArch64CC::NE;
3258 case ISD::SETEQ:
3259 return AArch64CC::EQ;
3260 case ISD::SETGT:
3261 return AArch64CC::GT;
3262 case ISD::SETGE:
3263 return AArch64CC::GE;
3264 case ISD::SETLT:
3265 return AArch64CC::LT;
3266 case ISD::SETLE:
3267 return AArch64CC::LE;
3268 case ISD::SETUGT:
3269 return AArch64CC::HI;
3270 case ISD::SETUGE:
3271 return AArch64CC::HS;
3272 case ISD::SETULT:
3273 return AArch64CC::LO;
3274 case ISD::SETULE:
3275 return AArch64CC::LS;
3276 }
3277}
3278
3279/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3281 AArch64CC::CondCode &CondCode,
3282 AArch64CC::CondCode &CondCode2) {
3283 CondCode2 = AArch64CC::AL;
3284 switch (CC) {
3285 default:
3286 llvm_unreachable("Unknown FP condition!");
3287 case ISD::SETEQ:
3288 case ISD::SETOEQ:
3289 CondCode = AArch64CC::EQ;
3290 break;
3291 case ISD::SETGT:
3292 case ISD::SETOGT:
3293 CondCode = AArch64CC::GT;
3294 break;
3295 case ISD::SETGE:
3296 case ISD::SETOGE:
3297 CondCode = AArch64CC::GE;
3298 break;
3299 case ISD::SETOLT:
3300 CondCode = AArch64CC::MI;
3301 break;
3302 case ISD::SETOLE:
3303 CondCode = AArch64CC::LS;
3304 break;
3305 case ISD::SETONE:
3306 CondCode = AArch64CC::MI;
3307 CondCode2 = AArch64CC::GT;
3308 break;
3309 case ISD::SETO:
3310 CondCode = AArch64CC::VC;
3311 break;
3312 case ISD::SETUO:
3313 CondCode = AArch64CC::VS;
3314 break;
3315 case ISD::SETUEQ:
3316 CondCode = AArch64CC::EQ;
3317 CondCode2 = AArch64CC::VS;
3318 break;
3319 case ISD::SETUGT:
3320 CondCode = AArch64CC::HI;
3321 break;
3322 case ISD::SETUGE:
3323 CondCode = AArch64CC::PL;
3324 break;
3325 case ISD::SETLT:
3326 case ISD::SETULT:
3327 CondCode = AArch64CC::LT;
3328 break;
3329 case ISD::SETLE:
3330 case ISD::SETULE:
3331 CondCode = AArch64CC::LE;
3332 break;
3333 case ISD::SETNE:
3334 case ISD::SETUNE:
3335 CondCode = AArch64CC::NE;
3336 break;
3337 }
3338}
3339
3340/// Convert a DAG fp condition code to an AArch64 CC.
3341/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3342/// should be AND'ed instead of OR'ed.
3344 AArch64CC::CondCode &CondCode,
3345 AArch64CC::CondCode &CondCode2) {
3346 CondCode2 = AArch64CC::AL;
3347 switch (CC) {
3348 default:
3349 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3350 assert(CondCode2 == AArch64CC::AL);
3351 break;
3352 case ISD::SETONE:
3353 // (a one b)
3354 // == ((a olt b) || (a ogt b))
3355 // == ((a ord b) && (a une b))
3356 CondCode = AArch64CC::VC;
3357 CondCode2 = AArch64CC::NE;
3358 break;
3359 case ISD::SETUEQ:
3360 // (a ueq b)
3361 // == ((a uno b) || (a oeq b))
3362 // == ((a ule b) && (a uge b))
3363 CondCode = AArch64CC::PL;
3364 CondCode2 = AArch64CC::LE;
3365 break;
3366 }
3367}
3368
3369/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3370/// CC usable with the vector instructions. Fewer operations are available
3371/// without a real NZCV register, so we have to use less efficient combinations
3372/// to get the same effect.
3374 AArch64CC::CondCode &CondCode,
3375 AArch64CC::CondCode &CondCode2,
3376 bool &Invert) {
3377 Invert = false;
3378 switch (CC) {
3379 default:
3380 // Mostly the scalar mappings work fine.
3381 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3382 break;
3383 case ISD::SETUO:
3384 Invert = true;
3385 [[fallthrough]];
3386 case ISD::SETO:
3387 CondCode = AArch64CC::MI;
3388 CondCode2 = AArch64CC::GE;
3389 break;
3390 case ISD::SETUEQ:
3391 case ISD::SETULT:
3392 case ISD::SETULE:
3393 case ISD::SETUGT:
3394 case ISD::SETUGE:
3395 // All of the compare-mask comparisons are ordered, but we can switch
3396 // between the two by a double inversion. E.g. ULE == !OGT.
3397 Invert = true;
3398 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3399 CondCode, CondCode2);
3400 break;
3401 }
3402}
3403
3405 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3406 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3407 LLVM_DEBUG(dbgs() << "Is imm " << C
3408 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3409 return IsLegal;
3410}
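// Standalone illustrative sketch (not part of this file): AArch64 add/sub
// immediates are 12-bit values, optionally shifted left by 12 bits, which is
// exactly what the check above accepts. The helper name below is made up.
#include <cassert>
#include <cstdint>

static bool isLegalArithImmedModel(uint64_t C) {
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}

int main() {
  assert(isLegalArithImmedModel(0xFFFu));      // 4095: plain 12-bit value
  assert(isLegalArithImmedModel(0xABC000u));   // 12-bit value shifted left by 12
  assert(!isLegalArithImmedModel(0x1001u));    // needs 13 bits
  assert(!isLegalArithImmedModel(0x1000000u)); // too wide even with the shift
  return 0;
}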
3411
3412static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG) {
3413 KnownBits KnownSrc = DAG.computeKnownBits(CheckedVal);
3414 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3415}
3416
3417// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3418// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3419// can be set differently by this operation. It comes down to whether
3420// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3421// everything is fine. If not then the optimization is wrong. Thus general
3422// comparisons are only valid if op2 != 0.
3423//
3424// So, finally, the only LLVM-native comparisons that don't mention C or V
3425// are the ones that aren't unsigned comparisons. They're the only ones we can
3426// safely use CMN for in the absence of information about op2.
3428 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3429 (isIntEqualitySetCC(CC) ||
3430 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3431 (isSignedIntSetCC(CC) && cannotBeIntMin(Op.getOperand(1), DAG)));
3432}
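// Standalone illustrative sketch (not part of this file): a tiny NZCV model
// showing why "CMP op1, (0 - op2)" can only sometimes be rewritten as
// "CMN op1, op2". The helpers subs32/adds32 are made up for the example and
// model the 32-bit flag-setting subtract and add.
#include <cassert>
#include <cstdint>

struct NZCV {
  bool N, Z, C, V;
};

// Flags produced by a 32-bit SUBS (compare) of A and B.
static NZCV subs32(uint32_t A, uint32_t B) {
  uint32_t R = A - B;
  return {(R >> 31) != 0, R == 0, /*C: no borrow*/ A >= B,
          (((A ^ B) & (A ^ R)) >> 31) != 0};
}

// Flags produced by a 32-bit ADDS (CMN) of A and B.
static NZCV adds32(uint32_t A, uint32_t B) {
  uint32_t R = A + B;
  return {(R >> 31) != 0, R == 0, /*C: carry out*/ R < A,
          ((~(A ^ B) & (A ^ R)) >> 31) != 0};
}

int main() {
  // With op2 == 0, "cmp op1, -op2" and "cmn op1, op2" disagree on C, so the
  // unsigned conditions (HI/HS/LO/LS) would flip.
  assert(subs32(7, 0).C != adds32(7, 0).C);

  // With op2 == INT32_MIN (which wraps to itself when negated), they disagree
  // on V, breaking the signed conditions (GT/GE/LT/LE).
  uint32_t Op1 = 0xFFFFFFFFu; // -1
  uint32_t IntMin = 0x80000000u;
  assert(subs32(Op1, 0u - IntMin).V != adds32(Op1, IntMin).V);

  // N and Z always agree, which is why equality compares are always safe.
  return 0;
}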
3433
3435 SelectionDAG &DAG, SDValue Chain,
3436 bool IsSignaling) {
3437 EVT VT = LHS.getValueType();
3438 assert(VT != MVT::f128);
3439
3440 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3441
3442 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3443 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3444 {Chain, LHS});
3445 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3446 {LHS.getValue(1), RHS});
3447 Chain = RHS.getValue(1);
3448 VT = MVT::f32;
3449 }
3450 unsigned Opcode =
3451 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3452 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
3453}
3454
3456 const SDLoc &dl, SelectionDAG &DAG) {
3457 EVT VT = LHS.getValueType();
3458 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3459
3460 if (VT.isFloatingPoint()) {
3461 assert(VT != MVT::f128);
3462 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3463 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
3464 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
3465 VT = MVT::f32;
3466 }
3467 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
3468 }
3469
3470 // The CMP instruction is just an alias for SUBS, and representing it as
3471 // SUBS means that it's possible to get CSE with subtract operations.
3472 // A later phase can perform the optimization of setting the destination
3473 // register to WZR/XZR if it ends up being unused.
3474 unsigned Opcode = AArch64ISD::SUBS;
3475
3476 if (isCMN(RHS, CC, DAG)) {
3477 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3478 Opcode = AArch64ISD::ADDS;
3479 RHS = RHS.getOperand(1);
3480 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3481 isIntEqualitySetCC(CC)) {
3482 // As we are looking for EQ/NE compares, the operands can be commuted; can
3483 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3484 Opcode = AArch64ISD::ADDS;
3485 LHS = LHS.getOperand(1);
3486 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3487 if (LHS.getOpcode() == ISD::AND) {
3488 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3489 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3490 // of the signed comparisons.
3491 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
3492 DAG.getVTList(VT, MVT_CC),
3493 LHS.getOperand(0),
3494 LHS.getOperand(1));
3495 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3496 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3497 return ANDSNode.getValue(1);
3498 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3499 // Use result of ANDS
3500 return LHS.getValue(1);
3501 }
3502 }
3503
3504 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
3505 .getValue(1);
3506}
3507
3508/// \defgroup AArch64CCMP CMP;CCMP matching
3509///
3510/// These functions deal with the formation of CMP;CCMP;... sequences.
3511/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3512/// a comparison. They set the NZCV flags to a predefined value if their
3513/// predicate is false. This allows to express arbitrary conjunctions, for
3514/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3515/// expressed as:
3516/// cmp A
3517/// ccmp B, inv(CB), CA
3518/// check for CB flags
3519///
3520/// This naturally lets us implement chains of AND operations with SETCC
3521/// operands. And we can even implement some other situations by transforming
3522/// them:
3523/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3524/// negating the flags used in a CCMP/FCCMP operations.
3525/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3526/// by negating the flags we test for afterwards. i.e.
3527 /// NEG (CMP CCMP CCMP ...) can be implemented.
3528/// - Note that we can only ever negate all previously processed results.
3529/// What we can not implement by flipping the flags to test is a negation
3530/// of two sub-trees (because the negation affects all sub-trees emitted so
3531/// far, so the 2nd sub-tree we emit would also affect the first).
3532/// With those tools we can implement some OR operations:
3533/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3534/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3535/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3536/// elimination rules from earlier to implement the whole thing as a
3537/// CCMP/FCCMP chain.
3538///
3539/// As complete example:
3540/// or (or (setCA (cmp A)) (setCB (cmp B)))
3541/// (and (setCC (cmp C)) (setCD (cmp D)))"
3542/// can be reassociated to:
3543 /// or (and (setCC (cmp C)) (setCD (cmp D)))
3544 /// (or (setCA (cmp A)) (setCB (cmp B)))
3545/// can be transformed to:
3546/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3547 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3548/// which can be implemented as:
3549/// cmp C
3550/// ccmp D, inv(CD), CC
3551/// ccmp A, CA, inv(CD)
3552/// ccmp B, CB, inv(CA)
3553/// check for CB flags
3554///
3555/// A counterexample is "or (and A B) (and C D)" which translates to
3556/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3557/// can only implement 1 of the inner (not) operations, but not both!
3558/// @{
3559
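// Standalone illustrative sketch (not part of this file): the OR handling
// described above relies on the identity (A || B) == !(!A && !B), i.e. an OR
// of two SETCCs becomes an AND of negated SETCCs whose final flags test is
// then inverted.
#include <cassert>

int main() {
  for (bool A : {false, true})
    for (bool B : {false, true})
      assert((A || B) == !(!A && !B));
  return 0;
}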
3560/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3562 ISD::CondCode CC, SDValue CCOp,
3563 AArch64CC::CondCode Predicate,
3564 AArch64CC::CondCode OutCC,
3565 const SDLoc &DL, SelectionDAG &DAG) {
3566 unsigned Opcode = 0;
3567 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3568
3569 if (LHS.getValueType().isFloatingPoint()) {
3570 assert(LHS.getValueType() != MVT::f128);
3571 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3572 LHS.getValueType() == MVT::bf16) {
3573 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3574 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3575 }
3576 Opcode = AArch64ISD::FCCMP;
3577 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3578 APInt Imm = Const->getAPIntValue();
3579 if (Imm.isNegative() && Imm.sgt(-32)) {
3580 Opcode = AArch64ISD::CCMN;
3581 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3582 }
3583 } else if (isCMN(RHS, CC, DAG)) {
3584 Opcode = AArch64ISD::CCMN;
3585 RHS = RHS.getOperand(1);
3586 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3587 isIntEqualitySetCC(CC)) {
3588 // As we are looking for EQ/NE compares, the operands can be commuted; can
3589 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3590 Opcode = AArch64ISD::CCMN;
3591 LHS = LHS.getOperand(1);
3592 }
3593 if (Opcode == 0)
3594 Opcode = AArch64ISD::CCMP;
3595
3596 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3597 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3598 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3599 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3600 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3601}
3602
3603/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3604/// expressed as a conjunction. See \ref AArch64CCMP.
3605/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3606/// changing the conditions on the SETCC tests.
3607/// (this means we can call emitConjunctionRec() with
3608/// Negate==true on this sub-tree)
3609/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3610/// cannot do the negation naturally. We are required to
3611/// emit the subtree first in this case.
3612 /// \param WillNegate Is true if we are called when the result of this
3613/// subexpression must be negated. This happens when the
3614/// outer expression is an OR. We can use this fact to know
3615/// that we have a double negation (or (or ...) ...) that
3616/// can be implemented for free.
3617static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3618 bool &MustBeFirst, bool WillNegate,
3619 unsigned Depth = 0) {
3620 if (!Val.hasOneUse())
3621 return false;
3622 unsigned Opcode = Val->getOpcode();
3623 if (Opcode == ISD::SETCC) {
3624 if (Val->getOperand(0).getValueType() == MVT::f128)
3625 return false;
3626 CanNegate = true;
3627 MustBeFirst = false;
3628 return true;
3629 }
3630 // Protect against exponential runtime and stack overflow.
3631 if (Depth > 6)
3632 return false;
3633 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3634 bool IsOR = Opcode == ISD::OR;
3635 SDValue O0 = Val->getOperand(0);
3636 SDValue O1 = Val->getOperand(1);
3637 bool CanNegateL;
3638 bool MustBeFirstL;
3639 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3640 return false;
3641 bool CanNegateR;
3642 bool MustBeFirstR;
3643 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3644 return false;
3645
3646 if (MustBeFirstL && MustBeFirstR)
3647 return false;
3648
3649 if (IsOR) {
3650 // For an OR expression we need to be able to naturally negate at least
3651 // one side or we cannot do the transformation at all.
3652 if (!CanNegateL && !CanNegateR)
3653 return false;
3654 // If the result of the OR will be negated and we can naturally negate
3655 // the leaves, then this sub-tree as a whole negates naturally.
3656 CanNegate = WillNegate && CanNegateL && CanNegateR;
3657 // If we cannot naturally negate the whole sub-tree, then this must be
3658 // emitted first.
3659 MustBeFirst = !CanNegate;
3660 } else {
3661 assert(Opcode == ISD::AND && "Must be OR or AND");
3662 // We cannot naturally negate an AND operation.
3663 CanNegate = false;
3664 MustBeFirst = MustBeFirstL || MustBeFirstR;
3665 }
3666 return true;
3667 }
3668 return false;
3669}
3670
3671/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3672 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3673 /// Tries to transform the given i1 producing node @p Val to a series of compare
3674 /// and conditional compare operations. @returns an NZCV flags producing node
3675 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
3676 /// the transformation was not possible.
3677 /// \p Negate is true if we want this sub-tree to be negated just by changing
3678/// SETCC conditions.
3680 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3681 AArch64CC::CondCode Predicate) {
3682 // We're at a tree leaf, produce a conditional comparison operation.
3683 unsigned Opcode = Val->getOpcode();
3684 if (Opcode == ISD::SETCC) {
3685 SDValue LHS = Val->getOperand(0);
3686 SDValue RHS = Val->getOperand(1);
3687 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3688 bool isInteger = LHS.getValueType().isInteger();
3689 if (Negate)
3690 CC = getSetCCInverse(CC, LHS.getValueType());
3691 SDLoc DL(Val);
3692 // Determine OutCC and handle FP special case.
3693 if (isInteger) {
3694 OutCC = changeIntCCToAArch64CC(CC);
3695 } else {
3696 assert(LHS.getValueType().isFloatingPoint());
3697 AArch64CC::CondCode ExtraCC;
3698 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3699 // Some floating point conditions can't be tested with a single condition
3700 // code. Construct an additional comparison in this case.
3701 if (ExtraCC != AArch64CC::AL) {
3702 SDValue ExtraCmp;
3703 if (!CCOp.getNode())
3704 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3705 else
3706 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3707 ExtraCC, DL, DAG);
3708 CCOp = ExtraCmp;
3709 Predicate = ExtraCC;
3710 }
3711 }
3712
3713 // Produce a normal comparison if we are first in the chain
3714 if (!CCOp)
3715 return emitComparison(LHS, RHS, CC, DL, DAG);
3716 // Otherwise produce a ccmp.
3717 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3718 DAG);
3719 }
3720 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3721
3722 bool IsOR = Opcode == ISD::OR;
3723
3724 SDValue LHS = Val->getOperand(0);
3725 bool CanNegateL;
3726 bool MustBeFirstL;
3727 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3728 assert(ValidL && "Valid conjunction/disjunction tree");
3729 (void)ValidL;
3730
3731 SDValue RHS = Val->getOperand(1);
3732 bool CanNegateR;
3733 bool MustBeFirstR;
3734 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3735 assert(ValidR && "Valid conjunction/disjunction tree");
3736 (void)ValidR;
3737
3738 // Swap sub-tree that must come first to the right side.
3739 if (MustBeFirstL) {
3740 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3741 std::swap(LHS, RHS);
3742 std::swap(CanNegateL, CanNegateR);
3743 std::swap(MustBeFirstL, MustBeFirstR);
3744 }
3745
3746 bool NegateR;
3747 bool NegateAfterR;
3748 bool NegateL;
3749 bool NegateAfterAll;
3750 if (Opcode == ISD::OR) {
3751 // Swap the sub-tree that we can negate naturally to the left.
3752 if (!CanNegateL) {
3753 assert(CanNegateR && "at least one side must be negatable");
3754 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3755 assert(!Negate);
3756 std::swap(LHS, RHS);
3757 NegateR = false;
3758 NegateAfterR = true;
3759 } else {
3760 // Negate the left sub-tree if possible, otherwise negate the result.
3761 NegateR = CanNegateR;
3762 NegateAfterR = !CanNegateR;
3763 }
3764 NegateL = true;
3765 NegateAfterAll = !Negate;
3766 } else {
3767 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3768 assert(!Negate && "Valid conjunction/disjunction tree");
3769
3770 NegateL = false;
3771 NegateR = false;
3772 NegateAfterR = false;
3773 NegateAfterAll = false;
3774 }
3775
3776 // Emit sub-trees.
3777 AArch64CC::CondCode RHSCC;
3778 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3779 if (NegateAfterR)
3780 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3781 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3782 if (NegateAfterAll)
3783 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3784 return CmpL;
3785}
3786
3787 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3788/// In some cases this is even possible with OR operations in the expression.
3789/// See \ref AArch64CCMP.
3790/// \see emitConjunctionRec().
3792 AArch64CC::CondCode &OutCC) {
3793 bool DummyCanNegate;
3794 bool DummyMustBeFirst;
3795 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3796 return SDValue();
3797
3798 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3799}
3800
3801/// @}
3802
3803/// Returns how profitable it is to fold a comparison's operand's shift and/or
3804/// extension operations.
3806 auto isSupportedExtend = [&](SDValue V) {
3807 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3808 return true;
3809
3810 if (V.getOpcode() == ISD::AND)
3811 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3812 uint64_t Mask = MaskCst->getZExtValue();
3813 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3814 }
3815
3816 return false;
3817 };
3818
3819 if (!Op.hasOneUse())
3820 return 0;
3821
3822 if (isSupportedExtend(Op))
3823 return 1;
3824
3825 unsigned Opc = Op.getOpcode();
3826 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3827 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3828 uint64_t Shift = ShiftCst->getZExtValue();
3829 if (isSupportedExtend(Op.getOperand(0)))
3830 return (Shift <= 4) ? 2 : 1;
3831 EVT VT = Op.getValueType();
3832 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3833 return 1;
3834 }
3835
3836 return 0;
3837}
3838
3840 SDValue &AArch64cc, SelectionDAG &DAG,
3841 const SDLoc &dl) {
3842 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3843 EVT VT = RHS.getValueType();
3844 uint64_t C = RHSC->getZExtValue();
3845 if (!isLegalArithImmed(C)) {
3846 // Constant does not fit, try adjusting it by one?
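// The adjustments below rely on simple integer identities, e.g.
//   x <  C  is the same as  x <= C-1   (when C is not the type minimum)
//   x >= C  is the same as  x >  C-1
//   x <= C  is the same as  x <  C+1   (when C is not the type maximum)
//   x >  C  is the same as  x >= C+1
// so an immediate that is out of range can sometimes be nudged into range.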
3847 switch (CC) {
3848 default:
3849 break;
3850 case ISD::SETLT:
3851 case ISD::SETGE:
3852 if ((VT == MVT::i32 && C != 0x80000000 &&
3853 isLegalArithImmed((uint32_t)(C - 1))) ||
3854 (VT == MVT::i64 && C != 0x80000000ULL &&
3855 isLegalArithImmed(C - 1ULL))) {
3856 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3857 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3858 RHS = DAG.getConstant(C, dl, VT);
3859 }
3860 break;
3861 case ISD::SETULT:
3862 case ISD::SETUGE:
3863 if ((VT == MVT::i32 && C != 0 &&
3864 isLegalArithImmed((uint32_t)(C - 1))) ||
3865 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3866 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3867 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3868 RHS = DAG.getConstant(C, dl, VT);
3869 }
3870 break;
3871 case ISD::SETLE:
3872 case ISD::SETGT:
3873 if ((VT == MVT::i32 && C != INT32_MAX &&
3874 isLegalArithImmed((uint32_t)(C + 1))) ||
3875 (VT == MVT::i64 && C != INT64_MAX &&
3876 isLegalArithImmed(C + 1ULL))) {
3877 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3878 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3879 RHS = DAG.getConstant(C, dl, VT);
3880 }
3881 break;
3882 case ISD::SETULE:
3883 case ISD::SETUGT:
3884 if ((VT == MVT::i32 && C != UINT32_MAX &&
3885 isLegalArithImmed((uint32_t)(C + 1))) ||
3886 (VT == MVT::i64 && C != UINT64_MAX &&
3887 isLegalArithImmed(C + 1ULL))) {
3888 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3889 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3890 RHS = DAG.getConstant(C, dl, VT);
3891 }
3892 break;
3893 }
3894 }
3895 }
3896
3897 // Comparisons are canonicalized so that the RHS operand is simpler than the
3898 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3899 // can fold some shift+extend operations on the RHS operand, so swap the
3900 // operands if that can be done.
3901 //
3902 // For example:
3903 // lsl w13, w11, #1
3904 // cmp w13, w12
3905 // can be turned into:
3906 // cmp w12, w11, lsl #1
3907 if (!isa<ConstantSDNode>(RHS) ||
3908 !isLegalArithImmed(RHS->getAsAPIntVal().abs().getZExtValue())) {
3909 bool LHSIsCMN = isCMN(LHS, CC, DAG);
3910 bool RHSIsCMN = isCMN(RHS, CC, DAG);
3911 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
3912 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
3913
3914 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
3915 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
3916 std::swap(LHS, RHS);
3917 CC = ISD::getSetCCSwappedOperands(CC);
3918 }
3919 }
3920
3921 SDValue Cmp;
3922 AArch64CC::CondCode AArch64CC;
3923 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
3924 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3925
3926 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3927 // For the i8 operand, the largest immediate is 255, so this can be easily
3928 // encoded in the compare instruction. For the i16 operand, however, the
3929 // largest immediate cannot be encoded in the compare.
3930 // Therefore, use a sign extending load and cmn to avoid materializing the
3931 // -1 constant. For example,
3932 // movz w1, #65535
3933 // ldrh w0, [x0, #0]
3934 // cmp w0, w1
3935 // >
3936 // ldrsh w0, [x0, #0]
3937 // cmn w0, #1
3938 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3939 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3940 // ensure both the LHS and RHS are truly zero extended and to make sure the
3941 // transformation is profitable.
3942 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3943 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3944 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3945 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3946 int16_t ValueofRHS = RHS->getAsZExtVal();
3947 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3948 SDValue SExt =
3949 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3950 DAG.getValueType(MVT::i16));
3951 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3952 RHS.getValueType()),
3953 CC, dl, DAG);
3954 AArch64CC = changeIntCCToAArch64CC(CC);
3955 }
3956 }
3957
3958 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3959 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3960 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3961 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3962 }
3963 }
3964 }
3965
3966 if (!Cmp) {
3967 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3968 AArch64CC = changeIntCCToAArch64CC(CC);
3969 }
3970 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3971 return Cmp;
3972}
3973
3974static std::pair<SDValue, SDValue>
3976 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3977 "Unsupported value type");
3978 SDValue Value, Overflow;
3979 SDLoc DL(Op);
3980 SDValue LHS = Op.getOperand(0);
3981 SDValue RHS = Op.getOperand(1);
3982 unsigned Opc = 0;
3983 switch (Op.getOpcode()) {
3984 default:
3985 llvm_unreachable("Unknown overflow instruction!");
3986 case ISD::SADDO:
3987 Opc = AArch64ISD::ADDS;
3988 CC = AArch64CC::VS;
3989 break;
3990 case ISD::UADDO:
3991 Opc = AArch64ISD::ADDS;
3992 CC = AArch64CC::HS;
3993 break;
3994 case ISD::SSUBO:
3995 Opc = AArch64ISD::SUBS;
3996 CC = AArch64CC::VS;
3997 break;
3998 case ISD::USUBO:
3999 Opc = AArch64ISD::SUBS;
4000 CC = AArch64CC::LO;
4001 break;
4002 // Multiply needs a little bit of extra work.
4003 case ISD::SMULO:
4004 case ISD::UMULO: {
4005 CC = AArch64CC::NE;
4006 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4007 if (Op.getValueType() == MVT::i32) {
4008 // Extend to 64-bits, then perform a 64-bit multiply.
4009 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4010 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4011 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4012 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4013 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4014
4015 // Check that the result fits into a 32-bit integer.
4016 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
4017 if (IsSigned) {
4018 // cmp xreg, wreg, sxtw
4019 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4020 Overflow =
4021 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4022 } else {
4023 // tst xreg, #0xffffffff00000000
4024 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4025 Overflow =
4026 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4027 }
4028 break;
4029 }
4030 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4031 // For the 64-bit multiply:
4032 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4033 if (IsSigned) {
4034 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4035 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4036 DAG.getConstant(63, DL, MVT::i64));
4037 // It is important that LowerBits is last, otherwise the arithmetic
4038 // shift will not be folded into the compare (SUBS).
4039 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4040 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4041 .getValue(1);
4042 } else {
4043 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4044 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
4045 Overflow =
4046 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4047 DAG.getConstant(0, DL, MVT::i64),
4048 UpperBits).getValue(1);
4049 }
4050 break;
4051 }
4052 } // switch (...)
4053
4054 if (Opc) {
4055 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
4056
4057 // Emit the AArch64 operation with overflow check.
4058 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4059 Overflow = Value.getValue(1);
4060 }
4061 return std::make_pair(Value, Overflow);
4062}
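// Standalone illustrative sketch (not part of this file): the i32 [su]mulo
// checks above boil down to doing the multiply in 64 bits and testing whether
// the full product still fits in 32 bits. The helper names are made up.
#include <cassert>
#include <cstdint>

static bool smulo32(int32_t A, int32_t B, int32_t &Res) {
  int64_t Wide = (int64_t)A * (int64_t)B; // the widened MUL
  Res = (int32_t)Wide;
  return Wide != (int64_t)Res; // "cmp xreg, wreg, sxtw"
}

static bool umulo32(uint32_t A, uint32_t B, uint32_t &Res) {
  uint64_t Wide = (uint64_t)A * (uint64_t)B; // the widened MUL
  Res = (uint32_t)Wide;
  return (Wide >> 32) != 0; // "tst xreg, #0xffffffff00000000"
}

int main() {
  int32_t S;
  uint32_t U;
  assert(!smulo32(46340, 46340, S));      // 2147395600 still fits in i32
  assert(smulo32(46341, 46341, S));       // 2147488281 does not
  assert(!umulo32(0xFFFFu, 0x10001u, U)); // 0xFFFFFFFF fits in u32
  assert(umulo32(0x10000u, 0x10000u, U)); // 2^32 does not
  return 0;
}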
4063
4064SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4065 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4066 !Subtarget->isNeonAvailable()))
4067 return LowerToScalableOp(Op, DAG);
4068
4069 SDValue Sel = Op.getOperand(0);
4070 SDValue Other = Op.getOperand(1);
4071 SDLoc dl(Sel);
4072
4073 // If the operand is an overflow checking operation, invert the condition
4074 // code and kill the Not operation. I.e., transform:
4075 // (xor overflow_op_bool, 1)
4076 // -->
4077 // (csel 1, 0, invert(cc), overflow_op_bool)
4078 // ... which later gets transformed to just a cset instruction with an
4079 // inverted condition code, rather than a cset + eor sequence.
4080 if (isOverflowIntrOpRes(Sel)) {
4081 // Only lower legal XALUO ops.
4082 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4083 return SDValue();
4084
4085 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4086 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4087 AArch64CC::CondCode CC;
4088 SDValue Value, Overflow;
4089 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4090 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4091 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
4092 CCVal, Overflow);
4093 }
4094 // If neither operand is a SELECT_CC, give up.
4095 if (Sel.getOpcode() != ISD::SELECT_CC)
4096 std::swap(Sel, Other);
4097 if (Sel.getOpcode() != ISD::SELECT_CC)
4098 return Op;
4099
4100 // The folding we want to perform is:
4101 // (xor x, (select_cc a, b, cc, 0, -1) )
4102 // -->
4103 // (csel x, (xor x, -1), cc ...)
4104 //
4105 // The latter will get matched to a CSINV instruction.
4106
4107 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4108 SDValue LHS = Sel.getOperand(0);
4109 SDValue RHS = Sel.getOperand(1);
4110 SDValue TVal = Sel.getOperand(2);
4111 SDValue FVal = Sel.getOperand(3);
4112
4113 // FIXME: This could be generalized to non-integer comparisons.
4114 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4115 return Op;
4116
4117 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4118 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4119
4120 // The values aren't constants, this isn't the pattern we're looking for.
4121 if (!CFVal || !CTVal)
4122 return Op;
4123
4124 // We can commute the SELECT_CC by inverting the condition. This
4125 // might be needed to make this fit into a CSINV pattern.
4126 if (CTVal->isAllOnes() && CFVal->isZero()) {
4127 std::swap(TVal, FVal);
4128 std::swap(CTVal, CFVal);
4129 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4130 }
4131
4132 // If the constants line up, perform the transform!
4133 if (CTVal->isZero() && CFVal->isAllOnes()) {
4134 SDValue CCVal;
4135 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
4136
4137 FVal = Other;
4138 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
4139 DAG.getConstant(-1ULL, dl, Other.getValueType()));
4140
4141 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
4142 CCVal, Cmp);
4143 }
4144
4145 return Op;
4146}
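// Standalone illustrative sketch (not part of this file): the SELECT_CC fold
// above uses the identity x ^ (cond ? 0 : -1) == (cond ? x : ~x), which is
// exactly what a single CSINV (conditional select with inversion) computes.
#include <cassert>
#include <cstdint>

int main() {
  const int64_t Vals[] = {0, 1, -1, 0x0123456789abcdefLL};
  for (bool Cond : {false, true})
    for (int64_t X : Vals)
      assert((X ^ (Cond ? 0 : -1)) == (Cond ? X : ~X));
  return 0;
}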
4147
4148// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4149// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4150// sets 'C' bit to 0.
4152 SDLoc DL(Value);
4153 EVT VT = Value.getValueType();
4154 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4155 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4156 SDValue Cmp =
4157 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
4158 return Cmp.getValue(1);
4159}
4160
4161// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4162// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4164 bool Invert) {
4165 assert(Glue.getResNo() == 1);
4166 SDLoc DL(Glue);
4167 SDValue Zero = DAG.getConstant(0, DL, VT);
4168 SDValue One = DAG.getConstant(1, DL, VT);
4169 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4170 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
4171 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4172}
4173
4174// Value is 1 if 'V' bit of NZCV is 1, else 0
4176 assert(Glue.getResNo() == 1);
4177 SDLoc DL(Glue);
4178 SDValue Zero = DAG.getConstant(0, DL, VT);
4179 SDValue One = DAG.getConstant(1, DL, VT);
4180 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
4181 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4182}
4183
4184// This lowering is inefficient, but it will get cleaned up by
4185// `foldOverflowCheck`
4187 unsigned Opcode, bool IsSigned) {
4188 EVT VT0 = Op.getValue(0).getValueType();
4189 EVT VT1 = Op.getValue(1).getValueType();
4190
4191 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4192 return SDValue();
4193
4194 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4195 SDValue OpLHS = Op.getOperand(0);
4196 SDValue OpRHS = Op.getOperand(1);
4197 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4198
4199 SDLoc DL(Op);
4200 SDVTList VTs = DAG.getVTList(VT0, VT1);
4201
4202 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
4203 OpRHS, OpCarryIn);
4204
4205 SDValue OutFlag =
4206 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4207 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4208
4209 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
4210}
4211
4213 // Let legalize expand this if it isn't a legal type yet.
4214 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4215 return SDValue();
4216
4217 SDLoc dl(Op);
4218 AArch64CC::CondCode CC;
4219 // The actual operation that sets the overflow or carry flag.
4220 SDValue Value, Overflow;
4221 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4222
4223 // We use 0 and 1 as false and true values.
4224 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4225 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4226
4227 // We use an inverted condition, because the conditional select is inverted
4228 // too. This will allow it to be selected to a single instruction:
4229 // CSINC Wd, WZR, WZR, invert(cond).
4230 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
4231 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
4232 CCVal, Overflow);
4233
4234 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4235 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4236}
4237
4238// Prefetch operands are:
4239// 1: Address to prefetch
4240// 2: bool isWrite
4241// 3: int locality (0 = no locality ... 3 = extreme locality)
4242// 4: bool isDataCache
4244 SDLoc DL(Op);
4245 unsigned IsWrite = Op.getConstantOperandVal(2);
4246 unsigned Locality = Op.getConstantOperandVal(3);
4247 unsigned IsData = Op.getConstantOperandVal(4);
4248
4249 bool IsStream = !Locality;
4250 // When the locality number is set
4251 if (Locality) {
4252 // The front-end should have filtered out the out-of-range values
4253 assert(Locality <= 3 && "Prefetch locality out-of-range");
4254 // The locality degree is the opposite of the cache speed.
4255 // Put the number the other way around.
4256 // The encoding starts at 0 for level 1
4257 Locality = 3 - Locality;
4258 }
4259
4260 // Build the mask value encoding the expected behavior.
4261 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4262 (!IsData << 3) | // IsDataCache bit
4263 (Locality << 1) | // Cache level bits
4264 (unsigned)IsStream; // Stream bit
4265 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4266 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4267 Op.getOperand(1));
4268}
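// Standalone illustrative sketch (not part of this file): a made-up helper
// mirroring the PrfOp packing above. Bit 4 selects load/store, bit 3 the
// data/instruction cache, bits 2:1 the target cache level (0 = L1), and bit 0
// the streaming (non-temporal) hint.
#include <cassert>

static unsigned prfOp(bool IsWrite, unsigned Locality, bool IsData) {
  bool IsStream = Locality == 0;
  if (Locality)
    Locality = 3 - Locality; // source locality 3 ("keep in L1") encodes as 0
  return (IsWrite << 4) | (!IsData << 3) | (Locality << 1) | IsStream;
}

int main() {
  assert(prfOp(/*IsWrite=*/false, /*Locality=*/3, /*IsData=*/true) == 0);   // PLDL1KEEP
  assert(prfOp(/*IsWrite=*/true, /*Locality=*/3, /*IsData=*/true) == 0x10); // PSTL1KEEP
  assert(prfOp(/*IsWrite=*/false, /*Locality=*/0, /*IsData=*/true) == 1);   // PLDL1STRM
  return 0;
}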
4269
4270SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4271 SelectionDAG &DAG) const {
4272 EVT VT = Op.getValueType();
4273 if (VT.isScalableVector())
4274 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4275
4276 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4277 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4278
4279 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4280 return SDValue();
4281}
4282
4283SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4284 SelectionDAG &DAG) const {
4285 EVT VT = Op.getValueType();
4286 if (VT.isScalableVector())
4287 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4288
4289 bool IsStrict = Op->isStrictFPOpcode();
4290 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4291 EVT SrcVT = SrcVal.getValueType();
4292 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4293
4294 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4295 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4296
4297 // Expand cases where the result type is BF16 but we don't have hardware
4298 // instructions to lower it.
4299 if (VT.getScalarType() == MVT::bf16 &&
4300 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4301 Subtarget->hasBF16())) {
4302 SDLoc dl(Op);
4303 SDValue Narrow = SrcVal;
4304 SDValue NaN;
4305 EVT I32 = SrcVT.changeElementType(MVT::i32);
4306 EVT F32 = SrcVT.changeElementType(MVT::f32);
4307 if (SrcVT.getScalarType() == MVT::f32) {
4308 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4309 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4310 if (!NeverSNaN) {
4311 // Set the quiet bit.
4312 NaN = DAG.getNode(ISD::OR, dl, I32, Narrow,
4313 DAG.getConstant(0x400000, dl, I32));
4314 }
4315 } else if (SrcVT.getScalarType() == MVT::f64) {
4316 Narrow = DAG.getNode(AArch64ISD::FCVTXN, dl, F32, Narrow);
4317 Narrow = DAG.getNode(ISD::BITCAST, dl, I32, Narrow);
4318 } else {
4319 return SDValue();
4320 }
4321 if (!Trunc) {
4322 SDValue One = DAG.getConstant(1, dl, I32);
4323 SDValue Lsb = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4324 DAG.getShiftAmountConstant(16, I32, dl));
4325 Lsb = DAG.getNode(ISD::AND, dl, I32, Lsb, One);
4326 SDValue RoundingBias =
4327 DAG.getNode(ISD::ADD, dl, I32, DAG.getConstant(0x7fff, dl, I32), Lsb);
4328 Narrow = DAG.getNode(ISD::ADD, dl, I32, Narrow, RoundingBias);
4329 }
4330
4331 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4332 // 0x80000000.
4333 if (NaN) {
4334 SDValue IsNaN = DAG.getSetCC(
4335 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4336 SrcVal, SrcVal, ISD::SETUO);
4337 Narrow = DAG.getSelect(dl, I32, IsNaN, NaN, Narrow);
4338 }
4339
4340 // Now that we have rounded, shift the bits into position.
4341 Narrow = DAG.getNode(ISD::SRL, dl, I32, Narrow,
4342 DAG.getShiftAmountConstant(16, I32, dl));
4343 if (VT.isVector()) {
4344 EVT I16 = I32.changeVectorElementType(MVT::i16);
4345 Narrow = DAG.getNode(ISD::TRUNCATE, dl, I16, Narrow);
4346 return DAG.getNode(ISD::BITCAST, dl, VT, Narrow);
4347 }
4348 Narrow = DAG.getNode(ISD::BITCAST, dl, F32, Narrow);
4349 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, dl, VT, Narrow);
4350 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, dl)
4351 : Result;
4352 }
4353
4354 if (SrcVT != MVT::f128) {
4355 // Expand cases where the input is a vector bigger than NEON.
4357 return SDValue();
4358
4359 // It's legal except when f128 is involved
4360 return Op;
4361 }
4362
4363 return SDValue();
4364}
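// Standalone illustrative sketch (not part of this file): the software
// f32->bf16 path above quietens NaNs and rounds to nearest-even by adding a
// bias of 0x7fff plus the bit that becomes the new LSB. The helper name is
// made up for the example.
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static uint16_t f32ToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if (std::isnan(F))
    return (Bits | 0x400000) >> 16; // set the quiet bit, skip rounding
  uint32_t Lsb = (Bits >> 16) & 1;  // the bit that becomes the new LSB
  Bits += 0x7fff + Lsb;             // round to nearest, ties to even
  return Bits >> 16;
}

int main() {
  assert(f32ToBF16(1.0f) == 0x3f80);
  assert(f32ToBF16(-0.0f) == 0x8000);
  assert(f32ToBF16(1.00390625f) == 0x3f80); // exact tie rounds to even
  return 0;
}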
4365
4366SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4367 SelectionDAG &DAG) const {
4368 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4369 // Any additional optimization in this function should be recorded
4370 // in the cost tables.
4371 bool IsStrict = Op->isStrictFPOpcode();
4372 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4373 EVT VT = Op.getValueType();
4374
4375 if (VT.isScalableVector()) {
4376 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4379 return LowerToPredicatedOp(Op, DAG, Opcode);
4380 }
4381
4382 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4383 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4384 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4385
4386 unsigned NumElts = InVT.getVectorNumElements();
4387
4388 // f16 conversions are promoted to f32 when full fp16 is not supported.
4389 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4390 InVT.getVectorElementType() == MVT::bf16) {
4391 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
4392 SDLoc dl(Op);
4393 if (IsStrict) {
4394 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
4395 {Op.getOperand(0), Op.getOperand(1)});
4396 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4397 {Ext.getValue(1), Ext.getValue(0)});
4398 }
4399 return DAG.getNode(
4400 Op.getOpcode(), dl, Op.getValueType(),
4401 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
4402 }
4403
4404 uint64_t VTSize = VT.getFixedSizeInBits();
4405 uint64_t InVTSize = InVT.getFixedSizeInBits();
4406 if (VTSize < InVTSize) {
4407 SDLoc dl(Op);
4408 if (IsStrict) {
4410 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
4411 {Op.getOperand(0), Op.getOperand(1)});
4412 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4413 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
4414 }
4415 SDValue Cv =
4416 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
4417 Op.getOperand(0));
4418 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
4419 }
4420
4421 if (VTSize > InVTSize) {
4422 SDLoc dl(Op);
4423 MVT ExtVT =
4424 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4425 VT.getVectorNumElements());
4426 if (IsStrict) {
4427 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
4428 {Op.getOperand(0), Op.getOperand(1)});
4429 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
4430 {Ext.getValue(1), Ext.getValue(0)});
4431 }
4432 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
4433 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
4434 }
4435
4436 // Use a scalar operation for conversions between single-element vectors of
4437 // the same size.
4438 if (NumElts == 1) {
4439 SDLoc dl(Op);
4440 SDValue Extract = DAG.getNode(
4441 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4442 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
4443 EVT ScalarVT = VT.getScalarType();
4444 if (IsStrict)
4445 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4446 {Op.getOperand(0), Extract});
4447 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4448 }
4449
4450 // Type changing conversions are illegal.
4451 return Op;
4452}
4453
4454SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4455 SelectionDAG &DAG) const {
4456 bool IsStrict = Op->isStrictFPOpcode();
4457 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4458
4459 if (SrcVal.getValueType().isVector())
4460 return LowerVectorFP_TO_INT(Op, DAG);
4461
4462 // f16 conversions are promoted to f32 when full fp16 is not supported.
4463 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4464 SrcVal.getValueType() == MVT::bf16) {
4465 SDLoc dl(Op);
4466 if (IsStrict) {
4467 SDValue Ext =
4468 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
4469 {Op.getOperand(0), SrcVal});
4470 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
4471 {Ext.getValue(1), Ext.getValue(0)});
4472 }
4473 return DAG.getNode(
4474 Op.getOpcode(), dl, Op.getValueType(),
4475 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
4476 }
4477
4478 if (SrcVal.getValueType() != MVT::f128) {
4479 // It's legal except when f128 is involved
4480 return Op;
4481 }
4482
4483 return SDValue();
4484}
4485
4486SDValue
4487AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4488 SelectionDAG &DAG) const {
4489 // AArch64 FP-to-int conversions saturate to the destination element size, so
4490 // we can lower common saturating conversions to simple instructions.
4491 SDValue SrcVal = Op.getOperand(0);
4492 EVT SrcVT = SrcVal.getValueType();
4493 EVT DstVT = Op.getValueType();
4494 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4495
4496 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4497 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4498 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4499 assert(SatWidth <= DstElementWidth &&
4500 "Saturation width cannot exceed result width");
4501
4502 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4503 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4504 // types, so this is hard to reach.
4505 if (DstVT.isScalableVector())
4506 return SDValue();
4507
4508 EVT SrcElementVT = SrcVT.getVectorElementType();
4509
4510 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4511 if ((SrcElementVT == MVT::f16 &&
4512 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4513 SrcElementVT == MVT::bf16) {
4514 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4515 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4516 SrcVT = F32VT;
4517 SrcElementVT = MVT::f32;
4518 SrcElementWidth = 32;
4519 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4520 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4521 return SDValue();
4522
4523 SDLoc DL(Op);
4524 // Expand to f64 if we are saturating to i64, to help keep the lanes the
4525 // same width and produce an fcvtzu.
4526 if (SatWidth == 64 && SrcElementWidth < 64) {
4527 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4528 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4529 SrcVT = F64VT;
4530 SrcElementVT = MVT::f64;
4531 SrcElementWidth = 64;
4532 }
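// For example, a fptoui.sat from v2f32 to v2i64 is extended to v2f64 here so
// that a single lane-matched fcvtzu can be emitted (illustrative).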
4533 // Cases that we can emit directly.
4534 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4535 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4536 DAG.getValueType(DstVT.getScalarType()));
4537
4538 // Otherwise we emit a cvt that saturates at the native element width, then
4539 // saturate the result down to the requested width. This is only valid if the
4540 // legal cvt is at least as wide as the saturate width. For double, as we don't
4541 // have MIN/MAX, it can be simpler to scalarize (at least until sqxtn is selected).
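// As an illustrative walk-through of the path below (assuming full fp16): a
// fptosi.sat from v4f16 to v4i8 first converts with the native i16-saturating
// fcvtzs to v4i16, clamps that result to [-128, 127] with SMIN/SMAX, and then
// truncates to v4i8.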
4542 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4543 return SDValue();
4544
4545 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4546 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4547 DAG.getValueType(IntVT.getScalarType()));
4548 SDValue Sat;
4549 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4550 SDValue MinC = DAG.getConstant(
4551 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4552 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4553 SDValue MaxC = DAG.getConstant(
4554 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4555 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4556 } else {
4557 SDValue MinC = DAG.getConstant(
4558 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4559 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4560 }
4561
4562 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4563}
4564
4565SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
4566 SelectionDAG &DAG) const {
4567 // AArch64 FP-to-int conversions saturate to the destination register size, so
4568 // we can lower common saturating conversions to simple instructions.
4569 SDValue SrcVal = Op.getOperand(0);
4570 EVT SrcVT = SrcVal.getValueType();
4571
4572 if (SrcVT.isVector())
4573 return LowerVectorFP_TO_INT_SAT(Op, DAG);
4574
4575 EVT DstVT = Op.getValueType();
4576 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4577 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4578 uint64_t DstWidth = DstVT.getScalarSizeInBits();
4579 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
4580
4581 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4582 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
4583 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
4584 SrcVT = MVT::f32;
4585 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
4586 SrcVT != MVT::bf16)
4587 return SDValue();
4588
4589 SDLoc DL(Op);
4590 // Cases that we can emit directly.
4591 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
4592 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
4593 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
4594 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4595 DAG.getValueType(DstVT));
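// For example, a scalar llvm.fptosi.sat.i32.f32 with an i32 saturation width
// matches the direct case above and is expected to select to a single
// saturating fcvtzs (illustrative; final selection happens later).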
4596
4597 // Otherwise we emit a cvt that saturates at the (wider) destination width, then
4598 // saturate the result down to the requested width. This is only valid if the
4599 // legal cvt is at least as wide as the saturate width.
4600 if (DstWidth < SatWidth)
4601 return SDValue();
4602
4603 SDValue NativeCvt =
4604 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
4605 SDValue Sat;
4606 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4607 SDValue MinC = DAG.getConstant(
4608 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
4609 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
4610 SDValue MaxC = DAG.getConstant(
4611 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
4612 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
4613 } else {
4614 SDValue MinC = DAG.getConstant(
4615 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
4616 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
4617 }
4618
4619 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
4620}
4621
4622SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
4623 SelectionDAG &DAG) const {
4624 EVT VT = Op.getValueType();
4625 SDValue Src = Op.getOperand(0);
4626 SDLoc DL(Op);
4627
4628 assert(VT.isVector() && "Expected vector type");
4629
4630 EVT CastVT =
4631 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
4632
4633 // Round the floating-point value to an integral value, keeping it in a
4634 // floating-point register, using the current rounding mode.
4635 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
4636
4637 // Truncate the rounded floating point to an integer.
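// (Conceptually, each lane becomes roughly "(iN)rint(x)", with FP_TO_SINT_SAT
// clamping any out-of-range result to the lane's limits; this is an
// illustrative scalar model, not the exact node semantics.)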
4638 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
4640}
4641
4642SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
4643 SelectionDAG &DAG) const {
4644 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4645 // Any additional optimization in this function should be recorded
4646 // in the cost tables.
4647 bool IsStrict = Op->isStrictFPOpcode();
4648 EVT VT = Op.getValueType();
4649 SDLoc dl(Op);
4650 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
4651 EVT InVT = In.getValueType();
4652 unsigned Opc = Op.getOpcode();
4653 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
4654
4655 if (VT.isScalableVector()) {
4656 if (InVT.getVectorElementType() == MVT::i1) {
4657 // We can't convert directly from an SVE predicate; extend it to an integer first.
4658 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4659 EVT CastVT = getPromotedVTForPredicate(InVT);
4660 In = DAG.getNode(CastOpc, dl, CastVT, In);
4661 return DAG.getNode(Opc, dl, VT, In);
4662 }
4663
4664 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
4666 return LowerToPredicatedOp(Op, DAG, Opcode);
4667 }
4668
4669 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4670 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4671 return LowerFixedLengthIntToFPToSVE(Op, DAG);
4672
4673 // Promote bf16 conversions to f32.
4674 if (VT.getVectorElementType() == MVT::bf16) {
4675 EVT F32 = VT.changeElementType(MVT::f32);
4676 if (IsStrict) {
4677 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {F32, MVT::Other},
4678 {Op.getOperand(0), In});
4679 return DAG.getNode(
4680 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4681 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4682 }
4683 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4684 DAG.getNode(Op.getOpcode(), dl, F32, In),
4685 DAG.getIntPtrConstant(0, dl));
4686 }
4687
4688 uint64_t VTSize = VT.getFixedSizeInBits();
4689 uint64_t InVTSize = InVT.getFixedSizeInBits();
4690 if (VTSize < InVTSize) {
4691 MVT CastVT =
4693 InVT.getVectorNumElements());
4694 if (IsStrict) {
4695 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4696 {Op.getOperand(0), In});
4697 return DAG.getNode(
4698 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4699 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4700 }
4701 In = DAG.getNode(Opc, dl, CastVT, In);
4702 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4703 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4704 }
4705
4706 if (VTSize > InVTSize) {
4707 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4709 In = DAG.getNode(CastOpc, dl, CastVT, In);
4710 if (IsStrict)
4711 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4712 return DAG.getNode(Opc, dl, VT, In);
4713 }
4714
4715 // Use a scalar operation for conversions between single-element vectors of
4716 // the same size.
4717 if (VT.getVectorNumElements() == 1) {
4718 SDValue Extract = DAG.getNode(
4720 In, DAG.getConstant(0, dl, MVT::i64));
4721 EVT ScalarVT = VT.getScalarType();
4722 if (IsStrict)
4723 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4724 {Op.getOperand(0), Extract});
4725 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4726 }
4727
4728 return Op;
4729}
4730
4731SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4732 SelectionDAG &DAG) const {
4733 if (Op.getValueType().isVector())
4734 return LowerVectorINT_TO_FP(Op, DAG);
4735
4736 bool IsStrict = Op->isStrictFPOpcode();
4737 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4738
4739 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
4740 Op->getOpcode() == ISD::SINT_TO_FP;
4741
4742 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
4743 SDLoc dl(Op);
4744 if (IsStrict) {
4745 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {PromoteVT, MVT::Other},
4746 {Op.getOperand(0), SrcVal});
4747 return DAG.getNode(
4748 ISD::STRICT_FP_ROUND, dl, {Op.getValueType(), MVT::Other},
4749 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4750 }
4751 return DAG.getNode(ISD::FP_ROUND, dl, Op.getValueType(),
4752 DAG.getNode(Op.getOpcode(), dl, PromoteVT, SrcVal),
4753 DAG.getIntPtrConstant(0, dl));
4754 };
4755
4756 if (Op.getValueType() == MVT::bf16) {
4757 unsigned MaxWidth = IsSigned
4758 ? DAG.ComputeMaxSignificantBits(SrcVal)
4759 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
4760 // bf16 conversions are promoted to f32 when converting from i16.
4761 if (MaxWidth <= 24) {
4762 return IntToFpViaPromotion(MVT::f32);
4763 }
4764
4765 // bf16 conversions are promoted to f64 when converting from i32.
4766 if (MaxWidth <= 53) {
4767 return IntToFpViaPromotion(MVT::f64);
4768 }
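// (24 and 53 are the significand widths of f32 and f64 respectively: an
// integer with at most that many significant bits converts to the wider type
// exactly, so only the final round down to bf16 can lose precision.)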
4769
4770 // We need to be careful about i64 -> bf16.
4771 // Consider the value 22216703.
4772 // This number cannot be represented exactly as an f32, so an itofp will
4773 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
4774 // However, the correct bf16 result is 22151168.0.
4775 // We need to use sticky rounding to get this correct.
4776 if (SrcVal.getValueType() == MVT::i64) {
4777 SDLoc DL(Op);
4778 // This algorithm is equivalent to the following:
4779 // uint64_t SrcHi = SrcVal & ~0xfffull;
4780 // uint64_t SrcLo = SrcVal & 0xfffull;
4781 // uint64_t Highest = SrcVal >> 53;
4782 // bool HasHighest = Highest != 0;
4783 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
4784 // double Rounded = static_cast<double>(ToRound);
4785 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
4786 // uint64_t HasLo = SrcLo != 0;
4787 // bool NeedsAdjustment = HasHighest & HasLo;
4788 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
4789 // double Adjusted = std::bit_cast<double>(AdjustedBits);
4790 // return static_cast<__bf16>(Adjusted);
4791 //
4792 // Essentially, what happens is that SrcVal either fits perfectly in a
4793 // double-precision value or it is too big. If it is sufficiently small,
4794 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
4795 // ensure that u64 -> double has no rounding error by only using the 52
4796 // MSB of the input. The low order bits will get merged into a sticky bit
4797 // which will avoid issues incurred by double rounding.
4798
4799 // Signed conversion is more or less like so:
4800 // copysign((__bf16)abs(SrcVal), SrcVal)
4801 SDValue SignBit;
4802 if (IsSigned) {
4803 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4804 DAG.getConstant(1ull << 63, DL, MVT::i64));
4805 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
4806 }
4807 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4808 DAG.getConstant(~0xfffull, DL, MVT::i64));
4809 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
4810 DAG.getConstant(0xfffull, DL, MVT::i64));
4812 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
4813 DAG.getShiftAmountConstant(53, MVT::i64, DL));
4814 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
4815 SDValue ToRound =
4816 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
4817 SDValue Rounded =
4818 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
4819 {Op.getOperand(0), ToRound})
4820 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
4821
4822 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
4823 if (SignBit) {
4824 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
4825 }
4826
4827 SDValue HasHighest = DAG.getSetCC(
4828 DL,
4829 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4830 Highest, Zero64, ISD::SETNE);
4831
4832 SDValue HasLo = DAG.getSetCC(
4833 DL,
4834 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
4835 SrcLo, Zero64, ISD::SETNE);
4836
4837 SDValue NeedsAdjustment =
4838 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
4839 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
4840
4841 SDValue AdjustedBits =
4842 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
4843 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
4844 return IsStrict
4846 {Op.getValueType(), MVT::Other},
4847 {Rounded.getValue(1), Adjusted,
4848 DAG.getIntPtrConstant(0, DL)})
4849 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
4850 DAG.getIntPtrConstant(0, DL, true));
4851 }
4852 }
4853
4854 // f16 conversions are promoted to f32 when full fp16 is not supported.
4855 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4856 return IntToFpViaPromotion(MVT::f32);
4857 }
4858
4859 // i128 conversions are libcalls.
4860 if (SrcVal.getValueType() == MVT::i128)
4861 return SDValue();
4862
4863 // Other conversions are legal, unless it's to the completely software-based
4864 // fp128.
4865 if (Op.getValueType() != MVT::f128)
4866 return Op;
4867 return SDValue();
4868}
4869
4870SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4871 SelectionDAG &DAG) const {
4872 // For iOS, we want to call an alternative entry point: __sincos_stret,
4873 // which returns the values in two S / D registers.
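// As an illustrative sketch (assuming the usual Darwin runtime interface),
// the f32 variant conceptually behaves like:
//   struct { float Sin, Cos; } __sincosf_stret(float);
// with both fields coming back in floating-point registers.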
4874 SDLoc dl(Op);
4875 SDValue Arg = Op.getOperand(0);
4876 EVT ArgVT = Arg.getValueType();
4877 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4878
4880 ArgListEntry Entry;
4881
4882 Entry.Node = Arg;
4883 Entry.Ty = ArgTy;
4884 Entry.IsSExt = false;
4885 Entry.IsZExt = false;
4886 Args.push_back(Entry);
4887
4888 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4889 : RTLIB::SINCOS_STRET_F32;
4890 const char *LibcallName = getLibcallName(LC);
4891 SDValue Callee =
4892 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4893
4894 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4896 CLI.setDebugLoc(dl)
4897 .setChain(DAG.getEntryNode())
4898 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4899
4900 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4901 return CallResult.first;
4902}
4903
4904static MVT getSVEContainerType(EVT ContentTy);
4905
4906SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4907 SelectionDAG &DAG) const {
4908 EVT OpVT = Op.getValueType();
4909 EVT ArgVT = Op.getOperand(0).getValueType();
4910
4912 return LowerFixedLengthBitcastToSVE(Op, DAG);
4913
4914 if (OpVT.isScalableVector()) {
4915 // Bitcasting between unpacked vector types of different element counts is
4916 // not a NOP because the live elements are laid out differently.
4917 // 01234567
4918 // e.g. nxv2i32 = XX??XX??
4919 // nxv4f16 = X?X?X?X?
4920 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4921 return SDValue();
4922
4923 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4924 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4925 "Expected int->fp bitcast!");
4926 SDValue ExtResult =
4928 Op.getOperand(0));
4929 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4930 }
4931 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4932 }
4933
4934 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4935 return SDValue();
4936
4937 // Bitcasts between f16 and bf16 are legal.
4938 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4939 return Op;
4940
4941 assert(ArgVT == MVT::i16);
4942 SDLoc DL(Op);
4943
4944 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4945 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
4946 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
4947}
4948
4949static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4950 if (OrigVT.getSizeInBits() >= 64)
4951 return OrigVT;
4952
4953 assert(OrigVT.isSimple() && "Expecting a simple value type");
4954
4955 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4956 switch (OrigSimpleTy) {
4957 default: llvm_unreachable("Unexpected Vector Type");
4958 case MVT::v2i8:
4959 case MVT::v2i16:
4960 return MVT::v2i32;
4961 case MVT::v4i8:
4962 return MVT::v4i16;
4963 }
4964}
4965
4967 const EVT &OrigTy,
4968 const EVT &ExtTy,
4969 unsigned ExtOpcode) {
4970 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4971 // We expect the ExtTy to be 128 bits total. If the OrigTy is less than
4972 // 64 bits, we need to insert a new extension so that it will be 64 bits.
4973 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4974 if (OrigTy.getSizeInBits() >= 64)
4975 return N;
4976
4977 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4978 EVT NewVT = getExtensionTo64Bits(OrigTy);
4979
4980 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4981}
4982
4983// Returns lane if Op extracts from a two-element vector and lane is constant
4984// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
4985static std::optional<uint64_t>
4987 SDNode *OpNode = Op.getNode();
4988 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4989 return std::nullopt;
4990
4991 EVT VT = OpNode->getOperand(0).getValueType();
4992 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4993 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
4994 return std::nullopt;
4995
4996 return C->getZExtValue();
4997}
4998
5000 bool isSigned) {
5001 EVT VT = N.getValueType();
5002
5003 if (N.getOpcode() != ISD::BUILD_VECTOR)
5004 return false;
5005
5006 for (const SDValue &Elt : N->op_values()) {
5007 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5008 unsigned EltSize = VT.getScalarSizeInBits();
5009 unsigned HalfSize = EltSize / 2;
5010 if (isSigned) {
5011 if (!isIntN(HalfSize, C->getSExtValue()))
5012 return false;
5013 } else {
5014 if (!isUIntN(HalfSize, C->getZExtValue()))
5015 return false;
5016 }
5017 continue;
5018 }
5019 return false;
5020 }
5021
5022 return true;
5023}
5024
5026 EVT VT = N.getValueType();
5027 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5028
5029 unsigned NumElts = VT.getVectorNumElements();
5030 unsigned OrigEltSize = VT.getScalarSizeInBits();
5031 unsigned EltSize = OrigEltSize / 2;
5032 MVT TruncVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
5033
5034 APInt HiBits = APInt::getHighBitsSet(OrigEltSize, EltSize);
5035 if (DAG.MaskedValueIsZero(N, HiBits))
5036 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), TruncVT, N);
5037
5038 if (ISD::isExtOpcode(N.getOpcode()))
5039 return addRequiredExtensionForVectorMULL(N.getOperand(0), DAG,
5040 N.getOperand(0).getValueType(), VT,
5041 N.getOpcode());
5042
5043 assert(N.getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
5044 SDLoc dl(N);
5046 for (unsigned i = 0; i != NumElts; ++i) {
5047 const APInt &CInt = N.getConstantOperandAPInt(i);
5048 // Element types smaller than 32 bits are not legal, so use i32 elements.
5049 // The values are implicitly truncated so sext vs. zext doesn't matter.
5050 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
5051 }
5052 return DAG.getBuildVector(TruncVT, dl, Ops);
5053}
5054
5056 return N.getOpcode() == ISD::SIGN_EXTEND ||
5057 N.getOpcode() == ISD::ANY_EXTEND ||
5058 isExtendedBUILD_VECTOR(N, DAG, true);
5059}
5060
5062 return N.getOpcode() == ISD::ZERO_EXTEND ||
5063 N.getOpcode() == ISD::ANY_EXTEND ||
5064 isExtendedBUILD_VECTOR(N, DAG, false);
5065}
5066
5068 unsigned Opcode = N.getOpcode();
5069 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5070 SDValue N0 = N.getOperand(0);
5071 SDValue N1 = N.getOperand(1);
5072 return N0->hasOneUse() && N1->hasOneUse() &&
5073 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5074 }
5075 return false;
5076}
5077
5079 unsigned Opcode = N.getOpcode();
5080 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5081 SDValue N0 = N.getOperand(0);
5082 SDValue N1 = N.getOperand(1);
5083 return N0->hasOneUse() && N1->hasOneUse() &&
5084 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5085 }
5086 return false;
5087}
5088
5089SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5090 SelectionDAG &DAG) const {
5091 // The rounding mode is in bits 23:22 of the FPCR.
5092 // The AArch64 rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5093 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
5094 // so that the shift and the AND get folded into a bitfield extract.
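// As a scalar sketch of the formula above (illustrative only):
//   unsigned FltRounds = ((FPCR + (1u << 22)) >> 22) & 3;
//   // e.g. RMode == 0b11 (round toward zero) gives (3 + 1) & 3 == 0, the
//   // FLT_ROUNDS value for round-toward-zero.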
5095 SDLoc dl(Op);
5096
5097 SDValue Chain = Op.getOperand(0);
5098 SDValue FPCR_64 = DAG.getNode(
5099 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
5100 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
5101 Chain = FPCR_64.getValue(1);
5102 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
5103 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
5104 DAG.getConstant(1U << 22, dl, MVT::i32));
5105 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
5106 DAG.getConstant(22, dl, MVT::i32));
5107 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
5108 DAG.getConstant(3, dl, MVT::i32));
5109 return DAG.getMergeValues({AND, Chain}, dl);
5110}
5111
5112SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5113 SelectionDAG &DAG) const {
5114 SDLoc DL(Op);
5115 SDValue Chain = Op->getOperand(0);
5116 SDValue RMValue = Op->getOperand(1);
5117
5118 // The rounding mode is in bits 23:22 of the FPCR.
5119 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5120 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5121 // (((arg - 1) & 3) << 22).
5122 //
5123 // The argument of llvm.set.rounding must be within the range [0, 3], so
5124 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5125 // code that generated llvm.set.rounding to ensure this condition.
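// As a scalar sketch of the mapping above (illustrative only):
//   FPCRField = ((Arg - 1) & 3) << 22;
//   // e.g. Arg == 1 (to nearest)  -> RMode 0b00
//   //      Arg == 0 (toward zero) -> RMode 0b11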
5126
5127 // Calculate new value of FPCR[23:22].
5128 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5129 DAG.getConstant(1, DL, MVT::i32));
5130 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5131 DAG.getConstant(0x3, DL, MVT::i32));
5132 RMValue =
5133 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5134 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5135 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5136
5137 // Get current value of FPCR.
5138 SDValue Ops[] = {
5139 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5140 SDValue FPCR =
5141 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5142 Chain = FPCR.getValue(1);
5143 FPCR = FPCR.getValue(0);
5144
5145 // Put new rounding mode into FPCR[23:22].
5146 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5147 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5148 DAG.getConstant(RMMask, DL, MVT::i64));
5149 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5150 SDValue Ops2[] = {
5151 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5152 FPCR};
5153 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5154}
5155
5156SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5157 SelectionDAG &DAG) const {
5158 SDLoc DL(Op);
5159 SDValue Chain = Op->getOperand(0);
5160
5161 // Get current value of FPCR.
5162 SDValue Ops[] = {
5163 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5164 SDValue FPCR =
5165 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5166 Chain = FPCR.getValue(1);
5167 FPCR = FPCR.getValue(0);
5168
5169 // Truncate FPCR to 32 bits.
5170 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5171
5172 return DAG.getMergeValues({Result, Chain}, DL);
5173}
5174
5175SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5176 SelectionDAG &DAG) const {
5177 SDLoc DL(Op);
5178 SDValue Chain = Op->getOperand(0);
5179 SDValue Mode = Op->getOperand(1);
5180
5181 // Extend the specified value to 64 bits.
5182 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5183
5184 // Set new value of FPCR.
5185 SDValue Ops2[] = {
5186 Chain, DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64), FPCR};
5187 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5188}
5189
5190SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5191 SelectionDAG &DAG) const {
5192 SDLoc DL(Op);
5193 SDValue Chain = Op->getOperand(0);
5194
5195 // Get current value of FPCR.
5196 SDValue Ops[] = {
5197 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5198 SDValue FPCR =
5199 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5200 Chain = FPCR.getValue(1);
5201 FPCR = FPCR.getValue(0);
5202
5203 // Clear bits that are not reserved.
5204 SDValue FPSCRMasked = DAG.getNode(
5205 ISD::AND, DL, MVT::i64, FPCR,
5207
5208 // Set new value of FPCR.
5209 SDValue Ops2[] = {Chain,
5210 DAG.getConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5211 FPSCRMasked};
5212 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5213}
5214
5215static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5216 SDLoc DL, bool &IsMLA) {
5217 bool IsN0SExt = isSignExtended(N0, DAG);
5218 bool IsN1SExt = isSignExtended(N1, DAG);
5219 if (IsN0SExt && IsN1SExt)
5220 return AArch64ISD::SMULL;
5221
5222 bool IsN0ZExt = isZeroExtended(N0, DAG);
5223 bool IsN1ZExt = isZeroExtended(N1, DAG);
5224
5225 if (IsN0ZExt && IsN1ZExt)
5226 return AArch64ISD::UMULL;
5227
5228 // Select SMULL if we can replace zext with sext.
5229 if (((IsN0SExt && IsN1ZExt) || (IsN0ZExt && IsN1SExt)) &&
5230 !isExtendedBUILD_VECTOR(N0, DAG, false) &&
5231 !isExtendedBUILD_VECTOR(N1, DAG, false)) {
5232 SDValue ZextOperand;
5233 if (IsN0ZExt)
5234 ZextOperand = N0.getOperand(0);
5235 else
5236 ZextOperand = N1.getOperand(0);
5237 if (DAG.SignBitIsZero(ZextOperand)) {
5238 SDValue NewSext =
5239 DAG.getSExtOrTrunc(ZextOperand, DL, N0.getValueType());
5240 if (IsN0ZExt)
5241 N0 = NewSext;
5242 else
5243 N1 = NewSext;
5244 return AArch64ISD::SMULL;
5245 }
5246 }
5247
5248 // Select UMULL if we can replace the other operand with an extend.
5249 if (IsN0ZExt || IsN1ZExt) {
5250 EVT VT = N0.getValueType();
5252 VT.getScalarSizeInBits() / 2);
5253 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5254 return AArch64ISD::UMULL;
5255 }
5256
5257 if (!IsN1SExt && !IsN1ZExt)
5258 return 0;
5259
5260 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5261 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5262 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5263 IsMLA = true;
5264 return AArch64ISD::SMULL;
5265 }
5266 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5267 IsMLA = true;
5268 return AArch64ISD::UMULL;
5269 }
5270 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5271 std::swap(N0, N1);
5272 IsMLA = true;
5273 return AArch64ISD::UMULL;
5274 }
5275 return 0;
5276}
5277
5278SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5279 EVT VT = Op.getValueType();
5280
5281 bool OverrideNEON = !Subtarget->isNeonAvailable();
5282 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5283 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5284
5285 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5286 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5287 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5288 "unexpected type for custom-lowering ISD::MUL");
5289 SDValue N0 = Op.getOperand(0);
5290 SDValue N1 = Op.getOperand(1);
5291 bool isMLA = false;
5292 EVT OVT = VT;
5293 if (VT.is64BitVector()) {
5294 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5295 isNullConstant(N0.getOperand(1)) &&
5297 isNullConstant(N1.getOperand(1))) {
5298 N0 = N0.getOperand(0);
5299 N1 = N1.getOperand(0);
5300 VT = N0.getValueType();
5301 } else {
5302 if (VT == MVT::v1i64) {
5303 if (Subtarget->hasSVE())
5304 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5305 // Fall through to expand this. It is not legal.
5306 return SDValue();
5307 } else
5308 // Other vector multiplications are legal.
5309 return Op;
5310 }
5311 }
5312
5313 SDLoc DL(Op);
5314 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5315
5316 if (!NewOpc) {
5317 if (VT.getVectorElementType() == MVT::i64) {
5318 // If SVE is available then i64 vector multiplications can also be made
5319 // legal.
5320 if (Subtarget->hasSVE())
5321 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5322 // Fall through to expand this. It is not legal.
5323 return SDValue();
5324 } else
5325 // Other vector multiplications are legal.
5326 return Op;
5327 }
5328
5329 // Legalize to a S/UMULL instruction
5330 SDValue Op0;
5331 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5332 if (!isMLA) {
5333 Op0 = skipExtensionForVectorMULL(N0, DAG);
5335 Op1.getValueType().is64BitVector() &&
5336 "unexpected types for extended operands to VMULL");
5337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5338 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5339 DAG.getConstant(0, DL, MVT::i64));
5340 }
5341 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
5342 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
5343 // This holds for CPUs with accumulate forwarding such as Cortex-A53/A57.
5346 EVT Op1VT = Op1.getValueType();
5347 return DAG.getNode(
5349 DAG.getNode(N0.getOpcode(), DL, VT,
5350 DAG.getNode(NewOpc, DL, VT,
5351 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5352 DAG.getNode(NewOpc, DL, VT,
5353 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5354 DAG.getConstant(0, DL, MVT::i64));
5355}
5356
5357static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5358 int Pattern) {
5359 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
5360 return DAG.getConstant(1, DL, MVT::nxv1i1);
5361 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5362 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5363}
5364
5366 bool IsSigned, bool IsEqual) {
5367 if (!isa<ConstantSDNode>(Op.getOperand(1)) ||
5368 !isa<ConstantSDNode>(Op.getOperand(2)))
5369 return SDValue();
5370
5371 SDLoc dl(Op);
5372 APInt X = Op.getConstantOperandAPInt(1);
5373 APInt Y = Op.getConstantOperandAPInt(2);
5374 bool Overflow;
5375 APInt NumActiveElems =
5376 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5377
5378 if (Overflow)
5379 return SDValue();
5380
5381 if (IsEqual) {
5382 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5383 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5384 : NumActiveElems.uadd_ov(One, Overflow);
5385 if (Overflow)
5386 return SDValue();
5387 }
5388
5389 std::optional<unsigned> PredPattern =
5391 unsigned MinSVEVectorSize = std::max(
5393 unsigned ElementSize = 128 / Op.getValueType().getVectorMinNumElements();
5394 if (PredPattern != std::nullopt &&
5395 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5396 return getPTrue(DAG, dl, Op.getValueType(), *PredPattern);
5397
5398 return SDValue();
5399}
5400
5401// Returns a safe bitcast between two scalable vector predicates, where
5402// any newly created lanes from a widening bitcast are defined as zero.
5404 SDLoc DL(Op);
5405 EVT InVT = Op.getValueType();
5406
5407 assert(InVT.getVectorElementType() == MVT::i1 &&
5408 VT.getVectorElementType() == MVT::i1 &&
5409 "Expected a predicate-to-predicate bitcast");
5411 InVT.isScalableVector() &&
5412 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5413 "Only expect to cast between legal scalable predicate types!");
5414
5415 // Return the operand if the cast isn't changing type,
5416 // e.g. <n x 16 x i1> -> <n x 16 x i1>
5417 if (InVT == VT)
5418 return Op;
5419
5420 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5421
5422 // We only have to zero the lanes if new lanes are being defined, e.g. when
5423 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5424 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5425 // we can return here.
5426 if (InVT.bitsGT(VT))
5427 return Reinterpret;
5428
5429 // Check if the other lanes are already known to be zeroed by
5430 // construction.
5432 return Reinterpret;
5433
5434 // Zero the newly introduced lanes.
5435 SDValue Mask = DAG.getConstant(1, DL, InVT);
5436 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5437 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5438}
5439
5440SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5441 SDValue Chain, SDLoc DL,
5442 EVT VT) const {
5443 SDValue Callee = DAG.getExternalSymbol("__arm_sme_state",
5445 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5446 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5449 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5451 RetTy, Callee, std::move(Args));
5452 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5453 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5454 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5455 Mask);
5456}
5457
5458// Lower an SME LDR/STR ZA intrinsic
5459// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5460// folded into the instruction
5461// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5462// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5463// and tile slice registers
5464// ldr(%tileslice, %ptr, %vecnum)
5465// ->
5466// %svl = rdsvl
5467// %ptr2 = %ptr + %svl * %vecnum
5468// %tileslice2 = %tileslice + %vecnum
5469// ldr [%tileslice2, 0], [%ptr2, 0]
5470 // Case 3: If the vecnum is an immediate out of range, then the same is done as
5471 // case 2, but the base and slice registers are modified by the greatest
5472 // multiple of 16 not exceeding the vecnum and the remainder is folded into the
5473 // instruction. This means that successive loads and stores that are offset from
5474 // each other can share the same base and slice register updates.
5475 // ldr(%tileslice, %ptr, 22)
5476 // ldr(%tileslice, %ptr, 23)
5477 // ->
5478 // %svl = rdsvl
5479 // %ptr2 = %ptr + %svl * 16
5480 // %tileslice2 = %tileslice + 16
5481 // ldr [%tileslice2, 6], [%ptr2, 6]
5482 // ldr [%tileslice2, 7], [%ptr2, 7]
5483// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5484// operand and the immediate can be folded into the instruction, like case 2.
5485// ldr(%tileslice, %ptr, %vecnum + 7)
5486// ldr(%tileslice, %ptr, %vecnum + 8)
5487// ->
5488// %svl = rdsvl
5489// %ptr2 = %ptr + %svl * %vecnum
5490// %tileslice2 = %tileslice + %vecnum
5491// ldr [%tileslice2, 7], [%ptr2, 7]
5492// ldr [%tileslice2, 8], [%ptr2, 8]
5493// Case 5: The vecnum being an add of an immediate out of range is also handled,
5494// in which case the same remainder logic as case 3 is used.
5496 SDLoc DL(N);
5497
5498 SDValue TileSlice = N->getOperand(2);
5499 SDValue Base = N->getOperand(3);
5500 SDValue VecNum = N->getOperand(4);
5501 int32_t ConstAddend = 0;
5502 SDValue VarAddend = VecNum;
5503
5504 // If the vnum is an add of an immediate, we can fold it into the instruction
5505 if (VecNum.getOpcode() == ISD::ADD &&
5506 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5507 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5508 VarAddend = VecNum.getOperand(0);
5509 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5510 ConstAddend = ImmNode->getSExtValue();
5511 VarAddend = SDValue();
5512 }
5513
5514 int32_t ImmAddend = ConstAddend % 16;
5515 if (int32_t C = (ConstAddend - ImmAddend)) {
5516 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5517 VarAddend = VarAddend
5518 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
5519 : CVal;
5520 }
5521
5522 if (VarAddend) {
5523 // Get the vector length that will be multiplied by vnum
5524 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
5525 DAG.getConstant(1, DL, MVT::i32));
5526
5527 // Multiply SVL and vnum then add it to the base
5528 SDValue Mul = DAG.getNode(
5529 ISD::MUL, DL, MVT::i64,
5530 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
5531 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
5532 // Just add vnum to the tileslice
5533 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
5534 }
5535
5537 DL, MVT::Other,
5538 {/*Chain=*/N.getOperand(0), TileSlice, Base,
5539 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
5540}
5541
5542SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
5543 SelectionDAG &DAG) const {
5544 unsigned IntNo = Op.getConstantOperandVal(1);
5545 SDLoc DL(Op);
5546 switch (IntNo) {
5547 default:
5548 return SDValue(); // Don't custom lower most intrinsics.
5549 case Intrinsic::aarch64_prefetch: {
5550 SDValue Chain = Op.getOperand(0);
5551 SDValue Addr = Op.getOperand(2);
5552
5553 unsigned IsWrite = Op.getConstantOperandVal(3);
5554 unsigned Locality = Op.getConstantOperandVal(4);
5555 unsigned IsStream = Op.getConstantOperandVal(5);
5556 unsigned IsData = Op.getConstantOperandVal(6);
5557 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
5558 (!IsData << 3) | // IsDataCache bit
5559 (Locality << 1) | // Cache level bits
5560 (unsigned)IsStream; // Stream bit
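// For example (illustrative), IsWrite=1, IsData=1, Locality=0, IsStream=0
// encodes PrfOp 0b10000, i.e. a PSTL1KEEP hint in the PRFM operand encoding.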
5561
5562 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
5563 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
5564 }
5565 case Intrinsic::aarch64_sme_str:
5566 case Intrinsic::aarch64_sme_ldr: {
5567 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
5568 }
5569 case Intrinsic::aarch64_sme_za_enable:
5570 return DAG.getNode(
5571 AArch64ISD::SMSTART, DL, MVT::Other,
5572 Op->getOperand(0), // Chain
5573 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5574 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5575 case Intrinsic::aarch64_sme_za_disable:
5576 return DAG.getNode(
5577 AArch64ISD::SMSTOP, DL, MVT::Other,
5578 Op->getOperand(0), // Chain
5579 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
5580 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
5581 }
5582}
5583
5584SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5585 SelectionDAG &DAG) const {
5586 unsigned IntNo = Op.getConstantOperandVal(1);
5587 SDLoc DL(Op);
5588 switch (IntNo) {
5589 default:
5590 return SDValue(); // Don't custom lower most intrinsics.
5591 case Intrinsic::aarch64_mops_memset_tag: {
5592 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
5593 SDValue Chain = Node->getChain();
5594 SDValue Dst = Op.getOperand(2);
5595 SDValue Val = Op.getOperand(3);
5596 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
5597 SDValue Size = Op.getOperand(4);
5598 auto Alignment = Node->getMemOperand()->getAlign();
5599 bool IsVol = Node->isVolatile();
5600 auto DstPtrInfo = Node->getPointerInfo();
5601
5602 const auto &SDI =
5603 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
5604 SDValue MS =
5605 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
5606 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
5607
5608 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
5609 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
5610 // LowerOperationWrapper will complain that the number of results has
5611 // changed.
5612 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
5613 }
5614 }
5615}
5616
5617SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5618 SelectionDAG &DAG) const {
5619 unsigned IntNo = Op.getConstantOperandVal(0);
5620 SDLoc dl(Op);
5621 switch (IntNo) {
5622 default: return SDValue(); // Don't custom lower most intrinsics.
5623 case Intrinsic::thread_pointer: {
5624 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5625 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
5626 }
5627 case Intrinsic::aarch64_neon_abs: {
5628 EVT Ty = Op.getValueType();
5629 if (Ty == MVT::i64) {
5630 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
5631 Op.getOperand(1));
5632 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
5633 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
5634 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
5635 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
5636 } else {
5637 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
5638 }
5639 }
5640 case Intrinsic::aarch64_neon_pmull64: {
5641 SDValue LHS = Op.getOperand(1);
5642 SDValue RHS = Op.getOperand(2);
5643
5644 std::optional<uint64_t> LHSLane =
5646 std::optional<uint64_t> RHSLane =
5648
5649 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
5650 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
5651
5652 // 'aarch64_neon_pmull64' takes i64 parameters, while pmull/pmull2
5653 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
5654 // which ISel recognizes better. For example, generate an ldr into d*
5655 // registers as opposed to a GPR load followed by an fmov.
5656 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
5657 std::optional<uint64_t> OtherLane,
5658 const SDLoc &dl,
5659 SelectionDAG &DAG) -> SDValue {
5660 // If the operand is a higher half itself, rewrite it to
5661 // extract_high_v2i64; this way aarch64_neon_pmull64 can
5662 // reuse the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
5663 if (NLane && *NLane == 1)
5664 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5665 N.getOperand(0), DAG.getConstant(1, dl, MVT::i64));
5666
5667 // Operand N is not a higher half but the other operand is.
5668 if (OtherLane && *OtherLane == 1) {
5669 // If this operand is a lower half, rewrite it to
5670 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
5671 // align lanes of two operands. A roundtrip sequence (to move from lane
5672 // 1 to lane 0) is like this:
5673 // mov x8, v0.d[1]
5674 // fmov d0, x8
5675 if (NLane && *NLane == 0)
5676 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i64,
5677 DAG.getNode(AArch64ISD::DUPLANE64, dl, MVT::v2i64,
5678 N.getOperand(0),
5679 DAG.getConstant(0, dl, MVT::i64)),
5680 DAG.getConstant(1, dl, MVT::i64));
5681
5682 // Otherwise just dup from main to all lanes.
5683 return DAG.getNode(AArch64ISD::DUP, dl, MVT::v1i64, N);
5684 }
5685
5686 // Neither operand is an extract of the higher half, so codegen may just use
5687 // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
5688 assert(N.getValueType() == MVT::i64 &&
5689 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
5690 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, N);
5691 };
5692
5693 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, dl, DAG);
5694 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, dl, DAG);
5695
5696 return DAG.getNode(AArch64ISD::PMULL, dl, Op.getValueType(), LHS, RHS);
5697 }
5698 case Intrinsic::aarch64_neon_smax:
5699 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
5700 Op.getOperand(1), Op.getOperand(2));
5701 case Intrinsic::aarch64_neon_umax:
5702 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
5703 Op.getOperand(1), Op.getOperand(2));
5704 case Intrinsic::aarch64_neon_smin:
5705 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
5706 Op.getOperand(1), Op.getOperand(2));
5707 case Intrinsic::aarch64_neon_umin:
5708 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
5709 Op.getOperand(1), Op.getOperand(2));
5710 case Intrinsic::aarch64_neon_scalar_sqxtn:
5711 case Intrinsic::aarch64_neon_scalar_sqxtun:
5712 case Intrinsic::aarch64_neon_scalar_uqxtn: {
5713 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
5714 if (Op.getValueType() == MVT::i32)
5715 return DAG.getNode(ISD::BITCAST, dl, MVT::i32,
5716 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::f32,
5717 Op.getOperand(0),
5718 DAG.getNode(ISD::BITCAST, dl, MVT::f64,
5719 Op.getOperand(1))));
5720 return SDValue();
5721 }
5722 case Intrinsic::aarch64_sve_whilelo:
5723 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5724 /*IsEqual=*/false);
5725 case Intrinsic::aarch64_sve_whilelt:
5726 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5727 /*IsEqual=*/false);
5728 case Intrinsic::aarch64_sve_whilels:
5729 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/false,
5730 /*IsEqual=*/true);
5731 case Intrinsic::aarch64_sve_whilele:
5732 return optimizeIncrementingWhile(Op, DAG, /*IsSigned=*/true,
5733 /*IsEqual=*/true);
5734 case Intrinsic::aarch64_sve_sunpkhi:
5735 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
5736 Op.getOperand(1));
5737 case Intrinsic::aarch64_sve_sunpklo:
5738 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
5739 Op.getOperand(1));
5740 case Intrinsic::aarch64_sve_uunpkhi:
5741 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
5742 Op.getOperand(1));
5743 case Intrinsic::aarch64_sve_uunpklo:
5744 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
5745 Op.getOperand(1));
5746 case Intrinsic::aarch64_sve_clasta_n:
5747 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
5748 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5749 case Intrinsic::aarch64_sve_clastb_n:
5750 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
5751 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5752 case Intrinsic::aarch64_sve_lasta:
5753 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
5754 Op.getOperand(1), Op.getOperand(2));
5755 case Intrinsic::aarch64_sve_lastb:
5756 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
5757 Op.getOperand(1), Op.getOperand(2));
5758 case Intrinsic::aarch64_sve_rev:
5759 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
5760 Op.getOperand(1));
5761 case Intrinsic::aarch64_sve_tbl:
5762 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
5763 Op.getOperand(1), Op.getOperand(2));
5764 case Intrinsic::aarch64_sve_trn1:
5765 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
5766 Op.getOperand(1), Op.getOperand(2));
5767 case Intrinsic::aarch64_sve_trn2:
5768 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
5769 Op.getOperand(1), Op.getOperand(2));
5770 case Intrinsic::aarch64_sve_uzp1:
5771 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
5772 Op.getOperand(1), Op.getOperand(2));
5773 case Intrinsic::aarch64_sve_uzp2:
5774 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
5775 Op.getOperand(1), Op.getOperand(2));
5776 case Intrinsic::aarch64_sve_zip1:
5777 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
5778 Op.getOperand(1), Op.getOperand(2));
5779 case Intrinsic::aarch64_sve_zip2:
5780 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
5781 Op.getOperand(1), Op.getOperand(2));
5782 case Intrinsic::aarch64_sve_splice:
5783 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
5784 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5785 case Intrinsic::aarch64_sve_ptrue:
5786 return getPTrue(DAG, dl, Op.getValueType(), Op.getConstantOperandVal(1));
5787 case Intrinsic::aarch64_sve_clz:
5788 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
5789 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5790 case Intrinsic::aarch64_sme_cntsb:
5791 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5792 DAG.getConstant(1, dl, MVT::i32));
5793 case Intrinsic::aarch64_sme_cntsh: {
5794 SDValue One = DAG.getConstant(1, dl, MVT::i32);
5795 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
5796 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
5797 }
5798 case Intrinsic::aarch64_sme_cntsw: {
5799 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5800 DAG.getConstant(1, dl, MVT::i32));
5801 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5802 DAG.getConstant(2, dl, MVT::i32));
5803 }
5804 case Intrinsic::aarch64_sme_cntsd: {
5805 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
5806 DAG.getConstant(1, dl, MVT::i32));
5807 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
5808 DAG.getConstant(3, dl, MVT::i32));
5809 }
5810 case Intrinsic::aarch64_sve_cnt: {
5811 SDValue Data = Op.getOperand(3);
5812 // CTPOP only supports integer operands.
5813 if (Data.getValueType().isFloatingPoint())
5814 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
5815 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
5816 Op.getOperand(2), Data, Op.getOperand(1));
5817 }
5818 case Intrinsic::aarch64_sve_dupq_lane:
5819 return LowerDUPQLane(Op, DAG);
5820 case Intrinsic::aarch64_sve_convert_from_svbool:
5821 if (Op.getValueType() == MVT::aarch64svcount)
5822 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Op.getOperand(1));
5823 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
5824 case Intrinsic::aarch64_sve_convert_to_svbool:
5825 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
5826 return DAG.getNode(ISD::BITCAST, dl, MVT::nxv16i1, Op.getOperand(1));
5827 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
5828 case Intrinsic::aarch64_sve_fneg:
5829 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5830 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5831 case Intrinsic::aarch64_sve_frintp:
5832 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
5833 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5834 case Intrinsic::aarch64_sve_frintm:
5835 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
5836 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5837 case Intrinsic::aarch64_sve_frinti:
5838 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5839 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5840 case Intrinsic::aarch64_sve_frintx:
5841 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
5842 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5843 case Intrinsic::aarch64_sve_frinta:
5844 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
5845 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5846 case Intrinsic::aarch64_sve_frintn:
5847 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
5848 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5849 case Intrinsic::aarch64_sve_frintz:
5850 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
5851 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5852 case Intrinsic::aarch64_sve_ucvtf:
5854 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5855 Op.getOperand(1));
5856 case Intrinsic::aarch64_sve_scvtf:
5858 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5859 Op.getOperand(1));
5860 case Intrinsic::aarch64_sve_fcvtzu:
5862 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5863 Op.getOperand(1));
5864 case Intrinsic::aarch64_sve_fcvtzs:
5866 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5867 Op.getOperand(1));
5868 case Intrinsic::aarch64_sve_fsqrt:
5869 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
5870 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5871 case Intrinsic::aarch64_sve_frecpx:
5872 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
5873 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5874 case Intrinsic::aarch64_sve_frecpe_x:
5875 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
5876 Op.getOperand(1));
5877 case Intrinsic::aarch64_sve_frecps_x:
5878 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
5879 Op.getOperand(1), Op.getOperand(2));
5880 case Intrinsic::aarch64_sve_frsqrte_x:
5881 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
5882 Op.getOperand(1));
5883 case Intrinsic::aarch64_sve_frsqrts_x:
5884 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
5885 Op.getOperand(1), Op.getOperand(2));
5886 case Intrinsic::aarch64_sve_fabs:
5887 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5888 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5889 case Intrinsic::aarch64_sve_abs:
5890 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
5891 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5892 case Intrinsic::aarch64_sve_neg:
5893 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
5894 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5895 case Intrinsic::aarch64_sve_insr: {
5896 SDValue Scalar = Op.getOperand(2);
5897 EVT ScalarTy = Scalar.getValueType();
5898 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
5899 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
5900
5901 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
5902 Op.getOperand(1), Scalar);
5903 }
5904 case Intrinsic::aarch64_sve_rbit:
5906 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
5907 Op.getOperand(1));
5908 case Intrinsic::aarch64_sve_revb:
5909 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
5910 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5911 case Intrinsic::aarch64_sve_revh:
5912 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
5913 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5914 case Intrinsic::aarch64_sve_revw:
5915 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
5916 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5917 case Intrinsic::aarch64_sve_revd:
5918 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
5919 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
5920 case Intrinsic::aarch64_sve_sxtb:
5921 return DAG.getNode(
5923 Op.getOperand(2), Op.getOperand(3),
5924 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5925 Op.getOperand(1));
5926 case Intrinsic::aarch64_sve_sxth:
5927 return DAG.getNode(
5929 Op.getOperand(2), Op.getOperand(3),
5930 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5931 Op.getOperand(1));
5932 case Intrinsic::aarch64_sve_sxtw:
5933 return DAG.getNode(
5935 Op.getOperand(2), Op.getOperand(3),
5936 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5937 Op.getOperand(1));
5938 case Intrinsic::aarch64_sve_uxtb:
5939 return DAG.getNode(
5941 Op.getOperand(2), Op.getOperand(3),
5942 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
5943 Op.getOperand(1));
5944 case Intrinsic::aarch64_sve_uxth:
5945 return DAG.getNode(
5947 Op.getOperand(2), Op.getOperand(3),
5948 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
5949 Op.getOperand(1));
5950 case Intrinsic::aarch64_sve_uxtw:
5951 return DAG.getNode(
5953 Op.getOperand(2), Op.getOperand(3),
5954 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
5955 Op.getOperand(1));
5956 case Intrinsic::localaddress: {
5957 const auto &MF = DAG.getMachineFunction();
5958 const auto *RegInfo = Subtarget->getRegisterInfo();
5959 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
5960 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
5961 Op.getSimpleValueType());
5962 }
5963
5964 case Intrinsic::eh_recoverfp: {
5965 // FIXME: This needs to be implemented to correctly handle highly aligned
5966 // stack objects. For now we simply return the incoming FP. Refer D53541
5967 // for more details.
5968 SDValue FnOp = Op.getOperand(1);
5969 SDValue IncomingFPOp = Op.getOperand(2);
5970 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
5971 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
5972 if (!Fn)
5973 report_fatal_error(
5974 "llvm.eh.recoverfp must take a function as the first argument");
5975 return IncomingFPOp;
5976 }
5977
5978 case Intrinsic::aarch64_neon_vsri:
5979 case Intrinsic::aarch64_neon_vsli:
5980 case Intrinsic::aarch64_sve_sri:
5981 case Intrinsic::aarch64_sve_sli: {
5982 EVT Ty = Op.getValueType();
5983
5984 if (!Ty.isVector())
5985 report_fatal_error("Unexpected type for aarch64_neon_vsli");
5986
5987 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
5988
5989 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
5990 IntNo == Intrinsic::aarch64_sve_sri;
5991 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
5992 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
5993 Op.getOperand(3));
5994 }
5995
5996 case Intrinsic::aarch64_neon_srhadd:
5997 case Intrinsic::aarch64_neon_urhadd:
5998 case Intrinsic::aarch64_neon_shadd:
5999 case Intrinsic::aarch64_neon_uhadd: {
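    // The (rounding) halving adds map onto the generic averaging nodes:
    // AVGFLOOR computes (a + b) >> 1 and AVGCEIL computes (a + b + 1) >> 1,
    // with the intermediate sum evaluated without losing the carry bit.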
6000 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6001 IntNo == Intrinsic::aarch64_neon_shadd);
6002 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6003 IntNo == Intrinsic::aarch64_neon_urhadd);
6004 unsigned Opcode = IsSignedAdd
6005 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6006 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6007 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6008 Op.getOperand(2));
6009 }
6010 case Intrinsic::aarch64_neon_saddlp:
6011 case Intrinsic::aarch64_neon_uaddlp: {
6012 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6013 ? AArch64ISD::UADDLP
6014 : AArch64ISD::SADDLP;
6015 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
6016 }
6017 case Intrinsic::aarch64_neon_sdot:
6018 case Intrinsic::aarch64_neon_udot:
6019 case Intrinsic::aarch64_sve_sdot:
6020 case Intrinsic::aarch64_sve_udot: {
6021 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6022 IntNo == Intrinsic::aarch64_sve_udot)
6023 ? AArch64ISD::UDOT
6024 : AArch64ISD::SDOT;
6025 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
6026 Op.getOperand(2), Op.getOperand(3));
6027 }
6028 case Intrinsic::get_active_lane_mask: {
6029 SDValue ID =
6030 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
6031
6032 EVT VT = Op.getValueType();
6033 if (VT.isScalableVector())
6034 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, Op.getOperand(1),
6035 Op.getOperand(2));
6036
6037 // We can use the SVE whilelo instruction to lower this intrinsic by
6038 // creating the appropriate sequence of scalable vector operations and
6039 // then extracting a fixed-width subvector from the scalable vector.
6040
6041 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
6042 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
6043
6044 SDValue Mask = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WhileVT, ID,
6045 Op.getOperand(1), Op.getOperand(2));
6046 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, dl, ContainerVT, Mask);
6047 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, MaskAsInt,
6048 DAG.getVectorIdxConstant(0, dl));
6049 }
6050 case Intrinsic::aarch64_neon_uaddlv: {
6051 EVT OpVT = Op.getOperand(1).getValueType();
6052 EVT ResVT = Op.getValueType();
6053 if (ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6054 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) {
6055 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6056 SDValue UADDLV =
6057 DAG.getNode(AArch64ISD::UADDLV, dl, MVT::v4i32, Op.getOperand(1));
6058 SDValue EXTRACT_VEC_ELT =
6059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, UADDLV,
6060 DAG.getConstant(0, dl, MVT::i64));
6061 return EXTRACT_VEC_ELT;
6062 }
6063 return SDValue();
6064 }
6065 case Intrinsic::experimental_cttz_elts: {
6066 SDValue CttzOp = Op.getOperand(1);
6067 EVT VT = CttzOp.getValueType();
6068 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6069
6070 if (VT.isFixedLengthVector()) {
6071 // We can use SVE instructions to lower this intrinsic by first creating
6072 // an SVE predicate register mask from the fixed-width vector.
6073 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6074 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, NewVT, CttzOp);
6075 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6076 }
6077
6078 SDValue NewCttzElts =
6079 DAG.getNode(AArch64ISD::CTTZ_ELTS, dl, MVT::i64, CttzOp);
6080 return DAG.getZExtOrTrunc(NewCttzElts, dl, Op.getValueType());
6081 }
6082 }
6083}
6084
6085bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6086 if (VT.getVectorElementType() == MVT::i8 ||
6087 VT.getVectorElementType() == MVT::i16) {
6088 EltTy = MVT::i32;
6089 return true;
6090 }
6091 return false;
6092}
6093
6094bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6095 EVT DataVT) const {
6096 const EVT IndexVT = Extend.getOperand(0).getValueType();
6097 // SVE only supports implicit extension of 32-bit indices.
6098 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6099 return false;
6100
6101 // Indices cannot be smaller than the main data type.
6102 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6103 return false;
6104
6105 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6106 // element container type, which would violate the previous clause.
6107 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6108}
6109
6110bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6111 EVT ExtVT = ExtVal.getValueType();
6112 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6113 return false;
6114
6115 // It may be worth creating extending masked loads if there are multiple
6116 // masked loads using the same predicate. That way we'll end up creating
6117 // extending masked loads that may then get split by the legaliser. This
6118 // results in just one set of predicate unpacks at the start, instead of
6119 // multiple sets of vector unpacks after each load.
6120 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6121 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6122 // Disable extending masked loads for fixed-width for now, since the code
6123 // quality doesn't look great.
6124 if (!ExtVT.isScalableVector())
6125 return false;
6126
6127 unsigned NumExtMaskedLoads = 0;
6128 for (auto *U : Ld->getMask()->uses())
6129 if (isa<MaskedLoadSDNode>(U))
6130 NumExtMaskedLoads++;
6131
6132 if (NumExtMaskedLoads <= 1)
6133 return false;
6134 }
6135 }
6136
6137 return true;
6138}
6139
6140unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6141 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6142 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6143 AArch64ISD::GLD1_MERGE_ZERO},
6144 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6145 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6146 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6147 AArch64ISD::GLD1_MERGE_ZERO},
6148 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6149 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6150 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6151 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6152 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6153 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6154 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6155 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6156 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6157 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6158 };
6159 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6160 return AddrModes.find(Key)->second;
6161}
6162
6163unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6164 switch (Opcode) {
6165 default:
6166 llvm_unreachable("unimplemented opcode");
6167 return Opcode;
6168 case AArch64ISD::GLD1_MERGE_ZERO:
6169 return AArch64ISD::GLD1S_MERGE_ZERO;
6170 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6171 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6172 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6173 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6174 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6175 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6176 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6177 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6178 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6179 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6180 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6181 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6182 }
6183}
6184
6185SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6186 SelectionDAG &DAG) const {
6187 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6188
6189 SDLoc DL(Op);
6190 SDValue Chain = MGT->getChain();
6191 SDValue PassThru = MGT->getPassThru();
6192 SDValue Mask = MGT->getMask();
6193 SDValue BasePtr = MGT->getBasePtr();
6194 SDValue Index = MGT->getIndex();
6195 SDValue Scale = MGT->getScale();
6196 EVT VT = Op.getValueType();
6197 EVT MemVT = MGT->getMemoryVT();
6198 ISD::LoadExtType ExtType = MGT->getExtensionType();
6199 ISD::MemIndexType IndexType = MGT->getIndexType();
6200
6201 // SVE supports zero (and so undef) passthrough values only; everything else
6202 // must be handled manually by an explicit select on the load's output.
6203 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6204 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6205 SDValue Load =
6206 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6207 MGT->getMemOperand(), IndexType, ExtType);
6208 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6209 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6210 }
6211
6212 bool IsScaled = MGT->isIndexScaled();
6213 bool IsSigned = MGT->isIndexSigned();
6214
6215 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6216 // must be calculated beforehand.
6217 uint64_t ScaleVal = Scale->getAsZExtVal();
6218 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6219 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6220 EVT IndexVT = Index.getValueType();
6221 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6222 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6223 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
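    // Shifting the index left by log2(Scale) folds the unsupported scale into
    // the index itself, so the gather can be re-emitted below with the unit
    // scale that SVE handles directly.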
6224
6225 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6226 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6227 MGT->getMemOperand(), IndexType, ExtType);
6228 }
6229
6230 // Lower fixed length gather to a scalable equivalent.
6231 if (VT.isFixedLengthVector()) {
6232 assert(Subtarget->useSVEForFixedLengthVectors() &&
6233 "Cannot lower when not using SVE for fixed vectors!");
6234
6235 // NOTE: Handle floating-point as if integer then bitcast the result.
6236 EVT DataVT = VT.changeVectorElementTypeToInteger();
6237 MemVT = MemVT.changeVectorElementTypeToInteger();
6238
6239 // Find the smallest integer fixed length vector we can use for the gather.
6240 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6241 if (DataVT.getVectorElementType() == MVT::i64 ||
6242 Index.getValueType().getVectorElementType() == MVT::i64 ||
6243 Mask.getValueType().getVectorElementType() == MVT::i64)
6244 PromotedVT = VT.changeVectorElementType(MVT::i64);
6245
6246 // Promote vector operands except for passthrough, which we know is either
6247 // undef or zero, and thus best constructed directly.
6248 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6249 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6250 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6251
6252 // A promoted result type forces the need for an extending load.
6253 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6254 ExtType = ISD::EXTLOAD;
6255
6256 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6257
6258 // Convert fixed length vector operands to scalable.
6259 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6260 Index = convertToScalableVector(DAG, ContainerVT, Index);
6261 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6262 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6263 : DAG.getConstant(0, DL, ContainerVT);
6264
6265 // Emit equivalent scalable vector gather.
6266 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6267 SDValue Load =
6268 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6269 Ops, MGT->getMemOperand(), IndexType, ExtType);
6270
6271 // Extract fixed length data then convert to the required result type.
6272 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6273 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6274 if (VT.isFloatingPoint())
6275 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6276
6277 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6278 }
6279
6280 // Everything else is legal.
6281 return Op;
6282}
6283
6284SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6285 SelectionDAG &DAG) const {
6286 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6287
6288 SDLoc DL(Op);
6289 SDValue Chain = MSC->getChain();
6290 SDValue StoreVal = MSC->getValue();
6291 SDValue Mask = MSC->getMask();
6292 SDValue BasePtr = MSC->getBasePtr();
6293 SDValue Index = MSC->getIndex();
6294 SDValue Scale = MSC->getScale();
6295 EVT VT = StoreVal.getValueType();
6296 EVT MemVT = MSC->getMemoryVT();
6297 ISD::MemIndexType IndexType = MSC->getIndexType();
6298 bool Truncating = MSC->isTruncatingStore();
6299
6300 bool IsScaled = MSC->isIndexScaled();
6301 bool IsSigned = MSC->isIndexSigned();
6302
6303 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
6304 // must be calculated beforehand.
6305 uint64_t ScaleVal = Scale->getAsZExtVal();
6306 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6307 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6308 EVT IndexVT = Index.getValueType();
6309 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6310 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6311 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6312
6313 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6314 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6315 MSC->getMemOperand(), IndexType, Truncating);
6316 }
6317
6318 // Lower fixed length scatter to a scalable equivalent.
6319 if (VT.isFixedLengthVector()) {
6320 assert(Subtarget->useSVEForFixedLengthVectors() &&
6321 "Cannot lower when not using SVE for fixed vectors!");
6322
6323 // Once bitcast we treat floating-point scatters as if integer.
6324 if (VT.isFloatingPoint()) {
6325 VT = VT.changeVectorElementTypeToInteger();
6326 MemVT = MemVT.changeVectorElementTypeToInteger();
6327 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6328 }
6329
6330 // Find the smallest integer fixed length vector we can use for the scatter.
6331 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6332 if (VT.getVectorElementType() == MVT::i64 ||
6333 Index.getValueType().getVectorElementType() == MVT::i64 ||
6334 Mask.getValueType().getVectorElementType() == MVT::i64)
6335 PromotedVT = VT.changeVectorElementType(MVT::i64);
6336
6337 // Promote vector operands.
6338 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6339 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6340 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6341 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6342
6343 // A promoted value type forces the need for a truncating store.
6344 if (PromotedVT != VT)
6345 Truncating = true;
6346
6347 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6348
6349 // Convert fixed length vector operands to scalable.
6350 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6351 Index = convertToScalableVector(DAG, ContainerVT, Index);
6352 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6353 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6354
6355 // Emit equivalent scalable vector scatter.
6356 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6357 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6358 MSC->getMemOperand(), IndexType, Truncating);
6359 }
6360
6361 // Everything else is legal.
6362 return Op;
6363}
6364
6365SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6366 SDLoc DL(Op);
6367 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6368 assert(LoadNode && "Expected custom lowering of a masked load node");
6369 EVT VT = Op->getValueType(0);
6370
6371 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6372 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6373
6374 SDValue PassThru = LoadNode->getPassThru();
6375 SDValue Mask = LoadNode->getMask();
6376
6377 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
6378 return Op;
6379
6381 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
6382 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
6383 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
6384 LoadNode->getExtensionType());
6385
6386 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6387
6388 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6389}
6390
6391 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
6392 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
6393 EVT VT, EVT MemVT,
6394 SelectionDAG &DAG) {
6394 SelectionDAG &DAG) {
6395 assert(VT.isVector() && "VT should be a vector type");
6396 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
6397
6398 SDValue Value = ST->getValue();
6399
6400 // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
6401 // extracts the word lane which represents the v4i8 subvector. It optimizes
6402 // the store to:
6403 //
6404 // xtn v0.8b, v0.8h
6405 // str s0, [x0]
6406
6407 SDValue Undef = DAG.getUNDEF(MVT::i16);
6408 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
6409 {Undef, Undef, Undef, Undef});
6410
6411 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
6412 Value, UndefVec);
6413 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
6414
6415 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
6416 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
6417 Trunc, DAG.getConstant(0, DL, MVT::i64));
6418
6419 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
6420 ST->getBasePtr(), ST->getMemOperand());
6421}
6422
6423 // Custom lowering for any store, vector or scalar, with or without a
6424 // truncate operation. Currently we only custom lower truncating stores from
6425 // v4i16 to v4i8 and volatile stores of i128.
6426SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
6427 SelectionDAG &DAG) const {
6428 SDLoc Dl(Op);
6429 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
6430 assert (StoreNode && "Can only custom lower store nodes");
6431
6432 SDValue Value = StoreNode->getValue();
6433
6434 EVT VT = Value.getValueType();
6435 EVT MemVT = StoreNode->getMemoryVT();
6436
6437 if (VT.isVector()) {
6438 if (useSVEForFixedLengthVectorVT(
6439 VT,
6440 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
6441 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
6442
6443 unsigned AS = StoreNode->getAddressSpace();
6444 Align Alignment = StoreNode->getAlign();
6445 if (Alignment < MemVT.getStoreSize() &&
6446 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
6447 StoreNode->getMemOperand()->getFlags(),
6448 nullptr)) {
6449 return scalarizeVectorStore(StoreNode, DAG);
6450 }
6451
6452 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
6453 MemVT == MVT::v4i8) {
6454 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
6455 }
6456 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
6457 // the custom lowering, as there are no un-paired non-temporal stores and
6458 // legalization will break up 256 bit inputs.
6459 ElementCount EC = MemVT.getVectorElementCount();
6460 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
6461 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
6462 (MemVT.getScalarSizeInBits() == 8u ||
6463 MemVT.getScalarSizeInBits() == 16u ||
6464 MemVT.getScalarSizeInBits() == 32u ||
6465 MemVT.getScalarSizeInBits() == 64u)) {
6466 SDValue Lo =
6467 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6468 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6469 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
6470 SDValue Hi =
6471 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
6472 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
6473 StoreNode->getValue(),
6474 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
6475 SDValue Result = DAG.getMemIntrinsicNode(
6476 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
6477 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
6478 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6479 return Result;
6480 }
6481 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
6482 return LowerStore128(Op, DAG);
6483 } else if (MemVT == MVT::i64x8) {
6484 SDValue Value = StoreNode->getValue();
6485 assert(Value->getValueType(0) == MVT::i64x8);
6486 SDValue Chain = StoreNode->getChain();
6487 SDValue Base = StoreNode->getBasePtr();
6488 EVT PtrVT = Base.getValueType();
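    // An i64x8 (LS64) value is stored as eight consecutive i64 pieces: each
    // piece is extracted from the 512-bit operand and written at Base + i * 8,
    // threading the chain through every store.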
6489 for (unsigned i = 0; i < 8; i++) {
6490 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
6491 Value, DAG.getConstant(i, Dl, MVT::i32));
6492 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
6493 DAG.getConstant(i * 8, Dl, PtrVT));
6494 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
6495 StoreNode->getOriginalAlign());
6496 }
6497 return Chain;
6498 }
6499
6500 return SDValue();
6501}
6502
6503/// Lower atomic or volatile 128-bit stores to a single STP instruction.
6504SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
6505 SelectionDAG &DAG) const {
6506 MemSDNode *StoreNode = cast<MemSDNode>(Op);
6507 assert(StoreNode->getMemoryVT() == MVT::i128);
6508 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
6509
6510 bool IsStoreRelease =
6511 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
6512 if (StoreNode->isAtomic())
6513 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
6514 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
6515 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
6516 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
6517
6518 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
6519 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
6520 ? StoreNode->getOperand(1)
6521 : StoreNode->getOperand(2);
6522 SDLoc DL(Op);
6523 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
6524 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
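  // A release store uses STILP (store-release pair); every other case is
  // satisfied by a plain STP of the two 64-bit halves.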
6525 if (DAG.getDataLayout().isBigEndian())
6526 std::swap(StoreValue.first, StoreValue.second);
6527 SDValue Result = DAG.getMemIntrinsicNode(
6528 Opcode, DL, DAG.getVTList(MVT::Other),
6529 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
6530 StoreNode->getBasePtr()},
6531 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
6532 return Result;
6533}
6534
6535SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
6536 SelectionDAG &DAG) const {
6537 SDLoc DL(Op);
6538 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
6539 assert(LoadNode && "Expected custom lowering of a load node");
6540
6541 if (LoadNode->getMemoryVT() == MVT::i64x8) {
6542 SmallVector<SDValue, 8> Ops;
6543 SDValue Base = LoadNode->getBasePtr();
6544 SDValue Chain = LoadNode->getChain();
6545 EVT PtrVT = Base.getValueType();
6546 for (unsigned i = 0; i < 8; i++) {
6547 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
6548 DAG.getConstant(i * 8, DL, PtrVT));
6549 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
6550 LoadNode->getPointerInfo(),
6551 LoadNode->getOriginalAlign());
6552 Ops.push_back(Part);
6553 Chain = SDValue(Part.getNode(), 1);
6554 }
6555 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
6556 return DAG.getMergeValues({Loaded, Chain}, DL);
6557 }
6558
6559 // Custom lowering for extending v4i8 vector loads.
6560 EVT VT = Op->getValueType(0);
6561 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
6562
6563 if (LoadNode->getMemoryVT() != MVT::v4i8)
6564 return SDValue();
6565
6566 // Avoid generating unaligned loads.
6567 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
6568 return SDValue();
6569
6570 unsigned ExtType;
6571 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
6572 ExtType = ISD::SIGN_EXTEND;
6573 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
6574 LoadNode->getExtensionType() == ISD::EXTLOAD)
6575 ExtType = ISD::ZERO_EXTEND;
6576 else
6577 return SDValue();
6578
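  // Load the four bytes as a single f32, place it in lane 0 of a v2f32,
  // reinterpret as v8i8 and widen to v8i16, then keep only the low four
  // lanes; v4i32 results get one more extension step below.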
6579 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
6580 LoadNode->getBasePtr(), MachinePointerInfo());
6581 SDValue Chain = Load.getValue(1);
6582 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
6583 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
6584 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
6585 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
6586 DAG.getConstant(0, DL, MVT::i64));
6587 if (VT == MVT::v4i32)
6588 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
6589 return DAG.getMergeValues({Ext, Chain}, DL);
6590}
6591
6592// Generate SUBS and CSEL for integer abs.
6593SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
6594 MVT VT = Op.getSimpleValueType();
6595
6596 if (VT.isVector())
6597 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
6598
6599 SDLoc DL(Op);
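  // Integer abs(x) is lowered as Neg = 0 - x together with SUBS x, #0 to set
  // the flags, and a CSEL that selects x when the PL (positive or zero)
  // condition holds and Neg otherwise.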
6600 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
6601 Op.getOperand(0));
6602 // Generate SUBS & CSEL.
6603 SDValue Cmp =
6604 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
6605 Op.getOperand(0), DAG.getConstant(0, DL, VT));
6606 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
6607 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
6608 Cmp.getValue(1));
6609}
6610
6611 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
6612 SDValue Chain = Op.getOperand(0);
6613 SDValue Cond = Op.getOperand(1);
6614 SDValue Dest = Op.getOperand(2);
6615
6616 AArch64CC::CondCode CC;
6617 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
6618 SDLoc dl(Op);
6619 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
6620 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
6621 Cmp);
6622 }
6623
6624 return SDValue();
6625}
6626
6627// Treat FSHR with constant shifts as legal operation, otherwise it is expanded
6628// FSHL is converted to FSHR before deciding what to do with it
6629 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
6630 SDValue Shifts = Op.getOperand(2);
6631 // Check if the shift amount is a constant
6632 // If opcode is FSHL, convert it to FSHR
6633 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
6634 SDLoc DL(Op);
6635 MVT VT = Op.getSimpleValueType();
6636
6637 if (Op.getOpcode() == ISD::FSHL) {
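      // For a constant amount c with 0 < c < bitwidth, fshl(x, y, c) extracts
      // the same bits as fshr(x, y, bitwidth - c), so the FSHL form is
      // normalised to FSHR here.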
6638 unsigned int NewShiftNo =
6639 VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
6640 return DAG.getNode(
6641 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
6642 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
6643 } else if (Op.getOpcode() == ISD::FSHR) {
6644 return Op;
6645 }
6646 }
6647
6648 return SDValue();
6649}
6650
6651 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
6652 SDValue X = Op.getOperand(0);
6653 EVT XScalarTy = X.getValueType();
6654 SDValue Exp = Op.getOperand(1);
6655
6656 SDLoc DL(Op);
6657 EVT XVT, ExpVT;
6658 switch (Op.getSimpleValueType().SimpleTy) {
6659 default:
6660 return SDValue();
6661 case MVT::bf16:
6662 case MVT::f16:
6663 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
6664 [[fallthrough]];
6665 case MVT::f32:
6666 XVT = MVT::nxv4f32;
6667 ExpVT = MVT::nxv4i32;
6668 break;
6669 case MVT::f64:
6670 XVT = MVT::nxv2f64;
6671 ExpVT = MVT::nxv2i64;
6672 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
6673 break;
6674 }
6675
6676 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
6677 SDValue VX =
6678 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
6679 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
6680 DAG.getUNDEF(ExpVT), Exp, Zero);
6681 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
6682 AArch64SVEPredPattern::all);
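  // ldexp(x, n) == x * 2^n, which is what the SVE FSCALE instruction computes;
  // the scalars are placed in lane 0 of scalable vectors, FSCALE is invoked
  // through its intrinsic, and the result is extracted back out below.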
6683 SDValue FScale =
6684 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, XVT,
6685 DAG.getConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64),
6686 VPg, VX, VExp);
6687 SDValue Final =
6688 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
6689 if (X.getValueType() != XScalarTy)
6690 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
6691 DAG.getIntPtrConstant(1, SDLoc(Op)));
6692 return Final;
6693}
6694
6695SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
6696 SelectionDAG &DAG) const {
6697 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
6698 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
6700 "ADJUST_TRAMPOLINE operation is only supported on Linux.");
6701
6702 return Op.getOperand(0);
6703}
6704
6705SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
6706 SelectionDAG &DAG) const {
6707
6708 // Note: x18 cannot be used for the Nest parameter on Windows and macOS.
6709 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
6710 report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux.");
6711
6712 SDValue Chain = Op.getOperand(0);
6713 SDValue Trmp = Op.getOperand(1); // trampoline
6714 SDValue FPtr = Op.getOperand(2); // nested function
6715 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
6716 SDLoc dl(Op);
6717
6718 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6719 Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
6720
6721 TargetLowering::ArgListTy Args;
6722 TargetLowering::ArgListEntry Entry;
6723
6724 Entry.Ty = IntPtrTy;
6725 Entry.Node = Trmp;
6726 Args.push_back(Entry);
6727 Entry.Node = DAG.getConstant(20, dl, MVT::i64);
6728 Args.push_back(Entry);
6729
6730 Entry.Node = FPtr;
6731 Args.push_back(Entry);
6732 Entry.Node = Nest;
6733 Args.push_back(Entry);
6734
6735 // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
6736 TargetLowering::CallLoweringInfo CLI(DAG);
6737 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
6738 CallingConv::C, Type::getVoidTy(*DAG.getContext()),
6739 DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
6740
6741 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6742 return CallResult.second;
6743}
6744
6745 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
6746 SelectionDAG &DAG) const {
6747 LLVM_DEBUG(dbgs() << "Custom lowering: ");
6748 LLVM_DEBUG(Op.dump());
6749
6750 switch (Op.getOpcode()) {
6751 default:
6752 llvm_unreachable("unimplemented operand");
6753 return SDValue();
6754 case ISD::BITCAST:
6755 return LowerBITCAST(Op, DAG);
6756 case ISD::GlobalAddress:
6757 return LowerGlobalAddress(Op, DAG);
6758 case ISD::GlobalTLSAddress:
6759 return LowerGlobalTLSAddress(Op, DAG);
6760 case ISD::PtrAuthGlobalAddress:
6761 return LowerPtrAuthGlobalAddress(Op, DAG);
6762 case ISD::ADJUST_TRAMPOLINE:
6763 return LowerADJUST_TRAMPOLINE(Op, DAG);
6764 case ISD::INIT_TRAMPOLINE:
6765 return LowerINIT_TRAMPOLINE(Op, DAG);
6766 case ISD::SETCC:
6767 case ISD::STRICT_FSETCC:
6768 case ISD::STRICT_FSETCCS:
6769 return LowerSETCC(Op, DAG);
6770 case ISD::SETCCCARRY:
6771 return LowerSETCCCARRY(Op, DAG);
6772 case ISD::BRCOND:
6773 return LowerBRCOND(Op, DAG);
6774 case ISD::BR_CC:
6775 return LowerBR_CC(Op, DAG);
6776 case ISD::SELECT:
6777 return LowerSELECT(Op, DAG);
6778 case ISD::SELECT_CC:
6779 return LowerSELECT_CC(Op, DAG);
6780 case ISD::JumpTable:
6781 return LowerJumpTable(Op, DAG);
6782 case ISD::BR_JT:
6783 return LowerBR_JT(Op, DAG);
6784 case ISD::BRIND:
6785 return LowerBRIND(Op, DAG);
6786 case ISD::ConstantPool:
6787 return LowerConstantPool(Op, DAG);
6788 case ISD::BlockAddress:
6789 return LowerBlockAddress(Op, DAG);
6790 case ISD::VASTART:
6791 return LowerVASTART(Op, DAG);
6792 case ISD::VACOPY:
6793 return LowerVACOPY(Op, DAG);
6794 case ISD::VAARG:
6795 return LowerVAARG(Op, DAG);
6796 case ISD::UADDO_CARRY:
6797 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
6798 case ISD::USUBO_CARRY:
6799 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
6800 case ISD::SADDO_CARRY:
6801 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
6802 case ISD::SSUBO_CARRY:
6803 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
6804 case ISD::SADDO:
6805 case ISD::UADDO:
6806 case ISD::SSUBO:
6807 case ISD::USUBO:
6808 case ISD::SMULO:
6809 case ISD::UMULO:
6810 return LowerXALUO(Op, DAG);
6811 case ISD::FADD:
6812 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
6813 case ISD::FSUB:
6814 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
6815 case ISD::FMUL:
6816 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
6817 case ISD::FMA:
6818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
6819 case ISD::FDIV:
6820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
6821 case ISD::FNEG:
6822 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
6823 case ISD::FCEIL:
6824 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
6825 case ISD::FFLOOR:
6826 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
6827 case ISD::FNEARBYINT:
6828 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
6829 case ISD::FRINT:
6830 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
6831 case ISD::FROUND:
6832 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
6833 case ISD::FROUNDEVEN:
6834 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
6835 case ISD::FTRUNC:
6836 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
6837 case ISD::FSQRT:
6838 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
6839 case ISD::FABS:
6840 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
6841 case ISD::FP_ROUND:
6842 case ISD::STRICT_FP_ROUND:
6843 return LowerFP_ROUND(Op, DAG);
6844 case ISD::FP_EXTEND:
6845 return LowerFP_EXTEND(Op, DAG);
6846 case ISD::FRAMEADDR:
6847 return LowerFRAMEADDR(Op, DAG);
6848 case ISD::SPONENTRY:
6849 return LowerSPONENTRY(Op, DAG);
6850 case ISD::RETURNADDR:
6851 return LowerRETURNADDR(Op, DAG);
6852 case ISD::ADDROFRETURNADDR:
6853 return LowerADDROFRETURNADDR(Op, DAG);
6854 case ISD::CONCAT_VECTORS:
6855 return LowerCONCAT_VECTORS(Op, DAG);
6856 case ISD::INSERT_VECTOR_ELT:
6857 return LowerINSERT_VECTOR_ELT(Op, DAG);
6858 case ISD::EXTRACT_VECTOR_ELT:
6859 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
6860 case ISD::BUILD_VECTOR:
6861 return LowerBUILD_VECTOR(Op, DAG);
6862 case ISD::ZERO_EXTEND_VECTOR_INREG:
6863 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
6864 case ISD::VECTOR_SHUFFLE:
6865 return LowerVECTOR_SHUFFLE(Op, DAG);
6866 case ISD::SPLAT_VECTOR:
6867 return LowerSPLAT_VECTOR(Op, DAG);
6868 case ISD::EXTRACT_SUBVECTOR:
6869 return LowerEXTRACT_SUBVECTOR(Op, DAG);
6870 case ISD::INSERT_SUBVECTOR:
6871 return LowerINSERT_SUBVECTOR(Op, DAG);
6872 case ISD::SDIV:
6873 case ISD::UDIV:
6874 return LowerDIV(Op, DAG);
6875 case ISD::SMIN:
6876 case ISD::UMIN:
6877 case ISD::SMAX:
6878 case ISD::UMAX:
6879 return LowerMinMax(Op, DAG);
6880 case ISD::SRA:
6881 case ISD::SRL:
6882 case ISD::SHL:
6883 return LowerVectorSRA_SRL_SHL(Op, DAG);
6884 case ISD::SHL_PARTS:
6885 case ISD::SRL_PARTS:
6886 case ISD::SRA_PARTS:
6887 return LowerShiftParts(Op, DAG);
6888 case ISD::CTPOP:
6889 case ISD::PARITY:
6890 return LowerCTPOP_PARITY(Op, DAG);
6891 case ISD::FCOPYSIGN:
6892 return LowerFCOPYSIGN(Op, DAG);
6893 case ISD::OR:
6894 return LowerVectorOR(Op, DAG);
6895 case ISD::XOR:
6896 return LowerXOR(Op, DAG);
6897 case ISD::PREFETCH:
6898 return LowerPREFETCH(Op, DAG);
6899 case ISD::SINT_TO_FP:
6900 case ISD::UINT_TO_FP:
6901 case ISD::STRICT_SINT_TO_FP:
6902 case ISD::STRICT_UINT_TO_FP:
6903 return LowerINT_TO_FP(Op, DAG);
6904 case ISD::FP_TO_SINT:
6905 case ISD::FP_TO_UINT:
6906 case ISD::STRICT_FP_TO_SINT:
6907 case ISD::STRICT_FP_TO_UINT:
6908 return LowerFP_TO_INT(Op, DAG);
6909 case ISD::FP_TO_SINT_SAT:
6910 case ISD::FP_TO_UINT_SAT:
6911 return LowerFP_TO_INT_SAT(Op, DAG);
6912 case ISD::FSINCOS:
6913 return LowerFSINCOS(Op, DAG);
6914 case ISD::GET_ROUNDING:
6915 return LowerGET_ROUNDING(Op, DAG);
6916 case ISD::SET_ROUNDING:
6917 return LowerSET_ROUNDING(Op, DAG);
6918 case ISD::GET_FPMODE:
6919 return LowerGET_FPMODE(Op, DAG);
6920 case ISD::SET_FPMODE:
6921 return LowerSET_FPMODE(Op, DAG);
6922 case ISD::RESET_FPMODE:
6923 return LowerRESET_FPMODE(Op, DAG);
6924 case ISD::MUL:
6925 return LowerMUL(Op, DAG);
6926 case ISD::MULHS:
6927 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
6928 case ISD::MULHU:
6929 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
6930 case ISD::INTRINSIC_W_CHAIN:
6931 return LowerINTRINSIC_W_CHAIN(Op, DAG);
6932 case ISD::INTRINSIC_WO_CHAIN:
6933 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
6934 case ISD::INTRINSIC_VOID:
6935 return LowerINTRINSIC_VOID(Op, DAG);
6936 case ISD::ATOMIC_STORE:
6937 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
6938 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
6939 return LowerStore128(Op, DAG);
6940 }
6941 return SDValue();
6942 case ISD::STORE:
6943 return LowerSTORE(Op, DAG);
6944 case ISD::MSTORE:
6945 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
6946 case ISD::MGATHER:
6947 return LowerMGATHER(Op, DAG);
6948 case ISD::MSCATTER:
6949 return LowerMSCATTER(Op, DAG);
6950 case ISD::VECREDUCE_SEQ_FADD:
6951 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
6952 case ISD::VECREDUCE_ADD:
6953 case ISD::VECREDUCE_AND:
6954 case ISD::VECREDUCE_OR:
6955 case ISD::VECREDUCE_XOR:
6956 case ISD::VECREDUCE_SMAX:
6957 case ISD::VECREDUCE_SMIN:
6958 case ISD::VECREDUCE_UMAX:
6959 case ISD::VECREDUCE_UMIN:
6960 case ISD::VECREDUCE_FADD:
6961 case ISD::VECREDUCE_FMAX:
6962 case ISD::VECREDUCE_FMIN:
6963 case ISD::VECREDUCE_FMAXIMUM:
6964 case ISD::VECREDUCE_FMINIMUM:
6965 return LowerVECREDUCE(Op, DAG);
6966 case ISD::ATOMIC_LOAD_AND:
6967 return LowerATOMIC_LOAD_AND(Op, DAG);
6968 case ISD::DYNAMIC_STACKALLOC:
6969 return LowerDYNAMIC_STACKALLOC(Op, DAG);
6970 case ISD::VSCALE:
6971 return LowerVSCALE(Op, DAG);
6972 case ISD::ANY_EXTEND:
6973 case ISD::SIGN_EXTEND:
6974 case ISD::ZERO_EXTEND:
6975 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
6976 case ISD::SIGN_EXTEND_INREG: {
6977 // Only custom lower when ExtraVT has a legal byte based element type.
6978 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
6979 EVT ExtraEltVT = ExtraVT.getVectorElementType();
6980 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
6981 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
6982 return SDValue();
6983
6984 return LowerToPredicatedOp(Op, DAG,
6985 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
6986 }
6987 case ISD::TRUNCATE:
6988 return LowerTRUNCATE(Op, DAG);
6989 case ISD::MLOAD:
6990 return LowerMLOAD(Op, DAG);
6991 case ISD::LOAD:
6992 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
6993 !Subtarget->isNeonAvailable()))
6994 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
6995 return LowerLOAD(Op, DAG);
6996 case ISD::ADD:
6997 case ISD::AND:
6998 case ISD::SUB:
6999 return LowerToScalableOp(Op, DAG);
7000 case ISD::FMAXIMUM:
7001 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7002 case ISD::FMAXNUM:
7003 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7004 case ISD::FMINIMUM:
7005 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7006 case ISD::FMINNUM:
7007 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7008 case ISD::VSELECT:
7009 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7010 case ISD::ABS:
7011 return LowerABS(Op, DAG);
7012 case ISD::ABDS:
7013 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7014 case ISD::ABDU:
7015 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7016 case ISD::AVGFLOORS:
7017 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7018 case ISD::AVGFLOORU:
7019 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7020 case ISD::AVGCEILS:
7021 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7022 case ISD::AVGCEILU:
7023 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7024 case ISD::BITREVERSE:
7025 return LowerBitreverse(Op, DAG);
7026 case ISD::BSWAP:
7027 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7028 case ISD::CTLZ:
7029 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7030 case ISD::CTTZ:
7031 return LowerCTTZ(Op, DAG);
7032 case ISD::VECTOR_SPLICE:
7033 return LowerVECTOR_SPLICE(Op, DAG);
7034 case ISD::VECTOR_DEINTERLEAVE:
7035 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7036 case ISD::VECTOR_INTERLEAVE:
7037 return LowerVECTOR_INTERLEAVE(Op, DAG);
7038 case ISD::LRINT:
7039 case ISD::LLRINT:
7040 if (Op.getValueType().isVector())
7041 return LowerVectorXRINT(Op, DAG);
7042 [[fallthrough]];
7043 case ISD::LROUND:
7044 case ISD::LLROUND: {
7045 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7046 Op.getOperand(0).getValueType() == MVT::bf16) &&
7047 "Expected custom lowering of rounding operations only for f16");
7048 SDLoc DL(Op);
7049 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7050 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7051 }
7052 case ISD::STRICT_LROUND:
7053 case ISD::STRICT_LLROUND:
7054 case ISD::STRICT_LRINT:
7055 case ISD::STRICT_LLRINT: {
7056 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7057 Op.getOperand(1).getValueType() == MVT::bf16) &&
7058 "Expected custom lowering of rounding operations only for f16");
7059 SDLoc DL(Op);
7060 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7061 {Op.getOperand(0), Op.getOperand(1)});
7062 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7063 {Ext.getValue(1), Ext.getValue(0)});
7064 }
7065 case ISD::WRITE_REGISTER: {
7066 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7067 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7068 SDLoc DL(Op);
7069
7070 SDValue Chain = Op.getOperand(0);
7071 SDValue SysRegName = Op.getOperand(1);
7072 std::pair<SDValue, SDValue> Pair =
7073 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7074
7075 // chain = MSRR(chain, sysregname, lo, hi)
7076 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7077 SysRegName, Pair.first, Pair.second);
7078
7079 return Result;
7080 }
7081 case ISD::FSHL:
7082 case ISD::FSHR:
7083 return LowerFunnelShift(Op, DAG);
7084 case ISD::FLDEXP:
7085 return LowerFLDEXP(Op, DAG);
7086 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7087 return LowerVECTOR_HISTOGRAM(Op, DAG);
7088 }
7089}
7090
7091 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7092 return !Subtarget->useSVEForFixedLengthVectors();
7093}
7094
7095 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7096 EVT VT, bool OverrideNEON) const {
7097 if (!VT.isFixedLengthVector() || !VT.isSimple())
7098 return false;
7099
7100 // Don't use SVE for vectors we cannot scalarize if required.
7101 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7102 // Fixed length predicates should be promoted to i8.
7103 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7104 case MVT::i1:
7105 default:
7106 return false;
7107 case MVT::i8:
7108 case MVT::i16:
7109 case MVT::i32:
7110 case MVT::i64:
7111 case MVT::f16:
7112 case MVT::f32:
7113 case MVT::f64:
7114 break;
7115 }
7116
7117 // NEON-sized vectors can be emulated using SVE instructions.
7118 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7119 return Subtarget->isSVEorStreamingSVEAvailable();
7120
7121 // Ensure NEON MVTs only belong to a single register class.
7122 if (VT.getFixedSizeInBits() <= 128)
7123 return false;
7124
7125 // Ensure wider than NEON code generation is enabled.
7126 if (!Subtarget->useSVEForFixedLengthVectors())
7127 return false;
7128
7129 // Don't use SVE for types that don't fit.
7130 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7131 return false;
7132
7133 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7134 // the base fixed length SVE support in place.
7135 if (!VT.isPow2VectorType())
7136 return false;
7137
7138 return true;
7139}
7140
7141//===----------------------------------------------------------------------===//
7142// Calling Convention Implementation
7143//===----------------------------------------------------------------------===//
7144
7145static unsigned getIntrinsicID(const SDNode *N) {
7146 unsigned Opcode = N->getOpcode();
7147 switch (Opcode) {
7148 default:
7149 return Intrinsic::not_intrinsic;
7150 case ISD::INTRINSIC_WO_CHAIN: {
7151 unsigned IID = N->getConstantOperandVal(0);
7152 if (IID < Intrinsic::num_intrinsics)
7153 return IID;
7154 return Intrinsic::not_intrinsic;
7155 }
7156 }
7157}
7158
7158
7159 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7160 SDValue N1) const {
7161 if (!N0.hasOneUse())
7162 return false;
7163
7164 unsigned IID = getIntrinsicID(N1.getNode());
7165 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7166 if (IID == Intrinsic::aarch64_neon_umull ||
7167 N1.getOpcode() == AArch64ISD::UMULL ||
7168 IID == Intrinsic::aarch64_neon_smull ||
7169 N1.getOpcode() == AArch64ISD::SMULL)
7170 return N0.getOpcode() != ISD::ADD;
7171
7172 return true;
7173}
7174
7175/// Selects the correct CCAssignFn for a given CallingConvention value.
7176 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7177 bool IsVarArg) const {
7178 switch (CC) {
7179 default:
7180 report_fatal_error("Unsupported calling convention.");
7181 case CallingConv::GHC:
7182 return CC_AArch64_GHC;
7183 case CallingConv::PreserveNone:
7184 // The VarArg implementation makes assumptions about register
7185 // argument passing that do not hold for preserve_none, so we
7186 // instead fall back to C argument passing.
7187 // The non-vararg case is handled in the CC function itself.
7188 if (!IsVarArg)
7189 return CC_AArch64_Preserve_None;
7190 [[fallthrough]];
7191 case CallingConv::C:
7192 case CallingConv::Fast:
7193 case CallingConv::PreserveMost:
7194 case CallingConv::PreserveAll:
7195 case CallingConv::CXX_FAST_TLS:
7196 case CallingConv::Swift:
7197 case CallingConv::SwiftTail:
7198 case CallingConv::Tail:
7199 case CallingConv::GRAAL:
7200 if (Subtarget->isTargetWindows()) {
7201 if (IsVarArg) {
7202 if (Subtarget->isWindowsArm64EC())
7203 return CC_AArch64_Arm64EC_VarArg;
7204 return CC_AArch64_Win64_VarArg;
7205 }
7206 return CC_AArch64_Win64PCS;
7207 }
7208 if (!Subtarget->isTargetDarwin())
7209 return CC_AArch64_AAPCS;
7210 if (!IsVarArg)
7211 return CC_AArch64_DarwinPCS;
7212 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
7213 : CC_AArch64_DarwinPCS_VarArg;
7214 case CallingConv::Win64:
7215 if (IsVarArg) {
7216 if (Subtarget->isWindowsArm64EC())
7217 return CC_AArch64_Arm64EC_VarArg;
7218 return CC_AArch64_Win64_VarArg;
7219 }
7220 return CC_AArch64_Win64PCS;
7221 case CallingConv::CFGuard_Check:
7222 if (Subtarget->isWindowsArm64EC())
7223 return CC_AArch64_Arm64EC_CFGuard_Check;
7224 return CC_AArch64_Win64_CFGuard_Check;
7225 case CallingConv::AArch64_VectorCall:
7226 case CallingConv::AArch64_SVE_VectorCall:
7227 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0:
7228 case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2:
7229 return CC_AArch64_AAPCS;
7230 case CallingConv::ARM64EC_Thunk_X64:
7231 return CC_AArch64_Arm64EC_Thunk;
7232 case CallingConv::ARM64EC_Thunk_Native:
7233 return CC_AArch64_Arm64EC_Thunk_Native;
7234 }
7235}
7236
7237CCAssignFn *
7238 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7239 switch (CC) {
7240 default:
7241 return RetCC_AArch64_AAPCS;
7242 case CallingConv::ARM64EC_Thunk_X64:
7243 case CallingConv::ARM64EC_Thunk_Native:
7244 case CallingConv::CFGuard_Check:
7245 if (Subtarget->isWindowsArm64EC())
7246 return RetCC_AArch64_Arm64EC_Thunk;
7247 return RetCC_AArch64_AAPCS;
7248 }
7249}
7250
7251static bool isPassedInFPR(EVT VT) {
7252 return VT.isFixedLengthVector() ||
7253 (VT.isFloatingPoint() && !VT.isScalableVector());
7254}
7255
7256SDValue AArch64TargetLowering::LowerFormalArguments(
7257 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
7258 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
7259 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
7260 MachineFunction &MF = DAG.getMachineFunction();
7261 const Function &F = MF.getFunction();
7262 MachineFrameInfo &MFI = MF.getFrameInfo();
7263 bool IsWin64 =
7264 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7265 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
7266 (isVarArg && Subtarget->isWindowsArm64EC());
7267 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7268
7269 SmallVector<ISD::OutputArg, 4> Outs;
7270 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
7271 DAG.getTargetLoweringInfo(), MF.getDataLayout());
7272 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
7273 FuncInfo->setIsSVECC(true);
7274
7275 // Assign locations to all of the incoming arguments.
7276 SmallVector<CCValAssign, 16> ArgLocs;
7277 DenseMap<unsigned, SDValue> CopiedRegs;
7278 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
7279
7280 // At this point, Ins[].VT may already be promoted to i32. To correctly
7281 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
7282 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
7283 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
7284 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
7285 // LocVT.
7286 unsigned NumArgs = Ins.size();
7287 Function::const_arg_iterator CurOrigArg = F.arg_begin();
7288 unsigned CurArgIdx = 0;
7289 for (unsigned i = 0; i != NumArgs; ++i) {
7290 MVT ValVT = Ins[i].VT;
7291 if (Ins[i].isOrigArg()) {
7292 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
7293 CurArgIdx = Ins[i].getOrigArgIndex();
7294
7295 // Get type of the original argument.
7296 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
7297 /*AllowUnknown*/ true);
7298 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
7299 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7300 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7301 ValVT = MVT::i8;
7302 else if (ActualMVT == MVT::i16)
7303 ValVT = MVT::i16;
7304 }
7305 bool UseVarArgCC = false;
7306 if (IsWin64)
7307 UseVarArgCC = isVarArg;
7308 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
7309 bool Res =
7310 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
7311 assert(!Res && "Call operand has unhandled type");
7312 (void)Res;
7313 }
7314
7315 SMEAttrs Attrs(MF.getFunction());
7316 bool IsLocallyStreaming =
7317 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
7318 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
7319 SDValue Glue = Chain.getValue(1);
7320
7321 SmallVector<SDValue, 16> ArgValues;
7322 unsigned ExtraArgLocs = 0;
7323 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
7324 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
7325
7326 if (Ins[i].Flags.isByVal()) {
7327 // Byval is used for HFAs in the PCS, but the system should work in a
7328 // non-compliant manner for larger structs.
7329 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7330 int Size = Ins[i].Flags.getByValSize();
7331 unsigned NumRegs = (Size + 7) / 8;
7332
7333 // FIXME: This works on big-endian for composite byvals, which are the common
7334 // case. It should also work for fundamental types.
7335 unsigned FrameIdx =
7336 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
7337 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
7338 InVals.push_back(FrameIdxN);
7339
7340 continue;
7341 }
7342
7343 if (Ins[i].Flags.isSwiftAsync())
7344 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
7345
7346 SDValue ArgValue;
7347 if (VA.isRegLoc()) {
7348 // Arguments stored in registers.
7349 EVT RegVT = VA.getLocVT();
7350 const TargetRegisterClass *RC;
7351
7352 if (RegVT == MVT::i32)
7353 RC = &AArch64::GPR32RegClass;
7354 else if (RegVT == MVT::i64)
7355 RC = &AArch64::GPR64RegClass;
7356 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
7357 RC = &AArch64::FPR16RegClass;
7358 else if (RegVT == MVT::f32)
7359 RC = &AArch64::FPR32RegClass;
7360 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
7361 RC = &AArch64::FPR64RegClass;
7362 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
7363 RC = &AArch64::FPR128RegClass;
7364 else if (RegVT.isScalableVector() &&
7365 RegVT.getVectorElementType() == MVT::i1) {
7366 FuncInfo->setIsSVECC(true);
7367 RC = &AArch64::PPRRegClass;
7368 } else if (RegVT == MVT::aarch64svcount) {
7369 FuncInfo->setIsSVECC(true);
7370 RC = &AArch64::PPRRegClass;
7371 } else if (RegVT.isScalableVector()) {
7372 FuncInfo->setIsSVECC(true);
7373 RC = &AArch64::ZPRRegClass;
7374 } else
7375 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
7376
7377 // Transform the arguments in physical registers into virtual ones.
7378 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
7379
7380 if (IsLocallyStreaming) {
7381 // LocallyStreamingFunctions must insert the SMSTART in the correct
7382 // position, so we use Glue to ensure no instructions can be scheduled
7383 // between the chain of:
7384 // t0: ch,glue = EntryNode
7385 // t1: res,ch,glue = CopyFromReg
7386 // ...
7387 // tn: res,ch,glue = CopyFromReg t(n-1), ..
7388 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
7389 // ^^^^^^
7390 // This will be the new Chain/Root node.
7391 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
7392 Glue = ArgValue.getValue(2);
7393 if (isPassedInFPR(ArgValue.getValueType())) {
7394 ArgValue =
7395 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
7396 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
7397 {ArgValue, Glue});
7398 Glue = ArgValue.getValue(1);
7399 }
7400 } else
7401 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
7402
7403 // If this is an 8, 16 or 32-bit value, it is really passed promoted
7404 // to 64 bits. Insert an assert[sz]ext to capture this, then
7405 // truncate to the right size.
7406 switch (VA.getLocInfo()) {
7407 default:
7408 llvm_unreachable("Unknown loc info!");
7409 case CCValAssign::Full:
7410 break;
7411 case CCValAssign::Indirect:
7412 assert(
7413 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
7414 "Indirect arguments should be scalable on most subtargets");
7415 break;
7416 case CCValAssign::BCvt:
7417 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
7418 break;
7419 case CCValAssign::AExt:
7420 case CCValAssign::SExt:
7421 case CCValAssign::ZExt:
7422 break;
7423 case CCValAssign::AExtUpper:
7424 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
7425 DAG.getConstant(32, DL, RegVT));
7426 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
7427 break;
7428 }
7429 } else { // VA.isRegLoc()
7430 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
7431 unsigned ArgOffset = VA.getLocMemOffset();
7432 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
7433 ? VA.getLocVT().getSizeInBits()
7434 : VA.getValVT().getSizeInBits()) / 8;
7435
7436 uint32_t BEAlign = 0;
7437 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
7438 !Ins[i].Flags.isInConsecutiveRegs())
7439 BEAlign = 8 - ArgSize;
7440
7441 SDValue FIN;
7442 MachinePointerInfo PtrInfo;
7443 if (StackViaX4) {
7444 // In both the ARM64EC varargs convention and the thunk convention,
7445 // arguments on the stack are accessed relative to x4, not sp. In
7446 // the thunk convention, there's an additional offset of 32 bytes
7447 // to account for the shadow store.
7448 unsigned ObjOffset = ArgOffset + BEAlign;
7449 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
7450 ObjOffset += 32;
7451 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7452 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7453 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
7454 DAG.getConstant(ObjOffset, DL, MVT::i64));
7455 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
7456 } else {
7457 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
7458
7459 // Create load nodes to retrieve arguments from the stack.
7460 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
7461 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
7462 }
7463
7464 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
7465 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
7466 MVT MemVT = VA.getValVT();
7467
7468 switch (VA.getLocInfo()) {
7469 default:
7470 break;
7471 case CCValAssign::Trunc:
7472 case CCValAssign::BCvt:
7473 MemVT = VA.getLocVT();
7474 break;
7475 case CCValAssign::Indirect:
7476 assert((VA.getValVT().isScalableVT() ||
7477 Subtarget->isWindowsArm64EC()) &&
7478 "Indirect arguments should be scalable on most subtargets");
7479 MemVT = VA.getLocVT();
7480 break;
7481 case CCValAssign::SExt:
7482 ExtType = ISD::SEXTLOAD;
7483 break;
7484 case CCValAssign::ZExt:
7485 ExtType = ISD::ZEXTLOAD;
7486 break;
7487 case CCValAssign::AExt:
7488 ExtType = ISD::EXTLOAD;
7489 break;
7490 }
7491
7492 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
7493 MemVT);
7494 }
7495
7496 if (VA.getLocInfo() == CCValAssign::Indirect) {
7497 assert((VA.getValVT().isScalableVT() ||
7498 Subtarget->isWindowsArm64EC()) &&
7499 "Indirect arguments should be scalable on most subtargets");
7500
7501 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
7502 unsigned NumParts = 1;
7503 if (Ins[i].Flags.isInConsecutiveRegs()) {
7504 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
7505 ++NumParts;
7506 }
7507
7508 MVT PartLoad = VA.getValVT();
7509 SDValue Ptr = ArgValue;
7510
7511 // Ensure we generate all loads for each tuple part, whilst updating the
7512 // pointer after each load correctly using vscale.
7513 while (NumParts > 0) {
7514 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
7515 InVals.push_back(ArgValue);
7516 NumParts--;
7517 if (NumParts > 0) {
7518 SDValue BytesIncrement;
7519 if (PartLoad.isScalableVector()) {
7520 BytesIncrement = DAG.getVScale(
7521 DL, Ptr.getValueType(),
7522 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
7523 } else {
7524 BytesIncrement = DAG.getConstant(
7525 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
7526 Ptr.getValueType());
7527 }
7528 SDNodeFlags Flags;
7529 Flags.setNoUnsignedWrap(true);
7530 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
7531 BytesIncrement, Flags);
7532 ExtraArgLocs++;
7533 i++;
7534 }
7535 }
7536 } else {
7537 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
7538 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
7539 ArgValue, DAG.getValueType(MVT::i32));
7540
7541 // i1 arguments are zero-extended to i8 by the caller. Emit a
7542 // hint to reflect this.
7543 if (Ins[i].isOrigArg()) {
7544 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
7545 if (OrigArg->getType()->isIntegerTy(1)) {
7546 if (!Ins[i].Flags.isZExt()) {
7547 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
7548 ArgValue.getValueType(), ArgValue);
7549 }
7550 }
7551 }
7552
7553 InVals.push_back(ArgValue);
7554 }
7555 }
7556 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
7557
7558 // Insert the SMSTART if this is a locally streaming function and
7559 // make sure it is Glued to the last CopyFromReg value.
7560 if (IsLocallyStreaming) {
7561 SDValue PStateSM;
7562 if (Attrs.hasStreamingCompatibleInterface()) {
7563 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
7564 Register Reg = MF.getRegInfo().createVirtualRegister(
7565 getRegClassFor(PStateSM.getValueType().getSimpleVT()));
7566 FuncInfo->setPStateSMReg(Reg);
7567 Chain = DAG.getCopyToReg(Chain, DL, Reg, PStateSM);
7568 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7569 AArch64SME::IfCallerIsNonStreaming, PStateSM);
7570 } else
7571 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
7572 AArch64SME::Always);
7573
7574 // Ensure that the SMSTART happens after the CopyWithChain such that its
7575 // chain result is used.
7576 for (unsigned I=0; I<InVals.size(); ++I) {
7577 Register Reg = MF.getRegInfo().createVirtualRegister(
7578 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
7579 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
7580 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
7581 InVals[I].getValueType());
7582 }
7583 }
7584
7585 // varargs
7586 if (isVarArg) {
7587 if (!Subtarget->isTargetDarwin() || IsWin64) {
7588 // The AAPCS variadic function ABI is identical to the non-variadic
7589 // one. As a result there may be more arguments in registers and we should
7590 // save them for future reference.
7591 // Win64 variadic functions also pass arguments in registers, but all float
7592 // arguments are passed in integer registers.
7593 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
7594 }
7595
7596 // This will point to the next argument passed via stack.
7597 unsigned VarArgsOffset = CCInfo.getStackSize();
7598 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
7599 VarArgsOffset = alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
7600 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
7601 FuncInfo->setVarArgsStackIndex(
7602 MFI.CreateFixedObject(4, VarArgsOffset, true));
7603
7604 if (MFI.hasMustTailInVarArgFunc()) {
7605 SmallVector<MVT, 2> RegParmTypes;
7606 RegParmTypes.push_back(MVT::i64);
7607 RegParmTypes.push_back(MVT::f128);
7608 // Compute the set of forwarded registers. The rest are scratch.
7609 SmallVectorImpl<ForwardedRegister> &Forwards =
7610 FuncInfo->getForwardedMustTailRegParms();
7611 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
7612 CC_AArch64_AAPCS);
7613
7614 // Conservatively forward X8, since it might be used for aggregate return.
7615 if (!CCInfo.isAllocated(AArch64::X8)) {
7616 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
7617 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
7618 }
7619 }
7620 }
7621
7622 // On Windows, InReg pointers must be returned, so record the pointer in a
7623 // virtual register at the start of the function so it can be returned in the
7624 // epilogue.
7625 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
7626 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
7627 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
7628 Ins[I].Flags.isInReg()) &&
7629 Ins[I].Flags.isSRet()) {
7630 assert(!FuncInfo->getSRetReturnReg());
7631
7632 MVT PtrTy = getPointerTy(DAG.getDataLayout());
7633 Register Reg =
7634 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
7635 FuncInfo->setSRetReturnReg(Reg);
7636
7637 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
7638 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
7639 break;
7640 }
7641 }
7642 }
7643
7644 unsigned StackArgSize = CCInfo.getStackSize();
7645 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
7646 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
7647 // This is a non-standard ABI so by fiat I say we're allowed to make full
7648 // use of the stack area to be popped, which must be aligned to 16 bytes in
7649 // any case:
7650 StackArgSize = alignTo(StackArgSize, 16);
7651
7652 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
7653 // a multiple of 16.
7654 FuncInfo->setArgumentStackToRestore(StackArgSize);
7655
7656 // This realignment carries over to the available bytes below. Our own
7657 // callers will guarantee the space is free by giving an aligned value to
7658 // CALLSEQ_START.
7659 }
7660 // Even if we're not expected to free up the space, it's useful to know how
7661 // much is there while considering tail calls (because we can reuse it).
7662 FuncInfo->setBytesInStackArgArea(StackArgSize);
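// A worked example of the rounding above (illustrative sketch, not part of the
// upstream file): a fastcc callee built with GuaranteedTailCallOpt that
// receives 40 bytes of stack arguments records StackArgSize = alignTo(40, 16)
// = 48, so it pops 48 bytes on return, and later tail calls may reuse those
// 48 bytes of incoming-argument space.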
7663
7664 if (Subtarget->hasCustomCallingConv())
7665 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7666
7667 // Create a 16 Byte TPIDR2 object. The dynamic buffer
7668 // will be expanded and stored in the static object later using a pseudonode.
7669 if (SMEAttrs(MF.getFunction()).hasZAState()) {
7670 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
7671 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
7672 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7673 DAG.getConstant(1, DL, MVT::i32));
7674
7675 SDValue Buffer;
7676 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
7677 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
7678 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
7679 } else {
7680 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
7681 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
7682 DAG.getVTList(MVT::i64, MVT::Other),
7683 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
7684 MFI.CreateVariableSizedObject(Align(16), nullptr);
7685 }
7686 Chain = DAG.getNode(
7687 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
7688 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
7689 }
7690
7691 if (CallConv == CallingConv::PreserveNone) {
7692 for (const ISD::InputArg &I : Ins) {
7693 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
7694 I.Flags.isSwiftAsync()) {
7695 MachineFunction &MF = DAG.getMachineFunction();
7696 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
7697 MF.getFunction(),
7698 "Swift attributes can't be used with preserve_none",
7699 DL.getDebugLoc()));
7700 break;
7701 }
7702 }
7703 }
7704
7705 return Chain;
7706}
7707
7708void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
7709 SelectionDAG &DAG,
7710 const SDLoc &DL,
7711 SDValue &Chain) const {
7712 MachineFunction &MF = DAG.getMachineFunction();
7713 MachineFrameInfo &MFI = MF.getFrameInfo();
7714 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7715 auto PtrVT = getPointerTy(DAG.getDataLayout());
7716 Function &F = MF.getFunction();
7717 bool IsWin64 =
7718 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
7719
7720 SmallVector<SDValue, 8> MemOps;
7721
7722 auto GPRArgRegs = AArch64::getGPRArgRegs();
7723 unsigned NumGPRArgRegs = GPRArgRegs.size();
7724 if (Subtarget->isWindowsArm64EC()) {
7725 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
7726 // functions.
7727 NumGPRArgRegs = 4;
7728 }
7729 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
7730
7731 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
7732 int GPRIdx = 0;
7733 if (GPRSaveSize != 0) {
7734 if (IsWin64) {
7735 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
7736 if (GPRSaveSize & 15)
7737 // The extra size here, if triggered, will always be 8.
7738 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
7739 } else
7740 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
7741
7742 SDValue FIN;
7743 if (Subtarget->isWindowsArm64EC()) {
7744 // With the Arm64EC ABI, we reserve the save area as usual, but we
7745 // compute its address relative to x4. For a normal AArch64->AArch64
7746 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
7747 // different address.
7748 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
7749 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7750 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
7751 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
7752 } else {
7753 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
7754 }
7755
7756 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
7757 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
7758 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
7759 SDValue Store =
7760 DAG.getStore(Val.getValue(1), DL, Val, FIN,
7761 IsWin64 ? MachinePointerInfo::getFixedStack(
7762 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
7763 : MachinePointerInfo::getStack(MF, i * 8));
7764 MemOps.push_back(Store);
7765 FIN =
7766 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
7767 }
7768 }
7769 FuncInfo->setVarArgsGPRIndex(GPRIdx);
7770 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
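// Illustrative sizing example (editor's sketch, not upstream code): a Win64
// variadic function whose fixed arguments occupy x0..x2 has
// FirstVariadicGPR = 3, so GPRSaveSize = 8 * (8 - 3) = 40 bytes for spilling
// x3..x7. Since 40 & 15 == 8, an extra 8-byte fixed object is created so the
// combined save area stays 16-byte aligned for va_start to walk.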
7771
7772 if (Subtarget->hasFPARMv8() && !IsWin64) {
7773 auto FPRArgRegs = AArch64::getFPRArgRegs();
7774 const unsigned NumFPRArgRegs = FPRArgRegs.size();
7775 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
7776
7777 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
7778 int FPRIdx = 0;
7779 if (FPRSaveSize != 0) {
7780 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
7781
7782 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
7783
7784 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
7785 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
7786 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
7787
7788 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
7789 MachinePointerInfo::getStack(MF, i * 16));
7790 MemOps.push_back(Store);
7791 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
7792 DAG.getConstant(16, DL, PtrVT));
7793 }
7794 }
7795 FuncInfo->setVarArgsFPRIndex(FPRIdx);
7796 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
7797 }
7798
7799 if (!MemOps.empty()) {
7800 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
7801 }
7802}
7803
7804/// LowerCallResult - Lower the result values of a call into the
7805/// appropriate copies out of appropriate physical registers.
7806SDValue AArch64TargetLowering::LowerCallResult(
7807 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
7808 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
7809 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
7810 SDValue ThisVal, bool RequiresSMChange) const {
7811 DenseMap<unsigned, SDValue> CopiedRegs;
7812 // Copy all of the result registers out of their specified physreg.
7813 for (unsigned i = 0; i != RVLocs.size(); ++i) {
7814 CCValAssign VA = RVLocs[i];
7815
7816 // Pass 'this' value directly from the argument to return value, to avoid
7817 // reg unit interference
7818 if (i == 0 && isThisReturn) {
7819 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
7820 "unexpected return calling convention register assignment");
7821 InVals.push_back(ThisVal);
7822 continue;
7823 }
7824
7825 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
7826 // allows one use of a physreg per block.
7827 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
7828 if (!Val) {
7829 Val =
7830 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
7831 Chain = Val.getValue(1);
7832 InGlue = Val.getValue(2);
7833 CopiedRegs[VA.getLocReg()] = Val;
7834 }
7835
7836 switch (VA.getLocInfo()) {
7837 default:
7838 llvm_unreachable("Unknown loc info!");
7839 case CCValAssign::Full:
7840 break;
7841 case CCValAssign::BCvt:
7842 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
7843 break;
7844 case CCValAssign::AExtUpper:
7845 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
7846 DAG.getConstant(32, DL, VA.getLocVT()));
7847 [[fallthrough]];
7848 case CCValAssign::AExt:
7849 [[fallthrough]];
7850 case CCValAssign::ZExt:
7851 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
7852 break;
7853 }
7854
7855 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
7856 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL, Val.getValueType(),
7857 Val);
7858
7859 InVals.push_back(Val);
7860 }
7861
7862 return Chain;
7863}
7864
7865/// Return true if the calling convention is one that we can guarantee TCO for.
7866static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
7867 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
7868 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
7869}
7870
7871/// Return true if we might ever do TCO for calls with this calling convention.
7872 static bool mayTailCallThisCC(CallingConv::ID CC) {
7873 switch (CC) {
7874 case CallingConv::C:
7875 case CallingConv::AArch64_SVE_VectorCall:
7876 case CallingConv::PreserveMost:
7877 case CallingConv::PreserveAll:
7878 case CallingConv::PreserveNone:
7879 case CallingConv::Swift:
7880 case CallingConv::SwiftTail:
7881 case CallingConv::Tail:
7882 case CallingConv::Fast:
7883 return true;
7884 default:
7885 return false;
7886 }
7887}
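// Illustrative summary (editor's note, not upstream code): mayTailCallThisCC
// answers "could this callee ever be tail-called at all", while
// canGuaranteeTCO above answers the stronger question "must we tail-call it".
// For example, a 'tail' or 'swifttailcc' call site is always guaranteed TCO,
// whereas a 'fastcc' call site is only guaranteed TCO when
// GuaranteedTailCallOpt (-tailcallopt) is enabled.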
7888
7889/// Return true if the call convention supports varargs
7890/// Currently only those that pass varargs like the C
7891/// calling convention does are eligible
7892/// Calling conventions listed in this function must also
7893/// be properly handled in AArch64Subtarget::isCallingConvWin64
7894 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
7895 switch (CC) {
7896 case CallingConv::C:
7897 case CallingConv::Win64:
7898 return true;
7899 default:
7900 return false;
7901 }
7902}
7903
7904 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
7905 const AArch64Subtarget *Subtarget,
7906 const TargetLowering::CallLoweringInfo &CLI,
7907 CCState &CCInfo) {
7908 const SelectionDAG &DAG = CLI.DAG;
7909 CallingConv::ID CalleeCC = CLI.CallConv;
7910 bool IsVarArg = CLI.IsVarArg;
7911 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7912 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
7913
7914 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
7915 // for the shadow store.
7916 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
7917 CCInfo.AllocateStack(32, Align(16));
7918
7919 unsigned NumArgs = Outs.size();
7920 for (unsigned i = 0; i != NumArgs; ++i) {
7921 MVT ArgVT = Outs[i].VT;
7922 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
7923
7924 bool UseVarArgCC = false;
7925 if (IsVarArg) {
7926 // On Windows, the fixed arguments in a vararg call are passed in GPRs
7927 // too, so use the vararg CC to force them to integer registers.
7928 if (IsCalleeWin64) {
7929 UseVarArgCC = true;
7930 } else {
7931 UseVarArgCC = !Outs[i].IsFixed;
7932 }
7933 }
7934
7935 if (!UseVarArgCC) {
7936 // Get type of the original argument.
7937 EVT ActualVT =
7938 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
7939 /*AllowUnknown*/ true);
7940 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
7941 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
7942 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
7943 ArgVT = MVT::i8;
7944 else if (ActualMVT == MVT::i16)
7945 ArgVT = MVT::i16;
7946 }
7947
7948 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
7949 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
7950 assert(!Res && "Call operand has unhandled type");
7951 (void)Res;
7952 }
7953}
7954
7955bool AArch64TargetLowering::isEligibleForTailCallOptimization(
7956 const CallLoweringInfo &CLI) const {
7957 CallingConv::ID CalleeCC = CLI.CallConv;
7958 if (!mayTailCallThisCC(CalleeCC))
7959 return false;
7960
7961 SDValue Callee = CLI.Callee;
7962 bool IsVarArg = CLI.IsVarArg;
7963 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
7964 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
7965 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
7966 const SelectionDAG &DAG = CLI.DAG;
7967 MachineFunction &MF = DAG.getMachineFunction();
7968 const Function &CallerF = MF.getFunction();
7969 CallingConv::ID CallerCC = CallerF.getCallingConv();
7970
7971 // SME Streaming functions are not eligible for TCO as they may require
7972 // the streaming mode or ZA to be restored after returning from the call.
7973 SMEAttrs CallerAttrs(MF.getFunction());
7974 auto CalleeAttrs = CLI.CB ? SMEAttrs(*CLI.CB) : SMEAttrs(SMEAttrs::Normal);
7975 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
7976 CallerAttrs.requiresLazySave(CalleeAttrs) ||
7977 CallerAttrs.hasStreamingBody())
7978 return false;
7979
7980 // Functions using the C or Fast calling convention that have an SVE signature
7981 // preserve more registers and should assume the SVE_VectorCall CC.
7982 // The check for matching callee-saved regs will determine whether it is
7983 // eligible for TCO.
7984 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
7985 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
7986 CallerCC = CallingConv::AArch64_SVE_VectorCall;
7987
7988 bool CCMatch = CallerCC == CalleeCC;
7989
7990 // When using the Windows calling convention on a non-windows OS, we want
7991 // to back up and restore X18 in such functions; we can't do a tail call
7992 // from those functions.
7993 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
7994 CalleeCC != CallingConv::Win64)
7995 return false;
7996
7997 // Byval parameters hand the function a pointer directly into the stack area
7998 // we want to reuse during a tail call. Working around this *is* possible (see
7999 // X86) but less efficient and uglier in LowerCall.
8000 for (Function::const_arg_iterator i = CallerF.arg_begin(),
8001 e = CallerF.arg_end();
8002 i != e; ++i) {
8003 if (i->hasByValAttr())
8004 return false;
8005
8006 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
8007 // In this case, it is necessary to save/restore X0 in the callee. Tail
8008 // call opt interferes with this. So we disable tail call opt when the
8009 // caller has an argument with "inreg" attribute.
8010
8011 // FIXME: Check whether the callee also has an "inreg" argument.
8012 if (i->hasInRegAttr())
8013 return false;
8014 }
8015
8016 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
8017 return CCMatch;
8018
8019 // Externally-defined functions with weak linkage should not be
8020 // tail-called on AArch64 when the OS does not support dynamic
8021 // pre-emption of symbols, as the AAELF spec requires normal calls
8022 // to undefined weak functions to be replaced with a NOP or jump to the
8023 // next instruction. The behaviour of branch instructions in this
8024 // situation (as used for tail calls) is implementation-defined, so we
8025 // cannot rely on the linker replacing the tail call with a return.
8026 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8027 const GlobalValue *GV = G->getGlobal();
8028 const Triple &TT = getTargetMachine().getTargetTriple();
8029 if (GV->hasExternalWeakLinkage() &&
8030 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
8031 return false;
8032 }
8033
8034 // Now we search for cases where we can use a tail call without changing the
8035 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
8036 // concept.
8037
8038 // I want anyone implementing a new calling convention to think long and hard
8039 // about this assert.
8040 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
8041 report_fatal_error("Unsupported variadic calling convention");
8042
8043 LLVMContext &C = *DAG.getContext();
8044 // Check that the call results are passed in the same way.
8045 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
8046 CCAssignFnForCall(CalleeCC, IsVarArg),
8047 CCAssignFnForCall(CallerCC, IsVarArg)))
8048 return false;
8049 // The callee has to preserve all registers the caller needs to preserve.
8050 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8051 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
8052 if (!CCMatch) {
8053 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
8054 if (Subtarget->hasCustomCallingConv()) {
8055 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
8056 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
8057 }
8058 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
8059 return false;
8060 }
8061
8062 // Nothing more to check if the callee is taking no arguments
8063 if (Outs.empty())
8064 return true;
8065
8066 SmallVector<CCValAssign, 16> ArgLocs;
8067 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
8068
8069 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8070
8071 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
8072 // When we are musttail, additional checks have been done and we can safely ignore this check
8073 // At least two cases here: if caller is fastcc then we can't have any
8074 // memory arguments (we'd be expected to clean up the stack afterwards). If
8075 // caller is C then we could potentially use its argument area.
8076
8077 // FIXME: for now we take the most conservative of these in both cases:
8078 // disallow all variadic memory operands.
8079 for (const CCValAssign &ArgLoc : ArgLocs)
8080 if (!ArgLoc.isRegLoc())
8081 return false;
8082 }
8083
8084 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8085
8086 // If any of the arguments is passed indirectly, it must be SVE, so the
8087 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
8088 // allocate space on the stack. That is why we determine explicitly here that
8089 // the call cannot be a tail call.
8090 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
8091 assert((A.getLocInfo() != CCValAssign::Indirect ||
8092 A.getValVT().isScalableVector() ||
8093 Subtarget->isWindowsArm64EC()) &&
8094 "Expected value to be scalable");
8095 return A.getLocInfo() == CCValAssign::Indirect;
8096 }))
8097 return false;
8098
8099 // If the stack arguments for this call do not fit into our own save area then
8100 // the call cannot be made tail.
8101 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
8102 return false;
8103
8104 const MachineRegisterInfo &MRI = MF.getRegInfo();
8105 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
8106 return false;
8107
8108 return true;
8109}
8110
8111SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
8112 SelectionDAG &DAG,
8113 MachineFrameInfo &MFI,
8114 int ClobberedFI) const {
8115 SmallVector<SDValue, 8> ArgChains;
8116 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
8117 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
8118
8119 // Include the original chain at the beginning of the list. When this is
8120 // used by target LowerCall hooks, this helps legalize find the
8121 // CALLSEQ_BEGIN node.
8122 ArgChains.push_back(Chain);
8123
8124 // Add a chain value for each stack argument corresponding
8125 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
8126 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
8127 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
8128 if (FI->getIndex() < 0) {
8129 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
8130 int64_t InLastByte = InFirstByte;
8131 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
8132
8133 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
8134 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
8135 ArgChains.push_back(SDValue(L, 1));
8136 }
8137
8138 // Build a tokenfactor for all the chains.
8139 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
8140}
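// Worked example (illustrative only): suppose the tail call overwrites the
// caller's incoming stack slot ClobberedFI covering bytes [16, 23], and the
// entry node feeds a load from another fixed slot covering bytes [20, 27].
// The two byte ranges overlap, so that load's chain is added to the
// TokenFactor and the load is forced to complete before the store that
// reuses those bytes.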
8141
8142bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
8143 bool TailCallOpt) const {
8144 return (CallCC == CallingConv::Fast && TailCallOpt) ||
8145 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
8146}
8147
8148// Check if the value is zero-extended from i1 to i8
8149static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
8150 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
8151 if (SizeInBits < 8)
8152 return false;
8153
8154 APInt RequiredZero(SizeInBits, 0xFE);
8155 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
8156 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
8157 return ZExtBool;
8158}
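// Example of the check above (illustrative only): for an i8 argument, the mask
// 0xFE covers bits 7..1. If computeKnownBits proves those bits are all zero
// (say the value came from 'and w0, w0, #0x1' or from a setcc), the value can
// only be 0 or 1, so it is already a correctly zero-extended bool and the
// caller-side truncate/zero-extend pair can be skipped.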
8159
8160void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
8161 SDNode *Node) const {
8162 // Live-in physreg copies that are glued to SMSTART are applied as
8163 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
8164 // register allocator to pass call args in callee saved regs, without extra
8165 // copies to avoid these fake clobbers of actually-preserved GPRs.
8166 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
8167 MI.getOpcode() == AArch64::MSRpstatePseudo) {
8168 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
8169 if (MachineOperand &MO = MI.getOperand(I);
8170 MO.isReg() && MO.isImplicit() && MO.isDef() &&
8171 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
8172 AArch64::GPR64RegClass.contains(MO.getReg())))
8173 MI.removeOperand(I);
8174
8175 // The SVE vector length can change when entering/leaving streaming mode.
8176 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
8177 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
8178 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8179 /*IsImplicit=*/true));
8180 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
8181 /*IsImplicit=*/true));
8182 }
8183 }
8184
8185 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
8186 // have nothing to do with VG, were it not that they are used to materialise a
8187 // frame-address. If they contain a frame-index to a scalable vector, this
8188 // will likely require an ADDVL instruction to materialise the address, thus
8189 // reading VG.
8190 const MachineFunction &MF = *MI.getMF();
8191 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
8192 (MI.getOpcode() == AArch64::ADDXri ||
8193 MI.getOpcode() == AArch64::SUBXri)) {
8194 const MachineOperand &MO = MI.getOperand(1);
8195 if (MO.isFI() && MF.getFrameInfo().getStackID(MO.getIndex()) ==
8196 TargetStackID::ScalableVector)
8197 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
8198 /*IsImplicit=*/true));
8199 }
8200}
8201
8202 SDValue AArch64TargetLowering::changeStreamingMode(SelectionDAG &DAG, SDLoc DL,
8203 bool Enable, SDValue Chain,
8204 SDValue InGlue,
8205 unsigned Condition,
8206 SDValue PStateSM) const {
8207 MachineFunction &MF = DAG.getMachineFunction();
8208 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8209 FuncInfo->setHasStreamingModeChanges(true);
8210
8211 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8212 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
8213 SDValue MSROp =
8214 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
8215 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
8216 SmallVector<SDValue> Ops = {Chain, MSROp, ConditionOp};
8217 if (Condition != AArch64SME::Always) {
8218 assert(PStateSM && "PStateSM should be defined");
8219 Ops.push_back(PStateSM);
8220 }
8221 Ops.push_back(RegMask);
8222
8223 if (InGlue)
8224 Ops.push_back(InGlue);
8225
8226 unsigned Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
8227 return DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
8228}
8229
8230static unsigned getSMCondition(const SMEAttrs &CallerAttrs,
8231 const SMEAttrs &CalleeAttrs) {
8232 if (!CallerAttrs.hasStreamingCompatibleInterface() ||
8233 CallerAttrs.hasStreamingBody())
8234 return AArch64SME::Always;
8235 if (CalleeAttrs.hasNonStreamingInterface())
8236 return AArch64SME::IfCallerIsStreaming;
8237 if (CalleeAttrs.hasStreamingInterface())
8238 return AArch64SME::IfCallerIsNonStreaming;
8239
8240 llvm_unreachable("Unsupported attributes");
8241}
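// Mapping sketch (illustrative, not upstream code): a streaming-compatible
// caller with a non-streaming body does not know PSTATE.SM statically, so the
// mode switch around a call is made conditional on the runtime PStateSM value.
// Calling a non-streaming callee only needs the switch when the caller is
// currently streaming (IfCallerIsStreaming); calling a streaming callee only
// needs it when the caller is currently non-streaming (IfCallerIsNonStreaming).
// Every other caller knows its mode, so the change is unconditional (Always).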
8242
8243/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
8244/// and add input and output parameter nodes.
8245SDValue
8246AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8247 SmallVectorImpl<SDValue> &InVals) const {
8248 SelectionDAG &DAG = CLI.DAG;
8249 SDLoc &DL = CLI.DL;
8250 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8251 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
8252 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
8253 SDValue Chain = CLI.Chain;
8254 SDValue Callee = CLI.Callee;
8255 bool &IsTailCall = CLI.IsTailCall;
8256 CallingConv::ID &CallConv = CLI.CallConv;
8257 bool IsVarArg = CLI.IsVarArg;
8258
8261 bool IsThisReturn = false;
8262
8264 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8265 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
8266 bool IsSibCall = false;
8267 bool GuardWithBTI = false;
8268
8269 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
8270 !Subtarget->noBTIAtReturnTwice()) {
8271 GuardWithBTI = FuncInfo->branchTargetEnforcement();
8272 }
8273
8274 // Analyze operands of the call, assigning locations to each operand.
8275 SmallVector<CCValAssign, 16> ArgLocs;
8276 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
8277
8278 if (IsVarArg) {
8279 unsigned NumArgs = Outs.size();
8280
8281 for (unsigned i = 0; i != NumArgs; ++i) {
8282 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
8283 report_fatal_error("Passing SVE types to variadic functions is "
8284 "currently not supported");
8285 }
8286 }
8287
8288 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
8289
8290 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
8291 // Assign locations to each value returned by this call.
8292 SmallVector<CCValAssign, 16> RVLocs;
8293 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
8294 *DAG.getContext());
8295 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
8296
8297 // Check callee args/returns for SVE registers and set calling convention
8298 // accordingly.
8299 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
8300 auto HasSVERegLoc = [](CCValAssign &Loc) {
8301 if (!Loc.isRegLoc())
8302 return false;
8303 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
8304 AArch64::PPRRegClass.contains(Loc.getLocReg());
8305 };
8306 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
8307 CallConv = CallingConv::AArch64_SVE_VectorCall;
8308 }
8309
8310 if (IsTailCall) {
8311 // Check if it's really possible to do a tail call.
8312 IsTailCall = isEligibleForTailCallOptimization(CLI);
8313
8314 // A sibling call is one where we're under the usual C ABI and not planning
8315 // to change that but can still do a tail call:
8316 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
8317 CallConv != CallingConv::SwiftTail)
8318 IsSibCall = true;
8319
8320 if (IsTailCall)
8321 ++NumTailCalls;
8322 }
8323
8324 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
8325 report_fatal_error("failed to perform tail call elimination on a call "
8326 "site marked musttail");
8327
8328 // Get a count of how many bytes are to be pushed on the stack.
8329 unsigned NumBytes = CCInfo.getStackSize();
8330
8331 if (IsSibCall) {
8332 // Since we're not changing the ABI to make this a tail call, the memory
8333 // operands are already available in the caller's incoming argument space.
8334 NumBytes = 0;
8335 }
8336
8337 // FPDiff is the byte offset of the call's argument area from the callee's.
8338 // Stores to callee stack arguments will be placed in FixedStackSlots offset
8339 // by this amount for a tail call. In a sibling call it must be 0 because the
8340 // caller will deallocate the entire stack and the callee still expects its
8341 // arguments to begin at SP+0. Completely unused for non-tail calls.
8342 int FPDiff = 0;
8343
8344 if (IsTailCall && !IsSibCall) {
8345 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
8346
8347 // Since callee will pop argument stack as a tail call, we must keep the
8348 // popped size 16-byte aligned.
8349 NumBytes = alignTo(NumBytes, 16);
8350
8351 // FPDiff will be negative if this tail call requires more space than we
8352 // would automatically have in our incoming argument space. Positive if we
8353 // can actually shrink the stack.
8354 FPDiff = NumReusableBytes - NumBytes;
8355
8356 // Update the required reserved area if this is the tail call requiring the
8357 // most argument stack space.
8358 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
8359 FuncInfo->setTailCallReservedStack(-FPDiff);
8360
8361 // The stack pointer must be 16-byte aligned at all times it's used for a
8362 // memory operation, which in practice means at *all* times and in
8363 // particular across call boundaries. Therefore our own arguments started at
8364 // a 16-byte aligned SP and the delta applied for the tail call should
8365 // satisfy the same constraint.
8366 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
8367 }
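// Worked example (illustrative only): if the caller reserved 32 bytes for its
// own incoming stack arguments (NumReusableBytes = 32) and this tail call
// needs 48 bytes after alignment (NumBytes = 48), then FPDiff = 32 - 48 = -16:
// the callee's argument area starts 16 bytes below the caller's, and
// TailCallReservedStack is raised to at least 16 so the prologue keeps that
// extra space available.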
8368
8369 // Determine whether we need any streaming mode changes.
8370 SMEAttrs CalleeAttrs, CallerAttrs(MF.getFunction());
8371 if (CLI.CB)
8372 CalleeAttrs = SMEAttrs(*CLI.CB);
8373 else if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8374 CalleeAttrs = SMEAttrs(ES->getSymbol());
8375
8376 auto DescribeCallsite =
8378 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
8379 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8380 R << ore::NV("Callee", ES->getSymbol());
8381 else if (CLI.CB && CLI.CB->getCalledFunction())
8382 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
8383 else
8384 R << "unknown callee";
8385 R << "'";
8386 return R;
8387 };
8388
8389 bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8390 if (RequiresLazySave) {
8391 const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8392 MachinePointerInfo MPI =
8394 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8395 TPIDR2.FrameIndex,
8397 SDValue NumZaSaveSlicesAddr =
8398 DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
8399 DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
8400 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8401 DAG.getConstant(1, DL, MVT::i32));
8402 Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
8403 MPI, MVT::i16);
8404 Chain = DAG.getNode(
8405 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8406 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8407 TPIDR2ObjAddr);
8409 ORE.emit([&]() {
8410 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8411 CLI.CB)
8412 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
8413 &MF.getFunction());
8414 return DescribeCallsite(R) << " sets up a lazy save for ZA";
8415 });
8416 }
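// Layout sketch of the 16-byte TPIDR2 block used above (an informal summary
// derived from this code and the earlier INIT_TPIDR2OBJ setup): bytes 0-7 hold
// the pointer to the ZA save buffer, and the 16-bit value stored at offset 8
// holds the number of ZA save slices (RDSVL #1). TPIDR2_EL0 is then pointed at
// this block so the lazy-save machinery (including the __arm_tpidr2_restore
// call emitted after the call returns) can find it.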
8417
8418 SDValue PStateSM;
8419 bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
8420 if (RequiresSMChange) {
8421 if (CallerAttrs.hasStreamingInterfaceOrBody())
8422 PStateSM = DAG.getConstant(1, DL, MVT::i64);
8423 else if (CallerAttrs.hasNonStreamingInterface())
8424 PStateSM = DAG.getConstant(0, DL, MVT::i64);
8425 else
8426 PStateSM = getRuntimePStateSM(DAG, Chain, DL, MVT::i64);
8428 ORE.emit([&]() {
8429 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
8430 CLI.CB)
8431 : OptimizationRemarkAnalysis("sme", "SMETransition",
8432 &MF.getFunction());
8433 DescribeCallsite(R) << " requires a streaming mode transition";
8434 return R;
8435 });
8436 }
8437
8438 SDValue ZTFrameIdx;
8439 MachineFrameInfo &MFI = MF.getFrameInfo();
8440 bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs);
8441
8442 // If the caller has ZT0 state which will not be preserved by the callee,
8443 // spill ZT0 before the call.
8444 if (ShouldPreserveZT0) {
8445 unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16));
8446 ZTFrameIdx = DAG.getFrameIndex(
8447 ZTObj,
8449
8450 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
8451 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8452 }
8453
8454 // If caller shares ZT0 but the callee is not shared ZA, we need to stop
8455 // PSTATE.ZA before the call if there is no lazy-save active.
8456 bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs);
8457 assert((!DisableZA || !RequiresLazySave) &&
8458 "Lazy-save should have PSTATE.SM=1 on entry to the function");
8459
8460 if (DisableZA)
8461 Chain = DAG.getNode(
8462 AArch64ISD::SMSTOP, DL, MVT::Other, Chain,
8463 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8464 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8465
8466 // Adjust the stack pointer for the new arguments...
8467 // These operations are automatically eliminated by the prolog/epilog pass
8468 if (!IsSibCall)
8469 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
8470
8471 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
8472 getPointerTy(DAG.getDataLayout()));
8473
8474 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
8475 SmallSet<unsigned, 8> RegsUsed;
8476 SmallVector<SDValue, 8> MemOpChains;
8477 auto PtrVT = getPointerTy(DAG.getDataLayout());
8478
8479 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
8480 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
8481 for (const auto &F : Forwards) {
8482 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
8483 RegsToPass.emplace_back(F.PReg, Val);
8484 }
8485 }
8486
8487 // Walk the register/memloc assignments, inserting copies/loads.
8488 unsigned ExtraArgLocs = 0;
8489 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
8490 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8491 SDValue Arg = OutVals[i];
8492 ISD::ArgFlagsTy Flags = Outs[i].Flags;
8493
8494 // Promote the value if needed.
8495 switch (VA.getLocInfo()) {
8496 default:
8497 llvm_unreachable("Unknown loc info!");
8498 case CCValAssign::Full:
8499 break;
8500 case CCValAssign::SExt:
8501 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
8502 break;
8503 case CCValAssign::ZExt:
8504 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
8505 break;
8506 case CCValAssign::AExt:
8507 if (Outs[i].ArgVT == MVT::i1) {
8508 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
8509 //
8510 // Check if we actually have to do this, because the value may
8511 // already be zero-extended.
8512 //
8513 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
8514 // and rely on DAGCombiner to fold this, because the following
8515 // (anyext i32) is combined with (zext i8) in DAG.getNode:
8516 //
8517 // (ext (zext x)) -> (zext x)
8518 //
8519 // This will give us (zext i32), which we cannot remove, so
8520 // try to check this beforehand.
8521 if (!checkZExtBool(Arg, DAG)) {
8522 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
8523 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
8524 }
8525 }
8526 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8527 break;
8529 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
8530 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
8531 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
8532 DAG.getConstant(32, DL, VA.getLocVT()));
8533 break;
8534 case CCValAssign::BCvt:
8535 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
8536 break;
8537 case CCValAssign::Trunc:
8538 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
8539 break;
8540 case CCValAssign::FPExt:
8541 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
8542 break;
8543 case CCValAssign::Indirect: {
8544 bool isScalable = VA.getValVT().isScalableVT();
8545 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
8546 "Indirect arguments should be scalable on most subtargets");
8547
8548 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
8549 uint64_t PartSize = StoreSize;
8550 unsigned NumParts = 1;
8551 if (Outs[i].Flags.isInConsecutiveRegs()) {
8552 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8553 ++NumParts;
8554 StoreSize *= NumParts;
8555 }
8556
8557 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
8558 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
8559 MachineFrameInfo &MFI = MF.getFrameInfo();
8560 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
8561 if (isScalable)
8562 MFI.setStackID(FI, TargetStackID::ScalableVector);
8563
8564 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
8565 SDValue Ptr = DAG.getFrameIndex(
8566 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8567 SDValue SpillSlot = Ptr;
8568
8569 // Ensure we generate all stores for each tuple part, whilst updating the
8570 // pointer after each store correctly using vscale.
8571 while (NumParts) {
8572 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
8573 MemOpChains.push_back(Store);
8574
8575 NumParts--;
8576 if (NumParts > 0) {
8577 SDValue BytesIncrement;
8578 if (isScalable) {
8579 BytesIncrement = DAG.getVScale(
8580 DL, Ptr.getValueType(),
8581 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8582 } else {
8583 BytesIncrement = DAG.getConstant(
8584 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8585 Ptr.getValueType());
8586 }
8587 SDNodeFlags Flags;
8588 Flags.setNoUnsignedWrap(true);
8589
8590 MPI = MachinePointerInfo(MPI.getAddrSpace());
8591 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8592 BytesIncrement, Flags);
8593 ExtraArgLocs++;
8594 i++;
8595 }
8596 }
8597
8598 Arg = SpillSlot;
8599 break;
8600 }
8601
8602 if (VA.isRegLoc()) {
8603 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
8604 Outs[0].VT == MVT::i64) {
8605 assert(VA.getLocVT() == MVT::i64 &&
8606 "unexpected calling convention register assignment");
8607 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
8608 "unexpected use of 'returned'");
8609 IsThisReturn = true;
8610 }
8611 if (RegsUsed.count(VA.getLocReg())) {
8612 // If this register has already been used then we're trying to pack
8613 // parts of an [N x i32] into an X-register. The extension type will
8614 // take care of putting the two halves in the right place but we have to
8615 // combine them.
8616 SDValue &Bits =
8617 llvm::find_if(RegsToPass,
8618 [=](const std::pair<unsigned, SDValue> &Elt) {
8619 return Elt.first == VA.getLocReg();
8620 })
8621 ->second;
8622 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
8623 // Call site info is used for function's parameter entry value
8624 // tracking. For now we track only simple cases when parameter
8625 // is transferred through whole register.
8627 [&VA](MachineFunction::ArgRegPair ArgReg) {
8628 return ArgReg.Reg == VA.getLocReg();
8629 });
8630 } else {
8631 // Add an extra level of indirection for streaming mode changes by
8632 // using a pseudo copy node that cannot be rematerialised between a
8633 // smstart/smstop and the call by the simple register coalescer.
8634 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
8635 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8636 Arg.getValueType(), Arg);
8637 RegsToPass.emplace_back(VA.getLocReg(), Arg);
8638 RegsUsed.insert(VA.getLocReg());
8639 const TargetOptions &Options = DAG.getTarget().Options;
8640 if (Options.EmitCallSiteInfo)
8641 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
8642 }
8643 } else {
8644 assert(VA.isMemLoc());
8645
8646 SDValue DstAddr;
8647 MachinePointerInfo DstInfo;
8648
8649 // FIXME: This works on big-endian for composite byvals, which are the
8650 // common case. It should also work for fundamental types too.
8651 uint32_t BEAlign = 0;
8652 unsigned OpSize;
8653 if (VA.getLocInfo() == CCValAssign::Indirect ||
8654 VA.getValVT().isScalableVector())
8655 OpSize = VA.getLocVT().getFixedSizeInBits();
8656 else
8657 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
8658 : VA.getValVT().getSizeInBits();
8659 OpSize = (OpSize + 7) / 8;
8660 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
8661 !Flags.isInConsecutiveRegs()) {
8662 if (OpSize < 8)
8663 BEAlign = 8 - OpSize;
8664 }
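// Example of the big-endian adjustment (illustrative only): an i32 passed in
// an 8-byte stack slot on a big-endian target is stored at the upper end of
// the slot, so OpSize = 4 and BEAlign = 8 - 4 = 4 is added to the offset;
// byvals and consecutive-register pieces already have the right layout and
// are left alone.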
8665 unsigned LocMemOffset = VA.getLocMemOffset();
8666 int32_t Offset = LocMemOffset + BEAlign;
8667 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8668 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8669
8670 if (IsTailCall) {
8671 Offset = Offset + FPDiff;
8672 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
8673
8674 DstAddr = DAG.getFrameIndex(FI, PtrVT);
8675 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
8676
8677 // Make sure any stack arguments overlapping with where we're storing
8678 // are loaded before this eventual operation. Otherwise they'll be
8679 // clobbered.
8680 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
8681 } else {
8682 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
8683
8684 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
8685 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
8686 }
8687
8688 if (Outs[i].Flags.isByVal()) {
8689 SDValue SizeNode =
8690 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
8691 SDValue Cpy = DAG.getMemcpy(
8692 Chain, DL, DstAddr, Arg, SizeNode,
8693 Outs[i].Flags.getNonZeroByValAlign(),
8694 /*isVol = */ false, /*AlwaysInline = */ false,
8695 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
8696
8697 MemOpChains.push_back(Cpy);
8698 } else {
8699 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
8700 // promoted to a legal register type i32, we should truncate Arg back to
8701 // i1/i8/i16.
8702 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
8703 VA.getValVT() == MVT::i16)
8704 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
8705
8706 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
8707 MemOpChains.push_back(Store);
8708 }
8709 }
8710 }
8711
8712 if (IsVarArg && Subtarget->isWindowsArm64EC()) {
8713 SDValue ParamPtr = StackPtr;
8714 if (IsTailCall) {
8715 // Create a dummy object at the top of the stack that can be used to get
8716 // the SP after the epilogue
8717 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
8718 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
8719 }
8720
8721 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
8722 // describing the argument list. x4 contains the address of the
8723 // first stack parameter. x5 contains the size in bytes of all parameters
8724 // passed on the stack.
8725 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
8726 RegsToPass.emplace_back(AArch64::X5,
8727 DAG.getConstant(NumBytes, DL, MVT::i64));
8728 }
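// Illustrative example (editor's sketch, not upstream code): for an Arm64EC
// vararg call such as printf("%d %f", i, d), the arguments are assigned with
// the vararg convention, x4 is loaded with the address of the first
// stack-passed argument (or of a dummy slot above FPDiff when tail-calling),
// and x5 carries the total number of bytes of stack-passed arguments, i.e.
// NumBytes here.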
8729
8730 if (!MemOpChains.empty())
8731 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
8732
8733 SDValue InGlue;
8734 if (RequiresSMChange) {
8735
8736 Chain = DAG.getNode(AArch64ISD::VG_SAVE, DL,
8737 DAG.getVTList(MVT::Other, MVT::Glue), Chain);
8738 InGlue = Chain.getValue(1);
8739
8740 SDValue NewChain = changeStreamingMode(
8741 DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain, InGlue,
8742 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8743 Chain = NewChain.getValue(0);
8744 InGlue = NewChain.getValue(1);
8745 }
8746
8747 // Build a sequence of copy-to-reg nodes chained together with token chain
8748 // and flag operands which copy the outgoing args into the appropriate regs.
8749 for (auto &RegToPass : RegsToPass) {
8750 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
8751 RegToPass.second, InGlue);
8752 InGlue = Chain.getValue(1);
8753 }
8754
8755 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
8756 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
8757 // node so that legalize doesn't hack it.
8758 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
8759 auto GV = G->getGlobal();
8760 unsigned OpFlags =
8761 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
8762 if (OpFlags & AArch64II::MO_GOT) {
8763 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8764 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8765 } else {
8766 const GlobalValue *GV = G->getGlobal();
8767 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
8768 }
8769 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
8770 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
8771 Subtarget->isTargetMachO()) ||
8773 const char *Sym = S->getSymbol();
8774 if (UseGot) {
8776 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
8777 } else {
8778 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
8779 }
8780 }
8781
8782 // We don't usually want to end the call-sequence here because we would tidy
8783 // the frame up *after* the call, however in the ABI-changing tail-call case
8784 // we've carefully laid out the parameters so that when sp is reset they'll be
8785 // in the correct location.
8786 if (IsTailCall && !IsSibCall) {
8787 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
8788 InGlue = Chain.getValue(1);
8789 }
8790
8791 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
8792
8793 std::vector<SDValue> Ops;
8794 Ops.push_back(Chain);
8795 Ops.push_back(Callee);
8796
8797 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
8798 // be expanded to the call, directly followed by a special marker sequence and
8799 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
8800 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
8801 assert(!IsTailCall &&
8802 "tail calls cannot be marked with clang.arc.attachedcall");
8804
8805 // Add a target global address for the retainRV/claimRV runtime function
8806 // just before the call target.
8807 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
8808 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
8809 Ops.insert(Ops.begin() + 1, GA);
8810 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
8811 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
8812 } else if (GuardWithBTI) {
8813 Opc = AArch64ISD::CALL_BTI;
8814 }
8815
8816 if (IsTailCall) {
8817 // Each tail call may have to adjust the stack by a different amount, so
8818 // this information must travel along with the operation for eventual
8819 // consumption by emitEpilogue.
8820 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
8821 }
8822
8823 if (CLI.PAI) {
8824 const uint64_t Key = CLI.PAI->Key;
8825 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
8826 "Invalid auth call key");
8827
8828 // Split the discriminator into address/integer components.
8829 SDValue AddrDisc, IntDisc;
8830 std::tie(IntDisc, AddrDisc) =
8831 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
8832
8833 if (Opc == AArch64ISD::CALL_RVMARKER)
8834 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
8835 else
8836 Opc = AArch64ISD::AUTH_CALL;
8837 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
8838 Ops.push_back(IntDisc);
8839 Ops.push_back(AddrDisc);
8840 }
8841
8842 // Add argument registers to the end of the list so that they are known live
8843 // into the call.
8844 for (auto &RegToPass : RegsToPass)
8845 Ops.push_back(DAG.getRegister(RegToPass.first,
8846 RegToPass.second.getValueType()));
8847
8848 // Add a register mask operand representing the call-preserved registers.
8849 const uint32_t *Mask;
8850 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
8851 if (IsThisReturn) {
8852 // For 'this' returns, use the X0-preserving mask if applicable
8853 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
8854 if (!Mask) {
8855 IsThisReturn = false;
8856 Mask = TRI->getCallPreservedMask(MF, CallConv);
8857 }
8858 } else
8859 Mask = TRI->getCallPreservedMask(MF, CallConv);
8860
8861 if (Subtarget->hasCustomCallingConv())
8862 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
8863
8864 if (TRI->isAnyArgRegReserved(MF))
8865 TRI->emitReservedArgRegCallError(MF);
8866
8867 assert(Mask && "Missing call preserved mask for calling convention");
8868 Ops.push_back(DAG.getRegisterMask(Mask));
8869
8870 if (InGlue.getNode())
8871 Ops.push_back(InGlue);
8872
8873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
8874
8875 // If we're doing a tail call, use a TC_RETURN here rather than an
8876 // actual call instruction.
8877 if (IsTailCall) {
8878 MF.getFrameInfo().setHasTailCall();
8879 SDValue Ret = DAG.getNode(Opc, DL, NodeTys, Ops);
8880 if (IsCFICall)
8881 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8882
8883 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
8884 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
8885 return Ret;
8886 }
8887
8888 // Returns a chain and a flag for retval copy to use.
8889 Chain = DAG.getNode(Opc, DL, NodeTys, Ops);
8890 if (IsCFICall)
8891 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
8892
8893 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
8894 InGlue = Chain.getValue(1);
8895 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
8896
8897 uint64_t CalleePopBytes =
8898 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
8899
8900 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
8901 InGlue = Chain.getValue(1);
8902
8903 // Handle result values, copying them out of physregs into vregs that we
8904 // return.
8905 SDValue Result = LowerCallResult(
8906 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
8907 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
8908
8909 if (!Ins.empty())
8910 InGlue = Result.getValue(Result->getNumValues() - 1);
8911
8912 if (RequiresSMChange) {
8913 assert(PStateSM && "Expected a PStateSM to be set");
8914 Result = changeStreamingMode(
8915 DAG, DL, !CalleeAttrs.hasStreamingInterface(), Result, InGlue,
8916 getSMCondition(CallerAttrs, CalleeAttrs), PStateSM);
8917 InGlue = Result.getValue(1);
8918
8919 Result =
8920 DAG.getNode(AArch64ISD::VG_RESTORE, DL,
8921 DAG.getVTList(MVT::Other, MVT::Glue), {Result, InGlue});
8922 }
8923
8924 if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs))
8925 // Unconditionally resume ZA.
8926 Result = DAG.getNode(
8927 AArch64ISD::SMSTART, DL, MVT::Other, Result,
8928 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
8929 DAG.getConstant(AArch64SME::Always, DL, MVT::i64));
8930
8931 if (ShouldPreserveZT0)
8932 Result =
8933 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8934 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
8935
8936 if (RequiresLazySave) {
8937 // Conditionally restore the lazy save using a pseudo node.
8938 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8939 SDValue RegMask = DAG.getRegisterMask(
8940 TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8941 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8942 "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
8943 SDValue TPIDR2_EL0 = DAG.getNode(
8944 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
8945 DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8946
8947 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8948 // RESTORE_ZA pseudo.
8949 SDValue Glue;
8950 SDValue TPIDR2Block = DAG.getFrameIndex(
8951 TPIDR2.FrameIndex,
8953 Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8954 Result =
8955 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8956 {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8957 RestoreRoutine, RegMask, Result.getValue(1)});
8958
8959 // Finally reset the TPIDR2_EL0 register to 0.
8960 Result = DAG.getNode(
8961 ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8962 DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8963 DAG.getConstant(0, DL, MVT::i64));
8964 TPIDR2.Uses++;
8965 }
8966
8967 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
8968 for (unsigned I = 0; I < InVals.size(); ++I) {
8969 // The smstart/smstop is chained as part of the call, but when the
8970 // resulting chain is discarded (which happens when the call is not part
8971 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
8972 // smstart/smstop is chained to the result value. We can do that by doing
8973 // a vreg -> vreg copy.
8974 Register Reg = MF.getRegInfo().createVirtualRegister(
8975 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8976 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
8977 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
8978 InVals[I].getValueType());
8979 }
8980 }
8981
8982 if (CallConv == CallingConv::PreserveNone) {
8983 for (const ISD::OutputArg &O : Outs) {
8984 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
8985 O.Flags.isSwiftAsync()) {
8988 MF.getFunction(),
8989 "Swift attributes can't be used with preserve_none",
8990 DL.getDebugLoc()));
8991 break;
8992 }
8993 }
8994 }
8995
8996 return Result;
8997}
8998
8999bool AArch64TargetLowering::CanLowerReturn(
9000 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
9001 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
9002 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9003 SmallVector<CCValAssign, 16> RVLocs;
9004 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
9005 return CCInfo.CheckReturn(Outs, RetCC);
9006}
9007
9008SDValue
9009AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
9010 bool isVarArg,
9011 const SmallVectorImpl<ISD::OutputArg> &Outs,
9012 const SmallVectorImpl<SDValue> &OutVals,
9013 const SDLoc &DL, SelectionDAG &DAG) const {
9014 auto &MF = DAG.getMachineFunction();
9015 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9016
9017 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9018 SmallVector<CCValAssign, 16> RVLocs;
9019 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
9020 CCInfo.AnalyzeReturn(Outs, RetCC);
9021
9022 // Copy the result values into the output registers.
9023 SDValue Glue;
9024 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
9025 SmallSet<unsigned, 4> RegsUsed;
9026 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
9027 ++i, ++realRVLocIdx) {
9028 CCValAssign &VA = RVLocs[i];
9029 assert(VA.isRegLoc() && "Can only return in registers!");
9030 SDValue Arg = OutVals[realRVLocIdx];
9031
9032 switch (VA.getLocInfo()) {
9033 default:
9034 llvm_unreachable("Unknown loc info!");
9035 case CCValAssign::Full:
9036 if (Outs[i].ArgVT == MVT::i1) {
9037 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
9038 // value. This is strictly redundant on Darwin (which uses "zeroext
9039 // i1"), but will be optimised out before ISel.
9040 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9041 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9042 }
9043 break;
9044 case CCValAssign::BCvt:
9045 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
9046 break;
9047 case CCValAssign::AExt:
9048 case CCValAssign::ZExt:
9049 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9050 break;
9052 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9053 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9054 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9055 DAG.getConstant(32, DL, VA.getLocVT()));
9056 break;
9057 }
9058
9059 if (RegsUsed.count(VA.getLocReg())) {
9060 SDValue &Bits =
9061 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
9062 return Elt.first == VA.getLocReg();
9063 })->second;
9064 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9065 } else {
9066 RetVals.emplace_back(VA.getLocReg(), Arg);
9067 RegsUsed.insert(VA.getLocReg());
9068 }
9069 }
9070
9071 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9072
9073 // Emit SMSTOP before returning from a locally streaming function
9074 SMEAttrs FuncAttrs(MF.getFunction());
9075 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
9076 if (FuncAttrs.hasStreamingCompatibleInterface()) {
9077 Register Reg = FuncInfo->getPStateSMReg();
9078 assert(Reg.isValid() && "PStateSM Register is invalid");
9079 SDValue PStateSM = DAG.getCopyFromReg(Chain, DL, Reg, MVT::i64);
9080 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9081 /*Glue*/ SDValue(),
9082 AArch64SME::IfCallerIsNonStreaming, PStateSM);
9083 } else
9084 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
9085 /*Glue*/ SDValue(), AArch64SME::Always);
9086 Glue = Chain.getValue(1);
9087 }
9088
9089 SmallVector<SDValue, 4> RetOps(1, Chain);
9090 for (auto &RetVal : RetVals) {
9091 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
9092 isPassedInFPR(RetVal.second.getValueType()))
9093 RetVal.second = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9094 RetVal.second.getValueType(), RetVal.second);
9095 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
9096 Glue = Chain.getValue(1);
9097 RetOps.push_back(
9098 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
9099 }
9100
9101 // Windows AArch64 ABIs require that for returning structs by value we copy
9102 // the sret argument into X0 for the return.
9103 // We saved the argument into a virtual register in the entry block,
9104 // so now we copy the value out and into X0.
9105 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
9106 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
9108
9109 unsigned RetValReg = AArch64::X0;
9110 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
9111 RetValReg = AArch64::X8;
9112 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
9113 Glue = Chain.getValue(1);
9114
9115 RetOps.push_back(
9116 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
9117 }
9118
9119 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
9120 if (I) {
9121 for (; *I; ++I) {
9122 if (AArch64::GPR64RegClass.contains(*I))
9123 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
9124 else if (AArch64::FPR64RegClass.contains(*I))
9125 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
9126 else
9127 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
9128 }
9129 }
9130
9131 RetOps[0] = Chain; // Update chain.
9132
9133 // Add the glue if we have it.
9134 if (Glue.getNode())
9135 RetOps.push_back(Glue);
9136
9137 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9138 // ARM64EC entry thunks use a special return sequence: instead of a regular
9139 // "ret" instruction, they need to explicitly call the emulator.
9140 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9141 SDValue Arm64ECRetDest =
9142 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
9143 Arm64ECRetDest =
9144 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
9145 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
9146 MachinePointerInfo());
9147 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
9148 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
9149 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
9150 }
9151
9152 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
9153}
9154
9155//===----------------------------------------------------------------------===//
9156// Other Lowering Code
9157//===----------------------------------------------------------------------===//
9158
9159SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
9160 SelectionDAG &DAG,
9161 unsigned Flag) const {
9162 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
9163 N->getOffset(), Flag);
9164}
9165
9166SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
9167 SelectionDAG &DAG,
9168 unsigned Flag) const {
9169 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
9170}
9171
9172SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
9173 SelectionDAG &DAG,
9174 unsigned Flag) const {
9175 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
9176 N->getOffset(), Flag);
9177}
9178
9179SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
9180 SelectionDAG &DAG,
9181 unsigned Flag) const {
9182 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
9183}
9184
9185SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
9186 SelectionDAG &DAG,
9187 unsigned Flag) const {
9188 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
9189}
9190
9191// (loadGOT sym)
9192template <class NodeTy>
9193SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
9194 unsigned Flags) const {
9195 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
9196 SDLoc DL(N);
9197 EVT Ty = getPointerTy(DAG.getDataLayout());
9198 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
9199 // FIXME: Once remat is capable of dealing with instructions with register
9200 // operands, expand this into two nodes instead of using a wrapper node.
9201 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
9202}
9203
9204// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
9205template <class NodeTy>
9206SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
9207 unsigned Flags) const {
9208 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
9209 SDLoc DL(N);
9210 EVT Ty = getPointerTy(DAG.getDataLayout());
9211 const unsigned char MO_NC = AArch64II::MO_NC;
9212 return DAG.getNode(
9213 AArch64ISD::WrapperLarge, DL, Ty,
9214 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
9215 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
9216 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
9217 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
9218}
9219
9220// (addlow (adrp %hi(sym)) %lo(sym))
9221template <class NodeTy>
9222SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
9223 unsigned Flags) const {
9224 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
9225 SDLoc DL(N);
9226 EVT Ty = getPointerTy(DAG.getDataLayout());
9227 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
9228 SDValue Lo = getTargetNode(N, Ty, DAG,
9229 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
9230 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
9231 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
9232}
9233
9234// (adr sym)
9235template <class NodeTy>
9236SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
9237 unsigned Flags) const {
9238 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
9239 SDLoc DL(N);
9240 EVT Ty = getPointerTy(DAG.getDataLayout());
9241 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
9242 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
9243}
9244
9245SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
9246 SelectionDAG &DAG) const {
9247 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
9248 const GlobalValue *GV = GN->getGlobal();
9249 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
9250
9251 if (OpFlags != AArch64II::MO_NO_FLAG)
9252 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
9253 "unexpected offset in global node");
9254
9255 // This also catches the large code model case for Darwin, and tiny code
9256 // model with got relocations.
9257 if ((OpFlags & AArch64II::MO_GOT) != 0) {
9258 return getGOT(GN, DAG, OpFlags);
9259 }
9260
9261 SDValue Result;
9262 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9263 !getTargetMachine().isPositionIndependent()) {
9264 Result = getAddrLarge(GN, DAG, OpFlags);
9265 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
9266 Result = getAddrTiny(GN, DAG, OpFlags);
9267 } else {
9268 Result = getAddr(GN, DAG, OpFlags);
9269 }
9270 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9271 SDLoc DL(GN);
9272 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
9273 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
9274 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
9275 return Result;
9276}
9277
9278/// Convert a TLS address reference into the correct sequence of loads
9279/// and calls to compute the variable's address (for Darwin, currently) and
9280/// return an SDValue containing the final node.
9281
9282/// Darwin only has one TLS scheme which must be capable of dealing with the
9283/// fully general situation, in the worst case. This means:
9284/// + "extern __thread" declaration.
9285/// + Defined in a possibly unknown dynamic library.
9286///
9287/// The general system is that each __thread variable has a [3 x i64] descriptor
9288/// which contains information used by the runtime to calculate the address. The
9289/// only part of this the compiler needs to know about is the first xword, which
9290/// contains a function pointer that must be called with the address of the
9291/// entire descriptor in "x0".
9292///
9293/// Since this descriptor may be in a different unit, in general even the
9294/// descriptor must be accessed via an indirect load. The "ideal" code sequence
9295/// is:
9296/// adrp x0, _var@TLVPPAGE
9297/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
9298/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
9299/// ; the function pointer
9300/// blr x1 ; Uses descriptor address in x0
9301/// ; Address of _var is now in x0.
9302///
9303/// If the address of _var's descriptor *is* known to the linker, then it can
9304/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
9305/// a slight efficiency gain.
9306SDValue
9307AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
9308 SelectionDAG &DAG) const {
9309 assert(Subtarget->isTargetDarwin() &&
9310 "This function expects a Darwin target");
9311
9312 SDLoc DL(Op);
9313 MVT PtrVT = getPointerTy(DAG.getDataLayout());
9314 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
9315 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
9316
9317 SDValue TLVPAddr =
9318 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9319 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
9320
9321 // The first entry in the descriptor is a function pointer that we must call
9322 // to obtain the address of the variable.
9323 SDValue Chain = DAG.getEntryNode();
9324 SDValue FuncTLVGet = DAG.getLoad(
9325 PtrMemVT, DL, Chain, DescAddr,
9326 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
9327 Align(PtrMemVT.getSizeInBits() / 8),
9328 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
9329 Chain = FuncTLVGet.getValue(1);
9330
9331 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
9332 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
9333
9334 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9335 MFI.setAdjustsStack(true);
9336
9337 // TLS calls preserve all registers except those that absolutely must be
9338 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
9339 // silly).
9340 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9341 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
9342 if (Subtarget->hasCustomCallingConv())
9343 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
9344
9345 // Finally, we can make the call. This is just a degenerate version of a
9346 // normal AArch64 call node: x0 takes the address of the descriptor, and
9347 // returns the address of the variable in this thread.
9348 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
9349
9350 unsigned Opcode = AArch64ISD::CALL;
9351 SmallVector<SDValue, 8> Ops;
9352 Ops.push_back(Chain);
9353 Ops.push_back(FuncTLVGet);
9354
9355 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
9356 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
9357 Opcode = AArch64ISD::AUTH_CALL;
9358 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
9359 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
9360 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
9361 }
9362
9363 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
9364 Ops.push_back(DAG.getRegisterMask(Mask));
9365 Ops.push_back(Chain.getValue(1));
9366 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9367 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
9368}
9369
9370/// Convert a thread-local variable reference into a sequence of instructions to
9371/// compute the variable's address for the local exec TLS model of ELF targets.
9372/// The sequence depends on the maximum TLS area size.
9373SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
9374 SDValue ThreadBase,
9375 const SDLoc &DL,
9376 SelectionDAG &DAG) const {
9377 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9378 SDValue TPOff, Addr;
9379
9380 switch (DAG.getTarget().Options.TLSSize) {
9381 default:
9382 llvm_unreachable("Unexpected TLS size");
9383
9384 case 12: {
9385 // mrs x0, TPIDR_EL0
9386 // add x0, x0, :tprel_lo12:a
9387 SDValue Var = DAG.getTargetGlobalAddress(
9388 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
9389 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9390 Var,
9391 DAG.getTargetConstant(0, DL, MVT::i32)),
9392 0);
9393 }
9394
9395 case 24: {
9396 // mrs x0, TPIDR_EL0
9397 // add x0, x0, :tprel_hi12:a
9398 // add x0, x0, :tprel_lo12_nc:a
9399 SDValue HiVar = DAG.getTargetGlobalAddress(
9400 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9401 SDValue LoVar = DAG.getTargetGlobalAddress(
9402 GV, DL, PtrVT, 0,
9403 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9404 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
9405 HiVar,
9406 DAG.getTargetConstant(0, DL, MVT::i32)),
9407 0);
9408 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
9409 LoVar,
9410 DAG.getTargetConstant(0, DL, MVT::i32)),
9411 0);
9412 }
9413
9414 case 32: {
9415 // mrs x1, TPIDR_EL0
9416 // movz x0, #:tprel_g1:a
9417 // movk x0, #:tprel_g0_nc:a
9418 // add x0, x1, x0
9419 SDValue HiVar = DAG.getTargetGlobalAddress(
9420 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
9421 SDValue LoVar = DAG.getTargetGlobalAddress(
9422 GV, DL, PtrVT, 0,
9423 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9424 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9425 DAG.getTargetConstant(16, DL, MVT::i32)),
9426 0);
9427 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9428 DAG.getTargetConstant(0, DL, MVT::i32)),
9429 0);
9430 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9431 }
9432
9433 case 48: {
9434 // mrs x1, TPIDR_EL0
9435 // movz x0, #:tprel_g2:a
9436 // movk x0, #:tprel_g1_nc:a
9437 // movk x0, #:tprel_g0_nc:a
9438 // add x0, x1, x0
9439 SDValue HiVar = DAG.getTargetGlobalAddress(
9440 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
9441 SDValue MiVar = DAG.getTargetGlobalAddress(
9442 GV, DL, PtrVT, 0,
9443 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
9444 SDValue LoVar = DAG.getTargetGlobalAddress(
9445 GV, DL, PtrVT, 0,
9446 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
9447 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
9448 DAG.getTargetConstant(32, DL, MVT::i32)),
9449 0);
9450 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
9451 DAG.getTargetConstant(16, DL, MVT::i32)),
9452 0);
9453 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
9454 DAG.getTargetConstant(0, DL, MVT::i32)),
9455 0);
9456 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9457 }
9458 }
9459}
9460
9461/// When accessing thread-local variables under either the general-dynamic or
9462/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
9463/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
9464/// is a function pointer to carry out the resolution.
9465///
9466/// The sequence is:
9467/// adrp x0, :tlsdesc:var
9468/// ldr x1, [x0, #:tlsdesc_lo12:var]
9469/// add x0, x0, #:tlsdesc_lo12:var
9470/// .tlsdesccall var
9471/// blr x1
9472/// (TPIDR_EL0 offset now in x0)
9473///
9474/// The above sequence must be produced unscheduled, to enable the linker to
9475/// optimize/relax this sequence.
9476/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
9477/// above sequence, and expanded really late in the compilation flow, to ensure
9478/// the sequence is produced as per above.
9479SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
9480 const SDLoc &DL,
9481 SelectionDAG &DAG) const {
9482 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9483
9484 SDValue Chain = DAG.getEntryNode();
9485 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
9486
9487 Chain =
9488 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
9489 SDValue Glue = Chain.getValue(1);
9490
9491 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
9492}
9493
9494SDValue
9495AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
9496 SelectionDAG &DAG) const {
9497 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
9498
9499 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9500
9501 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
9502
9503 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
9504 if (Model == TLSModel::LocalDynamic)
9505 Model = TLSModel::GeneralDynamic;
9506 }
9507
9508 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
9509 Model != TLSModel::LocalExec)
9510 report_fatal_error("ELF TLS only supported in small memory model or "
9511 "in local exec TLS model");
9512 // Different choices can be made for the maximum size of the TLS area for a
9513 // module. For the small address model, the default TLS size is 16MiB and the
9514 // maximum TLS size is 4GiB.
9515 // FIXME: add tiny and large code model support for TLS access models other
9516 // than local exec. We currently generate the same code as small for tiny,
9517 // which may be larger than needed.
9518
9519 SDValue TPOff;
9520 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9521 SDLoc DL(Op);
9522 const GlobalValue *GV = GA->getGlobal();
9523
9524 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
9525
9526 if (Model == TLSModel::LocalExec) {
9527 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
9528 } else if (Model == TLSModel::InitialExec) {
9529 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9530 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
9531 } else if (Model == TLSModel::LocalDynamic) {
9532 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
9533 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
9534 // the beginning of the module's TLS region, followed by a DTPREL offset
9535 // calculation.
9536
9537 // These accesses will need deduplicating if there's more than one.
9538 AArch64FunctionInfo *MFI =
9539 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
9540 MFI->incNumLocalDynamicTLSAccesses();
9541
9542 // The call needs a relocation too for linker relaxation. It doesn't make
9543 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9544 // the address.
9545 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
9546 AArch64II::MO_TLS);
9547
9548 // Now we can calculate the offset from TPIDR_EL0 to this module's
9549 // thread-local area.
9550 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9551
9552 // Now use :dtprel_whatever: operations to calculate this variable's offset
9553 // in its thread-storage area.
9554 SDValue HiVar = DAG.getTargetGlobalAddress(
9555 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9556 SDValue LoVar = DAG.getTargetGlobalAddress(
9557 GV, DL, MVT::i64, 0,
9558 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9559
9560 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
9561 DAG.getTargetConstant(0, DL, MVT::i32)),
9562 0);
9563 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
9564 DAG.getTargetConstant(0, DL, MVT::i32)),
9565 0);
9566 } else if (Model == TLSModel::GeneralDynamic) {
9567 // The call needs a relocation too for linker relaxation. It doesn't make
9568 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
9569 // the address.
9570 SDValue SymAddr =
9571 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
9572
9573 // Finally we can make a call to calculate the offset from tpidr_el0.
9574 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
9575 } else
9576 llvm_unreachable("Unsupported ELF TLS access model");
9577
9578 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
9579}
9580
9581SDValue
9582AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
9583 SelectionDAG &DAG) const {
9584 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
9585
9586 SDValue Chain = DAG.getEntryNode();
9587 EVT PtrVT = getPointerTy(DAG.getDataLayout());
9588 SDLoc DL(Op);
9589
9590 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
9591
9592 // Load the ThreadLocalStoragePointer from the TEB
9593 // A pointer to the TLS array is located at offset 0x58 from the TEB.
9594 SDValue TLSArray =
9595 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
9596 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
9597 Chain = TLSArray.getValue(1);
9598
9599 // Load the TLS index from the C runtime;
9600 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
9601 // This also does the same as LOADgot, but using a generic i32 load,
9602 // while LOADgot only loads i64.
9603 SDValue TLSIndexHi =
9604 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
9605 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
9606 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9607 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
9608 SDValue TLSIndex =
9609 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
9610 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
9611 Chain = TLSIndex.getValue(1);
9612
9613 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
9614 // offset into the TLSArray.
9615 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
9616 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
9617 DAG.getConstant(3, DL, PtrVT));
9618 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
9619 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
9620 MachinePointerInfo());
9621 Chain = TLS.getValue(1);
9622
9623 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9624 const GlobalValue *GV = GA->getGlobal();
9625 SDValue TGAHi = DAG.getTargetGlobalAddress(
9626 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
9627 SDValue TGALo = DAG.getTargetGlobalAddress(
9628 GV, DL, PtrVT, 0,
9629 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
9630
9631 // Add the offset from the start of the .tls section (section base).
9632 SDValue Addr =
9633 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
9634 DAG.getTargetConstant(0, DL, MVT::i32)),
9635 0);
9636 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
9637 return Addr;
9638}
9639
9640SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
9641 SelectionDAG &DAG) const {
9642 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
9643 if (DAG.getTarget().useEmulatedTLS())
9644 return LowerToTLSEmulatedModel(GA, DAG);
9645
9646 if (Subtarget->isTargetDarwin())
9647 return LowerDarwinGlobalTLSAddress(Op, DAG);
9648 if (Subtarget->isTargetELF())
9649 return LowerELFGlobalTLSAddress(Op, DAG);
9650 if (Subtarget->isTargetWindows())
9651 return LowerWindowsGlobalTLSAddress(Op, DAG);
9652
9653 llvm_unreachable("Unexpected platform trying to use TLS");
9654}
9655
9656//===----------------------------------------------------------------------===//
9657// PtrAuthGlobalAddress lowering
9658//
9659// We have 3 lowering alternatives to choose from:
9660// - MOVaddrPAC: similar to MOVaddr, with added PAC.
9661// If the GV doesn't need a GOT load (i.e., is locally defined)
9662// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
9663//
9664// - LOADgotPAC: similar to LOADgot, with added PAC.
9665// If the GV needs a GOT load, materialize the pointer using the usual
9666// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
9667// section is assumed to be read-only (for example, via relro mechanism). See
9668// LowerMOVaddrPAC.
9669//
9670// - LOADauthptrstatic: similar to LOADgot, but use a
9671// special stub slot instead of a GOT slot.
9672// Load a signed pointer for symbol 'sym' from a stub slot named
9673// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
9674// resolving. This usually lowers to adrp+ldr, but also emits an entry into
9675// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
9676//
9678 // All 3 are pseudos that are expanded late into longer sequences: this lets us
9678// provide integrity guarantees on the to-be-signed intermediate values.
9679//
9680// LOADauthptrstatic is undesirable because it requires a large section filled
9681// with often similarly-signed pointers, making it a good harvesting target.
9682// Thus, it's only used for ptrauth references to extern_weak to avoid null
9683// checks.
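// In short: locally-defined globals take the MOVaddrPAC path, GOT-loaded
// globals take LOADgotPAC, and only extern_weak references fall back to
// LOADauthptrstatic, matching the checks in LowerPtrAuthGlobalAddress below.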
9684
9685SDValue AArch64TargetLowering::LowerPtrAuthGlobalAddressStatically(
9686 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
9687 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) const {
9688 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
9689 assert(TGN->getGlobal()->hasExternalWeakLinkage());
9690
9691 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
9692 // offset alone as a pointer if the symbol wasn't available, which would
9693 // probably break null checks in users. Ptrauth complicates things further:
9694 // error out.
9695 if (TGN->getOffset() != 0)
9696 report_fatal_error(
9697 "unsupported non-zero offset in weak ptrauth global reference");
9698
9699 if (!isNullConstant(AddrDiscriminator))
9700 report_fatal_error("unsupported weak addr-div ptrauth global");
9701
9702 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9703 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
9704 {TGA, Key, Discriminator}),
9705 0);
9706}
9707
9708SDValue
9709AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
9710 SelectionDAG &DAG) const {
9711 SDValue Ptr = Op.getOperand(0);
9712 uint64_t KeyC = Op.getConstantOperandVal(1);
9713 SDValue AddrDiscriminator = Op.getOperand(2);
9714 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
9715 EVT VT = Op.getValueType();
9716 SDLoc DL(Op);
9717
9718 if (KeyC > AArch64PACKey::LAST)
9719 report_fatal_error("key in ptrauth global out of range [0, " +
9720 Twine((int)AArch64PACKey::LAST) + "]");
9721
9722 // Blend only works if the integer discriminator is 16-bit wide.
9723 if (!isUInt<16>(DiscriminatorC))
9724 report_fatal_error(
9725 "constant discriminator in ptrauth global out of range [0, 0xffff]");
9726
9727 // Choosing between 3 lowering alternatives is target-specific.
9728 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
9729 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
9730
9731 int64_t PtrOffsetC = 0;
9732 if (Ptr.getOpcode() == ISD::ADD) {
9733 PtrOffsetC = Ptr.getConstantOperandVal(1);
9734 Ptr = Ptr.getOperand(0);
9735 }
9736 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
9737 const GlobalValue *PtrGV = PtrN->getGlobal();
9738
9739 // Classify the reference to determine whether it needs a GOT load.
9740 const unsigned OpFlags =
9741 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
9742 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
9743 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
9744 "unsupported non-GOT op flags on ptrauth global reference");
9745
9746 // Fold any offset into the GV; our pseudos expect it there.
9747 PtrOffsetC += PtrN->getOffset();
9748 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
9749 /*TargetFlags=*/0);
9750 assert(PtrN->getTargetFlags() == 0 &&
9751 "unsupported target flags on ptrauth global");
9752
9753 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
9754 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
9755 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
9756 ? AddrDiscriminator
9757 : DAG.getRegister(AArch64::XZR, MVT::i64);
9758
9759 // No GOT load needed -> MOVaddrPAC
9760 if (!NeedsGOTLoad) {
9761 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
9762 return SDValue(
9763 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
9764 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9765 0);
9766 }
9767
9768 // GOT load -> LOADgotPAC
9769 // Note that we disallow extern_weak refs to avoid null checks later.
9770 if (!PtrGV->hasExternalWeakLinkage())
9771 return SDValue(
9772 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
9773 {TPtr, Key, TAddrDiscriminator, Discriminator}),
9774 0);
9775
9776 // extern_weak ref -> LOADauthptrstatic
9777 return LowerPtrAuthGlobalAddressStatically(
9778 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
9779 DAG);
9780}
9781
9782// Looks through \param Val to determine the bit that can be used to
9783// check the sign of the value. It returns the unextended value and
9784// the sign bit position.
9785std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
9786 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
9787 return {Val.getOperand(0),
9788 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
9789 1};
9790
9791 if (Val.getOpcode() == ISD::SIGN_EXTEND)
9792 return {Val.getOperand(0),
9793 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
9794
9795 return {Val, Val.getValueSizeInBits() - 1};
9796}
9797
9798SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
9799 SDValue Chain = Op.getOperand(0);
9800 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
9801 SDValue LHS = Op.getOperand(2);
9802 SDValue RHS = Op.getOperand(3);
9803 SDValue Dest = Op.getOperand(4);
9804 SDLoc dl(Op);
9805
9806 MachineFunction &MF = DAG.getMachineFunction();
9807 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
9808 // will not be produced, as they are conditional branch instructions that do
9809 // not set flags.
9810 bool ProduceNonFlagSettingCondBr =
9811 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
9812
9813 // Handle f128 first, since lowering it will result in comparing the return
9814 // value of a libcall against zero, which is just what the rest of LowerBR_CC
9815 // is expecting to deal with.
9816 if (LHS.getValueType() == MVT::f128) {
9817 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
9818
9819 // If softenSetCCOperands returned a scalar, we need to compare the result
9820 // against zero to select between true and false values.
9821 if (!RHS.getNode()) {
9822 RHS = DAG.getConstant(0, dl, LHS.getValueType());
9823 CC = ISD::SETNE;
9824 }
9825 }
9826
9827 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
9828 // instruction.
9829 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
9830 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
9831 // Only lower legal XALUO ops.
9832 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
9833 return SDValue();
9834
9835 // The actual operation with overflow check.
9836 AArch64CC::CondCode OFCC;
9837 SDValue Value, Overflow;
9838 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
9839
9840 if (CC == ISD::SETNE)
9841 OFCC = getInvertedCondCode(OFCC);
9842 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
9843
9844 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9845 Overflow);
9846 }
9847
9848 if (LHS.getValueType().isInteger()) {
9849 assert((LHS.getValueType() == RHS.getValueType()) &&
9850 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
9851
9852 // If the RHS of the comparison is zero, we can potentially fold this
9853 // to a specialized branch.
9854 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
9855 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
9856 if (CC == ISD::SETEQ) {
9857 // See if we can use a TBZ to fold in an AND as well.
9858 // TBZ has a smaller branch displacement than CBZ. If the offset is
9859 // out of bounds, a late MI-layer pass rewrites branches.
9860 // 403.gcc is an example that hits this case.
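// For example, (brcond (seteq (and x, 8), 0), dest) becomes (TBZ x, #3, dest).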
9861 if (LHS.getOpcode() == ISD::AND &&
9862 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9863 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9864 SDValue Test = LHS.getOperand(0);
9865 uint64_t Mask = LHS.getConstantOperandVal(1);
9866 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
9867 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9868 Dest);
9869 }
9870
9871 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
9872 } else if (CC == ISD::SETNE) {
9873 // See if we can use a TBZ to fold in an AND as well.
9874 // TBZ has a smaller branch displacement than CBZ. If the offset is
9875 // out of bounds, a late MI-layer pass rewrites branches.
9876 // 403.gcc is an example that hits this case.
9877 if (LHS.getOpcode() == ISD::AND &&
9878 isa<ConstantSDNode>(LHS.getOperand(1)) &&
9879 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
9880 SDValue Test = LHS.getOperand(0);
9881 uint64_t Mask = LHS.getConstantOperandVal(1);
9882 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
9883 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
9884 Dest);
9885 }
9886
9887 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
9888 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
9889 // Don't combine AND since emitComparison converts the AND to an ANDS
9890 // (a.k.a. TST) and the test in the test bit and branch instruction
9891 // becomes redundant. This would also increase register pressure.
9892 uint64_t SignBitPos;
9893 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9894 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
9895 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9896 }
9897 }
9898 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
9899 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
9900 // Don't combine AND since emitComparison converts the AND to an ANDS
9901 // (a.k.a. TST) and the test in the test bit and branch instruction
9902 // becomes redundant. This would also increase register pressure.
9903 uint64_t SignBitPos;
9904 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
9905 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
9906 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
9907 }
9908
9909 SDValue CCVal;
9910 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
9911 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
9912 Cmp);
9913 }
9914
9915 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
9916 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
9917
9918 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
9919 // clean. Some of them require two branches to implement.
9920 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
9921 AArch64CC::CondCode CC1, CC2;
9922 changeFPCCToAArch64CC(CC, CC1, CC2);
9923 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
9924 SDValue BR1 =
9925 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
9926 if (CC2 != AArch64CC::AL) {
9927 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
9928 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
9929 Cmp);
9930 }
9931
9932 return BR1;
9933}
9934
9935SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
9936 SelectionDAG &DAG) const {
9937 if (!Subtarget->isNeonAvailable() &&
9938 !Subtarget->useSVEForFixedLengthVectors())
9939 return SDValue();
9940
9941 EVT VT = Op.getValueType();
9942 EVT IntVT = VT.changeTypeToInteger();
9943 SDLoc DL(Op);
9944
9945 SDValue In1 = Op.getOperand(0);
9946 SDValue In2 = Op.getOperand(1);
9947 EVT SrcVT = In2.getValueType();
9948
9949 if (!SrcVT.bitsEq(VT))
9950 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
9951
9952 if (VT.isScalableVector())
9953 IntVT =
9954 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
9955
9956 if (VT.isFixedLengthVector() &&
9957 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
9958 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
9959
9960 In1 = convertToScalableVector(DAG, ContainerVT, In1);
9961 In2 = convertToScalableVector(DAG, ContainerVT, In2);
9962
9963 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
9964 return convertFromScalableVector(DAG, VT, Res);
9965 }
9966
9967 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
9968 if (VT.isScalableVector())
9969 return getSVESafeBitCast(VT, Op, DAG);
9970
9971 return DAG.getBitcast(VT, Op);
9972 };
9973
9974 SDValue VecVal1, VecVal2;
9975 EVT VecVT;
9976 auto SetVecVal = [&](int Idx = -1) {
9977 if (!VT.isVector()) {
9978 VecVal1 =
9979 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
9980 VecVal2 =
9981 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
9982 } else {
9983 VecVal1 = BitCast(VecVT, In1, DAG);
9984 VecVal2 = BitCast(VecVT, In2, DAG);
9985 }
9986 };
9987 if (VT.isVector()) {
9988 VecVT = IntVT;
9989 SetVecVal();
9990 } else if (VT == MVT::f64) {
9991 VecVT = MVT::v2i64;
9992 SetVecVal(AArch64::dsub);
9993 } else if (VT == MVT::f32) {
9994 VecVT = MVT::v4i32;
9995 SetVecVal(AArch64::ssub);
9996 } else if (VT == MVT::f16 || VT == MVT::bf16) {
9997 VecVT = MVT::v8i16;
9998 SetVecVal(AArch64::hsub);
9999 } else {
10000 llvm_unreachable("Invalid type for copysign!");
10001 }
10002
10003 unsigned BitWidth = In1.getScalarValueSizeInBits();
10004 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
10005
10006 // We want to materialize a mask with every bit but the high bit set, but the
10007 // AdvSIMD immediate moves cannot materialize that in a single instruction for
10008 // 64-bit elements. Instead, materialize all bits set and then negate that.
10009 if (VT == MVT::f64 || VT == MVT::v2f64) {
10010 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
10011 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
10012 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
10013 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
10014 }
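// BSP(mask, a, b) computes roughly (a & mask) | (b & ~mask), so with the
// all-but-sign-bit mask above the result keeps the magnitude of In1 and takes
// the sign bit from In2, which is exactly FCOPYSIGN.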
10015
10016 SDValue BSP =
10017 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
10018 if (VT == MVT::f16 || VT == MVT::bf16)
10019 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
10020 if (VT == MVT::f32)
10021 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
10022 if (VT == MVT::f64)
10023 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
10024
10025 return BitCast(VT, BSP, DAG);
10026}
10027
10028SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
10029 SelectionDAG &DAG) const {
10030 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
10031 Attribute::NoImplicitFloat))
10032 return SDValue();
10033
10034 EVT VT = Op.getValueType();
10035 if (VT.isScalableVector() ||
10036 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
10037 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
10038
10039 if (!Subtarget->isNeonAvailable())
10040 return SDValue();
10041
10042 bool IsParity = Op.getOpcode() == ISD::PARITY;
10043 SDValue Val = Op.getOperand(0);
10044 SDLoc DL(Op);
10045
10046 // For i32, a general parity computation using EORs is more efficient than
10047 // going through the floating-point/AdvSIMD sequence below.
10048 if (VT == MVT::i32 && IsParity)
10049 return SDValue();
10050
10051 // If there is no CNT instruction available, GPR popcount can
10052 // be more efficiently lowered to the following sequence that uses
10053 // AdvSIMD registers/instructions as long as the copies to/from
10054 // the AdvSIMD registers are cheap.
10055 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
10056 // CNT V0.8B, V0.8B // 8xbyte pop-counts
10057 // ADDV B0, V0.8B // sum 8xbyte pop-counts
10058 // UMOV X0, V0.B[0] // copy byte result back to integer reg
10059 if (VT == MVT::i32 || VT == MVT::i64) {
10060 if (VT == MVT::i32)
10061 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
10062 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
10063
10064 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
10065 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10066 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10067 DAG.getConstant(0, DL, MVT::i64));
10068
10069 if (IsParity)
10070 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10071 DAG.getConstant(1, DL, MVT::i32));
10072
10073 if (VT == MVT::i64)
10074 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
10075 return UaddLV;
10076 } else if (VT == MVT::i128) {
10077 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
10078
10079 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
10080 SDValue UaddLV = DAG.getNode(AArch64ISD::UADDLV, DL, MVT::v4i32, CtPop);
10081 UaddLV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, UaddLV,
10082 DAG.getConstant(0, DL, MVT::i64));
10083
10084 if (IsParity)
10085 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
10086 DAG.getConstant(1, DL, MVT::i32));
10087
10088 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
10089 }
10090
10091 assert(!IsParity && "ISD::PARITY of vector types not supported");
10092
10093 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
10094 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
10095 "Unexpected type for custom ctpop lowering");
10096
10097 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
10098 Val = DAG.getBitcast(VT8Bit, Val);
10099 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
10100
10101 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
10102 VT.getVectorNumElements() >= 2) {
10103 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
10104 SDValue Zeros = DAG.getConstant(0, DL, DT);
10105 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
10106
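// Dotting the per-byte pop-counts against an all-ones vector sums each group
// of four byte counts into an i32 lane, producing the widened result directly
// (with an extra UADDLP pairwise add for the v2i64 case below).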
10107 if (VT == MVT::v2i64) {
10108 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10109 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
10110 } else if (VT == MVT::v2i32) {
10111 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10112 } else if (VT == MVT::v4i32) {
10113 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
10114 } else {
10115 llvm_unreachable("Unexpected type for custom ctpop lowering");
10116 }
10117
10118 return Val;
10119 }
10120
10121 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
10122 unsigned EltSize = 8;
10123 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
10124 while (EltSize != VT.getScalarSizeInBits()) {
10125 EltSize *= 2;
10126 NumElts /= 2;
10127 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
10128 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
10129 }
10130
10131 return Val;
10132}
10133
10134SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
10135 EVT VT = Op.getValueType();
10136 assert(VT.isScalableVector() ||
10137 useSVEForFixedLengthVectorVT(
10138 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
10139
10140 SDLoc DL(Op);
10141 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
10142 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
10143}
10144
10145SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
10146 SelectionDAG &DAG) const {
10147
10148 EVT VT = Op.getValueType();
10149 SDLoc DL(Op);
10150 unsigned Opcode = Op.getOpcode();
10151 ISD::CondCode CC;
10152 switch (Opcode) {
10153 default:
10154 llvm_unreachable("Wrong instruction");
10155 case ISD::SMAX:
10156 CC = ISD::SETGT;
10157 break;
10158 case ISD::SMIN:
10159 CC = ISD::SETLT;
10160 break;
10161 case ISD::UMAX:
10162 CC = ISD::SETUGT;
10163 break;
10164 case ISD::UMIN:
10165 CC = ISD::SETULT;
10166 break;
10167 }
10168
10169 if (VT.isScalableVector() ||
10170 useSVEForFixedLengthVectorVT(
10171 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
10172 switch (Opcode) {
10173 default:
10174 llvm_unreachable("Wrong instruction");
10175 case ISD::SMAX:
10176 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
10177 case ISD::SMIN:
10178 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
10179 case ISD::UMAX:
10180 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
10181 case ISD::UMIN:
10182 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
10183 }
10184 }
10185
10186 SDValue Op0 = Op.getOperand(0);
10187 SDValue Op1 = Op.getOperand(1);
10188 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
10189 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
10190}
10191
10192SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
10193 SelectionDAG &DAG) const {
10194 EVT VT = Op.getValueType();
10195
10196 if (VT.isScalableVector() ||
10197 useSVEForFixedLengthVectorVT(
10198 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
10199 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
10200
10201 SDLoc DL(Op);
10202 SDValue REVB;
10203 MVT VST;
10204
10205 switch (VT.getSimpleVT().SimpleTy) {
10206 default:
10207 llvm_unreachable("Invalid type for bitreverse!");
10208
10209 case MVT::v2i32: {
10210 VST = MVT::v8i8;
10211 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10212
10213 break;
10214 }
10215
10216 case MVT::v4i32: {
10217 VST = MVT::v16i8;
10218 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
10219
10220 break;
10221 }
10222
10223 case MVT::v1i64: {
10224 VST = MVT::v8i8;
10225 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10226
10227 break;
10228 }
10229
10230 case MVT::v2i64: {
10231 VST = MVT::v16i8;
10232 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
10233
10234 break;
10235 }
10236 }
10237
10238 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
10239 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
10240}
10241
10242 // Check whether N forms a contiguous comparison chain of ORs over XOR leaves.
10243static bool
10244isOrXorChain(SDValue N, unsigned &Num,
10245 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
10246 if (Num == MaxXors)
10247 return false;
10248
10249 // Skip the one-use zext
10250 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
10251 N = N->getOperand(0);
10252
10253 // The leaf node must be XOR
10254 if (N->getOpcode() == ISD::XOR) {
10255 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
10256 Num++;
10257 return true;
10258 }
10259
10260 // All the non-leaf nodes must be OR.
10261 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
10262 return false;
10263
10264 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
10265 isOrXorChain(N->getOperand(1), Num, WorkList))
10266 return true;
10267 return false;
10268}
10269
10270 // Transform chains of ORs and XORs, which usually come from expanded memcmp/bcmp.
10271 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
10272 SDValue LHS = N->getOperand(0);
10273 SDValue RHS = N->getOperand(1);
10274 SDLoc DL(N);
10275 EVT VT = N->getValueType(0);
10276 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
10277
10278 // Only handle integer compares.
10279 if (N->getOpcode() != ISD::SETCC)
10280 return SDValue();
10281
10282 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
10283 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
10284 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
10285 unsigned NumXors = 0;
10286 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
10287 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
10288 isOrXorChain(LHS, NumXors, WorkList)) {
10289 SDValue XOR0, XOR1;
10290 std::tie(XOR0, XOR1) = WorkList[0];
10291 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
10292 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10293 for (unsigned I = 1; I < WorkList.size(); I++) {
10294 std::tie(XOR0, XOR1) = WorkList[I];
10295 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
10296 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
10297 }
10298
10299 // Exit early by inverting the condition, which helps reduce indentation.
10300 return Cmp;
10301 }
10302
10303 return SDValue();
10304}
10305
10306SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
10307
10308 if (Op.getValueType().isVector())
10309 return LowerVSETCC(Op, DAG);
10310
10311 bool IsStrict = Op->isStrictFPOpcode();
10312 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10313 unsigned OpNo = IsStrict ? 1 : 0;
10314 SDValue Chain;
10315 if (IsStrict)
10316 Chain = Op.getOperand(0);
10317 SDValue LHS = Op.getOperand(OpNo + 0);
10318 SDValue RHS = Op.getOperand(OpNo + 1);
10319 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
10320 SDLoc dl(Op);
10321
10322 // We chose ZeroOrOneBooleanContents, so use zero and one.
10323 EVT VT = Op.getValueType();
10324 SDValue TVal = DAG.getConstant(1, dl, VT);
10325 SDValue FVal = DAG.getConstant(0, dl, VT);
10326
10327 // Handle f128 first, since one possible outcome is a normal integer
10328 // comparison which gets picked up by the next if statement.
10329 if (LHS.getValueType() == MVT::f128) {
10330 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
10331 IsSignaling);
10332
10333 // If softenSetCCOperands returned a scalar, use it.
10334 if (!RHS.getNode()) {
10335 assert(LHS.getValueType() == Op.getValueType() &&
10336 "Unexpected setcc expansion!");
10337 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
10338 }
10339 }
10340
10341 if (LHS.getValueType().isInteger()) {
10342 SDValue CCVal;
10343 SDValue Cmp = getAArch64Cmp(
10344 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
10345
10346 // Note that we inverted the condition above, so we reverse the order of
10347 // the true and false operands here. This will allow the setcc to be
10348 // matched to a single CSINC instruction.
10349 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
10350 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
10351 }
10352
10353 // Now we know we're dealing with FP values.
10354 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
10355 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
10356
10357 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
10358 // and do the comparison.
10359 SDValue Cmp;
10360 if (IsStrict)
10361 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
10362 else
10363 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10364
10365 AArch64CC::CondCode CC1, CC2;
10366 changeFPCCToAArch64CC(CC, CC1, CC2);
10367 SDValue Res;
10368 if (CC2 == AArch64CC::AL) {
10369 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
10370 CC2);
10371 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10372
10373 // Note that we inverted the condition above, so we reverse the order of
10374 // the true and false operands here. This will allow the setcc to be
10375 // matched to a single CSINC instruction.
10376 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
10377 } else {
10378 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
10379 // totally clean. Some of them require two CSELs to implement. As is in
10380 // this case, we emit the first CSEL and then emit a second using the output
10381 // of the first as the RHS. We're effectively OR'ing the two CC's together.
10382
10383 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
10384 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10385 SDValue CS1 =
10386 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10387
10388 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10389 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10390 }
10391 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
10392}
10393
10394SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
10395 SelectionDAG &DAG) const {
10396
10397 SDValue LHS = Op.getOperand(0);
10398 SDValue RHS = Op.getOperand(1);
10399 EVT VT = LHS.getValueType();
10400 if (VT != MVT::i32 && VT != MVT::i64)
10401 return SDValue();
10402
10403 SDLoc DL(Op);
10404 SDValue Carry = Op.getOperand(2);
10405 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
10406 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
10407 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, MVT::Glue),
10408 LHS, RHS, InvCarry);
10409
10410 EVT OpVT = Op.getValueType();
10411 SDValue TVal = DAG.getConstant(1, DL, OpVT);
10412 SDValue FVal = DAG.getConstant(0, DL, OpVT);
10413
10414 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10415 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
10416 SDValue CCVal =
10417 DAG.getConstant(changeIntCCToAArch64CC(CondInv), DL, MVT::i32);
10418 // Inputs are swapped because the condition is inverted. This will allow
10419 // matching with a single CSINC instruction.
10420 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
10421 Cmp.getValue(1));
10422}
10423
10424SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
10425 SDValue RHS, SDValue TVal,
10426 SDValue FVal, const SDLoc &dl,
10427 SelectionDAG &DAG) const {
10428 // Handle f128 first, because it will result in a comparison of some RTLIB
10429 // call result against zero.
10430 if (LHS.getValueType() == MVT::f128) {
10431 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
10432
10433 // If softenSetCCOperands returned a scalar, we need to compare the result
10434 // against zero to select between true and false values.
10435 if (!RHS.getNode()) {
10436 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10437 CC = ISD::SETNE;
10438 }
10439 }
10440
10441 // Also handle f16, for which we need to do a f32 comparison.
10442 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
10443 LHS.getValueType() == MVT::bf16) {
10444 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
10445 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
10446 }
10447
10448 // Next, handle integers.
10449 if (LHS.getValueType().isInteger()) {
10450 assert((LHS.getValueType() == RHS.getValueType()) &&
10451 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
10452
10453 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
10454 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
10455 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
10456 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
10457 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
10458 // supported types.
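// For example, with i32 operands: lhs >= 0 gives an ASR of 0 and the OR
// yields 1, while lhs < 0 gives an ASR of -1 and the OR yields -1.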
10459 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
10460 CTVal->isOne() && CFVal->isAllOnes() &&
10461 LHS.getValueType() == TVal.getValueType()) {
10462 EVT VT = LHS.getValueType();
10463 SDValue Shift =
10464 DAG.getNode(ISD::SRA, dl, VT, LHS,
10465 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10466 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
10467 }
10468
10469 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
10470 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
10471 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
10472 // Both require fewer instructions than a compare and conditional select.
10473 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
10474 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
10475 LHS.getValueType() == RHS.getValueType()) {
10476 EVT VT = LHS.getValueType();
10477 SDValue Shift =
10478 DAG.getNode(ISD::SRA, dl, VT, LHS,
10479 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
10480
10481 if (CC == ISD::SETGT)
10482 Shift = DAG.getNOT(dl, Shift, VT);
10483
10484 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
10485 }
10486
10487 unsigned Opcode = AArch64ISD::CSEL;
10488
10489 // If both the TVal and the FVal are constants, see if we can swap them in
10490 // order to form a CSINV or CSINC out of them.
10491 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
10492 std::swap(TVal, FVal);
10493 std::swap(CTVal, CFVal);
10494 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10495 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
10496 std::swap(TVal, FVal);
10497 std::swap(CTVal, CFVal);
10498 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10499 } else if (TVal.getOpcode() == ISD::XOR) {
10500 // If TVal is a NOT we want to swap TVal and FVal so that we can match
10501 // with a CSINV rather than a CSEL.
10502 if (isAllOnesConstant(TVal.getOperand(1))) {
10503 std::swap(TVal, FVal);
10504 std::swap(CTVal, CFVal);
10505 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10506 }
10507 } else if (TVal.getOpcode() == ISD::SUB) {
10508 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
10509 // that we can match with a CSNEG rather than a CSEL.
10510 if (isNullConstant(TVal.getOperand(0))) {
10511 std::swap(TVal, FVal);
10512 std::swap(CTVal, CFVal);
10513 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10514 }
10515 } else if (CTVal && CFVal) {
10516 const int64_t TrueVal = CTVal->getSExtValue();
10517 const int64_t FalseVal = CFVal->getSExtValue();
10518 bool Swap = false;
10519
10520 // If both TVal and FVal are constants, see if FVal is the
10521 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
10522 // instead of a CSEL in that case.
10523 if (TrueVal == ~FalseVal) {
10524 Opcode = AArch64ISD::CSINV;
10525 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
10526 TrueVal == -FalseVal) {
10527 Opcode = AArch64ISD::CSNEG;
10528 } else if (TVal.getValueType() == MVT::i32) {
10529 // If our operands are only 32-bit wide, make sure we use 32-bit
10530 // arithmetic for the check whether we can use CSINC. This ensures that
10531 // the addition in the check will wrap around properly in case there is
10532 // an overflow (which would not be the case if we do the check with
10533 // 64-bit arithmetic).
10534 const uint32_t TrueVal32 = CTVal->getZExtValue();
10535 const uint32_t FalseVal32 = CFVal->getZExtValue();
10536
10537 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
10538 Opcode = AArch64ISD::CSINC;
10539
10540 if (TrueVal32 > FalseVal32) {
10541 Swap = true;
10542 }
10543 }
10544 } else {
10545 // 64-bit check whether we can use CSINC.
10546 const uint64_t TrueVal64 = TrueVal;
10547 const uint64_t FalseVal64 = FalseVal;
10548
10549 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
10550 Opcode = AArch64ISD::CSINC;
10551
10552 if (TrueVal > FalseVal) {
10553 Swap = true;
10554 }
10555 }
10556 }
10557
10558 // Swap TVal and FVal if necessary.
10559 if (Swap) {
10560 std::swap(TVal, FVal);
10561 std::swap(CTVal, CFVal);
10562 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
10563 }
10564
10565 if (Opcode != AArch64ISD::CSEL) {
10566 // Drop FVal since we can get its value by simply inverting/negating
10567 // TVal.
10568 FVal = TVal;
10569 }
10570 }
10571
10572 // Avoid materializing a constant when possible by reusing a known value in
10573 // a register. However, don't perform this optimization if the known value
10574 // is one, zero or negative one in the case of a CSEL. We can always
10575 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
10576 // FVal, respectively.
10577 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
10578 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
10579 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
10580 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10581 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
10582 // "a != C ? x : a" to avoid materializing C.
10583 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
10584 TVal = LHS;
10585 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
10586 FVal = LHS;
10587 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
10588 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
10589 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
10590 // avoid materializing C.
10591 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
10592 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
10593 Opcode = AArch64ISD::CSINV;
10594 TVal = LHS;
10595 FVal = DAG.getConstant(0, dl, FVal.getValueType());
10596 }
10597 }
10598
10599 SDValue CCVal;
10600 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
10601 EVT VT = TVal.getValueType();
10602 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
10603 }
10604
10605 // Now we know we're dealing with FP values.
10606 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
10607 LHS.getValueType() == MVT::f64);
10608 assert(LHS.getValueType() == RHS.getValueType());
10609 EVT VT = TVal.getValueType();
10610 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
10611
10612 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
10613 // clean. Some of them require two CSELs to implement.
10614 AArch64CC::CondCode CC1, CC2;
10615 changeFPCCToAArch64CC(CC, CC1, CC2);
10616
10617 if (DAG.getTarget().Options.UnsafeFPMath) {
10618 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
10619 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
10620 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
10621 if (RHSVal && RHSVal->isZero()) {
10622 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
10623 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
10624
10625 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
10626 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
10627 TVal = LHS;
10628 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
10629 CFVal && CFVal->isZero() &&
10630 FVal.getValueType() == LHS.getValueType())
10631 FVal = LHS;
10632 }
10633 }
10634
10635 // Emit first, and possibly only, CSEL.
10636 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
10637 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
10638
10639 // If we need a second CSEL, emit it, using the output of the first as the
10640 // RHS. We're effectively OR'ing the two CC's together.
10641 if (CC2 != AArch64CC::AL) {
10642 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
10643 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
10644 }
10645
10646 // Otherwise, return the output of the first CSEL.
10647 return CS1;
10648}
10649
10650SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
10651 SelectionDAG &DAG) const {
10652 EVT Ty = Op.getValueType();
10653 auto Idx = Op.getConstantOperandAPInt(2);
10654 int64_t IdxVal = Idx.getSExtValue();
10655 assert(Ty.isScalableVector() &&
10656 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
10657
10658 // We can use the splice instruction for certain index values where we are
10659 // able to efficiently generate the correct predicate. The index will be
10660 // inverted and used directly as the input to the ptrue instruction, i.e.
10661 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
10662 // splice predicate. However, we can only do this if we can guarantee that
10663 // there are enough elements in the vector, hence we check the index <= min
10664 // number of elements.
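// For example, with IdxVal == -2 the predicate comes from a "ptrue ..., vl2"
// that is then reversed, so only the last two lanes are active and the
// SPLICE yields the last two elements of the first operand followed by
// elements of the second operand.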
10665 std::optional<unsigned> PredPattern;
10666 if (Ty.isScalableVector() && IdxVal < 0 &&
10667 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
10668 std::nullopt) {
10669 SDLoc DL(Op);
10670
10671 // Create a predicate where all but the last -IdxVal elements are false.
10672 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
10673 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
10674 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
10675
10676 // Now splice the two inputs together using the predicate.
10677 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
10678 Op.getOperand(1));
10679 }
10680
10681 // We can select to an EXT instruction when indexing the first 256 bytes.
10683 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
10684 return Op;
10685
10686 return SDValue();
10687}
10688
10689SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
10690 SelectionDAG &DAG) const {
10691 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
10692 SDValue LHS = Op.getOperand(0);
10693 SDValue RHS = Op.getOperand(1);
10694 SDValue TVal = Op.getOperand(2);
10695 SDValue FVal = Op.getOperand(3);
10696 SDLoc DL(Op);
10697 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10698}
10699
10700SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
10701 SelectionDAG &DAG) const {
10702 SDValue CCVal = Op->getOperand(0);
10703 SDValue TVal = Op->getOperand(1);
10704 SDValue FVal = Op->getOperand(2);
10705 SDLoc DL(Op);
10706
10707 EVT Ty = Op.getValueType();
10708 if (Ty == MVT::aarch64svcount) {
10709 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
10710 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
10711 SDValue Sel =
10712 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
10713 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
10714 }
10715
10716 if (Ty.isScalableVector()) {
10717 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
10718 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
10719 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10720 }
10721
10722 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
10723 // FIXME: Ideally this would be the same as above using i1 types, however
10724 // for the moment we can't deal with fixed i1 vector types properly, so
10725 // instead extend the predicate to a result type sized integer vector.
10726 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
10727 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
10728 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
10729 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
10730 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
10731 }
10732
10733 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
10734 // instruction.
10735 if (ISD::isOverflowIntrOpRes(CCVal)) {
10736 // Only lower legal XALUO ops.
10737 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
10738 return SDValue();
10739
10739
10740 AArch64CC::CondCode OFCC;
10741 SDValue Value, Overflow;
10742 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
10743 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
10744
10745 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
10746 CCVal, Overflow);
10747 }
10748
10749 // Lower it the same way as we would lower a SELECT_CC node.
10750 ISD::CondCode CC;
10751 SDValue LHS, RHS;
10752 if (CCVal.getOpcode() == ISD::SETCC) {
10753 LHS = CCVal.getOperand(0);
10754 RHS = CCVal.getOperand(1);
10755 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
10756 } else {
10757 LHS = CCVal;
10758 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
10759 CC = ISD::SETNE;
10760 }
10761
10762 // If we are lowering an f16/bf16 and we do not have FullFP16, convert to an
10763 // f32 in order to use FCSELSrrr.
10764 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10765 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10766 DAG.getUNDEF(MVT::f32), TVal);
10767 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
10768 DAG.getUNDEF(MVT::f32), FVal);
10769 }
10770
10771 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
10772
10773 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
10774 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
10775 }
10776
10777 return Res;
10778}
10779
10780SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
10781 SelectionDAG &DAG) const {
10782 // Jump table entries are emitted as PC-relative offsets. No additional
10783 // tweaking is necessary here; just get the address of the jump table.
10784 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
10785
10788 !Subtarget->isTargetMachO())
10789 return getAddrLarge(JT, DAG);
10790 if (CM == CodeModel::Tiny)
10791 return getAddrTiny(JT, DAG);
10792 return getAddr(JT, DAG);
10793}
10794
10795SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
10796 SelectionDAG &DAG) const {
10797 // Jump table entries are emitted as PC-relative offsets. No additional
10798 // tweaking is necessary here; just get the address of the jump table.
10799 SDLoc DL(Op);
10800 SDValue JT = Op.getOperand(1);
10801 SDValue Entry = Op.getOperand(2);
10802 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
10803
10804 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10805 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
10806
10807 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
10808 // sequence later, to guarantee the integrity of the intermediate values.
10810 "aarch64-jump-table-hardening")) {
10812 if (Subtarget->isTargetMachO()) {
10813 if (CM != CodeModel::Small && CM != CodeModel::Large)
10814 report_fatal_error("Unsupported code-model for hardened jump-table");
10815 } else {
10816 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
10817 assert(Subtarget->isTargetELF() &&
10818 "jump table hardening only supported on MachO/ELF");
10819 if (CM != CodeModel::Small)
10820 report_fatal_error("Unsupported code-model for hardened jump-table");
10821 }
10822
10823 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
10824 Entry, SDValue());
10825 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
10826 DAG.getTargetJumpTable(JTI, MVT::i32),
10827 X16Copy.getValue(0), X16Copy.getValue(1));
10828 return SDValue(B, 0);
10829 }
10830
10831 SDNode *Dest =
10832 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
10833 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
10834 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
10835 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
10836}
10837
10838SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
10839 SDValue Chain = Op.getOperand(0);
10840 SDValue Dest = Op.getOperand(1);
10841
10842 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
10843 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
10844 if (Dest->isMachineOpcode() &&
10845 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
10846 return SDValue();
10847
10848 const MachineFunction &MF = DAG.getMachineFunction();
10849 std::optional<uint16_t> BADisc =
10851 if (!BADisc)
10852 return SDValue();
10853
10854 SDLoc DL(Op);
10855
10856 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
10858 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
10859
10860 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
10861 {Dest, Key, Disc, AddrDisc, Chain});
10862 return SDValue(BrA, 0);
10863}
10864
10865SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
10866 SelectionDAG &DAG) const {
10867 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
10869 if (CM == CodeModel::Large) {
10870 // Use the GOT for the large code model on iOS.
10871 if (Subtarget->isTargetMachO()) {
10872 return getGOT(CP, DAG);
10873 }
10875 return getAddrLarge(CP, DAG);
10876 } else if (CM == CodeModel::Tiny) {
10877 return getAddrTiny(CP, DAG);
10878 }
10879 return getAddr(CP, DAG);
10880}
10881
10882SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
10883 SelectionDAG &DAG) const {
10884 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
10885 const BlockAddress *BA = BAN->getBlockAddress();
10886
10887 if (std::optional<uint16_t> BADisc =
10889 *BA->getFunction())) {
10890 SDLoc DL(Op);
10891
10892 // This isn't cheap, but BRIND is rare.
10893 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
10894
10895 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
10896
10898 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
10899
10900 SDNode *MOV =
10901 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
10902 {TargetBA, Key, AddrDisc, Disc});
10903 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
10904 SDValue(MOV, 1));
10905 }
10906
10908 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
10910 return getAddrLarge(BAN, DAG);
10911 } else if (CM == CodeModel::Tiny) {
10912 return getAddrTiny(BAN, DAG);
10913 }
10914 return getAddr(BAN, DAG);
10915}
10916
10917SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
10918 SelectionDAG &DAG) const {
10919 AArch64FunctionInfo *FuncInfo =
10920 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10921
10922 SDLoc DL(Op);
10923 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
10925 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
10926 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10927 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10928 MachinePointerInfo(SV));
10929}
10930
10931SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
10932 SelectionDAG &DAG) const {
10935
10936 SDLoc DL(Op);
10937 SDValue FR;
10938 if (Subtarget->isWindowsArm64EC()) {
10939 // With the Arm64EC ABI, we compute the address of the varargs save area
10940 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
10941 // but calls from an entry thunk can pass in a different address.
10942 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
10943 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
10945 if (FuncInfo->getVarArgsGPRSize() > 0)
10946 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
10947 else
10948 StackOffset = FuncInfo->getVarArgsStackOffset();
10949 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
10950 DAG.getConstant(StackOffset, DL, MVT::i64));
10951 } else {
10952 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
10953 ? FuncInfo->getVarArgsGPRIndex()
10954 : FuncInfo->getVarArgsStackIndex(),
10956 }
10957 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10958 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
10959 MachinePointerInfo(SV));
10960}
10961
10962SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
10963 SelectionDAG &DAG) const {
10964 // The layout of the va_list struct is specified in the AArch64 Procedure Call
10965 // Standard, section B.3.
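// Roughly, the layout being populated below is:
//   struct va_list {
//     void *__stack;  // offset 0
//     void *__gr_top; // offset 8  (4 on ILP32)
//     void *__vr_top; // offset 16 (8 on ILP32)
//     int __gr_offs;  // offset 24 (12 on ILP32)
//     int __vr_offs;  // offset 28 (16 on ILP32)
//   };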
10968 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
10969 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10970 auto PtrVT = getPointerTy(DAG.getDataLayout());
10971 SDLoc DL(Op);
10972
10973 SDValue Chain = Op.getOperand(0);
10974 SDValue VAList = Op.getOperand(1);
10975 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
10977
10978 // void *__stack at offset 0
10979 unsigned Offset = 0;
10980 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
10981 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
10982 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
10983 MachinePointerInfo(SV), Align(PtrSize)));
10984
10985 // void *__gr_top at offset 8 (4 on ILP32)
10986 Offset += PtrSize;
10987 int GPRSize = FuncInfo->getVarArgsGPRSize();
10988 if (GPRSize > 0) {
10989 SDValue GRTop, GRTopAddr;
10990
10991 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
10992 DAG.getConstant(Offset, DL, PtrVT));
10993
10994 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
10995 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
10996 DAG.getConstant(GPRSize, DL, PtrVT));
10997 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
10998
10999 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
11001 Align(PtrSize)));
11002 }
11003
11004 // void *__vr_top at offset 16 (8 on ILP32)
11005 Offset += PtrSize;
11006 int FPRSize = FuncInfo->getVarArgsFPRSize();
11007 if (FPRSize > 0) {
11008 SDValue VRTop, VRTopAddr;
11009 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11010 DAG.getConstant(Offset, DL, PtrVT));
11011
11012 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
11013 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
11014 DAG.getConstant(FPRSize, DL, PtrVT));
11015 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
11016
11017 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
11019 Align(PtrSize)));
11020 }
11021
11022 // int __gr_offs at offset 24 (12 on ILP32)
11023 Offset += PtrSize;
11024 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11025 DAG.getConstant(Offset, DL, PtrVT));
11026 MemOps.push_back(
11027 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
11028 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11029
11030 // int __vr_offs at offset 28 (16 on ILP32)
11031 Offset += 4;
11032 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11033 DAG.getConstant(Offset, DL, PtrVT));
11034 MemOps.push_back(
11035 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
11036 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
11037
11038 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
11039}
11040
11041SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
11042 SelectionDAG &DAG) const {
11043 MachineFunction &MF = DAG.getMachineFunction();
11044 Function &F = MF.getFunction();
11045
11046 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
11047 return LowerWin64_VASTART(Op, DAG);
11048 else if (Subtarget->isTargetDarwin())
11049 return LowerDarwin_VASTART(Op, DAG);
11050 else
11051 return LowerAAPCS_VASTART(Op, DAG);
11052}
11053
11054SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
11055 SelectionDAG &DAG) const {
11056 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
11057 // pointer.
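// (Three 8-byte pointers plus two 4-byte ints give 32 bytes; with 4-byte
// ILP32 pointers the same struct is 3 * 4 + 2 * 4 = 20 bytes.)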
11058 SDLoc DL(Op);
11059 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
11060 unsigned VaListSize =
11061 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
11062 ? PtrSize
11063 : Subtarget->isTargetILP32() ? 20 : 32;
11064 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
11065 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
11066
11067 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
11068 DAG.getConstant(VaListSize, DL, MVT::i32),
11069 Align(PtrSize), false, false, /*CI=*/nullptr,
11070 std::nullopt, MachinePointerInfo(DestSV),
11071 MachinePointerInfo(SrcSV));
11072}
11073
11074SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
11075 assert(Subtarget->isTargetDarwin() &&
11076 "automatic va_arg instruction only works on Darwin");
11077
11078 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
11079 EVT VT = Op.getValueType();
11080 SDLoc DL(Op);
11081 SDValue Chain = Op.getOperand(0);
11082 SDValue Addr = Op.getOperand(1);
11083 MaybeAlign Align(Op.getConstantOperandVal(3));
11084 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
11085 auto PtrVT = getPointerTy(DAG.getDataLayout());
11086 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
11087 SDValue VAList =
11088 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
11089 Chain = VAList.getValue(1);
11090 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
11091
11092 if (VT.isScalableVector())
11093 report_fatal_error("Passing SVE types to variadic functions is "
11094 "currently not supported");
11095
11096 if (Align && *Align > MinSlotSize) {
11097 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11098 DAG.getConstant(Align->value() - 1, DL, PtrVT));
11099 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
11100 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
11101 }
11102
11103 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
11104 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
11105
11106 // Scalar integer and FP values smaller than 64 bits are implicitly extended
11107 // up to 64 bits. At the very least, we have to increase the striding of the
11108 // vaargs list to match this, and for FP values we need to introduce
11109 // FP_ROUND nodes as well.
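// For example, an f32 argument occupies an 8-byte slot, so it is loaded as
// an f64 below and then rounded back down to f32.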
11110 if (VT.isInteger() && !VT.isVector())
11111 ArgSize = std::max(ArgSize, MinSlotSize);
11112 bool NeedFPTrunc = false;
11113 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
11114 ArgSize = 8;
11115 NeedFPTrunc = true;
11116 }
11117
11118 // Increment the pointer, VAList, to the next vaarg
11119 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
11120 DAG.getConstant(ArgSize, DL, PtrVT));
11121 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
11122
11123 // Store the incremented VAList to the legalized pointer
11124 SDValue APStore =
11125 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
11126
11127 // Load the actual argument out of the pointer VAList
11128 if (NeedFPTrunc) {
11129 // Load the value as an f64.
11130 SDValue WideFP =
11131 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
11132 // Round the value down to an f32.
11133 SDValue NarrowFP =
11134 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
11135 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
11136 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
11137 // Merge the rounded value with the chain output of the load.
11138 return DAG.getMergeValues(Ops, DL);
11139 }
11140
11141 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
11142}
11143
11144SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
11145 SelectionDAG &DAG) const {
11147 MFI.setFrameAddressIsTaken(true);
11148
11149 EVT VT = Op.getValueType();
11150 SDLoc DL(Op);
11151 unsigned Depth = Op.getConstantOperandVal(0);
11152 SDValue FrameAddr =
11153 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
11154 while (Depth--)
11155 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
11157
11158 if (Subtarget->isTargetILP32())
11159 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
11160 DAG.getValueType(VT));
11161
11162 return FrameAddr;
11163}
11164
11165SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
11166 SelectionDAG &DAG) const {
11168
11169 EVT VT = getPointerTy(DAG.getDataLayout());
11170 SDLoc DL(Op);
11171 int FI = MFI.CreateFixedObject(4, 0, false);
11172 return DAG.getFrameIndex(FI, VT);
11173}
11174
11175#define GET_REGISTER_MATCHER
11176#include "AArch64GenAsmMatcher.inc"
11177
11178// FIXME? Maybe this could be a TableGen attribute on some registers and
11179// this table could be generated automatically from RegInfo.
11180Register AArch64TargetLowering::
11181getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
11183 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
11184 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
11185 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
11186 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
11187 !MRI->isReservedReg(MF, Reg))
11188 Reg = 0;
11189 }
11190 if (Reg)
11191 return Reg;
11192 report_fatal_error(Twine("Invalid register name \""
11193 + StringRef(RegName) + "\"."));
11194}
11195
11196SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
11197 SelectionDAG &DAG) const {
11199
11200 EVT VT = Op.getValueType();
11201 SDLoc DL(Op);
11202
11203 SDValue FrameAddr =
11204 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
11206
11207 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
11208}
11209
11210SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
11211 SelectionDAG &DAG) const {
11212 MachineFunction &MF = DAG.getMachineFunction();
11213 MachineFrameInfo &MFI = MF.getFrameInfo();
11214 MFI.setReturnAddressIsTaken(true);
11215
11216 EVT VT = Op.getValueType();
11217 SDLoc DL(Op);
11218 unsigned Depth = Op.getConstantOperandVal(0);
11219 SDValue ReturnAddress;
11220 if (Depth) {
11221 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
11223 ReturnAddress = DAG.getLoad(
11224 VT, DL, DAG.getEntryNode(),
11225 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
11226 } else {
11227 // Return LR, which contains the return address. Mark it an implicit
11228 // live-in.
11229 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
11230 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
11231 }
11232
11233 // The XPACLRI instruction assembles to a hint-space instruction before
11234 // Armv8.3-A, therefore it can be used safely on any pre-Armv8.3-A
11235 // architecture. On Armv8.3-A and onwards, XPACI is available, so use
11236 // that instead.
11237 SDNode *St;
11238 if (Subtarget->hasPAuth()) {
11239 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
11240 } else {
11241 // XPACLRI operates on LR therefore we must move the operand accordingly.
11242 SDValue Chain =
11243 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
11244 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
11245 }
11246 return SDValue(St, 0);
11247}
11248
11249 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
11250 /// i32 values and take a 2 x i32 value to shift plus a shift amount.
11251SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
11252 SelectionDAG &DAG) const {
11253 SDValue Lo, Hi;
11254 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
11255 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
11256}
11257
11259 const GlobalAddressSDNode *GA) const {
11260 // Offsets are folded in the DAG combine rather than here so that we can
11261 // intelligently choose an offset based on the uses.
11262 return false;
11263}
11264
11266 bool OptForSize) const {
11267 bool IsLegal = false;
11268 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
11269 // and for the 16-bit case when the target has full FP16 support.
11270 // We encode bf16 bit patterns as if they were fp16. This results in very
11271 // strange looking assembly but should populate the register with appropriate
11272 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
11273 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
11274 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
11275 // FIXME: We should be able to handle f128 as well with a clever lowering.
11276 const APInt ImmInt = Imm.bitcastToAPInt();
11277 if (VT == MVT::f64)
11278 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
11279 else if (VT == MVT::f32)
11280 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
11281 else if (VT == MVT::f16 || VT == MVT::bf16)
11282 IsLegal =
11283 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
11284 Imm.isPosZero();
11285
11286 // If we cannot materialize the value in the immediate field of an fmov,
11287 // check if it can be encoded as the immediate operand of a logical instruction.
11288 // The immediate value will be created with either MOVZ, MOVN, or ORR.
11289 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
11290 // generate that fmov.
11291 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
11292 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
11293 // however the mov+fmov sequence is always better because of the reduced
11294 // cache pressure. The timings are still the same if you consider
11295 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
11296 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
11299 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
11300 IsLegal = Insn.size() <= Limit;
11301 }
11302
11303 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
11304 << " imm value: "; Imm.dump(););
11305 return IsLegal;
11306}
11307
11308//===----------------------------------------------------------------------===//
11309// AArch64 Optimization Hooks
11310//===----------------------------------------------------------------------===//
11311
11312static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
11313 SDValue Operand, SelectionDAG &DAG,
11314 int &ExtraSteps) {
11315 EVT VT = Operand.getValueType();
11316 if ((ST->hasNEON() &&
11317 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
11318 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
11319 VT == MVT::v4f32)) ||
11320 (ST->hasSVE() &&
11321 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
11323 // For the reciprocal estimates, convergence is quadratic, so the number
11324 // of digits is doubled after each iteration. In ARMv8, the accuracy of
11325 // the initial estimate is 2^-8. Thus the number of extra steps to refine
11326 // the result for float (23 mantissa bits) is 2 and for double (52
11327 // mantissa bits) is 3.
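// Worked example of the formula below: with 8 accurate bits, float needs
// ceil(log2(23)) - ceil(log2(8)) = 5 - 3 = 2 extra steps and double needs
// ceil(log2(52)) - ceil(log2(8)) = 6 - 3 = 3.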
11328 constexpr unsigned AccurateBits = 8;
11329 unsigned DesiredBits =
11331 ExtraSteps = DesiredBits <= AccurateBits
11332 ? 0
11333 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
11334 }
11335
11336 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
11337 }
11338
11339 return SDValue();
11340}
11341
11342SDValue
11343AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
11344 const DenormalMode &Mode) const {
11345 SDLoc DL(Op);
11346 EVT VT = Op.getValueType();
11347 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
11348 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
11349 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
11350}
11351
11352SDValue
11353AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
11354 SelectionDAG &DAG) const {
11355 return Op;
11356}
11357
11358SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
11359 SelectionDAG &DAG, int Enabled,
11360 int &ExtraSteps,
11361 bool &UseOneConst,
11362 bool Reciprocal) const {
11364 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
11365 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
11366 DAG, ExtraSteps)) {
11367 SDLoc DL(Operand);
11368 EVT VT = Operand.getValueType();
11369
11371 Flags.setAllowReassociation(true);
11372
11373 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
11374 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
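// One refinement step below therefore computes
//   E' = E * frsqrts(X, E * E) = E * 0.5 * (3 - X * E * E),
// the Newton-Raphson update for 1/sqrt(X).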
11375 for (int i = ExtraSteps; i > 0; --i) {
11376 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
11377 Flags);
11378 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
11379 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11380 }
11381 if (!Reciprocal)
11382 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
11383
11384 ExtraSteps = 0;
11385 return Estimate;
11386 }
11387
11388 return SDValue();
11389}
11390
11391SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
11392 SelectionDAG &DAG, int Enabled,
11393 int &ExtraSteps) const {
11395 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
11396 DAG, ExtraSteps)) {
11397 SDLoc DL(Operand);
11398 EVT VT = Operand.getValueType();
11399
11401 Flags.setAllowReassociation(true);
11402
11403 // Newton reciprocal iteration: E * (2 - X * E)
11404 // AArch64 reciprocal iteration instruction: (2 - M * N)
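// One refinement step below therefore computes
//   E' = E * frecps(X, E) = E * (2 - X * E),
// the Newton-Raphson update for 1/X.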
11405 for (int i = ExtraSteps; i > 0; --i) {
11406 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
11407 Estimate, Flags);
11408 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
11409 }
11410
11411 ExtraSteps = 0;
11412 return Estimate;
11413 }
11414
11415 return SDValue();
11416}
11417
11418//===----------------------------------------------------------------------===//
11419// AArch64 Inline Assembly Support
11420//===----------------------------------------------------------------------===//
11421
11422// Table of Constraints
11423// TODO: This is the current set of constraints supported by ARM for the
11424 // compiler; not all of them may make sense.
11425//
11426// r - A general register
11427// w - An FP/SIMD register of some size in the range v0-v31
11428// x - An FP/SIMD register of some size in the range v0-v15
11429// I - Constant that can be used with an ADD instruction
11430// J - Constant that can be used with a SUB instruction
11431// K - Constant that can be used with a 32-bit logical instruction
11432// L - Constant that can be used with a 64-bit logical instruction
11433// M - Constant that can be used as a 32-bit MOV immediate
11434// N - Constant that can be used as a 64-bit MOV immediate
11435// Q - A memory reference with base register and no offset
11436// S - A symbolic address
11437// Y - Floating point constant zero
11438// Z - Integer constant zero
11439//
11440// Note that general register operands will be output using their 64-bit x
11441// register name, whatever the size of the variable, unless the asm operand
11442// is prefixed by the %w modifier. Floating-point and SIMD register operands
11443// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
11444// %q modifier.
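// Hypothetical user-level illustrations (not taken from this file) of how
// a few of these constraints might appear in GNU-style inline assembly:
//   asm("add %w0, %w1, %2" : "=r"(res) : "r"(a), "I"(4095)); // 'I': ADD imm
//   asm("fmov %s0, %s1"    : "=w"(dst) : "w"(src));          // 'w': FP/SIMD reg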
11445const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
11446 // At this point, we have to lower this constraint to something else, so we
11447 // lower it to an "r" or "w". However, by doing this we will force the result
11448 // to be in register, while the X constraint is much more permissive.
11449 //
11450 // Although we are correct (we are free to emit anything, without
11451 // constraints), we might break use cases that would expect us to be more
11452 // efficient and emit something else.
11453 if (!Subtarget->hasFPARMv8())
11454 return "r";
11455
11456 if (ConstraintVT.isFloatingPoint())
11457 return "w";
11458
11459 if (ConstraintVT.isVector() &&
11460 (ConstraintVT.getSizeInBits() == 64 ||
11461 ConstraintVT.getSizeInBits() == 128))
11462 return "w";
11463
11464 return "r";
11465}
11466
11468
11469static std::optional<PredicateConstraint>
11472 .Case("Uph", PredicateConstraint::Uph)
11473 .Case("Upl", PredicateConstraint::Upl)
11474 .Case("Upa", PredicateConstraint::Upa)
11475 .Default(std::nullopt);
11476}
11477
11478static const TargetRegisterClass *
11480 if (VT != MVT::aarch64svcount &&
11481 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
11482 return nullptr;
11483
11484 switch (Constraint) {
11485 case PredicateConstraint::Uph:
11486 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
11487 : &AArch64::PPR_p8to15RegClass;
11488 case PredicateConstraint::Upl:
11489 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
11490 : &AArch64::PPR_3bRegClass;
11491 case PredicateConstraint::Upa:
11492 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
11493 : &AArch64::PPRRegClass;
11494 }
11495
11496 llvm_unreachable("Missing PredicateConstraint!");
11497}
11498
11500
11501static std::optional<ReducedGprConstraint>
11504 .Case("Uci", ReducedGprConstraint::Uci)
11505 .Case("Ucj", ReducedGprConstraint::Ucj)
11506 .Default(std::nullopt);
11507}
11508
11509static const TargetRegisterClass *
11511 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
11512 return nullptr;
11513
11514 switch (Constraint) {
11515 case ReducedGprConstraint::Uci:
11516 return &AArch64::MatrixIndexGPR32_8_11RegClass;
11517 case ReducedGprConstraint::Ucj:
11518 return &AArch64::MatrixIndexGPR32_12_15RegClass;
11519 }
11520
11521 llvm_unreachable("Missing ReducedGprConstraint!");
11522}
11523
11524// The set of cc code supported is from
11525// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
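// Hypothetical user-level illustration (not taken from this file): a flag
// output operand such as
//   asm("adds %w0, %w2, %w3" : "=r"(res), "=@ccvs"(ov) : "r"(a), "r"(b));
// leaves ov nonzero iff the ADDS set the V (overflow) flag.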
11528 .Case("{@cchi}", AArch64CC::HI)
11529 .Case("{@cccs}", AArch64CC::HS)
11530 .Case("{@cclo}", AArch64CC::LO)
11531 .Case("{@ccls}", AArch64CC::LS)
11532 .Case("{@cccc}", AArch64CC::LO)
11533 .Case("{@cceq}", AArch64CC::EQ)
11534 .Case("{@ccgt}", AArch64CC::GT)
11535 .Case("{@ccge}", AArch64CC::GE)
11536 .Case("{@cclt}", AArch64CC::LT)
11537 .Case("{@ccle}", AArch64CC::LE)
11538 .Case("{@cchs}", AArch64CC::HS)
11539 .Case("{@ccne}", AArch64CC::NE)
11540 .Case("{@ccvc}", AArch64CC::VC)
11541 .Case("{@ccpl}", AArch64CC::PL)
11542 .Case("{@ccvs}", AArch64CC::VS)
11543 .Case("{@ccmi}", AArch64CC::MI)
11545 return Cond;
11546}
11547
11548/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
11549/// WZR, invert(<cond>)'.
11551 SelectionDAG &DAG) {
11552 return DAG.getNode(
11553 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
11554 DAG.getConstant(0, DL, MVT::i32),
11555 DAG.getConstant(getInvertedCondCode(CC), DL, MVT::i32), NZCV);
11556}
11557
11558// Lower @cc flag output via getSETCC.
11559SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
11560 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
11561 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
11562 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
11563 if (Cond == AArch64CC::Invalid)
11564 return SDValue();
11565 // The output variable should be a scalar integer.
11566 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
11567 OpInfo.ConstraintVT.getSizeInBits() < 8)
11568 report_fatal_error("Flag output operand is of invalid type");
11569
11570 // Get NZCV register. Only update chain when copyfrom is glued.
11571 if (Glue.getNode()) {
11572 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32, Glue);
11573 Chain = Glue.getValue(1);
11574 } else
11575 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, MVT::i32);
11576 // Extract CC code.
11577 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
11578
11579 SDValue Result;
11580
11581 // Truncate or ZERO_EXTEND based on value types.
11582 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
11583 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
11584 else
11585 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
11586
11587 return Result;
11588}
11589
11590/// getConstraintType - Given a constraint letter, return the type of
11591/// constraint it is for this target.
11593AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
11594 if (Constraint.size() == 1) {
11595 switch (Constraint[0]) {
11596 default:
11597 break;
11598 case 'x':
11599 case 'w':
11600 case 'y':
11601 return C_RegisterClass;
11602 // An address with a single base register. Due to the way we
11603 // currently handle addresses it is the same as 'r'.
11604 case 'Q':
11605 return C_Memory;
11606 case 'I':
11607 case 'J':
11608 case 'K':
11609 case 'L':
11610 case 'M':
11611 case 'N':
11612 case 'Y':
11613 case 'Z':
11614 return C_Immediate;
11615 case 'z':
11616 case 'S': // A symbol or label reference with a constant offset
11617 return C_Other;
11618 }
11619 } else if (parsePredicateConstraint(Constraint))
11620 return C_RegisterClass;
11621 else if (parseReducedGprConstraint(Constraint))
11622 return C_RegisterClass;
11623 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
11624 return C_Other;
11625 return TargetLowering::getConstraintType(Constraint);
11626}
11627
11628/// Examine constraint type and operand type and determine a weight value.
11629/// This object must already have been set up with the operand type
11630/// and the current alternative constraint selected.
11632AArch64TargetLowering::getSingleConstraintMatchWeight(
11633 AsmOperandInfo &info, const char *constraint) const {
11634 ConstraintWeight weight = CW_Invalid;
11635 Value *CallOperandVal = info.CallOperandVal;
11636 // If we don't have a value, we can't do a match,
11637 // but allow it at the lowest weight.
11638 if (!CallOperandVal)
11639 return CW_Default;
11640 Type *type = CallOperandVal->getType();
11641 // Look at the constraint type.
11642 switch (*constraint) {
11643 default:
11644 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
11645 break;
11646 case 'x':
11647 case 'w':
11648 case 'y':
11649 if (type->isFloatingPointTy() || type->isVectorTy())
11650 weight = CW_Register;
11651 break;
11652 case 'z':
11653 weight = CW_Constant;
11654 break;
11655 case 'U':
11656 if (parsePredicateConstraint(constraint) ||
11657 parseReducedGprConstraint(constraint))
11658 weight = CW_Register;
11659 break;
11660 }
11661 return weight;
11662}
11663
11664std::pair<unsigned, const TargetRegisterClass *>
11665AArch64TargetLowering::getRegForInlineAsmConstraint(
11666 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
11667 if (Constraint.size() == 1) {
11668 switch (Constraint[0]) {
11669 case 'r':
11670 if (VT.isScalableVector())
11671 return std::make_pair(0U, nullptr);
11672 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
11673 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
11674 if (VT.getFixedSizeInBits() == 64)
11675 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
11676 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
11677 case 'w': {
11678 if (!Subtarget->hasFPARMv8())
11679 break;
11680 if (VT.isScalableVector()) {
11681 if (VT.getVectorElementType() != MVT::i1)
11682 return std::make_pair(0U, &AArch64::ZPRRegClass);
11683 return std::make_pair(0U, nullptr);
11684 }
11685 if (VT == MVT::Other)
11686 break;
11687 uint64_t VTSize = VT.getFixedSizeInBits();
11688 if (VTSize == 16)
11689 return std::make_pair(0U, &AArch64::FPR16RegClass);
11690 if (VTSize == 32)
11691 return std::make_pair(0U, &AArch64::FPR32RegClass);
11692 if (VTSize == 64)
11693 return std::make_pair(0U, &AArch64::FPR64RegClass);
11694 if (VTSize == 128)
11695 return std::make_pair(0U, &AArch64::FPR128RegClass);
11696 break;
11697 }
11698 // The instructions that this constraint is designed for can
11699 // only take 128-bit registers so just use that regclass.
11700 case 'x':
11701 if (!Subtarget->hasFPARMv8())
11702 break;
11703 if (VT.isScalableVector())
11704 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
11705 if (VT.getSizeInBits() == 128)
11706 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
11707 break;
11708 case 'y':
11709 if (!Subtarget->hasFPARMv8())
11710 break;
11711 if (VT.isScalableVector())
11712 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
11713 break;
11714 }
11715 } else {
11716 if (const auto PC = parsePredicateConstraint(Constraint))
11717 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
11718 return std::make_pair(0U, RegClass);
11719
11720 if (const auto RGC = parseReducedGprConstraint(Constraint))
11721 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
11722 return std::make_pair(0U, RegClass);
11723 }
11724 if (StringRef("{cc}").equals_insensitive(Constraint) ||
11726 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
11727
11728 if (Constraint == "{za}") {
11729 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
11730 }
11731
11732 if (Constraint == "{zt0}") {
11733 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
11734 }
11735
11736 // Use the default implementation in TargetLowering to convert the register
11737 // constraint into a member of a register class.
11738 std::pair<unsigned, const TargetRegisterClass *> Res;
11740
11741 // Not found as a standard register?
11742 if (!Res.second) {
11743 unsigned Size = Constraint.size();
11744 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
11745 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
11746 int RegNo;
11747 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
11748 if (!Failed && RegNo >= 0 && RegNo <= 31) {
11749 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
11750 // By default we'll emit v0-v31 for this unless there's a modifier where
11751 // we'll emit the correct register as well.
11752 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
11753 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
11754 Res.second = &AArch64::FPR64RegClass;
11755 } else {
11756 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
11757 Res.second = &AArch64::FPR128RegClass;
11758 }
11759 }
11760 }
11761 }
11762
11763 if (Res.second && !Subtarget->hasFPARMv8() &&
11764 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
11765 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
11766 return std::make_pair(0U, nullptr);
11767
11768 return Res;
11769}
11770
11772 llvm::Type *Ty,
11773 bool AllowUnknown) const {
11774 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
11775 return EVT(MVT::i64x8);
11776
11777 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
11778}
11779
11780/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
11781/// vector. If it is invalid, don't add anything to Ops.
11782void AArch64TargetLowering::LowerAsmOperandForConstraint(
11783 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
11784 SelectionDAG &DAG) const {
11785 SDValue Result;
11786
11787 // Currently only support length 1 constraints.
11788 if (Constraint.size() != 1)
11789 return;
11790
11791 char ConstraintLetter = Constraint[0];
11792 switch (ConstraintLetter) {
11793 default:
11794 break;
11795
11796 // This set of constraints deals with valid constants for various instructions.
11797 // Validate and return a target constant for them if we can.
11798 case 'z': {
11799 // 'z' maps to xzr or wzr so it needs an input of 0.
11800 if (!isNullConstant(Op))
11801 return;
11802
11803 if (Op.getValueType() == MVT::i64)
11804 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
11805 else
11806 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
11807 break;
11808 }
11809 case 'S':
11810 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
11811 // supported for PIC while "s" isn't, making "s" less useful. We implement
11812 // "S" but not "s".
11814 break;
11815
11816 case 'I':
11817 case 'J':
11818 case 'K':
11819 case 'L':
11820 case 'M':
11821 case 'N':
11822 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
11823 if (!C)
11824 return;
11825
11826 // Grab the value and do some validation.
11827 uint64_t CVal = C->getZExtValue();
11828 switch (ConstraintLetter) {
11829 // The I constraint applies only to simple ADD or SUB immediate operands:
11830 // i.e. 0 to 4095 with optional shift by 12
11831 // The J constraint applies only to ADD or SUB immediates that would be
11832 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
11833 // instruction [or vice versa], in other words -1 to -4095 with optional
11834 // left shift by 12.
11835 case 'I':
11836 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
11837 break;
11838 return;
11839 case 'J': {
11840 uint64_t NVal = -C->getSExtValue();
11841 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
11842 CVal = C->getSExtValue();
11843 break;
11844 }
11845 return;
11846 }
11847 // The K and L constraints apply *only* to logical immediates, including
11848 // what used to be the MOVI alias for ORR (though the MOVI alias has now
11849 // been removed and MOV should be used). So these constraints have to
11850 // distinguish between bit patterns that are valid 32-bit or 64-bit
11851 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
11852 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
11853 // versa.
11854 case 'K':
11855 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11856 break;
11857 return;
11858 case 'L':
11859 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11860 break;
11861 return;
11862 // The M and N constraints are a superset of K and L respectively, for use
11863 // with the MOV (immediate) alias. As well as the logical immediates they
11864 // also match 32 or 64-bit immediates that can be loaded either using a
11865 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
11866 // (M) or 64-bit 0x1234000000000000 (N) etc.
11867 // As a note some of this code is liberally stolen from the asm parser.
11868 case 'M': {
11869 if (!isUInt<32>(CVal))
11870 return;
11871 if (AArch64_AM::isLogicalImmediate(CVal, 32))
11872 break;
11873 if ((CVal & 0xFFFF) == CVal)
11874 break;
11875 if ((CVal & 0xFFFF0000ULL) == CVal)
11876 break;
11877 uint64_t NCVal = ~(uint32_t)CVal;
11878 if ((NCVal & 0xFFFFULL) == NCVal)
11879 break;
11880 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11881 break;
11882 return;
11883 }
11884 case 'N': {
11885 if (AArch64_AM::isLogicalImmediate(CVal, 64))
11886 break;
11887 if ((CVal & 0xFFFFULL) == CVal)
11888 break;
11889 if ((CVal & 0xFFFF0000ULL) == CVal)
11890 break;
11891 if ((CVal & 0xFFFF00000000ULL) == CVal)
11892 break;
11893 if ((CVal & 0xFFFF000000000000ULL) == CVal)
11894 break;
11895 uint64_t NCVal = ~CVal;
11896 if ((NCVal & 0xFFFFULL) == NCVal)
11897 break;
11898 if ((NCVal & 0xFFFF0000ULL) == NCVal)
11899 break;
11900 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
11901 break;
11902 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
11903 break;
11904 return;
11905 }
11906 default:
11907 return;
11908 }
11909
11910 // All assembler immediates are 64-bit integers.
11911 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
11912 break;
11913 }
11914
11915 if (Result.getNode()) {
11916 Ops.push_back(Result);
11917 return;
11918 }
11919
11920 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
11921}
11922
11923//===----------------------------------------------------------------------===//
11924// AArch64 Advanced SIMD Support
11925//===----------------------------------------------------------------------===//
11926
11927/// WidenVector - Given a value in the V64 register class, produce the
11928/// equivalent value in the V128 register class.
11930 EVT VT = V64Reg.getValueType();
11931 unsigned NarrowSize = VT.getVectorNumElements();
11932 MVT EltTy = VT.getVectorElementType().getSimpleVT();
11933 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
11934 SDLoc DL(V64Reg);
11935
11936 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
11937 V64Reg, DAG.getConstant(0, DL, MVT::i64));
11938}
11939
11940/// getExtFactor - Determine the adjustment factor for the position when
11941/// generating an "extract from vector registers" instruction.
11942static unsigned getExtFactor(SDValue &V) {
11943 EVT EltType = V.getValueType().getVectorElementType();
11944 return EltType.getSizeInBits() / 8;
11945}
11946
11947// Check if a vector is built from one vector via extracted elements of
11948// another together with an AND mask, ensuring that all elements fit
11949// within range. This can be reconstructed using AND and NEON's TBL1.
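// For illustration, a BUILD_VECTOR whose lane i is
//   extract_elt(Src, and(extract_elt(MaskVec, i), C[i]))
// can instead be emitted as tbl1(Src, and(MaskVec, C)), replacing the
// per-lane extracts and inserts with two vector instructions.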
11951 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11952 SDLoc dl(Op);
11953 EVT VT = Op.getValueType();
11954 assert(!VT.isScalableVector() &&
11955 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
11956
11957 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
11958 // directly to TBL1.
11959 if (VT != MVT::v16i8 && VT != MVT::v8i8)
11960 return SDValue();
11961
11962 unsigned NumElts = VT.getVectorNumElements();
11963 assert((NumElts == 8 || NumElts == 16) &&
11964 "Need to have exactly 8 or 16 elements in vector.");
11965
11966 SDValue SourceVec;
11967 SDValue MaskSourceVec;
11968 SmallVector<SDValue, 16> AndMaskConstants;
11969
11970 for (unsigned i = 0; i < NumElts; ++i) {
11971 SDValue V = Op.getOperand(i);
11972 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11973 return SDValue();
11974
11975 SDValue OperandSourceVec = V.getOperand(0);
11976 if (!SourceVec)
11977 SourceVec = OperandSourceVec;
11978 else if (SourceVec != OperandSourceVec)
11979 return SDValue();
11980
11981 // This only looks at shuffles with elements that are
11982 // a) truncated by a constant AND mask extracted from a mask vector, or
11983 // b) extracted directly from a mask vector.
11984 SDValue MaskSource = V.getOperand(1);
11985 if (MaskSource.getOpcode() == ISD::AND) {
11986 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
11987 return SDValue();
11988
11989 AndMaskConstants.push_back(MaskSource.getOperand(1));
11990 MaskSource = MaskSource->getOperand(0);
11991 } else if (!AndMaskConstants.empty()) {
11992 // Either all or no operands should have an AND mask.
11993 return SDValue();
11994 }
11995
11996 // An ANY_EXTEND may be inserted between the AND and the source vector
11997 // extraction. We don't care about that, so we can just skip it.
11998 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
11999 MaskSource = MaskSource.getOperand(0);
12000
12001 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12002 return SDValue();
12003
12004 SDValue MaskIdx = MaskSource.getOperand(1);
12005 if (!isa<ConstantSDNode>(MaskIdx) ||
12006 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
12007 return SDValue();
12008
12009 // We only apply this if all elements come from the same vector with the
12010 // same vector type.
12011 if (!MaskSourceVec) {
12012 MaskSourceVec = MaskSource->getOperand(0);
12013 if (MaskSourceVec.getValueType() != VT)
12014 return SDValue();
12015 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
12016 return SDValue();
12017 }
12018 }
12019
12020 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
12021 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
12022 // insert, we know that the index in the mask must be smaller than the number
12023 // of elements in the source, or we would have an out-of-bounds access.
12024 if (NumElts == 8)
12025 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, SourceVec,
12026 DAG.getUNDEF(VT));
12027
12028 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
12029 if (!AndMaskConstants.empty())
12030 MaskSourceVec = DAG.getNode(ISD::AND, dl, VT, MaskSourceVec,
12031 DAG.getBuildVector(VT, dl, AndMaskConstants));
12032
12033 return DAG.getNode(
12035 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, dl, MVT::i32), SourceVec,
12036 MaskSourceVec);
12037}
12038
12039// Gather data to see if the operation can be modelled as a
12040// shuffle in combination with VEXTs.
12042 SelectionDAG &DAG) const {
12043 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
12044 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
12045 SDLoc dl(Op);
12046 EVT VT = Op.getValueType();
12047 assert(!VT.isScalableVector() &&
12048 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
12049 unsigned NumElts = VT.getVectorNumElements();
12050
12051 struct ShuffleSourceInfo {
12052 SDValue Vec;
12053 unsigned MinElt;
12054 unsigned MaxElt;
12055
12056 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
12057 // be compatible with the shuffle we intend to construct. As a result
12058 // ShuffleVec will be some sliding window into the original Vec.
12059 SDValue ShuffleVec;
12060
12061 // Code should guarantee that element i in Vec starts at element "WindowBase
12062 // + i * WindowScale in ShuffleVec".
12063 int WindowBase;
12064 int WindowScale;
12065
12066 ShuffleSourceInfo(SDValue Vec)
12067 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
12068 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
12069
12070 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
12071 };
12072
12073 // First gather all vectors used as an immediate source for this BUILD_VECTOR
12074 // node.
12076 for (unsigned i = 0; i < NumElts; ++i) {
12077 SDValue V = Op.getOperand(i);
12078 if (V.isUndef())
12079 continue;
12080 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12081 !isa<ConstantSDNode>(V.getOperand(1)) ||
12082 V.getOperand(0).getValueType().isScalableVector()) {
12083 LLVM_DEBUG(
12084 dbgs() << "Reshuffle failed: "
12085 "a shuffle can only come from building a vector from "
12086 "various elements of other fixed-width vectors, provided "
12087 "their indices are constant\n");
12088 return SDValue();
12089 }
12090
12091 // Add this element source to the list if it's not already there.
12092 SDValue SourceVec = V.getOperand(0);
12093 auto Source = find(Sources, SourceVec);
12094 if (Source == Sources.end())
12095 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
12096
12097 // Update the minimum and maximum lane number seen.
12098 unsigned EltNo = V.getConstantOperandVal(1);
12099 Source->MinElt = std::min(Source->MinElt, EltNo);
12100 Source->MaxElt = std::max(Source->MaxElt, EltNo);
12101 }
12102
12103 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
12104 // better than moving to/from gpr registers for larger vectors.
12105 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
12106 // Construct a mask for the tbl. We may need to adjust the index for types
12107 // larger than i8.
12109 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
12110 for (unsigned I = 0; I < NumElts; ++I) {
12111 SDValue V = Op.getOperand(I);
12112 if (V.isUndef()) {
12113 for (unsigned OF = 0; OF < OutputFactor; OF++)
12114 Mask.push_back(-1);
12115 continue;
12116 }
12117 // Set the Mask lanes adjusted for the size of the input and output
12118 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
12119 // output element, adjusted in their positions per input and output types.
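// For example, assuming 16-bit elements in both the sources and the output,
// an output lane taken from element E of source S contributes the two mask
// bytes 16 * S + 2 * E and 16 * S + 2 * E + 1.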
12120 unsigned Lane = V.getConstantOperandVal(1);
12121 for (unsigned S = 0; S < Sources.size(); S++) {
12122 if (V.getOperand(0) == Sources[S].Vec) {
12123 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
12124 unsigned InputBase = 16 * S + Lane * InputSize / 8;
12125 for (unsigned OF = 0; OF < OutputFactor; OF++)
12126 Mask.push_back(InputBase + OF);
12127 break;
12128 }
12129 }
12130 }
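    // As a worked example of the mask math above: for a v8i16 result
    // (OutputFactor = 2), an element taken from lane 5 of Sources[2] whose
    // elements are 16 bits wide gets InputBase = 16*2 + 5*16/8 = 42, so that
    // output element contributes the byte indices 42 and 43 to the mask.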
12131
12132 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
12133 // v16i8, and the TBLMask
12134 SmallVector<SDValue, 16> TBLOperands;
12135 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
12136 ? Intrinsic::aarch64_neon_tbl3
12137 : Intrinsic::aarch64_neon_tbl4,
12138 dl, MVT::i32));
12139 for (unsigned i = 0; i < Sources.size(); i++) {
12140 SDValue Src = Sources[i].Vec;
12141 EVT SrcVT = Src.getValueType();
12142 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
12143 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
12144 "Expected a legally typed vector");
12145 if (SrcVT.is64BitVector())
12146 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
12147 DAG.getUNDEF(MVT::v8i8));
12148 TBLOperands.push_back(Src);
12149 }
12150
12151 SmallVector<SDValue, 16> TBLMask;
12152 for (unsigned i = 0; i < Mask.size(); i++)
12153 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
12154 assert((Mask.size() == 8 || Mask.size() == 16) &&
12155 "Expected a v8i8 or v16i8 Mask");
12156 TBLOperands.push_back(
12157 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
12158
12159 SDValue Shuffle =
12160 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
12161 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
12162 return DAG.getBitcast(VT, Shuffle);
12163 }
12164
12165 if (Sources.size() > 2) {
12166 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
12167 << "sensible when at most two source vectors are "
12168 << "involved\n");
12169 return SDValue();
12170 }
12171
12172 // Find out the smallest element size among result and two sources, and use
12173 // it as element size to build the shuffle_vector.
12174 EVT SmallestEltTy = VT.getVectorElementType();
12175 for (auto &Source : Sources) {
12176 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
12177 if (SrcEltTy.bitsLT(SmallestEltTy)) {
12178 SmallestEltTy = SrcEltTy;
12179 }
12180 }
12181 unsigned ResMultiplier =
12182 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12183 uint64_t VTSize = VT.getFixedSizeInBits();
12184 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
12185 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
12186
12187 // If the source vector is too wide or too narrow, we may nevertheless be able
12188 // to construct a compatible shuffle either by concatenating it with UNDEF or
12189 // extracting a suitable range of elements.
12190 for (auto &Src : Sources) {
12191 EVT SrcVT = Src.ShuffleVec.getValueType();
12192
12193 TypeSize SrcVTSize = SrcVT.getSizeInBits();
12194 if (SrcVTSize == TypeSize::getFixed(VTSize))
12195 continue;
12196
12197 // This stage of the search produces a source with the same element type as
12198 // the original, but with a total width matching the BUILD_VECTOR output.
12199 EVT EltVT = SrcVT.getVectorElementType();
12200 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
12201 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
12202
12203 if (SrcVTSize.getFixedValue() < VTSize) {
12204 assert(2 * SrcVTSize == VTSize);
12205 // We can pad out the smaller vector for free, so if it's part of a
12206 // shuffle...
12207 Src.ShuffleVec =
12208 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
12209 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
12210 continue;
12211 }
12212
12213 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
12214 LLVM_DEBUG(
12215 dbgs() << "Reshuffle failed: result vector too small to extract\n");
12216 return SDValue();
12217 }
12218
12219 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
12220 LLVM_DEBUG(
12221 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
12222 return SDValue();
12223 }
12224
12225 if (Src.MinElt >= NumSrcElts) {
12226 // The extraction can just take the second half
12227 Src.ShuffleVec =
12228 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12229 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12230 Src.WindowBase = -NumSrcElts;
12231 } else if (Src.MaxElt < NumSrcElts) {
12232 // The extraction can just take the first half
12233 Src.ShuffleVec =
12234 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12235 DAG.getConstant(0, dl, MVT::i64));
12236 } else {
12237 // An actual VEXT is needed
12238 SDValue VEXTSrc1 =
12239 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12240 DAG.getConstant(0, dl, MVT::i64));
12241 SDValue VEXTSrc2 =
12242 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
12243 DAG.getConstant(NumSrcElts, dl, MVT::i64));
12244 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
12245
12246 if (!SrcVT.is64BitVector()) {
12247 LLVM_DEBUG(
12248 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
12249 "for SVE vectors.");
12250 return SDValue();
12251 }
12252
12253 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
12254 VEXTSrc2,
12255 DAG.getConstant(Imm, dl, MVT::i32));
12256 Src.WindowBase = -Src.MinElt;
12257 }
12258 }
12259
12260 // Another possible incompatibility occurs from the vector element types. We
12261 // can fix this by bitcasting the source vectors to the same type we intend
12262 // for the shuffle.
12263 for (auto &Src : Sources) {
12264 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
12265 if (SrcEltTy == SmallestEltTy)
12266 continue;
12267 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
12268 if (DAG.getDataLayout().isBigEndian()) {
12269 Src.ShuffleVec =
12270 DAG.getNode(AArch64ISD::NVCAST, dl, ShuffleVT, Src.ShuffleVec);
12271 } else {
12272 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
12273 }
12274 Src.WindowScale =
12275 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
12276 Src.WindowBase *= Src.WindowScale;
12277 }
12278
12279 // Final check before we try to actually produce a shuffle.
12280 LLVM_DEBUG(for (auto Src
12281 : Sources)
12282 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
12283
12284 // The stars all align, our next step is to produce the mask for the shuffle.
12285 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
12286 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
12287 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
12288 SDValue Entry = Op.getOperand(i);
12289 if (Entry.isUndef())
12290 continue;
12291
12292 auto Src = find(Sources, Entry.getOperand(0));
12293 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
12294
12295 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
12296 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
12297 // segment.
12298 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
12299 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
12300 VT.getScalarSizeInBits());
12301 int LanesDefined = BitsDefined / BitsPerShuffleLane;
12302
12303 // This source is expected to fill ResMultiplier lanes of the final shuffle,
12304 // starting at the appropriate offset.
12305 int *LaneMask = &Mask[i * ResMultiplier];
12306
12307 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
12308 ExtractBase += NumElts * (Src - Sources.begin());
12309 for (int j = 0; j < LanesDefined; ++j)
12310 LaneMask[j] = ExtractBase + j;
12311 }
12312
12313 // Final check before we try to produce nonsense...
12314 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
12315 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
12316 return SDValue();
12317 }
12318
12319 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
12320 for (unsigned i = 0; i < Sources.size(); ++i)
12321 ShuffleOps[i] = Sources[i].ShuffleVec;
12322
12323 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
12324 ShuffleOps[1], Mask);
12325 SDValue V;
12326 if (DAG.getDataLayout().isBigEndian()) {
12327 V = DAG.getNode(AArch64ISD::NVCAST, dl, VT, Shuffle);
12328 } else {
12329 V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
12330 }
12331
12332 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
12333 dbgs() << "Reshuffle, creating node: "; V.dump(););
12334
12335 return V;
12336}
12337
12338// check if an EXT instruction can handle the shuffle mask when the
12339// vector sources of the shuffle are the same.
12340static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
12341 unsigned NumElts = VT.getVectorNumElements();
12342
12343 // Assume that the first shuffle index is not UNDEF. Fail if it is.
12344 if (M[0] < 0)
12345 return false;
12346
12347 Imm = M[0];
12348
12349 // If this is a VEXT shuffle, the immediate value is the index of the first
12350 // element. The other shuffle indices must be the successive elements after
12351 // the first one.
12352 unsigned ExpectedElt = Imm;
12353 for (unsigned i = 1; i < NumElts; ++i) {
12354 // Increment the expected index. If it wraps around, just follow it
12355 // back to index zero and keep going.
12356 ++ExpectedElt;
12357 if (ExpectedElt == NumElts)
12358 ExpectedElt = 0;
12359
12360 if (M[i] < 0)
12361 continue; // ignore UNDEF indices
12362 if (ExpectedElt != static_cast<unsigned>(M[i]))
12363 return false;
12364 }
12365
12366 return true;
12367}
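// For example, isSingletonEXTMask accepts the v4i16 mask <1, 2, 3, 0> with
// Imm = 1: the indices are successive and wrap from 3 back to 0, which is
// exactly an EXT of the vector with itself.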
12368
12369// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
12370// v4i32s. This is really a truncate, which we can construct out of (legal)
12371// concats and truncate nodes.
12373 if (V.getValueType() != MVT::v16i8)
12374 return SDValue();
12375 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
12376
12377 for (unsigned X = 0; X < 4; X++) {
12378 // Check the first item in each group is an extract from lane 0 of a v4i32
12379 // or v4i16.
12380 SDValue BaseExt = V.getOperand(X * 4);
12381 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12382 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
12383 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
12384 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
12385 BaseExt.getConstantOperandVal(1) != 0)
12386 return SDValue();
12387 SDValue Base = BaseExt.getOperand(0);
12388 // And check the other items are extracts from the same vector.
12389 for (unsigned Y = 1; Y < 4; Y++) {
12390 SDValue Ext = V.getOperand(X * 4 + Y);
12391 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
12392 Ext.getOperand(0) != Base ||
12393 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
12394 Ext.getConstantOperandVal(1) != Y)
12395 return SDValue();
12396 }
12397 }
12398
12399 // Turn the buildvector into a series of truncates and concats, which will
12400 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
12401 // concatenated together to produce 2 v8i16. These are both truncated and
12402 // concatenated together.
12403 SDLoc DL(V);
12404 SDValue Trunc[4] = {
12405 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
12406 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
12407 for (SDValue &V : Trunc)
12408 if (V.getValueType() == MVT::v4i32)
12409 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
12410 SDValue Concat0 =
12411 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
12412 SDValue Concat1 =
12413 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
12414 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
12415 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
12416 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
12417}
12418
12419/// Check if a vector shuffle corresponds to a DUP instruction with a larger
12420/// element width than the vector lane type. If that is the case, the function
12421/// returns true and writes the value of the DUP instruction lane operand into
12422/// DupLaneOp.
12423static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
12424 unsigned &DupLaneOp) {
12425 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
12426 "Only possible block sizes for wide DUP are: 16, 32, 64");
12427
12428 if (BlockSize <= VT.getScalarSizeInBits())
12429 return false;
12430 if (BlockSize % VT.getScalarSizeInBits() != 0)
12431 return false;
12432 if (VT.getSizeInBits() % BlockSize != 0)
12433 return false;
12434
12435 size_t SingleVecNumElements = VT.getVectorNumElements();
12436 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
12437 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
12438
12439 // We are looking for masks like
12440 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
12441 // might be replaced by 'undefined'. BlockIndices will eventually contain
12442 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
12443 // for the above examples)
12444 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
12445 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
12446 for (size_t I = 0; I < NumEltsPerBlock; I++) {
12447 int Elt = M[BlockIndex * NumEltsPerBlock + I];
12448 if (Elt < 0)
12449 continue;
12450 // For now we don't support shuffles that use the second operand
12451 if ((unsigned)Elt >= SingleVecNumElements)
12452 return false;
12453 if (BlockElts[I] < 0)
12454 BlockElts[I] = Elt;
12455 else if (BlockElts[I] != Elt)
12456 return false;
12457 }
12458
12459 // We found a candidate block (possibly with some undefs). It must be a
12460 // sequence of consecutive integers starting with a value divisible by
12461 // NumEltsPerBlock with some values possibly replaced by undef-s.
12462
12463 // Find first non-undef element
12464 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
12465 assert(FirstRealEltIter != BlockElts.end() &&
12466 "Shuffle with all-undefs must have been caught by previous cases, "
12467 "e.g. isSplat()");
12468 if (FirstRealEltIter == BlockElts.end()) {
12469 DupLaneOp = 0;
12470 return true;
12471 }
12472
12473 // Index of FirstRealElt in BlockElts
12474 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
12475
12476 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
12477 return false;
12478 // BlockElts[0] must have the following value if it isn't undef:
12479 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
12480
12481 // Check the first element
12482 if (Elt0 % NumEltsPerBlock != 0)
12483 return false;
12484 // Check that the sequence indeed consists of consecutive integers (modulo
12485 // undefs)
12486 for (size_t I = 0; I < NumEltsPerBlock; I++)
12487 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
12488 return false;
12489
12490 DupLaneOp = Elt0 / NumEltsPerBlock;
12491 return true;
12492}
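// For example, for a v8i8 shuffle mask <2, 3, 2, 3, 2, 3, 2, 3> and
// BlockSize = 16, isWideDUPMask finds NumEltsPerBlock = 2 and
// BlockElts = [2, 3], so Elt0 = 2 and DupLaneOp = 1, i.e. a 16-bit DUP of
// lane 1 of the source viewed as v4i16.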
12493
12494// check if an EXT instruction can handle the shuffle mask when the
12495// vector sources of the shuffle are different.
12496static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
12497 unsigned &Imm) {
12498 // Look for the first non-undef element.
12499 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
12500
12501 // Benefit from APInt to handle overflow when calculating expected element.
12502 unsigned NumElts = VT.getVectorNumElements();
12503 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
12504 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
12505 // The following shuffle indices must be the successive elements after the
12506 // first real element.
12507 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
12508 return Elt != ExpectedElt++ && Elt != -1;
12509 });
12510 if (FoundWrongElt)
12511 return false;
12512
12513 // The index of an EXT is the first element if it is not UNDEF.
12514 // Watch out for the beginning UNDEFs. The EXT index should be the expected
12515 // value of the first element. E.g.
12516 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
12517 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
12518 // ExpectedElt is the last mask index plus 1.
12519 Imm = ExpectedElt.getZExtValue();
12520
12521 // There are two different cases that require reversing the input vectors.
12522 // For example, for vector <4 x i32> we have the following cases,
12523 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
12524 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
12525 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
12526 // reversing the two input vectors.
12527 if (Imm < NumElts)
12528 ReverseEXT = true;
12529 else
12530 Imm -= NumElts;
12531
12532 return true;
12533}
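// For example, for a v4i32 shuffle mask <-1, -1, 7, 0>, ExpectedElt wraps to
// 0 after the leading undefs and ends at 1, so Imm = 1 < NumElts and
// ReverseEXT is set: the caller swaps the inputs and emits an EXT selecting
// the concatenated lanes <5, 6, 7, 0>.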
12534
12535/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
12536/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12537/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
12538static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12539 unsigned NumElts = VT.getVectorNumElements();
12540 if (NumElts % 2 != 0)
12541 return false;
12542 WhichResult = (M[0] == 0 ? 0 : 1);
12543 unsigned Idx = WhichResult * NumElts / 2;
12544 for (unsigned i = 0; i != NumElts; i += 2) {
12545 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
12546 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
12547 return false;
12548 Idx += 1;
12549 }
12550
12551 return true;
12552}
12553
12554/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
12555/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12556/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
12557static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12558 unsigned Half = VT.getVectorNumElements() / 2;
12559 WhichResult = (M[0] == 0 ? 0 : 1);
12560 for (unsigned j = 0; j != 2; ++j) {
12561 unsigned Idx = WhichResult;
12562 for (unsigned i = 0; i != Half; ++i) {
12563 int MIdx = M[i + j * Half];
12564 if (MIdx >= 0 && (unsigned)MIdx != Idx)
12565 return false;
12566 Idx += 2;
12567 }
12568 }
12569
12570 return true;
12571}
12572
12573/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
12574/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
12575/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
12576static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
12577 unsigned NumElts = VT.getVectorNumElements();
12578 if (NumElts % 2 != 0)
12579 return false;
12580 WhichResult = (M[0] == 0 ? 0 : 1);
12581 for (unsigned i = 0; i < NumElts; i += 2) {
12582 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
12583 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
12584 return false;
12585 }
12586 return true;
12587}
12588
12589static bool isINSMask(ArrayRef<int> M, int NumInputElements,
12590 bool &DstIsLeft, int &Anomaly) {
12591 if (M.size() != static_cast<size_t>(NumInputElements))
12592 return false;
12593
12594 int NumLHSMatch = 0, NumRHSMatch = 0;
12595 int LastLHSMismatch = -1, LastRHSMismatch = -1;
12596
12597 for (int i = 0; i < NumInputElements; ++i) {
12598 if (M[i] == -1) {
12599 ++NumLHSMatch;
12600 ++NumRHSMatch;
12601 continue;
12602 }
12603
12604 if (M[i] == i)
12605 ++NumLHSMatch;
12606 else
12607 LastLHSMismatch = i;
12608
12609 if (M[i] == i + NumInputElements)
12610 ++NumRHSMatch;
12611 else
12612 LastRHSMismatch = i;
12613 }
12614
12615 if (NumLHSMatch == NumInputElements - 1) {
12616 DstIsLeft = true;
12617 Anomaly = LastLHSMismatch;
12618 return true;
12619 } else if (NumRHSMatch == NumInputElements - 1) {
12620 DstIsLeft = false;
12621 Anomaly = LastRHSMismatch;
12622 return true;
12623 }
12624
12625 return false;
12626}
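// For example, with 4 input elements isINSMask treats the mask <0, 1, 6, 3>
// as matching the LHS in three lanes with a single anomaly at index 2, so
// DstIsLeft = true and Anomaly = 2; the caller lowers this to inserting
// lane 2 of V2 into lane 2 of V1.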
12627
12628static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
12629 if (VT.getSizeInBits() != 128)
12630 return false;
12631
12632 unsigned NumElts = VT.getVectorNumElements();
12633
12634 for (int I = 0, E = NumElts / 2; I != E; I++) {
12635 if (Mask[I] != I)
12636 return false;
12637 }
12638
12639 int Offset = NumElts / 2;
12640 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
12641 if (Mask[I] != I + SplitLHS * Offset)
12642 return false;
12643 }
12644
12645 return true;
12646}
12647
12648static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
12649 SDLoc DL(Op);
12650 EVT VT = Op.getValueType();
12651 SDValue V0 = Op.getOperand(0);
12652 SDValue V1 = Op.getOperand(1);
12653 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
12654
12655 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
12656 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
12657 return SDValue();
12658
12659 bool SplitV0 = V0.getValueSizeInBits() == 128;
12660
12661 if (!isConcatMask(Mask, VT, SplitV0))
12662 return SDValue();
12663
12664 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
12665 if (SplitV0) {
12666 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
12667 DAG.getConstant(0, DL, MVT::i64));
12668 }
12669 if (V1.getValueSizeInBits() == 128) {
12670 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
12671 DAG.getConstant(0, DL, MVT::i64));
12672 }
12673 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
12674}
12675
12676/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
12677/// the specified operations to build the shuffle. ID is the perfect-shuffle
12678/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
12679/// table entry and LHS/RHS are the immediate inputs for this stage of the
12680/// shuffle.
12681static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
12682 SDValue V2, unsigned PFEntry, SDValue LHS,
12683 SDValue RHS, SelectionDAG &DAG,
12684 const SDLoc &dl) {
12685 unsigned OpNum = (PFEntry >> 26) & 0x0F;
12686 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
12687 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
12688
12689 enum {
12690 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
12691 OP_VREV,
12692 OP_VDUP0,
12693 OP_VDUP1,
12694 OP_VDUP2,
12695 OP_VDUP3,
12696 OP_VEXT1,
12697 OP_VEXT2,
12698 OP_VEXT3,
12699 OP_VUZPL, // VUZP, left result
12700 OP_VUZPR, // VUZP, right result
12701 OP_VZIPL, // VZIP, left result
12702 OP_VZIPR, // VZIP, right result
12703 OP_VTRNL, // VTRN, left result
12704 OP_VTRNR, // VTRN, right result
12705 OP_MOVLANE // Move lane. RHSID is the lane to move into
12706 };
12707
12708 if (OpNum == OP_COPY) {
12709 if (LHSID == (1 * 9 + 2) * 9 + 3)
12710 return LHS;
12711 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
12712 return RHS;
12713 }
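  // Note that a perfect-shuffle ID encodes its four lanes as base-9 digits
  // (8 meaning undef): (1*9+2)*9+3 = 102 is <0,1,2,3>, i.e. the LHS operand,
  // and ((4*9+5)*9+6)*9+7 = 3382 is <4,5,6,7>, i.e. the RHS operand.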
12714
12715 if (OpNum == OP_MOVLANE) {
12716 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
12717 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
12718 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
12719 Elt = 3 - Elt;
12720 while (Elt > 0) {
12721 ID /= 9;
12722 Elt--;
12723 }
12724 return (ID % 9 == 8) ? -1 : ID % 9;
12725 };
12726
12727 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
12728 // get the lane to move from the PFID, which is always from the
12729 // original vectors (V1 or V2).
12730 SDValue OpLHS = GeneratePerfectShuffle(
12731 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
12732 EVT VT = OpLHS.getValueType();
12733 assert(RHSID < 8 && "Expected a lane index for RHSID!");
12734 unsigned ExtLane = 0;
12735 SDValue Input;
12736
12737 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
12738 // convert into a higher type.
12739 if (RHSID & 0x4) {
12740 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
12741 if (MaskElt == -1)
12742 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
12743 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12744 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
12745 Input = MaskElt < 2 ? V1 : V2;
12746 if (VT.getScalarSizeInBits() == 16) {
12747 Input = DAG.getBitcast(MVT::v2f32, Input);
12748 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
12749 } else {
12750 assert(VT.getScalarSizeInBits() == 32 &&
12751 "Expected 16 or 32 bit shuffle elemements");
12752 Input = DAG.getBitcast(MVT::v2f64, Input);
12753 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
12754 }
12755 } else {
12756 int MaskElt = getPFIDLane(ID, RHSID);
12757 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
12758 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
12759 Input = MaskElt < 4 ? V1 : V2;
12760 // Be careful about creating illegal types. Use f16 instead of i16.
12761 if (VT == MVT::v4i16) {
12762 Input = DAG.getBitcast(MVT::v4f16, Input);
12763 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
12764 }
12765 }
12766 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
12767 Input.getValueType().getVectorElementType(),
12768 Input, DAG.getVectorIdxConstant(ExtLane, dl));
12769 SDValue Ins =
12770 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
12771 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
12772 return DAG.getBitcast(VT, Ins);
12773 }
12774
12775 SDValue OpLHS, OpRHS;
12776 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
12777 RHS, DAG, dl);
12778 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
12779 RHS, DAG, dl);
12780 EVT VT = OpLHS.getValueType();
12781
12782 switch (OpNum) {
12783 default:
12784 llvm_unreachable("Unknown shuffle opcode!");
12785 case OP_VREV:
12786 // VREV divides the vector in half and swaps within the half.
12787 if (VT.getVectorElementType() == MVT::i32 ||
12788 VT.getVectorElementType() == MVT::f32)
12789 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
12790 // vrev <4 x i16> -> REV32
12791 if (VT.getVectorElementType() == MVT::i16 ||
12792 VT.getVectorElementType() == MVT::f16 ||
12793 VT.getVectorElementType() == MVT::bf16)
12794 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
12795 // vrev <4 x i8> -> REV16
12796 assert(VT.getVectorElementType() == MVT::i8);
12797 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
12798 case OP_VDUP0:
12799 case OP_VDUP1:
12800 case OP_VDUP2:
12801 case OP_VDUP3: {
12802 EVT EltTy = VT.getVectorElementType();
12803 unsigned Opcode;
12804 if (EltTy == MVT::i8)
12805 Opcode = AArch64ISD::DUPLANE8;
12806 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
12807 Opcode = AArch64ISD::DUPLANE16;
12808 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
12809 Opcode = AArch64ISD::DUPLANE32;
12810 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
12811 Opcode = AArch64ISD::DUPLANE64;
12812 else
12813 llvm_unreachable("Invalid vector element type?");
12814
12815 if (VT.getSizeInBits() == 64)
12816 OpLHS = WidenVector(OpLHS, DAG);
12817 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
12818 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
12819 }
12820 case OP_VEXT1:
12821 case OP_VEXT2:
12822 case OP_VEXT3: {
12823 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
12824 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
12825 DAG.getConstant(Imm, dl, MVT::i32));
12826 }
12827 case OP_VUZPL:
12828 return DAG.getNode(AArch64ISD::UZP1, dl, VT, OpLHS, OpRHS);
12829 case OP_VUZPR:
12830 return DAG.getNode(AArch64ISD::UZP2, dl, VT, OpLHS, OpRHS);
12831 case OP_VZIPL:
12832 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, OpLHS, OpRHS);
12833 case OP_VZIPR:
12834 return DAG.getNode(AArch64ISD::ZIP2, dl, VT, OpLHS, OpRHS);
12835 case OP_VTRNL:
12836 return DAG.getNode(AArch64ISD::TRN1, dl, VT, OpLHS, OpRHS);
12837 case OP_VTRNR:
12838 return DAG.getNode(AArch64ISD::TRN2, dl, VT, OpLHS, OpRHS);
12839 }
12840}
12841
12842static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
12843 SelectionDAG &DAG) {
12844 // Check to see if we can use the TBL instruction.
12845 SDValue V1 = Op.getOperand(0);
12846 SDValue V2 = Op.getOperand(1);
12847 SDLoc DL(Op);
12848
12849 EVT EltVT = Op.getValueType().getVectorElementType();
12850 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
12851
12852 bool Swap = false;
12853 if (V1.isUndef() || isZerosVector(V1.getNode())) {
12854 std::swap(V1, V2);
12855 Swap = true;
12856 }
12857
12858 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
12859 // out of range values with 0s. We do need to make sure that any out-of-range
12860 // values are really out-of-range for a v16i8 vector.
12861 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
12862 MVT IndexVT = MVT::v8i8;
12863 unsigned IndexLen = 8;
12864 if (Op.getValueSizeInBits() == 128) {
12865 IndexVT = MVT::v16i8;
12866 IndexLen = 16;
12867 }
12868
12869 SmallVector<SDValue, 8> TBLMask;
12870 for (int Val : ShuffleMask) {
12871 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
12872 unsigned Offset = Byte + Val * BytesPerElt;
12873 if (Swap)
12874 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
12875 if (IsUndefOrZero && Offset >= IndexLen)
12876 Offset = 255;
12877 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
12878 }
12879 }
12880
12881 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
12882 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
12883
12884 SDValue Shuffle;
12885 if (IsUndefOrZero) {
12886 if (IndexLen == 8)
12887 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
12888 Shuffle = DAG.getNode(
12889 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12890 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12891 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12892 } else {
12893 if (IndexLen == 8) {
12894 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
12895 Shuffle = DAG.getNode(
12896 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12897 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
12898 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12899 } else {
12900 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
12901 // cannot currently represent the register constraints on the input
12902 // table registers.
12903 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
12904 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
12905 // IndexLen));
12906 Shuffle = DAG.getNode(
12907 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
12908 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
12909 V2Cst,
12910 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
12911 }
12912 }
12913 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
12914}
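// For example, for a v4i16 shuffle GenerateTBL indexes the table in bytes
// (BytesPerElt = 2), so mask element 5 (lane 1 of V2) expands to byte indices
// 10 and 11 of the concatenated V1:V2 table; when V2 is undef or zero those
// out-of-range indices are set to 255 so that TBL1 yields 0 for them.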
12915
12916static unsigned getDUPLANEOp(EVT EltType) {
12917 if (EltType == MVT::i8)
12918 return AArch64ISD::DUPLANE8;
12919 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
12920 return AArch64ISD::DUPLANE16;
12921 if (EltType == MVT::i32 || EltType == MVT::f32)
12922 return AArch64ISD::DUPLANE32;
12923 if (EltType == MVT::i64 || EltType == MVT::f64)
12924 return AArch64ISD::DUPLANE64;
12925
12926 llvm_unreachable("Invalid vector element type?");
12927}
12928
12929static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
12930 unsigned Opcode, SelectionDAG &DAG) {
12931 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
12932 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
12933 // Match: dup (bitcast (extract_subv X, C)), LaneC
12934 if (BitCast.getOpcode() != ISD::BITCAST ||
12935 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
12936 return false;
12937
12938 // The extract index must align in the destination type. That may not
12939 // happen if the bitcast is from narrow to wide type.
12940 SDValue Extract = BitCast.getOperand(0);
12941 unsigned ExtIdx = Extract.getConstantOperandVal(1);
12942 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
12943 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
12944 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
12945 if (ExtIdxInBits % CastedEltBitWidth != 0)
12946 return false;
12947
12948 // Can't handle cases where vector size is not 128-bit
12949 if (!Extract.getOperand(0).getValueType().is128BitVector())
12950 return false;
12951
12952 // Update the lane value by offsetting with the scaled extract index.
12953 LaneC += ExtIdxInBits / CastedEltBitWidth;
12954
12955 // Determine the casted vector type of the wide vector input.
12956 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
12957 // Examples:
12958 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
12959 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
12960 unsigned SrcVecNumElts =
12961 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
12962 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
12963 SrcVecNumElts);
12964 return true;
12965 };
12966 MVT CastVT;
12967 if (getScaledOffsetDup(V, Lane, CastVT)) {
12968 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
12969 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12970 V.getOperand(0).getValueType().is128BitVector()) {
12971 // The lane is incremented by the index of the extract.
12972 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
12973 Lane += V.getConstantOperandVal(1);
12974 V = V.getOperand(0);
12975 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
12976 // The lane is decremented if we are splatting from the 2nd operand.
12977 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
12978 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
12979 Lane -= Idx * VT.getVectorNumElements() / 2;
12980 V = WidenVector(V.getOperand(Idx), DAG);
12981 } else if (VT.getSizeInBits() == 64) {
12982 // Widen the operand to 128-bit register with undef.
12983 V = WidenVector(V, DAG);
12984 }
12985 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
12986}
12987
12988// Return true if we can get a new shuffle mask by checking the parameter mask
12989// array to test whether every two adjacent mask values are consecutive and
12990// start from an even number.
12991static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
12992 SmallVectorImpl<int> &NewMask) {
12993 unsigned NumElts = VT.getVectorNumElements();
12994 if (NumElts % 2 != 0)
12995 return false;
12996
12997 NewMask.clear();
12998 for (unsigned i = 0; i < NumElts; i += 2) {
12999 int M0 = M[i];
13000 int M1 = M[i + 1];
13001
13002 // If both elements are undef, new mask is undef too.
13003 if (M0 == -1 && M1 == -1) {
13004 NewMask.push_back(-1);
13005 continue;
13006 }
13007
13008 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
13009 NewMask.push_back(M1 / 2);
13010 continue;
13011 }
13012
13013 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
13014 NewMask.push_back(M0 / 2);
13015 continue;
13016 }
13017
13018 NewMask.clear();
13019 return false;
13020 }
13021
13022 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
13023 return true;
13024}
13025
13026// Try to widen element type to get a new mask value for a better permutation
13027// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
13028// UZP1/2, TRN1/2, REV, INS, etc.
13029// For example:
13030// shufflevector <4 x i32> %a, <4 x i32> %b,
13031// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
13032// is equivalent to:
13033// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
13034// Finally, we can get:
13035// mov v0.d[0], v1.d[1]
13036static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
13037 SDLoc DL(Op);
13038 EVT VT = Op.getValueType();
13039 EVT ScalarVT = VT.getVectorElementType();
13040 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
13041 SDValue V0 = Op.getOperand(0);
13042 SDValue V1 = Op.getOperand(1);
13043 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
13044
13045 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
13046 // We need to make sure the wider element type is legal. Thus, ElementSize
13047 // should not be larger than 32 bits, and the i1 type should also be excluded.
13048 if (ElementSize > 32 || ElementSize == 1)
13049 return SDValue();
13050
13051 SmallVector<int, 8> NewMask;
13052 if (isWideTypeMask(Mask, VT, NewMask)) {
13053 MVT NewEltVT = VT.isFloatingPoint()
13054 ? MVT::getFloatingPointVT(ElementSize * 2)
13055 : MVT::getIntegerVT(ElementSize * 2);
13056 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13057 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13058 V0 = DAG.getBitcast(NewVT, V0);
13059 V1 = DAG.getBitcast(NewVT, V1);
13060 return DAG.getBitcast(VT,
13061 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
13062 }
13063 }
13064
13065 return SDValue();
13066}
13067
13068// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
13069static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
13070 ArrayRef<int> ShuffleMask,
13071 SelectionDAG &DAG) {
13072 SDValue Tbl1 = Op->getOperand(0);
13073 SDValue Tbl2 = Op->getOperand(1);
13074 SDLoc dl(Op);
13075 SDValue Tbl2ID =
13076 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, dl, MVT::i64);
13077
13078 EVT VT = Op.getValueType();
13079 if (Tbl1->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13080 Tbl1->getOperand(0) != Tbl2ID ||
13081 Tbl2->getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
13082 Tbl2->getOperand(0) != Tbl2ID)
13083 return SDValue();
13084
13085 if (Tbl1->getValueType(0) != MVT::v16i8 ||
13086 Tbl2->getValueType(0) != MVT::v16i8)
13087 return SDValue();
13088
13089 SDValue Mask1 = Tbl1->getOperand(3);
13090 SDValue Mask2 = Tbl2->getOperand(3);
13091 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
13092 for (unsigned I = 0; I < 16; I++) {
13093 if (ShuffleMask[I] < 16)
13094 TBLMaskParts[I] = Mask1->getOperand(ShuffleMask[I]);
13095 else {
13096 auto *C =
13097 dyn_cast<ConstantSDNode>(Mask2->getOperand(ShuffleMask[I] - 16));
13098 if (!C)
13099 return SDValue();
13100 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, dl, MVT::i32);
13101 }
13102 }
13103
13104 SDValue TBLMask = DAG.getBuildVector(VT, dl, TBLMaskParts);
13105 SDValue ID =
13106 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, dl, MVT::i64);
13107
13108 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v16i8,
13109 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
13110 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
13111}
13112
13113// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
13114// but we don't have an appropriate instruction,
13115// so custom-lower it as ZIP1-with-zeros.
13116SDValue
13117AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
13118 SelectionDAG &DAG) const {
13119 SDLoc dl(Op);
13120 EVT VT = Op.getValueType();
13121 SDValue SrcOp = Op.getOperand(0);
13122 EVT SrcVT = SrcOp.getValueType();
13123 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
13124 "Unexpected extension factor.");
13125 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
13126 // FIXME: support multi-step zipping?
13127 if (Scale != 2)
13128 return SDValue();
13129 SDValue Zeros = DAG.getConstant(0, dl, SrcVT);
13130 return DAG.getBitcast(VT,
13131 DAG.getNode(AArch64ISD::ZIP1, dl, SrcVT, SrcOp, Zeros));
13132}
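// For example, zero-extending the low eight i8 lanes of a v16i8 to v8i16 is
// lowered here as ZIP1(v16i8 Src, zeros), producing <s0,0,s1,0,...,s7,0>,
// which is the little-endian v8i16 zero-extension once bitcast to the result
// type.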
13133
13134SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
13135 SelectionDAG &DAG) const {
13136 SDLoc dl(Op);
13137 EVT VT = Op.getValueType();
13138
13139 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
13140
13141 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13142 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
13143
13144 // Convert shuffles that are directly supported on NEON to target-specific
13145 // DAG nodes, instead of keeping them as shuffles and matching them again
13146 // during code selection. This is more efficient and avoids the possibility
13147 // of inconsistencies between legalization and selection.
13148 ArrayRef<int> ShuffleMask = SVN->getMask();
13149
13150 SDValue V1 = Op.getOperand(0);
13151 SDValue V2 = Op.getOperand(1);
13152
13153 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
13154 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
13155 "Unexpected VECTOR_SHUFFLE mask size!");
13156
13157 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
13158 return Res;
13159
13160 if (SVN->isSplat()) {
13161 int Lane = SVN->getSplatIndex();
13162 // If this is undef splat, generate it via "just" vdup, if possible.
13163 if (Lane == -1)
13164 Lane = 0;
13165
13166 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
13167 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
13168 V1.getOperand(0));
13169 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
13170 // constant. If so, we can just reference the lane's definition directly.
13171 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
13172 !isa<ConstantSDNode>(V1.getOperand(Lane)))
13173 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
13174
13175 // Otherwise, duplicate from the lane of the input vector.
13176 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
13177 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
13178 }
13179
13180 // Check if the mask matches a DUP for a wider element
13181 for (unsigned LaneSize : {64U, 32U, 16U}) {
13182 unsigned Lane = 0;
13183 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
13184 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
13185 : LaneSize == 32 ? AArch64ISD::DUPLANE32
13186 : AArch64ISD::DUPLANE16;
13187 // Cast V1 to an integer vector with required lane size
13188 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
13189 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
13190 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
13191 V1 = DAG.getBitcast(NewVecTy, V1);
13192 // Construct the DUP instruction
13193 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
13194 // Cast back to the original type
13195 return DAG.getBitcast(VT, V1);
13196 }
13197 }
13198
13199 unsigned NumElts = VT.getVectorNumElements();
13200 unsigned EltSize = VT.getScalarSizeInBits();
13201 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
13202 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
13203 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
13204 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
13205 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
13206 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
13207
13208 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
13209 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
13210 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
13211 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
13212 DAG.getConstant(8, dl, MVT::i32));
13213 }
13214
13215 bool ReverseEXT = false;
13216 unsigned Imm;
13217 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
13218 if (ReverseEXT)
13219 std::swap(V1, V2);
13220 Imm *= getExtFactor(V1);
13221 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
13222 DAG.getConstant(Imm, dl, MVT::i32));
13223 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
13224 Imm *= getExtFactor(V1);
13225 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
13226 DAG.getConstant(Imm, dl, MVT::i32));
13227 }
13228
13229 unsigned WhichResult;
13230 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
13231 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13232 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13233 }
13234 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
13235 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13236 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13237 }
13238 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
13239 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13240 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
13241 }
13242
13243 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13244 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
13245 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13246 }
13247 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13248 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
13249 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13250 }
13251 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
13252 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
13253 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
13254 }
13255
13256 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
13257 return Concat;
13258
13259 bool DstIsLeft;
13260 int Anomaly;
13261 int NumInputElements = V1.getValueType().getVectorNumElements();
13262 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
13263 SDValue DstVec = DstIsLeft ? V1 : V2;
13264 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
13265
13266 SDValue SrcVec = V1;
13267 int SrcLane = ShuffleMask[Anomaly];
13268 if (SrcLane >= NumInputElements) {
13269 SrcVec = V2;
13270 SrcLane -= NumElts;
13271 }
13272 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
13273
13274 EVT ScalarVT = VT.getVectorElementType();
13275
13276 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
13277 ScalarVT = MVT::i32;
13278
13279 return DAG.getNode(
13280 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
13281 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
13282 DstLaneV);
13283 }
13284
13285 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
13286 return NewSD;
13287
13288 // If the shuffle is not directly supported and it has 4 elements, use
13289 // the PerfectShuffle-generated table to synthesize it from other shuffles.
13290 if (NumElts == 4) {
13291 unsigned PFIndexes[4];
13292 for (unsigned i = 0; i != 4; ++i) {
13293 if (ShuffleMask[i] < 0)
13294 PFIndexes[i] = 8;
13295 else
13296 PFIndexes[i] = ShuffleMask[i];
13297 }
13298
13299 // Compute the index in the perfect shuffle table.
13300 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
13301 PFIndexes[2] * 9 + PFIndexes[3];
13302 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
13303 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
13304 dl);
13305 }
13306
13307 return GenerateTBL(Op, ShuffleMask, DAG);
13308}
13309
13310SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
13311 SelectionDAG &DAG) const {
13312 EVT VT = Op.getValueType();
13313
13314 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
13315 return LowerToScalableOp(Op, DAG);
13316
13317 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
13318 "Unexpected vector type!");
13319
13320 // We can handle the constant cases during isel.
13321 if (isa<ConstantSDNode>(Op.getOperand(0)))
13322 return Op;
13323
13324 // There isn't a natural way to handle the general i1 case, so we use some
13325 // trickery with whilelo.
13326 SDLoc DL(Op);
13327 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
13328 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
13329 DAG.getValueType(MVT::i1));
13330 SDValue ID =
13331 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
13332 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
13333 if (VT == MVT::nxv1i1)
13334 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
13335 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
13336 Zero, SplatVal),
13337 Zero);
13338 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
13339}
13340
13341SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
13342 SelectionDAG &DAG) const {
13343 SDLoc DL(Op);
13344
13345 EVT VT = Op.getValueType();
13346 if (!isTypeLegal(VT) || !VT.isScalableVector())
13347 return SDValue();
13348
13349 // Current lowering only supports the SVE-ACLE types.
13350 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
13351 return SDValue();
13352
13353 // The DUPQ operation is independent of element type so normalise to i64s.
13354 SDValue Idx128 = Op.getOperand(2);
13355
13356 // DUPQ can be used when idx is in range.
13357 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
13358 if (CIdx && (CIdx->getZExtValue() <= 3)) {
13359 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
13360 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
13361 }
13362
13363 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
13364
13365 // The ACLE says this must produce the same result as:
13366 // svtbl(data, svadd_x(svptrue_b64(),
13367 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
13368 // index * 2))
13369 SDValue One = DAG.getConstant(1, DL, MVT::i64);
13370 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
13371
13372 // create the vector 0,1,0,1,...
13373 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
13374 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
13375
13376 // create the vector idx64,idx64+1,idx64,idx64+1,...
13377 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
13378 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
13379 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
13380
13381 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
13382 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
13383 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
13384}
13385
13386
13387static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
13388 APInt &UndefBits) {
13389 EVT VT = BVN->getValueType(0);
13390 APInt SplatBits, SplatUndef;
13391 unsigned SplatBitSize;
13392 bool HasAnyUndefs;
13393 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
13394 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
13395
13396 for (unsigned i = 0; i < NumSplats; ++i) {
13397 CnstBits <<= SplatBitSize;
13398 UndefBits <<= SplatBitSize;
13399 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
13400 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
13401 }
13402
13403 return true;
13404 }
13405
13406 return false;
13407}
13408
13409// Try 64-bit splatted SIMD immediate.
13410static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13411 const APInt &Bits) {
13412 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13413 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13414 EVT VT = Op.getValueType();
13415 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
13416
13417 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
13418 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
13419
13420 SDLoc dl(Op);
13421 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13422 DAG.getConstant(Value, dl, MVT::i32));
13423 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13424 }
13425 }
13426
13427 return SDValue();
13428}
13429
13430// Try 32-bit splatted SIMD immediate.
13431static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13432 const APInt &Bits,
13433 const SDValue *LHS = nullptr) {
13434 EVT VT = Op.getValueType();
13435 if (VT.isFixedLengthVector() &&
13436 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13437 return SDValue();
13438
13439 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13440 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13441 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13442 bool isAdvSIMDModImm = false;
13443 uint64_t Shift;
13444
13445 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
13446 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
13447 Shift = 0;
13448 }
13449 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
13450 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
13451 Shift = 8;
13452 }
13453 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
13454 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
13455 Shift = 16;
13456 }
13457 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
13458 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
13459 Shift = 24;
13460 }
13461
13462 if (isAdvSIMDModImm) {
13463 SDLoc dl(Op);
13464 SDValue Mov;
13465
13466 if (LHS)
13467 Mov = DAG.getNode(NewOp, dl, MovTy,
13468 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13469 DAG.getConstant(Value, dl, MVT::i32),
13470 DAG.getConstant(Shift, dl, MVT::i32));
13471 else
13472 Mov = DAG.getNode(NewOp, dl, MovTy,
13473 DAG.getConstant(Value, dl, MVT::i32),
13474 DAG.getConstant(Shift, dl, MVT::i32));
13475
13476 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13477 }
13478 }
13479
13480 return SDValue();
13481}
13482
13483// Try 16-bit splatted SIMD immediate.
13484static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13485 const APInt &Bits,
13486 const SDValue *LHS = nullptr) {
13487 EVT VT = Op.getValueType();
13488 if (VT.isFixedLengthVector() &&
13489 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
13490 return SDValue();
13491
13492 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13493 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13494 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
13495 bool isAdvSIMDModImm = false;
13496 uint64_t Shift;
13497
13498 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
13499 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
13500 Shift = 0;
13501 }
13502 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
13503 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
13504 Shift = 8;
13505 }
13506
13507 if (isAdvSIMDModImm) {
13508 SDLoc dl(Op);
13509 SDValue Mov;
13510
13511 if (LHS)
13512 Mov = DAG.getNode(NewOp, dl, MovTy,
13513 DAG.getNode(AArch64ISD::NVCAST, dl, MovTy, *LHS),
13514 DAG.getConstant(Value, dl, MVT::i32),
13515 DAG.getConstant(Shift, dl, MVT::i32));
13516 else
13517 Mov = DAG.getNode(NewOp, dl, MovTy,
13518 DAG.getConstant(Value, dl, MVT::i32),
13519 DAG.getConstant(Shift, dl, MVT::i32));
13520
13521 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13522 }
13523 }
13524
13525 return SDValue();
13526}
13527
13528// Try 32-bit splatted SIMD immediate with shifted ones.
13529static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
13530 SelectionDAG &DAG, const APInt &Bits) {
13531 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13532 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13533 EVT VT = Op.getValueType();
13534 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
13535 bool isAdvSIMDModImm = false;
13536 uint64_t Shift;
13537
13538 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
13539 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
13540 Shift = 264;
13541 }
13542 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
13543 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
13544 Shift = 272;
13545 }
13546
13547 if (isAdvSIMDModImm) {
13548 SDLoc dl(Op);
13549 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13550 DAG.getConstant(Value, dl, MVT::i32),
13551 DAG.getConstant(Shift, dl, MVT::i32));
13552 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13553 }
13554 }
13555
13556 return SDValue();
13557}
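// The Shift values 264 (0x108) and 272 (0x110) used above correspond to the
// AArch64_AM shifter-immediate encodings of MSL #8 and MSL #16, the "shifted
// ones" forms of MOVI/MVNI.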
13558
13559// Try 8-bit splatted SIMD immediate.
13560static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13561 const APInt &Bits) {
13562 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13563 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13564 EVT VT = Op.getValueType();
13565 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
13566
13567 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
13568 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
13569
13570 SDLoc dl(Op);
13571 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13572 DAG.getConstant(Value, dl, MVT::i32));
13573 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13574 }
13575 }
13576
13577 return SDValue();
13578}
13579
13580// Try FP splatted SIMD immediate.
13581static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
13582 const APInt &Bits) {
13583 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
13584 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
13585 EVT VT = Op.getValueType();
13586 bool isWide = (VT.getSizeInBits() == 128);
13587 MVT MovTy;
13588 bool isAdvSIMDModImm = false;
13589
13590 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
13591 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
13592 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
13593 }
13594 else if (isWide &&
13595 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
13596 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
13597 MovTy = MVT::v2f64;
13598 }
13599
13600 if (isAdvSIMDModImm) {
13601 SDLoc dl(Op);
13602 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
13603 DAG.getConstant(Value, dl, MVT::i32));
13604 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
13605 }
13606 }
13607
13608 return SDValue();
13609}
13610
13611// Specialized code to quickly find if PotentialBVec is a BuildVector that
13612// consists of only the same constant int value, returned in reference arg
13613// ConstVal
13614static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
13615 uint64_t &ConstVal) {
13616 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
13617 if (!Bvec)
13618 return false;
13619 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
13620 if (!FirstElt)
13621 return false;
13622 EVT VT = Bvec->getValueType(0);
13623 unsigned NumElts = VT.getVectorNumElements();
13624 for (unsigned i = 1; i < NumElts; ++i)
13625 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
13626 return false;
13627 ConstVal = FirstElt->getZExtValue();
13628 return true;
13629}
13630
13631static bool isAllInactivePredicate(SDValue N) {
13632 // Look through cast.
13633 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
13634 N = N.getOperand(0);
13635
13636 return ISD::isConstantSplatVectorAllZeros(N.getNode());
13637}
13638
13639static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
13640 unsigned NumElts = N.getValueType().getVectorMinNumElements();
13641
13642 // Look through cast.
13643 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
13644 N = N.getOperand(0);
13645 // When reinterpreting from a type with fewer elements the "new" elements
13646 // are not active, so bail if they're likely to be used.
13647 if (N.getValueType().getVectorMinNumElements() < NumElts)
13648 return false;
13649 }
13650
13651 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
13652 return true;
13653
13654 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
13655 // or smaller than the implicit element type represented by N.
13656 // NOTE: A larger element count implies a smaller element type.
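// For example (illustrative): when N is used as an nxv4i1 predicate, a
// "ptrue p.s, all" or "ptrue p.b, all" counts as all active, whereas a
// "ptrue p.d, all" would only activate every other 32-bit lane.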
13657 if (N.getOpcode() == AArch64ISD::PTRUE &&
13658 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
13659 return N.getValueType().getVectorMinNumElements() >= NumElts;
13660
13661 // If we're compiling for a specific vector-length, we can check if the
13662 // pattern's VL equals that of the scalable vector at runtime.
13663 if (N.getOpcode() == AArch64ISD::PTRUE) {
13664 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
13665 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
13666 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
13667 if (MaxSVESize && MinSVESize == MaxSVESize) {
13668 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
13669 unsigned PatNumElts =
13670 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
13671 return PatNumElts == (NumElts * VScale);
13672 }
13673 }
13674
13675 return false;
13676}
13677
13678// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
13679// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
13680 // BUILD_VECTOR with constant element C1, C2 is a constant, and:
13681// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
13682// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
13683// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
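// For example (illustrative), with 8-bit elements and C2 == 4:
//   (or (and X, <0x0f,...>), (AArch64ISD::VSHL Y, 4))
//     ==> (AArch64ISD::VSLI X, Y, 4)
// because 0x0f == ~(0xff << 4), i.e. the AND keeps exactly the bits that the
// shifted-in value does not overwrite.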
13684 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
13685 EVT VT = N->getValueType(0);
13686
13687 if (!VT.isVector())
13688 return SDValue();
13689
13690 SDLoc DL(N);
13691
13692 SDValue And;
13693 SDValue Shift;
13694
13695 SDValue FirstOp = N->getOperand(0);
13696 unsigned FirstOpc = FirstOp.getOpcode();
13697 SDValue SecondOp = N->getOperand(1);
13698 unsigned SecondOpc = SecondOp.getOpcode();
13699
13700 // Is one of the operands an AND or a BICi? The AND may have been optimised to
13701 // a BICi in order to use an immediate instead of a register.
13702 // Is the other operand a shl or lshr? This will have been turned into:
13703 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
13704 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
13705 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
13706 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
13707 SecondOpc == AArch64ISD::SHL_PRED ||
13708 SecondOpc == AArch64ISD::SRL_PRED)) {
13709 And = FirstOp;
13710 Shift = SecondOp;
13711
13712 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
13713 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
13714 FirstOpc == AArch64ISD::SHL_PRED ||
13715 FirstOpc == AArch64ISD::SRL_PRED)) {
13716 And = SecondOp;
13717 Shift = FirstOp;
13718 } else
13719 return SDValue();
13720
13721 bool IsAnd = And.getOpcode() == ISD::AND;
13722 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
13723 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13724 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
13725 Shift.getOpcode() == AArch64ISD::SRL_PRED;
13726
13727 // Is the shift amount constant and are all lanes active?
13728 uint64_t C2;
13729 if (ShiftHasPredOp) {
13730 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
13731 return SDValue();
13732 APInt C;
13733 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
13734 return SDValue();
13735 C2 = C.getZExtValue();
13736 } else if (ConstantSDNode *C2node =
13737 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
13738 C2 = C2node->getZExtValue();
13739 else
13740 return SDValue();
13741
13742 APInt C1AsAPInt;
13743 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
13744 if (IsAnd) {
13745 // Is the and mask vector all constant?
13746 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
13747 return SDValue();
13748 } else {
13749 // Reconstruct the corresponding AND immediate from the two BICi immediates.
13750 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
13751 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
13752 assert(C1nodeImm && C1nodeShift);
13753 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
13754 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
13755 }
13756
13757 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
13758 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
13759 // how much one can shift elements of a particular size?
13760 if (C2 > ElemSizeInBits)
13761 return SDValue();
13762
13763 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
13764 : APInt::getLowBitsSet(ElemSizeInBits, C2);
13765 if (C1AsAPInt != RequiredC1)
13766 return SDValue();
13767
13768 SDValue X = And.getOperand(0);
13769 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
13770 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
13771 : Shift.getOperand(1);
13772
13773 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
13774 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Imm);
13775
13776 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
13777 LLVM_DEBUG(N->dump(&DAG));
13778 LLVM_DEBUG(dbgs() << "into: \n");
13779 LLVM_DEBUG(ResultSLI->dump(&DAG));
13780
13781 ++NumShiftInserts;
13782 return ResultSLI;
13783}
13784
13785SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
13786 SelectionDAG &DAG) const {
13787 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
13788 !Subtarget->isNeonAvailable()))
13789 return LowerToScalableOp(Op, DAG);
13790
13791 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
13792 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
13793 return Res;
13794
13795 EVT VT = Op.getValueType();
13796 if (VT.isScalableVector())
13797 return Op;
13798
13799 SDValue LHS = Op.getOperand(0);
13800 BuildVectorSDNode *BVN =
13801 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
13802 if (!BVN) {
13803 // OR commutes, so try swapping the operands.
13804 LHS = Op.getOperand(1);
13805 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
13806 }
13807 if (!BVN)
13808 return Op;
13809
13810 APInt DefBits(VT.getSizeInBits(), 0);
13811 APInt UndefBits(VT.getSizeInBits(), 0);
13812 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13813 SDValue NewOp;
13814
13815 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13816 DefBits, &LHS)) ||
13817 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13818 DefBits, &LHS)))
13819 return NewOp;
13820
13821 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
13822 UndefBits, &LHS)) ||
13823 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
13824 UndefBits, &LHS)))
13825 return NewOp;
13826 }
13827
13828 // We can always fall back to a non-immediate OR.
13829 return Op;
13830}
13831
13832// Normalize the operands of BUILD_VECTOR. The value of constant operands will
13833// be truncated to fit element width.
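// For example (illustrative): a v8i8 BUILD_VECTOR whose lanes are i32
// constants such as 0x1ff is rewritten so that each lane holds the value
// truncated to the element width (0xff), still as an i32 operand.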
13834 static SDValue NormalizeBuildVector(SDValue Op,
13835 SelectionDAG &DAG) {
13836 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13837 SDLoc dl(Op);
13838 EVT VT = Op.getValueType();
13839 EVT EltTy= VT.getVectorElementType();
13840
13841 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
13842 return Op;
13843
13844 SmallVector<SDValue, 16> Ops;
13845 for (SDValue Lane : Op->ops()) {
13846 // For integer vectors, type legalization would have promoted the
13847 // operands already. Otherwise, if Op is a floating-point splat
13848 // (with operands cast to integers), then the only possibilities
13849 // are constants and UNDEFs.
13850 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
13851 APInt LowBits(EltTy.getSizeInBits(),
13852 CstLane->getZExtValue());
13853 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
13854 } else if (Lane.getNode()->isUndef()) {
13855 Lane = DAG.getUNDEF(MVT::i32);
13856 } else {
13857 assert(Lane.getValueType() == MVT::i32 &&
13858 "Unexpected BUILD_VECTOR operand type");
13859 }
13860 Ops.push_back(Lane);
13861 }
13862 return DAG.getBuildVector(VT, dl, Ops);
13863}
13864
13865 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
13866 const AArch64Subtarget *ST) {
13867 EVT VT = Op.getValueType();
13868 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
13869 "Expected a legal NEON vector");
13870
13871 APInt DefBits(VT.getSizeInBits(), 0);
13872 APInt UndefBits(VT.getSizeInBits(), 0);
13873 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13874 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
13875 auto TryMOVIWithBits = [&](APInt DefBits) {
13876 SDValue NewOp;
13877 if ((NewOp =
13878 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
13879 (NewOp =
13880 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13881 (NewOp =
13882 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
13883 (NewOp =
13884 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
13885 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
13886 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
13887 return NewOp;
13888
13889 APInt NotDefBits = ~DefBits;
13890 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
13891 NotDefBits)) ||
13892 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
13893 NotDefBits)) ||
13894 (NewOp =
13895 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
13896 return NewOp;
13897 return SDValue();
13898 };
13899 if (SDValue R = TryMOVIWithBits(DefBits))
13900 return R;
13901 if (SDValue R = TryMOVIWithBits(UndefBits))
13902 return R;
13903
13904 // See if a fneg of the constant can be materialized with a MOVI, etc
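// For example (illustrative): a v2f64 splat of -0.0 is not directly encodable
// as a MOVI, but the sign-flipped constant (+0.0, all-zero bits) is, so we can
// emit (fneg (movi #0)) instead of a constant-pool load.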
13905 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
13906 // FNegate each sub-element of the constant
13907 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
13908 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
13909 .zext(VT.getSizeInBits());
13910 APInt NegBits(VT.getSizeInBits(), 0);
13911 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
13912 for (unsigned i = 0; i < NumElts; i++)
13913 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
13914 NegBits = DefBits ^ NegBits;
13915
13916 // Try to create the new constants with MOVI, and if so generate a fneg
13917 // for it.
13918 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
13919 SDLoc DL(Op);
13920 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
13921 return DAG.getNode(
13922 AArch64ISD::NVCAST, DL, VT,
13923 DAG.getNode(ISD::FNEG, DL, VFVT,
13924 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
13925 }
13926 return SDValue();
13927 };
13928 SDValue R;
13929 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
13930 (R = TryWithFNeg(DefBits, MVT::f64)) ||
13931 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
13932 return R;
13933 }
13934
13935 return SDValue();
13936}
13937
13938SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
13939 SelectionDAG &DAG) const {
13940 EVT VT = Op.getValueType();
13941
13942 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
13943 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
13944 SDLoc DL(Op);
13945 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
13946 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
13947 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
13948 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
13949 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
13950 }
13951
13952 // Revert to common legalisation for all other variants.
13953 return SDValue();
13954 }
13955
13956 // Try to build a simple constant vector.
13957 Op = NormalizeBuildVector(Op, DAG);
13958 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
13959 // abort.
13960 if (Op.getOpcode() != ISD::BUILD_VECTOR)
13961 return SDValue();
13962
13963 // Certain vector constants, used to express things like logical NOT and
13964 // arithmetic NEG, are passed through unmodified. This allows special
13965 // patterns for these operations to match, which will lower these constants
13966 // to whatever is proven necessary.
13967 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
13968 if (BVN->isConstant()) {
13969 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
13970 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
13971 APInt Val(BitSize,
13972 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
13973 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
13974 return Op;
13975 }
13976 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
13977 if (Const->isZero() && !Const->isNegative())
13978 return Op;
13979 }
13980
13981 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
13982 return V;
13983
13984 // Scan through the operands to find some interesting properties we can
13985 // exploit:
13986 // 1) If only one value is used, we can use a DUP, or
13987 // 2) if only the low element is not undef, we can just insert that, or
13988 // 3) if only one constant value is used (w/ some non-constant lanes),
13989 // we can splat the constant value into the whole vector then fill
13990 // in the non-constant lanes.
13991 // 4) FIXME: If different constant values are used, but we can intelligently
13992 // select the values we'll be overwriting for the non-constant
13993 // lanes such that we can directly materialize the vector
13994 // some other way (MOVI, e.g.), we can be sneaky.
13995 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
13996 SDLoc dl(Op);
13997 unsigned NumElts = VT.getVectorNumElements();
13998 bool isOnlyLowElement = true;
13999 bool usesOnlyOneValue = true;
14000 bool usesOnlyOneConstantValue = true;
14001 bool isConstant = true;
14002 bool AllLanesExtractElt = true;
14003 unsigned NumConstantLanes = 0;
14004 unsigned NumDifferentLanes = 0;
14005 unsigned NumUndefLanes = 0;
14006 SDValue Value;
14007 SDValue ConstantValue;
14008 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
14009 unsigned ConsecutiveValCount = 0;
14010 SDValue PrevVal;
14011 for (unsigned i = 0; i < NumElts; ++i) {
14012 SDValue V = Op.getOperand(i);
14013 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14014 AllLanesExtractElt = false;
14015 if (V.isUndef()) {
14016 ++NumUndefLanes;
14017 continue;
14018 }
14019 if (i > 0)
14020 isOnlyLowElement = false;
14021 if (!isIntOrFPConstant(V))
14022 isConstant = false;
14023
14024 if (isIntOrFPConstant(V)) {
14025 ++NumConstantLanes;
14026 if (!ConstantValue.getNode())
14027 ConstantValue = V;
14028 else if (ConstantValue != V)
14029 usesOnlyOneConstantValue = false;
14030 }
14031
14032 if (!Value.getNode())
14033 Value = V;
14034 else if (V != Value) {
14035 usesOnlyOneValue = false;
14036 ++NumDifferentLanes;
14037 }
14038
14039 if (PrevVal != V) {
14040 ConsecutiveValCount = 0;
14041 PrevVal = V;
14042 }
14043
14044 // Keep each different value and its last consecutive count. For example,
14045 //
14046 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14047 // t24, t24, t24, t24, t24, t24, t24, t24
14048 // t23 = consecutive count 8
14049 // t24 = consecutive count 8
14050 // ------------------------------------------------------------------
14051 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
14052 // t24, t24, t24, t24, t24, t24, t24, t24
14053 // t23 = consecutive count 5
14054 // t24 = consecutive count 9
14055 DifferentValueMap[V] = ++ConsecutiveValCount;
14056 }
14057
14058 if (!Value.getNode()) {
14059 LLVM_DEBUG(
14060 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
14061 return DAG.getUNDEF(VT);
14062 }
14063
14064 // Convert BUILD_VECTOR where all elements but the lowest are undef into
14065 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
14066 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
14067 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
14068 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
14069 "SCALAR_TO_VECTOR node\n");
14070 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
14071 }
14072
14073 if (AllLanesExtractElt) {
14074 SDNode *Vector = nullptr;
14075 bool Even = false;
14076 bool Odd = false;
14077 // Check whether the extract elements match the Even pattern <0,2,4,...> or
14078 // the Odd pattern <1,3,5,...>.
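// For example (illustrative), with t0 : v8i16:
//   (build_vector (extract t0, 0), (extract t0, 2),
//                 (extract t0, 4), (extract t0, 6))
// matches the Even pattern and becomes a UZP1 of the two halves of t0.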
14079 for (unsigned i = 0; i < NumElts; ++i) {
14080 SDValue V = Op.getOperand(i);
14081 const SDNode *N = V.getNode();
14082 if (!isa<ConstantSDNode>(N->getOperand(1))) {
14083 Even = false;
14084 Odd = false;
14085 break;
14086 }
14087 SDValue N0 = N->getOperand(0);
14088
14089 // All elements are extracted from the same vector.
14090 if (!Vector) {
14091 Vector = N0.getNode();
14092 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
14093 // BUILD_VECTOR.
14094 if (VT.getVectorElementType() !=
14095 N0.getValueType().getVectorElementType())
14096 break;
14097 } else if (Vector != N0.getNode()) {
14098 Odd = false;
14099 Even = false;
14100 break;
14101 }
14102
14103 // Extracted values are either at Even indices <0,2,4,...> or at Odd
14104 // indices <1,3,5,...>.
14105 uint64_t Val = N->getConstantOperandVal(1);
14106 if (Val == 2 * i) {
14107 Even = true;
14108 continue;
14109 }
14110 if (Val - 1 == 2 * i) {
14111 Odd = true;
14112 continue;
14113 }
14114
14115 // Something does not match: abort.
14116 Odd = false;
14117 Even = false;
14118 break;
14119 }
14120 if (Even || Odd) {
14121 SDValue LHS =
14122 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14123 DAG.getConstant(0, dl, MVT::i64));
14124 SDValue RHS =
14125 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
14126 DAG.getConstant(NumElts, dl, MVT::i64));
14127
14128 if (Even && !Odd)
14129 return DAG.getNode(AArch64ISD::UZP1, dl, VT, LHS, RHS);
14130 if (Odd && !Even)
14131 return DAG.getNode(AArch64ISD::UZP2, dl, VT, LHS, RHS);
14132 }
14133 }
14134
14135 // Use DUP for non-constant splats. For f32 constant splats, reduce to
14136 // i32 and try again.
14137 if (usesOnlyOneValue) {
14138 if (!isConstant) {
14139 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14140 Value.getValueType() != VT) {
14141 LLVM_DEBUG(
14142 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
14143 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
14144 }
14145
14146 // This is actually a DUPLANExx operation, which keeps everything in vector registers.
14147
14148 SDValue Lane = Value.getOperand(1);
14149 Value = Value.getOperand(0);
14150 if (Value.getValueSizeInBits() == 64) {
14151 LLVM_DEBUG(
14152 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
14153 "widening it\n");
14154 Value = WidenVector(Value, DAG);
14155 }
14156
14157 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
14158 return DAG.getNode(Opcode, dl, VT, Value, Lane);
14159 }
14160
14161 if (VT.getVectorElementType().isFloatingPoint()) {
14162 SmallVector<SDValue, 8> Ops;
14163 EVT EltTy = VT.getVectorElementType();
14164 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
14165 EltTy == MVT::f64) && "Unsupported floating-point vector type");
14166 LLVM_DEBUG(
14167 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
14168 "BITCASTS, and try again\n");
14169 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
14170 for (unsigned i = 0; i < NumElts; ++i)
14171 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
14172 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
14173 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
14174 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
14175 Val.dump(););
14176 Val = LowerBUILD_VECTOR(Val, DAG);
14177 if (Val.getNode())
14178 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
14179 }
14180 }
14181
14182 // If we need to insert a small number of different non-constant elements and
14183 // the vector width is sufficiently large, prefer using DUP with the common
14184 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
14185 // skip the constant lane handling below.
14186 bool PreferDUPAndInsert =
14187 !isConstant && NumDifferentLanes >= 1 &&
14188 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
14189 NumDifferentLanes >= NumConstantLanes;
14190
14191 // If there was only one constant value used and for more than one lane,
14192 // start by splatting that value, then replace the non-constant lanes. This
14193 // is better than the default, which will perform a separate initialization
14194 // for each lane.
14195 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
14196 // Firstly, try to materialize the splat constant.
14197 SDValue Val = DAG.getSplatBuildVector(VT, dl, ConstantValue);
14198 unsigned BitSize = VT.getScalarSizeInBits();
14199 APInt ConstantValueAPInt(1, 0);
14200 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
14201 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
14202 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
14203 !ConstantValueAPInt.isAllOnes()) {
14204 Val = ConstantBuildVector(Val, DAG, Subtarget);
14205 if (!Val)
14206 // Otherwise, materialize the constant and splat it.
14207 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
14208 }
14209
14210 // Now insert the non-constant lanes.
14211 for (unsigned i = 0; i < NumElts; ++i) {
14212 SDValue V = Op.getOperand(i);
14213 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14214 if (!isIntOrFPConstant(V))
14215 // Note that type legalization likely mucked about with the VT of the
14216 // source operand, so we may have to convert it here before inserting.
14217 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
14218 }
14219 return Val;
14220 }
14221
14222 // This will generate a load from the constant pool.
14223 if (isConstant) {
14224 LLVM_DEBUG(
14225 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
14226 "expansion\n");
14227 return SDValue();
14228 }
14229
14230 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14231 // v4i32s. This is really a truncate, which we can construct out of (legal)
14232 // concats and truncate nodes.
14233 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
14234 return M;
14235
14236 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
14237 if (NumElts >= 4) {
14238 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
14239 return Shuffle;
14240
14241 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
14242 return Shuffle;
14243 }
14244
14245 if (PreferDUPAndInsert) {
14246 // First, build a constant vector with the common element.
14247 SmallVector<SDValue, 8> Ops(NumElts, Value);
14248 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
14249 // Next, insert the elements that do not match the common value.
14250 for (unsigned I = 0; I < NumElts; ++I)
14251 if (Op.getOperand(I) != Value)
14252 NewVector =
14253 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
14254 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
14255
14256 return NewVector;
14257 }
14258
14259 // If vector consists of two different values, try to generate two DUPs and
14260 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
14261 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
14262 SmallVector<SDValue, 2> Vals;
14263 // Check that the consecutive count of each value is half the number of
14264 // vector elements. In this case, we can use CONCAT_VECTORS. For example,
14265 //
14266 // canUseVECTOR_CONCAT = true;
14267 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
14268 // t24, t24, t24, t24, t24, t24, t24, t24
14269 //
14270 // canUseVECTOR_CONCAT = false;
14271 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
14272 // t24, t24, t24, t24, t24, t24, t24, t24
14273 bool canUseVECTOR_CONCAT = true;
14274 for (auto Pair : DifferentValueMap) {
14275 // Check different values have same length which is NumElts / 2.
14276 if (Pair.second != NumElts / 2)
14277 canUseVECTOR_CONCAT = false;
14278 Vals.push_back(Pair.first);
14279 }
14280
14281 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
14282 // CONCAT_VECTORs. For example,
14283 //
14284 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
14285 // t24, t24, t24, t24, t24, t24, t24, t24
14286 // ==>
14287 // t26: v8i8 = AArch64ISD::DUP t23
14288 // t28: v8i8 = AArch64ISD::DUP t24
14289 // t29: v16i8 = concat_vectors t26, t28
14290 if (canUseVECTOR_CONCAT) {
14291 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14292 if (isTypeLegal(SubVT) && SubVT.isVector() &&
14293 SubVT.getVectorNumElements() >= 2) {
14294 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
14295 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
14296 SDValue DUP1 =
14297 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops1), DAG);
14298 SDValue DUP2 =
14299 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, dl, Ops2), DAG);
14300 SDValue CONCAT_VECTORS =
14301 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, DUP1, DUP2);
14302 return CONCAT_VECTORS;
14303 }
14304 }
14305
14306 // Let's try to generate VECTOR_SHUFFLE. For example,
14307 //
14308 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
14309 // ==>
14310 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
14311 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
14312 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
14313 if (NumElts >= 8) {
14314 SmallVector<int, 16> MaskVec;
14315 // Build mask for VECTOR_SHUFFLE.
14316 SDValue FirstLaneVal = Op.getOperand(0);
14317 for (unsigned i = 0; i < NumElts; ++i) {
14318 SDValue Val = Op.getOperand(i);
14319 if (FirstLaneVal == Val)
14320 MaskVec.push_back(i);
14321 else
14322 MaskVec.push_back(i + NumElts);
14323 }
14324
14325 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
14326 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
14327 SDValue VEC1 = DAG.getBuildVector(VT, dl, Ops1);
14328 SDValue VEC2 = DAG.getBuildVector(VT, dl, Ops2);
14329 SDValue VECTOR_SHUFFLE =
14330 DAG.getVectorShuffle(VT, dl, VEC1, VEC2, MaskVec);
14331 return VECTOR_SHUFFLE;
14332 }
14333 }
14334
14335 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
14336 // know the default expansion would otherwise fall back on something even
14337 // worse. For a vector with one or two non-undef values, that's
14338 // scalar_to_vector for the elements followed by a shuffle (provided the
14339 // shuffle is valid for the target) and materialization element by element
14340 // on the stack followed by a load for everything else.
14341 if (!isConstant && !usesOnlyOneValue) {
14342 LLVM_DEBUG(
14343 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
14344 "of INSERT_VECTOR_ELT\n");
14345
14346 SDValue Vec = DAG.getUNDEF(VT);
14347 SDValue Op0 = Op.getOperand(0);
14348 unsigned i = 0;
14349
14350 // Use SCALAR_TO_VECTOR for lane zero to
14351 // a) Avoid a RMW dependency on the full vector register, and
14352 // b) Allow the register coalescer to fold away the copy if the
14353 // value is already in an S or D register, and we're forced to emit an
14354 // INSERT_SUBREG that we can't fold anywhere.
14355 //
14356 // We also allow types like i8 and i16 which are illegal scalar but legal
14357 // vector element types. After type-legalization the inserted value is
14358 // extended (i32) and it is safe to cast them to the vector type by ignoring
14359 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
14360 if (!Op0.isUndef()) {
14361 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
14362 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
14363 ++i;
14364 }
14365 LLVM_DEBUG(if (i < NumElts) dbgs()
14366 << "Creating nodes for the other vector elements:\n";);
14367 for (; i < NumElts; ++i) {
14368 SDValue V = Op.getOperand(i);
14369 if (V.isUndef())
14370 continue;
14371 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
14372 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
14373 }
14374 return Vec;
14375 }
14376
14377 LLVM_DEBUG(
14378 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
14379 "better alternative\n");
14380 return SDValue();
14381}
14382
14383SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
14384 SelectionDAG &DAG) const {
14385 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14386 !Subtarget->isNeonAvailable()))
14387 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
14388
14389 assert(Op.getValueType().isScalableVector() &&
14390 isTypeLegal(Op.getValueType()) &&
14391 "Expected legal scalable vector type!");
14392
14393 if (isTypeLegal(Op.getOperand(0).getValueType())) {
14394 unsigned NumOperands = Op->getNumOperands();
14395 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
14396 "Unexpected number of operands in CONCAT_VECTORS");
14397
14398 if (NumOperands == 2)
14399 return Op;
14400
14401 // Concat each pair of subvectors and pack into the lower half of the array.
14402 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
14403 while (ConcatOps.size() > 1) {
14404 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
14405 SDValue V1 = ConcatOps[I];
14406 SDValue V2 = ConcatOps[I + 1];
14407 EVT SubVT = V1.getValueType();
14408 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
14409 ConcatOps[I / 2] =
14410 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
14411 }
14412 ConcatOps.resize(ConcatOps.size() / 2);
14413 }
14414 return ConcatOps[0];
14415 }
14416
14417 return SDValue();
14418}
14419
14420SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14421 SelectionDAG &DAG) const {
14422 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
14423
14424 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
14425 !Subtarget->isNeonAvailable()))
14426 return LowerFixedLengthInsertVectorElt(Op, DAG);
14427
14428 EVT VT = Op.getOperand(0).getValueType();
14429
14430 if (VT.getScalarType() == MVT::i1) {
14431 EVT VectorVT = getPromotedVTForPredicate(VT);
14432 SDLoc DL(Op);
14433 SDValue ExtendedVector =
14434 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
14435 SDValue ExtendedValue =
14436 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
14437 VectorVT.getScalarType().getSizeInBits() < 32
14438 ? MVT::i32
14439 : VectorVT.getScalarType());
14440 ExtendedVector =
14441 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
14442 ExtendedValue, Op.getOperand(2));
14443 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
14444 }
14445
14446 // Check for non-constant or out of range lane.
14447 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
14448 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14449 return SDValue();
14450
14451 return Op;
14452}
14453
14454SDValue
14455AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14456 SelectionDAG &DAG) const {
14457 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
14458 EVT VT = Op.getOperand(0).getValueType();
14459
14460 if (VT.getScalarType() == MVT::i1) {
14461 // We can't directly extract from an SVE predicate; extend it first.
14462 // (This isn't the only possible lowering, but it's straightforward.)
14463 EVT VectorVT = getPromotedVTForPredicate(VT);
14464 SDLoc DL(Op);
14465 SDValue Extend =
14466 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
14467 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
14468 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
14469 Extend, Op.getOperand(1));
14470 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
14471 }
14472
14473 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14474 return LowerFixedLengthExtractVectorElt(Op, DAG);
14475
14476 // Check for non-constant or out of range lane.
14477 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14478 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
14479 return SDValue();
14480
14481 // Insertion/extraction are legal for V128 types.
14482 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
14483 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
14484 VT == MVT::v8f16 || VT == MVT::v8bf16)
14485 return Op;
14486
14487 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
14488 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
14489 VT != MVT::v4bf16)
14490 return SDValue();
14491
14492 // For V64 types, we perform extraction by expanding the value
14493 // to a V128 type and perform the extraction on that.
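// For example (illustrative): extracting lane 3 of a v4i16 widens the source
// to v8i16 (the value lives in the same D register) and extracts lane 3 of
// the widened vector as an i32.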
14494 SDLoc DL(Op);
14495 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
14496 EVT WideTy = WideVec.getValueType();
14497
14498 EVT ExtrTy = WideTy.getVectorElementType();
14499 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
14500 ExtrTy = MVT::i32;
14501
14502 // For extractions, we just return the result directly.
14503 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
14504 Op.getOperand(1));
14505}
14506
14507SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
14508 SelectionDAG &DAG) const {
14509 EVT VT = Op.getValueType();
14510 assert(VT.isFixedLengthVector() &&
14511 "Only cases that extract a fixed length vector are supported!");
14512 EVT InVT = Op.getOperand(0).getValueType();
14513
14514 // If we don't have legal types yet, do nothing
14515 if (!isTypeLegal(InVT))
14516 return SDValue();
14517
14518 if (InVT.is128BitVector()) {
14519 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
14520 unsigned Idx = Op.getConstantOperandVal(1);
14521
14522 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
14523 if (Idx == 0)
14524 return Op;
14525
14526 // If this is extracting the upper 64-bits of a 128-bit vector, we match
14527 // that directly.
14528 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
14529 return Op;
14530 }
14531
14532 if (InVT.isScalableVector() ||
14533 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
14534 SDLoc DL(Op);
14535 SDValue Vec = Op.getOperand(0);
14536 SDValue Idx = Op.getOperand(1);
14537
14538 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
14539 if (PackedVT != InVT) {
14540 // Pack input into the bottom part of an SVE register and try again.
14541 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
14542 DAG.getUNDEF(PackedVT), Vec,
14543 DAG.getVectorIdxConstant(0, DL));
14544 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
14545 }
14546
14547 // This will get matched by custom code during ISelDAGToDAG.
14548 if (isNullConstant(Idx))
14549 return Op;
14550
14551 assert(InVT.isScalableVector() && "Unexpected vector type!");
14552 // Move requested subvector to the start of the vector and try again.
14553 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
14554 return convertFromScalableVector(DAG, VT, Splice);
14555 }
14556
14557 return SDValue();
14558}
14559
14560SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
14561 SelectionDAG &DAG) const {
14562 assert(Op.getValueType().isScalableVector() &&
14563 "Only expect to lower inserts into scalable vectors!");
14564
14565 EVT InVT = Op.getOperand(1).getValueType();
14566 unsigned Idx = Op.getConstantOperandVal(2);
14567
14568 SDValue Vec0 = Op.getOperand(0);
14569 SDValue Vec1 = Op.getOperand(1);
14570 SDLoc DL(Op);
14571 EVT VT = Op.getValueType();
14572
14573 if (InVT.isScalableVector()) {
14574 if (!isTypeLegal(VT))
14575 return SDValue();
14576
14577 // Break down insert_subvector into simpler parts.
14578 if (VT.getVectorElementType() == MVT::i1) {
14579 unsigned NumElts = VT.getVectorMinNumElements();
14580 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14581
14582 SDValue Lo, Hi;
14583 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14584 DAG.getVectorIdxConstant(0, DL));
14585 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
14586 DAG.getVectorIdxConstant(NumElts / 2, DL));
14587 if (Idx < (NumElts / 2))
14588 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
14589 DAG.getVectorIdxConstant(Idx, DL));
14590 else
14591 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
14592 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
14593
14594 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
14595 }
14596
14597 // Ensure the subvector is half the size of the main vector.
14598 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
14599 return SDValue();
14600
14601 // Here narrow and wide refers to the vector element types. After "casting"
14602 // both vectors must have the same bit length and so because the subvector
14603 // has fewer elements, those elements need to be bigger.
14604 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
14605 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
14606
14607 // NOP cast operands to the largest legal vector of the same element count.
14608 if (VT.isFloatingPoint()) {
14609 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
14610 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
14611 } else {
14612 // Legal integer vectors are already their largest so Vec0 is fine as is.
14613 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
14614 }
14615
14616 // To replace the top/bottom half of vector V with vector SubV we widen the
14617 // preserved half of V, concatenate this to SubV (the order depending on the
14618 // half being replaced) and then narrow the result.
14619 SDValue Narrow;
14620 if (Idx == 0) {
14621 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
14622 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
14623 } else {
14624 assert(Idx == InVT.getVectorMinNumElements() &&
14625 "Invalid subvector index!");
14626 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
14627 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
14628 }
14629
14630 return getSVESafeBitCast(VT, Narrow, DAG);
14631 }
14632
14633 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
14634 // This will be matched by custom code during ISelDAGToDAG.
14635 if (Vec0.isUndef())
14636 return Op;
14637
14638 std::optional<unsigned> PredPattern =
14639 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
14640 auto PredTy = VT.changeVectorElementType(MVT::i1);
14641 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
14642 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
14643 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
14644 }
14645
14646 return SDValue();
14647}
14648
14649static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
14650 if (Op.getOpcode() != AArch64ISD::DUP &&
14651 Op.getOpcode() != ISD::SPLAT_VECTOR &&
14652 Op.getOpcode() != ISD::BUILD_VECTOR)
14653 return false;
14654
14655 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
14656 !isAllConstantBuildVector(Op, SplatVal))
14657 return false;
14658
14659 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
14660 !isa<ConstantSDNode>(Op->getOperand(0)))
14661 return false;
14662
14663 SplatVal = Op->getConstantOperandVal(0);
14664 if (Op.getValueType().getVectorElementType() != MVT::i64)
14665 SplatVal = (int32_t)SplatVal;
14666
14667 Negated = false;
14668 if (isPowerOf2_64(SplatVal))
14669 return true;
14670
14671 Negated = true;
14672 if (isPowerOf2_64(-SplatVal)) {
14673 SplatVal = -SplatVal;
14674 return true;
14675 }
14676
14677 return false;
14678}
14679
14680SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
14681 EVT VT = Op.getValueType();
14682 SDLoc dl(Op);
14683
14684 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
14685 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
14686
14687 assert(VT.isScalableVector() && "Expected a scalable vector.");
14688
14689 bool Signed = Op.getOpcode() == ISD::SDIV;
14690 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
14691
14692 bool Negated;
14693 uint64_t SplatVal;
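// For a signed divide by a (possibly negated) power-of-two splat we can use
// SVE's ASRD (arithmetic shift right for divide). For example (illustrative):
//   sdiv X, splat(8)  ==> SRAD_MERGE_OP1 Pg, X, #3
//   sdiv X, splat(-8) ==> sub 0, (SRAD_MERGE_OP1 Pg, X, #3)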
14694 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
14695 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
14696 SDValue Res =
14697 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
14698 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
14699 if (Negated)
14700 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
14701
14702 return Res;
14703 }
14704
14705 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
14706 return LowerToPredicatedOp(Op, DAG, PredOpcode);
14707
14708 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
14709 // operations, and truncate the result.
14710 EVT WidenedVT;
14711 if (VT == MVT::nxv16i8)
14712 WidenedVT = MVT::nxv8i16;
14713 else if (VT == MVT::nxv8i16)
14714 WidenedVT = MVT::nxv4i32;
14715 else
14716 llvm_unreachable("Unexpected Custom DIV operation");
14717
14718 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
14719 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
14720 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
14721 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
14722 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
14723 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
14724 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
14725 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
14726 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
14727}
14728
14729bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
14730 EVT VT, unsigned DefinedValues) const {
14731 if (!Subtarget->isNeonAvailable())
14732 return false;
14734}
14735
14736 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
14737 // Currently no fixed length shuffles that require SVE are legal.
14738 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14739 return false;
14740
14741 if (VT.getVectorNumElements() == 4 &&
14742 (VT.is128BitVector() || VT.is64BitVector())) {
14743 unsigned Cost = getPerfectShuffleCost(M);
14744 if (Cost <= 1)
14745 return true;
14746 }
14747
14748 bool DummyBool;
14749 int DummyInt;
14750 unsigned DummyUnsigned;
14751
14752 unsigned EltSize = VT.getScalarSizeInBits();
14753 unsigned NumElts = VT.getVectorNumElements();
14754 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
14755 isREVMask(M, EltSize, NumElts, 64) ||
14756 isREVMask(M, EltSize, NumElts, 32) ||
14757 isREVMask(M, EltSize, NumElts, 16) ||
14758 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
14759 isTRNMask(M, NumElts, DummyUnsigned) ||
14760 isUZPMask(M, NumElts, DummyUnsigned) ||
14761 isZIPMask(M, NumElts, DummyUnsigned) ||
14762 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
14763 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
14764 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
14765 isINSMask(M, NumElts, DummyBool, DummyInt) ||
14766 isConcatMask(M, VT, VT.getSizeInBits() == 128));
14767}
14768
14769 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
14770 EVT VT) const {
14771 // Just delegate to the generic legality, clear masks aren't special.
14772 return isShuffleMaskLegal(M, VT);
14773}
14774
14775/// getVShiftImm - Check if this is a valid build_vector for the immediate
14776/// operand of a vector shift operation, where all the elements of the
14777/// build_vector must have the same constant integer value.
14778static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
14779 // Ignore bit_converts.
14780 while (Op.getOpcode() == ISD::BITCAST)
14781 Op = Op.getOperand(0);
14782 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
14783 APInt SplatBits, SplatUndef;
14784 unsigned SplatBitSize;
14785 bool HasAnyUndefs;
14786 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
14787 HasAnyUndefs, ElementBits) ||
14788 SplatBitSize > ElementBits)
14789 return false;
14790 Cnt = SplatBits.getSExtValue();
14791 return true;
14792}
14793
14794/// isVShiftLImm - Check if this is a valid build_vector for the immediate
14795/// operand of a vector shift left operation. That value must be in the range:
14796/// 0 <= Value < ElementBits for a left shift; or
14797/// 0 <= Value <= ElementBits for a long left shift.
14798static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
14799 assert(VT.isVector() && "vector shift count is not a vector type");
14800 int64_t ElementBits = VT.getScalarSizeInBits();
14801 if (!getVShiftImm(Op, ElementBits, Cnt))
14802 return false;
14803 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
14804}
14805
14806/// isVShiftRImm - Check if this is a valid build_vector for the immediate
14807/// operand of a vector shift right operation. The value must be in the range:
14808 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
14809static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
14810 assert(VT.isVector() && "vector shift count is not a vector type");
14811 int64_t ElementBits = VT.getScalarSizeInBits();
14812 if (!getVShiftImm(Op, ElementBits, Cnt))
14813 return false;
14814 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
14815}
14816
14817SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
14818 SelectionDAG &DAG) const {
14819 EVT VT = Op.getValueType();
14820
14821 if (VT.getScalarType() == MVT::i1) {
14822 // Lower i1 truncate to `(x & 1) != 0`.
14823 SDLoc dl(Op);
14824 EVT OpVT = Op.getOperand(0).getValueType();
14825 SDValue Zero = DAG.getConstant(0, dl, OpVT);
14826 SDValue One = DAG.getConstant(1, dl, OpVT);
14827 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
14828 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
14829 }
14830
14831 if (!VT.isVector() || VT.isScalableVector())
14832 return SDValue();
14833
14834 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
14835 !Subtarget->isNeonAvailable()))
14836 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
14837
14838 return SDValue();
14839}
14840
14841 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
14842 // possibly a truncated type; it tells how many bits of the value are to be
14843// used.
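// For example (illustrative), with nuw arithmetic and ShiftValue == 4:
//   (srl (add X, 8), 4)  ==>  a rounding right shift of X by 4 (URSHR),
// because adding 1 << (ShiftValue - 1) before shifting rounds to nearest.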
14844 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
14845 SelectionDAG &DAG,
14846 unsigned &ShiftValue,
14847 SDValue &RShOperand) {
14848 if (Shift->getOpcode() != ISD::SRL)
14849 return false;
14850
14851 EVT VT = Shift.getValueType();
14852 assert(VT.isScalableVT());
14853
14854 auto ShiftOp1 =
14855 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
14856 if (!ShiftOp1)
14857 return false;
14858
14859 ShiftValue = ShiftOp1->getZExtValue();
14860 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
14861 return false;
14862
14863 SDValue Add = Shift->getOperand(0);
14864 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
14865 return false;
14866
14867 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
14868 "ResVT must be truncated or same type as the shift.");
14869 // Check if an overflow can lead to incorrect results.
14870 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
14871 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
14872 return false;
14873
14874 auto AddOp1 =
14875 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
14876 if (!AddOp1)
14877 return false;
14878 uint64_t AddValue = AddOp1->getZExtValue();
14879 if (AddValue != 1ULL << (ShiftValue - 1))
14880 return false;
14881
14882 RShOperand = Add->getOperand(0);
14883 return true;
14884}
14885
14886SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
14887 SelectionDAG &DAG) const {
14888 EVT VT = Op.getValueType();
14889 SDLoc DL(Op);
14890 int64_t Cnt;
14891
14892 if (!Op.getOperand(1).getValueType().isVector())
14893 return Op;
14894 unsigned EltSize = VT.getScalarSizeInBits();
14895
14896 switch (Op.getOpcode()) {
14897 case ISD::SHL:
14898 if (VT.isScalableVector() ||
14899 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14900 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
14901
14902 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
14903 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
14904 DAG.getConstant(Cnt, DL, MVT::i32));
14905 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14906 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
14907 MVT::i32),
14908 Op.getOperand(0), Op.getOperand(1));
14909 case ISD::SRA:
14910 case ISD::SRL:
14911 if (VT.isScalableVector() &&
14912 (Subtarget->hasSVE2() ||
14913 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
14914 SDValue RShOperand;
14915 unsigned ShiftValue;
14916 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
14917 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
14918 getPredicateForVector(DAG, DL, VT), RShOperand,
14919 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
14920 }
14921
14922 if (VT.isScalableVector() ||
14923 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
14924 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
14925 : AArch64ISD::SRL_PRED;
14926 return LowerToPredicatedOp(Op, DAG, Opc);
14927 }
14928
14929 // Right shift immediate
14930 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
14931 unsigned Opc =
14932 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
14933 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
14934 DAG.getConstant(Cnt, DL, MVT::i32), Op->getFlags());
14935 }
14936
14937 // Right shift register. Note, there is not a shift right register
14938 // instruction, but the shift left register instruction takes a signed
14939 // value, where negative numbers specify a right shift.
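// For example (illustrative): (srl X, Y) becomes
//   (intrinsic aarch64_neon_ushl X, (sub 0, Y))
// since USHL performs a right shift when the per-lane shift amount is
// negative.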
14940 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
14941 : Intrinsic::aarch64_neon_ushl;
14942 // negate the shift amount
14943 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
14944 Op.getOperand(1));
14945 SDValue NegShiftLeft =
14946 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
14947 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
14948 NegShift);
14949 return NegShiftLeft;
14950 }
14951
14952 llvm_unreachable("unexpected shift opcode");
14953}
14954
14955 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
14956 AArch64CC::CondCode CC, bool NoNans, EVT VT,
14957 const SDLoc &dl, SelectionDAG &DAG) {
14958 EVT SrcVT = LHS.getValueType();
14959 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
14960 "function only supposed to emit natural comparisons");
14961
14962 APInt SplatValue;
14963 APInt SplatUndef;
14964 unsigned SplatBitSize = 0;
14965 bool HasAnyUndefs;
14966
14967 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
14968 bool IsCnst = BVN && BVN->isConstantSplat(SplatValue, SplatUndef,
14969 SplatBitSize, HasAnyUndefs);
14970
14971 bool IsZero = IsCnst && SplatValue == 0;
14972 bool IsOne =
14973 IsCnst && SrcVT.getScalarSizeInBits() == SplatBitSize && SplatValue == 1;
14974 bool IsMinusOne = IsCnst && SplatValue.isAllOnes();
14975
14976 if (SrcVT.getVectorElementType().isFloatingPoint()) {
14977 switch (CC) {
14978 default:
14979 return SDValue();
14980 case AArch64CC::NE: {
14981 SDValue Fcmeq;
14982 if (IsZero)
14983 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14984 else
14985 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14986 return DAG.getNOT(dl, Fcmeq, VT);
14987 }
14988 case AArch64CC::EQ:
14989 if (IsZero)
14990 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
14991 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
14992 case AArch64CC::GE:
14993 if (IsZero)
14994 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
14995 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
14996 case AArch64CC::GT:
14997 if (IsZero)
14998 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
14999 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
15000 case AArch64CC::LE:
15001 if (!NoNans)
15002 return SDValue();
15003 // If we ignore NaNs then we can use the LS implementation.
15004 [[fallthrough]];
15005 case AArch64CC::LS:
15006 if (IsZero)
15007 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
15008 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
15009 case AArch64CC::LT:
15010 if (!NoNans)
15011 return SDValue();
15012 // If we ignore NaNs then we can use the MI implementation.
15013 [[fallthrough]];
15014 case AArch64CC::MI:
15015 if (IsZero)
15016 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
15017 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
15018 }
15019 }
15020
15021 switch (CC) {
15022 default:
15023 return SDValue();
15024 case AArch64CC::NE: {
15025 SDValue Cmeq;
15026 if (IsZero)
15027 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15028 else
15029 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15030 return DAG.getNOT(dl, Cmeq, VT);
15031 }
15032 case AArch64CC::EQ:
15033 if (IsZero)
15034 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
15035 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
15036 case AArch64CC::GE:
15037 if (IsZero)
15038 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
15039 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
15040 case AArch64CC::GT:
15041 if (IsZero)
15042 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
15043 if (IsMinusOne)
15044 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS, RHS);
15045 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
15046 case AArch64CC::LE:
15047 if (IsZero)
15048 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15049 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
15050 case AArch64CC::LS:
15051 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
15052 case AArch64CC::LO:
15053 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
15054 case AArch64CC::LT:
15055 if (IsZero)
15056 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
15057 if (IsOne)
15058 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
15059 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
15060 case AArch64CC::HI:
15061 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
15062 case AArch64CC::HS:
15063 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
15064 }
15065}
15066
15067SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
15068 SelectionDAG &DAG) const {
15069 if (Op.getValueType().isScalableVector())
15070 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
15071
15072 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
15073 !Subtarget->isNeonAvailable()))
15074 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
15075
15076 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15077 SDValue LHS = Op.getOperand(0);
15078 SDValue RHS = Op.getOperand(1);
15079 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
15080 SDLoc dl(Op);
15081
15082 if (LHS.getValueType().getVectorElementType().isInteger()) {
15083 assert(LHS.getValueType() == RHS.getValueType());
15084 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
15085 SDValue Cmp =
15086 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
15087 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15088 }
15089
15090 // Lower isnan(x) | isnan(never-nan) to x != x.
15091 // Lower !isnan(x) & !isnan(never-nan) to x == x.
15092 if (CC == ISD::SETUO || CC == ISD::SETO) {
15093 bool OneNaN = false;
15094 if (LHS == RHS) {
15095 OneNaN = true;
15096 } else if (DAG.isKnownNeverNaN(RHS)) {
15097 OneNaN = true;
15098 RHS = LHS;
15099 } else if (DAG.isKnownNeverNaN(LHS)) {
15100 OneNaN = true;
15101 LHS = RHS;
15102 }
15103 if (OneNaN) {
15104 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
15105 }
15106 }
15107
15108 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15109
15110 // Make v4f16 (only) fcmp operations utilise vector instructions;
15111 // v8f16 support will be a little more complicated.
15112 if ((!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) ||
15113 LHS.getValueType().getVectorElementType() == MVT::bf16) {
15114 if (LHS.getValueType().getVectorNumElements() == 4) {
15115 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
15116 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
15117 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
15118 DAG.ReplaceAllUsesWith(Op, NewSetcc);
15119 CmpVT = MVT::v4i32;
15120 } else
15121 return SDValue();
15122 }
15123
15124 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
15125 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
15126 LHS.getValueType().getVectorElementType() != MVT::f128);
15127
15128 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
15129 // clean. Some of them require two branches to implement.
15130 AArch64CC::CondCode CC1, CC2;
15131 bool ShouldInvert;
15132 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
15133
15134 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
15135 SDValue Cmp =
15136 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
15137 if (!Cmp.getNode())
15138 return SDValue();
15139
15140 if (CC2 != AArch64CC::AL) {
15141 SDValue Cmp2 =
15142 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
15143 if (!Cmp2.getNode())
15144 return SDValue();
15145
15146 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
15147 }
15148
15149 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
15150
15151 if (ShouldInvert)
15152 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
15153
15154 return Cmp;
15155}
15156
15157static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
15158 SelectionDAG &DAG) {
15159 SDValue VecOp = ScalarOp.getOperand(0);
15160 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
15161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
15162 DAG.getConstant(0, DL, MVT::i64));
15163}
15164
15165static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
15166 SDLoc DL, SelectionDAG &DAG) {
15167 unsigned ScalarOpcode;
15168 switch (Opcode) {
15169 case ISD::VECREDUCE_AND:
15170 ScalarOpcode = ISD::AND;
15171 break;
15172 case ISD::VECREDUCE_OR:
15173 ScalarOpcode = ISD::OR;
15174 break;
15175 case ISD::VECREDUCE_XOR:
15176 ScalarOpcode = ISD::XOR;
15177 break;
15178 default:
15179 llvm_unreachable("Expected bitwise vector reduction");
15180 return SDValue();
15181 }
15182
15183 EVT VecVT = Vec.getValueType();
15184 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
15185 "Expected power-of-2 length vector");
15186
15187 EVT ElemVT = VecVT.getVectorElementType();
15188
15189 SDValue Result;
15190 unsigned NumElems = VecVT.getVectorNumElements();
15191
15192 // Special case for boolean reductions
15193 if (ElemVT == MVT::i1) {
15194 // Split large vectors into smaller ones
15195 if (NumElems > 16) {
15196 SDValue Lo, Hi;
15197 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15198 EVT HalfVT = Lo.getValueType();
15199 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
15200 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
15201 }
15202
15203 // Vectors that are less than 64 bits get widened to neatly fit a 64 bit
15204 // register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
15205 // this element size leads to the best codegen, since e.g. setcc results
15206 // might need to be truncated otherwise.
15207 EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
15208
15209 // any_ext doesn't work with umin/umax, so only use it for uadd.
15210 unsigned ExtendOp =
15211 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
15212 SDValue Extended = DAG.getNode(
15213 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
15214 switch (ScalarOpcode) {
15215 case ISD::AND:
15216 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
15217 break;
15218 case ISD::OR:
15219 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
15220 break;
15221 case ISD::XOR:
15222 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
15223 break;
15224 default:
15225 llvm_unreachable("Unexpected Opcode");
15226 }
15227
15228 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
15229 } else {
15230 // Iteratively split the vector in half and combine using the bitwise
15231 // operation until it fits in a 64 bit register.
15232 while (VecVT.getSizeInBits() > 64) {
15233 SDValue Lo, Hi;
15234 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
15235 VecVT = Lo.getValueType();
15236 NumElems = VecVT.getVectorNumElements();
15237 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
15238 }
15239
15240 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
15241
15242 // Do the remaining work on a scalar since it allows the code generator to
15243 // combine the shift and bitwise operation into one instruction and since
15244 // integer instructions can have higher throughput than vector instructions.
15245 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
15246
15247 // Iteratively combine the lower and upper halves of the scalar using the
15248 // bitwise operation, halving the relevant region of the scalar in each
15249 // iteration, until the relevant region is just one element of the original
15250 // vector.
15251 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
15252 SDValue ShiftAmount =
15253 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
15254 SDValue Shifted =
15255 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
15256 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
15257 }
15258
15259 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
15260 }
15261
15262 return DAG.getAnyExtOrTrunc(Result, DL, VT);
15263}
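// Worked example (illustrative): a v8i8 AND reduction already fits in 64
// bits, so the vector is bitcast to i64 and folded with three shift+AND
// steps:
//   x &= x >> 32;  x &= x >> 16;  x &= x >> 8;
// after which the low 8 bits hold the AND of all eight lanes. An i1 OR
// reduction such as <16 x i1> is instead sign-extended to <16 x i8> and
// lowered to VECREDUCE_UMAX, so any true lane yields a non-zero result.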
15264
15265SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
15266 SelectionDAG &DAG) const {
15267 SDValue Src = Op.getOperand(0);
15268
15269 // Try to lower fixed length reductions to SVE.
15270 EVT SrcVT = Src.getValueType();
15271 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15272 Op.getOpcode() == ISD::VECREDUCE_AND ||
15273 Op.getOpcode() == ISD::VECREDUCE_OR ||
15274 Op.getOpcode() == ISD::VECREDUCE_XOR ||
15275 Op.getOpcode() == ISD::VECREDUCE_FADD ||
15276 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
15277 SrcVT.getVectorElementType() == MVT::i64);
15278 if (SrcVT.isScalableVector() ||
15279 useSVEForFixedLengthVectorVT(
15280 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
15281
15282 if (SrcVT.getVectorElementType() == MVT::i1)
15283 return LowerPredReductionToSVE(Op, DAG);
15284
15285 switch (Op.getOpcode()) {
15286 case ISD::VECREDUCE_ADD:
15287 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
15288 case ISD::VECREDUCE_AND:
15289 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
15290 case ISD::VECREDUCE_OR:
15291 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
15292 case ISD::VECREDUCE_SMAX:
15293 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
15294 case ISD::VECREDUCE_SMIN:
15295 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
15296 case ISD::VECREDUCE_UMAX:
15297 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
15298 case ISD::VECREDUCE_UMIN:
15299 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
15300 case ISD::VECREDUCE_XOR:
15301 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
15302 case ISD::VECREDUCE_FADD:
15303 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
15304 case ISD::VECREDUCE_FMAX:
15305 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
15306 case ISD::VECREDUCE_FMIN:
15307 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
15308 case ISD::VECREDUCE_FMAXIMUM:
15309 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
15310 case ISD::VECREDUCE_FMINIMUM:
15311 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
15312 default:
15313 llvm_unreachable("Unhandled fixed length reduction");
15314 }
15315 }
15316
15317 // Lower NEON reductions.
15318 SDLoc dl(Op);
15319 switch (Op.getOpcode()) {
15320 case ISD::VECREDUCE_AND:
15321 case ISD::VECREDUCE_OR:
15322 case ISD::VECREDUCE_XOR:
15323 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
15324 Op.getValueType(), dl, DAG);
15325 case ISD::VECREDUCE_ADD:
15326 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
15327 case ISD::VECREDUCE_SMAX:
15328 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
15329 case ISD::VECREDUCE_SMIN:
15330 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
15331 case ISD::VECREDUCE_UMAX:
15332 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
15333 case ISD::VECREDUCE_UMIN:
15334 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
15335 default:
15336 llvm_unreachable("Unhandled reduction");
15337 }
15338}
15339
15340SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
15341 SelectionDAG &DAG) const {
15342 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15343 // No point replacing if we don't have the relevant instruction/libcall anyway
15344 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
15345 return SDValue();
15346
15347 // LSE has an atomic load-clear instruction, but not a load-and.
15348 SDLoc dl(Op);
15349 MVT VT = Op.getSimpleValueType();
15350 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
15351 SDValue RHS = Op.getOperand(2);
15352 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
15353 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
15354 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
15355 Op.getOperand(0), Op.getOperand(1), RHS,
15356 AN->getMemOperand());
15357}
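// For illustration: with LSE, "atomicrmw and ptr %p, i64 %m" is rewritten
// above as ATOMIC_LOAD_CLR(%p, ~%m), which then selects to the LDCLR family
// of atomic bit-clear instructions; the architecture provides load-clear but
// no load-and, so the AND mask is simply inverted.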
15358
15359SDValue
15360AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
15361 SelectionDAG &DAG) const {
15362
15363 SDLoc dl(Op);
15364 // Get the inputs.
15365 SDNode *Node = Op.getNode();
15366 SDValue Chain = Op.getOperand(0);
15367 SDValue Size = Op.getOperand(1);
15368 MaybeAlign Align =
15369 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15370 EVT VT = Node->getValueType(0);
15371
15373 "no-stack-arg-probe")) {
15374 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15375 Chain = SP.getValue(1);
15376 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15377 if (Align)
15378 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15379 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15380 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15381 SDValue Ops[2] = {SP, Chain};
15382 return DAG.getMergeValues(Ops, dl);
15383 }
15384
15385 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
15386
15387 EVT PtrVT = getPointerTy(DAG.getDataLayout());
15388 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
15389 PtrVT, 0);
15390
15391 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
15392 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
15393 if (Subtarget->hasCustomCallingConv())
15394 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
15395
15396 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
15397 DAG.getConstant(4, dl, MVT::i64));
15398 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
15399 Chain =
15400 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
15401 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
15402 DAG.getRegisterMask(Mask), Chain.getValue(1));
15403 // To match the actual intent better, we should read the output from X15 here
15404 // again (instead of potentially spilling it to the stack), but rereading Size
15405 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
15406 // here.
15407
15408 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
15409 DAG.getConstant(4, dl, MVT::i64));
15410
15411 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15412 Chain = SP.getValue(1);
15413 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15414 if (Align)
15415 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15416 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15417 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
15418
15419 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
15420
15421 SDValue Ops[2] = {SP, Chain};
15422 return DAG.getMergeValues(Ops, dl);
15423}
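// For illustration: the Windows stack-probe protocol used above passes the
// allocation size to the __chkstk-style helper in X15, measured in 16-byte
// units (hence the SRL/SHL by 4 around the call), and only afterwards
// adjusts SP by the byte size, re-aligning it when an over-aligned
// allocation was requested.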
15424
15425SDValue
15426AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
15427 SelectionDAG &DAG) const {
15428 // Get the inputs.
15429 SDNode *Node = Op.getNode();
15430 SDValue Chain = Op.getOperand(0);
15431 SDValue Size = Op.getOperand(1);
15432
15433 MaybeAlign Align =
15434 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
15435 SDLoc dl(Op);
15436 EVT VT = Node->getValueType(0);
15437
15438 // Construct the new SP value in a GPR.
15439 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
15440 Chain = SP.getValue(1);
15441 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
15442 if (Align)
15443 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
15444 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
15445
15446 // Set the real SP to the new value with a probing loop.
15447 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, dl, MVT::Other, Chain, SP);
15448 SDValue Ops[2] = {SP, Chain};
15449 return DAG.getMergeValues(Ops, dl);
15450}
15451
15452SDValue
15453AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
15454 SelectionDAG &DAG) const {
15455 MachineFunction &MF = DAG.getMachineFunction();
15456
15457 if (Subtarget->isTargetWindows())
15458 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
15459 else if (hasInlineStackProbe(MF))
15460 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
15461 else
15462 return SDValue();
15463}
15464
15465SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
15466 unsigned NewOp) const {
15467 if (Subtarget->hasSVE2())
15468 return LowerToPredicatedOp(Op, DAG, NewOp);
15469
15470 // Default to expand.
15471 return SDValue();
15472}
15473
15474SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
15475 SelectionDAG &DAG) const {
15476 EVT VT = Op.getValueType();
15477 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
15478
15479 SDLoc DL(Op);
15480 APInt MulImm = Op.getConstantOperandAPInt(0);
15481 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
15482 VT);
15483}
15484
15485/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
15486template <unsigned NumVecs>
15487static bool
15488setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
15489 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
15490 Info.opc = ISD::INTRINSIC_VOID;
15491 // Retrieve EC from first vector argument.
15492 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
15493 ElementCount EC = VT.getVectorElementCount();
15494#ifndef NDEBUG
15495 // Check the assumption that all input vectors are the same type.
15496 for (unsigned I = 0; I < NumVecs; ++I)
15497 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
15498 "Invalid type.");
15499#endif
15500 // memVT is `NumVecs * VT`.
15501 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
15502 EC * NumVecs);
15503 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
15504 Info.offset = 0;
15505 Info.align.reset();
15506 Info.flags = MachineMemOperand::MOStore;
15507 return true;
15508}
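// For illustration: for a call such as
//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %a,
//                                           <vscale x 4 x i32> %b, ..., ptr %p)
// the code above reports memVT as <vscale x 8 x i32> (EC * NumVecs) with the
// pointer taken from the final argument, so the resulting machine memory
// operand covers the full width of the structured store.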
15509
15510/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
15511/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
15512/// specified in the intrinsic calls.
15513bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
15514 const CallInst &I,
15515 MachineFunction &MF,
15516 unsigned Intrinsic) const {
15517 auto &DL = I.getDataLayout();
15518 switch (Intrinsic) {
15519 case Intrinsic::aarch64_sve_st2:
15520 return setInfoSVEStN<2>(*this, DL, Info, I);
15521 case Intrinsic::aarch64_sve_st3:
15522 return setInfoSVEStN<3>(*this, DL, Info, I);
15523 case Intrinsic::aarch64_sve_st4:
15524 return setInfoSVEStN<4>(*this, DL, Info, I);
15525 case Intrinsic::aarch64_neon_ld2:
15526 case Intrinsic::aarch64_neon_ld3:
15527 case Intrinsic::aarch64_neon_ld4:
15528 case Intrinsic::aarch64_neon_ld1x2:
15529 case Intrinsic::aarch64_neon_ld1x3:
15530 case Intrinsic::aarch64_neon_ld1x4: {
15531 Info.opc = ISD::INTRINSIC_W_CHAIN;
15532 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
15533 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15534 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15535 Info.offset = 0;
15536 Info.align.reset();
15537 // volatile loads with NEON intrinsics not supported
15538 Info.flags = MachineMemOperand::MOLoad;
15539 return true;
15540 }
15541 case Intrinsic::aarch64_neon_ld2lane:
15542 case Intrinsic::aarch64_neon_ld3lane:
15543 case Intrinsic::aarch64_neon_ld4lane:
15544 case Intrinsic::aarch64_neon_ld2r:
15545 case Intrinsic::aarch64_neon_ld3r:
15546 case Intrinsic::aarch64_neon_ld4r: {
15547 Info.opc = ISD::INTRINSIC_W_CHAIN;
15548 // ldx return struct with the same vec type
15549 Type *RetTy = I.getType();
15550 auto *StructTy = cast<StructType>(RetTy);
15551 unsigned NumElts = StructTy->getNumElements();
15552 Type *VecTy = StructTy->getElementType(0);
15553 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15554 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15555 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15556 Info.offset = 0;
15557 Info.align.reset();
15558 // volatile loads with NEON intrinsics not supported
15559 Info.flags = MachineMemOperand::MOLoad;
15560 return true;
15561 }
15562 case Intrinsic::aarch64_neon_st2:
15563 case Intrinsic::aarch64_neon_st3:
15564 case Intrinsic::aarch64_neon_st4:
15565 case Intrinsic::aarch64_neon_st1x2:
15566 case Intrinsic::aarch64_neon_st1x3:
15567 case Intrinsic::aarch64_neon_st1x4: {
15568 Info.opc = ISD::INTRINSIC_VOID;
15569 unsigned NumElts = 0;
15570 for (const Value *Arg : I.args()) {
15571 Type *ArgTy = Arg->getType();
15572 if (!ArgTy->isVectorTy())
15573 break;
15574 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
15575 }
15576 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
15577 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15578 Info.offset = 0;
15579 Info.align.reset();
15580 // volatile stores with NEON intrinsics not supported
15581 Info.flags = MachineMemOperand::MOStore;
15582 return true;
15583 }
15584 case Intrinsic::aarch64_neon_st2lane:
15585 case Intrinsic::aarch64_neon_st3lane:
15586 case Intrinsic::aarch64_neon_st4lane: {
15587 Info.opc = ISD::INTRINSIC_VOID;
15588 unsigned NumElts = 0;
15589 // All the vector arguments have the same type
15590 Type *VecTy = I.getArgOperand(0)->getType();
15591 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
15592
15593 for (const Value *Arg : I.args()) {
15594 Type *ArgTy = Arg->getType();
15595 if (!ArgTy->isVectorTy())
15596 break;
15597 NumElts += 1;
15598 }
15599
15600 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
15601 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
15602 Info.offset = 0;
15603 Info.align.reset();
15604 // volatile stores with NEON intrinsics not supported
15605 Info.flags = MachineMemOperand::MOStore;
15606 return true;
15607 }
15608 case Intrinsic::aarch64_ldaxr:
15609 case Intrinsic::aarch64_ldxr: {
15610 Type *ValTy = I.getParamElementType(0);
15611 Info.opc = ISD::INTRINSIC_W_CHAIN;
15612 Info.memVT = MVT::getVT(ValTy);
15613 Info.ptrVal = I.getArgOperand(0);
15614 Info.offset = 0;
15615 Info.align = DL.getABITypeAlign(ValTy);
15616 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15617 return true;
15618 }
15619 case Intrinsic::aarch64_stlxr:
15620 case Intrinsic::aarch64_stxr: {
15621 Type *ValTy = I.getParamElementType(1);
15622 Info.opc = ISD::INTRINSIC_W_CHAIN;
15623 Info.memVT = MVT::getVT(ValTy);
15624 Info.ptrVal = I.getArgOperand(1);
15625 Info.offset = 0;
15626 Info.align = DL.getABITypeAlign(ValTy);
15627 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15628 return true;
15629 }
15630 case Intrinsic::aarch64_ldaxp:
15631 case Intrinsic::aarch64_ldxp:
15632 Info.opc = ISD::INTRINSIC_W_CHAIN;
15633 Info.memVT = MVT::i128;
15634 Info.ptrVal = I.getArgOperand(0);
15635 Info.offset = 0;
15636 Info.align = Align(16);
15637 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
15638 return true;
15639 case Intrinsic::aarch64_stlxp:
15640 case Intrinsic::aarch64_stxp:
15641 Info.opc = ISD::INTRINSIC_W_CHAIN;
15642 Info.memVT = MVT::i128;
15643 Info.ptrVal = I.getArgOperand(2);
15644 Info.offset = 0;
15645 Info.align = Align(16);
15646 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
15647 return true;
15648 case Intrinsic::aarch64_sve_ldnt1: {
15649 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
15650 Info.opc = ISD::INTRINSIC_W_CHAIN;
15651 Info.memVT = MVT::getVT(I.getType());
15652 Info.ptrVal = I.getArgOperand(1);
15653 Info.offset = 0;
15654 Info.align = DL.getABITypeAlign(ElTy);
15655 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
15656 return true;
15657 }
15658 case Intrinsic::aarch64_sve_stnt1: {
15659 Type *ElTy =
15660 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
15661 Info.opc = ISD::INTRINSIC_W_CHAIN;
15662 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
15663 Info.ptrVal = I.getArgOperand(2);
15664 Info.offset = 0;
15665 Info.align = DL.getABITypeAlign(ElTy);
15666 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
15667 return true;
15668 }
15669 case Intrinsic::aarch64_mops_memset_tag: {
15670 Value *Dst = I.getArgOperand(0);
15671 Value *Val = I.getArgOperand(1);
15672 Info.opc = ISD::INTRINSIC_W_CHAIN;
15673 Info.memVT = MVT::getVT(Val->getType());
15674 Info.ptrVal = Dst;
15675 Info.offset = 0;
15676 Info.align = I.getParamAlign(0).valueOrOne();
15677 Info.flags = MachineMemOperand::MOStore;
15678 // The size of the memory being operated on is unknown at this point
15679 Info.size = MemoryLocation::UnknownSize;
15680 return true;
15681 }
15682 default:
15683 break;
15684 }
15685
15686 return false;
15687}
15688
15689bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
15690 ISD::LoadExtType ExtTy,
15691 EVT NewVT) const {
15692 // TODO: This may be worth removing. Check regression tests for diffs.
15693 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
15694 return false;
15695
15696 // If we're reducing the load width in order to avoid having to use an extra
15697 // instruction to do extension then it's probably a good idea.
15698 if (ExtTy != ISD::NON_EXTLOAD)
15699 return true;
15700 // Don't reduce load width if it would prevent us from combining a shift into
15701 // the offset.
15702 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
15703 assert(Mem);
15704 const SDValue &Base = Mem->getBasePtr();
15705 if (Base.getOpcode() == ISD::ADD &&
15706 Base.getOperand(1).getOpcode() == ISD::SHL &&
15707 Base.getOperand(1).hasOneUse() &&
15708 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
15709 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
15710 if (Mem->getMemoryVT().isScalableVector())
15711 return false;
15712 // The shift can be combined if it matches the size of the value being
15713 // loaded (and so reducing the width would make it not match).
15714 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
15715 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
15716 if (ShiftAmount == Log2_32(LoadBytes))
15717 return false;
15718 }
15719 // We have no reason to disallow reducing the load width, so allow it.
15720 return true;
15721}
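// For illustration: given a 32-bit load from (add %base, (shl %idx, 2)), the
// shift matches the 4-byte access size and folds into an LDR with a scaled
// register offset, so narrowing the load is rejected above; if the shift
// amount did not match the load size there would be nothing to lose by
// reducing the width.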
15722
15723// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
15724bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
15725 EVT VT = Extend.getValueType();
15726 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
15727 SDValue Extract = Extend.getOperand(0);
15728 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
15729 Extract = Extract.getOperand(0);
15730 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
15731 EVT VecVT = Extract.getOperand(0).getValueType();
15732 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
15733 return false;
15734 }
15735 }
15736 return true;
15737}
15738
15739// Truncations from 64-bit GPR to 32-bit GPR is free.
15741 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15742 return false;
15743 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
15744 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
15745 return NumBits1 > NumBits2;
15746}
15747bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
15748 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15749 return false;
15750 uint64_t NumBits1 = VT1.getFixedSizeInBits();
15751 uint64_t NumBits2 = VT2.getFixedSizeInBits();
15752 return NumBits1 > NumBits2;
15753}
15754
15755/// Check if it is profitable to hoist instruction in then/else to if.
15756/// Not profitable if I and its user can form an FMA instruction
15757/// because we prefer FMSUB/FMADD.
15758bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
15759 if (I->getOpcode() != Instruction::FMul)
15760 return true;
15761
15762 if (!I->hasOneUse())
15763 return true;
15764
15765 Instruction *User = I->user_back();
15766
15767 if (!(User->getOpcode() == Instruction::FSub ||
15768 User->getOpcode() == Instruction::FAdd))
15769 return true;
15770
15772 const Function *F = I->getFunction();
15773 const DataLayout &DL = F->getDataLayout();
15774 Type *Ty = User->getOperand(0)->getType();
15775
15776 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
15777 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
15778 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
15779 Options.UnsafeFPMath));
15780}
15781
15782// All 32-bit GPR operations implicitly zero the high-half of the corresponding
15783// 64-bit GPR.
15784bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
15785 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
15786 return false;
15787 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
15788 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
15789 return NumBits1 == 32 && NumBits2 == 64;
15790}
15791bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
15792 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
15793 return false;
15794 unsigned NumBits1 = VT1.getSizeInBits();
15795 unsigned NumBits2 = VT2.getSizeInBits();
15796 return NumBits1 == 32 && NumBits2 == 64;
15797}
15798
15799bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
15800 EVT VT1 = Val.getValueType();
15801 if (isZExtFree(VT1, VT2)) {
15802 return true;
15803 }
15804
15805 if (Val.getOpcode() != ISD::LOAD)
15806 return false;
15807
15808 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
15809 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
15810 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
15811 VT1.getSizeInBits() <= 32);
15812}
15813
15814bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
15815 if (isa<FPExtInst>(Ext))
15816 return false;
15817
15818 // Vector types are not free.
15819 if (Ext->getType()->isVectorTy())
15820 return false;
15821
15822 for (const Use &U : Ext->uses()) {
15823 // The extension is free if we can fold it with a left shift in an
15824 // addressing mode or an arithmetic operation: add, sub, and cmp.
15825
15826 // Is there a shift?
15827 const Instruction *Instr = cast<Instruction>(U.getUser());
15828
15829 // Is this a constant shift?
15830 switch (Instr->getOpcode()) {
15831 case Instruction::Shl:
15832 if (!isa<ConstantInt>(Instr->getOperand(1)))
15833 return false;
15834 break;
15835 case Instruction::GetElementPtr: {
15836 gep_type_iterator GTI = gep_type_begin(Instr);
15837 auto &DL = Ext->getDataLayout();
15838 std::advance(GTI, U.getOperandNo()-1);
15839 Type *IdxTy = GTI.getIndexedType();
15840 // This extension will end up with a shift because of the scaling factor.
15841 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
15842 // Get the shift amount based on the scaling factor:
15843 // log2(sizeof(IdxTy)) - log2(8).
15844 if (IdxTy->isScalableTy())
15845 return false;
15846 uint64_t ShiftAmt =
15847 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
15848 3;
15849 // Is the constant foldable in the shift of the addressing mode?
15850 // I.e., shift amount is between 1 and 4 inclusive.
15851 if (ShiftAmt == 0 || ShiftAmt > 4)
15852 return false;
15853 break;
15854 }
15855 case Instruction::Trunc:
15856 // Check if this is a noop.
15857 // trunc(sext ty1 to ty2) to ty1.
15858 if (Instr->getType() == Ext->getOperand(0)->getType())
15859 continue;
15860 [[fallthrough]];
15861 default:
15862 return false;
15863 }
15864
15865 // At this point we can use the bfm family, so this extension is free
15866 // for that use.
15867 }
15868 return true;
15869}
15870
15871static bool isSplatShuffle(Value *V) {
15872 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
15873 return all_equal(Shuf->getShuffleMask());
15874 return false;
15875}
15876
15877/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
15878/// or upper half of the vector elements.
15879static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
15880 bool AllowSplat = false) {
15881 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
15882 auto *FullTy = FullV->getType();
15883 auto *HalfTy = HalfV->getType();
15884 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
15885 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
15886 };
15887
15888 auto extractHalf = [](Value *FullV, Value *HalfV) {
15889 auto *FullVT = cast<FixedVectorType>(FullV->getType());
15890 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
15891 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
15892 };
15893
15894 ArrayRef<int> M1, M2;
15895 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
15896 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
15897 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
15898 return false;
15899
15900 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
15901 // it is not checked as an extract below.
15902 if (AllowSplat && isSplatShuffle(Op1))
15903 S1Op1 = nullptr;
15904 if (AllowSplat && isSplatShuffle(Op2))
15905 S2Op1 = nullptr;
15906
15907 // Check that the operands are half as wide as the result and we extract
15908 // half of the elements of the input vectors.
15909 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
15910 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
15911 return false;
15912
15913 // Check the mask extracts either the lower or upper half of vector
15914 // elements.
15915 int M1Start = 0;
15916 int M2Start = 0;
15917 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
15918 if ((S1Op1 &&
15919 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
15920 (S2Op1 &&
15921 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
15922 return false;
15923
15924 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
15925 (M2Start != 0 && M2Start != (NumElements / 2)))
15926 return false;
15927 if (S1Op1 && S2Op1 && M1Start != M2Start)
15928 return false;
15929
15930 return true;
15931}
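// For illustration: the pattern recognised above is a pair of shuffles that
// each take the same half (both low or both high) of their inputs, e.g.
//   %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8..15>
//   %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8..15>
// feeding a widening multiply; sinking them next to the use lets instruction
// selection form the high-half variants such as smull2/umull2 directly.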
15932
15933/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
15934/// of the vector elements.
15935static bool areExtractExts(Value *Ext1, Value *Ext2) {
15936 auto areExtDoubled = [](Instruction *Ext) {
15937 return Ext->getType()->getScalarSizeInBits() ==
15938 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
15939 };
15940
15941 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
15942 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
15943 !areExtDoubled(cast<Instruction>(Ext1)) ||
15944 !areExtDoubled(cast<Instruction>(Ext2)))
15945 return false;
15946
15947 return true;
15948}
15949
15950/// Check if Op could be used with vmull_high_p64 intrinsic.
15951static bool isOperandOfVmullHighP64(Value *Op) {
15952 Value *VectorOperand = nullptr;
15953 ConstantInt *ElementIndex = nullptr;
15954 return match(Op, m_ExtractElt(m_Value(VectorOperand),
15955 m_ConstantInt(ElementIndex))) &&
15956 ElementIndex->getValue() == 1 &&
15957 isa<FixedVectorType>(VectorOperand->getType()) &&
15958 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
15959}
15960
15961/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
15962static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
15963 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
15964}
15965
15966static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl<Use *> &Ops) {
15965
15967 // Restrict ourselves to the form CodeGenPrepare typically constructs.
15968 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
15969 if (!GEP || GEP->getNumOperands() != 2)
15970 return false;
15971
15972 Value *Base = GEP->getOperand(0);
15973 Value *Offsets = GEP->getOperand(1);
15974
15975 // We only care about scalar_base+vector_offsets.
15976 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
15977 return false;
15978
15979 // Sink extends that would allow us to use 32-bit offset vectors.
15980 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
15981 auto *OffsetsInst = cast<Instruction>(Offsets);
15982 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
15983 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
15984 Ops.push_back(&GEP->getOperandUse(1));
15985 }
15986
15987 // Sink the GEP.
15988 return true;
15989}
15990
15991/// We want to sink the following cases:
15992/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
15993/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
15994static bool shouldSinkVScale(Value *Op, SmallVectorImpl<Use *> &Ops) {
15995 if (match(Op, m_VScale()))
15996 return true;
15997 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
15998 match(Op, m_Mul(m_VScale(), m_ConstantInt()))) {
15999 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
16000 return true;
16001 }
16002 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
16003 match(Op, m_Mul(m_ZExt(m_VScale()), m_ConstantInt()))) {
16004 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
16005 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
16006 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
16007 return true;
16008 }
16009 return false;
16010}
16011
16012/// Check if sinking \p I's operands to I's basic block is profitable, because
16013/// the operands can be folded into a target instruction, e.g.
16014/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
16015bool AArch64TargetLowering::shouldSinkOperands(
16016 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
16017 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
16018 switch (II->getIntrinsicID()) {
16019 case Intrinsic::aarch64_neon_smull:
16020 case Intrinsic::aarch64_neon_umull:
16021 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
16022 /*AllowSplat=*/true)) {
16023 Ops.push_back(&II->getOperandUse(0));
16024 Ops.push_back(&II->getOperandUse(1));
16025 return true;
16026 }
16027 [[fallthrough]];
16028
16029 case Intrinsic::fma:
16030 if (isa<VectorType>(I->getType()) &&
16031 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
16032 !Subtarget->hasFullFP16())
16033 return false;
16034 [[fallthrough]];
16035 case Intrinsic::aarch64_neon_sqdmull:
16036 case Intrinsic::aarch64_neon_sqdmulh:
16037 case Intrinsic::aarch64_neon_sqrdmulh:
16038 // Sink splats for index lane variants
16039 if (isSplatShuffle(II->getOperand(0)))
16040 Ops.push_back(&II->getOperandUse(0));
16041 if (isSplatShuffle(II->getOperand(1)))
16042 Ops.push_back(&II->getOperandUse(1));
16043 return !Ops.empty();
16044 case Intrinsic::aarch64_neon_fmlal:
16045 case Intrinsic::aarch64_neon_fmlal2:
16046 case Intrinsic::aarch64_neon_fmlsl:
16047 case Intrinsic::aarch64_neon_fmlsl2:
16048 // Sink splats for index lane variants
16049 if (isSplatShuffle(II->getOperand(1)))
16050 Ops.push_back(&II->getOperandUse(1));
16051 if (isSplatShuffle(II->getOperand(2)))
16052 Ops.push_back(&II->getOperandUse(2));
16053 return !Ops.empty();
16054 case Intrinsic::aarch64_sve_ptest_first:
16055 case Intrinsic::aarch64_sve_ptest_last:
16056 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
16057 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
16058 Ops.push_back(&II->getOperandUse(0));
16059 return !Ops.empty();
16060 case Intrinsic::aarch64_sme_write_horiz:
16061 case Intrinsic::aarch64_sme_write_vert:
16062 case Intrinsic::aarch64_sme_writeq_horiz:
16063 case Intrinsic::aarch64_sme_writeq_vert: {
16064 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
16065 if (!Idx || Idx->getOpcode() != Instruction::Add)
16066 return false;
16067 Ops.push_back(&II->getOperandUse(1));
16068 return true;
16069 }
16070 case Intrinsic::aarch64_sme_read_horiz:
16071 case Intrinsic::aarch64_sme_read_vert:
16072 case Intrinsic::aarch64_sme_readq_horiz:
16073 case Intrinsic::aarch64_sme_readq_vert:
16074 case Intrinsic::aarch64_sme_ld1b_vert:
16075 case Intrinsic::aarch64_sme_ld1h_vert:
16076 case Intrinsic::aarch64_sme_ld1w_vert:
16077 case Intrinsic::aarch64_sme_ld1d_vert:
16078 case Intrinsic::aarch64_sme_ld1q_vert:
16079 case Intrinsic::aarch64_sme_st1b_vert:
16080 case Intrinsic::aarch64_sme_st1h_vert:
16081 case Intrinsic::aarch64_sme_st1w_vert:
16082 case Intrinsic::aarch64_sme_st1d_vert:
16083 case Intrinsic::aarch64_sme_st1q_vert:
16084 case Intrinsic::aarch64_sme_ld1b_horiz:
16085 case Intrinsic::aarch64_sme_ld1h_horiz:
16086 case Intrinsic::aarch64_sme_ld1w_horiz:
16087 case Intrinsic::aarch64_sme_ld1d_horiz:
16088 case Intrinsic::aarch64_sme_ld1q_horiz:
16089 case Intrinsic::aarch64_sme_st1b_horiz:
16090 case Intrinsic::aarch64_sme_st1h_horiz:
16091 case Intrinsic::aarch64_sme_st1w_horiz:
16092 case Intrinsic::aarch64_sme_st1d_horiz:
16093 case Intrinsic::aarch64_sme_st1q_horiz: {
16094 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
16095 if (!Idx || Idx->getOpcode() != Instruction::Add)
16096 return false;
16097 Ops.push_back(&II->getOperandUse(3));
16098 return true;
16099 }
16100 case Intrinsic::aarch64_neon_pmull:
16101 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
16102 return false;
16103 Ops.push_back(&II->getOperandUse(0));
16104 Ops.push_back(&II->getOperandUse(1));
16105 return true;
16106 case Intrinsic::aarch64_neon_pmull64:
16107 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
16108 II->getArgOperand(1)))
16109 return false;
16110 Ops.push_back(&II->getArgOperandUse(0));
16111 Ops.push_back(&II->getArgOperandUse(1));
16112 return true;
16113 case Intrinsic::masked_gather:
16114 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
16115 return false;
16116 Ops.push_back(&II->getArgOperandUse(0));
16117 return true;
16118 case Intrinsic::masked_scatter:
16119 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
16120 return false;
16121 Ops.push_back(&II->getArgOperandUse(1));
16122 return true;
16123 default:
16124 return false;
16125 }
16126 }
16127
16128 // Sink vscales closer to uses for better isel
16129 switch (I->getOpcode()) {
16130 case Instruction::GetElementPtr:
16131 case Instruction::Add:
16132 case Instruction::Sub:
16133 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
16134 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
16135 Ops.push_back(&I->getOperandUse(Op));
16136 return true;
16137 }
16138 }
16139 break;
16140 default:
16141 break;
16142 }
16143
16144 if (!I->getType()->isVectorTy())
16145 return false;
16146
16147 switch (I->getOpcode()) {
16148 case Instruction::Sub:
16149 case Instruction::Add: {
16150 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
16151 return false;
16152
16153 // If the exts' operands extract either the lower or upper elements, we
16154 // can sink them too.
16155 auto Ext1 = cast<Instruction>(I->getOperand(0));
16156 auto Ext2 = cast<Instruction>(I->getOperand(1));
16157 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
16158 Ops.push_back(&Ext1->getOperandUse(0));
16159 Ops.push_back(&Ext2->getOperandUse(0));
16160 }
16161
16162 Ops.push_back(&I->getOperandUse(0));
16163 Ops.push_back(&I->getOperandUse(1));
16164
16165 return true;
16166 }
16167 case Instruction::Or: {
16168 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
16169 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
16170 if (Subtarget->hasNEON()) {
16171 Instruction *OtherAnd, *IA, *IB;
16172 Value *MaskValue;
16173 // MainAnd refers to And instruction that has 'Not' as one of its operands
16174 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
16175 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
16176 m_Instruction(IA)))))) {
16177 if (match(OtherAnd,
16178 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
16179 Instruction *MainAnd = I->getOperand(0) == OtherAnd
16180 ? cast<Instruction>(I->getOperand(1))
16181 : cast<Instruction>(I->getOperand(0));
16182
16183 // Both Ands should be in same basic block as Or
16184 if (I->getParent() != MainAnd->getParent() ||
16185 I->getParent() != OtherAnd->getParent())
16186 return false;
16187
16188 // Non-mask operands of both Ands should also be in same basic block
16189 if (I->getParent() != IA->getParent() ||
16190 I->getParent() != IB->getParent())
16191 return false;
16192
16193 Ops.push_back(&MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
16194 Ops.push_back(&I->getOperandUse(0));
16195 Ops.push_back(&I->getOperandUse(1));
16196
16197 return true;
16198 }
16199 }
16200 }
16201
16202 return false;
16203 }
16204 case Instruction::Mul: {
16205 int NumZExts = 0, NumSExts = 0;
16206 for (auto &Op : I->operands()) {
16207 // Make sure we are not already sinking this operand
16208 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
16209 continue;
16210
16211 if (match(&Op, m_SExt(m_Value()))) {
16212 NumSExts++;
16213 continue;
16214 } else if (match(&Op, m_ZExt(m_Value()))) {
16215 NumZExts++;
16216 continue;
16217 }
16218
16219 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
16220
16221 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
16222 // operand and the s/zext can help create indexed s/umull. This is
16223 // especially useful to prevent i64 mul being scalarized.
16224 if (Shuffle && isSplatShuffle(Shuffle) &&
16225 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
16226 Ops.push_back(&Shuffle->getOperandUse(0));
16227 Ops.push_back(&Op);
16228 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
16229 NumSExts++;
16230 else
16231 NumZExts++;
16232 continue;
16233 }
16234
16235 if (!Shuffle)
16236 continue;
16237
16238 Value *ShuffleOperand = Shuffle->getOperand(0);
16239 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
16240 if (!Insert)
16241 continue;
16242
16243 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
16244 if (!OperandInstr)
16245 continue;
16246
16247 ConstantInt *ElementConstant =
16248 dyn_cast<ConstantInt>(Insert->getOperand(2));
16249 // Check that the insertelement is inserting into element 0
16250 if (!ElementConstant || !ElementConstant->isZero())
16251 continue;
16252
16253 unsigned Opcode = OperandInstr->getOpcode();
16254 if (Opcode == Instruction::SExt)
16255 NumSExts++;
16256 else if (Opcode == Instruction::ZExt)
16257 NumZExts++;
16258 else {
16259 // If we find that the top bits are known 0, then we can sink and allow
16260 // the backend to generate a umull.
16261 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
16262 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
16263 const DataLayout &DL = I->getDataLayout();
16264 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
16265 continue;
16266 NumZExts++;
16267 }
16268
16269 Ops.push_back(&Shuffle->getOperandUse(0));
16270 Ops.push_back(&Op);
16271 }
16272
16273 // It is only profitable to sink if we found two extends of the same type.
16274 return !Ops.empty() && (NumSExts == 2 || NumZExts == 2);
16275 }
16276 default:
16277 return false;
16278 }
16279 return false;
16280}
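// For illustration of the Mul case above: when one multiplicand is a splat
// shuffle of a sign/zero-extended scalar and the other is a matching extend,
// sinking both next to the multiply lets instruction selection form an
// indexed widening multiply (smull/umull by lane) instead of scalarising,
// which matters most for i64 vector multiplies.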
16281
16282static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
16283 unsigned NumElts, bool IsLittleEndian,
16284 SmallVectorImpl<int> &Mask) {
16285 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth >= 64)
16286 return false;
16287
16288 assert(DstWidth % SrcWidth == 0 &&
16289 "TBL lowering is not supported for a conversion instruction with this "
16290 "source and destination element type.");
16291
16292 unsigned Factor = DstWidth / SrcWidth;
16293 unsigned MaskLen = NumElts * Factor;
16294
16295 Mask.clear();
16296 Mask.resize(MaskLen, NumElts);
16297
16298 unsigned SrcIndex = 0;
16299 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
16300 Mask[I] = SrcIndex++;
16301
16302 return true;
16303}
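// Worked example (little endian): zero-extending <4 x i8> to <4 x i32> gives
// Factor = 4 and MaskLen = 16, and the mask built above is
//   {0,4,4,4, 1,4,4,4, 2,4,4,4, 3,4,4,4}
// where index 4 (== NumElts) selects the known-zero lane inserted by the
// callers below, so each source byte lands in the low byte of its i32 lane.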
16304
16305static Value *createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op,
16306 FixedVectorType *ZExtTy,
16307 FixedVectorType *DstTy,
16308 bool IsLittleEndian) {
16309 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16310 unsigned NumElts = SrcTy->getNumElements();
16311 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16312 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16313
16314 SmallVector<int> Mask;
16315 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
16316 return nullptr;
16317
16318 auto *FirstEltZero = Builder.CreateInsertElement(
16319 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16320 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16321 Result = Builder.CreateBitCast(Result, DstTy);
16322 if (DstTy != ZExtTy)
16323 Result = Builder.CreateZExt(Result, ZExtTy);
16324 return Result;
16325}
16326
16327static Value *createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op,
16328 FixedVectorType *DstTy,
16329 bool IsLittleEndian) {
16330 auto *SrcTy = cast<FixedVectorType>(Op->getType());
16331 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16332 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16333
16334 SmallVector<int> Mask;
16335 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
16336 !IsLittleEndian, Mask))
16337 return nullptr;
16338
16339 auto *FirstEltZero = Builder.CreateInsertElement(
16340 PoisonValue::get(SrcTy), Builder.getInt8(0), uint64_t(0));
16341
16342 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
16343}
16344
16345static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
16346 IRBuilder<> Builder(TI);
16347 SmallVector<Value *> Parts;
16348 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
16349 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
16350 auto *DstTy = cast<FixedVectorType>(TI->getType());
16351 assert(SrcTy->getElementType()->isIntegerTy() &&
16352 "Non-integer type source vector element is not supported");
16353 assert(DstTy->getElementType()->isIntegerTy(8) &&
16354 "Unsupported destination vector element type");
16355 unsigned SrcElemTySz =
16356 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
16357 unsigned DstElemTySz =
16358 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
16359 assert((SrcElemTySz % DstElemTySz == 0) &&
16360 "Cannot lower truncate to tbl instructions for a source element size "
16361 "that is not divisible by the destination element size");
16362 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
16363 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
16364 "Unsupported source vector element type size");
16365 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
16366
16367 // Create a mask to choose every nth byte from the source vector table of
16368 // bytes to create the truncated destination vector, where 'n' is the truncate
16369 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
16370 // 0,8,16,..Y*8th bytes for the little-endian format
16371 SmallVector<Constant *, 16> MaskConst;
16372 for (int Itr = 0; Itr < 16; Itr++) {
16373 if (Itr < NumElements)
16374 MaskConst.push_back(Builder.getInt8(
16375 IsLittleEndian ? Itr * TruncFactor
16376 : Itr * TruncFactor + (TruncFactor - 1)));
16377 else
16378 MaskConst.push_back(Builder.getInt8(255));
16379 }
16380
16381 int MaxTblSz = 128 * 4;
16382 int MaxSrcSz = SrcElemTySz * NumElements;
16383 int ElemsPerTbl =
16384 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
16385 assert(ElemsPerTbl <= 16 &&
16386 "Maximum elements selected using TBL instruction cannot exceed 16!");
16387
16388 int ShuffleCount = 128 / SrcElemTySz;
16389 SmallVector<int> ShuffleLanes;
16390 for (int i = 0; i < ShuffleCount; ++i)
16391 ShuffleLanes.push_back(i);
16392
16393 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
16394 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
16395 // call TBL & save the result in a vector of TBL results for combining later.
16396 SmallVector<Value *> Results;
16397 while (ShuffleLanes.back() < NumElements) {
16398 Parts.push_back(Builder.CreateBitCast(
16399 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
16400
16401 if (Parts.size() == 4) {
16402 auto *F = Intrinsic::getDeclaration(TI->getModule(),
16403 Intrinsic::aarch64_neon_tbl4, VecTy);
16404 Parts.push_back(ConstantVector::get(MaskConst));
16405 Results.push_back(Builder.CreateCall(F, Parts));
16406 Parts.clear();
16407 }
16408
16409 for (int i = 0; i < ShuffleCount; ++i)
16410 ShuffleLanes[i] += ShuffleCount;
16411 }
16412
16413 assert((Parts.empty() || Results.empty()) &&
16414 "Lowering trunc for vectors requiring different TBL instructions is "
16415 "not supported!");
16416 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
16417 // registers
16418 if (!Parts.empty()) {
16419 Intrinsic::ID TblID;
16420 switch (Parts.size()) {
16421 case 1:
16422 TblID = Intrinsic::aarch64_neon_tbl1;
16423 break;
16424 case 2:
16425 TblID = Intrinsic::aarch64_neon_tbl2;
16426 break;
16427 case 3:
16428 TblID = Intrinsic::aarch64_neon_tbl3;
16429 break;
16430 }
16431
16432 auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy);
16433 Parts.push_back(ConstantVector::get(MaskConst));
16434 Results.push_back(Builder.CreateCall(F, Parts));
16435 }
16436
16437 // Extract the destination vector from TBL result(s) after combining them
16438 // where applicable. Currently, at most two TBLs are supported.
16439 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
16440 "more than 2 tbl instructions!");
16441 Value *FinalResult = Results[0];
16442 if (Results.size() == 1) {
16443 if (ElemsPerTbl < 16) {
16444 SmallVector<int> FinalMask(ElemsPerTbl);
16445 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16446 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
16447 }
16448 } else {
16449 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
16450 if (ElemsPerTbl < 16) {
16451 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
16452 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
16453 } else {
16454 std::iota(FinalMask.begin(), FinalMask.end(), 0);
16455 }
16456 FinalResult =
16457 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
16458 }
16459
16460 TI->replaceAllUsesWith(FinalResult);
16461 TI->eraseFromParent();
16462}
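// Worked example (little endian): for "trunc <16 x i32> %x to <16 x i8>" the
// source is split into four 128-bit shuffles that become the tbl4 table
// registers, and MaskConst selects bytes {0,4,8,...,60}, i.e. the low byte of
// every i32 lane, so the whole <16 x i8> result comes from a single tbl4.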
16463
16464bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
16465 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
16466 // shuffle_vector instructions are serialized when targeting SVE,
16467 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
16468 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
16469 return false;
16470
16471 // Try to optimize conversions using tbl. This requires materializing constant
16472 // index vectors, which can increase code size and add loads. Skip the
16473 // transform unless the conversion is in a loop block guaranteed to execute
16474 // and we are not optimizing for size.
16475 Function *F = I->getParent()->getParent();
16476 if (!L || L->getHeader() != I->getParent() || F->hasMinSize() ||
16477 F->hasOptSize())
16478 return false;
16479
16480 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
16481 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
16482 if (!SrcTy || !DstTy)
16483 return false;
16484
16485 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
16486 // lowered to tbl instructions to insert the original i8 elements
16487 // into i8x lanes. This is enabled for cases where it is beneficial.
16488 auto *ZExt = dyn_cast<ZExtInst>(I);
16489 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
16490 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
16491 if (DstWidth % 8 != 0)
16492 return false;
16493
16494 auto *TruncDstType =
16495 cast<FixedVectorType>(VectorType::getTruncatedElementVectorType(DstTy));
16496 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
16497 // the remaining ZExt folded into the user, don't use tbl lowering.
16498 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
16499 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
16502 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
16503 return false;
16504
16505 DstTy = TruncDstType;
16506 }
16507 IRBuilder<> Builder(ZExt);
16508 Value *Result = createTblShuffleForZExt(
16509 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
16510 DstTy, Subtarget->isLittleEndian());
16511 if (!Result)
16512 return false;
16513 ZExt->replaceAllUsesWith(Result);
16514 ZExt->eraseFromParent();
16515 return true;
16516 }
16517
16518 auto *UIToFP = dyn_cast<UIToFPInst>(I);
16519 if (UIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16520 DstTy->getElementType()->isFloatTy()) {
16521 IRBuilder<> Builder(I);
16522 auto *ZExt = createTblShuffleForZExt(
16523 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
16524 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
16525 assert(ZExt && "Cannot fail for the i8 to float conversion");
16526 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
16527 I->replaceAllUsesWith(UI);
16528 I->eraseFromParent();
16529 return true;
16530 }
16531
16532 auto *SIToFP = dyn_cast<SIToFPInst>(I);
16533 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
16534 DstTy->getElementType()->isFloatTy()) {
16535 IRBuilder<> Builder(I);
16536 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
16537 FixedVectorType::getInteger(DstTy),
16538 Subtarget->isLittleEndian());
16539 assert(Shuffle && "Cannot fail for the i8 to float conversion");
16540 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
16541 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
16542 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
16543 I->replaceAllUsesWith(SI);
16544 I->eraseFromParent();
16545 return true;
16546 }
16547
16548 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
16549 // followed by a truncate lowered to using tbl.4.
16550 auto *FPToUI = dyn_cast<FPToUIInst>(I);
16551 if (FPToUI &&
16552 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
16553 SrcTy->getElementType()->isFloatTy() &&
16554 DstTy->getElementType()->isIntegerTy(8)) {
16555 IRBuilder<> Builder(I);
16556 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
16557 VectorType::getInteger(SrcTy));
16558 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
16559 I->replaceAllUsesWith(TruncI);
16560 I->eraseFromParent();
16561 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
16562 return true;
16563 }
16564
16565 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
16566 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
16567 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
16568 // registers
16569 auto *TI = dyn_cast<TruncInst>(I);
16570 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
16571 ((SrcTy->getElementType()->isIntegerTy(32) ||
16572 SrcTy->getElementType()->isIntegerTy(64)) &&
16573 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
16574 createTblForTrunc(TI, Subtarget->isLittleEndian());
16575 return true;
16576 }
16577
16578 return false;
16579}
16580
16581bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
16582 Align &RequiredAligment) const {
16583 if (!LoadedType.isSimple() ||
16584 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
16585 return false;
16586 // Cyclone supports unaligned accesses.
16587 RequiredAligment = Align(1);
16588 unsigned NumBits = LoadedType.getSizeInBits();
16589 return NumBits == 32 || NumBits == 64;
16590}
16591
16592/// A helper function for determining the number of interleaved accesses we
16593/// will generate when lowering accesses of the given type.
16594unsigned AArch64TargetLowering::getNumInterleavedAccesses(
16595 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
16596 unsigned VecSize = 128;
16597 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16598 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
16599 if (UseScalable && isa<FixedVectorType>(VecTy))
16600 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16601 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
16602}
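// For illustration: with NEON, a <16 x i32> member of an interleaved group is
// 512 bits, so the computation above returns (16*32 + 127) / 128 = 4 and the
// group is lowered as four 128-bit ldN/stN accesses; fixed-length SVE instead
// divides by the configured minimum SVE register size.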
16603
16604MachineMemOperand::Flags
16605AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
16606 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
16607 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
16608 return MOStridedAccess;
16609 return MachineMemOperand::MONone;
16610}
16611
16612bool AArch64TargetLowering::isLegalInterleavedAccessType(
16613 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
16614 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
16615 auto EC = VecTy->getElementCount();
16616 unsigned MinElts = EC.getKnownMinValue();
16617
16618 UseScalable = false;
16619
16620 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
16621 (!Subtarget->useSVEForFixedLengthVectors() ||
16622 !getSVEPredPatternFromNumElements(MinElts)))
16623 return false;
16624
16625 if (isa<ScalableVectorType>(VecTy) &&
16626 !Subtarget->isSVEorStreamingSVEAvailable())
16627 return false;
16628
16629 // Ensure the number of vector elements is greater than 1.
16630 if (MinElts < 2)
16631 return false;
16632
16633 // Ensure the element type is legal.
16634 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
16635 return false;
16636
16637 if (EC.isScalable()) {
16638 UseScalable = true;
16639 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
16640 }
16641
16642 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16643 if (Subtarget->useSVEForFixedLengthVectors()) {
16644 unsigned MinSVEVectorSize =
16645 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16646 if (VecSize % MinSVEVectorSize == 0 ||
16647 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16648 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
16649 UseScalable = true;
16650 return true;
16651 }
16652 }
16653
16654 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
16655 // 128 will be split into multiple interleaved accesses.
16656 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
16657}
16658
16659static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
16660 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
16661 return ScalableVectorType::get(VTy->getElementType(), 2);
16662
16663 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
16664 return ScalableVectorType::get(VTy->getElementType(), 4);
16665
16666 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
16667 return ScalableVectorType::get(VTy->getElementType(), 8);
16668
16669 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
16670 return ScalableVectorType::get(VTy->getElementType(), 8);
16671
16672 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
16673 return ScalableVectorType::get(VTy->getElementType(), 2);
16674
16675 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
16676 return ScalableVectorType::get(VTy->getElementType(), 4);
16677
16678 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
16679 return ScalableVectorType::get(VTy->getElementType(), 8);
16680
16681 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
16682 return ScalableVectorType::get(VTy->getElementType(), 16);
16683
16684 llvm_unreachable("Cannot handle input vector type");
16685}
16686
16687static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
16688 bool Scalable, Type *LDVTy,
16689 Type *PtrTy) {
16690 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16691 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
16692 Intrinsic::aarch64_sve_ld3_sret,
16693 Intrinsic::aarch64_sve_ld4_sret};
16694 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
16695 Intrinsic::aarch64_neon_ld3,
16696 Intrinsic::aarch64_neon_ld4};
16697 if (Scalable)
16698 return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
16699
16700 return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
16701}
16702
16703static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
16704 bool Scalable, Type *STVTy,
16705 Type *PtrTy) {
16706 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
16707 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
16708 Intrinsic::aarch64_sve_st3,
16709 Intrinsic::aarch64_sve_st4};
16710 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
16711 Intrinsic::aarch64_neon_st3,
16712 Intrinsic::aarch64_neon_st4};
16713 if (Scalable)
16714 return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
16715
16716 return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
16717}
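// For example, a fixed-length factor-3 store requests a declaration of
// llvm.aarch64.neon.st3 overloaded on the sub-vector and pointer types, while
// the scalable form requests llvm.aarch64.sve.st3 overloaded only on the
// vector type; the predicate and pointer are passed as plain operands by the
// callers below.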
16718
16719/// Lower an interleaved load into a ldN intrinsic.
16720///
16721/// E.g. Lower an interleaved load (Factor = 2):
16722/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
16723/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
16724/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
16725///
16726/// Into:
16727/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
16728/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
16729/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
16730 bool AArch64TargetLowering::lowerInterleavedLoad(
16731 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
16732 ArrayRef<unsigned> Indices, unsigned Factor) const {
16733 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16734 "Invalid interleave factor");
16735 assert(!Shuffles.empty() && "Empty shufflevector input");
16736 assert(Shuffles.size() == Indices.size() &&
16737 "Unmatched number of shufflevectors and indices");
16738
16739 const DataLayout &DL = LI->getDataLayout();
16740
16741 VectorType *VTy = Shuffles[0]->getType();
16742
16743 // Skip if we do not have NEON and skip illegal vector types. We can
16744 // "legalize" wide vector types into multiple interleaved accesses as long as
16745 // the vector types are divisible by 128.
16746 bool UseScalable;
16747 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16748 return false;
16749
16750 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
16751
16752 auto *FVTy = cast<FixedVectorType>(VTy);
16753
16754 // A pointer vector can not be the return type of the ldN intrinsics. Need to
16755 // load integer vectors first and then convert to pointer vectors.
16756 Type *EltTy = FVTy->getElementType();
16757 if (EltTy->isPointerTy())
16758 FVTy =
16759 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
16760
16761 // If we're going to generate more than one load, reset the sub-vector type
16762 // to something legal.
16763 FVTy = FixedVectorType::get(FVTy->getElementType(),
16764 FVTy->getNumElements() / NumLoads);
16765
16766 auto *LDVTy =
16767 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
16768
16769 IRBuilder<> Builder(LI);
16770
16771 // The base address of the load.
16772 Value *BaseAddr = LI->getPointerOperand();
16773
16774 Type *PtrTy = LI->getPointerOperandType();
16775 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
16776 LDVTy->getElementCount());
16777
16778 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
16779 UseScalable, LDVTy, PtrTy);
16780
16781 // Holds sub-vectors extracted from the load intrinsic return values. The
16782 // sub-vectors are associated with the shufflevector instructions they will
16783 // replace.
16784 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
16785
16786 Value *PTrue = nullptr;
16787 if (UseScalable) {
16788 std::optional<unsigned> PgPattern =
16789 getSVEPredPatternFromNumElements(FVTy->getNumElements());
16790 if (Subtarget->getMinSVEVectorSizeInBits() ==
16791 Subtarget->getMaxSVEVectorSizeInBits() &&
16792 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
16793 PgPattern = AArch64SVEPredPattern::all;
16794
16795 auto *PTruePat =
16796 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
16797 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16798 {PTruePat});
16799 }
16800
16801 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
16802
16803 // If we're generating more than one load, compute the base address of
16804 // subsequent loads as an offset from the previous.
16805 if (LoadCount > 0)
16806 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
16807 FVTy->getNumElements() * Factor);
16808
16809 CallInst *LdN;
16810 if (UseScalable)
16811 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
16812 else
16813 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
16814
16815 // Extract and store the sub-vectors returned by the load intrinsic.
16816 for (unsigned i = 0; i < Shuffles.size(); i++) {
16817 ShuffleVectorInst *SVI = Shuffles[i];
16818 unsigned Index = Indices[i];
16819
16820 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
16821
16822 if (UseScalable)
16823 SubVec = Builder.CreateExtractVector(
16824 FVTy, SubVec,
16825 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
16826
16827 // Convert the integer vector to pointer vector if the element is pointer.
16828 if (EltTy->isPointerTy())
16829 SubVec = Builder.CreateIntToPtr(
16830 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
16831 FVTy->getNumElements()));
16832
16833 SubVecs[SVI].push_back(SubVec);
16834 }
16835 }
16836
16837 // Replace uses of the shufflevector instructions with the sub-vectors
16838 // returned by the load intrinsic. If a shufflevector instruction is
16839 // associated with more than one sub-vector, those sub-vectors will be
16840 // concatenated into a single wide vector.
16841 for (ShuffleVectorInst *SVI : Shuffles) {
16842 auto &SubVec = SubVecs[SVI];
16843 auto *WideVec =
16844 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
16845 SVI->replaceAllUsesWith(WideVec);
16846 }
16847
16848 return true;
16849}
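// A rough sketch of the NumLoads > 1 path above: if each shufflevector
// produces <8 x i32> (a 256-bit type) with Factor = 2, the access is split
// into two ld2 calls on <4 x i32> sub-vectors, the base address is advanced by
// 4 * Factor = 8 elements between them, and every shufflevector is finally
// replaced by the concatenation of its two sub-vectors.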
16850
16851template <typename Iter>
16852bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
16853 int MaxLookupDist = 20;
16854 unsigned IdxWidth = DL.getIndexSizeInBits(0);
16855 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
16856 const Value *PtrA1 =
16857 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
16858
16859 while (++It != End) {
16860 if (It->isDebugOrPseudoInst())
16861 continue;
16862 if (MaxLookupDist-- == 0)
16863 break;
16864 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
16865 const Value *PtrB1 =
16866 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
16867 DL, OffsetB);
16868 if (PtrA1 == PtrB1 &&
16869 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
16870 .abs() == 16)
16871 return true;
16872 }
16873 }
16874
16875 return false;
16876}
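// For example, given "store i64 %a, ptr %p" followed within the 20-instruction
// scan window by "store i64 %b, ptr %q" where %q strips to %p plus a constant
// offset of exactly 16 bytes, the walk above reports a nearby paired store.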
16877
16878/// Lower an interleaved store into a stN intrinsic.
16879///
16880/// E.g. Lower an interleaved store (Factor = 3):
16881/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
16882/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
16883/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16884///
16885/// Into:
16886/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
16887/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
16888/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
16889/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16890///
16891/// Note that the new shufflevectors will be removed and we'll only generate one
16892/// st3 instruction in CodeGen.
16893///
16894/// Example for a more general valid mask (Factor 3). Lower:
16895/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
16896/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
16897/// store <12 x i32> %i.vec, <12 x i32>* %ptr
16898///
16899/// Into:
16900/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
16901/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
16902/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
16903/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
16904 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
16905 ShuffleVectorInst *SVI,
16906 unsigned Factor) const {
16907
16908 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
16909 "Invalid interleave factor");
16910
16911 auto *VecTy = cast<FixedVectorType>(SVI->getType());
16912 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
16913
16914 unsigned LaneLen = VecTy->getNumElements() / Factor;
16915 Type *EltTy = VecTy->getElementType();
16916 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
16917
16918 const DataLayout &DL = SI->getDataLayout();
16919 bool UseScalable;
16920
16921 // Skip if we do not have NEON and skip illegal vector types. We can
16922 // "legalize" wide vector types into multiple interleaved accesses as long as
16923 // the vector types are divisible by 128.
16924 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16925 return false;
16926
16927 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
16928
16929 Value *Op0 = SVI->getOperand(0);
16930 Value *Op1 = SVI->getOperand(1);
16931 IRBuilder<> Builder(SI);
16932
16933 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
16934 // vectors to integer vectors.
16935 if (EltTy->isPointerTy()) {
16936 Type *IntTy = DL.getIntPtrType(EltTy);
16937 unsigned NumOpElts =
16938 cast<FixedVectorType>(Op0->getType())->getNumElements();
16939
16940 // Convert to the corresponding integer vector.
16941 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
16942 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
16943 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
16944
16945 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
16946 }
16947
16948 // If we're going to generate more than one store, reset the lane length
16949 // and sub-vector type to something legal.
16950 LaneLen /= NumStores;
16951 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
16952
16953 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
16954 : SubVecTy;
16955
16956 // The base address of the store.
16957 Value *BaseAddr = SI->getPointerOperand();
16958
16959 auto Mask = SVI->getShuffleMask();
16960
16961 // Sanity check: bail out if none of the indices are in range.
16962 // If the mask is `poison`, `Mask` may be a vector of -1s.
16963 // If all of them are `poison`, an out-of-bounds read would happen later.
16964 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
16965 return false;
16966 }
16967 // A 64-bit st2 which does not start at element 0 will involve adding extra
16968 // ext elements, making the st2 unprofitable. Likewise, if there is a nearby
16969 // store that points to BaseAddr+16 or BaseAddr-16, it can be better left as a
16970 // zip;stp pair, which has higher throughput.
16971 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
16972 (Mask[0] != 0 ||
16973 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
16974 DL) ||
16975 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
16976 BaseAddr, DL)))
16977 return false;
16978
16979 Type *PtrTy = SI->getPointerOperandType();
16980 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
16981 STVTy->getElementCount());
16982
16983 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
16984 UseScalable, STVTy, PtrTy);
16985
16986 Value *PTrue = nullptr;
16987 if (UseScalable) {
16988 std::optional<unsigned> PgPattern =
16989 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
16990 if (Subtarget->getMinSVEVectorSizeInBits() ==
16991 Subtarget->getMaxSVEVectorSizeInBits() &&
16992 Subtarget->getMinSVEVectorSizeInBits() ==
16993 DL.getTypeSizeInBits(SubVecTy))
16994 PgPattern = AArch64SVEPredPattern::all;
16995
16996 auto *PTruePat =
16997 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
16998 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
16999 {PTruePat});
17000 }
17001
17002 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
17003
17004 SmallVector<Value *, 5> Ops;
17005
17006 // Split the shufflevector operands into sub vectors for the new stN call.
17007 for (unsigned i = 0; i < Factor; i++) {
17008 Value *Shuffle;
17009 unsigned IdxI = StoreCount * LaneLen * Factor + i;
17010 if (Mask[IdxI] >= 0) {
17011 Shuffle = Builder.CreateShuffleVector(
17012 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
17013 } else {
17014 unsigned StartMask = 0;
17015 for (unsigned j = 1; j < LaneLen; j++) {
17016 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
17017 if (Mask[IdxJ] >= 0) {
17018 StartMask = Mask[IdxJ] - j;
17019 break;
17020 }
17021 }
17022 // Note: Filling undef gaps with random elements is ok, since
17023 // those elements were being written anyway (with undefs).
17024 // In the case of all undefs we're defaulting to using elems from 0
17025 // Note: StartMask cannot be negative, it's checked in
17026 // isReInterleaveMask
17027 Shuffle = Builder.CreateShuffleVector(
17028 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
17029 }
17030
17031 if (UseScalable)
17032 Shuffle = Builder.CreateInsertVector(
17033 STVTy, UndefValue::get(STVTy), Shuffle,
17034 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
17035
17036 Ops.push_back(Shuffle);
17037 }
17038
17039 if (UseScalable)
17040 Ops.push_back(PTrue);
17041
17042 // If we're generating more than one store, compute the base address of
17043 // subsequent stores as an offset from the previous one.
17044 if (StoreCount > 0)
17045 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
17046 BaseAddr, LaneLen * Factor);
17047
17048 Ops.push_back(BaseAddr);
17049 Builder.CreateCall(StNFunc, Ops);
17050 }
17051 return true;
17052}
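// As an illustration of the 64-bit bail-out above: a Factor = 2 store of
// <2 x i32> sub-vectors whose mask does not start at element 0, or which has a
// store of the adjacent 16 bytes nearby, is deliberately left untouched so the
// backend can form a zip and a store-pair instead of a less profitable st2.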
17053
17054 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
17055 IntrinsicInst *DI, LoadInst *LI) const {
17056 // Only deinterleave2 supported at present.
17057 if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
17058 return false;
17059
17060 // Only a factor of 2 supported at present.
17061 const unsigned Factor = 2;
17062
17063 VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
17064 const DataLayout &DL = DI->getDataLayout();
17065 bool UseScalable;
17066 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17067 return false;
17068
17069 // TODO: Add support for using SVE instructions with fixed types later, using
17070 // the code from lowerInterleavedLoad to obtain the correct container type.
17071 if (UseScalable && !VTy->isScalableTy())
17072 return false;
17073
17074 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17075
17076 VectorType *LdTy =
17077 VectorType::get(VTy->getElementType(),
17078 VTy->getElementCount().divideCoefficientBy(NumLoads));
17079
17080 Type *PtrTy = LI->getPointerOperandType();
17081 Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
17082 UseScalable, LdTy, PtrTy);
17083
17084 IRBuilder<> Builder(LI);
17085
17086 Value *Pred = nullptr;
17087 if (UseScalable)
17088 Pred =
17089 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
17090
17091 Value *BaseAddr = LI->getPointerOperand();
17092 Value *Result;
17093 if (NumLoads > 1) {
17094 Value *Left = PoisonValue::get(VTy);
17095 Value *Right = PoisonValue::get(VTy);
17096
17097 for (unsigned I = 0; I < NumLoads; ++I) {
17098 Value *Offset = Builder.getInt64(I * Factor);
17099
17100 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
17101 Value *LdN = nullptr;
17102 if (UseScalable)
17103 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
17104 else
17105 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
17106
17107 Value *Idx =
17108 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
17109 Left = Builder.CreateInsertVector(
17110 VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
17111 Right = Builder.CreateInsertVector(
17112 VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
17113 }
17114
17115 Result = PoisonValue::get(DI->getType());
17116 Result = Builder.CreateInsertValue(Result, Left, 0);
17117 Result = Builder.CreateInsertValue(Result, Right, 1);
17118 } else {
17119 if (UseScalable)
17120 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
17121 else
17122 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17123 }
17124
17125 DI->replaceAllUsesWith(Result);
17126 return true;
17127}
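// A minimal scalable example of the transform above (assuming the container
// type <vscale x 4 x i32> is legal for the target):
//   %wide = load <vscale x 8 x i32>, ptr %p
//   %dei = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
//              @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
// becomes a single predicated call to llvm.aarch64.sve.ld2.sret on %p, whose
// two results replace the two elements of %dei.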
17128
17129 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
17130 IntrinsicInst *II, StoreInst *SI) const {
17131 // Only interleave2 supported at present.
17132 if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
17133 return false;
17134
17135 // Only a factor of 2 supported at present.
17136 const unsigned Factor = 2;
17137
17138 VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
17139 const DataLayout &DL = II->getDataLayout();
17140 bool UseScalable;
17141 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17142 return false;
17143
17144 // TODO: Add support for using SVE instructions with fixed types later, using
17145 // the code from lowerInterleavedStore to obtain the correct container type.
17146 if (UseScalable && !VTy->isScalableTy())
17147 return false;
17148
17149 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
17150
17151 VectorType *StTy =
17152 VectorType::get(VTy->getElementType(),
17153 VTy->getElementCount().divideCoefficientBy(NumStores));
17154
17155 Type *PtrTy = SI->getPointerOperandType();
17156 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
17157 UseScalable, StTy, PtrTy);
17158
17159 IRBuilder<> Builder(SI);
17160
17161 Value *BaseAddr = SI->getPointerOperand();
17162 Value *Pred = nullptr;
17163
17164 if (UseScalable)
17165 Pred =
17166 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
17167
17168 Value *L = II->getOperand(0);
17169 Value *R = II->getOperand(1);
17170
17171 for (unsigned I = 0; I < NumStores; ++I) {
17172 Value *Address = BaseAddr;
17173 if (NumStores > 1) {
17174 Value *Offset = Builder.getInt64(I * Factor);
17175 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
17176
17177 Value *Idx =
17178 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
17179 L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
17180 R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
17181 }
17182
17183 if (UseScalable)
17184 Builder.CreateCall(StNFunc, {L, R, Pred, Address});
17185 else
17186 Builder.CreateCall(StNFunc, {L, R, Address});
17187 }
17188
17189 return true;
17190}
17191
17192 EVT AArch64TargetLowering::getOptimalMemOpType(
17193 const MemOp &Op, const AttributeList &FuncAttributes) const {
17194 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17195 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17196 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17197 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17198 // taken one instruction to materialize the v2i64 zero and one store (with
17199 // restrictive addressing mode). Just do i64 stores.
17200 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17201 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17202 if (Op.isAligned(AlignCheck))
17203 return true;
17204 unsigned Fast;
17205 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17206 MachineMemOperand::MONone, &Fast) &&
17207 Fast;
17208 };
17209
17210 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17211 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
17212 return MVT::v16i8;
17213 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17214 return MVT::f128;
17215 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17216 return MVT::i64;
17217 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17218 return MVT::i32;
17219 return MVT::Other;
17220}
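// For example, a 64-byte memset on a NEON-capable function with 16-byte
// alignment (or fast unaligned vector stores) is widened to v16i8, whereas a
// 16-byte memset falls under the small-memset rule above and is emitted as
// plain i64 stores.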
17221
17222 LLT AArch64TargetLowering::getOptimalMemOpLLT(
17223 const MemOp &Op, const AttributeList &FuncAttributes) const {
17224 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
17225 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
17226 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
17227 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
17228 // taken one instruction to materialize the v2i64 zero and one store (with
17229 // restrictive addressing mode). Just do i64 stores.
17230 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
17231 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
17232 if (Op.isAligned(AlignCheck))
17233 return true;
17234 unsigned Fast;
17235 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
17236 MachineMemOperand::MONone, &Fast) &&
17237 Fast;
17238 };
17239
17240 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
17241 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
17242 return LLT::fixed_vector(2, 64);
17243 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
17244 return LLT::scalar(128);
17245 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
17246 return LLT::scalar(64);
17247 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
17248 return LLT::scalar(32);
17249 return LLT();
17250}
17251
17252// 12-bit optionally shifted immediates are legal for adds.
17253 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
17254 if (Immed == std::numeric_limits<int64_t>::min()) {
17255 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
17256 << ": avoid UB for INT64_MIN\n");
17257 return false;
17258 }
17259 // Same encoding for add/sub, just flip the sign.
17260 Immed = std::abs(Immed);
17261 bool IsLegal = ((Immed >> 12) == 0 ||
17262 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
17263 LLVM_DEBUG(dbgs() << "Is " << Immed
17264 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
17265 return IsLegal;
17266}
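// A few worked examples of the check above: 4095 (0xfff) and 4096 (0x1000,
// i.e. 1 shifted by 12) are legal add immediates, 0x123000 is legal because
// its low 12 bits are zero and it fits in 24 bits, while 4097 (0x1001) is not
// and has to be materialised separately.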
17267
17268 bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
17269 // We will only emit addvl/inc* instructions for SVE2
17270 if (!Subtarget->hasSVE2())
17271 return false;
17272
17273 // addvl's immediates are in terms of the number of bytes in a register.
17274 // Since there are 16 in the base supported size (128bits), we need to
17275 // divide the immediate by that much to give us a useful immediate to
17276 // multiply by vscale. We can't have a remainder as a result of this.
17277 if (Imm % 16 == 0)
17278 return isInt<6>(Imm / 16);
17279
17280 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
17281 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
17282 // of addvl as a result, so only take h|w|d into account.
17283 // Dec[h|w|d] will cover subtractions.
17284 // Immediates are in the range [1,16], so we can't do a 2's complement check.
17285 // FIXME: Can we make use of other patterns to cover other immediates?
17286
17287 // inch|dech
17288 if (Imm % 8 == 0)
17289 return std::abs(Imm / 8) <= 16;
17290 // incw|decw
17291 if (Imm % 4 == 0)
17292 return std::abs(Imm / 4) <= 16;
17293 // incd|decd
17294 if (Imm % 2 == 0)
17295 return std::abs(Imm / 2) <= 16;
17296
17297 return false;
17298}
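// Worked examples for the SVE2 path above: Imm = 32 is legal via addvl #2
// (32 / 16 = 2 fits the signed 6-bit field), Imm = 24 is legal via inch/dech
// (|24 / 8| = 3 <= 16), while Imm = 5 matches none of the forms and is
// rejected.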
17299
17300// Return false to prevent folding
17301// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
17302// if the folding leads to worse code.
17303 bool AArch64TargetLowering::isMulAddWithConstProfitable(
17304 SDValue AddNode, SDValue ConstNode) const {
17305 // Let the DAGCombiner decide for vector types and large types.
17306 const EVT VT = AddNode.getValueType();
17307 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
17308 return true;
17309
17310 // It is worse if c1 is legal add immediate, while c1*c2 is not
17311 // and has to be composed by at least two instructions.
17312 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
17313 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
17314 const int64_t C1 = C1Node->getSExtValue();
17315 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
17316 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
17317 return true;
17318 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
17319 // Adapt to the width of a register.
17320 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
17321 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
17322 if (Insn.size() > 1)
17323 return false;
17324
17325 // Default to true and let the DAGCombiner decide.
17326 return true;
17327}
17328
17329// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
17330// immediates is the same as for an add or a sub.
17331 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
17332 return isLegalAddImmediate(Immed);
17333}
17334
17335/// isLegalAddressingMode - Return true if the addressing mode represented
17336/// by AM is legal for this target, for a load/store of the specified type.
17337 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
17338 const AddrMode &AMode, Type *Ty,
17339 unsigned AS, Instruction *I) const {
17340 // AArch64 has five basic addressing modes:
17341 // reg
17342 // reg + 9-bit signed offset
17343 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
17344 // reg1 + reg2
17345 // reg + SIZE_IN_BYTES * reg
17346
17347 // No global is ever allowed as a base.
17348 if (AMode.BaseGV)
17349 return false;
17350
17351 // No reg+reg+imm addressing.
17352 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
17353 return false;
17354
17355 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
17356 // `2*ScaledReg` into `BaseReg + ScaledReg`
17357 AddrMode AM = AMode;
17358 if (AM.Scale && !AM.HasBaseReg) {
17359 if (AM.Scale == 1) {
17360 AM.HasBaseReg = true;
17361 AM.Scale = 0;
17362 } else if (AM.Scale == 2) {
17363 AM.HasBaseReg = true;
17364 AM.Scale = 1;
17365 } else {
17366 return false;
17367 }
17368 }
17369
17370 // A base register is required in all addressing modes.
17371 if (!AM.HasBaseReg)
17372 return false;
17373
17374 if (Ty->isScalableTy()) {
17375 if (isa<ScalableVectorType>(Ty)) {
17376 // See if we have a foldable vscale-based offset, for vector types which
17377 // are either legal or smaller than the minimum; more work will be
17378 // required if we need to consider addressing for types which need
17379 // legalization by splitting.
17380 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
17381 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
17382 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
17383 isPowerOf2_64(VecNumBytes))
17384 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
17385
17386 uint64_t VecElemNumBytes =
17387 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
17388 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
17389 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
17390 }
17391
17392 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
17393 }
17394
17395 // No scalable offsets allowed for non-scalable types.
17396 if (AM.ScalableOffset)
17397 return false;
17398
17399 // check reg + imm case:
17400 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
17401 uint64_t NumBytes = 0;
17402 if (Ty->isSized()) {
17403 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
17404 NumBytes = NumBits / 8;
17405 if (!isPowerOf2_64(NumBits))
17406 NumBytes = 0;
17407 }
17408
17409 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
17410 AM.Scale);
17411}
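// For example, for an i64 access the accepted forms include [x0],
// [x0, #simm9] with simm9 in [-256, 255], [x0, #8 * uimm12] up to #32760,
// and [x0, x1] or [x0, x1, lsl #3]; a mode that combines a base register, a
// scaled register and an immediate at the same time is rejected above.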
17412
17413 // Check whether the 2 offsets belong to the same imm24 range and their high
17414 // 12 bits are equal; if so, the high part can be materialized with a single add.
17415 int64_t
17416 AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
17417 int64_t MaxOffset) const {
17418 int64_t HighPart = MinOffset & ~0xfffULL;
17419 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
17420 // Rebase the value to an integer multiple of imm12.
17421 return HighPart;
17422 }
17423
17424 return 0;
17425}
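// For example, MinOffset = 0x1234 and MaxOffset = 0x1f78 share the high part
// 0x1000, which is itself a legal add immediate, so both GEPs can be rebased
// on a common add of 0x1000 and addressed with plain imm12 offsets from it.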
17426
17427 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
17428 // Consider splitting large offset of struct or array.
17429 return true;
17430}
17431
17432 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
17433 const MachineFunction &MF, EVT VT) const {
17434 VT = VT.getScalarType();
17435
17436 if (!VT.isSimple())
17437 return false;
17438
17439 switch (VT.getSimpleVT().SimpleTy) {
17440 case MVT::f16:
17441 return Subtarget->hasFullFP16();
17442 case MVT::f32:
17443 case MVT::f64:
17444 return true;
17445 default:
17446 break;
17447 }
17448
17449 return false;
17450}
17451
17452 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
17453 Type *Ty) const {
17454 switch (Ty->getScalarType()->getTypeID()) {
17455 case Type::FloatTyID:
17456 case Type::DoubleTyID:
17457 return true;
17458 default:
17459 return false;
17460 }
17461}
17462
17463 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
17464 EVT VT, CodeGenOptLevel OptLevel) const {
17465 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
17466 !useSVEForFixedLengthVectorVT(VT);
17467}
17468
17469const MCPhysReg *
17470 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
17471 // LR is a callee-save register, but we must treat it as clobbered by any call
17472 // site. Hence we include LR in the scratch registers, which are in turn added
17473 // as implicit-defs for stackmaps and patchpoints.
17474 static const MCPhysReg ScratchRegs[] = {
17475 AArch64::X16, AArch64::X17, AArch64::LR, 0
17476 };
17477 return ScratchRegs;
17478}
17479
17480 ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
17481 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
17482 return RCRegs;
17483}
17484
17485bool
17486 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
17487 CombineLevel Level) const {
17488 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
17489 N->getOpcode() == ISD::SRL) &&
17490 "Expected shift op");
17491
17492 SDValue ShiftLHS = N->getOperand(0);
17493 EVT VT = N->getValueType(0);
17494
17495 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
17496 // combine it with shift 'N' to let it be lowered to UBFX except:
17497 // ((x >> C) & mask) << C.
17498 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
17499 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
17500 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
17501 if (isMask_64(TruncMask)) {
17502 SDValue AndLHS = ShiftLHS.getOperand(0);
17503 if (AndLHS.getOpcode() == ISD::SRL) {
17504 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
17505 if (N->getOpcode() == ISD::SHL)
17506 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
17507 return SRLC->getZExtValue() == SHLC->getZExtValue();
17508 return false;
17509 }
17510 }
17511 }
17512 }
17513 return true;
17514}
17515
17516 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
17517 const SDNode *N) const {
17518 assert(N->getOpcode() == ISD::XOR &&
17519 (N->getOperand(0).getOpcode() == ISD::SHL ||
17520 N->getOperand(0).getOpcode() == ISD::SRL) &&
17521 "Expected XOR(SHIFT) pattern");
17522
17523 // Only commute if the entire NOT mask is a hidden shifted mask.
17524 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
17525 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17526 if (XorC && ShiftC) {
17527 unsigned MaskIdx, MaskLen;
17528 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
17529 unsigned ShiftAmt = ShiftC->getZExtValue();
17530 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17531 if (N->getOperand(0).getOpcode() == ISD::SHL)
17532 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
17533 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
17534 }
17535 }
17536
17537 return false;
17538}
17539
17540 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
17541 const SDNode *N, CombineLevel Level) const {
17542 assert(((N->getOpcode() == ISD::SHL &&
17543 N->getOperand(0).getOpcode() == ISD::SRL) ||
17544 (N->getOpcode() == ISD::SRL &&
17545 N->getOperand(0).getOpcode() == ISD::SHL)) &&
17546 "Expected shift-shift mask");
17547 // Don't allow multiuse shift folding with the same shift amount.
17548 if (!N->getOperand(0)->hasOneUse())
17549 return false;
17550
17551 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
17552 EVT VT = N->getValueType(0);
17553 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
17554 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
17555 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
17556 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
17557 }
17558
17559 return true;
17560}
17561
17562 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
17563 unsigned BinOpcode, EVT VT) const {
17564 return VT.isScalableVector() && isTypeLegal(VT);
17565}
17566
17567 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
17568 Type *Ty) const {
17569 assert(Ty->isIntegerTy());
17570
17571 unsigned BitSize = Ty->getPrimitiveSizeInBits();
17572 if (BitSize == 0)
17573 return false;
17574
17575 int64_t Val = Imm.getSExtValue();
17576 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
17577 return true;
17578
17579 if ((int64_t)Val < 0)
17580 Val = ~Val;
17581 if (BitSize == 32)
17582 Val &= (1LL << 32) - 1;
17583
17584 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
17585 // MOVZ is free so return true for one or fewer MOVK.
17586 return Shift < 3;
17587}
17588
17589 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
17590 unsigned Index) const {
17591 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
17592 return false;
17593
17594 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
17595}
17596
17597/// Turn vector tests of the signbit in the form of:
17598/// xor (sra X, elt_size(X)-1), -1
17599/// into:
17600/// cmge X, X, #0
17601 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
17602 const AArch64Subtarget *Subtarget) {
17603 EVT VT = N->getValueType(0);
17604 if (!Subtarget->hasNEON() || !VT.isVector())
17605 return SDValue();
17606
17607 // There must be a shift right algebraic before the xor, and the xor must be a
17608 // 'not' operation.
17609 SDValue Shift = N->getOperand(0);
17610 SDValue Ones = N->getOperand(1);
17611 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
17612 !ISD::isBuildVectorAllOnes(Ones.getNode()))
17613 return SDValue();
17614
17615 // The shift should be smearing the sign bit across each vector element.
17616 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
17617 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
17618 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
17619 return SDValue();
17620
17621 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
17622}
17623
17624// Given a vecreduce_add node, detect the below pattern and convert it to the
17625 // node sequence with UABDL, [S|U]ABD and UADDLP.
17626//
17627// i32 vecreduce_add(
17628// v16i32 abs(
17629// v16i32 sub(
17630// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
17631// =================>
17632// i32 vecreduce_add(
17633// v4i32 UADDLP(
17634// v8i16 add(
17635// v8i16 zext(
17636// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
17637// v8i16 zext(
17638// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
17639 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
17640 SelectionDAG &DAG) {
17641 // Assumed i32 vecreduce_add
17642 if (N->getValueType(0) != MVT::i32)
17643 return SDValue();
17644
17645 SDValue VecReduceOp0 = N->getOperand(0);
17646 unsigned Opcode = VecReduceOp0.getOpcode();
17647 // Assumed v16i32 abs
17648 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
17649 return SDValue();
17650
17651 SDValue ABS = VecReduceOp0;
17652 // Assumed v16i32 sub
17653 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
17654 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
17655 return SDValue();
17656
17657 SDValue SUB = ABS->getOperand(0);
17658 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
17659 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
17660 // Assumed v16i32 type
17661 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
17662 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
17663 return SDValue();
17664
17665 // Assumed zext or sext
17666 bool IsZExt = false;
17667 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
17668 IsZExt = true;
17669 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
17670 IsZExt = false;
17671 } else
17672 return SDValue();
17673
17674 SDValue EXT0 = SUB->getOperand(0);
17675 SDValue EXT1 = SUB->getOperand(1);
17676 // Assumed zext's operand has v16i8 type
17677 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
17678 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
17679 return SDValue();
17680
17681 // Pattern is detected. Let's convert it to a sequence of nodes.
17682 SDLoc DL(N);
17683
17684 // First, create the node pattern of UABD/SABD.
17685 SDValue UABDHigh8Op0 =
17686 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17687 DAG.getConstant(8, DL, MVT::i64));
17688 SDValue UABDHigh8Op1 =
17689 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17690 DAG.getConstant(8, DL, MVT::i64));
17691 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17692 UABDHigh8Op0, UABDHigh8Op1);
17693 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
17694
17695 // Second, create the node pattern of UABAL.
17696 SDValue UABDLo8Op0 =
17697 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
17698 DAG.getConstant(0, DL, MVT::i64));
17699 SDValue UABDLo8Op1 =
17700 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
17701 DAG.getConstant(0, DL, MVT::i64));
17702 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
17703 UABDLo8Op0, UABDLo8Op1);
17704 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
17705 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
17706
17707 // Third, create the node of UADDLP.
17708 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
17709
17710 // Fourth, create the node of VECREDUCE_ADD.
17711 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
17712}
17713
17714// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
17715// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
17716// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
17717// If we have vectors larger than v16i8 we extract v16i8 vectors,
17718 // follow the same steps above to get DOT instructions, concatenate them,
17719// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
17720 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
17721 const AArch64Subtarget *ST) {
17722 if (!ST->hasDotProd())
17723 return performVecReduceAddCombineWithUADDLP(N, DAG);
17724
17725 SDValue Op0 = N->getOperand(0);
17726 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
17727 Op0.getValueType().getVectorElementType() != MVT::i32)
17728 return SDValue();
17729
17730 unsigned ExtOpcode = Op0.getOpcode();
17731 SDValue A = Op0;
17732 SDValue B;
17733 if (ExtOpcode == ISD::MUL) {
17734 A = Op0.getOperand(0);
17735 B = Op0.getOperand(1);
17736 if (A.getOpcode() != B.getOpcode() ||
17737 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
17738 return SDValue();
17739 ExtOpcode = A.getOpcode();
17740 }
17741 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
17742 return SDValue();
17743
17744 EVT Op0VT = A.getOperand(0).getValueType();
17745 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
17746 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
17747 if (!IsValidElementCount || !IsValidSize)
17748 return SDValue();
17749
17750 SDLoc DL(Op0);
17751 // For non-mla reductions B can be set to 1. For MLA we take the operand of
17752 // the extend B.
17753 if (!B)
17754 B = DAG.getConstant(1, DL, Op0VT);
17755 else
17756 B = B.getOperand(0);
17757
17758 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
17759 unsigned NumOfVecReduce;
17760 EVT TargetType;
17761 if (IsMultipleOf16) {
17762 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
17763 TargetType = MVT::v4i32;
17764 } else {
17765 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
17766 TargetType = MVT::v2i32;
17767 }
17768 auto DotOpcode =
17769 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
17770 // Handle the case where we need to generate only one Dot operation.
17771 if (NumOfVecReduce == 1) {
17772 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
17773 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
17774 A.getOperand(0), B);
17775 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17776 }
17777 // Generate Dot instructions that are multiple of 16.
17778 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
17779 SmallVector<SDValue, 4> SDotVec16;
17780 unsigned I = 0;
17781 for (; I < VecReduce16Num; I += 1) {
17782 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
17783 SDValue Op0 =
17784 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
17785 DAG.getConstant(I * 16, DL, MVT::i64));
17786 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
17787 DAG.getConstant(I * 16, DL, MVT::i64));
17788 SDValue Dot =
17789 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
17790 SDotVec16.push_back(Dot);
17791 }
17792 // Concatenate dot operations.
17793 EVT SDot16EVT =
17794 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
17795 SDValue ConcatSDot16 =
17796 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
17797 SDValue VecReduceAdd16 =
17798 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
17799 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
17800 if (VecReduce8Num == 0)
17801 return VecReduceAdd16;
17802
17803 // Generate the remainder Dot operation that is multiple of 8.
17804 SmallVector<SDValue, 4> SDotVec8;
17805 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
17806 SDValue Vec8Op0 =
17807 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
17808 DAG.getConstant(I * 16, DL, MVT::i64));
17809 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
17810 DAG.getConstant(I * 16, DL, MVT::i64));
17811 SDValue Dot =
17812 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
17813 SDValue VecReduceAdd8 =
17814 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
17815 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
17816 VecReduceAdd8);
17817}
17818
17819// Given an (integer) vecreduce, we know the order of the inputs does not
17820// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
17821// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
17822// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
17823 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
17824 auto DetectAddExtract = [&](SDValue A) {
17825 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
17826 // UADDLP(x) if found.
17827 assert(A.getOpcode() == ISD::ADD);
17828 EVT VT = A.getValueType();
17829 SDValue Op0 = A.getOperand(0);
17830 SDValue Op1 = A.getOperand(1);
17831 if (Op0.getOpcode() != Op1.getOpcode() ||
17832 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
17833 Op0.getOpcode() != ISD::SIGN_EXTEND))
17834 return SDValue();
17835 SDValue Ext0 = Op0.getOperand(0);
17836 SDValue Ext1 = Op1.getOperand(0);
17837 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17838 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
17839 Ext0.getOperand(0) != Ext1.getOperand(0))
17840 return SDValue();
17841 // Check that the type is twice the add types, and the extracts are from
17842 // upper/lower parts of the same source.
17843 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
17844 VT.getVectorNumElements() * 2)
17845 return SDValue();
17846 if ((Ext0.getConstantOperandVal(1) != 0 ||
17847 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
17848 (Ext1.getConstantOperandVal(1) != 0 ||
17849 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
17850 return SDValue();
17851 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
17852 : AArch64ISD::SADDLP;
17853 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
17854 };
17855
17856 if (SDValue R = DetectAddExtract(A))
17857 return R;
17858
17859 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
17860 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
17861 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17862 A.getOperand(1));
17863 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
17864 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
17865 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
17866 A.getOperand(0));
17867 return SDValue();
17868}
17869
17870// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
17871// UADDLV(concat), where the concat represents the 64-bit zext sources.
17872 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
17873 // Look for add(zext(64-bit source), zext(64-bit source)), returning
17874 // UADDLV(concat(zext, zext)) if found.
17875 assert(A.getOpcode() == ISD::ADD);
17876 EVT VT = A.getValueType();
17877 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
17878 return SDValue();
17879 SDValue Op0 = A.getOperand(0);
17880 SDValue Op1 = A.getOperand(1);
17881 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
17882 return SDValue();
17883 SDValue Ext0 = Op0.getOperand(0);
17884 SDValue Ext1 = Op1.getOperand(0);
17885 EVT ExtVT0 = Ext0.getValueType();
17886 EVT ExtVT1 = Ext1.getValueType();
17887 // Check zext VTs are the same and 64-bit length.
17888 if (ExtVT0 != ExtVT1 ||
17889 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
17890 return SDValue();
17891 // Get VT for concat of zext sources.
17892 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
17893 SDValue Concat =
17894 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
17895
17896 switch (VT.getSimpleVT().SimpleTy) {
17897 case MVT::v2i64:
17898 case MVT::v4i32:
17899 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
17900 case MVT::v8i16: {
17901 SDValue Uaddlv =
17902 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
17903 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
17904 }
17905 default:
17906 llvm_unreachable("Unhandled vector type");
17907 }
17908}
17909
17910 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
17911 SDValue A = N->getOperand(0);
17912 if (A.getOpcode() == ISD::ADD) {
17913 if (SDValue R = performUADDVAddCombine(A, DAG))
17914 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
17915 else if (SDValue R = performUADDVZextCombine(A, DAG))
17916 return R;
17917 }
17918 return SDValue();
17919}
17920
17921 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
17922 TargetLowering::DAGCombinerInfo &DCI,
17923 const AArch64Subtarget *Subtarget) {
17924 if (DCI.isBeforeLegalizeOps())
17925 return SDValue();
17926
17927 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
17928}
17929
17930SDValue
17931AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
17932 SelectionDAG &DAG,
17933 SmallVectorImpl<SDNode *> &Created) const {
17934 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17935 if (isIntDivCheap(N->getValueType(0), Attr))
17936 return SDValue(N, 0); // Lower SDIV as SDIV
17937
17938 EVT VT = N->getValueType(0);
17939
17940 // For scalable and fixed types, mark them as cheap so we can handle it much
17941 // later. This allows us to handle larger than legal types.
17942 if (VT.isScalableVector() ||
17943 (VT.isFixedLengthVector() && Subtarget->useSVEForFixedLengthVectors()))
17944 return SDValue(N, 0);
17945
17946 // fold (sdiv X, pow2)
17947 if ((VT != MVT::i32 && VT != MVT::i64) ||
17948 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17949 return SDValue();
17950
17951 // If the divisor is 2 or -2, the default expansion is better. It will add
17952 // (N->getOperand(0) >> (BitWidth - 1)) to it before shifting right.
17953 if (Divisor == 2 ||
17954 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
17955 return SDValue();
17956
17957 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
17958}
17959
17960SDValue
17961AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
17962 SelectionDAG &DAG,
17963 SmallVectorImpl<SDNode *> &Created) const {
17964 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
17965 if (isIntDivCheap(N->getValueType(0), Attr))
17966 return SDValue(N, 0); // Lower SREM as SREM
17967
17968 EVT VT = N->getValueType(0);
17969
17970 // For scalable and fixed types, mark them as cheap so we can handle it much
17971 // later. This allows us to handle larger than legal types.
17972 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
17973 return SDValue(N, 0);
17974
17975 // fold (srem X, pow2)
17976 if ((VT != MVT::i32 && VT != MVT::i64) ||
17977 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
17978 return SDValue();
17979
17980 unsigned Lg2 = Divisor.countr_zero();
17981 if (Lg2 == 0)
17982 return SDValue();
17983
17984 SDLoc DL(N);
17985 SDValue N0 = N->getOperand(0);
17986 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
17987 SDValue Zero = DAG.getConstant(0, DL, VT);
17988 SDValue CCVal, CSNeg;
17989 if (Lg2 == 1) {
17990 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
17991 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
17992 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
17993
17994 Created.push_back(Cmp.getNode());
17995 Created.push_back(And.getNode());
17996 } else {
17997 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
17998 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
17999
18000 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
18001 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
18002 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
18003 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
18004 Negs.getValue(1));
18005
18006 Created.push_back(Negs.getNode());
18007 Created.push_back(AndPos.getNode());
18008 Created.push_back(AndNeg.getNode());
18009 }
18010
18011 return CSNeg;
18012}
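// A worked example of the general path above for srem i64 %x, 8 (Lg2 = 3,
// Pow2MinusOne = 7): the positive candidate is %x & 7, the negative candidate
// is (0 - %x) & 7, and the CSNEG picks (and negates when required) the correct
// one based on the flags produced by the SUBS.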
18013
18014static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
18015 switch(getIntrinsicID(S.getNode())) {
18016 default:
18017 break;
18018 case Intrinsic::aarch64_sve_cntb:
18019 return 8;
18020 case Intrinsic::aarch64_sve_cnth:
18021 return 16;
18022 case Intrinsic::aarch64_sve_cntw:
18023 return 32;
18024 case Intrinsic::aarch64_sve_cntd:
18025 return 64;
18026 }
18027 return {};
18028}
18029
18030/// Calculates what the pre-extend type is, based on the extension
18031/// operation node provided by \p Extend.
18032///
18033/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
18034/// pre-extend type is pulled directly from the operand, while other extend
18035/// operations need a bit more inspection to get this information.
18036///
18037/// \param Extend The SDNode from the DAG that represents the extend operation
18038///
18039/// \returns The type representing the \p Extend source type, or \p MVT::Other
18040/// if no valid type can be determined
18041 static EVT calculatePreExtendType(SDValue Extend) {
18042 switch (Extend.getOpcode()) {
18043 case ISD::SIGN_EXTEND:
18044 case ISD::ZERO_EXTEND:
18045 return Extend.getOperand(0).getValueType();
18046 case ISD::AssertSext:
18047 case ISD::AssertZext:
18048 case ISD::SIGN_EXTEND_INREG: {
18049 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
18050 if (!TypeNode)
18051 return MVT::Other;
18052 return TypeNode->getVT();
18053 }
18054 case ISD::AND: {
18055 ConstantSDNode *Constant =
18056 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
18057 if (!Constant)
18058 return MVT::Other;
18059
18060 uint32_t Mask = Constant->getZExtValue();
18061
18062 if (Mask == UCHAR_MAX)
18063 return MVT::i8;
18064 else if (Mask == USHRT_MAX)
18065 return MVT::i16;
18066 else if (Mask == UINT_MAX)
18067 return MVT::i32;
18068
18069 return MVT::Other;
18070 }
18071 default:
18072 return MVT::Other;
18073 }
18074}
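// For example, an AssertSext/AssertZext whose VT operand is i16 yields
// MVT::i16, an AND with the constant mask 0xff yields MVT::i8, and anything
// else (including a non-constant mask) yields MVT::Other so the caller gives
// up on the combine.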
18075
18076/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
18077/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
18078/// SExt/ZExt rather than the scalar SExt/ZExt
18079 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
18080 EVT VT = BV.getValueType();
18081 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
18082 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
18083 return SDValue();
18084
18085 // Use the first item in the buildvector/shuffle to get the size of the
18086 // extend, and make sure it looks valid.
18087 SDValue Extend = BV->getOperand(0);
18088 unsigned ExtendOpcode = Extend.getOpcode();
18089 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
18090 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
18091 ExtendOpcode == ISD::AssertSext;
18092 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
18093 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
18094 return SDValue();
18095 // Shuffle inputs are vector, limit to SIGN_EXTEND and ZERO_EXTEND to ensure
18096 // calculatePreExtendType will work without issue.
18097 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
18098 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
18099 return SDValue();
18100
18101 // Restrict valid pre-extend data type
18102 EVT PreExtendType = calculatePreExtendType(Extend);
18103 if (PreExtendType == MVT::Other ||
18104 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
18105 return SDValue();
18106
18107 // Make sure all other operands are equally extended
18108 for (SDValue Op : drop_begin(BV->ops())) {
18109 if (Op.isUndef())
18110 continue;
18111 unsigned Opc = Op.getOpcode();
18112 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
18113 Opc == ISD::AssertSext;
18114 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
18115 return SDValue();
18116 }
18117
18118 SDValue NBV;
18119 SDLoc DL(BV);
18120 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
18121 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
18122 EVT PreExtendLegalType =
18123 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
18124 SmallVector<SDValue, 8> NewOps;
18125 for (SDValue Op : BV->ops())
18126 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
18127 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
18128 PreExtendLegalType));
18129 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
18130 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
18131 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
18132 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
18133 BV.getOperand(1).isUndef()
18134 ? DAG.getUNDEF(PreExtendVT)
18135 : BV.getOperand(1).getOperand(0),
18136 cast<ShuffleVectorSDNode>(BV)->getMask());
18137 }
18138 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
18139}
18140
18141/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
18142/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
18143 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
18144 // If the value type isn't a vector, none of the operands are going to be dups
18145 EVT VT = Mul->getValueType(0);
18146 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
18147 return SDValue();
18148
18149 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
18150 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
18151
18152 // Neither operands have been changed, don't make any further changes
18153 if (!Op0 && !Op1)
18154 return SDValue();
18155
18156 SDLoc DL(Mul);
18157 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
18158 Op1 ? Op1 : Mul->getOperand(1));
18159}
18160
18161// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
18162// Same for other types with equivalent constants.
18163 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
18164 EVT VT = N->getValueType(0);
18165 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
18166 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
18167 return SDValue();
18168 if (N->getOperand(0).getOpcode() != ISD::AND ||
18169 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
18170 return SDValue();
18171
18172 SDValue And = N->getOperand(0);
18173 SDValue Srl = And.getOperand(0);
18174
18175 APInt V1, V2, V3;
18176 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
18177 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
18178 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
18179 return SDValue();
18180
18181 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
18182 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
18183 V3 != (HalfSize - 1))
18184 return SDValue();
18185
18186 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18187 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
18188 VT.getVectorElementCount() * 2);
18189
18190 SDLoc DL(N);
18191 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
18192 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
18193 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
18194}
18195
18196// Transform vector add(zext i8 to i32, zext i8 to i32)
18197// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
18198// This allows extra uses of saddl/uaddl at the lower vector widths, and less
18199// extends.
18200 static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
18201 EVT VT = N->getValueType(0);
18202 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
18203 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
18204 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
18205 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
18206 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
18207 N->getOperand(0).getOperand(0).getValueType() !=
18208 N->getOperand(1).getOperand(0).getValueType())
18209 return SDValue();
18210
18211 if (N->getOpcode() == ISD::MUL &&
18212 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
18213 return SDValue();
18214
18215 SDValue N0 = N->getOperand(0).getOperand(0);
18216 SDValue N1 = N->getOperand(1).getOperand(0);
18217 EVT InVT = N0.getValueType();
18218
18219 EVT S1 = InVT.getScalarType();
18220 EVT S2 = VT.getScalarType();
18221 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
18222 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
18223 SDLoc DL(N);
18224 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
18225 EVT::getIntegerVT(*DAG.getContext(), VT.getScalarSizeInBits() / 2),
18226 VT.getVectorElementCount());
18227 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
18228 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
18229 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
18230 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
18231 : (unsigned)ISD::SIGN_EXTEND,
18232 DL, VT, NewOp);
18233 }
18234 return SDValue();
18235}
18236
18237 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
18238 TargetLowering::DAGCombinerInfo &DCI,
18239 const AArch64Subtarget *Subtarget) {
18240
18241 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
18242 return Ext;
18243 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
18244 return Ext;
18245 if (SDValue Ext = performVectorExtCombine(N, DAG))
18246 return Ext;
18247
18248 if (DCI.isBeforeLegalizeOps())
18249 return SDValue();
18250
18251 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
18252 // and in MachineCombiner pass, add+mul will be combined into madd.
18253 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
18254 SDLoc DL(N);
18255 EVT VT = N->getValueType(0);
18256 SDValue N0 = N->getOperand(0);
18257 SDValue N1 = N->getOperand(1);
18258 SDValue MulOper;
18259 unsigned AddSubOpc;
18260
18261 auto IsAddSubWith1 = [&](SDValue V) -> bool {
18262 AddSubOpc = V->getOpcode();
18263 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
18264 SDValue Opnd = V->getOperand(1);
18265 MulOper = V->getOperand(0);
18266 if (AddSubOpc == ISD::SUB)
18267 std::swap(Opnd, MulOper);
18268 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
18269 return C->isOne();
18270 }
18271 return false;
18272 };
18273
18274 if (IsAddSubWith1(N0)) {
18275 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
18276 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
18277 }
18278
18279 if (IsAddSubWith1(N1)) {
18280 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
18281 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
18282 }
18283
18284 // The below optimizations require a constant RHS.
18285 if (!isa<ConstantSDNode>(N1))
18286 return SDValue();
18287
18288 ConstantSDNode *C = cast<ConstantSDNode>(N1);
18289 const APInt &ConstValue = C->getAPIntValue();
18290
18291 // Allow the scaling to be folded into the `cnt` instruction by preventing
18292 // the scaling from being obscured here. This makes it easier to pattern match.
18293 if (IsSVECntIntrinsic(N0) ||
18294 (N0->getOpcode() == ISD::TRUNCATE &&
18295 (IsSVECntIntrinsic(N0->getOperand(0)))))
18296 if (ConstValue.sge(1) && ConstValue.sle(16))
18297 return SDValue();
18298
18299 // Multiplication of a power of two plus/minus one can be done more
18300 // cheaply as shift+add/sub. For now, this is true unilaterally. If
18301 // future CPUs have a cheaper MADD instruction, this may need to be
18302 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
18303 // 64-bit is 5 cycles, so this is always a win.
18304 // More aggressively, some multiplications N0 * C can be lowered to
18305 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
18306 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
18307 // TODO: lower more cases.
18308
18309 // TrailingZeroes is used to test if the mul can be lowered to
18310 // shift+add+shift.
18311 unsigned TrailingZeroes = ConstValue.countr_zero();
18312 if (TrailingZeroes) {
18313 // Conservatively do not lower to shift+add+shift if the mul might be
18314 // folded into smul or umul.
18315 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
18316 isZeroExtended(N0, DAG)))
18317 return SDValue();
18318 // Conservatively do not lower to shift+add+shift if the mul might be
18319 // folded into madd or msub.
18320 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
18321 N->use_begin()->getOpcode() == ISD::SUB))
18322 return SDValue();
18323 }
18324 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
18325 // and shift+add+shift.
18326 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
18327 unsigned ShiftAmt;
18328
18329 auto Shl = [&](SDValue N0, unsigned N1) {
18330 if (!N0.getNode())
18331 return SDValue();
18332 // If shift causes overflow, ignore this combine.
18333 if (N1 >= N0.getValueSizeInBits())
18334 return SDValue();
18335 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
18336 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
18337 };
18338 auto Add = [&](SDValue N0, SDValue N1) {
18339 if (!N0.getNode() || !N1.getNode())
18340 return SDValue();
18341 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
18342 };
18343 auto Sub = [&](SDValue N0, SDValue N1) {
18344 if (!N0.getNode() || !N1.getNode())
18345 return SDValue();
18346 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
18347 };
18348 auto Negate = [&](SDValue N) {
18349 if (!N0.getNode())
18350 return SDValue();
18351 SDValue Zero = DAG.getConstant(0, DL, VT);
18352 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
18353 };
18354
18355 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), e.g.:
18356 // C = 45 is equal to (1+4)*(1+8). We don't decompose it into (1+2)*(16-1), as
18357 // the (2^N - 1) can't be executed via a single instruction.
18358 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
18359 unsigned BitWidth = C.getBitWidth();
18360 for (unsigned i = 1; i < BitWidth / 2; i++) {
18361 APInt Rem;
18362 APInt X(BitWidth, (1 << i) + 1);
18363 APInt::sdivrem(C, X, N, Rem);
18364 APInt NVMinus1 = N - 1;
18365 if (Rem == 0 && NVMinus1.isPowerOf2()) {
18366 M = X;
18367 return true;
18368 }
18369 }
18370 return false;
18371 };
18372
18373 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
18374 // C = 11 is equal to (1+4)*2+1. We don't decompose it into (1+2)*4-1, as
18375 // the (2^N - 1) can't be executed via a single instruction.
18376 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
18377 APInt CVMinus1 = C - 1;
18378 if (CVMinus1.isNegative())
18379 return false;
18380 unsigned TrailingZeroes = CVMinus1.countr_zero();
18381 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
18382 if (SCVMinus1.isPowerOf2()) {
18383 unsigned BitWidth = SCVMinus1.getBitWidth();
18384 M = APInt(BitWidth, SCVMinus1.logBase2());
18385 N = APInt(BitWidth, TrailingZeroes);
18386 return true;
18387 }
18388 return false;
18389 };
18390
18391 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
18392 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
18393 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
18394 APInt CVMinus1 = C - 1;
18395 if (CVMinus1.isNegative())
18396 return false;
18397 unsigned TrailingZeroes = CVMinus1.countr_zero();
18398 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
18399 if (CVPlus1.isPowerOf2()) {
18400 unsigned BitWidth = CVPlus1.getBitWidth();
18401 M = APInt(BitWidth, CVPlus1.logBase2());
18402 N = APInt(BitWidth, TrailingZeroes);
18403 return true;
18404 }
18405 return false;
18406 };
18407
18408 if (ConstValue.isNonNegative()) {
18409 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
18410 // (mul x, 2^N - 1) => (sub (shl x, N), x)
18411 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
18412 // (mul x, (2^M + 1) * (2^N + 1))
18413 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
18414 // (mul x, (2^M + 1) * 2^N + 1))
18415 // => MV = add (shl x, M), x); add (shl MV, N), x)
18416 // (mul x, 1 - (1 - 2^M) * 2^N))
18417 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
18418 APInt SCVMinus1 = ShiftedConstValue - 1;
18419 APInt SCVPlus1 = ShiftedConstValue + 1;
18420 APInt CVPlus1 = ConstValue + 1;
18421 APInt CVM, CVN;
18422 if (SCVMinus1.isPowerOf2()) {
18423 ShiftAmt = SCVMinus1.logBase2();
18424 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
18425 } else if (CVPlus1.isPowerOf2()) {
18426 ShiftAmt = CVPlus1.logBase2();
18427 return Sub(Shl(N0, ShiftAmt), N0);
18428 } else if (SCVPlus1.isPowerOf2()) {
18429 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18430 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
18431 }
18432 if (Subtarget->hasALULSLFast() &&
18433 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
18434 APInt CVMMinus1 = CVM - 1;
18435 APInt CVNMinus1 = CVN - 1;
18436 unsigned ShiftM1 = CVMMinus1.logBase2();
18437 unsigned ShiftN1 = CVNMinus1.logBase2();
18438 // ALULSLFast implies that shifts by <= 4 places are fast
18439 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
18440 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
18441 return Add(Shl(MVal, ShiftN1), MVal);
18442 }
18443 }
18444 if (Subtarget->hasALULSLFast() &&
18445 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
18446 unsigned ShiftM = CVM.getZExtValue();
18447 unsigned ShiftN = CVN.getZExtValue();
18448 // ALULSLFast implies that shifts by <= 4 places are fast
18449 if (ShiftM <= 4 && ShiftN <= 4) {
18450 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
18451 return Add(Shl(MVal, CVN.getZExtValue()), N0);
18452 }
18453 }
18454
18455 if (Subtarget->hasALULSLFast() &&
18456 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
18457 unsigned ShiftM = CVM.getZExtValue();
18458 unsigned ShiftN = CVN.getZExtValue();
18459 // ALULSLFast implies that shifts by <= 4 places are fast
18460 if (ShiftM <= 4 && ShiftN <= 4) {
18461 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
18462 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
18463 }
18464 }
18465 } else {
18466 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
18467 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
18468 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
18469 APInt SCVPlus1 = -ShiftedConstValue + 1;
18470 APInt CVNegPlus1 = -ConstValue + 1;
18471 APInt CVNegMinus1 = -ConstValue - 1;
18472 if (CVNegPlus1.isPowerOf2()) {
18473 ShiftAmt = CVNegPlus1.logBase2();
18474 return Sub(N0, Shl(N0, ShiftAmt));
18475 } else if (CVNegMinus1.isPowerOf2()) {
18476 ShiftAmt = CVNegMinus1.logBase2();
18477 return Negate(Add(Shl(N0, ShiftAmt), N0));
18478 } else if (SCVPlus1.isPowerOf2()) {
18479 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
18480 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
18481 }
18482 }
18483
18484 return SDValue();
18485}
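
// Minimal illustrative sketches of the three constant decompositions checked
// above, using plain uint64_t arithmetic. The sketch* helpers are hypothetical
// names, not LLVM APIs.
//
// (2^M + 1) * (2^N + 1): C = 45 = (1+4)*(1+8).
static constexpr uint64_t sketchMulBy45(uint64_t X) {
  uint64_t MVal = (X << 2) + X; // X * 5
  return (MVal << 3) + MVal;    // X * 5 * 9 == X * 45
}
static_assert(sketchMulBy45(7) == 7 * 45, "(2^M + 1) * (2^N + 1) decomposition");
//
// (2^M + 1) * 2^N + 1: C = 11 = (1+4)*2 + 1.
static constexpr uint64_t sketchMulBy11(uint64_t X) {
  uint64_t MVal = (X << 2) + X; // X * 5
  return (MVal << 1) + X;       // X * 10 + X == X * 11
}
static_assert(sketchMulBy11(3) == 3 * 11, "(2^M + 1) * 2^N + 1 decomposition");
//
// 1 - (1 - 2^M) * 2^N: C = 29 = 1 - (1-8)*4.
static constexpr uint64_t sketchMulBy29(uint64_t X) {
  uint64_t MVal = X - (X << 3); // X * -7 (modulo 2^64)
  return X - (MVal << 2);       // X - X * -28 == X * 29
}
static_assert(sketchMulBy29(5) == 5 * 29, "1 - (1 - 2^M) * 2^N decomposition");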
18486
18488 SelectionDAG &DAG) {
18489 // Take advantage of vector comparisons producing 0 or -1 in each lane to
18490 // optimize away the operation when its input is a constant.
18491 //
18492 // The general transformation is:
18493 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
18494 // AND(VECTOR_CMP(x,y), constant2)
18495 // constant2 = UNARYOP(constant)
18496
18497 // Early exit if this isn't a vector operation, the operand of the
18498 // unary operation isn't a bitwise AND, or if the sizes of the operations
18499 // aren't the same.
18500 EVT VT = N->getValueType(0);
18501 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
18502 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
18503 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
18504 return SDValue();
18505
18506 // Now check that the other operand of the AND is a constant. We could
18507 // make the transformation for non-constant splats as well, but it's unclear
18508 // that would be a benefit as it would not eliminate any operations, just
18509 // perform one more step in scalar code before moving to the vector unit.
18510 if (BuildVectorSDNode *BV =
18511 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
18512 // Bail out if the vector isn't a constant.
18513 if (!BV->isConstant())
18514 return SDValue();
18515
18516 // Everything checks out. Build up the new and improved node.
18517 SDLoc DL(N);
18518 EVT IntVT = BV->getValueType(0);
18519 // Create a new constant of the appropriate type for the transformed
18520 // DAG.
18521 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
18522 // The AND node needs bitcasts to/from an integer vector type around it.
18523 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
18524 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
18525 N->getOperand(0)->getOperand(0), MaskConst);
18526 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
18527 return Res;
18528 }
18529
18530 return SDValue();
18531}
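
// A rough illustrative sketch of the per-lane reasoning behind the combine
// above, for a sitofp-style unary op: the compare lane is either all-ones or
// all-zeros, so the unary op applied to (mask & C) is either UNARYOP(C) or
// UNARYOP(0) == 0.0 (an all-zero bit pattern), and both can be produced by
// masking the precomputed UNARYOP(C). The sketch* name is hypothetical.
static constexpr double sketchUnaryOfMaskedLane(bool LaneAllOnes, int C) {
  int Masked = LaneAllOnes ? C : 0;   // AND with the compare result
  return static_cast<double>(Masked); // the unary op (int -> fp here)
}
static_assert(sketchUnaryOfMaskedLane(true, 7) == 7.0 &&
                  sketchUnaryOfMaskedLane(false, 7) == 0.0,
              "per-lane result is UNARYOP(C) or UNARYOP(0)");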
18532
18534 const AArch64Subtarget *Subtarget) {
18535 // First try to optimize away the conversion when it's conditionally from
18536 // a constant. Vectors only.
18538 return Res;
18539
18540 EVT VT = N->getValueType(0);
18541 if (VT != MVT::f32 && VT != MVT::f64)
18542 return SDValue();
18543
18544 // Only optimize when the source and destination types have the same width.
18545 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
18546 return SDValue();
18547
18548 // If the result of an integer load is only used by an integer-to-float
18549 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
18550 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
18551 SDValue N0 = N->getOperand(0);
18552 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
18553 N0.hasOneUse() &&
18554 // Do not change the width of a volatile load.
18555 !cast<LoadSDNode>(N0)->isVolatile()) {
18556 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
18557 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
18558 LN0->getPointerInfo(), LN0->getAlign(),
18559 LN0->getMemOperand()->getFlags());
18560
18561 // Make sure successors of the original load stay after it by updating them
18562 // to use the new Chain.
18563 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
18564
18565 unsigned Opcode =
18567 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
18568 }
18569
18570 return SDValue();
18571}
18572
18573/// Fold a floating-point multiply by power of two into floating-point to
18574/// fixed-point conversion.
18577 const AArch64Subtarget *Subtarget) {
18578 if (!Subtarget->isNeonAvailable())
18579 return SDValue();
18580
18581 if (!N->getValueType(0).isSimple())
18582 return SDValue();
18583
18584 SDValue Op = N->getOperand(0);
18585 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
18586 return SDValue();
18587
18588 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
18589 return SDValue();
18590
18591 SDValue ConstVec = Op->getOperand(1);
18592 if (!isa<BuildVectorSDNode>(ConstVec))
18593 return SDValue();
18594
18595 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
18596 uint32_t FloatBits = FloatTy.getSizeInBits();
18597 if (FloatBits != 32 && FloatBits != 64 &&
18598 (FloatBits != 16 || !Subtarget->hasFullFP16()))
18599 return SDValue();
18600
18601 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
18602 uint32_t IntBits = IntTy.getSizeInBits();
18603 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
18604 return SDValue();
18605
18606 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
18607 if (IntBits > FloatBits)
18608 return SDValue();
18609
18610 BitVector UndefElements;
18611 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
18612 int32_t Bits = IntBits == 64 ? 64 : 32;
18613 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
18614 if (C == -1 || C == 0 || C > Bits)
18615 return SDValue();
18616
18617 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
18618 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
18619 return SDValue();
18620
18621 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
18622 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
18623 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
18624 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
18625 return SDValue();
18626 }
18627
18628 SDLoc DL(N);
18629 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
18630 N->getOpcode() == ISD::FP_TO_SINT_SAT);
18631 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
18632 : Intrinsic::aarch64_neon_vcvtfp2fxu;
18633 SDValue FixConv =
18635 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
18636 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
18637 // We can handle smaller integers by generating an extra trunc.
18638 if (IntBits < FloatBits)
18639 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
18640
18641 return FixConv;
18642}
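
// A rough illustrative sketch of the identity behind the fold above:
// truncating X * 2^C to an integer is the same as converting X to a
// fixed-point value with C fractional bits. The sketch* name is hypothetical,
// not an LLVM API.
static constexpr long long sketchToFixedViaMul(double X, unsigned C) {
  return static_cast<long long>(X * static_cast<double>(1ULL << C)); // X * 2^C
}
static_assert(sketchToFixedViaMul(1.75, 4) == 28,
              "1.75 with 4 fractional bits is 28");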
18643
18645 const AArch64TargetLowering &TLI) {
18646 EVT VT = N->getValueType(0);
18647 SelectionDAG &DAG = DCI.DAG;
18648 SDLoc DL(N);
18649 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
18650
18651 if (!VT.isVector())
18652 return SDValue();
18653
18654 if (VT.isScalableVector() && !Subtarget.hasSVE2())
18655 return SDValue();
18656
18657 if (VT.isFixedLengthVector() &&
18658 (!Subtarget.isNeonAvailable() || TLI.useSVEForFixedLengthVectorVT(VT)))
18659 return SDValue();
18660
18661 SDValue N0 = N->getOperand(0);
18662 if (N0.getOpcode() != ISD::AND)
18663 return SDValue();
18664
18665 SDValue N1 = N->getOperand(1);
18666 if (N1.getOpcode() != ISD::AND)
18667 return SDValue();
18668
18669 // InstCombine does (not (neg a)) => (add a -1).
18670 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
18671 // Loop over all combinations of AND operands.
18672 for (int i = 1; i >= 0; --i) {
18673 for (int j = 1; j >= 0; --j) {
18674 SDValue O0 = N0->getOperand(i);
18675 SDValue O1 = N1->getOperand(j);
18676 SDValue Sub, Add, SubSibling, AddSibling;
18677
18678 // Find a SUB and an ADD operand, one from each AND.
18679 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
18680 Sub = O0;
18681 Add = O1;
18682 SubSibling = N0->getOperand(1 - i);
18683 AddSibling = N1->getOperand(1 - j);
18684 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
18685 Add = O0;
18686 Sub = O1;
18687 AddSibling = N0->getOperand(1 - i);
18688 SubSibling = N1->getOperand(1 - j);
18689 } else
18690 continue;
18691
18693 continue;
18694
18695 // The all-ones constant is always the right-hand operand of the Add.
18696 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
18697 continue;
18698
18699 if (Sub.getOperand(1) != Add.getOperand(0))
18700 continue;
18701
18702 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
18703 }
18704 }
18705
18706 // (or (and a b) (and (not a) c)) => (bsl a b c)
18707 // We only have to look for constant vectors here since the general, variable
18708 // case can be handled in TableGen.
18709 unsigned Bits = VT.getScalarSizeInBits();
18710 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
18711 for (int i = 1; i >= 0; --i)
18712 for (int j = 1; j >= 0; --j) {
18713 APInt Val1, Val2;
18714
18715 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
18717 (BitMask & ~Val1.getZExtValue()) == Val2.getZExtValue()) {
18718 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18719 N0->getOperand(1 - i), N1->getOperand(1 - j));
18720 }
18721 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
18722 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
18723 if (!BVN0 || !BVN1)
18724 continue;
18725
18726 bool FoundMatch = true;
18727 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
18728 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
18729 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
18730 if (!CN0 || !CN1 ||
18731 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
18732 FoundMatch = false;
18733 break;
18734 }
18735 }
18736 if (FoundMatch)
18737 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
18738 N0->getOperand(1 - i), N1->getOperand(1 - j));
18739 }
18740
18741 return SDValue();
18742}
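
// A small illustrative sketch of the bit-select identity the BSP/BSL combine
// above relies on: (A & B) | (~A & C) takes B's bits where A is set and C's
// bits elsewhere. The sketch* name is hypothetical, not an LLVM API.
static constexpr uint64_t sketchBitSelect(uint64_t A, uint64_t B, uint64_t C) {
  return (A & B) | (~A & C);
}
static_assert(sketchBitSelect(0xff00ff00ull, 0x12345678ull, 0x9abcdef0ull) ==
                  0x12bc56f0ull,
              "B bits under the mask, C bits elsewhere");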
18743
18744// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
18745// convert to csel(ccmp(.., cc0)), depending on cc1:
18746
18747// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18748// =>
18749// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
18750//
18751// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
18752// =>
18753// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
18755 EVT VT = N->getValueType(0);
18756 SDValue CSel0 = N->getOperand(0);
18757 SDValue CSel1 = N->getOperand(1);
18758
18759 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
18760 CSel1.getOpcode() != AArch64ISD::CSEL)
18761 return SDValue();
18762
18763 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
18764 return SDValue();
18765
18766 if (!isNullConstant(CSel0.getOperand(0)) ||
18767 !isOneConstant(CSel0.getOperand(1)) ||
18768 !isNullConstant(CSel1.getOperand(0)) ||
18769 !isOneConstant(CSel1.getOperand(1)))
18770 return SDValue();
18771
18772 SDValue Cmp0 = CSel0.getOperand(3);
18773 SDValue Cmp1 = CSel1.getOperand(3);
18776 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
18777 return SDValue();
18778 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
18779 Cmp0.getOpcode() == AArch64ISD::SUBS) {
18780 std::swap(Cmp0, Cmp1);
18781 std::swap(CC0, CC1);
18782 }
18783
18784 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
18785 return SDValue();
18786
18787 SDLoc DL(N);
18788 SDValue CCmp, Condition;
18789 unsigned NZCV;
18790
18791 if (N->getOpcode() == ISD::AND) {
18793 Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
18795 } else {
18797 Condition = DAG.getConstant(CC0, DL, MVT_CC);
18799 }
18800
18801 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
18802
18803 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
18804 if (Op1 && Op1->getAPIntValue().isNegative() &&
18805 Op1->getAPIntValue().sgt(-32)) {
18806 // CCMP accepts a constant in the range [0, 31].
18807 // If Op1 is a constant in the range [-31, -1], we can
18808 // select CCMN instead to avoid the extra mov.
18809 SDValue AbsOp1 =
18810 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
18811 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, MVT_CC, Cmp1.getOperand(0), AbsOp1,
18812 NZCVOp, Condition, Cmp0);
18813 } else {
18814 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
18815 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
18816 }
18817 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
18818 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
18819 CCmp);
18820}
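
// A rough illustrative sketch of the idea behind the AND -> CCMP rewrite
// above: the second compare is only evaluated "under" the first condition;
// otherwise the injected NZCV value makes the final condition evaluate to
// false. The sketch* name and the particular conditions are hypothetical.
static constexpr bool sketchAndViaConditionalCompare(int X0, int Y0, int X1,
                                                     int Y1) {
  bool FirstHolds = (X0 == Y0); // cc0 on the first compare
  // CCMP: if cc0 held, compare X1 with Y1; else substitute flags failing cc1.
  bool SecondHolds = FirstHolds ? (X1 < Y1) : false;
  return SecondHolds;           // == ((X0 == Y0) && (X1 < Y1))
}
static_assert(sketchAndViaConditionalCompare(1, 1, 2, 3) &&
                  !sketchAndViaConditionalCompare(1, 2, 2, 3),
              "matches the plain boolean AND of the two compares");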
18821
18823 const AArch64Subtarget *Subtarget,
18824 const AArch64TargetLowering &TLI) {
18825 SelectionDAG &DAG = DCI.DAG;
18826 EVT VT = N->getValueType(0);
18827
18828 if (SDValue R = performANDORCSELCombine(N, DAG))
18829 return R;
18830
18831 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
18832 return SDValue();
18833
18834 if (SDValue Res = tryCombineToBSL(N, DCI, TLI))
18835 return Res;
18836
18837 return SDValue();
18838}
18839
18841 if (!MemVT.getVectorElementType().isSimple())
18842 return false;
18843
18844 uint64_t MaskForTy = 0ull;
18845 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
18846 case MVT::i8:
18847 MaskForTy = 0xffull;
18848 break;
18849 case MVT::i16:
18850 MaskForTy = 0xffffull;
18851 break;
18852 case MVT::i32:
18853 MaskForTy = 0xffffffffull;
18854 break;
18855 default:
18856 return false;
18857 break;
18858 }
18859
18860 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
18861 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
18862 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
18863
18864 return false;
18865}
18866
18868 SDValue LeafOp = SDValue(N, 0);
18869 SDValue Op = N->getOperand(0);
18870 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
18871 LeafOp.getValueType() != Op.getValueType())
18872 Op = Op->getOperand(0);
18873 if (LeafOp.getValueType() == Op.getValueType())
18874 return Op;
18875 return SDValue();
18876}
18877
18880 SelectionDAG &DAG = DCI.DAG;
18881 SDValue Src = N->getOperand(0);
18882 unsigned Opc = Src->getOpcode();
18883
18884 // Zero/any extend of an unsigned unpack
18885 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
18886 SDValue UnpkOp = Src->getOperand(0);
18887 SDValue Dup = N->getOperand(1);
18888
18889 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
18890 return SDValue();
18891
18892 SDLoc DL(N);
18893 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
18894 if (!C)
18895 return SDValue();
18896
18897 uint64_t ExtVal = C->getZExtValue();
18898
18899 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
18900 return ((ExtVal == 0xFF && VT == MVT::i8) ||
18901 (ExtVal == 0xFFFF && VT == MVT::i16) ||
18902 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
18903 };
18904
18905 // If the mask is fully covered by the unpack, we don't need to push
18906 // a new AND onto the operand
18907 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
18908 if (MaskAndTypeMatch(EltTy))
18909 return Src;
18910
18911 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
18912 // to see if the mask is all-ones of size MemTy.
18913 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
18914 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
18915 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
18916 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
18917 if (MaskAndTypeMatch(EltTy))
18918 return Src;
18919 }
18920
18921 // Truncate to prevent a DUP with an over-wide constant
18922 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
18923
18924 // Otherwise, make sure we propagate the AND to the operand
18925 // of the unpack
18926 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
18927 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
18928
18929 SDValue And = DAG.getNode(ISD::AND, DL,
18930 UnpkOp->getValueType(0), UnpkOp, Dup);
18931
18932 return DAG.getNode(Opc, DL, N->getValueType(0), And);
18933 }
18934
18935 if (DCI.isBeforeLegalizeOps())
18936 return SDValue();
18937
18938 // If both sides of AND operations are i1 splat_vectors then
18939 // we can produce just i1 splat_vector as the result.
18940 if (isAllActivePredicate(DAG, N->getOperand(0)))
18941 return N->getOperand(1);
18942 if (isAllActivePredicate(DAG, N->getOperand(1)))
18943 return N->getOperand(0);
18944
18946 return SDValue();
18947
18948 SDValue Mask = N->getOperand(1);
18949
18950 if (!Src.hasOneUse())
18951 return SDValue();
18952
18953 EVT MemVT;
18954
18955 // SVE load instructions perform an implicit zero-extend, which makes them
18956 // perfect candidates for combining.
18957 switch (Opc) {
18961 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
18962 break;
18978 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
18979 break;
18980 default:
18981 return SDValue();
18982 }
18983
18984 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
18985 return Src;
18986
18987 return SDValue();
18988}
18989
18990// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
18993
18994 // This function performs an optimization on a specific pattern involving
18995 // an AND operation and SETCC (Set Condition Code) node.
18996
18997 SDValue SetCC = N->getOperand(0);
18998 EVT VT = N->getValueType(0);
18999 SelectionDAG &DAG = DCI.DAG;
19000
19001 // If the current node (N) is used by any SELECT instruction, return an
19002 // empty SDValue and skip the optimization, since applying it there could
19003 // produce incorrect results.
19004 for (auto U : N->uses())
19005 if (U->getOpcode() == ISD::SELECT)
19006 return SDValue();
19007
19008 // Check if the operand is a SETCC node with floating-point comparison
19009 if (SetCC.getOpcode() == ISD::SETCC &&
19010 SetCC.getOperand(0).getValueType() == MVT::f32) {
19011
19012 SDValue Cmp;
19014
19015 // Check if the DAG is after legalization and if we can emit the conjunction
19016 if (!DCI.isBeforeLegalize() &&
19017 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
19018
19020
19021 SDLoc DL(N);
19022 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
19023 DAG.getConstant(0, DL, VT),
19024 DAG.getConstant(InvertedCC, DL, MVT::i32), Cmp);
19025 }
19026 }
19027 return SDValue();
19028}
19029
19032 SelectionDAG &DAG = DCI.DAG;
19033 SDValue LHS = N->getOperand(0);
19034 SDValue RHS = N->getOperand(1);
19035 EVT VT = N->getValueType(0);
19036
19037 if (SDValue R = performANDORCSELCombine(N, DAG))
19038 return R;
19039
19040 if (SDValue R = performANDSETCCCombine(N,DCI))
19041 return R;
19042
19043 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
19044 return SDValue();
19045
19046 if (VT.isScalableVector())
19047 return performSVEAndCombine(N, DCI);
19048
19049 // The combining code below works only for NEON vectors. In particular, it
19050 // does not work for SVE when dealing with vectors wider than 128 bits.
19051 if (!VT.is64BitVector() && !VT.is128BitVector())
19052 return SDValue();
19053
19054 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
19055 if (!BVN)
19056 return SDValue();
19057
19058 // AND does not accept an immediate, so check if we can use a BIC immediate
19059 // instruction instead. We do this here instead of using a (and x, (mvni imm))
19060 // pattern in isel, because some immediates may be lowered to the preferred
19061 // (and x, (movi imm)) form, even though an mvni representation also exists.
19062 APInt DefBits(VT.getSizeInBits(), 0);
19063 APInt UndefBits(VT.getSizeInBits(), 0);
19064 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
19065 SDValue NewOp;
19066
19067 // Any bits known to already be 0 need not be cleared again, which can help
19068 // reduce the size of the immediate to one supported by the instruction.
19069 KnownBits Known = DAG.computeKnownBits(LHS);
19070 APInt ZeroSplat(VT.getSizeInBits(), 0);
19071 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
19072 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
19073 << (Known.Zero.getBitWidth() * I);
19074
19075 DefBits = ~(DefBits | ZeroSplat);
19076 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19077 DefBits, &LHS)) ||
19078 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19079 DefBits, &LHS)))
19080 return NewOp;
19081
19082 UndefBits = ~(UndefBits | ZeroSplat);
19083 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
19084 UndefBits, &LHS)) ||
19085 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
19086 UndefBits, &LHS)))
19087 return NewOp;
19088 }
19089
19090 return SDValue();
19091}
19092
19095 SelectionDAG &DAG = DCI.DAG;
19096 SDValue LHS = N->getOperand(0);
19097 SDValue RHS = N->getOperand(1);
19098 EVT VT = N->getValueType(0);
19099 SDLoc DL(N);
19100
19101 if (!N->getFlags().hasAllowReassociation())
19102 return SDValue();
19103
19104 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
19105 auto ReassocComplex = [&](SDValue A, SDValue B) {
19106 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
19107 return SDValue();
19108 unsigned Opc = A.getConstantOperandVal(0);
19109 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
19110 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
19111 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
19112 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
19113 return SDValue();
19114 SDValue VCMLA = DAG.getNode(
19115 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
19116 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
19117 A.getOperand(2), A.getOperand(3));
19118 VCMLA->setFlags(A->getFlags());
19119 return VCMLA;
19120 };
19121 if (SDValue R = ReassocComplex(LHS, RHS))
19122 return R;
19123 if (SDValue R = ReassocComplex(RHS, LHS))
19124 return R;
19125
19126 return SDValue();
19127}
19128
19129static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
19130 switch (Opcode) {
19131 case ISD::STRICT_FADD:
19132 case ISD::FADD:
19133 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
19134 case ISD::ADD:
19135 return VT == MVT::i64;
19136 default:
19137 return false;
19138 }
19139}
19140
19141static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
19143
19145 if ((N.getOpcode() == ISD::SETCC) ||
19146 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
19147 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
19148 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
19149 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
19150 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
19151 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
19152 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
19153 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
19154 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
19155 // get_active_lane_mask is lowered to a whilelo instruction.
19156 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
19157 return true;
19158
19159 return false;
19160}
19161
19162// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
19163// ... into: "ptrue p, all" + PTEST
19164static SDValue
19167 const AArch64Subtarget *Subtarget) {
19168 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19169 // Make sure PTEST can be legalised with illegal types.
19170 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19171 return SDValue();
19172
19173 SDValue N0 = N->getOperand(0);
19174 EVT VT = N0.getValueType();
19175
19176 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19177 !isNullConstant(N->getOperand(1)))
19178 return SDValue();
19179
19180 // Restrict the DAG combine to cases where we're extracting from a
19181 // flag-setting operation.
19182 if (!isPredicateCCSettingOp(N0))
19183 return SDValue();
19184
19185 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
19186 SelectionDAG &DAG = DCI.DAG;
19187 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
19188 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
19189}
19190
19191// Materialize : Idx = (add (mul vscale, NumEls), -1)
19192// i1 = extract_vector_elt t37, Constant:i64<Idx>
19193// ... into: "ptrue p, all" + PTEST
19194static SDValue
19197 const AArch64Subtarget *Subtarget) {
19198 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19199 // Make sure PTEST can be legalised with illegal types.
19200 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
19201 return SDValue();
19202
19203 SDValue N0 = N->getOperand(0);
19204 EVT OpVT = N0.getValueType();
19205
19206 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
19207 return SDValue();
19208
19209 // Idx == (add (mul vscale, NumEls), -1)
19210 SDValue Idx = N->getOperand(1);
19211 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
19212 return SDValue();
19213
19214 SDValue VS = Idx.getOperand(0);
19215 if (VS.getOpcode() != ISD::VSCALE)
19216 return SDValue();
19217
19218 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
19219 if (VS.getConstantOperandVal(0) != NumEls)
19220 return SDValue();
19221
19222 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
19223 SelectionDAG &DAG = DCI.DAG;
19224 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
19225 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
19226}
19227
19228static SDValue
19230 const AArch64Subtarget *Subtarget) {
19231 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
19232 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
19233 return Res;
19234 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
19235 return Res;
19236
19237 SelectionDAG &DAG = DCI.DAG;
19238 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19239
19240 EVT VT = N->getValueType(0);
19241 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
19242 bool IsStrict = N0->isStrictFPOpcode();
19243
19244 // extract(dup x) -> x
19245 if (N0.getOpcode() == AArch64ISD::DUP)
19246 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
19247 : N0.getOperand(0);
19248
19249 // Rewrite for pairwise fadd pattern
19250 // (f32 (extract_vector_elt
19251 // (fadd (vXf32 Other)
19252 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
19253 // ->
19254 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
19255 // (extract_vector_elt (vXf32 Other) 1))
19256 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
19257 // we can only do this when it's used only by the extract_vector_elt.
19258 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
19259 (!IsStrict || N0.hasOneUse())) {
19260 SDLoc DL(N0);
19261 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
19262 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
19263
19264 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
19265 SDValue Other = N00;
19266
19267 // And handle the commutative case.
19268 if (!Shuffle) {
19269 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
19270 Other = N01;
19271 }
19272
19273 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
19274 Other == Shuffle->getOperand(0)) {
19275 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19276 DAG.getConstant(0, DL, MVT::i64));
19277 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
19278 DAG.getConstant(1, DL, MVT::i64));
19279 if (!IsStrict)
19280 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
19281
19282 // For strict_fadd we need uses of the final extract_vector to be replaced
19283 // with the strict_fadd, but we also need uses of the chain output of the
19284 // original strict_fadd to use the chain output of the new strict_fadd as
19285 // otherwise it may not be deleted.
19286 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
19287 {VT, MVT::Other},
19288 {N0->getOperand(0), Extract1, Extract2});
19289 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
19290 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
19291 return SDValue(N, 0);
19292 }
19293 }
19294
19295 return SDValue();
19296}
19297
19300 SelectionDAG &DAG) {
19301 SDLoc dl(N);
19302 EVT VT = N->getValueType(0);
19303 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
19304 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
19305
19306 if (VT.isScalableVector())
19307 return SDValue();
19308
19309 // Optimize concat_vectors of truncated vectors, where the intermediate
19310 // type is illegal, to avoid said illegality, e.g.,
19311 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
19312 // (v2i16 (truncate (v2i64)))))
19313 // ->
19314 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
19315 // (v4i32 (bitcast (v2i64))),
19316 // <0, 2, 4, 6>)))
19317 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
19318 // on both input and result type, so we might generate worse code.
19319 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
19320 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19321 N1Opc == ISD::TRUNCATE) {
19322 SDValue N00 = N0->getOperand(0);
19323 SDValue N10 = N1->getOperand(0);
19324 EVT N00VT = N00.getValueType();
19325
19326 if (N00VT == N10.getValueType() &&
19327 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
19328 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
19329 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
19331 for (size_t i = 0; i < Mask.size(); ++i)
19332 Mask[i] = i * 2;
19333 return DAG.getNode(ISD::TRUNCATE, dl, VT,
19334 DAG.getVectorShuffle(
19335 MidVT, dl,
19336 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
19337 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
19338 }
19339 }
19340
19341 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
19342 N->getOperand(0).getValueType() == MVT::v2i16 ||
19343 N->getOperand(0).getValueType() == MVT::v2i8) {
19344 EVT SrcVT = N->getOperand(0).getValueType();
19345 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
19346 // loads to prevent having to go through the v4i8 load legalization that
19347 // needs to extend each element into a larger type.
19348 if (N->getNumOperands() % 2 == 0 &&
19349 all_of(N->op_values(), [SrcVT](SDValue V) {
19350 if (V.getValueType() != SrcVT)
19351 return false;
19352 if (V.isUndef())
19353 return true;
19354 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
19355 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
19356 LD->getExtensionType() == ISD::NON_EXTLOAD;
19357 })) {
19358 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
19359 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
19361
19362 for (unsigned i = 0; i < N->getNumOperands(); i++) {
19363 SDValue V = N->getOperand(i);
19364 if (V.isUndef())
19365 Ops.push_back(DAG.getUNDEF(FVT));
19366 else {
19367 LoadSDNode *LD = cast<LoadSDNode>(V);
19368 SDValue NewLoad = DAG.getLoad(FVT, dl, LD->getChain(),
19369 LD->getBasePtr(), LD->getMemOperand());
19370 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
19371 Ops.push_back(NewLoad);
19372 }
19373 }
19374 return DAG.getBitcast(N->getValueType(0),
19375 DAG.getBuildVector(NVT, dl, Ops));
19376 }
19377 }
19378
19379 // Canonicalise concat_vectors to replace concatenations of truncated nots
19380 // with nots of concatenated truncates. This in some cases allows for multiple
19381 // redundant negations to be eliminated.
19382 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
19383 // (v4i16 (truncate (not (v4i32)))))
19384 // ->
19385 // (not (concat_vectors (v4i16 (truncate (v4i32))),
19386 // (v4i16 (truncate (v4i32)))))
19387 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
19388 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
19389 N->isOnlyUserOf(N1.getNode())) {
19390 auto isBitwiseVectorNegate = [](SDValue V) {
19391 return V->getOpcode() == ISD::XOR &&
19392 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
19393 };
19394 SDValue N00 = N0->getOperand(0);
19395 SDValue N10 = N1->getOperand(0);
19396 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
19397 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
19398 return DAG.getNOT(
19399 dl,
19400 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
19401 DAG.getNode(ISD::TRUNCATE, dl, N0.getValueType(),
19402 N00->getOperand(0)),
19403 DAG.getNode(ISD::TRUNCATE, dl, N1.getValueType(),
19404 N10->getOperand(0))),
19405 VT);
19406 }
19407 }
19408
19409 // Wait till after everything is legalized to try this. That way we have
19410 // legal vector types and such.
19411 if (DCI.isBeforeLegalizeOps())
19412 return SDValue();
19413
19414 // Optimise concat_vectors of two identical binops with a 128-bit destination
19415 // size, combining into a binop of two concats of the source vectors, e.g.:
19416 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
19417 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
19418 DAG.getTargetLoweringInfo().isBinOp(N0Opc) && N0->hasOneUse() &&
19419 N1->hasOneUse()) {
19420 SDValue N00 = N0->getOperand(0);
19421 SDValue N01 = N0->getOperand(1);
19422 SDValue N10 = N1->getOperand(0);
19423 SDValue N11 = N1->getOperand(1);
19424
19425 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
19426 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N00, N10);
19427 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N01, N11);
19428 return DAG.getNode(N0Opc, dl, VT, Concat0, Concat1);
19429 }
19430 }
19431
19432 auto IsRSHRN = [](SDValue Shr) {
19433 if (Shr.getOpcode() != AArch64ISD::VLSHR)
19434 return false;
19435 SDValue Op = Shr.getOperand(0);
19436 EVT VT = Op.getValueType();
19437 unsigned ShtAmt = Shr.getConstantOperandVal(1);
19438 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
19439 return false;
19440
19441 APInt Imm;
19442 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
19443 Imm = APInt(VT.getScalarSizeInBits(),
19444 Op.getOperand(1).getConstantOperandVal(0)
19445 << Op.getOperand(1).getConstantOperandVal(1));
19446 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
19447 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
19448 Imm = APInt(VT.getScalarSizeInBits(),
19449 Op.getOperand(1).getConstantOperandVal(0));
19450 else
19451 return false;
19452
19453 if (Imm != 1ULL << (ShtAmt - 1))
19454 return false;
19455 return true;
19456 };
19457
19458 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
19459 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
19460 ((IsRSHRN(N1) &&
19462 N1.isUndef())) {
19463 SDValue X = N0.getOperand(0).getOperand(0);
19464 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
19465 : N1.getOperand(0).getOperand(0);
19466 EVT BVT =
19467 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
19468 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, dl, BVT, X, Y);
19469 SDValue Add = DAG.getNode(
19470 ISD::ADD, dl, BVT, CC,
19471 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), dl, BVT));
19472 SDValue Shr =
19473 DAG.getNode(AArch64ISD::VLSHR, dl, BVT, Add, N0.getOperand(1));
19474 return Shr;
19475 }
19476
19477 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
19478 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
19479 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
19480 N0.getOperand(1) == N1.getOperand(1)) {
19481 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(0),
19482 DAG.getUNDEF(N0.getValueType()));
19483 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, N0.getOperand(1),
19484 DAG.getUNDEF(N0.getValueType()));
19485 return DAG.getNode(AArch64ISD::ZIP1, dl, VT, E0, E1);
19486 }
19487
19488 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
19489 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
19490 // canonicalise to that.
19491 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
19492 assert(VT.getScalarSizeInBits() == 64);
19493 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
19494 DAG.getConstant(0, dl, MVT::i64));
19495 }
19496
19497 // Canonicalise concat_vectors so that the right-hand vector has as few
19498 // bit-casts as possible before its real operation. The primary matching
19499 // destination for these operations will be the narrowing "2" instructions,
19500 // which depend on the operation being performed on this right-hand vector.
19501 // For example,
19502 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
19503 // becomes
19504 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
19505
19506 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
19507 return SDValue();
19508 SDValue RHS = N1->getOperand(0);
19509 MVT RHSTy = RHS.getValueType().getSimpleVT();
19510 // If the RHS is not a vector, this is not the pattern we're looking for.
19511 if (!RHSTy.isVector())
19512 return SDValue();
19513
19514 LLVM_DEBUG(
19515 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
19516
19517 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
19518 RHSTy.getVectorNumElements() * 2);
19519 return DAG.getNode(ISD::BITCAST, dl, VT,
19520 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
19521 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
19522 RHS));
19523}
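
// A minimal illustrative sketch of the rounding-shift identity used by the
// IsRSHRN pattern above: adding 1 << (Shift - 1) before a logical right shift
// rounds the result to nearest (halves round up). The sketch* name is
// hypothetical, not an LLVM API.
static constexpr uint32_t sketchRoundingShr(uint32_t X, unsigned Shift) {
  return (X + (1u << (Shift - 1))) >> Shift;
}
static_assert(sketchRoundingShr(7, 2) == 2 && sketchRoundingShr(6, 2) == 2 &&
                  sketchRoundingShr(5, 2) == 1,
              "7/4 and 6/4 round to 2, 5/4 rounds to 1");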
19524
19525static SDValue
19527 SelectionDAG &DAG) {
19528 if (DCI.isBeforeLegalizeOps())
19529 return SDValue();
19530
19531 EVT VT = N->getValueType(0);
19532 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
19533 return SDValue();
19534
19535 SDValue V = N->getOperand(0);
19536
19537 // NOTE: This combine exists in DAGCombiner, but that version's legality check
19538 // blocks this combine because the non-const case requires custom lowering.
19539 //
19540 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
19541 if (V.getOpcode() == ISD::SPLAT_VECTOR)
19542 if (isa<ConstantSDNode>(V.getOperand(0)))
19543 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
19544
19545 return SDValue();
19546}
19547
19548static SDValue
19550 SelectionDAG &DAG) {
19551 SDLoc DL(N);
19552 SDValue Vec = N->getOperand(0);
19553 SDValue SubVec = N->getOperand(1);
19554 uint64_t IdxVal = N->getConstantOperandVal(2);
19555 EVT VecVT = Vec.getValueType();
19556 EVT SubVT = SubVec.getValueType();
19557
19558 // Only do this for legal fixed vector types.
19559 if (!VecVT.isFixedLengthVector() ||
19560 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
19561 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
19562 return SDValue();
19563
19564 // Ignore widening patterns.
19565 if (IdxVal == 0 && Vec.isUndef())
19566 return SDValue();
19567
19568 // Subvector must be half the width and an "aligned" insertion.
19569 unsigned NumSubElts = SubVT.getVectorNumElements();
19570 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
19571 (IdxVal != 0 && IdxVal != NumSubElts))
19572 return SDValue();
19573
19574 // Fold insert_subvector -> concat_vectors
19575 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
19576 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
19577 SDValue Lo, Hi;
19578 if (IdxVal == 0) {
19579 Lo = SubVec;
19580 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19581 DAG.getVectorIdxConstant(NumSubElts, DL));
19582 } else {
19583 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
19584 DAG.getVectorIdxConstant(0, DL));
19585 Hi = SubVec;
19586 }
19587 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
19588}
19589
19592 SelectionDAG &DAG) {
19593 // Wait until after everything is legalized to try this. That way we have
19594 // legal vector types and such.
19595 if (DCI.isBeforeLegalizeOps())
19596 return SDValue();
19597 // Transform a scalar conversion of a value from a lane extract into a
19598 // lane extract of a vector conversion. E.g., from foo1 to foo2:
19599 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
19600 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
19601 //
19602 // The second form interacts better with instruction selection and the
19603 // register allocator to avoid cross-class register copies that aren't
19604 // coalescable due to a lane reference.
19605
19606 // Check the operand and see if it originates from a lane extract.
19607 SDValue Op1 = N->getOperand(1);
19609 return SDValue();
19610
19611 // Yep, no additional predication needed. Perform the transform.
19612 SDValue IID = N->getOperand(0);
19613 SDValue Shift = N->getOperand(2);
19614 SDValue Vec = Op1.getOperand(0);
19615 SDValue Lane = Op1.getOperand(1);
19616 EVT ResTy = N->getValueType(0);
19617 EVT VecResTy;
19618 SDLoc DL(N);
19619
19620 // The vector width should be 128 bits by the time we get here, even
19621 // if it started as 64 bits (the extract_vector handling will have
19622 // done so). Bail if it is not.
19623 if (Vec.getValueSizeInBits() != 128)
19624 return SDValue();
19625
19626 if (Vec.getValueType() == MVT::v4i32)
19627 VecResTy = MVT::v4f32;
19628 else if (Vec.getValueType() == MVT::v2i64)
19629 VecResTy = MVT::v2f64;
19630 else
19631 return SDValue();
19632
19633 SDValue Convert =
19634 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
19635 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
19636}
19637
19638// AArch64 high-vector "long" operations are formed by performing the non-high
19639// version on an extract_subvector of each operand which gets the high half:
19640//
19641// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
19642//
19643// However, there are cases which don't have an extract_high explicitly, but
19644// have another operation that can be made compatible with one for free. For
19645// example:
19646//
19647// (dupv64 scalar) --> (extract_high (dup128 scalar))
19648//
19649// This routine does the actual conversion of such DUPs, once outer routines
19650// have determined that everything else is in order.
19651// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
19652// similarly here.
19654 MVT VT = N.getSimpleValueType();
19655 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
19656 N.getConstantOperandVal(1) == 0)
19657 N = N.getOperand(0);
19658
19659 switch (N.getOpcode()) {
19660 case AArch64ISD::DUP:
19665 case AArch64ISD::MOVI:
19671 break;
19672 default:
19673 // FMOV could be supported, but isn't very useful, as it would only occur
19674 // if you passed a bitcast'd floating point immediate to an eligible long
19675 // integer op (addl, smull, ...).
19676 return SDValue();
19677 }
19678
19679 if (!VT.is64BitVector())
19680 return SDValue();
19681
19682 SDLoc DL(N);
19683 unsigned NumElems = VT.getVectorNumElements();
19684 if (N.getValueType().is64BitVector()) {
19685 MVT ElementTy = VT.getVectorElementType();
19686 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
19687 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
19688 }
19689
19690 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
19691 DAG.getConstant(NumElems, DL, MVT::i64));
19692}
19693
19695 if (N.getOpcode() == ISD::BITCAST)
19696 N = N.getOperand(0);
19697 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
19698 return false;
19699 if (N.getOperand(0).getValueType().isScalableVector())
19700 return false;
19701 return N.getConstantOperandAPInt(1) ==
19702 N.getOperand(0).getValueType().getVectorNumElements() / 2;
19703}
19704
19705/// Helper structure to keep track of ISD::SET_CC operands.
19710};
19711
19712/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
19714 const SDValue *Cmp;
19716};
19717
19718/// Helper structure to keep track of SetCC information.
19722};
19723
19724/// Helper structure to be able to read SetCC information. If set to
19725/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
19726/// GenericSetCCInfo.
19730};
19731
19732/// Check whether or not \p Op is a SET_CC operation, either a generic or
19733/// an
19734/// AArch64 lowered one.
19735/// \p SetCCInfo is filled accordingly.
19736/// \post SetCCInfo is meaningful only when this function returns true.
19737/// \return True when Op is a kind of SET_CC operation.
19739 // If this is a setcc, this is straightforward.
19740 if (Op.getOpcode() == ISD::SETCC) {
19741 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
19742 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
19743 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
19744 SetCCInfo.IsAArch64 = false;
19745 return true;
19746 }
19747 // Otherwise, check if this is a matching csel instruction.
19748 // In other words:
19749 // - csel 1, 0, cc
19750 // - csel 0, 1, !cc
19751 if (Op.getOpcode() != AArch64ISD::CSEL)
19752 return false;
19753 // Set the information about the operands.
19754 // TODO: we want the operands of the Cmp not the csel
19755 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
19756 SetCCInfo.IsAArch64 = true;
19757 SetCCInfo.Info.AArch64.CC =
19758 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
19759
19760 // Check that the operands match the constraints:
19761 // (1) Both operands must be constants.
19762 // (2) One must be 1 and the other must be 0.
19763 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
19764 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19765
19766 // Check (1).
19767 if (!TValue || !FValue)
19768 return false;
19769
19770 // Check (2).
19771 if (!TValue->isOne()) {
19772 // Update the comparison when we are interested in !cc.
19773 std::swap(TValue, FValue);
19774 SetCCInfo.Info.AArch64.CC =
19776 }
19777 return TValue->isOne() && FValue->isZero();
19778}
19779
19780// Returns true if Op is setcc or zext of setcc.
19781static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
19782 if (isSetCC(Op, Info))
19783 return true;
19784 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
19785 isSetCC(Op->getOperand(0), Info));
19786}
19787
19788// The folding we want to perform is:
19789// (add x, [zext] (setcc cc ...) )
19790// -->
19791// (csel x, (add x, 1), !cc ...)
19792//
19793// The latter will get matched to a CSINC instruction.
19795 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
19796 SDValue LHS = Op->getOperand(0);
19797 SDValue RHS = Op->getOperand(1);
19798 SetCCInfoAndKind InfoAndKind;
19799
19800 // If both operands are a SET_CC, then we don't want to perform this
19801 // folding and create another csel as this results in more instructions
19802 // (and higher register usage).
19803 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
19804 isSetCCOrZExtSetCC(RHS, InfoAndKind))
19805 return SDValue();
19806
19807 // If neither operand is a SET_CC, give up.
19808 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
19809 std::swap(LHS, RHS);
19810 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
19811 return SDValue();
19812 }
19813
19814 // FIXME: This could be generalized to work for FP comparisons.
19815 EVT CmpVT = InfoAndKind.IsAArch64
19816 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
19817 : InfoAndKind.Info.Generic.Opnd0->getValueType();
19818 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
19819 return SDValue();
19820
19821 SDValue CCVal;
19822 SDValue Cmp;
19823 SDLoc dl(Op);
19824 if (InfoAndKind.IsAArch64) {
19825 CCVal = DAG.getConstant(
19827 MVT::i32);
19828 Cmp = *InfoAndKind.Info.AArch64.Cmp;
19829 } else
19830 Cmp = getAArch64Cmp(
19831 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
19832 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
19833 dl);
19834
19835 EVT VT = Op->getValueType(0);
19836 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
19837 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
19838}
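
// A minimal illustrative sketch of the scalar identity the CSINC fold above
// exploits: adding a setcc result (0 or 1) is just a conditional increment.
// The sketch* names are hypothetical, not LLVM APIs.
static constexpr int sketchAddBool(int X, bool Cond) { return X + Cond; }
static constexpr int sketchCselForm(int X, bool Cond) {
  return Cond ? X + 1 : X;
}
static_assert(sketchAddBool(41, true) == sketchCselForm(41, true) &&
                  sketchAddBool(41, false) == sketchCselForm(41, false),
              "add-of-setcc matches the csel/csinc form");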
19839
19840// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
19842 EVT VT = N->getValueType(0);
19843 // Only scalar integer and vector types.
19844 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
19845 return SDValue();
19846
19847 SDValue LHS = N->getOperand(0);
19848 SDValue RHS = N->getOperand(1);
19849 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
19850 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
19851 return SDValue();
19852
19853 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
19854 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
19855 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
19856 return SDValue();
19857
19858 SDValue Op1 = LHS->getOperand(0);
19859 SDValue Op2 = RHS->getOperand(0);
19860 EVT OpVT1 = Op1.getValueType();
19861 EVT OpVT2 = Op2.getValueType();
19862 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
19863 Op2.getOpcode() != AArch64ISD::UADDV ||
19864 OpVT1.getVectorElementType() != VT)
19865 return SDValue();
19866
19867 SDValue Val1 = Op1.getOperand(0);
19868 SDValue Val2 = Op2.getOperand(0);
19869 EVT ValVT = Val1->getValueType(0);
19870 SDLoc DL(N);
19871 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
19872 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
19873 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
19874 DAG.getConstant(0, DL, MVT::i64));
19875}
19876
19877/// Perform the scalar expression combine in the form of:
19878/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
19879/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
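// For illustration (annotation, not part of the original source): with c a
// legal add-immediate, (add b, (csel c, 1, cc, cmp)) becomes
// (csinc (add b, c), b, cc, cmp), i.e. either b+c or b+1 depending on cc.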
19880static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
19881 EVT VT = N->getValueType(0);
19882 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
19883 return SDValue();
19884
19885 SDValue LHS = N->getOperand(0);
19886 SDValue RHS = N->getOperand(1);
19887
19888 // Handle commutativity.
19889 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19890 LHS.getOpcode() != AArch64ISD::CSNEG) {
19891 std::swap(LHS, RHS);
19892 if (LHS.getOpcode() != AArch64ISD::CSEL &&
19893 LHS.getOpcode() != AArch64ISD::CSNEG) {
19894 return SDValue();
19895 }
19896 }
19897
19898 if (!LHS.hasOneUse())
19899 return SDValue();
19900
19901 AArch64CC::CondCode AArch64CC =
19902 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
19903
19904 // The CSEL should include a constant one operand, and the CSNEG should
19905 // include a one or negative-one operand.
19906 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
19907 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
19908 if (!CTVal || !CFVal)
19909 return SDValue();
19910
19911 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
19912 (CTVal->isOne() || CFVal->isOne())) &&
19913 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
19914 (CTVal->isOne() || CFVal->isAllOnes())))
19915 return SDValue();
19916
19917 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
19918 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
19919 !CFVal->isOne()) {
19920 std::swap(CTVal, CFVal);
19921 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19922 }
19923
19924 SDLoc DL(N);
19925 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
19926 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
19927 !CFVal->isAllOnes()) {
19928 APInt C = -1 * CFVal->getAPIntValue();
19929 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
19930 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
19931 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
19932 }
19933
19934 // It might be neutral for larger constants, as the immediate needs to be
19935 // materialized in a register.
19936 APInt ADDC = CTVal->getAPIntValue();
19937 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19938 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
19939 return SDValue();
19940
19941 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
19942 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
19943 "Unexpected constant value");
19944
19945 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
19946 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
19947 SDValue Cmp = LHS.getOperand(3);
19948
19949 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
19950}
19951
19952// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
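// For illustration (annotation, not part of the original source): with A a
// v4i32 accumulator and x, y v16i8 inputs,
//   (add (udot (zero vector), x, y), A)
// is rewritten to (udot A, x, y), folding the accumulation into the dot
// product.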
19953static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
19954 EVT VT = N->getValueType(0);
19955 if (N->getOpcode() != ISD::ADD)
19956 return SDValue();
19957
19958 SDValue Dot = N->getOperand(0);
19959 SDValue A = N->getOperand(1);
19960 // Handle commutativity.
19961 auto isZeroDot = [](SDValue Dot) {
19962 return (Dot.getOpcode() == AArch64ISD::UDOT ||
19963 Dot.getOpcode() == AArch64ISD::SDOT) &&
19964 isZerosVector(Dot.getOperand(0).getNode());
19965 };
19966 if (!isZeroDot(Dot))
19967 std::swap(Dot, A);
19968 if (!isZeroDot(Dot))
19969 return SDValue();
19970
19971 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
19972 Dot.getOperand(2));
19973}
19974
19975static bool isNegatedInteger(SDValue Op) {
19976 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
19977}
19978
19979static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
19980 SDLoc DL(Op);
19981 EVT VT = Op.getValueType();
19982 SDValue Zero = DAG.getConstant(0, DL, VT);
19983 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
19984}
19985
19986// Try to fold
19987//
19988// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
19989//
19990// The folding helps csel to be matched with csneg without generating
19991// redundant neg instruction, which includes negation of the csel expansion
19992// of abs node lowered by lowerABS.
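// For illustration (annotation, not part of the original source):
//   (sub 0, (csel X, (sub 0, Y), cc, cmp))
// becomes (csel (sub 0, X), (sub 0, (sub 0, Y)), cc, cmp); the double
// negation then folds away, leaving a form that matches CSNEG.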
19993static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
19994 if (!isNegatedInteger(SDValue(N, 0)))
19995 return SDValue();
19996
19997 SDValue CSel = N->getOperand(1);
19998 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
19999 return SDValue();
20000
20001 SDValue N0 = CSel.getOperand(0);
20002 SDValue N1 = CSel.getOperand(1);
20003
20004 // If neither of them is a negation, the folding is not worthwhile, as it
20005 // would introduce two additional negations while removing only one.
20006 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
20007 return SDValue();
20008
20009 SDValue N0N = getNegatedInteger(N0, DAG);
20010 SDValue N1N = getNegatedInteger(N1, DAG);
20011
20012 SDLoc DL(N);
20013 EVT VT = CSel.getValueType();
20014 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
20015 CSel.getOperand(3));
20016}
20017
20018// The basic add/sub long vector instructions have variants with "2" on the end
20019// which act on the high-half of their inputs. They are normally matched by
20020// patterns like:
20021//
20022// (add (zeroext (extract_high LHS)),
20023// (zeroext (extract_high RHS)))
20024// -> uaddl2 vD, vN, vM
20025//
20026// However, if one of the extracts is something like a duplicate, this
20027// instruction can still be used profitably. This function puts the DAG into a
20028// more appropriate form for those patterns to trigger.
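// For illustration (annotation, not part of the original source):
//   (add (zext (extract_high LHS)), (zext (dup scalar)))
// has its DUP operand rewritten by tryExtendDUPToExtractHigh into an
// extract_high of a 128-bit DUP, after which the uaddl2 pattern can match.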
20029static SDValue performAddSubLongCombine(SDNode *N,
20030 TargetLowering::DAGCombinerInfo &DCI) {
20031 SelectionDAG &DAG = DCI.DAG;
20032 if (DCI.isBeforeLegalizeOps())
20033 return SDValue();
20034
20035 MVT VT = N->getSimpleValueType(0);
20036 if (!VT.is128BitVector()) {
20037 if (N->getOpcode() == ISD::ADD)
20038 return performSetccAddFolding(N, DAG);
20039 return SDValue();
20040 }
20041
20042 // Make sure both branches are extended in the same way.
20043 SDValue LHS = N->getOperand(0);
20044 SDValue RHS = N->getOperand(1);
20045 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
20046 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
20047 LHS.getOpcode() != RHS.getOpcode())
20048 return SDValue();
20049
20050 unsigned ExtType = LHS.getOpcode();
20051
20052 // It's not worth doing if at least one of the inputs isn't already an
20053 // extract, but we don't know which it'll be so we have to try both.
20054 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
20055 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
20056 if (!RHS.getNode())
20057 return SDValue();
20058
20059 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
20060 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
20061 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
20062 if (!LHS.getNode())
20063 return SDValue();
20064
20065 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
20066 }
20067
20068 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
20069}
20070
20071static bool isCMP(SDValue Op) {
20072 return Op.getOpcode() == AArch64ISD::SUBS &&
20073 !Op.getNode()->hasAnyUseOfValue(0);
20074}
20075
20076// (CSEL 1 0 CC Cond) => CC
20077// (CSEL 0 1 CC Cond) => !CC
20078static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
20079 if (Op.getOpcode() != AArch64ISD::CSEL)
20080 return std::nullopt;
20081 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
20082 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
20083 return std::nullopt;
20084 SDValue OpLHS = Op.getOperand(0);
20085 SDValue OpRHS = Op.getOperand(1);
20086 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
20087 return CC;
20088 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
20089 return getInvertedCondCode(CC);
20090
20091 return std::nullopt;
20092}
20093
20094// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
20095// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
20096static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
20097 SDValue CmpOp = Op->getOperand(2);
20098 if (!isCMP(CmpOp))
20099 return SDValue();
20100
20101 if (IsAdd) {
20102 if (!isOneConstant(CmpOp.getOperand(1)))
20103 return SDValue();
20104 } else {
20105 if (!isNullConstant(CmpOp.getOperand(0)))
20106 return SDValue();
20107 }
20108
20109 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
20110 auto CC = getCSETCondCode(CsetOp);
20111 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
20112 return SDValue();
20113
20114 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
20115 Op->getOperand(0), Op->getOperand(1),
20116 CsetOp.getOperand(3));
20117}
20118
20119// (ADC x 0 cond) => (CINC x HS cond)
20120static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
20121 SDValue LHS = N->getOperand(0);
20122 SDValue RHS = N->getOperand(1);
20123 SDValue Cond = N->getOperand(2);
20124
20125 if (!isNullConstant(RHS))
20126 return SDValue();
20127
20128 EVT VT = N->getValueType(0);
20129 SDLoc DL(N);
20130
20131 // (CINC x cc cond) <=> (CSINC x x !cc cond)
20132 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
20133 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
20134}
20135
20136static SDValue performBuildVectorCombine(SDNode *N,
20137 TargetLowering::DAGCombinerInfo &DCI,
20138 SelectionDAG &DAG) {
20139 SDLoc DL(N);
20140 EVT VT = N->getValueType(0);
20141
20143 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
20144 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
20145 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
20146 if (Elt0->getOpcode() == ISD::FP_ROUND &&
20147 Elt1->getOpcode() == ISD::FP_ROUND &&
20148 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20149 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20150 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
20151 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20152 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20153 // Constant index.
20154 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20155 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20156 Elt0->getOperand(0)->getOperand(0) ==
20157 Elt1->getOperand(0)->getOperand(0) &&
20158 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
20159 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
20160 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
20161 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
20162 SDValue HighLanes;
20163 if (Elt2->getOpcode() == ISD::UNDEF &&
20164 Elt3->getOpcode() == ISD::UNDEF) {
20165 HighLanes = DAG.getUNDEF(MVT::v2f32);
20166 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
20167 Elt3->getOpcode() == ISD::FP_ROUND &&
20168 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
20169 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
20170 Elt2->getConstantOperandVal(1) ==
20171 Elt3->getConstantOperandVal(1) &&
20172 Elt2->getOperand(0)->getOpcode() ==
20173 ISD::EXTRACT_VECTOR_ELT &&
20174 Elt3->getOperand(0)->getOpcode() ==
20175 ISD::EXTRACT_VECTOR_ELT &&
20176 // Constant index.
20177 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
20178 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
20179 Elt2->getOperand(0)->getOperand(0) ==
20180 Elt3->getOperand(0)->getOperand(0) &&
20181 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
20182 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
20183 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
20184 HighLanes =
20185 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
20186 }
20187 if (HighLanes) {
20188 SDValue DoubleToSingleSticky =
20189 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
20190 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
20191 DoubleToSingleSticky, HighLanes);
20192 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
20193 Elt0->getOperand(1));
20194 }
20195 }
20196 }
20197 }
20198
20199 if (VT == MVT::v2f64) {
20200 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20201 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
20202 Elt1->getOpcode() == ISD::FP_EXTEND &&
20203 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20204 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20205 Elt0->getOperand(0)->getOperand(0) ==
20206 Elt1->getOperand(0)->getOperand(0) &&
20207 // Constant index.
20208 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
20209 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
20210 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
20211 Elt1->getOperand(0)->getConstantOperandVal(1) &&
20212 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20213 // ResultType's known minimum vector length.
20214 Elt0->getOperand(0)->getConstantOperandVal(1) %
20215 VT.getVectorMinNumElements() ==
20216 0) {
20217 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
20218 if (SrcVec.getValueType() == MVT::v4f16 ||
20219 SrcVec.getValueType() == MVT::v4bf16) {
20220 SDValue HalfToSingle =
20221 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
20222 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
20223 SDValue Extract = DAG.getNode(
20224 ISD::EXTRACT_SUBVECTOR, DL, VT.changeVectorElementType(MVT::f32),
20225 HalfToSingle, SubvectorIdx);
20226 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
20227 }
20228 }
20229 }
20230
20231 // A build vector of two extracted elements is equivalent to an
20232 // extract subvector where the inner vector is any-extended to the
20233 // extract_vector_elt VT.
20234 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
20235 // (extract_elt_iXX_to_i32 vec Idx+1))
20236 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
20237
20238 // For now, only consider the v2i32 case, which arises as a result of
20239 // legalization.
20240 if (VT != MVT::v2i32)
20241 return SDValue();
20242
20243 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
20244 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
20245 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20246 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20247 // Constant index.
20248 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
20249 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
20250 // Both EXTRACT_VECTOR_ELT from same vector...
20251 Elt0->getOperand(0) == Elt1->getOperand(0) &&
20252 // ... and contiguous. First element's index +1 == second element's index.
20253 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
20254 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
20255 // ResultType's known minimum vector length.
20256 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
20257 SDValue VecToExtend = Elt0->getOperand(0);
20258 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
20259 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
20260 return SDValue();
20261
20262 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
20263
20264 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
20265 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
20266 SubvectorIdx);
20267 }
20268
20269 return SDValue();
20270}
20271
20273 SelectionDAG &DAG) {
20274 EVT VT = N->getValueType(0);
20275 SDValue N0 = N->getOperand(0);
20276 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
20277 N0.getOpcode() == AArch64ISD::DUP) {
20278 SDValue Op = N0.getOperand(0);
20279 if (VT.getScalarType() == MVT::i32 &&
20280 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
20281 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), MVT::i32, Op);
20282 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Op);
20283 }
20284
20285 return SDValue();
20286}
20287
20288// Check whether a node is an extend or shift operand.
20289static bool isExtendOrShiftOperand(SDValue N) {
20290 unsigned Opcode = N.getOpcode();
20291 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
20292 EVT SrcVT;
20293 if (Opcode == ISD::SIGN_EXTEND_INREG)
20294 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
20295 else
20296 SrcVT = N.getOperand(0).getValueType();
20297
20298 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
20299 } else if (Opcode == ISD::AND) {
20300 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
20301 if (!CSD)
20302 return false;
20303 uint64_t AndMask = CSD->getZExtValue();
20304 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
20305 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
20306 return isa<ConstantSDNode>(N.getOperand(1));
20307 }
20308
20309 return false;
20310}
20311
20312// (N - Y) + Z --> (Z - Y) + N
20313// when N is an extend or shift operand
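// For illustration (annotation, not part of the original source): with N a
// one-use shift such as (shl a, 2),
//   (add (sub (shl a, 2), y), z)
// is rewritten to (add (sub z, y), (shl a, 2)), so the shift can be folded
// into the ADD as an add-with-shifted-register.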
20314static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
20315 SelectionDAG &DAG) {
20316 auto IsOneUseExtend = [](SDValue N) {
20317 return N.hasOneUse() && isExtendOrShiftOperand(N);
20318 };
20319
20320 // DAGCombiner will revert the combination when Z is constant, causing a
20321 // dead loop, so don't enable the combination when Z is constant.
20322 // If Z is a one-use extend or shift, we also can't do the optimization,
20323 // as it would fall into the same infinite loop.
20324 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
20325 return SDValue();
20326
20327 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
20328 return SDValue();
20329
20330 SDValue Shift = SUB.getOperand(0);
20331 if (!IsOneUseExtend(Shift))
20332 return SDValue();
20333
20334 SDLoc DL(N);
20335 EVT VT = N->getValueType(0);
20336
20337 SDValue Y = SUB.getOperand(1);
20338 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
20339 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
20340}
20341
20342static SDValue performAddCombineForShiftedOperands(SDNode *N,
20343 SelectionDAG &DAG) {
20344 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
20345 // commutative.
20346 if (N->getOpcode() != ISD::ADD)
20347 return SDValue();
20348
20349 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
20350 // shifted register is only available for i32 and i64.
20351 EVT VT = N->getValueType(0);
20352 if (VT != MVT::i32 && VT != MVT::i64)
20353 return SDValue();
20354
20355 SDLoc DL(N);
20356 SDValue LHS = N->getOperand(0);
20357 SDValue RHS = N->getOperand(1);
20358
20359 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
20360 return Val;
20361 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
20362 return Val;
20363
20364 uint64_t LHSImm = 0, RHSImm = 0;
20365 // If both operands are shifted by an immediate and the shift amount is not
20366 // greater than 4 for one of them, swap LHS and RHS to put the operand with
20367 // the smaller shift amount on the RHS.
20368 //
20369 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
20370 // LSL shift (shift <= 4) has smaller latency and larger throughput than ADD
20371 // with LSL (shift > 4). For the remaining processors, this is a no-op for
20372 // both performance and correctness.
20373 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
20374 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
20375 RHSImm > 4 && LHS.hasOneUse())
20376 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
20377
20378 return SDValue();
20379}
20380
20381// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
20382// This reassociates it back to allow the creation of more mls instructions.
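// For illustration (annotation, not part of the original source):
//   sub(x, add(mul(a, b), mul(c, d)))
// is rebuilt as sub(sub(x, mul(a, b)), mul(c, d)), so each step can be
// selected as an MLS (multiply-subtract).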
20383static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
20384 if (N->getOpcode() != ISD::SUB)
20385 return SDValue();
20386
20387 SDValue Add = N->getOperand(1);
20388 SDValue X = N->getOperand(0);
20389 if (Add.getOpcode() != ISD::ADD)
20390 return SDValue();
20391
20392 if (!Add.hasOneUse())
20393 return SDValue();
20394 if (DAG.isConstantIntBuildVectorOrConstantInt(X))
20395 return SDValue();
20396
20397 SDValue M1 = Add.getOperand(0);
20398 SDValue M2 = Add.getOperand(1);
20399 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
20400 M1.getOpcode() != AArch64ISD::UMULL)
20401 return SDValue();
20402 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
20403 M2.getOpcode() != AArch64ISD::UMULL)
20404 return SDValue();
20405
20406 EVT VT = N->getValueType(0);
20407 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
20408 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
20409}
20410
20411// Combine into mla/mls.
20412// This works on the patterns of:
20413// add v1, (mul v2, v3)
20414// sub v1, (mul v2, v3)
20415// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
20416// It will transform the add/sub to a scalable version, so that we can
20417// make use of SVE's MLA/MLS that will be generated for that pattern
20418static SDValue
20419performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
20420 SelectionDAG &DAG = DCI.DAG;
20421 // Make sure that the types are legal
20422 if (!DCI.isAfterLegalizeDAG())
20423 return SDValue();
20424 // Before using SVE's features, check first if it's available.
20425 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
20426 return SDValue();
20427
20428 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
20429 return SDValue();
20430
20431 if (!N->getValueType(0).isFixedLengthVector())
20432 return SDValue();
20433
20434 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
20435 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
20436 return SDValue();
20437
20438 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
20439 return SDValue();
20440
20441 SDValue MulValue = Op1->getOperand(0);
20442 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
20443 return SDValue();
20444
20445 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
20446 return SDValue();
20447
20448 EVT ScalableVT = MulValue.getValueType();
20449 if (!ScalableVT.isScalableVector())
20450 return SDValue();
20451
20452 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
20453 SDValue NewValue =
20454 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
20455 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
20456 };
20457
20458 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
20459 return res;
20460 else if (N->getOpcode() == ISD::ADD)
20461 return performOpt(N->getOperand(1), N->getOperand(0));
20462
20463 return SDValue();
20464}
20465
20466// Given a i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
20467// help, for example, to produce ssra from sshr+add.
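// For illustration (annotation, not part of the original source):
//   (add (extract_vector_elt (v1i64 X), 0), (i64 load p))
// becomes (extract_vector_elt (add X, (scalar_to_vector (load p))), 0), so
// the add stays in the vector domain where patterns such as sshr+add -> ssra
// can apply.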
20468static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
20469 EVT VT = N->getValueType(0);
20470 if (VT != MVT::i64 ||
20471 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
20472 return SDValue();
20473 SDValue Op0 = N->getOperand(0);
20474 SDValue Op1 = N->getOperand(1);
20475
20476 // At least one of the operands should be an extract, and the other should be
20477 // something that is easy to convert to v1i64 type (in this case a load).
20478 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20479 Op0.getOpcode() != ISD::LOAD)
20480 return SDValue();
20481 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
20482 Op1.getOpcode() != ISD::LOAD)
20483 return SDValue();
20484
20485 SDLoc DL(N);
20486 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20487 Op0.getOperand(0).getValueType() == MVT::v1i64) {
20488 Op0 = Op0.getOperand(0);
20489 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
20490 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20491 Op1.getOperand(0).getValueType() == MVT::v1i64) {
20492 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
20493 Op1 = Op1.getOperand(0);
20494 } else
20495 return SDValue();
20496
20497 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
20498 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
20499 DAG.getConstant(0, DL, MVT::i64));
20500}
20501
20502static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
20503 SDValue BV = peekThroughOneUseBitcasts(B);
20504 if (!BV->hasOneUse())
20505 return false;
20506 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
20507 if (!Ld || !Ld->isSimple())
20508 return false;
20509 Loads.push_back(Ld);
20510 return true;
20511 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
20512 BV.getOpcode() == ISD::CONCAT_VECTORS) {
20513 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
20514 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
20515 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
20516 return false;
20517 Loads.push_back(Ld);
20518 }
20519 return true;
20520 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
20521 // Try to find a tree of shuffles and concats from how IR shuffles of loads
20522 // are lowered. Note that this only comes up because we do not always visit
20523 // operands before uses. After that is fixed this can be removed and in the
20524 // meantime this is fairly specific to the lowering we expect from IR.
20525 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
20526 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
20527 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
20528 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
20529 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
20530 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
20531 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
20532 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
20533 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
20534 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
20535 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
20536 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20537 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
20538 B.getOperand(1).getNumOperands() != 4)
20539 return false;
20540 auto SV1 = cast<ShuffleVectorSDNode>(B);
20541 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
20542 int NumElts = B.getValueType().getVectorNumElements();
20543 int NumSubElts = NumElts / 4;
20544 for (int I = 0; I < NumSubElts; I++) {
20545 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
20546 if (SV1->getMaskElt(I) != I ||
20547 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20548 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
20549 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
20550 return false;
20551 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
20552 if (SV2->getMaskElt(I) != I ||
20553 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
20554 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
20555 return false;
20556 }
20557 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
20558 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
20559 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
20560 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
20561 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
20562 !Ld2->isSimple() || !Ld3->isSimple())
20563 return false;
20564 Loads.push_back(Ld0);
20565 Loads.push_back(Ld1);
20566 Loads.push_back(Ld2);
20567 Loads.push_back(Ld3);
20568 return true;
20569 }
20570 return false;
20571}
20572
20573static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
20574 SelectionDAG &DAG,
20575 unsigned &NumSubLoads) {
20576 if (!Op0.hasOneUse() || !Op1.hasOneUse())
20577 return false;
20578
20579 SmallVector<LoadSDNode *> Loads0, Loads1;
20580 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20581 isLoadOrMultipleLoads(Op1, Loads1)) {
20582 if (NumSubLoads && Loads0.size() != NumSubLoads)
20583 return false;
20584 NumSubLoads = Loads0.size();
20585 return Loads0.size() == Loads1.size() &&
20586 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
20587 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
20588 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
20589 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
20590 Size / 8, 1);
20591 });
20592 }
20593
20594 if (Op0.getOpcode() != Op1.getOpcode())
20595 return false;
20596
20597 switch (Op0.getOpcode()) {
20598 case ISD::ADD:
20599 case ISD::SUB:
20600 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20601 DAG, NumSubLoads) &&
20602 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
20603 DAG, NumSubLoads);
20604 case ISD::SIGN_EXTEND:
20605 case ISD::ANY_EXTEND:
20606 case ISD::ZERO_EXTEND:
20607 EVT XVT = Op0.getOperand(0).getValueType();
20608 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
20609 XVT.getScalarSizeInBits() != 32)
20610 return false;
20611 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
20612 DAG, NumSubLoads);
20613 }
20614 return false;
20615}
20616
20617// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
20618// into a single load of twice the size, from which we extract the bottom and
20619// top parts so that the shl can use a shll2 instruction. The two loads in that
20620// example can also be larger trees of instructions, which are identical except
20621// for the leaves which are all loads offset from the LHS, including
20622// buildvectors of multiple loads. For example the RHS tree could be
20623// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
20624// Whilst it can be common for the larger loads to replace LDP instructions
20625// (which doesn't gain anything on its own), the larger loads can help create
20626// more efficient code, and in buildvectors prevent the need for ld1 lane
20627// inserts which can be slower than normal loads.
20628static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
20629 EVT VT = N->getValueType(0);
20630 if (!VT.isFixedLengthVector() ||
20631 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
20632 VT.getScalarSizeInBits() != 64))
20633 return SDValue();
20634
20635 SDValue Other = N->getOperand(0);
20636 SDValue Shift = N->getOperand(1);
20637 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
20638 std::swap(Shift, Other);
20639 APInt ShiftAmt;
20640 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
20641 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
20642 return SDValue();
20643
20644 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
20645 !ISD::isExtOpcode(Other.getOpcode()) ||
20646 Shift.getOperand(0).getOperand(0).getValueType() !=
20647 Other.getOperand(0).getValueType() ||
20648 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
20649 return SDValue();
20650
20651 SDValue Op0 = Other.getOperand(0);
20652 SDValue Op1 = Shift.getOperand(0).getOperand(0);
20653
20654 unsigned NumSubLoads = 0;
20655 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
20656 return SDValue();
20657
20658 // Attempt to rule out some unprofitable cases using heuristics (some working
20659 // around suboptimal code generation), notably if the extend would not be able
20660 // to use ushll2 instructions because the types are not large enough. Otherwise
20661 // zips will need to be created, which can increase the instruction count.
20662 unsigned NumElts = Op0.getValueType().getVectorNumElements();
20663 unsigned NumSubElts = NumElts / NumSubLoads;
20664 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
20665 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
20666 Op0.getValueType().getSizeInBits() < 128 &&
20667 !DAG.getTargetLoweringInfo().isTypeLegal(Op0.getValueType())))
20668 return SDValue();
20669
20670 // Recreate the tree with the new combined loads.
20671 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
20672 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
20673 EVT DVT =
20674 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
20675
20676 SmallVector<LoadSDNode *> Loads0, Loads1;
20677 if (isLoadOrMultipleLoads(Op0, Loads0) &&
20678 isLoadOrMultipleLoads(Op1, Loads1)) {
20679 EVT LoadVT = EVT::getVectorVT(
20680 *DAG.getContext(), Op0.getValueType().getScalarType(),
20681 Op0.getValueType().getVectorNumElements() / Loads0.size());
20682 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
20683
20684 SmallVector<SDValue> NewLoads;
20685 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
20686 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
20687 L0->getBasePtr(), L0->getPointerInfo(),
20688 L0->getOriginalAlign());
20689 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
20690 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
20691 NewLoads.push_back(Load);
20692 }
20693 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
20694 }
20695
20696 SmallVector<SDValue> Ops;
20697 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
20698 Ops.push_back(GenCombinedTree(O0, O1, DAG));
20699 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
20700 };
20701 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
20702
20703 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
20704 int Hi = NumSubElts, Lo = 0;
20705 for (unsigned i = 0; i < NumSubLoads; i++) {
20706 for (unsigned j = 0; j < NumSubElts; j++) {
20707 LowMask[i * NumSubElts + j] = Lo++;
20708 HighMask[i * NumSubElts + j] = Hi++;
20709 }
20710 Lo += NumSubElts;
20711 Hi += NumSubElts;
20712 }
20713 SDLoc DL(N);
20714 SDValue Ext0, Ext1;
20715 // Extract the top and bottom lanes, then extend the result. Alternatively,
20716 // extend the result and then extract the lanes if the two operands match, as
20717 // that produces slightly smaller code.
20718 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
20719 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
20720 NewOp, DAG.getConstant(0, DL, MVT::i64));
20721 SDValue SubH =
20722 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
20723 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20724 SDValue Extr0 =
20725 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
20726 SDValue Extr1 =
20727 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
20728 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
20729 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
20730 } else {
20731 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
20732 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
20733 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20734 DAG.getConstant(0, DL, MVT::i64));
20735 SDValue SubH =
20736 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
20737 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
20738 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
20739 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
20740 }
20741 SDValue NShift =
20742 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
20743 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
20744}
20745
20746static SDValue performAddSubCombine(SDNode *N,
20747 TargetLowering::DAGCombinerInfo &DCI) {
20748 // Try to change sum of two reductions.
20749 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
20750 return Val;
20751 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
20752 return Val;
20753 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
20754 return Val;
20755 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
20756 return Val;
20757 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
20758 return Val;
20759 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
20760 return Val;
20761 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
20762 return Val;
20763 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
20764 return Val;
20765 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
20766 return Val;
20767
20768 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
20769 return Val;
20770
20771 return performAddSubLongCombine(N, DCI);
20772}
20773
20774// Massage DAGs which we can use the high-half "long" operations on into
20775// something isel will recognize better. E.g.
20776//
20777// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
20778// (aarch64_neon_umull (extract_high (v2i64 vec)))
20779// (extract_high (v2i64 (dup128 scalar)))))
20780//
20781static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
20782 TargetLowering::DAGCombinerInfo &DCI,
20783 SelectionDAG &DAG) {
20784 if (DCI.isBeforeLegalizeOps())
20785 return SDValue();
20786
20787 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
20788 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
20789 assert(LHS.getValueType().is64BitVector() &&
20790 RHS.getValueType().is64BitVector() &&
20791 "unexpected shape for long operation");
20792
20793 // Either node could be a DUP, but it's not worth doing both of them (you'd
20794 // just as well use the non-high version) so look for a corresponding extract
20795 // operation on the other "wing".
20796 if (isEssentiallyExtractHighSubvector(LHS)) {
20797 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
20798 if (!RHS.getNode())
20799 return SDValue();
20800 } else if (isEssentiallyExtractHighSubvector(RHS)) {
20801 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
20802 if (!LHS.getNode())
20803 return SDValue();
20804 } else
20805 return SDValue();
20806
20807 if (IID == Intrinsic::not_intrinsic)
20808 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
20809
20810 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
20811 N->getOperand(0), LHS, RHS);
20812}
20813
20814static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
20815 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
20816 unsigned ElemBits = ElemTy.getSizeInBits();
20817
20818 int64_t ShiftAmount;
20819 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
20820 APInt SplatValue, SplatUndef;
20821 unsigned SplatBitSize;
20822 bool HasAnyUndefs;
20823 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
20824 HasAnyUndefs, ElemBits) ||
20825 SplatBitSize != ElemBits)
20826 return SDValue();
20827
20828 ShiftAmount = SplatValue.getSExtValue();
20829 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
20830 ShiftAmount = CVN->getSExtValue();
20831 } else
20832 return SDValue();
20833
20834 // If the shift amount is zero, remove the shift intrinsic.
20835 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
20836 return N->getOperand(1);
20837
20838 unsigned Opcode;
20839 bool IsRightShift;
20840 switch (IID) {
20841 default:
20842 llvm_unreachable("Unknown shift intrinsic");
20843 case Intrinsic::aarch64_neon_sqshl:
20844 Opcode = AArch64ISD::SQSHL_I;
20845 IsRightShift = false;
20846 break;
20847 case Intrinsic::aarch64_neon_uqshl:
20848 Opcode = AArch64ISD::UQSHL_I;
20849 IsRightShift = false;
20850 break;
20851 case Intrinsic::aarch64_neon_srshl:
20852 Opcode = AArch64ISD::SRSHR_I;
20853 IsRightShift = true;
20854 break;
20855 case Intrinsic::aarch64_neon_urshl:
20856 Opcode = AArch64ISD::URSHR_I;
20857 IsRightShift = true;
20858 break;
20859 case Intrinsic::aarch64_neon_sqshlu:
20860 Opcode = AArch64ISD::SQSHLU_I;
20861 IsRightShift = false;
20862 break;
20863 case Intrinsic::aarch64_neon_sshl:
20864 case Intrinsic::aarch64_neon_ushl:
20865 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
20866 // left shift for positive shift amounts. For negative shifts we can use a
20867 // VASHR/VLSHR as appropriate.
20868 if (ShiftAmount < 0) {
20869 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
20870 : AArch64ISD::VLSHR;
20871 ShiftAmount = -ShiftAmount;
20872 } else
20873 Opcode = AArch64ISD::VSHL;
20874 IsRightShift = false;
20875 break;
20876 }
20877
20878 EVT VT = N->getValueType(0);
20879 SDValue Op = N->getOperand(1);
20880 SDLoc dl(N);
20881 if (VT == MVT::i64) {
20882 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op);
20883 VT = MVT::v1i64;
20884 }
20885
20886 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
20887 Op = DAG.getNode(Opcode, dl, VT, Op,
20888 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
20889 if (N->getValueType(0) == MVT::i64)
20890 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20891 DAG.getConstant(0, dl, MVT::i64));
20892 return Op;
20893 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
20894 Op = DAG.getNode(Opcode, dl, VT, Op,
20895 DAG.getConstant(ShiftAmount, dl, MVT::i32));
20896 if (N->getValueType(0) == MVT::i64)
20897 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Op,
20898 DAG.getConstant(0, dl, MVT::i64));
20899 return Op;
20900 }
20901
20902 return SDValue();
20903}
20904
20905// The CRC32[BH] instructions ignore the high bits of their data operand. Since
20906// the intrinsics must be legal and take an i32, this means there's almost
20907// certainly going to be a zext in the DAG which we can eliminate.
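// For illustration (annotation, not part of the original source): for crc32b
// the data operand often arrives as (and data, 0xff); since CRC32B only reads
// the low 8 bits anyway, the AND is dropped and the original data value is
// passed through directly.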
20908static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
20909 SDValue AndN = N->getOperand(2);
20910 if (AndN.getOpcode() != ISD::AND)
20911 return SDValue();
20912
20913 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
20914 if (!CMask || CMask->getZExtValue() != Mask)
20915 return SDValue();
20916
20917 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
20918 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
20919}
20920
20921static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
20922 SelectionDAG &DAG) {
20923 SDLoc dl(N);
20924 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
20925 DAG.getNode(Opc, dl,
20926 N->getOperand(1).getSimpleValueType(),
20927 N->getOperand(1)),
20928 DAG.getConstant(0, dl, MVT::i64));
20929}
20930
20931static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
20932 SDLoc DL(N);
20933 SDValue Op1 = N->getOperand(1);
20934 SDValue Op2 = N->getOperand(2);
20935 EVT ScalarTy = Op2.getValueType();
20936 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20937 ScalarTy = MVT::i32;
20938
20939 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
20940 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
20941 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
20942 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
20943 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
20944 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
20945}
20946
20947static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
20948 SDLoc dl(N);
20949 SDValue Scalar = N->getOperand(3);
20950 EVT ScalarTy = Scalar.getValueType();
20951
20952 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
20953 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
20954
20955 SDValue Passthru = N->getOperand(1);
20956 SDValue Pred = N->getOperand(2);
20957 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
20958 Pred, Scalar, Passthru);
20959}
20960
20961static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
20962 SDLoc dl(N);
20963 LLVMContext &Ctx = *DAG.getContext();
20964 EVT VT = N->getValueType(0);
20965
20966 assert(VT.isScalableVector() && "Expected a scalable vector.");
20967
20968 // Current lowering only supports the SVE-ACLE types.
20969 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
20970 return SDValue();
20971
20972 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
20973 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
20974 EVT ByteVT =
20975 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
20976
20977 // Convert everything to the domain of EXT (i.e bytes).
20978 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
20979 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
20980 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
20981 DAG.getConstant(ElemSize, dl, MVT::i32));
20982
20983 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
20984 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
20985}
20986
20987static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
20988 TargetLowering::DAGCombinerInfo &DCI,
20989 SelectionDAG &DAG) {
20990 if (DCI.isBeforeLegalize())
20991 return SDValue();
20992
20993 SDValue Comparator = N->getOperand(3);
20994 if (Comparator.getOpcode() == AArch64ISD::DUP ||
20995 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
20996 unsigned IID = getIntrinsicID(N);
20997 EVT VT = N->getValueType(0);
20998 EVT CmpVT = N->getOperand(2).getValueType();
20999 SDValue Pred = N->getOperand(1);
21000 SDValue Imm;
21001 SDLoc DL(N);
21002
21003 switch (IID) {
21004 default:
21005 llvm_unreachable("Called with wrong intrinsic!");
21006 break;
21007
21008 // Signed comparisons
21009 case Intrinsic::aarch64_sve_cmpeq_wide:
21010 case Intrinsic::aarch64_sve_cmpne_wide:
21011 case Intrinsic::aarch64_sve_cmpge_wide:
21012 case Intrinsic::aarch64_sve_cmpgt_wide:
21013 case Intrinsic::aarch64_sve_cmplt_wide:
21014 case Intrinsic::aarch64_sve_cmple_wide: {
21015 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21016 int64_t ImmVal = CN->getSExtValue();
21017 if (ImmVal >= -16 && ImmVal <= 15)
21018 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
21019 else
21020 return SDValue();
21021 }
21022 break;
21023 }
21024 // Unsigned comparisons
21025 case Intrinsic::aarch64_sve_cmphs_wide:
21026 case Intrinsic::aarch64_sve_cmphi_wide:
21027 case Intrinsic::aarch64_sve_cmplo_wide:
21028 case Intrinsic::aarch64_sve_cmpls_wide: {
21029 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
21030 uint64_t ImmVal = CN->getZExtValue();
21031 if (ImmVal <= 127)
21032 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
21033 else
21034 return SDValue();
21035 }
21036 break;
21037 }
21038 }
21039
21040 if (!Imm)
21041 return SDValue();
21042
21043 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
21044 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
21045 N->getOperand(2), Splat, DAG.getCondCode(CC));
21046 }
21047
21048 return SDValue();
21049}
21050
21051static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21052 AArch64CC::CondCode Cond) {
21053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21054
21055 SDLoc DL(Op);
21056 assert(Op.getValueType().isScalableVector() &&
21057 TLI.isTypeLegal(Op.getValueType()) &&
21058 "Expected legal scalable vector type!");
21059 assert(Op.getValueType() == Pg.getValueType() &&
21060 "Expected same type for PTEST operands");
21061
21062 // Ensure target specific opcodes are using legal type.
21063 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
21064 SDValue TVal = DAG.getConstant(1, DL, OutVT);
21065 SDValue FVal = DAG.getConstant(0, DL, OutVT);
21066
21067 // Ensure operands have type nxv16i1.
21068 if (Op.getValueType() != MVT::nxv16i1) {
21071 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
21072 else
21073 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
21074 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
21075 }
21076
21077 // Set condition code (CC) flags.
21078 SDValue Test = DAG.getNode(
21079 Cond == AArch64CC::ANY_ACTIVE ? AArch64ISD::PTEST_ANY : AArch64ISD::PTEST,
21080 DL, MVT::Other, Pg, Op);
21081
21082 // Convert CC to integer based on requested condition.
21083 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
21084 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
21085 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
21086 return DAG.getZExtOrTrunc(Res, DL, VT);
21087}
21088
21089static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
21090 SelectionDAG &DAG) {
21091 SDLoc DL(N);
21092
21093 SDValue Pred = N->getOperand(1);
21094 SDValue VecToReduce = N->getOperand(2);
21095
21096 // NOTE: The integer reduction's result type is not always linked to the
21097 // operand's element type so we construct it from the intrinsic's result type.
21098 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
21099 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21100
21101 // SVE reductions set the whole vector register with the first element
21102 // containing the reduction result, which we'll now extract.
21103 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21104 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21105 Zero);
21106}
21107
21108static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
21109 SelectionDAG &DAG) {
21110 SDLoc DL(N);
21111
21112 SDValue Pred = N->getOperand(1);
21113 SDValue VecToReduce = N->getOperand(2);
21114
21115 EVT ReduceVT = VecToReduce.getValueType();
21116 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
21117
21118 // SVE reductions set the whole vector register with the first element
21119 // containing the reduction result, which we'll now extract.
21120 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21121 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21122 Zero);
21123}
21124
21125static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
21126 SelectionDAG &DAG) {
21127 SDLoc DL(N);
21128
21129 SDValue Pred = N->getOperand(1);
21130 SDValue InitVal = N->getOperand(2);
21131 SDValue VecToReduce = N->getOperand(3);
21132 EVT ReduceVT = VecToReduce.getValueType();
21133
21134 // Ordered reductions use the first lane of the result vector as the
21135 // reduction's initial value.
21136 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21137 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
21138 DAG.getUNDEF(ReduceVT), InitVal, Zero);
21139
21140 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
21141
21142 // SVE reductions set the whole vector register with the first element
21143 // containing the reduction result, which we'll now extract.
21144 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
21145 Zero);
21146}
21147
21148// If a merged operation has no inactive lanes we can relax it to a predicated
21149// or unpredicated operation, which potentially allows better isel (perhaps
21150// using immediate forms) or relaxing register reuse requirements.
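// For illustration (annotation, not part of the original source): the
// aarch64_sve_subr case later in this file calls this with UnpredOp and
// SwapOperands set, so a subr whose governing predicate is all-active is
// relaxed to a plain (sub op2, op1) node.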
21151static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
21152 SelectionDAG &DAG, bool UnpredOp = false,
21153 bool SwapOperands = false) {
21154 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
21155 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
21156 SDValue Pg = N->getOperand(1);
21157 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
21158 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
21159
21160 // ISD way to specify an all active predicate.
21161 if (isAllActivePredicate(DAG, Pg)) {
21162 if (UnpredOp)
21163 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
21164
21165 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
21166 }
21167
21168 // FUTURE: SplatVector(true)
21169 return SDValue();
21170}
21171
21172static SDValue tryCombineWhileLo(SDNode *N,
21173 TargetLowering::DAGCombinerInfo &DCI,
21174 const AArch64Subtarget *Subtarget) {
21175 if (DCI.isBeforeLegalize())
21176 return SDValue();
21177
21178 if (!Subtarget->hasSVE2p1())
21179 return SDValue();
21180
21181 if (!N->hasNUsesOfValue(2, 0))
21182 return SDValue();
21183
21184 const uint64_t HalfSize = N->getValueType(0).getVectorMinNumElements() / 2;
21185 if (HalfSize < 2)
21186 return SDValue();
21187
21188 auto It = N->use_begin();
21189 SDNode *Lo = *It++;
21190 SDNode *Hi = *It;
21191
21192 if (Lo->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
21193 Hi->getOpcode() != ISD::EXTRACT_SUBVECTOR)
21194 return SDValue();
21195
21196 uint64_t OffLo = Lo->getConstantOperandVal(1);
21197 uint64_t OffHi = Hi->getConstantOperandVal(1);
21198
21199 if (OffLo > OffHi) {
21200 std::swap(Lo, Hi);
21201 std::swap(OffLo, OffHi);
21202 }
21203
21204 if (OffLo != 0 || OffHi != HalfSize)
21205 return SDValue();
21206
21207 EVT HalfVec = Lo->getValueType(0);
21208 if (HalfVec != Hi->getValueType(0) ||
21209 HalfVec.getVectorElementCount() != ElementCount::getScalable(HalfSize))
21210 return SDValue();
21211
21212 SelectionDAG &DAG = DCI.DAG;
21213 SDLoc DL(N);
21214 SDValue ID =
21215 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
21216 SDValue Idx = N->getOperand(1);
21217 SDValue TC = N->getOperand(2);
21218 if (Idx.getValueType() != MVT::i64) {
21219 Idx = DAG.getZExtOrTrunc(Idx, DL, MVT::i64);
21220 TC = DAG.getZExtOrTrunc(TC, DL, MVT::i64);
21221 }
21222 auto R =
21223 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
21224 {Lo->getValueType(0), Hi->getValueType(0)}, {ID, Idx, TC});
21225
21226 DCI.CombineTo(Lo, R.getValue(0));
21227 DCI.CombineTo(Hi, R.getValue(1));
21228
21229 return SDValue(N, 0);
21230}
21231
21232static SDValue performIntrinsicCombine(SDNode *N,
21233 TargetLowering::DAGCombinerInfo &DCI,
21234 const AArch64Subtarget *Subtarget) {
21235 SelectionDAG &DAG = DCI.DAG;
21236 unsigned IID = getIntrinsicID(N);
21237 switch (IID) {
21238 default:
21239 break;
21240 case Intrinsic::aarch64_neon_vcvtfxs2fp:
21241 case Intrinsic::aarch64_neon_vcvtfxu2fp:
21242 return tryCombineFixedPointConvert(N, DCI, DAG);
21243 case Intrinsic::aarch64_neon_saddv:
21244 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
21245 case Intrinsic::aarch64_neon_uaddv:
21246 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
21247 case Intrinsic::aarch64_neon_sminv:
21248 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
21249 case Intrinsic::aarch64_neon_uminv:
21250 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
21251 case Intrinsic::aarch64_neon_smaxv:
21252 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
21253 case Intrinsic::aarch64_neon_umaxv:
21254 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
21255 case Intrinsic::aarch64_neon_fmax:
21256 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
21257 N->getOperand(1), N->getOperand(2));
21258 case Intrinsic::aarch64_neon_fmin:
21259 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
21260 N->getOperand(1), N->getOperand(2));
21261 case Intrinsic::aarch64_neon_fmaxnm:
21262 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
21263 N->getOperand(1), N->getOperand(2));
21264 case Intrinsic::aarch64_neon_fminnm:
21265 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
21266 N->getOperand(1), N->getOperand(2));
21267 case Intrinsic::aarch64_neon_smull:
21268 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
21269 N->getOperand(1), N->getOperand(2));
21270 case Intrinsic::aarch64_neon_umull:
21271 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
21272 N->getOperand(1), N->getOperand(2));
21273 case Intrinsic::aarch64_neon_pmull:
21274 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
21275 N->getOperand(1), N->getOperand(2));
21276 case Intrinsic::aarch64_neon_sqdmull:
21277 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
21278 case Intrinsic::aarch64_neon_sqshl:
21279 case Intrinsic::aarch64_neon_uqshl:
21280 case Intrinsic::aarch64_neon_sqshlu:
21281 case Intrinsic::aarch64_neon_srshl:
21282 case Intrinsic::aarch64_neon_urshl:
21283 case Intrinsic::aarch64_neon_sshl:
21284 case Intrinsic::aarch64_neon_ushl:
21285 return tryCombineShiftImm(IID, N, DAG);
21286 case Intrinsic::aarch64_neon_sabd:
21287 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21288 N->getOperand(1), N->getOperand(2));
21289 case Intrinsic::aarch64_neon_uabd:
21290 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21291 N->getOperand(1), N->getOperand(2));
21292 case Intrinsic::aarch64_crc32b:
21293 case Intrinsic::aarch64_crc32cb:
21294 return tryCombineCRC32(0xff, N, DAG);
21295 case Intrinsic::aarch64_crc32h:
21296 case Intrinsic::aarch64_crc32ch:
21297 return tryCombineCRC32(0xffff, N, DAG);
21298 case Intrinsic::aarch64_sve_saddv:
21299 // There is no i64 version of SADDV because the sign is irrelevant.
21300 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
21301 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
21302 else
21303 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
21304 case Intrinsic::aarch64_sve_uaddv:
21305 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
21306 case Intrinsic::aarch64_sve_smaxv:
21307 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
21308 case Intrinsic::aarch64_sve_umaxv:
21309 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
21310 case Intrinsic::aarch64_sve_sminv:
21311 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
21312 case Intrinsic::aarch64_sve_uminv:
21313 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
21314 case Intrinsic::aarch64_sve_orv:
21315 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
21316 case Intrinsic::aarch64_sve_eorv:
21317 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
21318 case Intrinsic::aarch64_sve_andv:
21319 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
21320 case Intrinsic::aarch64_sve_index:
21321 return LowerSVEIntrinsicIndex(N, DAG);
21322 case Intrinsic::aarch64_sve_dup:
21323 return LowerSVEIntrinsicDUP(N, DAG);
21324 case Intrinsic::aarch64_sve_dup_x:
21325 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
21326 N->getOperand(1));
21327 case Intrinsic::aarch64_sve_ext:
21328 return LowerSVEIntrinsicEXT(N, DAG);
21329 case Intrinsic::aarch64_sve_mul_u:
21330 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
21331 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21332 case Intrinsic::aarch64_sve_smulh_u:
21333 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
21334 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21335 case Intrinsic::aarch64_sve_umulh_u:
21336 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
21337 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21338 case Intrinsic::aarch64_sve_smin_u:
21339 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
21340 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21341 case Intrinsic::aarch64_sve_umin_u:
21342 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
21343 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21344 case Intrinsic::aarch64_sve_smax_u:
21345 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
21346 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21347 case Intrinsic::aarch64_sve_umax_u:
21348 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
21349 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21350 case Intrinsic::aarch64_sve_lsl_u:
21351 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
21352 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21353 case Intrinsic::aarch64_sve_lsr_u:
21354 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
21355 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21356 case Intrinsic::aarch64_sve_asr_u:
21357 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
21358 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21359 case Intrinsic::aarch64_sve_fadd_u:
21360 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
21361 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21362 case Intrinsic::aarch64_sve_fdiv_u:
21363 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
21364 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21365 case Intrinsic::aarch64_sve_fmax_u:
21366 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
21367 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21368 case Intrinsic::aarch64_sve_fmaxnm_u:
21369 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
21370 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21371 case Intrinsic::aarch64_sve_fmla_u:
21372 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
21373 N->getOperand(1), N->getOperand(3), N->getOperand(4),
21374 N->getOperand(2));
21375 case Intrinsic::aarch64_sve_fmin_u:
21376 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
21377 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21378 case Intrinsic::aarch64_sve_fminnm_u:
21379 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
21380 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21381 case Intrinsic::aarch64_sve_fmul_u:
21382 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
21383 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21384 case Intrinsic::aarch64_sve_fsub_u:
21385 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
21386 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21387 case Intrinsic::aarch64_sve_add_u:
21388 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
21389 N->getOperand(3));
21390 case Intrinsic::aarch64_sve_sub_u:
21391 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
21392 N->getOperand(3));
21393 case Intrinsic::aarch64_sve_subr:
21394 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
21395 case Intrinsic::aarch64_sve_and_u:
21396 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
21397 N->getOperand(3));
21398 case Intrinsic::aarch64_sve_bic_u:
21399 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
21400 N->getOperand(2), N->getOperand(3));
21401 case Intrinsic::aarch64_sve_eor_u:
21402 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21403 N->getOperand(3));
21404 case Intrinsic::aarch64_sve_orr_u:
21405 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
21406 N->getOperand(3));
21407 case Intrinsic::aarch64_sve_sabd_u:
21408 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
21409 N->getOperand(2), N->getOperand(3));
21410 case Intrinsic::aarch64_sve_uabd_u:
21411 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
21412 N->getOperand(2), N->getOperand(3));
21413 case Intrinsic::aarch64_sve_sdiv_u:
21414 return DAG.getNode(AArch64ISD::SDIV_PRED, SDLoc(N), N->getValueType(0),
21415 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21416 case Intrinsic::aarch64_sve_udiv_u:
21417 return DAG.getNode(AArch64ISD::UDIV_PRED, SDLoc(N), N->getValueType(0),
21418 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21419 case Intrinsic::aarch64_sve_sqadd:
21420 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
21421 case Intrinsic::aarch64_sve_sqsub_u:
21422 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21423 N->getOperand(2), N->getOperand(3));
21424 case Intrinsic::aarch64_sve_uqadd:
21425 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
21426 case Intrinsic::aarch64_sve_uqsub_u:
21427 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21428 N->getOperand(2), N->getOperand(3));
21429 case Intrinsic::aarch64_sve_sqadd_x:
21430 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
21431 N->getOperand(1), N->getOperand(2));
21432 case Intrinsic::aarch64_sve_sqsub_x:
21433 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
21434 N->getOperand(1), N->getOperand(2));
21435 case Intrinsic::aarch64_sve_uqadd_x:
21436 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
21437 N->getOperand(1), N->getOperand(2));
21438 case Intrinsic::aarch64_sve_uqsub_x:
21439 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
21440 N->getOperand(1), N->getOperand(2));
21441 case Intrinsic::aarch64_sve_asrd:
21442 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
21443 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21444 case Intrinsic::aarch64_sve_cmphs:
21445 if (!N->getOperand(2).getValueType().isFloatingPoint())
21446 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21447 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21448 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
21449 break;
21450 case Intrinsic::aarch64_sve_cmphi:
21451 if (!N->getOperand(2).getValueType().isFloatingPoint())
21452 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21453 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21454 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
21455 break;
21456 case Intrinsic::aarch64_sve_fcmpge:
21457 case Intrinsic::aarch64_sve_cmpge:
21458 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21459 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21460 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
21461 break;
21462 case Intrinsic::aarch64_sve_fcmpgt:
21463 case Intrinsic::aarch64_sve_cmpgt:
21464 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21465 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21466 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
21467 break;
21468 case Intrinsic::aarch64_sve_fcmpeq:
21469 case Intrinsic::aarch64_sve_cmpeq:
21470 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21471 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21472 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
21473 break;
21474 case Intrinsic::aarch64_sve_fcmpne:
21475 case Intrinsic::aarch64_sve_cmpne:
21476 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21477 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21478 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
21479 break;
21480 case Intrinsic::aarch64_sve_fcmpuo:
21481 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
21482 N->getValueType(0), N->getOperand(1), N->getOperand(2),
21483 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
21484 break;
21485 case Intrinsic::aarch64_sve_fadda:
21486 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
21487 case Intrinsic::aarch64_sve_faddv:
21488 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
21489 case Intrinsic::aarch64_sve_fmaxnmv:
21490 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
21491 case Intrinsic::aarch64_sve_fmaxv:
21492 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
21493 case Intrinsic::aarch64_sve_fminnmv:
21494 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
21495 case Intrinsic::aarch64_sve_fminv:
21496 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
21497 case Intrinsic::aarch64_sve_sel:
21498 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
21499 N->getOperand(1), N->getOperand(2), N->getOperand(3));
21500 case Intrinsic::aarch64_sve_cmpeq_wide:
21501 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
21502 case Intrinsic::aarch64_sve_cmpne_wide:
21503 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
21504 case Intrinsic::aarch64_sve_cmpge_wide:
21505 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
21506 case Intrinsic::aarch64_sve_cmpgt_wide:
21507 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
21508 case Intrinsic::aarch64_sve_cmplt_wide:
21509 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
21510 case Intrinsic::aarch64_sve_cmple_wide:
21511 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
21512 case Intrinsic::aarch64_sve_cmphs_wide:
21513 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
21514 case Intrinsic::aarch64_sve_cmphi_wide:
21515 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
21516 case Intrinsic::aarch64_sve_cmplo_wide:
21517 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
21518 case Intrinsic::aarch64_sve_cmpls_wide:
21519 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
21520 case Intrinsic::aarch64_sve_ptest_any:
21521 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21522 AArch64CC::ANY_ACTIVE);
21523 case Intrinsic::aarch64_sve_ptest_first:
21524 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21525 AArch64CC::FIRST_ACTIVE);
21526 case Intrinsic::aarch64_sve_ptest_last:
21527 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
21528 AArch64CC::LAST_ACTIVE);
21529 case Intrinsic::aarch64_sve_whilelo:
21530 return tryCombineWhileLo(N, DCI, Subtarget);
21531 }
21532 return SDValue();
21533}
21534
21535static bool isCheapToExtend(const SDValue &N) {
21536 unsigned OC = N->getOpcode();
21537 return OC == ISD::LOAD || OC == ISD::MLOAD ||
21538 ISD::isConstantSplatVectorAllZeros(N.getNode());
21539}
21540
21541static SDValue
21542 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21543 SelectionDAG &DAG) {
21544 // If we have (sext (setcc A B)) and A and B are cheap to extend,
21545 // we can move the sext into the arguments and have the same result. For
21546 // example, if A and B are both loads, we can make those extending loads and
21547 // avoid an extra instruction. This pattern appears often in VLS code
21548 // generation where the inputs to the setcc have a different size to the
21549 // instruction that wants to use the result of the setcc.
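// Illustrative sketch (types assumed purely for exposition): with v8i8 loads
// %a and %b feeding a signed compare whose result is sign-extended to v8i16,
//   (v8i16 (sign_extend (setcc (load %a), (load %b), setlt)))
// can become
//   (v8i16 (setcc (sextload %a), (sextload %b), setlt))
// so the extension is folded into the loads instead of being applied to the
// boolean result afterwards.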
21550 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
21551 N->getOperand(0)->getOpcode() == ISD::SETCC);
21552 const SDValue SetCC = N->getOperand(0);
21553
21554 const SDValue CCOp0 = SetCC.getOperand(0);
21555 const SDValue CCOp1 = SetCC.getOperand(1);
21556 if (!CCOp0->getValueType(0).isInteger() ||
21557 !CCOp1->getValueType(0).isInteger())
21558 return SDValue();
21559
21560 ISD::CondCode Code =
21561 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
21562
21563 ISD::NodeType ExtType =
21564 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21565
21566 if (isCheapToExtend(SetCC.getOperand(0)) &&
21567 isCheapToExtend(SetCC.getOperand(1))) {
21568 const SDValue Ext1 =
21569 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
21570 const SDValue Ext2 =
21571 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
21572
21573 return DAG.getSetCC(
21574 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
21575 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
21576 }
21577
21578 return SDValue();
21579}
21580
21581 static SDValue performExtendCombine(SDNode *N,
21582 TargetLowering::DAGCombinerInfo &DCI,
21583 SelectionDAG &DAG) {
21584 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
21585 // we can convert that DUP into another extract_high (of a bigger DUP), which
21586 // helps the backend to decide that an sabdl2 would be useful, saving a real
21587 // extract_high operation.
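// Rough example of the intent (sketch only):
//   (zero_extend (abdu (extract_high X), (dup C)))
// is rewritten so the DUP becomes the high half of a wider DUP, letting
// instruction selection form uabdl2 without a separate extract_high.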
21588 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
21589 (N->getOperand(0).getOpcode() == ISD::ABDU ||
21590 N->getOperand(0).getOpcode() == ISD::ABDS)) {
21591 SDNode *ABDNode = N->getOperand(0).getNode();
21592 SDValue NewABD =
21594 if (!NewABD.getNode())
21595 return SDValue();
21596
21597 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
21598 }
21599
21600 if (N->getValueType(0).isFixedLengthVector() &&
21601 N->getOpcode() == ISD::SIGN_EXTEND &&
21602 N->getOperand(0)->getOpcode() == ISD::SETCC)
21603 return performSignExtendSetCCCombine(N, DCI, DAG);
21604
21605 return SDValue();
21606}
21607
21608 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
21609 SDValue SplatVal, unsigned NumVecElts) {
21610 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
21611 Align OrigAlignment = St.getAlign();
21612 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
21613
21614 // Create scalar stores. This is at least as good as the code sequence for a
21615 // split unaligned store which is a dup.s, ext.b, and two stores.
21616 // Most of the time the three stores should be replaced by store pair
21617 // instructions (stp).
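// E.g. for a v2i64 splat of x1 stored to [x0] this emits
//   str x1, [x0]
//   str x1, [x0, #8]
// which the load/store optimizer will normally fuse into
//   stp x1, x1, [x0]
// (illustrative sketch; register names are arbitrary).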
21618 SDLoc DL(&St);
21619 SDValue BasePtr = St.getBasePtr();
21620 uint64_t BaseOffset = 0;
21621
21622 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
21623 SDValue NewST1 =
21624 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
21625 OrigAlignment, St.getMemOperand()->getFlags());
21626
21627 // As this is in ISel, we will not merge this add, which may degrade results.
21628 if (BasePtr->getOpcode() == ISD::ADD &&
21629 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
21630 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
21631 BasePtr = BasePtr->getOperand(0);
21632 }
21633
21634 unsigned Offset = EltOffset;
21635 while (--NumVecElts) {
21636 Align Alignment = commonAlignment(OrigAlignment, Offset);
21637 SDValue OffsetPtr =
21638 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21639 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
21640 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
21641 PtrInfo.getWithOffset(Offset), Alignment,
21642 St.getMemOperand()->getFlags());
21643 Offset += EltOffset;
21644 }
21645 return NewST1;
21646}
21647
21648// Returns an SVE type that ContentTy can be trivially sign or zero extended
21649// into.
21650static MVT getSVEContainerType(EVT ContentTy) {
21651 assert(ContentTy.isSimple() && "No SVE containers for extended types");
21652
21653 switch (ContentTy.getSimpleVT().SimpleTy) {
21654 default:
21655 llvm_unreachable("No known SVE container for this MVT type");
21656 case MVT::nxv2i8:
21657 case MVT::nxv2i16:
21658 case MVT::nxv2i32:
21659 case MVT::nxv2i64:
21660 case MVT::nxv2f32:
21661 case MVT::nxv2f64:
21662 return MVT::nxv2i64;
21663 case MVT::nxv4i8:
21664 case MVT::nxv4i16:
21665 case MVT::nxv4i32:
21666 case MVT::nxv4f32:
21667 return MVT::nxv4i32;
21668 case MVT::nxv8i8:
21669 case MVT::nxv8i16:
21670 case MVT::nxv8f16:
21671 case MVT::nxv8bf16:
21672 return MVT::nxv8i16;
21673 case MVT::nxv16i8:
21674 return MVT::nxv16i8;
21675 }
21676}
21677
21678static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
21679 SDLoc DL(N);
21680 EVT VT = N->getValueType(0);
21681
21683 return SDValue();
21684
21685 EVT ContainerVT = VT;
21686 if (ContainerVT.isInteger())
21687 ContainerVT = getSVEContainerType(ContainerVT);
21688
21689 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
21690 SDValue Ops[] = { N->getOperand(0), // Chain
21691 N->getOperand(2), // Pg
21692 N->getOperand(3), // Base
21693 DAG.getValueType(VT) };
21694
21695 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
21696 SDValue LoadChain = SDValue(Load.getNode(), 1);
21697
21698 if (ContainerVT.isInteger() && (VT != ContainerVT))
21699 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
21700
21701 return DAG.getMergeValues({ Load, LoadChain }, DL);
21702}
21703
21704 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
21705 SDLoc DL(N);
21706 EVT VT = N->getValueType(0);
21707 EVT PtrTy = N->getOperand(3).getValueType();
21708
21709 EVT LoadVT = VT;
21710 if (VT.isFloatingPoint())
21711 LoadVT = VT.changeTypeToInteger();
21712
21713 auto *MINode = cast<MemIntrinsicSDNode>(N);
21714 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
21715 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
21716 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
21717 MINode->getOperand(2), PassThru,
21718 MINode->getMemoryVT(), MINode->getMemOperand(),
21720
21721 if (VT.isFloatingPoint()) {
21722 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
21723 return DAG.getMergeValues(Ops, DL);
21724 }
21725
21726 return L;
21727}
21728
21729template <unsigned Opcode>
21730 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
21731 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
21732 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
21733 "Unsupported opcode.");
21734 SDLoc DL(N);
21735 EVT VT = N->getValueType(0);
21736
21737 EVT LoadVT = VT;
21738 if (VT.isFloatingPoint())
21739 LoadVT = VT.changeTypeToInteger();
21740
21741 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
21742 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
21743 SDValue LoadChain = SDValue(Load.getNode(), 1);
21744
21745 if (VT.isFloatingPoint())
21746 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
21747
21748 return DAG.getMergeValues({Load, LoadChain}, DL);
21749}
21750
21751 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
21752 SDLoc DL(N);
21753 SDValue Data = N->getOperand(2);
21754 EVT DataVT = Data.getValueType();
21755 EVT HwSrcVt = getSVEContainerType(DataVT);
21756 SDValue InputVT = DAG.getValueType(DataVT);
21757
21758 if (DataVT.isFloatingPoint())
21759 InputVT = DAG.getValueType(HwSrcVt);
21760
21761 SDValue SrcNew;
21762 if (Data.getValueType().isFloatingPoint())
21763 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
21764 else
21765 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
21766
21767 SDValue Ops[] = { N->getOperand(0), // Chain
21768 SrcNew,
21769 N->getOperand(4), // Base
21770 N->getOperand(3), // Pg
21771 InputVT
21772 };
21773
21774 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
21775}
21776
21777 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
21778 SDLoc DL(N);
21779
21780 SDValue Data = N->getOperand(2);
21781 EVT DataVT = Data.getValueType();
21782 EVT PtrTy = N->getOperand(4).getValueType();
21783
21784 if (DataVT.isFloatingPoint())
21786
21787 auto *MINode = cast<MemIntrinsicSDNode>(N);
21788 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
21789 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
21790 MINode->getMemoryVT(), MINode->getMemOperand(),
21791 ISD::UNINDEXED, false, false);
21792}
21793
21794 /// Replace a store of a splat of zeros with scalar stores of WZR/XZR. The
21795/// load store optimizer pass will merge them to store pair stores. This should
21796/// be better than a movi to create the vector zero followed by a vector store
21797 /// if the zero constant is not re-used, since one instruction and one register
21798/// live range will be removed.
21799///
21800/// For example, the final generated code should be:
21801///
21802/// stp xzr, xzr, [x0]
21803///
21804/// instead of:
21805///
21806/// movi v0.2d, #0
21807/// str q0, [x0]
21808///
21810 SDValue StVal = St.getValue();
21811 EVT VT = StVal.getValueType();
21812
21813 // Avoid scalarizing zero splat stores for scalable vectors.
21814 if (VT.isScalableVector())
21815 return SDValue();
21816
21817 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
21818 // 2, 3 or 4 i32 elements.
21819 int NumVecElts = VT.getVectorNumElements();
21820 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
21821 VT.getVectorElementType().getSizeInBits() == 64) ||
21822 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
21823 VT.getVectorElementType().getSizeInBits() == 32)))
21824 return SDValue();
21825
21826 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
21827 return SDValue();
21828
21829 // If the zero constant has more than one use then the vector store could be
21830 // better since the constant mov will be amortized and stp q instructions
21831 // should be able to be formed.
21832 if (!StVal.hasOneUse())
21833 return SDValue();
21834
21835 // If the store is truncating then it's going down to i16 or smaller, which
21836 // means it can be implemented in a single store anyway.
21837 if (St.isTruncatingStore())
21838 return SDValue();
21839
21840 // If the immediate offset of the address operand is too large for the stp
21841 // instruction, then bail out.
21842 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
21843 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
21844 if (Offset < -512 || Offset > 504)
21845 return SDValue();
21846 }
21847
21848 for (int I = 0; I < NumVecElts; ++I) {
21849 SDValue EltVal = StVal.getOperand(I);
21850 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
21851 return SDValue();
21852 }
21853
21854 // Use a CopyFromReg WZR/XZR here to prevent
21855 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
21856 SDLoc DL(&St);
21857 unsigned ZeroReg;
21858 EVT ZeroVT;
21859 if (VT.getVectorElementType().getSizeInBits() == 32) {
21860 ZeroReg = AArch64::WZR;
21861 ZeroVT = MVT::i32;
21862 } else {
21863 ZeroReg = AArch64::XZR;
21864 ZeroVT = MVT::i64;
21865 }
21866 SDValue SplatVal =
21867 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
21868 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21869}
21870
21871 /// Replace a store of a splat of a scalar with scalar stores of the scalar
21872/// value. The load store optimizer pass will merge them to store pair stores.
21873/// This has better performance than a splat of the scalar followed by a split
21874/// vector store. Even if the stores are not merged it is four stores vs a dup,
21875/// followed by an ext.b and two stores.
21876 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
21877 SDValue StVal = St.getValue();
21878 EVT VT = StVal.getValueType();
21879
21880 // Don't replace floating point stores, they possibly won't be transformed to
21881 // stp because of the store pair suppress pass.
21882 if (VT.isFloatingPoint())
21883 return SDValue();
21884
21885 // We can express a splat as store pair(s) for 2 or 4 elements.
21886 unsigned NumVecElts = VT.getVectorNumElements();
21887 if (NumVecElts != 4 && NumVecElts != 2)
21888 return SDValue();
21889
21890 // If the store is truncating then it's going down to i16 or smaller, which
21891 // means it can be implemented in a single store anyway.
21892 if (St.isTruncatingStore())
21893 return SDValue();
21894
21895 // Check that this is a splat.
21896 // Make sure that each of the relevant vector element locations is inserted
21897 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
21898 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
21899 SDValue SplatVal;
21900 for (unsigned I = 0; I < NumVecElts; ++I) {
21901 // Check for insert vector elements.
21902 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
21903 return SDValue();
21904
21905 // Check that same value is inserted at each vector element.
21906 if (I == 0)
21907 SplatVal = StVal.getOperand(1);
21908 else if (StVal.getOperand(1) != SplatVal)
21909 return SDValue();
21910
21911 // Check insert element index.
21912 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
21913 if (!CIndex)
21914 return SDValue();
21915 uint64_t IndexVal = CIndex->getZExtValue();
21916 if (IndexVal >= NumVecElts)
21917 return SDValue();
21918 IndexNotInserted.reset(IndexVal);
21919
21920 StVal = StVal.getOperand(0);
21921 }
21922 // Check that all vector element locations were inserted to.
21923 if (IndexNotInserted.any())
21924 return SDValue();
21925
21926 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
21927}
21928
21929 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
21930 SelectionDAG &DAG,
21931 const AArch64Subtarget *Subtarget) {
21932
21933 StoreSDNode *S = cast<StoreSDNode>(N);
21934 if (S->isVolatile() || S->isIndexed())
21935 return SDValue();
21936
21937 SDValue StVal = S->getValue();
21938 EVT VT = StVal.getValueType();
21939
21940 if (!VT.isFixedLengthVector())
21941 return SDValue();
21942
21943 // If we get a splat of zeros, convert this vector store to a store of
21944 // scalars. They will be merged into store pairs of xzr thereby removing one
21945 // instruction and one register.
21946 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
21947 return ReplacedZeroSplat;
21948
21949 // FIXME: The logic for deciding if an unaligned store should be split should
21950 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
21951 // a call to that function here.
21952
21953 if (!Subtarget->isMisaligned128StoreSlow())
21954 return SDValue();
21955
21956 // Don't split at -Oz.
21958 return SDValue();
21959
21960 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
21961 // those up regresses performance on micro-benchmarks and olden/bh.
21962 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
21963 return SDValue();
21964
21965 // Split unaligned 16B stores. They are terrible for performance.
21966 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
21967 // extensions can use this to mark that it does not want splitting to happen
21968 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
21969 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
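// Illustrative sketch: an unaligned 128-bit store of a v4i32 value is split
// below into two 64-bit stores of its low and high halves at [ptr] and
// [ptr, #8], avoiding a slow misaligned quad-word store on affected cores.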
21970 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
21971 S->getAlign() <= Align(2))
21972 return SDValue();
21973
21974 // If we get a splat of a scalar convert this vector store to a store of
21975 // scalars. They will be merged into store pairs thereby removing two
21976 // instructions.
21977 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
21978 return ReplacedSplat;
21979
21980 SDLoc DL(S);
21981
21982 // Split VT into two.
21983 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21984 unsigned NumElts = HalfVT.getVectorNumElements();
21985 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21986 DAG.getConstant(0, DL, MVT::i64));
21987 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
21988 DAG.getConstant(NumElts, DL, MVT::i64));
21989 SDValue BasePtr = S->getBasePtr();
21990 SDValue NewST1 =
21991 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
21992 S->getAlign(), S->getMemOperand()->getFlags());
21993 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
21994 DAG.getConstant(8, DL, MVT::i64));
21995 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
21996 S->getPointerInfo(), S->getAlign(),
21997 S->getMemOperand()->getFlags());
21998}
21999
22001 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexepected Opcode!");
22002
22003 // splice(pg, op1, undef) -> op1
22004 if (N->getOperand(2).isUndef())
22005 return N->getOperand(1);
22006
22007 return SDValue();
22008}
22009
22010 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
22011 const AArch64Subtarget *Subtarget) {
22012 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
22013 N->getOpcode() == AArch64ISD::UUNPKLO) &&
22014 "Unexpected Opcode!");
22015
22016 // uunpklo/hi undef -> undef
22017 if (N->getOperand(0).isUndef())
22018 return DAG.getUNDEF(N->getValueType(0));
22019
22020 // If this is a masked load followed by an UUNPKLO, fold this into a masked
22021 // extending load. We can do this even if this is already a masked
22022 // {z,}extload.
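// Sketch of the idea (element counts assumed): a single-use
//   uunpklo (masked_load pg, ptr) : nxv16i8
// whose ptrue predicate pattern still fits after doubling the element size
// can instead become a masked zero-extending load that directly produces
// nxv8i16, removing the separate unpack.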
22023 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
22024 N->getOpcode() == AArch64ISD::UUNPKLO) {
22025 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
22026 SDValue Mask = MLD->getMask();
22027 SDLoc DL(N);
22028
22029 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
22030 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22031 (MLD->getPassThru()->isUndef() ||
22032 isZerosVector(MLD->getPassThru().getNode()))) {
22033 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22034 unsigned PgPattern = Mask->getConstantOperandVal(0);
22035 EVT VT = N->getValueType(0);
22036
22037 // Ensure we can double the size of the predicate pattern
22038 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22039 if (NumElts &&
22040 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
22041 Mask =
22042 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
22043 SDValue PassThru = DAG.getConstant(0, DL, VT);
22044 SDValue NewLoad = DAG.getMaskedLoad(
22045 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
22046 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
22048
22049 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
22050
22051 return NewLoad;
22052 }
22053 }
22054 }
22055
22056 return SDValue();
22057}
22058
22059 static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N) {
22060 if (N->getOpcode() != AArch64ISD::UZP1)
22061 return false;
22062 SDValue Op0 = N->getOperand(0);
22063 EVT SrcVT = Op0->getValueType(0);
22064 EVT DstVT = N->getValueType(0);
22065 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
22066 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
22067 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
22068}
22069
22070// Try to combine rounding shifts where the operands come from an extend, and
22071// the result is truncated and combined into one vector.
22072// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
22073 static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
22074 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
22075 SDValue Op0 = N->getOperand(0);
22076 SDValue Op1 = N->getOperand(1);
22077 EVT ResVT = N->getValueType(0);
22078
22079 unsigned RshOpc = Op0.getOpcode();
22080 if (RshOpc != AArch64ISD::RSHRNB_I)
22081 return SDValue();
22082
22083 // Same op code and imm value?
22084 SDValue ShiftValue = Op0.getOperand(1);
22085 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
22086 return SDValue();
22087
22088 // Same unextended operand value?
22089 SDValue Lo = Op0.getOperand(0);
22090 SDValue Hi = Op1.getOperand(0);
22091 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
22092 Hi.getOpcode() != AArch64ISD::UUNPKHI)
22093 return SDValue();
22094 SDValue OrigArg = Lo.getOperand(0);
22095 if (OrigArg != Hi.getOperand(0))
22096 return SDValue();
22097
22098 SDLoc DL(N);
22099 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
22100 getPredicateForVector(DAG, DL, ResVT), OrigArg,
22101 ShiftValue);
22102}
22103
22104// Try to simplify:
22105// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
22106// t2 = nxv8i16 srl(t1, ShiftValue)
22107// to
22108// t1 = nxv8i16 rshrnb(X, shiftvalue).
22109// rshrnb will zero the top half bits of each element. Therefore, this combine
22110// should only be performed when a following instruction with the rshrnb
22111// as an operand does not care about the top half of each element. For example,
22112// a uzp1 or a truncating store.
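// Illustrative SVE example (ShiftValue assumed to be 6):
//   t1 = add nxv8i16 X, splat(1 << 5)
//   t2 = srl t1, splat(6)
// becomes rshrnb(X, 6); this is only valid when the consumer (e.g. a uzp1 or
// a truncating store) ignores the top half of every element.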
22113 static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
22114 const AArch64Subtarget *Subtarget) {
22115 EVT VT = Srl->getValueType(0);
22116 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
22117 return SDValue();
22118
22119 EVT ResVT;
22120 if (VT == MVT::nxv8i16)
22121 ResVT = MVT::nxv16i8;
22122 else if (VT == MVT::nxv4i32)
22123 ResVT = MVT::nxv8i16;
22124 else if (VT == MVT::nxv2i64)
22125 ResVT = MVT::nxv4i32;
22126 else
22127 return SDValue();
22128
22129 SDLoc DL(Srl);
22130 unsigned ShiftValue;
22131 SDValue RShOperand;
22132 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
22133 return SDValue();
22134 SDValue Rshrnb = DAG.getNode(
22135 AArch64ISD::RSHRNB_I, DL, ResVT,
22136 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
22137 return DAG.getNode(ISD::BITCAST, DL, VT, Rshrnb);
22138}
22139
22140 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
22141 const AArch64Subtarget *Subtarget) {
22142 SDLoc DL(N);
22143 SDValue Op0 = N->getOperand(0);
22144 SDValue Op1 = N->getOperand(1);
22145 EVT ResVT = N->getValueType(0);
22146
22147 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
22148 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22149 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22150 Op0.getOperand(0) == Op1.getOperand(0)) {
22151
22152 SDValue SourceVec = Op0.getOperand(0);
22153 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
22154 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
22155 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
22156 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
22157 EVT OpVT = Op0.getOperand(1).getValueType();
22158 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
22159 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
22160 DAG.getUNDEF(WidenedResVT));
22161 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
22162 DAG.getConstant(0, DL, OpVT));
22163 }
22164 }
22165
22166 // Following optimizations only work with uzp1.
22167 if (N->getOpcode() == AArch64ISD::UZP2)
22168 return SDValue();
22169
22170 // uzp1(x, undef) -> concat(truncate(x), undef)
22171 if (Op1.getOpcode() == ISD::UNDEF) {
22172 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
22173 switch (ResVT.getSimpleVT().SimpleTy) {
22174 default:
22175 break;
22176 case MVT::v16i8:
22177 BCVT = MVT::v8i16;
22178 HalfVT = MVT::v8i8;
22179 break;
22180 case MVT::v8i16:
22181 BCVT = MVT::v4i32;
22182 HalfVT = MVT::v4i16;
22183 break;
22184 case MVT::v4i32:
22185 BCVT = MVT::v2i64;
22186 HalfVT = MVT::v2i32;
22187 break;
22188 }
22189 if (BCVT != MVT::Other) {
22190 SDValue BC = DAG.getBitcast(BCVT, Op0);
22191 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
22192 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
22193 DAG.getUNDEF(HalfVT));
22194 }
22195 }
22196
22197 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
22198 return Urshr;
22199
22200 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op0, DAG, Subtarget))
22201 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
22202
22203 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Op1, DAG, Subtarget))
22204 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
22205
22206 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
22207 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
22208 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
22209 SDValue X = Op0.getOperand(0).getOperand(0);
22210 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
22211 }
22212 }
22213
22214 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
22215 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
22216 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
22217 SDValue Z = Op1.getOperand(0).getOperand(1);
22218 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
22219 }
22220 }
22221
22222 // These optimizations only work on little endian.
22223 if (!DAG.getDataLayout().isLittleEndian())
22224 return SDValue();
22225
22226 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
22227 // Example:
22228 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
22229 // to
22230 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
22232 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
22233 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
22234 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
22235 Op1.getOperand(0));
22236 }
22237 }
22238
22239 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
22240 return SDValue();
22241
22242 SDValue SourceOp0 = peekThroughBitcasts(Op0);
22243 SDValue SourceOp1 = peekThroughBitcasts(Op1);
22244
22245 // truncating uzp1(x, y) -> xtn(concat (x, y))
22246 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
22247 EVT Op0Ty = SourceOp0.getValueType();
22248 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
22249 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
22250 SDValue Concat =
22253 SourceOp0, SourceOp1);
22254 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
22255 }
22256 }
22257
22258 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
22259 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
22260 SourceOp1.getOpcode() != ISD::TRUNCATE)
22261 return SDValue();
22262 SourceOp0 = SourceOp0.getOperand(0);
22263 SourceOp1 = SourceOp1.getOperand(0);
22264
22265 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
22266 !SourceOp0.getValueType().isSimple())
22267 return SDValue();
22268
22269 EVT ResultTy;
22270
22271 switch (SourceOp0.getSimpleValueType().SimpleTy) {
22272 case MVT::v2i64:
22273 ResultTy = MVT::v4i32;
22274 break;
22275 case MVT::v4i32:
22276 ResultTy = MVT::v8i16;
22277 break;
22278 case MVT::v8i16:
22279 ResultTy = MVT::v16i8;
22280 break;
22281 default:
22282 return SDValue();
22283 }
22284
22285 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
22286 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
22287 SDValue UzpResult =
22288 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
22289
22290 EVT BitcastResultTy;
22291
22292 switch (ResVT.getSimpleVT().SimpleTy) {
22293 case MVT::v2i32:
22294 BitcastResultTy = MVT::v2i64;
22295 break;
22296 case MVT::v4i16:
22297 BitcastResultTy = MVT::v4i32;
22298 break;
22299 case MVT::v8i8:
22300 BitcastResultTy = MVT::v8i16;
22301 break;
22302 default:
22303 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
22304 }
22305
22306 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
22307 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
22308}
22309
22310 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
22311 unsigned Opc = N->getOpcode();
22312
22313 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
22314 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
22315 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
22316 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
22317 "Invalid opcode.");
22318
22319 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
22320 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22321 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
22322 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
22323 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
22324 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
22325 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
22326 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
22327
22328 SDLoc DL(N);
22329 SDValue Chain = N->getOperand(0);
22330 SDValue Pg = N->getOperand(1);
22331 SDValue Base = N->getOperand(2);
22332 SDValue Offset = N->getOperand(3);
22333 SDValue Ty = N->getOperand(4);
22334
22335 EVT ResVT = N->getValueType(0);
22336
22337 const auto OffsetOpc = Offset.getOpcode();
22338 const bool OffsetIsZExt =
22340 const bool OffsetIsSExt =
22342
22343 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
22344 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
22345 SDValue ExtPg = Offset.getOperand(0);
22346 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
22347 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
22348
22349 // If the predicate for the sign- or zero-extended offset is the
22350 // same as the predicate used for this load and the sign-/zero-extension
22351 // was from 32 bits...
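// Sketch: when the vector offset is a sign/zero-extend-in-reg from i32 under
// the same predicate as the gather itself, the extension can be dropped by
// switching to the corresponding SXTW/UXTW gather form, which performs the
// widening as part of its addressing mode.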
22352 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
22353 SDValue UnextendedOffset = Offset.getOperand(1);
22354
22355 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
22356 if (Signed)
22357 NewOpc = getSignExtendedGatherOpcode(NewOpc);
22358
22359 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
22360 {Chain, Pg, Base, UnextendedOffset, Ty});
22361 }
22362 }
22363
22364 return SDValue();
22365}
22366
22367/// Optimize a vector shift instruction and its operand if shifted out
22368/// bits are not used.
22369 static SDValue performVectorShiftCombine(SDNode *N,
22370 const AArch64TargetLowering &TLI,
22371 TargetLowering::DAGCombinerInfo &DCI) {
22372 assert(N->getOpcode() == AArch64ISD::VASHR ||
22373 N->getOpcode() == AArch64ISD::VLSHR);
22374
22375 SDValue Op = N->getOperand(0);
22376 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
22377
22378 unsigned ShiftImm = N->getConstantOperandVal(1);
22379 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
22380
22381 // Remove sign_extend_inreg (ashr(shl(x)) based on the number of sign bits.
22382 if (N->getOpcode() == AArch64ISD::VASHR &&
22383 Op.getOpcode() == AArch64ISD::VSHL &&
22384 N->getOperand(1) == Op.getOperand(1))
22385 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
22386 return Op.getOperand(0);
22387
22388 // If the shift is exact, the shifted out bits matter.
22389 if (N->getFlags().hasExact())
22390 return SDValue();
22391
22392 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
22393 APInt DemandedMask = ~ShiftedOutBits;
22394
22395 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
22396 return SDValue(N, 0);
22397
22398 return SDValue();
22399}
22400
22401 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
22402 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
22403 // This transform works in partnership with performSetCCPunpkCombine to
22404 // remove unnecessary transfer of predicates into standard registers and back
22405 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
22406 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
22407 MVT::i1) {
22408 SDValue CC = N->getOperand(0)->getOperand(0);
22409 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
22410 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
22411 DAG.getVectorIdxConstant(0, SDLoc(N)));
22412 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
22413 }
22414
22415 return SDValue();
22416}
22417
22418/// Target-specific DAG combine function for post-increment LD1 (lane) and
22419/// post-increment LD1R.
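/// Illustrative pattern for the lane variant (i32 element assumed):
///   %v = load i32, ptr %p
///   %r = insertelement <4 x i32> %vec, i32 %v, i64 1
///   %p.next = getelementptr i8, ptr %p, i64 4
/// can be selected as a single post-incremented "ld1 { v.s }[1], [x0], #4"
/// when the increment matches the element size.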
22420 static SDValue performPostLD1Combine(SDNode *N,
22421 TargetLowering::DAGCombinerInfo &DCI,
22422 bool IsLaneOp) {
22423 if (DCI.isBeforeLegalizeOps())
22424 return SDValue();
22425
22426 SelectionDAG &DAG = DCI.DAG;
22427 EVT VT = N->getValueType(0);
22428
22429 if (!VT.is128BitVector() && !VT.is64BitVector())
22430 return SDValue();
22431
22432 unsigned LoadIdx = IsLaneOp ? 1 : 0;
22433 SDNode *LD = N->getOperand(LoadIdx).getNode();
22434 // If it is not a LOAD, we cannot do this combine.
22435 if (LD->getOpcode() != ISD::LOAD)
22436 return SDValue();
22437
22438 // The vector lane must be a constant in the LD1LANE opcode.
22439 SDValue Lane;
22440 if (IsLaneOp) {
22441 Lane = N->getOperand(2);
22442 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
22443 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
22444 return SDValue();
22445 }
22446
22447 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
22448 EVT MemVT = LoadSDN->getMemoryVT();
22449 // Check if memory operand is the same type as the vector element.
22450 if (MemVT != VT.getVectorElementType())
22451 return SDValue();
22452
22453 // Check if there are other uses. If so, do not combine as it will introduce
22454 // an extra load.
22455 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
22456 ++UI) {
22457 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
22458 continue;
22459 if (*UI != N)
22460 return SDValue();
22461 }
22462
22463 // If there is one use and it can splat the value, prefer that operation.
22464 // TODO: This could be expanded to more operations if they reliably use the
22465 // index variants.
22466 if (N->hasOneUse()) {
22467 unsigned UseOpc = N->use_begin()->getOpcode();
22468 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
22469 return SDValue();
22470 }
22471
22472 SDValue Addr = LD->getOperand(1);
22473 SDValue Vector = N->getOperand(0);
22474 // Search for a use of the address operand that is an increment.
22475 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
22476 Addr.getNode()->use_end(); UI != UE; ++UI) {
22477 SDNode *User = *UI;
22478 if (User->getOpcode() != ISD::ADD
22479 || UI.getUse().getResNo() != Addr.getResNo())
22480 continue;
22481
22482 // If the increment is a constant, it must match the memory ref size.
22483 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
22484 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
22485 uint32_t IncVal = CInc->getZExtValue();
22486 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
22487 if (IncVal != NumBytes)
22488 continue;
22489 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
22490 }
22491
22492 // To avoid cycle construction make sure that neither the load nor the add
22493 // are predecessors to each other or the Vector.
22496 Visited.insert(Addr.getNode());
22497 Worklist.push_back(User);
22498 Worklist.push_back(LD);
22499 Worklist.push_back(Vector.getNode());
22500 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
22501 SDNode::hasPredecessorHelper(User, Visited, Worklist))
22502 continue;
22503
22505 Ops.push_back(LD->getOperand(0)); // Chain
22506 if (IsLaneOp) {
22507 Ops.push_back(Vector); // The vector to be inserted
22508 Ops.push_back(Lane); // The lane to be inserted in the vector
22509 }
22510 Ops.push_back(Addr);
22511 Ops.push_back(Inc);
22512
22513 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
22514 SDVTList SDTys = DAG.getVTList(Tys);
22515 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
22516 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
22517 MemVT,
22518 LoadSDN->getMemOperand());
22519
22520 // Update the uses.
22521 SDValue NewResults[] = {
22522 SDValue(LD, 0), // The result of load
22523 SDValue(UpdN.getNode(), 2) // Chain
22524 };
22525 DCI.CombineTo(LD, NewResults);
22526 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
22527 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
22528
22529 break;
22530 }
22531 return SDValue();
22532}
22533
22534/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
22535/// address translation.
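/// A minimal sketch: with top-byte-ignore, an address computed as
///   (and X, 0x00ffffffffffffff)
/// that is only used to access memory can be simplified to X, because only
/// address bits [55:0] are demanded by the hardware.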
22536 static bool performTBISimplification(SDValue Addr,
22537 TargetLowering::DAGCombinerInfo &DCI,
22538 SelectionDAG &DAG) {
22539 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
22540 KnownBits Known;
22542 !DCI.isBeforeLegalizeOps());
22543 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22544 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
22545 DCI.CommitTargetLoweringOpt(TLO);
22546 return true;
22547 }
22548 return false;
22549}
22550
22551 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
22552 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
22553 "Expected STORE dag node in input!");
22554
22555 if (auto Store = dyn_cast<StoreSDNode>(N)) {
22556 if (!Store->isTruncatingStore() || Store->isIndexed())
22557 return SDValue();
22558 SDValue Ext = Store->getValue();
22559 auto ExtOpCode = Ext.getOpcode();
22560 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
22561 ExtOpCode != ISD::ANY_EXTEND)
22562 return SDValue();
22563 SDValue Orig = Ext->getOperand(0);
22564 if (Store->getMemoryVT() != Orig.getValueType())
22565 return SDValue();
22566 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
22567 Store->getBasePtr(), Store->getMemOperand());
22568 }
22569
22570 return SDValue();
22571}
22572
22573// A custom combine to lower load <3 x i8> as the more efficient sequence
22574// below:
22575// ldrb wX, [x0, #2]
22576// ldrh wY, [x0]
22577// orr wX, wY, wX, lsl #16
22578// fmov s0, wX
22579//
22580// Note that an alternative sequence with even fewer (although usually more
22581// complex/expensive) instructions would be:
22582// ld1r.4h { v0 }, [x0], #2
22583// ld1.b { v0 }[2], [x0]
22584//
22585// Generating this sequence unfortunately results in noticeably worse codegen
22586// for code that extends the loaded v3i8, due to legalization breaking vector
22587// shuffle detection in a way that is very difficult to work around.
22588// TODO: Revisit once v3i8 legalization has been improved in general.
22589 static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
22590 EVT MemVT = LD->getMemoryVT();
22591 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
22592 LD->getOriginalAlign() >= 4)
22593 return SDValue();
22594
22595 SDLoc DL(LD);
22597 SDValue Chain = LD->getChain();
22598 SDValue BasePtr = LD->getBasePtr();
22599 MachineMemOperand *MMO = LD->getMemOperand();
22600 assert(LD->getOffset().isUndef() && "undef offset expected");
22601
22602 // Load 2 x i8, then 1 x i8.
22603 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
22604 TypeSize Offset2 = TypeSize::getFixed(2);
22605 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
22606 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
22607 MF.getMachineMemOperand(MMO, 2, 1));
22608
22609 // Extend to i32.
22610 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
22611 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
22612
22613 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
22614 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
22615 DAG.getConstant(16, DL, MVT::i32));
22616 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
22617 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
22618
22619 // Extract v3i8 again.
22620 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
22621 DAG.getConstant(0, DL, MVT::i64));
22622 SDValue TokenFactor = DAG.getNode(
22623 ISD::TokenFactor, DL, MVT::Other,
22624 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
22625 return DAG.getMergeValues({Extract, TokenFactor}, DL);
22626}
22627
22628// Perform TBI simplification if supported by the target and try to break up
22629 // non-temporal loads larger than 256 bits for odd types, so that LDNP Q-form
22630 // 256-bit load instructions can be selected.
22631 static SDValue performLOADCombine(SDNode *N,
22632 TargetLowering::DAGCombinerInfo &DCI,
22633 SelectionDAG &DAG,
22634 const AArch64Subtarget *Subtarget) {
22635 if (Subtarget->supportsAddressTopByteIgnored())
22636 performTBISimplification(N->getOperand(1), DCI, DAG);
22637
22638 LoadSDNode *LD = cast<LoadSDNode>(N);
22639 if (LD->isVolatile() || !Subtarget->isLittleEndian())
22640 return SDValue(N, 0);
22641
22642 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
22643 return Res;
22644
22645 if (!LD->isNonTemporal())
22646 return SDValue(N, 0);
22647
22648 EVT MemVT = LD->getMemoryVT();
22649 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
22650 MemVT.getSizeInBits() % 256 == 0 ||
22651 256 % MemVT.getScalarSizeInBits() != 0)
22652 return SDValue(N, 0);
22653
22654 SDLoc DL(LD);
22655 SDValue Chain = LD->getChain();
22656 SDValue BasePtr = LD->getBasePtr();
22657 SDNodeFlags Flags = LD->getFlags();
22658 SmallVector<SDValue, 4> LoadOps;
22659 SmallVector<SDValue, 4> LoadOpsChain;
22660 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
22661 // and one scalar/vector load of the remaining (less than 256) bits. This way
22662 // we can utilize 256-bit loads and reduce the number of load instructions generated.
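// Sketch with assumed types: a non-temporal load of v17i32 (544 bits) becomes
// two 256-bit v8i32 loads (selectable as LDNP) plus a v1i32 remainder load;
// the pieces are concatenated and the original type is re-extracted with
// EXTRACT_SUBVECTOR.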
22663 MVT NewVT =
22665 256 / MemVT.getVectorElementType().getSizeInBits());
22666 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
22667 // Create all 256-bit loads starting from offset 0 up to (Num256Loads - 1) * 32.
22668 for (unsigned I = 0; I < Num256Loads; I++) {
22669 unsigned PtrOffset = I * 32;
22670 SDValue NewPtr = DAG.getMemBasePlusOffset(
22671 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22672 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22673 SDValue NewLoad = DAG.getLoad(
22674 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
22675 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
22676 LoadOps.push_back(NewLoad);
22677 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
22678 }
22679
22680 // Process remaining bits of the load operation.
22681 // This is done by creating an UNDEF vector to match the size of the
22682 // 256-bit loads and inserting the remaining load to it. We extract the
22683 // original load type at the end using EXTRACT_SUBVECTOR instruction.
22684 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
22685 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
22686 MVT RemainingVT = MVT::getVectorVT(
22688 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
22689 SDValue NewPtr = DAG.getMemBasePlusOffset(
22690 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
22691 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
22692 SDValue RemainingLoad =
22693 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
22694 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
22695 LD->getMemOperand()->getFlags(), LD->getAAInfo());
22696 SDValue UndefVector = DAG.getUNDEF(NewVT);
22697 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
22698 SDValue ExtendedReminingLoad =
22699 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
22700 {UndefVector, RemainingLoad, InsertIdx});
22701 LoadOps.push_back(ExtendedReminingLoad);
22702 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
22703 EVT ConcatVT =
22705 LoadOps.size() * NewVT.getVectorNumElements());
22706 SDValue ConcatVectors =
22707 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
22708 // Extract the original vector type size.
22709 SDValue ExtractSubVector =
22710 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
22711 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
22712 SDValue TokenFactor =
22713 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
22714 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
22715}
22716
22718 EVT VecVT = Op.getValueType();
22719 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
22720 "Need boolean vector type.");
22721
22722 if (Depth > 3)
22724
22725 // We can get the base type from a vector compare or truncate.
22726 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
22727 return Op.getOperand(0).getValueType();
22728
22729 // If an operand is a bool vector, continue looking.
22731 for (SDValue Operand : Op->op_values()) {
22732 if (Operand.getValueType() != VecVT)
22733 continue;
22734
22735 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
22736 if (!BaseVT.isSimple())
22737 BaseVT = OperandVT;
22738 else if (OperandVT != BaseVT)
22740 }
22741
22742 return BaseVT;
22743}
22744
22745// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
22746// iN, we can use a trick that extracts the i^th bit from the i^th element and
22747// then performs a vector add to get a scalar bitmask. This requires that each
22748// element's bits are either all 1 or all 0.
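// Worked example (v4i32 comparison result assumed): sign-extend the i1 lanes
// to all-ones/all-zeros, AND with the mask <1, 2, 4, 8> so lane i keeps only
// bit i, then a vector add-reduction yields the 4-bit scalar bitmask.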
22749 static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
22750 SDLoc DL(N);
22751 SDValue ComparisonResult(N, 0);
22752 EVT VecVT = ComparisonResult.getValueType();
22753 assert(VecVT.isVector() && "Must be a vector type");
22754
22755 unsigned NumElts = VecVT.getVectorNumElements();
22756 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
22757 return SDValue();
22758
22759 if (VecVT.getVectorElementType() != MVT::i1 &&
22760 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
22761 return SDValue();
22762
22763 // If we can find the original types to work on instead of a vector of i1,
22764 // we can avoid extend/extract conversion instructions.
22765 if (VecVT.getVectorElementType() == MVT::i1) {
22766 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
22767 if (!VecVT.isSimple()) {
22768 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
22769 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
22770 }
22771 }
22772 VecVT = VecVT.changeVectorElementTypeToInteger();
22773
22774 // Large vectors don't map directly to this conversion, so to avoid too many
22775 // edge cases, we don't apply it here. The conversion will likely still be
22776 // applied later via multiple smaller vectors, whose results are concatenated.
22777 if (VecVT.getSizeInBits() > 128)
22778 return SDValue();
22779
22780 // Ensure that all elements' bits are either 0s or 1s.
22781 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
22782
22783 SmallVector<SDValue, 16> MaskConstants;
22785 VecVT == MVT::v16i8) {
22786 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
22787 // per entry. We split it into two halves, apply the mask, zip the halves to
22788 // create 8x 16-bit values, and then perform the vector reduce.
22789 for (unsigned Half = 0; Half < 2; ++Half) {
22790 for (unsigned MaskBit = 1; MaskBit <= 128; MaskBit *= 2) {
22791 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
22792 }
22793 }
22794 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22795 SDValue RepresentativeBits =
22796 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22797
22798 SDValue UpperRepresentativeBits =
22799 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
22800 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
22801 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
22802 RepresentativeBits, UpperRepresentativeBits);
22803 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
22804 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
22805 }
22806
22807 // All other vector sizes.
22808 unsigned MaxBitMask = 1u << (VecVT.getVectorNumElements() - 1);
22809 for (unsigned MaskBit = 1; MaskBit <= MaxBitMask; MaskBit *= 2) {
22810 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
22811 }
22812
22813 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
22814 SDValue RepresentativeBits =
22815 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
22816 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
22817 NumElts, VecVT.getVectorElementType().getSizeInBits()));
22818 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
22819}
22820
22821 static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
22822 StoreSDNode *Store) {
22823 if (!Store->isTruncatingStore())
22824 return SDValue();
22825
22826 SDLoc DL(Store);
22827 SDValue VecOp = Store->getValue();
22828 EVT VT = VecOp.getValueType();
22829 EVT MemVT = Store->getMemoryVT();
22830
22831 if (!MemVT.isVector() || !VT.isVector() ||
22832 MemVT.getVectorElementType() != MVT::i1)
22833 return SDValue();
22834
22835 // If we are storing a vector that we are currently building, let
22836 // `scalarizeVectorStore()` handle this more efficiently.
22837 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
22838 return SDValue();
22839
22840 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
22841 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
22842 if (!VectorBits)
22843 return SDValue();
22844
22845 EVT StoreVT =
22847 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
22848 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
22849 Store->getMemOperand());
22850}
22851
22852 static bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
22853 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
22854 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
22855 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
22856}
22857
22858// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
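// Sketch: the value is widened to four lanes, bitcast to bytes, and the three
// live bytes are stored individually at offsets #2, #1 and #0 of the base
// pointer (mirroring the ldrb/ldrh sequence used on the load side).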
22859 static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
22860 const AArch64Subtarget *Subtarget) {
22861 SDValue Value = ST->getValue();
22862 EVT ValueVT = Value.getValueType();
22863
22864 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
22865 Value.getOpcode() != ISD::TRUNCATE ||
22866 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
22867 return SDValue();
22868
22869 assert(ST->getOffset().isUndef() && "undef offset expected");
22870 SDLoc DL(ST);
22871 auto WideVT = EVT::getVectorVT(
22872 *DAG.getContext(),
22873 Value->getOperand(0).getValueType().getVectorElementType(), 4);
22874 SDValue UndefVector = DAG.getUNDEF(WideVT);
22875 SDValue WideTrunc = DAG.getNode(
22876 ISD::INSERT_SUBVECTOR, DL, WideVT,
22877 {UndefVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
22878 SDValue Cast = DAG.getNode(
22879 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
22880 WideTrunc);
22881
22883 SDValue Chain = ST->getChain();
22884 MachineMemOperand *MMO = ST->getMemOperand();
22885 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
22886 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22887 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
22888 TypeSize Offset2 = TypeSize::getFixed(2);
22889 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
22890 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
22891
22892 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22893 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
22894 TypeSize Offset1 = TypeSize::getFixed(1);
22895 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
22896 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
22897
22898 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
22899 DAG.getConstant(0, DL, MVT::i64));
22900 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
22901 MF.getMachineMemOperand(MMO, 0, 1));
22902 return Chain;
22903}
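// Illustrative example (not from the upstream source): for
//   store (v3i8 (trunc (v3i16 X))), ptr
// X is widened to v4i16 and bitcast to v8i8 (IdxScale == 2); the bytes at
// vector indices 4, 2 and 0 are then stored to ptr+2, ptr+1 and ptr+0 as three
// single-byte stores.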
22904
22905static SDValue performSTORECombine(SDNode *N,
22906 TargetLowering::DAGCombinerInfo &DCI,
22907 SelectionDAG &DAG,
22908 const AArch64Subtarget *Subtarget) {
22909 StoreSDNode *ST = cast<StoreSDNode>(N);
22910 SDValue Chain = ST->getChain();
22911 SDValue Value = ST->getValue();
22912 SDValue Ptr = ST->getBasePtr();
22913 EVT ValueVT = Value.getValueType();
22914
22915 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
22916 EVT EltVT = VT.getVectorElementType();
22917 return EltVT == MVT::f32 || EltVT == MVT::f64;
22918 };
22919
22920 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
22921 return Res;
22922
22923 // If this is an FP_ROUND followed by a store, fold this into a truncating
22924 // store. We can do this even if this is already a truncstore.
22925 // We purposefully don't care about legality of the nodes here as we know
22926 // they can be split down into something legal.
22927 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
22928 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
22929 Subtarget->useSVEForFixedLengthVectors() &&
22930 ValueVT.isFixedLengthVector() &&
22931 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
22932 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
22933 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
22934 ST->getMemoryVT(), ST->getMemOperand());
22935
22936 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
22937 return Split;
22938
22939 if (Subtarget->supportsAddressTopByteIgnored() &&
22940 performTBISimplification(N->getOperand(2), DCI, DAG))
22941 return SDValue(N, 0);
22942
22943 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
22944 return Store;
22945
22946 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
22947 return Store;
22948
22949 if (ST->isTruncatingStore()) {
22950 EVT StoreVT = ST->getMemoryVT();
22951 if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
22952 return SDValue();
22953 if (SDValue Rshrnb =
22954 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
22955 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
22956 StoreVT, ST->getMemOperand());
22957 }
22958 }
22959
22960 return SDValue();
22961}
22962
22963static SDValue performMSTORECombine(SDNode *N,
22964 TargetLowering::DAGCombinerInfo &DCI,
22965 SelectionDAG &DAG,
22966 const AArch64Subtarget *Subtarget) {
22967 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
22968 SDValue Value = MST->getValue();
22969 SDValue Mask = MST->getMask();
22970 SDLoc DL(N);
22971
22972 // If this is a UZP1 followed by a masked store, fold this into a masked
22973 // truncating store. We can do this even if this is already a masked
22974 // truncstore.
22975 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
22976 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
22977 Value.getValueType().isInteger()) {
22978 Value = Value.getOperand(0);
22979 if (Value.getOpcode() == ISD::BITCAST) {
22980 EVT HalfVT =
22981 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
22982 EVT InVT = Value.getOperand(0).getValueType();
22983
22984 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
22985 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
22986 unsigned PgPattern = Mask->getConstantOperandVal(0);
22987
22988 // Ensure we can double the size of the predicate pattern
22989 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
22990 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
22991 MinSVESize) {
22992 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
22993 PgPattern);
22994 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
22995 MST->getBasePtr(), MST->getOffset(), Mask,
22996 MST->getMemoryVT(), MST->getMemOperand(),
22997 MST->getAddressingMode(),
22998 /*IsTruncating=*/true);
22999 }
23000 }
23001 }
23002 }
23003
23004 if (MST->isTruncatingStore()) {
23005 EVT ValueVT = Value->getValueType(0);
23006 EVT MemVT = MST->getMemoryVT();
23007 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
23008 return SDValue();
23009 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
23010 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
23011 MST->getOffset(), MST->getMask(),
23012 MST->getMemoryVT(), MST->getMemOperand(),
23013 MST->getAddressingMode(), true);
23014 }
23015 }
23016
23017 return SDValue();
23018}
23019
23020/// \return true if part of the index was folded into the Base.
23021static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
23022 SDLoc DL, SelectionDAG &DAG) {
23023 // This function assumes a vector of i64 indices.
23024 EVT IndexVT = Index.getValueType();
23025 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
23026 return false;
23027
23028 // Simplify:
23029 // BasePtr = Ptr
23030 // Index = X + splat(Offset)
23031 // ->
23032 // BasePtr = Ptr + Offset * scale.
23033 // Index = X
23034 if (Index.getOpcode() == ISD::ADD) {
23035 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
23036 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23037 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23038 Index = Index.getOperand(0);
23039 return true;
23040 }
23041 }
23042
23043 // Simplify:
23044 // BasePtr = Ptr
23045 // Index = (X + splat(Offset)) << splat(Shift)
23046 // ->
23047 // BasePtr = Ptr + (Offset << Shift) * scale.
23048 // Index = X << splat(shift)
23049 if (Index.getOpcode() == ISD::SHL &&
23050 Index.getOperand(0).getOpcode() == ISD::ADD) {
23051 SDValue Add = Index.getOperand(0);
23052 SDValue ShiftOp = Index.getOperand(1);
23053 SDValue OffsetOp = Add.getOperand(1);
23054 if (auto Shift = DAG.getSplatValue(ShiftOp))
23055 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
23056 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
23057 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
23058 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
23059 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
23060 Add.getOperand(0), ShiftOp);
23061 return true;
23062 }
23063 }
23064
23065 return false;
23066}
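// Illustrative example (not from the upstream source): for a gather with
//   BasePtr = %p, Index = (X + splat(4)) << splat(1), Scale = 2
// the second pattern above folds the splat into the base pointer, giving
//   BasePtr = %p + (4 << 1) * 2 = %p + 16, Index = X << splat(1).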
23067
23068// Analyse the specified address returning true if a more optimal addressing
23069// mode is available. When returning true all parameters are updated to reflect
23070// their recommended values.
23071static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
23072 SDValue &BasePtr, SDValue &Index,
23073 SelectionDAG &DAG) {
23074 // Try to iteratively fold parts of the index into the base pointer to
23075 // simplify the index as much as possible.
23076 bool Changed = false;
23077 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
23078 Changed = true;
23079
23080 // Only consider element types that are pointer sized as smaller types can
23081 // be easily promoted.
23082 EVT IndexVT = Index.getValueType();
23083 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
23084 return Changed;
23085
23086 // Can indices be trivially shrunk?
23087 EVT DataVT = N->getOperand(1).getValueType();
23088 // Don't attempt to shrink the index for fixed vectors of 64-bit data, since
23089 // it will later be re-extended to 64 bits during legalization.
23090 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
23091 return Changed;
23092 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
23093 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
23094 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
23095 return true;
23096 }
23097
23098 // Match:
23099 // Index = step(const)
23100 int64_t Stride = 0;
23101 if (Index.getOpcode() == ISD::STEP_VECTOR) {
23102 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
23103 }
23104 // Match:
23105 // Index = step(const) << shift(const)
23106 else if (Index.getOpcode() == ISD::SHL &&
23107 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
23108 SDValue RHS = Index.getOperand(1);
23109 if (auto *Shift =
23110 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
23111 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
23112 Stride = Step << Shift->getZExtValue();
23113 }
23114 }
23115
23116 // Return early because no supported pattern is found.
23117 if (Stride == 0)
23118 return Changed;
23119
23120 if (Stride < std::numeric_limits<int32_t>::min() ||
23121 Stride > std::numeric_limits<int32_t>::max())
23122 return Changed;
23123
23124 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
23125 unsigned MaxVScale =
23126 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
23127 int64_t LastElementOffset =
23128 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
23129
23130 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
23131 LastElementOffset > std::numeric_limits<int32_t>::max())
23132 return Changed;
23133
23134 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
23135 // Stride does not scale explicitly by 'Scale', because it happens in
23136 // the gather/scatter addressing mode.
23137 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride));
23138 return true;
23139}
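// Illustrative example (not from the upstream source): an nxv2i64 index of
// step_vector(8) stays within i32 range for the subtarget's maximum vscale, so
// it is rebuilt above as an nxv2i32 step_vector(8); extending the narrower
// indices back to 64 bits is then left to the gather/scatter addressing mode.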
23140
23141static SDValue performMaskedGatherScatterCombine(
23142 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
23143 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
23144 assert(MGS && "Can only combine gather load or scatter store nodes");
23145
23146 if (!DCI.isBeforeLegalize())
23147 return SDValue();
23148
23149 SDLoc DL(MGS);
23150 SDValue Chain = MGS->getChain();
23151 SDValue Scale = MGS->getScale();
23152 SDValue Index = MGS->getIndex();
23153 SDValue Mask = MGS->getMask();
23154 SDValue BasePtr = MGS->getBasePtr();
23155 ISD::MemIndexType IndexType = MGS->getIndexType();
23156
23157 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
23158 return SDValue();
23159
23160 // A more optimal BasePtr/Index pair was found above, so rebuild the gather
23161 // or scatter here so that it uses the more legalisation-friendly Index.
23162 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
23163 SDValue PassThru = MGT->getPassThru();
23164 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
23165 return DAG.getMaskedGather(
23166 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
23167 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
23168 }
23169 auto *MSC = cast<MaskedScatterSDNode>(MGS);
23170 SDValue Data = MSC->getValue();
23171 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
23172 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
23173 Ops, MSC->getMemOperand(), IndexType,
23174 MSC->isTruncatingStore());
23175}
23176
23177/// Target-specific DAG combine function for NEON load/store intrinsics
23178/// to merge base address updates.
23179static SDValue performNEONPostLDSTCombine(SDNode *N,
23180 TargetLowering::DAGCombinerInfo &DCI,
23181 SelectionDAG &DAG) {
23182 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
23183 return SDValue();
23184
23185 unsigned AddrOpIdx = N->getNumOperands() - 1;
23186 SDValue Addr = N->getOperand(AddrOpIdx);
23187
23188 // Search for a use of the address operand that is an increment.
23189 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
23190 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
23191 SDNode *User = *UI;
23192 if (User->getOpcode() != ISD::ADD ||
23193 UI.getUse().getResNo() != Addr.getResNo())
23194 continue;
23195
23196 // Check that the add is independent of the load/store. Otherwise, folding
23197 // it would create a cycle.
23198 SmallPtrSet<const SDNode *, 32> Visited;
23199 SmallVector<const SDNode *, 16> Worklist;
23200 Visited.insert(Addr.getNode());
23201 Worklist.push_back(N);
23202 Worklist.push_back(User);
23203 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
23204 SDNode::hasPredecessorHelper(User, Visited, Worklist))
23205 continue;
23206
23207 // Find the new opcode for the updating load/store.
23208 bool IsStore = false;
23209 bool IsLaneOp = false;
23210 bool IsDupOp = false;
23211 unsigned NewOpc = 0;
23212 unsigned NumVecs = 0;
23213 unsigned IntNo = N->getConstantOperandVal(1);
23214 switch (IntNo) {
23215 default: llvm_unreachable("unexpected intrinsic for Neon base update");
23216 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
23217 NumVecs = 2; break;
23218 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
23219 NumVecs = 3; break;
23220 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
23221 NumVecs = 4; break;
23222 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
23223 NumVecs = 2; IsStore = true; break;
23224 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
23225 NumVecs = 3; IsStore = true; break;
23226 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
23227 NumVecs = 4; IsStore = true; break;
23228 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
23229 NumVecs = 2; break;
23230 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
23231 NumVecs = 3; break;
23232 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
23233 NumVecs = 4; break;
23234 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
23235 NumVecs = 2; IsStore = true; break;
23236 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
23237 NumVecs = 3; IsStore = true; break;
23238 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
23239 NumVecs = 4; IsStore = true; break;
23240 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
23241 NumVecs = 2; IsDupOp = true; break;
23242 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
23243 NumVecs = 3; IsDupOp = true; break;
23244 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
23245 NumVecs = 4; IsDupOp = true; break;
23246 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
23247 NumVecs = 2; IsLaneOp = true; break;
23248 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
23249 NumVecs = 3; IsLaneOp = true; break;
23250 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
23251 NumVecs = 4; IsLaneOp = true; break;
23252 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
23253 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
23254 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
23255 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
23256 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
23257 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
23258 }
23259
23260 EVT VecTy;
23261 if (IsStore)
23262 VecTy = N->getOperand(2).getValueType();
23263 else
23264 VecTy = N->getValueType(0);
23265
23266 // If the increment is a constant, it must match the memory ref size.
23267 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
23268 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
23269 uint32_t IncVal = CInc->getZExtValue();
23270 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
23271 if (IsLaneOp || IsDupOp)
23272 NumBytes /= VecTy.getVectorNumElements();
23273 if (IncVal != NumBytes)
23274 continue;
23275 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
23276 }
23277 SmallVector<SDValue, 8> Ops;
23278 Ops.push_back(N->getOperand(0)); // Incoming chain
23279 // Load lane and store have vector list as input.
23280 if (IsLaneOp || IsStore)
23281 for (unsigned i = 2; i < AddrOpIdx; ++i)
23282 Ops.push_back(N->getOperand(i));
23283 Ops.push_back(Addr); // Base register
23284 Ops.push_back(Inc);
23285
23286 // Return Types.
23287 EVT Tys[6];
23288 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
23289 unsigned n;
23290 for (n = 0; n < NumResultVecs; ++n)
23291 Tys[n] = VecTy;
23292 Tys[n++] = MVT::i64; // Type of write back register
23293 Tys[n] = MVT::Other; // Type of the chain
23294 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
23295
23296 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
23297 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
23298 MemInt->getMemoryVT(),
23299 MemInt->getMemOperand());
23300
23301 // Update the uses.
23302 std::vector<SDValue> NewResults;
23303 for (unsigned i = 0; i < NumResultVecs; ++i) {
23304 NewResults.push_back(SDValue(UpdN.getNode(), i));
23305 }
23306 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
23307 DCI.CombineTo(N, NewResults);
23308 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
23309
23310 break;
23311 }
23312 return SDValue();
23313}
23314
23315// Checks to see if the value is the prescribed width and returns information
23316// about its extension mode.
23317static
23318bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
23319 ExtType = ISD::NON_EXTLOAD;
23320 switch(V.getNode()->getOpcode()) {
23321 default:
23322 return false;
23323 case ISD::LOAD: {
23324 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
23325 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
23326 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
23327 ExtType = LoadNode->getExtensionType();
23328 return true;
23329 }
23330 return false;
23331 }
23332 case ISD::AssertSext: {
23333 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23334 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23335 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23336 ExtType = ISD::SEXTLOAD;
23337 return true;
23338 }
23339 return false;
23340 }
23341 case ISD::AssertZext: {
23342 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
23343 if ((TypeNode->getVT() == MVT::i8 && width == 8)
23344 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
23345 ExtType = ISD::ZEXTLOAD;
23346 return true;
23347 }
23348 return false;
23349 }
23350 case ISD::Constant:
23351 case ISD::TargetConstant: {
23352 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
23353 1LL << (width - 1);
23354 }
23355 }
23356
23357 return true;
23358}
23359
23360// This function does a whole lot of voodoo to determine if the tests are
23361// equivalent without and with a mask. Essentially what happens is that given a
23362// DAG resembling:
23363//
23364// +-------------+ +-------------+ +-------------+ +-------------+
23365// | Input | | AddConstant | | CompConstant| | CC |
23366// +-------------+ +-------------+ +-------------+ +-------------+
23367// | | | |
23368// V V | +----------+
23369// +-------------+ +----+ | |
23370// | ADD | |0xff| | |
23371// +-------------+ +----+ | |
23372// | | | |
23373// V V | |
23374// +-------------+ | |
23375// | AND | | |
23376// +-------------+ | |
23377// | | |
23378// +-----+ | |
23379// | | |
23380// V V V
23381// +-------------+
23382// | CMP |
23383// +-------------+
23384//
23385// The AND node may be safely removed for some combinations of inputs. In
23386// particular we need to take into account the extension type of the Input,
23387// the exact values of AddConstant, CompConstant, and CC, along with the nominal
23388// width of the input (this can work for inputs of any width; the above graph
23389// is specific to 8 bits).
23390//
23391// The specific equations were worked out by generating output tables for each
23392// AArch64CC value in terms of the AddConstant (w1) and CompConstant (w2). The
23393// problem was simplified by working with 4 bit inputs, which means we only
23394// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
23395// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
23396// patterns present in both extensions (0,7). For every distinct set of
23397// AddConstant and CompConstant bit patterns, we can consider the masked and
23398// unmasked versions to be equivalent if the result of this function is true for
23399// all 16 distinct bit patterns for the current extension type of Input (w0).
23400//
23401// sub w8, w0, w1
23402// and w10, w8, #0x0f
23403// cmp w8, w2
23404// cset w9, AArch64CC
23405// cmp w10, w2
23406// cset w11, AArch64CC
23407// cmp w9, w11
23408// cset w0, eq
23409// ret
23410//
23411// Since the above function shows when the outputs are equivalent it defines
23412// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
23413// would be expensive to run during compiles. The equations below were written
23414// in a test harness that confirmed they gave outputs equivalent to the above
23415// function for all inputs, so they can be used to determine if the removal is
23416// legal instead.
23417//
23418// isEquivalentMaskless() is the code for testing if the AND can be removed,
23419// factored out of the DAG recognition because the DAG can take several forms.
23420
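// Illustrative example for the check below (not from the upstream source): for
// an i8 zero-extending load X, a DAG of the form
//   (CMP (AND (ADD X, ImmA), 0xff), ImmC), cond
// may be rewritten without the AND as
//   (CMP (ADD X, ImmA), ImmC), cond
// whenever isEquivalentMaskless(cond, 8, ZEXTLOAD, ImmA, ImmC) returns true.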
23421static bool isEquivalentMaskless(unsigned CC, unsigned width,
23422 ISD::LoadExtType ExtType, int AddConstant,
23423 int CompConstant) {
23424 // By being careful about our equations and only writing them in terms of
23425 // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
23426 // make them generally applicable to all bit widths.
23427 int MaxUInt = (1 << width);
23428
23429 // For the purposes of these comparisons sign extending the type is
23430 // equivalent to zero extending the add and displacing it by half the integer
23431 // width. Provided we are careful and make sure our equations are valid over
23432 // the whole range we can just adjust the input and avoid writing equations
23433 // for sign extended inputs.
23434 if (ExtType == ISD::SEXTLOAD)
23435 AddConstant -= (1 << (width-1));
23436
23437 switch(CC) {
23438 case AArch64CC::LE:
23439 case AArch64CC::GT:
23440 if ((AddConstant == 0) ||
23441 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
23442 (AddConstant >= 0 && CompConstant < 0) ||
23443 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
23444 return true;
23445 break;
23446 case AArch64CC::LT:
23447 case AArch64CC::GE:
23448 if ((AddConstant == 0) ||
23449 (AddConstant >= 0 && CompConstant <= 0) ||
23450 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
23451 return true;
23452 break;
23453 case AArch64CC::HI:
23454 case AArch64CC::LS:
23455 if ((AddConstant >= 0 && CompConstant < 0) ||
23456 (AddConstant <= 0 && CompConstant >= -1 &&
23457 CompConstant < AddConstant + MaxUInt))
23458 return true;
23459 break;
23460 case AArch64CC::PL:
23461 case AArch64CC::MI:
23462 if ((AddConstant == 0) ||
23463 (AddConstant > 0 && CompConstant <= 0) ||
23464 (AddConstant < 0 && CompConstant <= AddConstant))
23465 return true;
23466 break;
23467 case AArch64CC::LO:
23468 case AArch64CC::HS:
23469 if ((AddConstant >= 0 && CompConstant <= 0) ||
23470 (AddConstant <= 0 && CompConstant >= 0 &&
23471 CompConstant <= AddConstant + MaxUInt))
23472 return true;
23473 break;
23474 case AArch64CC::EQ:
23475 case AArch64CC::NE:
23476 if ((AddConstant > 0 && CompConstant < 0) ||
23477 (AddConstant < 0 && CompConstant >= 0 &&
23478 CompConstant < AddConstant + MaxUInt) ||
23479 (AddConstant >= 0 && CompConstant >= 0 &&
23480 CompConstant >= AddConstant) ||
23481 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
23482 return true;
23483 break;
23484 case AArch64CC::VS:
23485 case AArch64CC::VC:
23486 case AArch64CC::AL:
23487 case AArch64CC::NV:
23488 return true;
23489 case AArch64CC::Invalid:
23490 break;
23491 }
23492
23493 return false;
23494}
23495
23496// (X & C) >u Mask --> (X & (C & ~Mask)) != 0
23497// (X & C) <u Pow2 --> (X & (C & ~(Pow2-1))) == 0
23498static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
23499 SDNode *AndNode, SelectionDAG &DAG,
23500 unsigned CCIndex, unsigned CmpIndex,
23501 unsigned CC) {
23502 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
23503 if (!SubsC)
23504 return SDValue();
23505
23506 APInt SubsAP = SubsC->getAPIntValue();
23507 if (CC == AArch64CC::HI) {
23508 if (!SubsAP.isMask())
23509 return SDValue();
23510 } else if (CC == AArch64CC::LO) {
23511 if (!SubsAP.isPowerOf2())
23512 return SDValue();
23513 } else
23514 return SDValue();
23515
23516 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
23517 if (!AndC)
23518 return SDValue();
23519
23520 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
23521
23522 SDLoc DL(N);
23523 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
23524 SDValue ANDS = DAG.getNode(
23525 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
23526 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
23527 SDValue AArch64_CC =
23528 DAG.getConstant(CC == AArch64CC::HI ? AArch64CC::NE : AArch64CC::EQ, DL,
23529 N->getOperand(CCIndex)->getValueType(0));
23530
23531 // For now, only performCSELCombine and performBRCONDCombine call this
23532 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
23533 // with 4 operands, so just initialize the ops directly to keep the code
23534 // simple. If another caller with a different CCIndex or CmpIndex is added,
23535 // this will need to be rewritten to loop over the operands.
23536 // TODO: Do we need to assert number of operand is 4 here?
23537 assert((CCIndex == 2 && CmpIndex == 3) &&
23538 "Expected CCIndex to be 2 and CmpIndex to be 3.");
23539 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
23540 ANDS.getValue(1)};
23541 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
23542}
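// Illustrative example (not from the upstream source): with C = 0xFF00 and
// Mask = 0x00FF, "(X & 0xFF00) >u 0x00FF" holds exactly when any of bits 15:8
// of X is set, so the combine above replaces the SUBS-based compare with
// "ANDS X, 0xFF00" and tests the NE flag result instead.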
23543
23544static
23545SDValue performCONDCombine(SDNode *N,
23546 TargetLowering::DAGCombinerInfo &DCI,
23547 SelectionDAG &DAG, unsigned CCIndex,
23548 unsigned CmpIndex) {
23549 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
23550 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
23551 unsigned CondOpcode = SubsNode->getOpcode();
23552
23553 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
23554 !SubsNode->hasOneUse())
23555 return SDValue();
23556
23557 // There is a SUBS feeding this condition. Is it fed by a mask we can
23558 // use?
23559
23560 SDNode *AndNode = SubsNode->getOperand(0).getNode();
23561 unsigned MaskBits = 0;
23562
23563 if (AndNode->getOpcode() != ISD::AND)
23564 return SDValue();
23565
23566 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
23567 CmpIndex, CC))
23568 return Val;
23569
23570 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
23571 uint32_t CNV = CN->getZExtValue();
23572 if (CNV == 255)
23573 MaskBits = 8;
23574 else if (CNV == 65535)
23575 MaskBits = 16;
23576 }
23577
23578 if (!MaskBits)
23579 return SDValue();
23580
23581 SDValue AddValue = AndNode->getOperand(0);
23582
23583 if (AddValue.getOpcode() != ISD::ADD)
23584 return SDValue();
23585
23586 // The basic dag structure is correct, grab the inputs and validate them.
23587
23588 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
23589 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
23590 SDValue SubsInputValue = SubsNode->getOperand(1);
23591
23592 // The mask is present and the provenance of all the values is a smaller type,
23593 // let's see if the mask is superfluous.
23594
23595 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
23596 !isa<ConstantSDNode>(SubsInputValue.getNode()))
23597 return SDValue();
23598
23599 ISD::LoadExtType ExtType;
23600
23601 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
23602 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
23603 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
23604 return SDValue();
23605
23606 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
23607 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
23608 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
23609 return SDValue();
23610
23611 // The AND is not necessary, remove it.
23612
23613 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
23614 SubsNode->getValueType(1));
23615 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
23616
23617 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
23618 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
23619
23620 return SDValue(N, 0);
23621}
23622
23623// Optimize compare with zero and branch.
23624static SDValue performBRCONDCombine(SDNode *N,
23625 TargetLowering::DAGCombinerInfo &DCI,
23626 SelectionDAG &DAG) {
23627 MachineFunction &MF = DAG.getMachineFunction();
23628 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
23629 // will not be produced, as they are conditional branch instructions that do
23630 // not set flags.
23631 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
23632 return SDValue();
23633
23634 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
23635 N = NV.getNode();
23636 SDValue Chain = N->getOperand(0);
23637 SDValue Dest = N->getOperand(1);
23638 SDValue CCVal = N->getOperand(2);
23639 SDValue Cmp = N->getOperand(3);
23640
23641 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
23642 unsigned CC = CCVal->getAsZExtVal();
23643 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
23644 return SDValue();
23645
23646 unsigned CmpOpc = Cmp.getOpcode();
23647 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
23648 return SDValue();
23649
23650 // Only attempt folding if there is only one use of the flag and no use of the
23651 // value.
23652 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
23653 return SDValue();
23654
23655 SDValue LHS = Cmp.getOperand(0);
23656 SDValue RHS = Cmp.getOperand(1);
23657
23658 assert(LHS.getValueType() == RHS.getValueType() &&
23659 "Expected the value type to be the same for both operands!");
23660 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
23661 return SDValue();
23662
23663 if (isNullConstant(LHS))
23664 std::swap(LHS, RHS);
23665
23666 if (!isNullConstant(RHS))
23667 return SDValue();
23668
23669 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
23670 LHS.getOpcode() == ISD::SRL)
23671 return SDValue();
23672
23673 // Fold the compare into the branch instruction.
23674 SDValue BR;
23675 if (CC == AArch64CC::EQ)
23676 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23677 else
23678 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
23679
23680 // Do not add new nodes to DAG combiner worklist.
23681 DCI.CombineTo(N, BR, false);
23682
23683 return SDValue();
23684}
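// Illustrative example (not from the upstream source): a conditional branch on
// EQ/NE fed by a flag-setting compare of a register against zero, e.g.
//   cmp x0, #0 ; b.eq label
// is folded by the combine above into the single compare-and-branch
//   cbz x0, label     (or cbnz for the NE case).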
23685
23686static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
23687 unsigned CC = N->getConstantOperandVal(2);
23688 SDValue SUBS = N->getOperand(3);
23689 SDValue Zero, CTTZ;
23690
23691 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
23692 Zero = N->getOperand(0);
23693 CTTZ = N->getOperand(1);
23694 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
23695 Zero = N->getOperand(1);
23696 CTTZ = N->getOperand(0);
23697 } else
23698 return SDValue();
23699
23700 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
23701 (CTTZ.getOpcode() == ISD::TRUNCATE &&
23702 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
23703 return SDValue();
23704
23705 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
23706 "Illegal type in CTTZ folding");
23707
23708 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
23709 return SDValue();
23710
23711 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
23712 ? CTTZ.getOperand(0).getOperand(0)
23713 : CTTZ.getOperand(0);
23714
23715 if (X != SUBS.getOperand(0))
23716 return SDValue();
23717
23718 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
23719 ? CTTZ.getOperand(0).getValueSizeInBits()
23720 : CTTZ.getValueSizeInBits();
23721 SDValue BitWidthMinusOne =
23722 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
23723 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
23724 BitWidthMinusOne);
23725}
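// Illustrative note (not from the upstream source): for an i32 value,
// ISD::CTTZ returns 32 when X == 0, and 32 & 31 == 0, so
// "X == 0 ? 0 : cttz(X)" is exactly "cttz(X) & 31"; the CSEL is therefore
// replaced above by a single AND with bitwidth-1.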
23726
23727// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
23728// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
23729// Where x and y are constants and x != y
23730
23731// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
23732// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
23733// Where x and y are constants and x != y
23734static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG) {
23735 SDValue L = Op->getOperand(0);
23736 SDValue R = Op->getOperand(1);
23737 AArch64CC::CondCode OpCC =
23738 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
23739
23740 SDValue OpCmp = Op->getOperand(3);
23741 if (!isCMP(OpCmp))
23742 return SDValue();
23743
23744 SDValue CmpLHS = OpCmp.getOperand(0);
23745 SDValue CmpRHS = OpCmp.getOperand(1);
23746
23747 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
23748 std::swap(CmpLHS, CmpRHS);
23749 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
23750 return SDValue();
23751
23752 SDValue X = CmpLHS->getOperand(0);
23753 SDValue Y = CmpLHS->getOperand(1);
23754 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
23755 return SDValue();
23756 }
23757
23758 // If one of the constants is an opaque constant, the x and y SDNodes can
23759 // still be distinct even though the real values are the same, so compare the
23760 // APInt values here to make sure the fold is correct.
23761 ConstantSDNode *CX = cast<ConstantSDNode>(X);
23762 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
23763 if (CX->getAPIntValue() == CY->getAPIntValue())
23764 return SDValue();
23765
23766 AArch64CC::CondCode CC =
23767 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
23768 SDValue Cond = CmpLHS->getOperand(3);
23769
23770 if (CmpRHS == Y)
23771 CC = AArch64CC::getInvertedCondCode(CC);
23772 else if (CmpRHS != X)
23773 return SDValue();
23774
23775 if (OpCC == AArch64CC::NE)
23776 CC = AArch64CC::getInvertedCondCode(CC);
23777 else if (OpCC != AArch64CC::EQ)
23778 return SDValue();
23779
23780 SDLoc DL(Op);
23781 EVT VT = Op->getValueType(0);
23782
23783 SDValue CCValue = DAG.getConstant(CC, DL, MVT::i32);
23784 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
23785}
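// Illustrative example (not from the upstream source): with x = 1 and y = 0,
//   (CSEL l, r, EQ, (CMP (CSEL 1, 0, GT, cond), 1))
// compares the inner CSEL against x, so the outer select fires exactly when GT
// holds on cond and the whole expression folds to (CSEL l, r, GT, cond).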
23786
23787// Optimize CSEL instructions
23788static SDValue performCSELCombine(SDNode *N,
23789 TargetLowering::DAGCombinerInfo &DCI,
23790 SelectionDAG &DAG) {
23791 // CSEL x, x, cc -> x
23792 if (N->getOperand(0) == N->getOperand(1))
23793 return N->getOperand(0);
23794
23795 if (SDValue R = foldCSELOfCSEL(N, DAG))
23796 return R;
23797
23798 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
23799 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
23800 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
23801 return Folded;
23802
23803 return performCONDCombine(N, DCI, DAG, 2, 3);
23804}
23805
23806// Try to re-use an already extended operand of a vector SetCC feeding an
23807// extended select. Doing so avoids requiring another full extension of the
23808// SET_CC result when lowering the select.
23809static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
23810 EVT Op0MVT = Op->getOperand(0).getValueType();
23811 if (!Op0MVT.isVector() || Op->use_empty())
23812 return SDValue();
23813
23814 // Make sure that all uses of Op are VSELECTs with result matching types where
23815 // the result type has a larger element type than the SetCC operand.
23816 SDNode *FirstUse = *Op->use_begin();
23817 if (FirstUse->getOpcode() != ISD::VSELECT)
23818 return SDValue();
23819 EVT UseMVT = FirstUse->getValueType(0);
23820 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
23821 return SDValue();
23822 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
23823 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
23824 }))
23825 return SDValue();
23826
23827 APInt V;
23828 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
23829 return SDValue();
23830
23831 SDLoc DL(Op);
23832 SDValue Op0ExtV;
23833 SDValue Op1ExtV;
23834 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
23835 // Check if the first operand of the SET_CC is already extended. If it is,
23836 // split the SET_CC and re-use the extended version of the operand.
23837 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
23838 Op->getOperand(0));
23839 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
23840 Op->getOperand(0));
23841 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23842 Op0ExtV = SDValue(Op0SExt, 0);
23843 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
23844 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
23845 Op0ExtV = SDValue(Op0ZExt, 0);
23846 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
23847 } else
23848 return SDValue();
23849
23850 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
23851 Op0ExtV, Op1ExtV, Op->getOperand(2));
23852}
23853
23854static SDValue
23855performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
23856 SelectionDAG &DAG) {
23857 SDValue Vec = N->getOperand(0);
23858 if (DCI.isBeforeLegalize() &&
23859 Vec.getValueType().getVectorElementType() == MVT::i1 &&
23860 Vec.getValueType().isFixedLengthVector() &&
23861 Vec.getValueType().isPow2VectorType()) {
23862 SDLoc DL(N);
23863 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
23864 DAG);
23865 }
23866
23867 return SDValue();
23868}
23869
23870static SDValue performSETCCCombine(SDNode *N,
23871 TargetLowering::DAGCombinerInfo &DCI,
23872 SelectionDAG &DAG) {
23873 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
23874 SDValue LHS = N->getOperand(0);
23875 SDValue RHS = N->getOperand(1);
23876 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
23877 SDLoc DL(N);
23878 EVT VT = N->getValueType(0);
23879
23880 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
23881 return V;
23882
23883 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
23884 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
23885 LHS->getOpcode() == AArch64ISD::CSEL &&
23886 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
23887 LHS->hasOneUse()) {
23888 // Invert CSEL's condition.
23889 auto OldCond =
23890 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
23891 auto NewCond = getInvertedCondCode(OldCond);
23892
23893 // csel 0, 1, !cond, X
23894 SDValue CSEL =
23895 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
23896 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
23897 LHS.getOperand(3));
23898 return DAG.getZExtOrTrunc(CSEL, DL, VT);
23899 }
23900
23901 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
23902 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
23903 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
23904 LHS->getConstantOperandVal(1) < VT.getScalarSizeInBits() &&
23905 LHS->hasOneUse()) {
23906 EVT TstVT = LHS->getValueType(0);
23907 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
23908 // This pattern will be optimized better in emitComparison.
23909 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
23910 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
23911 DAG.getConstant(TstImm, DL, TstVT));
23912 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
23913 }
23914 }
23915
23916 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
23917 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
23918 // setcc (iN (bitcast (vNi1 X))), -1, (eq|ne)
23919 // ==> setcc (iN (sext (i1 (vecreduce_and (vNi1 X))))), -1, (eq|ne)
23920 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
23921 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
23922 (isNullConstant(RHS) || isAllOnesConstant(RHS)) &&
23923 LHS->getOpcode() == ISD::BITCAST) {
23924 EVT ToVT = LHS->getValueType(0);
23925 EVT FromVT = LHS->getOperand(0).getValueType();
23926 if (FromVT.isFixedLengthVector() &&
23927 FromVT.getVectorElementType() == MVT::i1) {
23928 bool IsNull = isNullConstant(RHS);
23929 LHS = DAG.getNode(IsNull ? ISD::VECREDUCE_OR : ISD::VECREDUCE_AND,
23930 DL, MVT::i1, LHS->getOperand(0));
23931 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
23932 LHS);
23933 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
23934 }
23935 }
23936
23937 // Try to perform the memcmp when the result is tested for [in]equality with 0
23938 if (SDValue V = performOrXorChainCombine(N, DAG))
23939 return V;
23940
23941 return SDValue();
23942}
23943
23944// Replace a flag-setting operator (eg ANDS) with the generic version
23945// (eg AND) if the flag is unused.
23946static SDValue performFlagSettingCombine(SDNode *N,
23947 TargetLowering::DAGCombinerInfo &DCI,
23948 unsigned GenericOpcode) {
23949 SDLoc DL(N);
23950 SDValue LHS = N->getOperand(0);
23951 SDValue RHS = N->getOperand(1);
23952 EVT VT = N->getValueType(0);
23953
23954 // If the flag result isn't used, convert back to a generic opcode.
23955 if (!N->hasAnyUseOfValue(1)) {
23956 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
23957 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
23958 DL);
23959 }
23960
23961 // Combine identical generic nodes into this node, re-using the result.
23962 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
23963 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
23964 DCI.CombineTo(Generic, SDValue(N, 0));
23965
23966 return SDValue();
23967}
23968
23969static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
23970 // setcc_merge_zero pred
23971 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
23972 // => extract_subvector (inner setcc_merge_zero)
23973 SDValue Pred = N->getOperand(0);
23974 SDValue LHS = N->getOperand(1);
23975 SDValue RHS = N->getOperand(2);
23976 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
23977
23978 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
23979 LHS->getOpcode() != ISD::SIGN_EXTEND)
23980 return SDValue();
23981
23982 SDValue Extract = LHS->getOperand(0);
23983 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
23984 Extract->getValueType(0) != N->getValueType(0) ||
23985 Extract->getConstantOperandVal(1) != 0)
23986 return SDValue();
23987
23988 SDValue InnerSetCC = Extract->getOperand(0);
23989 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
23990 return SDValue();
23991
23992 // By this point we've effectively got
23993 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
23994 // lanes are already zero then the trunc(sext()) sequence is redundant and we
23995 // can operate on A directly.
23996 SDValue InnerPred = InnerSetCC.getOperand(0);
23997 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
23998 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
23999 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
24000 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
24001 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
24002 return Extract;
24003
24004 return SDValue();
24005}
24006
24007static SDValue
24008performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
24009 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
24010 "Unexpected opcode!");
24011
24012 SelectionDAG &DAG = DCI.DAG;
24013 SDValue Pred = N->getOperand(0);
24014 SDValue LHS = N->getOperand(1);
24015 SDValue RHS = N->getOperand(2);
24016 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
24017
24018 if (SDValue V = performSetCCPunpkCombine(N, DAG))
24019 return V;
24020
24021 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
24022 LHS->getOpcode() == ISD::SIGN_EXTEND &&
24023 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
24024 // setcc_merge_zero(
24025 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
24026 // => setcc_merge_zero(pred, ...)
24027 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
24028 LHS->getOperand(0)->getOperand(0) == Pred)
24029 return LHS->getOperand(0);
24030
24031 // setcc_merge_zero(
24032 // all_active, extend(nxvNi1 ...), != splat(0))
24033 // -> nxvNi1 ...
24034 if (isAllActivePredicate(DAG, Pred))
24035 return LHS->getOperand(0);
24036
24037 // setcc_merge_zero(
24038 // pred, extend(nxvNi1 ...), != splat(0))
24039 // -> nxvNi1 and(pred, ...)
24040 if (DCI.isAfterLegalizeDAG())
24041 // Do this after legalization to allow more folds on setcc_merge_zero
24042 // to be recognized.
24043 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
24044 LHS->getOperand(0), Pred);
24045 }
24046
24047 return SDValue();
24048}
24049
24050// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
24051// as well as whether the test should be inverted. This code is required to
24052// catch these cases (as opposed to standard dag combines) because
24053// AArch64ISD::TBZ is matched during legalization.
24054static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
24055 SelectionDAG &DAG) {
24056
24057 if (!Op->hasOneUse())
24058 return Op;
24059
24060 // We don't handle undef/constant-fold cases below, as they should have
24061 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
24062 // etc.)
24063
24064 // (tbz (trunc x), b) -> (tbz x, b)
24065 // This case is just here to enable more of the below cases to be caught.
24066 if (Op->getOpcode() == ISD::TRUNCATE &&
24067 Bit < Op->getValueType(0).getSizeInBits()) {
24068 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24069 }
24070
24071 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
24072 if (Op->getOpcode() == ISD::ANY_EXTEND &&
24073 Bit < Op->getOperand(0).getValueSizeInBits()) {
24074 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24075 }
24076
24077 if (Op->getNumOperands() != 2)
24078 return Op;
24079
24080 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
24081 if (!C)
24082 return Op;
24083
24084 switch (Op->getOpcode()) {
24085 default:
24086 return Op;
24087
24088 // (tbz (and x, m), b) -> (tbz x, b)
24089 case ISD::AND:
24090 if ((C->getZExtValue() >> Bit) & 1)
24091 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24092 return Op;
24093
24094 // (tbz (shl x, c), b) -> (tbz x, b-c)
24095 case ISD::SHL:
24096 if (C->getZExtValue() <= Bit &&
24097 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
24098 Bit = Bit - C->getZExtValue();
24099 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24100 }
24101 return Op;
24102
24103 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
24104 case ISD::SRA:
24105 Bit = Bit + C->getZExtValue();
24106 if (Bit >= Op->getValueType(0).getSizeInBits())
24107 Bit = Op->getValueType(0).getSizeInBits() - 1;
24108 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24109
24110 // (tbz (srl x, c), b) -> (tbz x, b+c)
24111 case ISD::SRL:
24112 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
24113 Bit = Bit + C->getZExtValue();
24114 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24115 }
24116 return Op;
24117
24118 // (tbz (xor x, -1), b) -> (tbnz x, b)
24119 case ISD::XOR:
24120 if ((C->getZExtValue() >> Bit) & 1)
24121 Invert = !Invert;
24122 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
24123 }
24124}
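// Illustrative example (not from the upstream source): testing bit 3 of
// (shl x, 2) is the same as testing bit 1 of x, so (tbz (shl x, 2), 3) becomes
// (tbz x, 1); an intervening (xor x, -1) only toggles 'Invert', turning the
// final TBZ into a TBNZ.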
24125
24126// Optimize test single bit zero/non-zero and branch.
24127static SDValue performTBZCombine(SDNode *N,
24128 TargetLowering::DAGCombinerInfo &DCI,
24129 SelectionDAG &DAG) {
24130 unsigned Bit = N->getConstantOperandVal(2);
24131 bool Invert = false;
24132 SDValue TestSrc = N->getOperand(1);
24133 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
24134
24135 if (TestSrc == NewTestSrc)
24136 return SDValue();
24137
24138 unsigned NewOpc = N->getOpcode();
24139 if (Invert) {
24140 if (NewOpc == AArch64ISD::TBZ)
24141 NewOpc = AArch64ISD::TBNZ;
24142 else {
24143 assert(NewOpc == AArch64ISD::TBNZ);
24144 NewOpc = AArch64ISD::TBZ;
24145 }
24146 }
24147
24148 SDLoc DL(N);
24149 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
24150 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
24151}
24152
24153// Swap vselect operands where it may allow a predicated operation to achieve
24154// the `sel`.
24155//
24156// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
24157// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
24158static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
24159 auto SelectA = N->getOperand(1);
24160 auto SelectB = N->getOperand(2);
24161 auto NTy = N->getValueType(0);
24162
24163 if (!NTy.isScalableVector())
24164 return SDValue();
24165 SDValue SetCC = N->getOperand(0);
24166 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
24167 return SDValue();
24168
24169 switch (SelectB.getOpcode()) {
24170 default:
24171 return SDValue();
24172 case ISD::FMUL:
24173 case ISD::FSUB:
24174 case ISD::FADD:
24175 break;
24176 }
24177 if (SelectA != SelectB.getOperand(0))
24178 return SDValue();
24179
24180 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
24181 ISD::CondCode InverseCC =
24182 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
24183 auto InverseSetCC =
24184 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
24185 SetCC.getOperand(1), InverseCC);
24186
24187 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
24188 {InverseSetCC, SelectB, SelectA});
24189}
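// Note on the motivation (an assumption, not stated in the upstream source):
// after the swap, (op a, b) sits in the "true" position and a in the "false"
// position, which matches SVE's merging predicated forms (e.g.
// "fadd z0.d, p0/m, z0.d, z1.d" keeps the inactive lanes of a), so no separate
// sel instruction is needed.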
24190
24191// vselect (v1i1 setcc) ->
24192// vselect (v1iXX setcc) (XX is the size of the compared operand type)
24193// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
24194// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
24195// such VSELECT.
24196static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
24197 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
24198 return SwapResult;
24199
24200 SDValue N0 = N->getOperand(0);
24201 EVT CCVT = N0.getValueType();
24202
24203 if (isAllActivePredicate(DAG, N0))
24204 return N->getOperand(1);
24205
24206 if (isAllInactivePredicate(N0))
24207 return N->getOperand(2);
24208
24209 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
24210 // into (OR (ASR lhs, N-1), 1), which requires less instructions for the
24211 // supported types.
24212 SDValue SetCC = N->getOperand(0);
24213 if (SetCC.getOpcode() == ISD::SETCC &&
24214 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
24215 SDValue CmpLHS = SetCC.getOperand(0);
24216 EVT VT = CmpLHS.getValueType();
24217 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
24218 SDNode *SplatLHS = N->getOperand(1).getNode();
24219 SDNode *SplatRHS = N->getOperand(2).getNode();
24220 APInt SplatLHSVal;
24221 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
24222 VT.isSimple() &&
24223 is_contained(ArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
24224 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
24225 VT.getSimpleVT().SimpleTy) &&
24226 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
24227 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
24228 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
24229 unsigned NumElts = VT.getVectorNumElements();
24230 SmallVector<SDValue> Ops(
24231 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
24232 VT.getScalarType()));
24233 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
24234
24235 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
24236 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
24237 return Or;
24238 }
24239 }
24240
24241 EVT CmpVT = N0.getOperand(0).getValueType();
24242 if (N0.getOpcode() != ISD::SETCC ||
24243 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
24244 CCVT.getVectorElementType() != MVT::i1 ||
24245 CmpVT.getVectorElementType().isFloatingPoint())
24246 return SDValue();
24247
24248 EVT ResVT = N->getValueType(0);
24249 // Only combine when the result type is of the same size as the compared
24250 // operands.
24251 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
24252 return SDValue();
24253
24254 SDValue IfTrue = N->getOperand(1);
24255 SDValue IfFalse = N->getOperand(2);
24256 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
24257 N0.getOperand(0), N0.getOperand(1),
24258 cast<CondCodeSDNode>(N0.getOperand(2))->get());
24259 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
24260 IfTrue, IfFalse);
24261}
24262
24263/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
24264/// the compare-mask instructions rather than going via NZCV, even if LHS and
24265/// RHS are really scalar. This replaces any scalar setcc in the above pattern
24266/// with a vector one followed by a DUP shuffle on the result.
24267static SDValue performSelectCombine(SDNode *N,
24268 TargetLowering::DAGCombinerInfo &DCI) {
24269 SelectionDAG &DAG = DCI.DAG;
24270 SDValue N0 = N->getOperand(0);
24271 EVT ResVT = N->getValueType(0);
24272
24273 if (N0.getOpcode() != ISD::SETCC)
24274 return SDValue();
24275
24276 if (ResVT.isScalableVT())
24277 return SDValue();
24278
24279 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
24280 // scalar SetCCResultType. We also don't expect vectors, because we assume
24281 // that selects fed by vector SETCCs are canonicalized to VSELECT.
24282 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
24283 "Scalar-SETCC feeding SELECT has unexpected result type!");
24284
24285 // If NumMaskElts == 0, the comparison is larger than select result. The
24286 // largest real NEON comparison is 64-bits per lane, which means the result is
24287 // at most 32-bits and an illegal vector. Just bail out for now.
24288 EVT SrcVT = N0.getOperand(0).getValueType();
24289
24290 // Don't try to do this optimization when the setcc itself has i1 operands.
24291 // There are no legal vectors of i1, so this would be pointless. v1f16 is
24292 // ruled out to prevent the creation of setcc that need to be scalarized.
24293 if (SrcVT == MVT::i1 ||
24294 (SrcVT.isFloatingPoint() && SrcVT.getSizeInBits() <= 16))
24295 return SDValue();
24296
24297 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
24298 if (!ResVT.isVector() || NumMaskElts == 0)
24299 return SDValue();
24300
24301 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
24302 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
24303
24304 // Also bail out if the vector CCVT isn't the same size as ResVT.
24305 // This can happen if the SETCC operand size doesn't divide the ResVT size
24306 // (e.g., f64 vs v3f32).
24307 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
24308 return SDValue();
24309
24310 // Make sure we didn't create illegal types, if we're not supposed to.
24311 assert(DCI.isBeforeLegalize() ||
24312 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
24313
24314 // First perform a vector comparison, where lane 0 is the one we're interested
24315 // in.
24316 SDLoc DL(N0);
24317 SDValue LHS =
24318 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
24319 SDValue RHS =
24320 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
24321 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
24322
24323 // Now duplicate the comparison mask we want across all other lanes.
24324 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
24325 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
24326 Mask = DAG.getNode(ISD::BITCAST, DL,
24327 ResVT.changeVectorElementTypeToInteger(), Mask);
24328
24329 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
24330}
24331
24332static SDValue performDUPCombine(SDNode *N,
24333 TargetLowering::DAGCombinerInfo &DCI) {
24334 EVT VT = N->getValueType(0);
24335 SDLoc DL(N);
24336 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
24337 // 128bit vector version.
24338 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
24339 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
24340 SmallVector<SDValue> Ops(N->ops());
24341 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
24342 DCI.DAG.getVTList(LVT), Ops)) {
24343 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
24344 DCI.DAG.getConstant(0, DL, MVT::i64));
24345 }
24346 }
24347
24348 if (N->getOpcode() == AArch64ISD::DUP) {
24349 if (DCI.isAfterLegalizeDAG()) {
24350 // If scalar dup's operand is extract_vector_elt, try to combine them into
24351 // duplane. For example,
24352 //
24353 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
24354 // t18: v4i32 = AArch64ISD::DUP t21
24355 // ==>
24356 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
24357 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
24358 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24359 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
24360 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
24361 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
24362 EXTRACT_VEC_ELT.getOperand(1));
24363 }
24364 }
24365 }
24366
24367 return performPostLD1Combine(N, DCI, false);
24368 }
24369
24370 return SDValue();
24371}
24372
24373/// Get rid of unnecessary NVCASTs (that don't change the type).
24374static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
24375 if (N->getValueType(0) == N->getOperand(0).getValueType())
24376 return N->getOperand(0);
24377 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
24378 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
24379 N->getOperand(0).getOperand(0));
24380
24381 return SDValue();
24382}
24383
24384// If all users of the globaladdr are of the form (globaladdr + constant), find
24385// the smallest constant, fold it into the globaladdr's offset and rewrite the
24386// globaladdr as (globaladdr + constant) - constant.
24387static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
24388 const AArch64Subtarget *Subtarget,
24389 const TargetMachine &TM) {
24390 auto *GN = cast<GlobalAddressSDNode>(N);
24391 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
24392 AArch64II::MO_NO_FLAG)
24393 return SDValue();
24394
24395 uint64_t MinOffset = -1ull;
24396 for (SDNode *N : GN->uses()) {
24397 if (N->getOpcode() != ISD::ADD)
24398 return SDValue();
24399 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
24400 if (!C)
24401 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
24402 if (!C)
24403 return SDValue();
24404 MinOffset = std::min(MinOffset, C->getZExtValue());
24405 }
24406 uint64_t Offset = MinOffset + GN->getOffset();
24407
24408 // Require that the new offset is larger than the existing one. Otherwise, we
24409 // can end up oscillating between two possible DAGs, for example,
24410 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
24411 if (Offset <= uint64_t(GN->getOffset()))
24412 return SDValue();
24413
24414 // Check whether folding this offset is legal. It must not go out of bounds of
24415 // the referenced object to avoid violating the code model, and must be
24416 // smaller than 2^20 because this is the largest offset expressible in all
24417 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
24418 // stores an immediate signed 21 bit offset.)
24419 //
24420 // This check also prevents us from folding negative offsets, which will end
24421 // up being treated in the same way as large positive ones. They could also
24422 // cause code model violations, and aren't really common enough to matter.
24423 if (Offset >= (1 << 20))
24424 return SDValue();
24425
24426 const GlobalValue *GV = GN->getGlobal();
24427 Type *T = GV->getValueType();
24428 if (!T->isSized() ||
24430 return SDValue();
24431
24432 SDLoc DL(GN);
24433 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
24434 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
24435 DAG.getConstant(MinOffset, DL, MVT::i64));
24436}
24437
24438static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
24439 const AArch64Subtarget *Subtarget) {
24440 SDValue BR = N->getOperand(0);
24441 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
24442 !BR.getValueType().isScalarInteger())
24443 return SDValue();
24444
24445 SDLoc DL(N);
24446 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
24447}
24448
24449// Turns the vector of indices into a vector of byte offsets by scaling Offset
24450// by (BitWidth / 8).
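// For example, with 32-bit elements an index vector {0, 1, 2, ...} becomes the
// byte-offset vector {0, 4, 8, ...}, i.e. a left shift by Log2_32(32 / 8) == 2.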
24452 SDLoc DL, unsigned BitWidth) {
24453 assert(Offset.getValueType().isScalableVector() &&
24454 "This method is only for scalable vectors of offsets");
24455
24456 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
24457 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
24458
24459 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
24460}
24461
24462/// Check if the value of \p OffsetInBytes can be used as an immediate for
24463/// the gather load/prefetch and scatter store instructions with vector base and
24464/// immediate offset addressing mode:
24465///
24466/// [<Zn>.[S|D]{, #<imm>}]
24467///
24468/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
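/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid immediates are
/// 0, 4, 8, ..., 124.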
24469inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
24470 unsigned ScalarSizeInBytes) {
24471 // The immediate is not a multiple of the scalar size.
24472 if (OffsetInBytes % ScalarSizeInBytes)
24473 return false;
24474
24475 // The immediate is out of range.
24476 if (OffsetInBytes / ScalarSizeInBytes > 31)
24477 return false;
24478
24479 return true;
24480}
24481
24482/// Check if the value of \p Offset represents a valid immediate for the SVE
24483/// gather load/prefetch and scatter store instructions with vector base and
24484/// immediate offset addressing mode:
24485///
24486/// [<Zn>.[S|D]{, #<imm>}]
24487///
24488/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
24490 unsigned ScalarSizeInBytes) {
24491 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
24492 return OffsetConst && isValidImmForSVEVecImmAddrMode(
24493 OffsetConst->getZExtValue(), ScalarSizeInBytes);
24494}
24495
24497 unsigned Opcode,
24498 bool OnlyPackedOffsets = true) {
24499 const SDValue Src = N->getOperand(2);
24500 const EVT SrcVT = Src->getValueType(0);
24501 assert(SrcVT.isScalableVector() &&
24502 "Scatter stores are only possible for SVE vectors");
24503
24504 SDLoc DL(N);
24505 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
24506
24507 // Make sure that source data will fit into an SVE register
24509 return SDValue();
24510
24511 // For FPs, ACLE only supports _packed_ single and double precision types.
24512 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
24513 if (SrcElVT.isFloatingPoint())
24514 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
24515 ((Opcode != AArch64ISD::SST1Q_PRED &&
24516 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
24517 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
24518 return SDValue();
24519
24520 // Depending on the addressing mode, this is either a pointer or a vector of
24521 // pointers (that fits into one register)
24522 SDValue Base = N->getOperand(4);
24523 // Depending on the addressing mode, this is either a single offset or a
24524 // vector of offsets (that fits into one register)
24525 SDValue Offset = N->getOperand(5);
24526
24527 // For "scalar + vector of indices", just scale the indices. This only
24528 // applies to non-temporal scatters because there's no instruction that takes
24529 // indices.
24530 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
24531 Offset =
24533 Opcode = AArch64ISD::SSTNT1_PRED;
24534 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
24535 Offset =
24537 Opcode = AArch64ISD::SST1Q_PRED;
24538 }
24539
24540 // In the case of non-temporal and quadword scatter stores there's only one
24541 // SVE addressing mode per data-size: "vector + scalar", i.e.
24542 // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
24543 // Since we do have intrinsics that allow the arguments to be in a different
24544 // order, we may need to swap them to match the spec.
24545 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
24546 Offset.getValueType().isVector())
24548
24549 // SST1_IMM requires that the offset is an immediate that is:
24550 // * a multiple of #SizeInBytes,
24551 // * in the range [0, 31 x #SizeInBytes],
24552 // where #SizeInBytes is the size in bytes of the stored items. For
24553 // immediates outside that range and non-immediate scalar offsets use SST1 or
24554 // SST1_UXTW instead.
24555 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
24557 SrcVT.getScalarSizeInBits() / 8)) {
24558 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24560 else
24561 Opcode = AArch64ISD::SST1_PRED;
24562
24564 }
24565 }
24566
24567 auto &TLI = DAG.getTargetLoweringInfo();
24568 if (!TLI.isTypeLegal(Base.getValueType()))
24569 return SDValue();
24570
24571 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
24572 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
24573 // nxv2i64. Legalize accordingly.
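 // An ANY_EXTEND is sufficient here because only the low 32 bits of each offset
 // are significant; the sign (sxtw) or zero (uxtw) extension is implied by the
 // particular scatter-store opcode being used.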
24574 if (!OnlyPackedOffsets &&
24575 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24576 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24577
24578 if (!TLI.isTypeLegal(Offset.getValueType()))
24579 return SDValue();
24580
24581 // Source value type that is representable in hardware
24582 EVT HwSrcVt = getSVEContainerType(SrcVT);
24583
24584 // Keep the original type of the input data to store - this is needed to be
24585 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
24586 // FP values we want the integer equivalent, so just use HwSrcVt.
24587 SDValue InputVT = DAG.getValueType(SrcVT);
24588 if (SrcVT.isFloatingPoint())
24589 InputVT = DAG.getValueType(HwSrcVt);
24590
24591 SDVTList VTs = DAG.getVTList(MVT::Other);
24592 SDValue SrcNew;
24593
24594 if (Src.getValueType().isFloatingPoint())
24595 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
24596 else
24597 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
24598
24599 SDValue Ops[] = {N->getOperand(0), // Chain
24600 SrcNew,
24601 N->getOperand(3), // Pg
24602 Base,
24603 Offset,
24604 InputVT};
24605
24606 return DAG.getNode(Opcode, DL, VTs, Ops);
24607}
24608
24610 unsigned Opcode,
24611 bool OnlyPackedOffsets = true) {
24612 const EVT RetVT = N->getValueType(0);
24613 assert(RetVT.isScalableVector() &&
24614 "Gather loads are only possible for SVE vectors");
24615
24616 SDLoc DL(N);
24617
24618 // Make sure that the loaded data will fit into an SVE register
24620 return SDValue();
24621
24622 // Depending on the addressing mode, this is either a pointer or a vector of
24623 // pointers (that fits into one register)
24624 SDValue Base = N->getOperand(3);
24625 // Depending on the addressing mode, this is either a single offset or a
24626 // vector of offsets (that fits into one register)
24627 SDValue Offset = N->getOperand(4);
24628
24629 // For "scalar + vector of indices", scale the indices to obtain unscaled
24630 // offsets. This applies to non-temporal and quadword gathers, which do not
24631 // have an addressing mode with scaled offset.
24634 RetVT.getScalarSizeInBits());
24636 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
24638 RetVT.getScalarSizeInBits());
24640 }
24641
24642 // In the case of non-temporal and quadword gather loads there's only one
24643 // addressing mode: "vector + scalar", e.g.
24644 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
24645 // Since we do have intrinsics that allow the arguments to be in a different
24646 // order, we may need to swap them to match the spec.
24647 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
24648 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
24649 Offset.getValueType().isVector())
24651
24652 // GLD{FF}1_IMM requires that the offset is an immediate that is:
24653 // * a multiple of #SizeInBytes,
24654 // * in the range [0, 31 x #SizeInBytes],
24655 // where #SizeInBytes is the size in bytes of the loaded items. For
24656 // immediates outside that range and non-immediate scalar offsets use
24657 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
24658 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
24661 RetVT.getScalarSizeInBits() / 8)) {
24662 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
24663 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24666 else
24667 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
24670
24672 }
24673 }
24674
24675 auto &TLI = DAG.getTargetLoweringInfo();
24676 if (!TLI.isTypeLegal(Base.getValueType()))
24677 return SDValue();
24678
24679 // Some gather load variants allow unpacked offsets, but only as nxv2i32
24680 // vectors. These are implicitly sign- (sxtw) or zero- (uxtw) extended to
24681 // nxv2i64. Legalize accordingly.
24682 if (!OnlyPackedOffsets &&
24683 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
24684 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
24685
24686 // Return value type that is representable in hardware
24687 EVT HwRetVt = getSVEContainerType(RetVT);
24688
24689 // Keep the original output value type around - this is needed to be able to
24690 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
24691 // values we want the integer equivalent, so just use HwRetVT.
24692 SDValue OutVT = DAG.getValueType(RetVT);
24693 if (RetVT.isFloatingPoint())
24694 OutVT = DAG.getValueType(HwRetVt);
24695
24696 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
24697 SDValue Ops[] = {N->getOperand(0), // Chain
24698 N->getOperand(2), // Pg
24699 Base, Offset, OutVT};
24700
24701 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
24702 SDValue LoadChain = SDValue(Load.getNode(), 1);
24703
24704 if (RetVT.isInteger() && (RetVT != HwRetVt))
24705 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
24706
24707 // If the original return value was FP, bitcast accordingly. Doing it here
24708 // means that we can avoid adding TableGen patterns for FPs.
24709 if (RetVT.isFloatingPoint())
24710 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
24711
24712 return DAG.getMergeValues({Load, LoadChain}, DL);
24713}
24714
24715static SDValue
24717 SelectionDAG &DAG) {
24718 SDLoc DL(N);
24719 SDValue Src = N->getOperand(0);
24720 unsigned Opc = Src->getOpcode();
24721
24722 // Sign extend of an unsigned unpack -> signed unpack
24723 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
24724
24725 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
24727
24728 // Push the sign extend to the operand of the unpack
24729 // This is necessary where, for example, the operand of the unpack
24730 // is another unpack:
24731 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
24732 // ->
24733 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
24734 // ->
24735 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
24736 SDValue ExtOp = Src->getOperand(0);
24737 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
24738 EVT EltTy = VT.getVectorElementType();
24739 (void)EltTy;
24740
24741 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
24742 "Sign extending from an invalid type");
24743
24744 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
24745
24747 ExtOp, DAG.getValueType(ExtVT));
24748
24749 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
24750 }
24751
24752 if (DCI.isBeforeLegalizeOps())
24753 return SDValue();
24754
24756 return SDValue();
24757
24758 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
24759 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
24760 unsigned NewOpc;
24761 unsigned MemVTOpNum = 4;
24762 switch (Opc) {
24765 MemVTOpNum = 3;
24766 break;
24769 MemVTOpNum = 3;
24770 break;
24773 MemVTOpNum = 3;
24774 break;
24777 break;
24780 break;
24783 break;
24786 break;
24789 break;
24792 break;
24795 break;
24798 break;
24801 break;
24804 break;
24807 break;
24810 break;
24813 break;
24816 break;
24819 break;
24820 default:
24821 return SDValue();
24822 }
24823
24824 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
24825 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
24826
24827 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
24828 return SDValue();
24829
24830 EVT DstVT = N->getValueType(0);
24831 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
24832
24834 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
24835 Ops.push_back(Src->getOperand(I));
24836
24837 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
24838 DCI.CombineTo(N, ExtLoad);
24839 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
24840
24841 // Return N so it doesn't get rechecked
24842 return SDValue(N, 0);
24843}
24844
24845/// Legalize the gather prefetch (scalar + vector addressing mode) when the
24846/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
24847/// != nxv2i32) do not need legalization.
24849 const unsigned OffsetPos = 4;
24850 SDValue Offset = N->getOperand(OffsetPos);
24851
24852 // Not an unpacked vector, bail out.
24853 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
24854 return SDValue();
24855
24856 // Extend the unpacked offset vector to 64-bit lanes.
24857 SDLoc DL(N);
24858 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
24859 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24860 // Replace the offset operand with the 64-bit one.
24861 Ops[OffsetPos] = Offset;
24862
24863 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24864}
24865
24866/// Combines a node carrying the intrinsic
24867/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
24868/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
24869/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
24870/// SVE gather prefetch instruction with vector plus immediate addressing mode.
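/// In effect the two operands swap roles: the out-of-range scalar offset becomes
/// the base address, and the original vector base is reinterpreted as a vector
/// of uxtw-extended byte indices.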
24872 unsigned ScalarSizeInBytes) {
24873 const unsigned ImmPos = 4, OffsetPos = 3;
24874 // No need to combine the node if the immediate is valid...
24875 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
24876 return SDValue();
24877
24878 // ...otherwise swap the offset base with the offset...
24879 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
24880 std::swap(Ops[ImmPos], Ops[OffsetPos]);
24881 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
24882 // `aarch64_sve_prfb_gather_uxtw_index`.
24883 SDLoc DL(N);
24884 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
24885 MVT::i64);
24886
24887 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
24888}
24889
24890// Return true if the vector operation can guarantee that only the first lane of
24891// its result contains data, with all bits in other lanes set to zero.
24893 switch (Op.getOpcode()) {
24894 default:
24895 return false;
24911 return true;
24912 }
24913}
24914
24916 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
24917 SDValue InsertVec = N->getOperand(0);
24918 SDValue InsertElt = N->getOperand(1);
24919 SDValue InsertIdx = N->getOperand(2);
24920
24921 // We only care about inserts into the first element...
24922 if (!isNullConstant(InsertIdx))
24923 return SDValue();
24924 // ...of a zero'd vector...
24926 return SDValue();
24927 // ...where the inserted data was previously extracted...
24928 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24929 return SDValue();
24930
24931 SDValue ExtractVec = InsertElt.getOperand(0);
24932 SDValue ExtractIdx = InsertElt.getOperand(1);
24933
24934 // ...from the first element of a vector.
24935 if (!isNullConstant(ExtractIdx))
24936 return SDValue();
24937
24938 // If we get here we are effectively trying to zero lanes 1-N of a vector.
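 // i.e. the pattern is
 //   insert_vector_elt(zero_vector, extract_vector_elt(X, 0), 0)
 // and when lanes 1-N of X are already known to be zero, X can be used directly.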
24939
24940 // Ensure there's no type conversion going on.
24941 if (N->getValueType(0) != ExtractVec.getValueType())
24942 return SDValue();
24943
24944 if (!isLanes1toNKnownZero(ExtractVec))
24945 return SDValue();
24946
24947 // The explicit zeroing is redundant.
24948 return ExtractVec;
24949}
24950
24951static SDValue
24954 return Res;
24955
24956 return performPostLD1Combine(N, DCI, true);
24957}
24958
24961 const AArch64Subtarget *Subtarget) {
24962 SDValue N0 = N->getOperand(0);
24963 EVT VT = N->getValueType(0);
24964
24965 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
24966 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
24967 return SDValue();
24968
24969 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
24970 EVT EltVT = VT.getVectorElementType();
24971 return EltVT == MVT::f32 || EltVT == MVT::f64;
24972 };
24973
24974 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
24975 // We purposefully don't care about legality of the nodes here as we know
24976 // they can be split down into something legal.
24977 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
24978 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
24979 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
24980 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
24981 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
24982 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
24983 LN0->getChain(), LN0->getBasePtr(),
24984 N0.getValueType(), LN0->getMemOperand());
24985 DCI.CombineTo(N, ExtLoad);
24986 DCI.CombineTo(
24987 N0.getNode(),
24988 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
24989 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
24990 ExtLoad.getValue(1));
24991 return SDValue(N, 0); // Return N so it doesn't get rechecked!
24992 }
24993
24994 return SDValue();
24995}
24996
24998 const AArch64Subtarget *Subtarget) {
24999 EVT VT = N->getValueType(0);
25000
25001 // Don't expand for NEON, SVE2 or SME
25002 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
25003 return SDValue();
25004
25005 SDLoc DL(N);
25006
25007 SDValue Mask = N->getOperand(0);
25008 SDValue In1 = N->getOperand(1);
25009 SDValue In2 = N->getOperand(2);
25010
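 // Expand bsp(mask, in1, in2) into the equivalent (mask & in1) | (~mask & in2).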
25011 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
25012 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
25013 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
25014 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
25015}
25016
25018 EVT VT = N->getValueType(0);
25019
25020 SDValue Insert = N->getOperand(0);
25021 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
25022 return SDValue();
25023
25024 if (!Insert.getOperand(0).isUndef())
25025 return SDValue();
25026
25027 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
25028 uint64_t IdxDupLane = N->getConstantOperandVal(1);
25029 if (IdxInsert != 0 || IdxDupLane != 0)
25030 return SDValue();
25031
25032 SDValue Bitcast = Insert.getOperand(1);
25033 if (Bitcast.getOpcode() != ISD::BITCAST)
25034 return SDValue();
25035
25036 SDValue Subvec = Bitcast.getOperand(0);
25037 EVT SubvecVT = Subvec.getValueType();
25038 if (!SubvecVT.is128BitVector())
25039 return SDValue();
25040 EVT NewSubvecVT =
25042
25043 SDLoc DL(N);
25044 SDValue NewInsert =
25045 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
25046 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
25047 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
25048 NewInsert, N->getOperand(1));
25049 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
25050}
25051
25052// Try to combine mull with uzp1.
25055 SelectionDAG &DAG) {
25056 if (DCI.isBeforeLegalizeOps())
25057 return SDValue();
25058
25059 SDValue LHS = N->getOperand(0);
25060 SDValue RHS = N->getOperand(1);
25061
25062 SDValue ExtractHigh;
25063 SDValue ExtractLow;
25064 SDValue TruncHigh;
25065 SDValue TruncLow;
25066 SDLoc DL(N);
25067
25068 // Check the operands are trunc and extract_high.
25070 RHS.getOpcode() == ISD::TRUNCATE) {
25071 TruncHigh = RHS;
25072 if (LHS.getOpcode() == ISD::BITCAST)
25073 ExtractHigh = LHS.getOperand(0);
25074 else
25075 ExtractHigh = LHS;
25077 LHS.getOpcode() == ISD::TRUNCATE) {
25078 TruncHigh = LHS;
25079 if (RHS.getOpcode() == ISD::BITCAST)
25080 ExtractHigh = RHS.getOperand(0);
25081 else
25082 ExtractHigh = RHS;
25083 } else
25084 return SDValue();
25085
25086 // If the truncate's operand is a BUILD_VECTOR with DUP, do not combine the
25087 // op with uzp1.
25088 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
25089 SDValue TruncHighOp = TruncHigh.getOperand(0);
25090 EVT TruncHighOpVT = TruncHighOp.getValueType();
25091 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
25092 DAG.isSplatValue(TruncHighOp, false))
25093 return SDValue();
25094
25095 // Check there is other extract_high with same source vector.
25096 // For example,
25097 //
25098 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
25099 // t12: v4i16 = truncate t11
25100 // t31: v4i32 = AArch64ISD::SMULL t18, t12
25101 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
25102 // t16: v4i16 = truncate t15
25103 // t30: v4i32 = AArch64ISD::SMULL t23, t1
25104 //
25105 // This DAG combine assumes the two extract_high nodes use the same source
25106 // vector in order to detect the pair of MULLs. If they use different source
25107 // vectors, this code will not work.
25108 // TODO: Should also try to look through a bitcast.
25109 bool HasFoundMULLow = true;
25110 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
25111 if (ExtractHighSrcVec->use_size() != 2)
25112 HasFoundMULLow = false;
25113
25114 // Find ExtractLow.
25115 for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) {
25116 if (User == ExtractHigh.getNode())
25117 continue;
25118
25119 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
25121 HasFoundMULLow = false;
25122 break;
25123 }
25124
25125 ExtractLow.setNode(User);
25126 }
25127
25128 if (!ExtractLow || !ExtractLow->hasOneUse())
25129 HasFoundMULLow = false;
25130
25131 // Check ExtractLow's user.
25132 if (HasFoundMULLow) {
25133 SDNode *ExtractLowUser = *ExtractLow.getNode()->use_begin();
25134 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
25135 HasFoundMULLow = false;
25136 } else {
25137 if (ExtractLowUser->getOperand(0) == ExtractLow) {
25138 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
25139 TruncLow = ExtractLowUser->getOperand(1);
25140 else
25141 HasFoundMULLow = false;
25142 } else {
25143 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
25144 TruncLow = ExtractLowUser->getOperand(0);
25145 else
25146 HasFoundMULLow = false;
25147 }
25148 }
25149 }
25150
25151 // If the truncate's operand is a BUILD_VECTOR with DUP, do not combine the
25152 // op with uzp1.
25153 // You can see the regressions in test/CodeGen/AArch64/aarch64-smull.ll.
25154 EVT TruncHighVT = TruncHigh.getValueType();
25155 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
25156 SDValue TruncLowOp =
25157 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getUNDEF(UZP1VT);
25158 EVT TruncLowOpVT = TruncLowOp.getValueType();
25159 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
25160 DAG.isSplatValue(TruncLowOp, false)))
25161 return SDValue();
25162
25163 // Create uzp1, extract_high and extract_low.
25164 if (TruncHighOpVT != UZP1VT)
25165 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
25166 if (TruncLowOpVT != UZP1VT)
25167 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
25168
25169 SDValue UZP1 =
25170 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
25171 SDValue HighIdxCst =
25172 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
25173 SDValue NewTruncHigh =
25174 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
25175 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
25176
25177 if (HasFoundMULLow) {
25178 EVT TruncLowVT = TruncLow.getValueType();
25179 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
25180 UZP1, ExtractLow.getOperand(1));
25181 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
25182 }
25183
25184 return SDValue(N, 0);
25185}
25186
25189 SelectionDAG &DAG) {
25190 if (SDValue Val =
25192 return Val;
25193
25194 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
25195 return Val;
25196
25197 return SDValue();
25198}
25199
25200static SDValue
25202 SelectionDAG &DAG) {
25203 // Let's do the following transform.
25204 //
25205 // t34: v4i32 = AArch64ISD::UADDLV t2
25206 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
25207 // t7: i64 = zero_extend t35
25208 // t20: v1i64 = scalar_to_vector t7
25209 // ==>
25210 // t34: v4i32 = AArch64ISD::UADDLV t2
25211 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
25212 // t40: v1i64 = AArch64ISD::NVCAST t39
25213 if (DCI.isBeforeLegalizeOps())
25214 return SDValue();
25215
25216 EVT VT = N->getValueType(0);
25217 if (VT != MVT::v1i64)
25218 return SDValue();
25219
25220 SDValue ZEXT = N->getOperand(0);
25221 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
25222 return SDValue();
25223
25224 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
25225 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
25226 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
25227 return SDValue();
25228
25229 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
25230 return SDValue();
25231
25232 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
25233 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
25234 UADDLV.getValueType() != MVT::v4i32 ||
25235 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
25236 return SDValue();
25237
25238 // Let's generate a new sequence with AArch64ISD::NVCAST.
25239 SDLoc DL(N);
25240 SDValue EXTRACT_SUBVEC =
25241 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
25242 DAG.getConstant(0, DL, MVT::i64));
25243 SDValue NVCAST =
25244 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
25245
25246 return NVCAST;
25247}
25248
25250 DAGCombinerInfo &DCI) const {
25251 SelectionDAG &DAG = DCI.DAG;
25252 switch (N->getOpcode()) {
25253 default:
25254 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
25255 break;
25256 case ISD::VECREDUCE_AND:
25257 case ISD::VECREDUCE_OR:
25258 case ISD::VECREDUCE_XOR:
25259 return performVecReduceBitwiseCombine(N, DCI, DAG);
25260 case ISD::ADD:
25261 case ISD::SUB:
25262 return performAddSubCombine(N, DCI);
25263 case ISD::BUILD_VECTOR:
25264 return performBuildVectorCombine(N, DCI, DAG);
25265 case ISD::TRUNCATE:
25266 return performTruncateCombine(N, DAG);
25267 case AArch64ISD::ANDS:
25268 return performFlagSettingCombine(N, DCI, ISD::AND);
25269 case AArch64ISD::ADC:
25270 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
25271 return R;
25272 return foldADCToCINC(N, DAG);
25273 case AArch64ISD::SBC:
25274 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
25275 case AArch64ISD::ADCS:
25276 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
25277 return R;
25279 case AArch64ISD::SBCS:
25280 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
25281 return R;
25283 case AArch64ISD::BICi: {
25285 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
25286 APInt DemandedElts =
25287 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
25288
25290 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
25291 return SDValue();
25292
25293 break;
25294 }
25295 case ISD::XOR:
25296 return performXorCombine(N, DAG, DCI, Subtarget);
25297 case ISD::MUL:
25298 return performMulCombine(N, DAG, DCI, Subtarget);
25299 case ISD::SINT_TO_FP:
25300 case ISD::UINT_TO_FP:
25301 return performIntToFpCombine(N, DAG, Subtarget);
25302 case ISD::FP_TO_SINT:
25303 case ISD::FP_TO_UINT:
25306 return performFpToIntCombine(N, DAG, DCI, Subtarget);
25307 case ISD::OR:
25308 return performORCombine(N, DCI, Subtarget, *this);
25309 case ISD::AND:
25310 return performANDCombine(N, DCI);
25311 case ISD::FADD:
25312 return performFADDCombine(N, DCI);
25314 return performIntrinsicCombine(N, DCI, Subtarget);
25315 case ISD::ANY_EXTEND:
25316 case ISD::ZERO_EXTEND:
25317 case ISD::SIGN_EXTEND:
25318 return performExtendCombine(N, DCI, DAG);
25320 return performSignExtendInRegCombine(N, DCI, DAG);
25322 return performConcatVectorsCombine(N, DCI, DAG);
25324 return performExtractSubvectorCombine(N, DCI, DAG);
25326 return performInsertSubvectorCombine(N, DCI, DAG);
25327 case ISD::SELECT:
25328 return performSelectCombine(N, DCI);
25329 case ISD::VSELECT:
25330 return performVSelectCombine(N, DCI.DAG);
25331 case ISD::SETCC:
25332 return performSETCCCombine(N, DCI, DAG);
25333 case ISD::LOAD:
25334 return performLOADCombine(N, DCI, DAG, Subtarget);
25335 case ISD::STORE:
25336 return performSTORECombine(N, DCI, DAG, Subtarget);
25337 case ISD::MSTORE:
25338 return performMSTORECombine(N, DCI, DAG, Subtarget);
25339 case ISD::MGATHER:
25340 case ISD::MSCATTER:
25341 return performMaskedGatherScatterCombine(N, DCI, DAG);
25342 case ISD::FP_EXTEND:
25343 return performFPExtendCombine(N, DAG, DCI, Subtarget);
25344 case AArch64ISD::BRCOND:
25345 return performBRCONDCombine(N, DCI, DAG);
25346 case AArch64ISD::TBNZ:
25347 case AArch64ISD::TBZ:
25348 return performTBZCombine(N, DCI, DAG);
25349 case AArch64ISD::CSEL:
25350 return performCSELCombine(N, DCI, DAG);
25351 case AArch64ISD::DUP:
25356 return performDUPCombine(N, DCI);
25358 return performDupLane128Combine(N, DAG);
25359 case AArch64ISD::NVCAST:
25360 return performNVCASTCombine(N, DAG);
25361 case AArch64ISD::SPLICE:
25362 return performSpliceCombine(N, DAG);
25365 return performUnpackCombine(N, DAG, Subtarget);
25366 case AArch64ISD::UZP1:
25367 case AArch64ISD::UZP2:
25368 return performUzpCombine(N, DAG, Subtarget);
25370 return performSetccMergeZeroCombine(N, DCI);
25387 return performGLD1Combine(N, DAG);
25388 case AArch64ISD::VASHR:
25389 case AArch64ISD::VLSHR:
25390 return performVectorShiftCombine(N, *this, DCI);
25392 return performSunpkloCombine(N, DAG);
25393 case AArch64ISD::BSP:
25394 return performBSPExpandForSVE(N, DAG, Subtarget);
25396 return performInsertVectorEltCombine(N, DCI);
25398 return performExtractVectorEltCombine(N, DCI, Subtarget);
25399 case ISD::VECREDUCE_ADD:
25400 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
25401 case AArch64ISD::UADDV:
25402 return performUADDVCombine(N, DAG);
25403 case AArch64ISD::SMULL:
25404 case AArch64ISD::UMULL:
25405 case AArch64ISD::PMULL:
25406 return performMULLCombine(N, DCI, DAG);
25409 switch (N->getConstantOperandVal(1)) {
25410 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
25411 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
25412 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
25413 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
25414 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
25415 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
25416 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
25417 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
25418 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
25419 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
25420 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
25421 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
25422 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
25423 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
25424 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
25425 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
25427 case Intrinsic::aarch64_neon_ld2:
25428 case Intrinsic::aarch64_neon_ld3:
25429 case Intrinsic::aarch64_neon_ld4:
25430 case Intrinsic::aarch64_neon_ld1x2:
25431 case Intrinsic::aarch64_neon_ld1x3:
25432 case Intrinsic::aarch64_neon_ld1x4:
25433 case Intrinsic::aarch64_neon_ld2lane:
25434 case Intrinsic::aarch64_neon_ld3lane:
25435 case Intrinsic::aarch64_neon_ld4lane:
25436 case Intrinsic::aarch64_neon_ld2r:
25437 case Intrinsic::aarch64_neon_ld3r:
25438 case Intrinsic::aarch64_neon_ld4r:
25439 case Intrinsic::aarch64_neon_st2:
25440 case Intrinsic::aarch64_neon_st3:
25441 case Intrinsic::aarch64_neon_st4:
25442 case Intrinsic::aarch64_neon_st1x2:
25443 case Intrinsic::aarch64_neon_st1x3:
25444 case Intrinsic::aarch64_neon_st1x4:
25445 case Intrinsic::aarch64_neon_st2lane:
25446 case Intrinsic::aarch64_neon_st3lane:
25447 case Intrinsic::aarch64_neon_st4lane:
25448 return performNEONPostLDSTCombine(N, DCI, DAG);
25449 case Intrinsic::aarch64_sve_ldnt1:
25450 return performLDNT1Combine(N, DAG);
25451 case Intrinsic::aarch64_sve_ld1rq:
25452 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
25453 case Intrinsic::aarch64_sve_ld1ro:
25454 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
25455 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
25457 case Intrinsic::aarch64_sve_ldnt1_gather:
25459 case Intrinsic::aarch64_sve_ldnt1_gather_index:
25460 return performGatherLoadCombine(N, DAG,
25462 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
25464 case Intrinsic::aarch64_sve_ld1:
25466 case Intrinsic::aarch64_sve_ldnf1:
25468 case Intrinsic::aarch64_sve_ldff1:
25470 case Intrinsic::aarch64_sve_st1:
25471 return performST1Combine(N, DAG);
25472 case Intrinsic::aarch64_sve_stnt1:
25473 return performSTNT1Combine(N, DAG);
25474 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
25476 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
25478 case Intrinsic::aarch64_sve_stnt1_scatter:
25480 case Intrinsic::aarch64_sve_stnt1_scatter_index:
25482 case Intrinsic::aarch64_sve_ld1_gather:
25484 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
25485 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
25487 case Intrinsic::aarch64_sve_ld1q_gather_index:
25488 return performGatherLoadCombine(N, DAG,
25490 case Intrinsic::aarch64_sve_ld1_gather_index:
25491 return performGatherLoadCombine(N, DAG,
25493 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
25495 /*OnlyPackedOffsets=*/false);
25496 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
25498 /*OnlyPackedOffsets=*/false);
25499 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
25500 return performGatherLoadCombine(N, DAG,
25502 /*OnlyPackedOffsets=*/false);
25503 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
25504 return performGatherLoadCombine(N, DAG,
25506 /*OnlyPackedOffsets=*/false);
25507 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
25509 case Intrinsic::aarch64_sve_ldff1_gather:
25511 case Intrinsic::aarch64_sve_ldff1_gather_index:
25512 return performGatherLoadCombine(N, DAG,
25514 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
25515 return performGatherLoadCombine(N, DAG,
25517 /*OnlyPackedOffsets=*/false);
25518 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
25519 return performGatherLoadCombine(N, DAG,
25521 /*OnlyPackedOffsets=*/false);
25522 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
25523 return performGatherLoadCombine(N, DAG,
25525 /*OnlyPackedOffsets=*/false);
25526 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
25527 return performGatherLoadCombine(N, DAG,
25529 /*OnlyPackedOffsets=*/false);
25530 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
25531 return performGatherLoadCombine(N, DAG,
25533 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
25534 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
25536 case Intrinsic::aarch64_sve_st1q_scatter_index:
25538 case Intrinsic::aarch64_sve_st1_scatter:
25540 case Intrinsic::aarch64_sve_st1_scatter_index:
25542 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
25544 /*OnlyPackedOffsets=*/false);
25545 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
25547 /*OnlyPackedOffsets=*/false);
25548 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
25549 return performScatterStoreCombine(N, DAG,
25551 /*OnlyPackedOffsets=*/false);
25552 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
25553 return performScatterStoreCombine(N, DAG,
25555 /*OnlyPackedOffsets=*/false);
25556 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
25558 case Intrinsic::aarch64_rndr:
25559 case Intrinsic::aarch64_rndrrs: {
25560 unsigned IntrinsicID = N->getConstantOperandVal(1);
25561 auto Register =
25562 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
25563 : AArch64SysReg::RNDRRS);
25564 SDLoc DL(N);
25565 SDValue A = DAG.getNode(
25566 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
25567 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
25568 SDValue B = DAG.getNode(
25569 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
25570 DAG.getConstant(0, DL, MVT::i32),
25571 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
25572 return DAG.getMergeValues(
25573 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
25574 }
25575 case Intrinsic::aarch64_sme_ldr_zt:
25577 DAG.getVTList(MVT::Other), N->getOperand(0),
25578 N->getOperand(2), N->getOperand(3));
25579 case Intrinsic::aarch64_sme_str_zt:
25580 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
25581 DAG.getVTList(MVT::Other), N->getOperand(0),
25582 N->getOperand(2), N->getOperand(3));
25583 default:
25584 break;
25585 }
25586 break;
25587 case ISD::GlobalAddress:
25588 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
25589 case ISD::CTLZ:
25590 return performCTLZCombine(N, DAG, Subtarget);
25592 return performScalarToVectorCombine(N, DCI, DAG);
25593 }
25594 return SDValue();
25595}
25596
25597// Check if the return value is used only as a return value, as otherwise
25598// we can't perform a tail-call. In particular, we need to check for
25599// target ISD nodes that are returns and any other "odd" constructs
25600// that the generic analysis code won't necessarily catch.
25601bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
25602 SDValue &Chain) const {
25603 if (N->getNumValues() != 1)
25604 return false;
25605 if (!N->hasNUsesOfValue(1, 0))
25606 return false;
25607
25608 SDValue TCChain = Chain;
25609 SDNode *Copy = *N->use_begin();
25610 if (Copy->getOpcode() == ISD::CopyToReg) {
25611 // If the copy has a glue operand, we conservatively assume it isn't safe to
25612 // perform a tail call.
25613 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
25614 MVT::Glue)
25615 return false;
25616 TCChain = Copy->getOperand(0);
25617 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
25618 return false;
25619
25620 bool HasRet = false;
25621 for (SDNode *Node : Copy->uses()) {
25622 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
25623 return false;
25624 HasRet = true;
25625 }
25626
25627 if (!HasRet)
25628 return false;
25629
25630 Chain = TCChain;
25631 return true;
25632}
25633
25634// Return whether an instruction can potentially be optimized to a tail
25635// call. This will cause the optimizers to attempt to move, or duplicate,
25636// return instructions to help enable tail call optimizations for this
25637// instruction.
25638bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
25639 return CI->isTailCall();
25640}
25641
25642bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
25643 Register Offset, bool IsPre,
25644 MachineRegisterInfo &MRI) const {
25645 auto CstOffset = getIConstantVRegVal(Offset, MRI);
25646 if (!CstOffset || CstOffset->isZero())
25647 return false;
25648
25649 // All of the indexed addressing mode instructions take a signed 9 bit
25650 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
25651 // encodes the sign/indexing direction.
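 // That is, only offsets in the range [-256, 255] are accepted.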
25652 return isInt<9>(CstOffset->getSExtValue());
25653}
25654
25655bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
25656 SDValue &Base,
25657 SDValue &Offset,
25658 SelectionDAG &DAG) const {
25659 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
25660 return false;
25661
25662 // Non-null if there is exactly one user of the loaded value (ignoring chain).
25663 SDNode *ValOnlyUser = nullptr;
25664 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
25665 ++UI) {
25666 if (UI.getUse().getResNo() == 1)
25667 continue; // Ignore chain.
25668 if (ValOnlyUser == nullptr)
25669 ValOnlyUser = *UI;
25670 else {
25671 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
25672 break;
25673 }
25674 }
25675
25676 auto IsUndefOrZero = [](SDValue V) {
25677 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
25678 };
25679
25680 // If the only user of the value is a scalable vector splat, it is
25681 // preferable to do a replicating load (ld1r*).
25682 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
25683 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
25684 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
25685 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
25686 return false;
25687
25688 Base = Op->getOperand(0);
25689 // All of the indexed addressing mode instructions take a signed
25690 // 9 bit immediate offset.
25691 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
25692 int64_t RHSC = RHS->getSExtValue();
25693 if (Op->getOpcode() == ISD::SUB)
25694 RHSC = -(uint64_t)RHSC;
25695 if (!isInt<9>(RHSC))
25696 return false;
25697 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
25698 // when dealing with subtraction.
25699 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
25700 return true;
25701 }
25702 return false;
25703}
25704
25705bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
25706 SDValue &Offset,
25708 SelectionDAG &DAG) const {
25709 EVT VT;
25710 SDValue Ptr;
25711 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25712 VT = LD->getMemoryVT();
25713 Ptr = LD->getBasePtr();
25714 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25715 VT = ST->getMemoryVT();
25716 Ptr = ST->getBasePtr();
25717 } else
25718 return false;
25719
25720 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
25721 return false;
25722 AM = ISD::PRE_INC;
25723 return true;
25724}
25725
25726bool AArch64TargetLowering::getPostIndexedAddressParts(
25728 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
25729 EVT VT;
25730 SDValue Ptr;
25731 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
25732 VT = LD->getMemoryVT();
25733 Ptr = LD->getBasePtr();
25734 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
25735 VT = ST->getMemoryVT();
25736 Ptr = ST->getBasePtr();
25737 } else
25738 return false;
25739
25740 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
25741 return false;
25742 // Post-indexing updates the base, so it's not a valid transform
25743 // if that's not the same as the load's pointer.
25744 if (Ptr != Base)
25745 return false;
25746 AM = ISD::POST_INC;
25747 return true;
25748}
25749
25752 SelectionDAG &DAG) {
25753 SDLoc DL(N);
25754 SDValue Op = N->getOperand(0);
25755 EVT VT = N->getValueType(0);
25756 [[maybe_unused]] EVT SrcVT = Op.getValueType();
25757 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25758 "Must be bool vector.");
25759
25760 // Special handling for Clang's __builtin_convertvector. For vectors with <8
25761 // elements, it adds a vector concatenation with undef(s). If we encounter
25762 // this here, we can skip the concat.
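 // For example, a v4i1 source may appear here as
 //   concat_vectors(v4i1 X, undef, undef, undef)
 // in which case only X needs to be turned into a scalar bitmask.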
25763 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
25764 bool AllUndef = true;
25765 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
25766 AllUndef &= Op.getOperand(I).isUndef();
25767
25768 if (AllUndef)
25769 Op = Op.getOperand(0);
25770 }
25771
25772 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
25773 if (VectorBits)
25774 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
25775}
25776
25779 SelectionDAG &DAG, EVT ExtendVT,
25780 EVT CastVT) {
25781 SDLoc DL(N);
25782 SDValue Op = N->getOperand(0);
25783 EVT VT = N->getValueType(0);
25784
25785 // Use SCALAR_TO_VECTOR for lane zero
25786 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
25787 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
25788 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
25789 Results.push_back(
25790 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
25791}
25792
25793void AArch64TargetLowering::ReplaceBITCASTResults(
25795 SDLoc DL(N);
25796 SDValue Op = N->getOperand(0);
25797 EVT VT = N->getValueType(0);
25798 EVT SrcVT = Op.getValueType();
25799
25800 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
25801 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
25802 return;
25803 }
25804
25805 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
25806 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
25807 return;
25808 }
25809
25810 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
25811 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
25812 return;
25813 }
25814
25815 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
25816 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
25817 "Expected fp->int bitcast!");
25818
25819 // Bitcasting between unpacked vector types of different element counts is
25820 // not a NOP because the live elements are laid out differently.
25821 // 01234567
25822 // e.g. nxv2i32 = XX??XX??
25823 // nxv4f16 = X?X?X?X?
25824 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
25825 return;
25826
25827 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
25828 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
25829 return;
25830 }
25831
25832 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
25833 !VT.isVector())
25834 return replaceBoolVectorBitcast(N, Results, DAG);
25835
25836 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
25837 return;
25838
25839 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
25840 DAG.getUNDEF(MVT::i32), Op);
25841 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
25842 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
25843}
25844
25846 SelectionDAG &DAG,
25847 const AArch64Subtarget *Subtarget) {
25848 EVT VT = N->getValueType(0);
25849 if (!VT.is256BitVector() ||
25851 !N->getFlags().hasAllowReassociation()) ||
25852 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
25853 VT.getScalarType() == MVT::bf16)
25854 return;
25855
25856 SDValue X = N->getOperand(0);
25857 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
25858 if (!Shuf) {
25859 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
25860 X = N->getOperand(1);
25861 if (!Shuf)
25862 return;
25863 }
25864
25865 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
25866 return;
25867
25868 // Check the mask is 1,0,3,2,5,4,...
25869 ArrayRef<int> Mask = Shuf->getMask();
25870 for (int I = 0, E = Mask.size(); I < E; I++)
25871 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
25872 return;
25873
25874 SDLoc DL(N);
25875 auto LoHi = DAG.SplitVector(X, DL);
25876 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
25877 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
25878 LoHi.first, LoHi.second);
25879
25880 // Shuffle the elements back into order.
25881 SmallVector<int> NMask;
25882 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
25883 NMask.push_back(I);
25884 NMask.push_back(I);
25885 }
25886 Results.push_back(
25887 DAG.getVectorShuffle(VT, DL,
25888 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
25889 DAG.getUNDEF(LoHi.first.getValueType())),
25890 DAG.getUNDEF(VT), NMask));
25891}
25892
25895 SelectionDAG &DAG, unsigned InterOp,
25896 unsigned AcrossOp) {
25897 EVT LoVT, HiVT;
25898 SDValue Lo, Hi;
25899 SDLoc dl(N);
25900 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
25901 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25902 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
25903 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
25904 Results.push_back(SplitVal);
25905}
25906
25907void AArch64TargetLowering::ReplaceExtractSubVectorResults(
25909 SDValue In = N->getOperand(0);
25910 EVT InVT = In.getValueType();
25911
25912 // Common code will handle these just fine.
25913 if (!InVT.isScalableVector() || !InVT.isInteger())
25914 return;
25915
25916 SDLoc DL(N);
25917 EVT VT = N->getValueType(0);
25918
25919 // The following checks bail if this is not a halving operation.
25920
25922
25923 if (InVT.getVectorElementCount() != (ResEC * 2))
25924 return;
25925
25926 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
25927 if (!CIndex)
25928 return;
25929
25930 unsigned Index = CIndex->getZExtValue();
25931 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
25932 return;
25933
25934 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
25935 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
25936
25937 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
25938 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
25939}
25940
25941// Create an even/odd pair of X registers holding integer value V.
25943 SDLoc dl(V.getNode());
25944 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i64, MVT::i64);
25945 if (DAG.getDataLayout().isBigEndian())
25946 std::swap (VLo, VHi);
25947 SDValue RegClass =
25948 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
25949 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
25950 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
25951 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
25952 return SDValue(
25953 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
25954}
25955
25958 SelectionDAG &DAG,
25959 const AArch64Subtarget *Subtarget) {
25960 assert(N->getValueType(0) == MVT::i128 &&
25961 "AtomicCmpSwap on types less than 128 should be legal");
25962
25963 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
25964 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
25965 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
25966 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
25967 SDValue Ops[] = {
25968 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
25969 createGPRPairNode(DAG, N->getOperand(3)), // Store value
25970 N->getOperand(1), // Ptr
25971 N->getOperand(0), // Chain in
25972 };
25973
25974 unsigned Opcode;
25975 switch (MemOp->getMergedOrdering()) {
25977 Opcode = AArch64::CASPX;
25978 break;
25980 Opcode = AArch64::CASPAX;
25981 break;
25983 Opcode = AArch64::CASPLX;
25984 break;
25987 Opcode = AArch64::CASPALX;
25988 break;
25989 default:
25990 llvm_unreachable("Unexpected ordering!");
25991 }
25992
25993 MachineSDNode *CmpSwap = DAG.getMachineNode(
25994 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
25995 DAG.setNodeMemRefs(CmpSwap, {MemOp});
25996
25997 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
25998 if (DAG.getDataLayout().isBigEndian())
25999 std::swap(SubReg1, SubReg2);
26000 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
26001 SDValue(CmpSwap, 0));
26002 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
26003 SDValue(CmpSwap, 0));
26004 Results.push_back(
26005 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
26006 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
26007 return;
26008 }
26009
26010 unsigned Opcode;
26011 switch (MemOp->getMergedOrdering()) {
26013 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
26014 break;
26016 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
26017 break;
26019 Opcode = AArch64::CMP_SWAP_128_RELEASE;
26020 break;
26023 Opcode = AArch64::CMP_SWAP_128;
26024 break;
26025 default:
26026 llvm_unreachable("Unexpected ordering!");
26027 }
26028
26029 SDLoc DL(N);
26030 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
26031 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
26032 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
26033 New.first, New.second, N->getOperand(0)};
26034 SDNode *CmpSwap = DAG.getMachineNode(
26035 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
26036 Ops);
26037 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
26038
26039 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
26040 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
26041 Results.push_back(SDValue(CmpSwap, 3));
26042}
26043
26044static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
26045 AtomicOrdering Ordering) {
26046 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
26047 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
26048 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
26049 // ATOMIC_LOAD_CLR at any point.
26050 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
26051 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
26052 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
26053 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
26054
26055 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
26056 // The operand will need to be XORed in a separate step.
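 // (LDCLRP atomically clears bits, i.e. it computes Mem & ~Rs, so an AND is
 // obtained by passing the complemented value; the caller performs that XOR.)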
26057 switch (Ordering) {
26059 return AArch64::LDCLRP;
26060 break;
26062 return AArch64::LDCLRPA;
26063 break;
26065 return AArch64::LDCLRPL;
26066 break;
26069 return AArch64::LDCLRPAL;
26070 break;
26071 default:
26072 llvm_unreachable("Unexpected ordering!");
26073 }
26074 }
26075
26076 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
26077 switch (Ordering) {
26079 return AArch64::LDSETP;
26080 break;
26082 return AArch64::LDSETPA;
26083 break;
26085 return AArch64::LDSETPL;
26086 break;
26089 return AArch64::LDSETPAL;
26090 break;
26091 default:
26092 llvm_unreachable("Unexpected ordering!");
26093 }
26094 }
26095
26096 if (ISDOpcode == ISD::ATOMIC_SWAP) {
26097 switch (Ordering) {
26099 return AArch64::SWPP;
26100 break;
26102 return AArch64::SWPPA;
26103 break;
26105 return AArch64::SWPPL;
26106 break;
26109 return AArch64::SWPPAL;
26110 break;
26111 default:
26112 llvm_unreachable("Unexpected ordering!");
26113 }
26114 }
26115
26116 llvm_unreachable("Unexpected ISDOpcode!");
26117}
26118
26121 SelectionDAG &DAG,
26122 const AArch64Subtarget *Subtarget) {
26123 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower them
26124 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
26125 // rather than the CASP instructions, because CASP has register classes for
26126 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
26127 // to present them as single operands. LSE128 instructions use the GPR64
26128 // register class (because the pair does not have to be sequential), like
26129 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
26130
26131 assert(N->getValueType(0) == MVT::i128 &&
26132 "AtomicLoadXXX on types less than 128 should be legal");
26133
26134 if (!Subtarget->hasLSE128())
26135 return;
26136
26137 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
26138 const SDValue &Chain = N->getOperand(0);
26139 const SDValue &Ptr = N->getOperand(1);
26140 const SDValue &Val128 = N->getOperand(2);
26141 std::pair<SDValue, SDValue> Val2x64 =
26142 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
26143
26144 const unsigned ISDOpcode = N->getOpcode();
26145 const unsigned MachineOpcode =
26146 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
26147
26148 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
26149 SDLoc dl(Val128);
26150 Val2x64.first =
26151 DAG.getNode(ISD::XOR, dl, MVT::i64,
26152 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.first);
26153 Val2x64.second =
26154 DAG.getNode(ISD::XOR, dl, MVT::i64,
26155 DAG.getConstant(-1ULL, dl, MVT::i64), Val2x64.second);
26156 }
26157
26158 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
26159 if (DAG.getDataLayout().isBigEndian())
26160 std::swap(Ops[0], Ops[1]);
26161
26162 MachineSDNode *AtomicInst =
26163 DAG.getMachineNode(MachineOpcode, SDLoc(N),
26164 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
26165
26166 DAG.setNodeMemRefs(AtomicInst, {MemOp});
26167
26168 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
26169 if (DAG.getDataLayout().isBigEndian())
26170 std::swap(Lo, Hi);
26171
26172 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
26173 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
26174}
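// Illustrative sketch (assumed IR, not from the upstream file): with +lse128,
//   %old = atomicrmw and ptr %p, i128 %v seq_cst, align 16
// reaches this function as ATOMIC_LOAD_AND; both i64 halves of %v are
// inverted with the XORs above and a single LDCLRPAL then performs the
// atomic AND, returning the previous value in its two result registers.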
26175
26176 void AArch64TargetLowering::ReplaceNodeResults(
26177 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
26178 switch (N->getOpcode()) {
26179 default:
26180 llvm_unreachable("Don't know how to custom expand this");
26181 case ISD::BITCAST:
26182 ReplaceBITCASTResults(N, Results, DAG);
26183 return;
26184 case ISD::VECREDUCE_ADD:
26185 case ISD::VECREDUCE_SMAX:
26186 case ISD::VECREDUCE_SMIN:
26187 case ISD::VECREDUCE_UMAX:
26188 case ISD::VECREDUCE_UMIN:
26189 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
26190 return;
26191 case ISD::ADD:
26192 case ISD::FADD:
26193 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
26194 return;
26195
26196 case ISD::CTPOP:
26197 case ISD::PARITY:
26198 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
26199 Results.push_back(Result);
26200 return;
26201 case AArch64ISD::SADDV:
26202 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
26203 return;
26204 case AArch64ISD::UADDV:
26205 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
26206 return;
26207 case AArch64ISD::SMINV:
26208 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
26209 return;
26210 case AArch64ISD::UMINV:
26211 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
26212 return;
26213 case AArch64ISD::SMAXV:
26214 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
26215 return;
26216 case AArch64ISD::UMAXV:
26217 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
26218 return;
26219 case ISD::MULHS:
26220 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
26221 Results.push_back(
26222 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
26223 return;
26224 case ISD::MULHU:
26225 if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
26226 Results.push_back(
26227 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
26228 return;
26229 case ISD::FP_TO_UINT:
26230 case ISD::FP_TO_SINT:
26231 case ISD::STRICT_FP_TO_SINT:
26232 case ISD::STRICT_FP_TO_UINT:
26233 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
26234 // Let normal code take care of it by not adding anything to Results.
26235 return;
26236 case ISD::ATOMIC_CMP_SWAP:
26237 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
26238 return;
26239 case ISD::ATOMIC_LOAD_CLR:
26240 assert(N->getValueType(0) != MVT::i128 &&
26241 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
26242 break;
26243 case ISD::ATOMIC_LOAD_AND:
26244 case ISD::ATOMIC_LOAD_OR:
26245 case ISD::ATOMIC_SWAP: {
26246 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
26247 "Expected 128-bit atomicrmw.");
26248 // These need custom type legalisation so we go directly to instruction.
26249 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
26250 return;
26251 }
26252 case ISD::ATOMIC_LOAD:
26253 case ISD::LOAD: {
26254 MemSDNode *LoadNode = cast<MemSDNode>(N);
26255 EVT MemVT = LoadNode->getMemoryVT();
26256 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
26257 // targets.
26258 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
26259 MemVT.getSizeInBits() == 256u &&
26260 (MemVT.getScalarSizeInBits() == 8u ||
26261 MemVT.getScalarSizeInBits() == 16u ||
26262 MemVT.getScalarSizeInBits() == 32u ||
26263 MemVT.getScalarSizeInBits() == 64u)) {
26264
26265 SDValue Result = DAG.getMemIntrinsicNode(
26266 AArch64ISD::LDNP, SDLoc(N),
26267 DAG.getVTList({MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
26268 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
26269 MVT::Other}),
26270 {LoadNode->getChain(), LoadNode->getBasePtr()},
26271 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
26272
26273 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
26274 Result.getValue(0), Result.getValue(1));
26275 Results.append({Pair, Result.getValue(2) /* Chain */});
26276 return;
26277 }
26278
26279 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
26280 LoadNode->getMemoryVT() != MVT::i128) {
26281 // Non-volatile or atomic loads are optimized later in AArch64's load/store
26282 // optimizer.
26283 return;
26284 }
26285
26286 if (SDValue(N, 0).getValueType() == MVT::i128) {
26287 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
26288 bool isLoadAcquire =
26289 AN && AN->getSuccessOrdering() == AtomicOrdering::Acquire;
26290 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
26291
26292 if (isLoadAcquire)
26293 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
26294
26295 SDValue Result = DAG.getMemIntrinsicNode(
26296 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26297 {LoadNode->getChain(), LoadNode->getBasePtr()},
26298 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
26299
26300 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
26301
26302 SDValue Pair =
26303 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
26304 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
26305 Results.append({Pair, Result.getValue(2) /* Chain */});
26306 }
26307 return;
26308 }
26309 case ISD::EXTRACT_SUBVECTOR:
26310 ReplaceExtractSubVectorResults(N, Results, DAG);
26311 return;
26312 case ISD::INSERT_SUBVECTOR:
26313 case ISD::CONCAT_VECTORS:
26314 // Custom lowering has been requested for INSERT_SUBVECTOR and
26315 // CONCAT_VECTORS -- but delegate to common code for result type
26316 // legalisation
26317 return;
26318 case ISD::INTRINSIC_WO_CHAIN: {
26319 EVT VT = N->getValueType(0);
26320
26321 Intrinsic::ID IntID =
26322 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
26323 switch (IntID) {
26324 default:
26325 return;
26326 case Intrinsic::aarch64_sve_clasta_n: {
26327 assert((VT == MVT::i8 || VT == MVT::i16) &&
26328 "custom lowering for unexpected type");
26329 SDLoc DL(N);
26330 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26331 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
26332 N->getOperand(1), Op2, N->getOperand(3));
26333 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26334 return;
26335 }
26336 case Intrinsic::aarch64_sve_clastb_n: {
26337 assert((VT == MVT::i8 || VT == MVT::i16) &&
26338 "custom lowering for unexpected type");
26339 SDLoc DL(N);
26340 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
26341 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
26342 N->getOperand(1), Op2, N->getOperand(3));
26343 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26344 return;
26345 }
26346 case Intrinsic::aarch64_sve_lasta: {
26347 assert((VT == MVT::i8 || VT == MVT::i16) &&
26348 "custom lowering for unexpected type");
26349 SDLoc DL(N);
26350 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
26351 N->getOperand(1), N->getOperand(2));
26352 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26353 return;
26354 }
26355 case Intrinsic::aarch64_sve_lastb: {
26356 assert((VT == MVT::i8 || VT == MVT::i16) &&
26357 "custom lowering for unexpected type");
26358 SDLoc DL(N);
26359 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
26360 N->getOperand(1), N->getOperand(2));
26361 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26362 return;
26363 }
26364 case Intrinsic::get_active_lane_mask: {
26365 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
26366 return;
26367
26368 // NOTE: Only trivial type promotion is supported.
26369 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
26370 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
26371 return;
26372
26373 SDLoc DL(N);
26374 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
26375 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
26376 return;
26377 }
26378 }
26379 }
26380 case ISD::READ_REGISTER: {
26381 SDLoc DL(N);
26382 assert(N->getValueType(0) == MVT::i128 &&
26383 "READ_REGISTER custom lowering is only for 128-bit sysregs");
26384 SDValue Chain = N->getOperand(0);
26385 SDValue SysRegName = N->getOperand(1);
26386
26387 SDValue Result = DAG.getNode(
26388 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
26389 Chain, SysRegName);
26390
26391 // Sysregs are not endian. Result.getValue(0) always contains the lower half
26392 // of the 128-bit System Register value.
26393 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
26394 Result.getValue(0), Result.getValue(1));
26395 Results.push_back(Pair);
26396 Results.push_back(Result.getValue(2)); // Chain
26397 return;
26398 }
26399 }
26400}
26401
26402 bool AArch64TargetLowering::useLoadStackGuardNode() const {
26403 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
26404 return TargetLowering::useLoadStackGuardNode();
26405 return true;
26406}
26407
26408unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
26409 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
26410 // reciprocal if there are three or more FDIVs.
26411 return 3;
26412}
26413
26414 TargetLoweringBase::LegalizeTypeAction
26415 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
26416 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
26417 // v4i16, v2i32 instead of to promote.
26418 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
26419 VT == MVT::v1f32)
26420 return TypeWidenVector;
26421
26422 return TargetLoweringBase::getPreferredVectorAction(VT);
26423}
26424
26425// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
26426// provided the address is 16-byte aligned.
26427 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
26428 if (!Subtarget->hasLSE2())
26429 return false;
26430
26431 if (auto LI = dyn_cast<LoadInst>(I))
26432 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26433 LI->getAlign() >= Align(16);
26434
26435 if (auto SI = dyn_cast<StoreInst>(I))
26436 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26437 SI->getAlign() >= Align(16);
26438
26439 return false;
26440}
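// Illustrative sketch (assumed IR, not from the upstream file): with +lse2,
//   %v = load atomic i128, ptr %p monotonic, align 16
// satisfies this predicate and can stay as a single LDP, whereas the same
// load with only "align 8" fails the Align(16) check and takes the generic
// expansion path instead.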
26441
26442 bool AArch64TargetLowering::isOpSuitableForLSE128(const Instruction *I) const {
26443 if (!Subtarget->hasLSE128())
26444 return false;
26445
26446 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
26447 // will clobber the two registers.
26448 if (const auto *SI = dyn_cast<StoreInst>(I))
26449 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26450 SI->getAlign() >= Align(16) &&
26451 (SI->getOrdering() == AtomicOrdering::Release ||
26452 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
26453
26454 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
26455 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26456 RMW->getAlign() >= Align(16) &&
26457 (RMW->getOperation() == AtomicRMWInst::Xchg ||
26458 RMW->getOperation() == AtomicRMWInst::And ||
26459 RMW->getOperation() == AtomicRMWInst::Or);
26460
26461 return false;
26462}
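// Sketch of the intended mapping (assumed, based on the checks above):
// a qualifying 128-bit release/seq_cst store is later rewritten as an
// exchange and selected as SWPP, while atomicrmw xchg/and/or map to
// SWPP/LDCLRP/LDSETP; other RMW operations keep using a CAS or LL/SC loop.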
26463
26464 bool AArch64TargetLowering::isOpSuitableForRCPC3(const Instruction *I) const {
26465 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
26466 return false;
26467
26468 if (auto LI = dyn_cast<LoadInst>(I))
26469 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
26470 LI->getAlign() >= Align(16) &&
26471 LI->getOrdering() == AtomicOrdering::Acquire;
26472
26473 if (auto SI = dyn_cast<StoreInst>(I))
26474 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
26475 SI->getAlign() >= Align(16) &&
26476 SI->getOrdering() == AtomicOrdering::Release;
26477
26478 return false;
26479}
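// Sketch (assumed): with +rcpc3 an acquire i128 load that passes these checks
// is selected as LDIAPP and a release i128 store as STILP, so no extra fence
// or LL/SC loop is required for them.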
26480
26481 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
26482 const Instruction *I) const {
26483 if (isOpSuitableForRCPC3(I))
26484 return false;
26485 if (isOpSuitableForLSE128(I))
26486 return false;
26487 if (isOpSuitableForLDPSTP(I))
26488 return true;
26489 return false;
26490}
26491
26492 bool AArch64TargetLowering::shouldInsertTrailingFenceForAtomicStore(
26493 const Instruction *I) const {
26494 // Store-Release instructions only provide seq_cst guarantees when paired with
26495 // Load-Acquire instructions. MSVC CRT does not use these instructions to
26496 // implement seq_cst loads and stores, so we need additional explicit fences
26497 // after memory writes.
26498 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26499 return false;
26500
26501 switch (I->getOpcode()) {
26502 default:
26503 return false;
26504 case Instruction::AtomicCmpXchg:
26505 return cast<AtomicCmpXchgInst>(I)->getSuccessOrdering() ==
26506 AtomicOrdering::SequentiallyConsistent;
26507 case Instruction::AtomicRMW:
26508 return cast<AtomicRMWInst>(I)->getOrdering() ==
26509 AtomicOrdering::SequentiallyConsistent;
26510 case Instruction::Store:
26511 return cast<StoreInst>(I)->getOrdering() ==
26512 AtomicOrdering::SequentiallyConsistent;
26513 }
26514}
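// Illustrative consequence (assumed): on aarch64-pc-windows-msvc a
// "store atomic i64 ..., ptr %p seq_cst" keeps its STLR but is followed by an
// explicit DMB ISH inserted by AtomicExpand, matching what the MSVC CRT
// expects for seq_cst stores.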
26515
26516// Loads and stores less than 128-bits are already atomic; ones above that
26517// are doomed anyway, so defer to the default libcall and blame the OS when
26518// things go wrong.
26519 TargetLowering::AtomicExpansionKind
26520 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
26521 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
26522 if (Size != 128)
26523 return AtomicExpansionKind::None;
26524 if (isOpSuitableForRCPC3(SI))
26525 return AtomicExpansionKind::None;
26526 if (isOpSuitableForLSE128(SI))
26527 return AtomicExpansionKind::Expand;
26528 if (isOpSuitableForLDPSTP(SI))
26529 return AtomicExpansionKind::None;
26530 return AtomicExpansionKind::Expand;
26531}
26532
26533// Loads and stores less than 128-bits are already atomic; ones above that
26534// are doomed anyway, so defer to the default libcall and blame the OS when
26535// things go wrong.
26536 TargetLowering::AtomicExpansionKind
26537 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
26538 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
26539
26540 if (Size != 128)
26541 return AtomicExpansionKind::None;
26542 if (isOpSuitableForRCPC3(LI))
26543 return AtomicExpansionKind::None;
26544 // No LSE128 loads
26545 if (isOpSuitableForLDPSTP(LI))
26546 return AtomicExpansionKind::None;
26547
26548 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26549 // implement atomicrmw without spilling. If the target address is also on the
26550 // stack and close enough to the spill slot, this can lead to a situation
26551 // where the monitor always gets cleared and the atomic operation can never
26552 // succeed. So at -O0 lower this operation to a CAS loop.
26553 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26554 return AtomicExpansionKind::CmpXChg;
26555
26556 // Using CAS for an atomic load has a better chance of succeeding under high
26557 // contention situations. So use it if available.
26558 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
26559 : AtomicExpansionKind::LLSC;
26560}
26561
26562// The "default" for integer RMW operations is to expand to an LL/SC loop.
26563// However, with the LSE instructions (or outline-atomics mode, which provides
26564// library routines in place of the LSE-instructions), we can directly emit many
26565// operations instead.
26566//
26567// Floating-point operations are always emitted to a cmpxchg loop, because they
26568// may trigger a trap which aborts an LLSC sequence.
26569 TargetLowering::AtomicExpansionKind
26570 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
26571 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
26572 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
26573
26574 if (AI->isFloatingPointOperation())
26575 return AtomicExpansionKind::CmpXChg;
26576
26577 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
26578 (AI->getOperation() == AtomicRMWInst::Xchg ||
26579 AI->getOperation() == AtomicRMWInst::Or ||
26580 AI->getOperation() == AtomicRMWInst::And);
26581 if (CanUseLSE128)
26582 return AtomicExpansionKind::None;
26583
26584 // Nand is not supported in LSE.
26585 // Leave 128 bits to LLSC or CmpXChg.
26586 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
26587 if (Subtarget->hasLSE())
26588 return AtomicExpansionKind::None;
26589 if (Subtarget->outlineAtomics()) {
26590 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
26591 // Don't outline them unless
26592 // (1) high level <atomic> support approved:
26593 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
26594 // (2) low level libgcc and compiler-rt support implemented by:
26595 // min/max outline atomics helpers
26596 if (AI->getOperation() != AtomicRMWInst::Min &&
26597 AI->getOperation() != AtomicRMWInst::Max &&
26598 AI->getOperation() != AtomicRMWInst::UMin &&
26599 AI->getOperation() != AtomicRMWInst::UMax) {
26600 return AtomicExpansionKind::None;
26601 }
26602 }
26603 }
26604
26605 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26606 // implement atomicrmw without spilling. If the target address is also on the
26607 // stack and close enough to the spill slot, this can lead to a situation
26608 // where the monitor always gets cleared and the atomic operation can never
26609 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
26610 // we have a single CAS instruction that can replace the loop.
26611 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
26612 Subtarget->hasLSE())
26613 return AtomicExpansionKind::CmpXChg;
26614
26615 return AtomicExpansionKind::LLSC;
26616}
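// Illustrative sketch (assumed IR, not from the upstream file):
//   %old = atomicrmw add ptr %p, i32 1 seq_cst
// is left intact (None) when +lse is available and selected as LDADDAL,
// while the same operation without LSE is expanded here to an LDAXR/STLXR
// loop (LLSC) or, at -O0, to a CmpXChg-based sequence.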
26617
26618 TargetLowering::AtomicExpansionKind
26619 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
26620 AtomicCmpXchgInst *AI) const {
26621 // If subtarget has LSE, leave cmpxchg intact for codegen.
26622 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
26623 return AtomicExpansionKind::None;
26624 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
26625 // implement cmpxchg without spilling. If the address being exchanged is also
26626 // on the stack and close enough to the spill slot, this can lead to a
26627 // situation where the monitor always gets cleared and the atomic operation
26628 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
26629 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
26630 return AtomicExpansionKind::None;
26631
26632 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
26633 // it.
26634 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
26635 if (Size > 64)
26636 return AtomicExpansionKind::None;
26637
26638 return AtomicExpansionKind::LLSC;
26639}
26640
26641 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
26642 Type *ValueTy, Value *Addr,
26643 AtomicOrdering Ord) const {
26644 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26645 bool IsAcquire = isAcquireOrStronger(Ord);
26646
26647 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
26648 // intrinsic must return {i64, i64} and we have to recombine them into a
26649 // single i128 here.
26650 if (ValueTy->getPrimitiveSizeInBits() == 128) {
26651 Intrinsic::ID Int =
26652 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
26653 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
26654
26655 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
26656
26657 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
26658 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
26659 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
26660 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
26661 return Builder.CreateOr(
26662 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
26663 }
26664
26665 Type *Tys[] = { Addr->getType() };
26666 Intrinsic::ID Int =
26667 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
26668 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
26669
26670 const DataLayout &DL = M->getDataLayout();
26671 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
26672 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
26673 CI->addParamAttr(
26674 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
26675 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
26676
26677 return Builder.CreateBitCast(Trunc, ValueTy);
26678}
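// Illustrative IR produced by the 128-bit path above (assumed shape):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(ptr %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
// followed by zext/shl/or to rebuild the i128 value.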
26679
26680 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
26681 IRBuilderBase &Builder) const {
26682 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26683 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
26684}
26685
26686 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
26687 Value *Val, Value *Addr,
26688 AtomicOrdering Ord) const {
26689 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
26690 bool IsRelease = isReleaseOrStronger(Ord);
26691
26692 // Since the intrinsics must have legal type, the i128 intrinsics take two
26693 // parameters: "i64, i64". We must marshal Val into the appropriate form
26694 // before the call.
26695 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
26696 Intrinsic::ID Int =
26697 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
26698 Function *Stxr = Intrinsic::getDeclaration(M, Int);
26699 Type *Int64Ty = Type::getInt64Ty(M->getContext());
26700
26701 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
26702 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
26703 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
26704 }
26705
26706 Intrinsic::ID Int =
26707 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
26708 Type *Tys[] = { Addr->getType() };
26709 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
26710
26711 const DataLayout &DL = M->getDataLayout();
26712 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
26713 Val = Builder.CreateBitCast(Val, IntValTy);
26714
26715 CallInst *CI = Builder.CreateCall(
26716 Stxr, {Builder.CreateZExtOrBitCast(
26717 Val, Stxr->getFunctionType()->getParamType(0)),
26718 Addr});
26719 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
26720 Attribute::ElementType, Val->getType()));
26721 return CI;
26722}
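// Illustrative IR produced by the 128-bit path above (assumed shape):
//   %lo = trunc i128 %val to i64
//   %hi = trunc i128 (lshr i128 %val, 64) to i64
//   %status = call i32 @llvm.aarch64.stlxp(i64 %lo, i64 %hi, ptr %addr)
// where a non-zero %status means the store-exclusive failed and the LL/SC
// loop must retry.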
26723
26724 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
26725 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
26726 const DataLayout &DL) const {
26727 if (!Ty->isArrayTy()) {
26728 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
26729 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
26730 }
26731
26732 // All non aggregate members of the type must have the same type
26733 SmallVector<EVT> ValueVTs;
26734 ComputeValueVTs(*this, DL, Ty, ValueVTs);
26735 return all_equal(ValueVTs);
26736}
26737
26738bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
26739 EVT) const {
26740 return false;
26741}
26742
26743static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
26744 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
26745 Function *ThreadPointerFunc =
26746 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
26747 return IRB.CreatePointerCast(
26748 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
26749 Offset),
26750 IRB.getPtrTy(0));
26751}
26752
26753 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
26754 // Android provides a fixed TLS slot for the stack cookie. See the definition
26755 // of TLS_SLOT_STACK_GUARD in
26756 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
26757 if (Subtarget->isTargetAndroid())
26758 return UseTlsOffset(IRB, 0x28);
26759
26760 // Fuchsia is similar.
26761 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
26762 if (Subtarget->isTargetFuchsia())
26763 return UseTlsOffset(IRB, -0x10);
26764
26765 return TargetLowering::getIRStackGuard(IRB);
26766}
26767
26768 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
26769 // MSVC CRT provides functionalities for stack protection.
26770 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
26771 // MSVC CRT has a global variable holding security cookie.
26772 M.getOrInsertGlobal("__security_cookie",
26773 PointerType::getUnqual(M.getContext()));
26774
26775 // MSVC CRT has a function to validate security cookie.
26776 FunctionCallee SecurityCheckCookie =
26777 M.getOrInsertFunction(Subtarget->getSecurityCheckCookieName(),
26778 Type::getVoidTy(M.getContext()),
26779 PointerType::getUnqual(M.getContext()));
26780 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
26781 F->setCallingConv(CallingConv::Win64);
26782 F->addParamAttr(0, Attribute::AttrKind::InReg);
26783 }
26784 return;
26785 }
26786 TargetLowering::insertSSPDeclarations(M);
26787}
26788
26789 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
26790 // MSVC CRT has a global variable holding security cookie.
26791 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26792 return M.getGlobalVariable("__security_cookie");
26793 return TargetLowering::getSDagStackGuard(M);
26794}
26795
26796 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
26797 // MSVC CRT has a function to validate security cookie.
26798 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
26799 return M.getFunction(Subtarget->getSecurityCheckCookieName());
26800 return TargetLowering::getSSPStackGuardCheck(M);
26801}
26802
26803Value *
26804 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
26805 // Android provides a fixed TLS slot for the SafeStack pointer. See the
26806 // definition of TLS_SLOT_SAFESTACK in
26807 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
26808 if (Subtarget->isTargetAndroid())
26809 return UseTlsOffset(IRB, 0x48);
26810
26811 // Fuchsia is similar.
26812 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
26813 if (Subtarget->isTargetFuchsia())
26814 return UseTlsOffset(IRB, -0x8);
26815
26816 return TargetLowering::getSafeStackPointerLocation(IRB);
26817}
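// Sketch of the address computed by UseTlsOffset for these hooks (assumed):
// roughly "(i8*)llvm.thread.pointer() + Offset", i.e. MRS TPIDR_EL0 plus a
// fixed slot offset (0x28/0x48 on Android, -0x10/-0x8 on Fuchsia), so the
// stack cookie and unsafe-stack pointer live at well-known TLS slots.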
26818
26819 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
26820 const Instruction &AndI) const {
26821 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
26822 // this is likely to be fold the and/cmp/br into a single tbz instruction. It
26823 // may be beneficial to sink in other cases, but we would have to check that
26824 // the cmp would not get folded into the br to form a cbz for these to be
26825 // beneficial.
26826 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
26827 if (!Mask)
26828 return false;
26829 return Mask->getValue().isPowerOf2();
26830}
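// Illustrative motivation (assumed example): for
//   if ((x & 0x10) == 0) ...
// keeping the AND next to the compare lets ISel fold the whole test into a
// single TBZ/TBNZ on bit 4, which is why only power-of-two masks are sunk.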
26831
26832 bool AArch64TargetLowering::
26833 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26834 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
26835 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
26836 SelectionDAG &DAG) const {
26837 // Does baseline recommend not to perform the fold by default?
26838 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
26839 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
26840 return false;
26841 // Else, if this is a vector shift, prefer 'shl'.
26842 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
26843}
26844
26845 TargetLowering::ShiftLegalizationStrategy
26846 AArch64TargetLowering::preferredShiftLegalizationStrategy(
26847 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
26848 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
26849 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
26850 return ShiftLegalizationStrategy::LowerToLibcall;
26851 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
26852 ExpansionFactor);
26853}
26854
26855 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
26856 // Update IsSplitCSR in AArch64FunctionInfo.
26857 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
26858 AFI->setIsSplitCSR(true);
26859}
26860
26861 void AArch64TargetLowering::insertCopiesSplitCSR(
26862 MachineBasicBlock *Entry,
26863 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
26864 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
26865 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
26866 if (!IStart)
26867 return;
26868
26869 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
26870 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
26871 MachineBasicBlock::iterator MBBI = Entry->begin();
26872 for (const MCPhysReg *I = IStart; *I; ++I) {
26873 const TargetRegisterClass *RC = nullptr;
26874 if (AArch64::GPR64RegClass.contains(*I))
26875 RC = &AArch64::GPR64RegClass;
26876 else if (AArch64::FPR64RegClass.contains(*I))
26877 RC = &AArch64::FPR64RegClass;
26878 else
26879 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
26880
26881 Register NewVR = MRI->createVirtualRegister(RC);
26882 // Create copy from CSR to a virtual register.
26883 // FIXME: this currently does not emit CFI pseudo-instructions, it works
26884 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
26885 // nounwind. If we want to generalize this later, we may need to emit
26886 // CFI pseudo-instructions.
26887 assert(Entry->getParent()->getFunction().hasFnAttribute(
26888 Attribute::NoUnwind) &&
26889 "Function should be nounwind in insertCopiesSplitCSR!");
26890 Entry->addLiveIn(*I);
26891 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
26892 .addReg(*I);
26893
26894 // Insert the copy-back instructions right before the terminator.
26895 for (auto *Exit : Exits)
26896 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
26897 TII->get(TargetOpcode::COPY), *I)
26898 .addReg(NewVR);
26899 }
26900}
26901
26902 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
26903 // Integer division on AArch64 is expensive. However, when aggressively
26904 // optimizing for code size, we prefer to use a div instruction, as it is
26905 // usually smaller than the alternative sequence.
26906 // The exception to this is vector division. Since AArch64 doesn't have vector
26907 // integer division, leaving the division as-is is a loss even in terms of
26908 // size, because it will have to be scalarized, while the alternative code
26909 // sequence can be performed in vector form.
26910 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
26911 return OptSize && !VT.isVector();
26912}
26913
26914 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
26915 // We want inc-of-add for scalars and sub-of-not for vectors.
26916 return VT.isScalarInteger();
26917}
26918
26919 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
26920 EVT VT) const {
26921 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
26922 // legalize.
26923 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
26924 return false;
26925 if (FPVT == MVT::v8bf16)
26926 return false;
26927 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
26928}
26929
26930 MachineInstr *
26931 AArch64TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
26932 MachineBasicBlock::iterator &MBBI,
26933 const TargetInstrInfo *TII) const {
26934 assert(MBBI->isCall() && MBBI->getCFIType() &&
26935 "Invalid call instruction for a KCFI check");
26936
26937 switch (MBBI->getOpcode()) {
26938 case AArch64::BLR:
26939 case AArch64::BLRNoIP:
26940 case AArch64::TCRETURNri:
26941 case AArch64::TCRETURNrix16x17:
26942 case AArch64::TCRETURNrix17:
26943 case AArch64::TCRETURNrinotx16:
26944 break;
26945 default:
26946 llvm_unreachable("Unexpected CFI call opcode");
26947 }
26948
26949 MachineOperand &Target = MBBI->getOperand(0);
26950 assert(Target.isReg() && "Invalid target operand for an indirect call");
26951 Target.setIsRenamable(false);
26952
26953 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
26954 .addReg(Target.getReg())
26955 .addImm(MBBI->getCFIType())
26956 .getInstr();
26957}
26958
26959 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
26960 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
26961}
26962
26963unsigned
26964 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
26965 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
26966 return getPointerTy(DL).getSizeInBits();
26967
26968 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
26969}
26970
26971void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
26972 MachineFrameInfo &MFI = MF.getFrameInfo();
26973 // If we have any vulnerable SVE stack objects then the stack protector
26974 // needs to be placed at the top of the SVE stack area, as the SVE locals
26975 // are placed above the other locals, so we allocate it as if it were a
26976 // scalable vector.
26977 // FIXME: It may be worthwhile having a specific interface for this rather
26978 // than doing it here in finalizeLowering.
26979 if (MFI.hasStackProtectorIndex()) {
26980 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
26981 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
26982 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
26983 MFI.setStackID(MFI.getStackProtectorIndex(),
26984 TargetStackID::ScalableVector);
26985 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
26986 break;
26987 }
26988 }
26989 }
26990
26991 TargetLoweringBase::finalizeLowering(MF);
26992}
26993
26994// Unlike X86, we let frame lowering assign offsets to all catch objects.
26995 bool AArch64TargetLowering::needsFixedCatchObjects() const {
26996 return false;
26997}
26998
26999bool AArch64TargetLowering::shouldLocalize(
27000 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
27001 auto &MF = *MI.getMF();
27002 auto &MRI = MF.getRegInfo();
27003 auto maxUses = [](unsigned RematCost) {
27004 // A cost of 1 means remats are basically free.
27005 if (RematCost == 1)
27006 return std::numeric_limits<unsigned>::max();
27007 if (RematCost == 2)
27008 return 2U;
27009
27010 // Remat is too expensive, only sink if there's one user.
27011 if (RematCost > 2)
27012 return 1U;
27013 llvm_unreachable("Unexpected remat cost");
27014 };
27015
27016 unsigned Opc = MI.getOpcode();
27017 switch (Opc) {
27018 case TargetOpcode::G_GLOBAL_VALUE: {
27019 // On Darwin, TLS global vars get selected into function calls, which
27020 // we don't want localized, as they can get moved into the middle of a
27021 // another call sequence.
27022 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
27023 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
27024 return false;
27025 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
27026 }
27027 case TargetOpcode::G_FCONSTANT:
27028 case TargetOpcode::G_CONSTANT: {
27029 const ConstantInt *CI;
27030 unsigned AdditionalCost = 0;
27031
27032 if (Opc == TargetOpcode::G_CONSTANT)
27033 CI = MI.getOperand(1).getCImm();
27034 else {
27035 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
27036 // We try to estimate cost of 32/64b fpimms, as they'll likely be
27037 // materialized as integers.
27038 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
27039 break;
27040 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
27041 bool OptForSize =
27042 MF.getFunction().hasOptSize() || MF.getFunction().hasMinSize();
27043 if (isFPImmLegal(APF, EVT::getFloatingPointVT(Ty.getScalarSizeInBits()),
27044 OptForSize))
27045 return true; // Constant should be cheap.
27046 CI =
27047 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
27048 // FP materialization also costs an extra move, from gpr to fpr.
27049 AdditionalCost = 1;
27050 }
27051 APInt Imm = CI->getValue();
27052 InstructionCost Cost = TTI->getIntImmCost(
27053 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
27054 assert(Cost.isValid() && "Expected a valid imm cost");
27055
27056 unsigned RematCost = *Cost.getValue();
27057 RematCost += AdditionalCost;
27058 Register Reg = MI.getOperand(0).getReg();
27059 unsigned MaxUses = maxUses(RematCost);
27060 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
27061 if (MaxUses == std::numeric_limits<unsigned>::max())
27062 --MaxUses;
27063 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
27064 }
27065 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
27066 // localizable.
27067 case AArch64::ADRP:
27068 case AArch64::G_ADD_LOW:
27069 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
27070 case TargetOpcode::G_PTR_ADD:
27071 return true;
27072 default:
27073 break;
27074 }
27075 return TargetLoweringBase::shouldLocalize(MI, TTI);
27076}
27077
27078 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
27079 // Fallback for scalable vectors.
27080 // Note that if EnableSVEGISel is true, we allow scalable vector types for
27081 // all instructions, regardless of whether they are actually supported.
27082 if (!EnableSVEGISel) {
27083 if (Inst.getType()->isScalableTy()) {
27084 return true;
27085 }
27086
27087 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
27088 if (Inst.getOperand(i)->getType()->isScalableTy())
27089 return true;
27090
27091 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
27092 if (AI->getAllocatedType()->isScalableTy())
27093 return true;
27094 }
27095 }
27096
27097 // Checks to allow the use of SME instructions
27098 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
27099 auto CallerAttrs = SMEAttrs(*Inst.getFunction());
27100 auto CalleeAttrs = SMEAttrs(*Base);
27101 if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
27102 CallerAttrs.requiresLazySave(CalleeAttrs) ||
27103 CallerAttrs.requiresPreservingZT0(CalleeAttrs))
27104 return true;
27105 }
27106 return false;
27107}
27108
27109// Return the largest legal scalable vector type that matches VT's element type.
27110 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
27111 assert(VT.isFixedLengthVector() &&
27112 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27113 "Expected legal fixed length vector!");
27114 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
27115 default:
27116 llvm_unreachable("unexpected element type for SVE container");
27117 case MVT::i8:
27118 return EVT(MVT::nxv16i8);
27119 case MVT::i16:
27120 return EVT(MVT::nxv8i16);
27121 case MVT::i32:
27122 return EVT(MVT::nxv4i32);
27123 case MVT::i64:
27124 return EVT(MVT::nxv2i64);
27125 case MVT::bf16:
27126 return EVT(MVT::nxv8bf16);
27127 case MVT::f16:
27128 return EVT(MVT::nxv8f16);
27129 case MVT::f32:
27130 return EVT(MVT::nxv4f32);
27131 case MVT::f64:
27132 return EVT(MVT::nxv2f64);
27133 }
27134}
27135
27136 // Return a PTRUE with active lanes corresponding to the extent of VT.
27137 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
27138 EVT VT) {
27139 assert(VT.isFixedLengthVector() &&
27140 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27141 "Expected legal fixed length vector!");
27142
27143 std::optional<unsigned> PgPattern =
27144 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
27145 assert(PgPattern && "Unexpected element count for SVE predicate");
27146
27147 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
27148 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
27149 // variants of instructions when available.
27150 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
27151 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
27152 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
27153 if (MaxSVESize && MinSVESize == MaxSVESize &&
27154 MaxSVESize == VT.getSizeInBits())
27155 PgPattern = AArch64SVEPredPattern::all;
27156
27157 MVT MaskVT;
27158 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
27159 default:
27160 llvm_unreachable("unexpected element type for SVE predicate");
27161 case MVT::i8:
27162 MaskVT = MVT::nxv16i1;
27163 break;
27164 case MVT::i16:
27165 case MVT::f16:
27166 case MVT::bf16:
27167 MaskVT = MVT::nxv8i1;
27168 break;
27169 case MVT::i32:
27170 case MVT::f32:
27171 MaskVT = MVT::nxv4i1;
27172 break;
27173 case MVT::i64:
27174 case MVT::f64:
27175 MaskVT = MVT::nxv2i1;
27176 break;
27177 }
27178
27179 return getPTrue(DAG, DL, MaskVT, *PgPattern);
27180}
27181
27182 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
27183 EVT VT) {
27184 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
27185 "Expected legal scalable vector!");
27186 auto PredTy = VT.changeVectorElementType(MVT::i1);
27187 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
27188}
27189
27190 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
27191 if (VT.isFixedLengthVector())
27192 return getPredicateForFixedLengthVector(DAG, DL, VT);
27193
27194 return getPredicateForScalableVector(DAG, DL, VT);
27195}
27196
27197// Grow V to consume an entire SVE register.
27198 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
27199 assert(VT.isScalableVector() &&
27200 "Expected to convert into a scalable vector!");
27201 assert(V.getValueType().isFixedLengthVector() &&
27202 "Expected a fixed length vector operand!");
27203 SDLoc DL(V);
27204 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27205 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
27206}
27207
27208 // Shrink V so it's just big enough to maintain a VT's worth of data.
27209 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
27210 assert(VT.isFixedLengthVector() &&
27211 "Expected to convert into a fixed length vector!");
27212 assert(V.getValueType().isScalableVector() &&
27213 "Expected a scalable vector operand!");
27214 SDLoc DL(V);
27215 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27216 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
27217}
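// Sketch of the fixed<->scalable bridging used below (assumed example):
// a legal v4i32 value is placed in an nxv4i32 container with
// INSERT_SUBVECTOR at index 0, operated on under a PTRUE predicate sized by
// getPredicateForFixedLengthVector (e.g. VL4), and the result is read back
// with EXTRACT_SUBVECTOR at index 0.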
27218
27219// Convert all fixed length vector loads larger than NEON to masked_loads.
27220SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
27221 SDValue Op, SelectionDAG &DAG) const {
27222 auto Load = cast<LoadSDNode>(Op);
27223
27224 SDLoc DL(Op);
27225 EVT VT = Op.getValueType();
27226 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27227 EVT LoadVT = ContainerVT;
27228 EVT MemVT = Load->getMemoryVT();
27229
27230 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27231
27232 if (VT.isFloatingPoint()) {
27233 LoadVT = ContainerVT.changeTypeToInteger();
27234 MemVT = MemVT.changeTypeToInteger();
27235 }
27236
27237 SDValue NewLoad = DAG.getMaskedLoad(
27238 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
27239 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
27240 Load->getAddressingMode(), Load->getExtensionType());
27241
27242 SDValue Result = NewLoad;
27243 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
27244 EVT ExtendVT = ContainerVT.changeVectorElementType(
27245 Load->getMemoryVT().getVectorElementType());
27246
27247 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
27248 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27249 Pg, Result, DAG.getUNDEF(ContainerVT));
27250 } else if (VT.isFloatingPoint()) {
27251 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
27252 }
27253
27254 Result = convertFromScalableVector(DAG, VT, Result);
27255 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
27256 return DAG.getMergeValues(MergedValues, DL);
27257}
27258
27259 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
27260 SelectionDAG &DAG) {
27261 SDLoc DL(Mask);
27262 EVT InVT = Mask.getValueType();
27263 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27264
27265 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27266
27267 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27268 return Pg;
27269
27270 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
27271 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
27272
27273 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
27274 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
27275}
27276
27277// Convert all fixed length vector loads larger than NEON to masked_loads.
27278SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
27279 SDValue Op, SelectionDAG &DAG) const {
27280 auto Load = cast<MaskedLoadSDNode>(Op);
27281
27282 SDLoc DL(Op);
27283 EVT VT = Op.getValueType();
27284 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27285
27286 SDValue Mask = Load->getMask();
27287 // If this is an extending load and the mask type is not the same as
27288 // load's type then we have to extend the mask type.
27289 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
27290 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
27291 "Incorrect mask type");
27292 Mask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Mask);
27293 }
27294 Mask = convertFixedMaskToScalableVector(Mask, DAG);
27295
27296 SDValue PassThru;
27297 bool IsPassThruZeroOrUndef = false;
27298
27299 if (Load->getPassThru()->isUndef()) {
27300 PassThru = DAG.getUNDEF(ContainerVT);
27301 IsPassThruZeroOrUndef = true;
27302 } else {
27303 if (ContainerVT.isInteger())
27304 PassThru = DAG.getConstant(0, DL, ContainerVT);
27305 else
27306 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
27307 if (isZerosVector(Load->getPassThru().getNode()))
27308 IsPassThruZeroOrUndef = true;
27309 }
27310
27311 SDValue NewLoad = DAG.getMaskedLoad(
27312 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
27313 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
27314 Load->getAddressingMode(), Load->getExtensionType());
27315
27316 SDValue Result = NewLoad;
27317 if (!IsPassThruZeroOrUndef) {
27318 SDValue OldPassThru =
27319 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
27320 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
27321 }
27322
27323 Result = convertFromScalableVector(DAG, VT, Result);
27324 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
27325 return DAG.getMergeValues(MergedValues, DL);
27326}
27327
27328// Convert all fixed length vector stores larger than NEON to masked_stores.
27329SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
27330 SDValue Op, SelectionDAG &DAG) const {
27331 auto Store = cast<StoreSDNode>(Op);
27332
27333 SDLoc DL(Op);
27334 EVT VT = Store->getValue().getValueType();
27335 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27336 EVT MemVT = Store->getMemoryVT();
27337
27338 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27339 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27340
27341 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
27342 EVT TruncVT = ContainerVT.changeVectorElementType(
27343 Store->getMemoryVT().getVectorElementType());
27344 MemVT = MemVT.changeTypeToInteger();
27345 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
27346 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
27347 DAG.getUNDEF(TruncVT));
27348 NewValue =
27349 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27350 } else if (VT.isFloatingPoint()) {
27351 MemVT = MemVT.changeTypeToInteger();
27352 NewValue =
27353 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
27354 }
27355
27356 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
27357 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
27358 Store->getMemOperand(), Store->getAddressingMode(),
27359 Store->isTruncatingStore());
27360}
27361
27362SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
27363 SDValue Op, SelectionDAG &DAG) const {
27364 auto *Store = cast<MaskedStoreSDNode>(Op);
27365
27366 SDLoc DL(Op);
27367 EVT VT = Store->getValue().getValueType();
27368 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27369
27370 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
27371 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
27372
27373 return DAG.getMaskedStore(
27374 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
27375 Mask, Store->getMemoryVT(), Store->getMemOperand(),
27376 Store->getAddressingMode(), Store->isTruncatingStore());
27377}
27378
27379SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
27380 SDValue Op, SelectionDAG &DAG) const {
27381 SDLoc dl(Op);
27382 EVT VT = Op.getValueType();
27383 EVT EltVT = VT.getVectorElementType();
27384
27385 bool Signed = Op.getOpcode() == ISD::SDIV;
27386 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
27387
27388 bool Negated;
27389 uint64_t SplatVal;
27390 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
27391 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27392 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27393 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
27394
27395 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
27396 SDValue Res =
27397 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
27398 if (Negated)
27399 Res = DAG.getNode(ISD::SUB, dl, ContainerVT,
27400 DAG.getConstant(0, dl, ContainerVT), Res);
27401
27402 return convertFromScalableVector(DAG, VT, Res);
27403 }
27404
27405 // Scalable vector i32/i64 DIV is supported.
27406 if (EltVT == MVT::i32 || EltVT == MVT::i64)
27407 return LowerToPredicatedOp(Op, DAG, PredOpcode);
27408
27409 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
27410 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
27411 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
27412 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
27413
27414 // If the wider type is legal: extend, op, and truncate.
27415 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
27416 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
27417 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(0));
27418 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WideVT, Op.getOperand(1));
27419 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WideVT, Op0, Op1);
27420 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
27421 }
27422
27423 auto HalveAndExtendVector = [&DAG, &dl, &HalfVT, &PromVT,
27424 &ExtendOpcode](SDValue Op) {
27425 SDValue IdxZero = DAG.getConstant(0, dl, MVT::i64);
27426 SDValue IdxHalf =
27427 DAG.getConstant(HalfVT.getVectorNumElements(), dl, MVT::i64);
27428 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxZero);
27429 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Op, IdxHalf);
27430 return std::pair<SDValue, SDValue>(
27431 {DAG.getNode(ExtendOpcode, dl, PromVT, Lo),
27432 DAG.getNode(ExtendOpcode, dl, PromVT, Hi)});
27433 };
27434
27435 // If wider type is not legal: split, extend, op, trunc and concat.
27436 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
27437 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
27438 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0LoExt, Op1LoExt);
27439 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, PromVT, Op0HiExt, Op1HiExt);
27440 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Lo);
27441 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, dl, HalfVT, Hi);
27442 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, {LoTrunc, HiTrunc});
27443}
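// Worked example (assumed types): a fixed-length v16i8 sdiv has no SVE
// divider of its own, so each operand is split into halves and extended to
// the next wider element type, repeating until a 32-bit element SDIV_PRED is
// legal; the partial results are truncated back and concatenated again.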
27444
27445SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
27446 SDValue Op, SelectionDAG &DAG) const {
27447 EVT VT = Op.getValueType();
27448 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27449
27450 SDLoc DL(Op);
27451 SDValue Val = Op.getOperand(0);
27452 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27453 Val = convertToScalableVector(DAG, ContainerVT, Val);
27454
27455 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
27456 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
27457
27458 // Repeatedly unpack Val until the result is of the desired element type.
27459 switch (ContainerVT.getSimpleVT().SimpleTy) {
27460 default:
27461 llvm_unreachable("unimplemented container type");
27462 case MVT::nxv16i8:
27463 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
27464 if (VT.getVectorElementType() == MVT::i16)
27465 break;
27466 [[fallthrough]];
27467 case MVT::nxv8i16:
27468 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
27469 if (VT.getVectorElementType() == MVT::i32)
27470 break;
27471 [[fallthrough]];
27472 case MVT::nxv4i32:
27473 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
27474 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
27475 break;
27476 }
27477
27478 return convertFromScalableVector(DAG, VT, Val);
27479}
27480
27481SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
27482 SDValue Op, SelectionDAG &DAG) const {
27483 EVT VT = Op.getValueType();
27484 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27485
27486 SDLoc DL(Op);
27487 SDValue Val = Op.getOperand(0);
27488 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
27489 Val = convertToScalableVector(DAG, ContainerVT, Val);
27490
27491 // Repeatedly truncate Val until the result is of the desired element type.
27492 switch (ContainerVT.getSimpleVT().SimpleTy) {
27493 default:
27494 llvm_unreachable("unimplemented container type");
27495 case MVT::nxv2i64:
27496 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
27497 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
27498 if (VT.getVectorElementType() == MVT::i32)
27499 break;
27500 [[fallthrough]];
27501 case MVT::nxv4i32:
27502 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
27503 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
27504 if (VT.getVectorElementType() == MVT::i16)
27505 break;
27506 [[fallthrough]];
27507 case MVT::nxv8i16:
27508 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
27509 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
27510 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
27511 break;
27512 }
27513
27514 return convertFromScalableVector(DAG, VT, Val);
27515}
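// Note on the UZP1 trick above (assumed example): truncating nxv2i64 to i32
// elements bitcasts to nxv4i32 and keeps the even-numbered elements with
// UZP1, which in the register layout used here are the low halves of each
// 64-bit lane; repeating the step narrows further to i16 and then i8.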
27516
27517SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
27518 SDValue Op, SelectionDAG &DAG) const {
27519 EVT VT = Op.getValueType();
27520 EVT InVT = Op.getOperand(0).getValueType();
27521 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
27522
27523 SDLoc DL(Op);
27524 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27525 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27526
27527 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
27528}
27529
27530SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
27531 SDValue Op, SelectionDAG &DAG) const {
27532 EVT VT = Op.getValueType();
27533 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27534
27535 SDLoc DL(Op);
27536 EVT InVT = Op.getOperand(0).getValueType();
27537 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27538 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
27539
27540 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
27541 Op.getOperand(1), Op.getOperand(2));
27542
27543 return convertFromScalableVector(DAG, VT, ScalableRes);
27544}
27545
27546// Convert vector operation 'Op' to an equivalent predicated operation whereby
27547// the original operation's type is used to construct a suitable predicate.
27548// NOTE: The results for inactive lanes are undefined.
27549SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
27550 SelectionDAG &DAG,
27551 unsigned NewOp) const {
27552 EVT VT = Op.getValueType();
27553 SDLoc DL(Op);
27554 auto Pg = getPredicateForVector(DAG, DL, VT);
27555
27556 if (VT.isFixedLengthVector()) {
27557 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
27558 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27559
27560 // Create list of operands by converting existing ones to scalable types.
27561 SmallVector<SDValue, 4> Operands = {Pg};
27562 for (const SDValue &V : Op->op_values()) {
27563 if (isa<CondCodeSDNode>(V)) {
27564 Operands.push_back(V);
27565 continue;
27566 }
27567
27568 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
27569 EVT VTArg = VTNode->getVT().getVectorElementType();
27570 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
27571 Operands.push_back(DAG.getValueType(NewVTArg));
27572 continue;
27573 }
27574
27575 assert(isTypeLegal(V.getValueType()) &&
27576 "Expected only legal fixed-width types");
27577 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
27578 }
27579
27580 if (isMergePassthruOpcode(NewOp))
27581 Operands.push_back(DAG.getUNDEF(ContainerVT));
27582
27583 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
27584 return convertFromScalableVector(DAG, VT, ScalableRes);
27585 }
27586
27587 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
27588
27589 SmallVector<SDValue, 4> Operands = {Pg};
27590 for (const SDValue &V : Op->op_values()) {
27591 assert((!V.getValueType().isVector() ||
27592 V.getValueType().isScalableVector()) &&
27593 "Only scalable vectors are supported!");
27594 Operands.push_back(V);
27595 }
27596
27597 if (isMergePassthruOpcode(NewOp))
27598 Operands.push_back(DAG.getUNDEF(VT));
27599
27600 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
27601}
27602
27603// If a fixed length vector operation has no side effects when applied to
27604// undefined elements, we can safely use scalable vectors to perform the same
27605// operation without needing to worry about predication.
27606SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
27607 SelectionDAG &DAG) const {
27608 EVT VT = Op.getValueType();
27609 assert(VT.isFixedLengthVector() && isTypeLegal(VT) &&
27610 "Only expected to lower fixed length vector operation!");
27611 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27612
27613 // Create list of operands by converting existing ones to scalable types.
27614 SmallVector<SDValue, 4> Ops;
27615 for (const SDValue &V : Op->op_values()) {
27616 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
27617
27618 // Pass through non-vector operands.
27619 if (!V.getValueType().isVector()) {
27620 Ops.push_back(V);
27621 continue;
27622 }
27623
27624 // "cast" fixed length vector to a scalable vector.
27625 assert(V.getValueType().isFixedLengthVector() &&
27626 isTypeLegal(V.getValueType()) &&
27627 "Only fixed length vectors are supported!");
27628 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
27629 }
27630
27631 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
27632 return convertFromScalableVector(DAG, VT, ScalableRes);
27633}
27634
27635SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
27636 SelectionDAG &DAG) const {
27637 SDLoc DL(ScalarOp);
27638 SDValue AccOp = ScalarOp.getOperand(0);
27639 SDValue VecOp = ScalarOp.getOperand(1);
27640 EVT SrcVT = VecOp.getValueType();
27641 EVT ResVT = SrcVT.getVectorElementType();
27642
27643 EVT ContainerVT = SrcVT;
27644 if (SrcVT.isFixedLengthVector()) {
27645 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27646 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27647 }
27648
27649 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27650 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27651
27652 // Convert operands to Scalable.
27653 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
27654 DAG.getUNDEF(ContainerVT), AccOp, Zero);
27655
27656 // Perform reduction.
27657 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
27658 Pg, AccOp, VecOp);
27659
27660 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
27661}
27662
27663SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
27664 SelectionDAG &DAG) const {
27665 SDLoc DL(ReduceOp);
27666 SDValue Op = ReduceOp.getOperand(0);
27667 EVT OpVT = Op.getValueType();
27668 EVT VT = ReduceOp.getValueType();
27669
27670 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
27671 return SDValue();
27672
27673 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
27674
27675 switch (ReduceOp.getOpcode()) {
27676 default:
27677 return SDValue();
27678 case ISD::VECREDUCE_OR:
27679 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
27680 // The predicate can be 'Op' because
27681 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
27682 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
27683 else
27684 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
27685 case ISD::VECREDUCE_AND: {
27686 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
27687 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
27688 }
27689 case ISD::VECREDUCE_XOR: {
27690 SDValue ID =
27691 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
27692 if (OpVT == MVT::nxv1i1) {
27693 // Emulate a CNTP on .Q using .D and a different governing predicate.
27694 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
27695 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
27696 }
27697 SDValue Cntp =
27698 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
27699 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
27700 }
27701 }
27702
27703 return SDValue();
27704}
27705
27706SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
27707 SDValue ScalarOp,
27708 SelectionDAG &DAG) const {
27709 SDLoc DL(ScalarOp);
27710 SDValue VecOp = ScalarOp.getOperand(0);
27711 EVT SrcVT = VecOp.getValueType();
27712
27713 if (useSVEForFixedLengthVectorVT(
27714 SrcVT,
27715 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
27716 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
27717 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
27718 }
27719
27720 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
27721 if (ScalarOp.getOpcode() == ISD::VECREDUCE_ADD &&
27722 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
27723 SDValue BoolVec = VecOp.getOperand(0);
27724 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
27725 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
27726 SDValue CntpOp = DAG.getNode(
27727 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
27728 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
27729 BoolVec, BoolVec);
27730 return DAG.getAnyExtOrTrunc(CntpOp, DL, ScalarOp.getValueType());
27731 }
27732 }
27733
27734 // UADDV always returns an i64 result.
27735 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
27736 SrcVT.getVectorElementType();
27737 EVT RdxVT = SrcVT;
27738 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
27739 RdxVT = getPackedSVEVectorVT(ResVT);
27740
27741 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
27742 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
27743 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
27744 Rdx, DAG.getConstant(0, DL, MVT::i64));
27745
27746 // The VEC_REDUCE nodes expect an element size result.
27747 if (ResVT != ScalarOp.getValueType())
27748 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
27749
27750 return Res;
27751}
27752
27753SDValue
27754AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
27755 SelectionDAG &DAG) const {
27756 EVT VT = Op.getValueType();
27757 SDLoc DL(Op);
27758
27759 EVT InVT = Op.getOperand(1).getValueType();
27760 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27761 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
27762 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
27763
27764 // Convert the mask to a predicate (NOTE: We don't need to worry about
27765 // inactive lanes since VSELECT is safe when given undefined elements).
27766 EVT MaskVT = Op.getOperand(0).getValueType();
27767 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
27768 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
27769 Mask = DAG.getNode(ISD::TRUNCATE, DL,
27770 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
27771
27772 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
27773 Mask, Op1, Op2);
27774
27775 return convertFromScalableVector(DAG, VT, ScalableRes);
27776}
27777
27778SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
27779 SDValue Op, SelectionDAG &DAG) const {
27780 SDLoc DL(Op);
27781 EVT InVT = Op.getOperand(0).getValueType();
27782 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
27783
27784 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
27785 "Only expected to lower fixed length vector operation!");
27786 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
27787 "Expected integer result of the same bit length as the inputs!");
27788
27789 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
27790 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
27791 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
27792
27793 EVT CmpVT = Pg.getValueType();
27794 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
27795 {Pg, Op1, Op2, Op.getOperand(2)});
27796
27797 EVT PromoteVT = ContainerVT.changeTypeToInteger();
27798 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
27799 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
27800}
27801
27802SDValue
27803AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
27804 SelectionDAG &DAG) const {
27805 SDLoc DL(Op);
27806 auto SrcOp = Op.getOperand(0);
27807 EVT VT = Op.getValueType();
27808 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27809 EVT ContainerSrcVT =
27810 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
27811
27812 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
27813 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
27814 return convertFromScalableVector(DAG, VT, Op);
27815}
27816
27817SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
27818 SDValue Op, SelectionDAG &DAG) const {
27819 SDLoc DL(Op);
27820 unsigned NumOperands = Op->getNumOperands();
27821
27822 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
27823 "Unexpected number of operands in CONCAT_VECTORS");
27824
27825 auto SrcOp1 = Op.getOperand(0);
27826 auto SrcOp2 = Op.getOperand(1);
27827 EVT VT = Op.getValueType();
27828 EVT SrcVT = SrcOp1.getValueType();
27829
27830 if (NumOperands > 2) {
27831 SmallVector<SDValue, 4> Ops;
27832 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
27833 for (unsigned I = 0; I < NumOperands; I += 2)
27834 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
27835 Op->getOperand(I), Op->getOperand(I + 1)));
27836
27837 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
27838 }
27839
27840 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27841
27842 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27843 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
27844 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
27845
27846 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
27847
27848 return convertFromScalableVector(DAG, VT, Op);
27849}
27850
27851SDValue
27852AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
27853 SelectionDAG &DAG) const {
27854 EVT VT = Op.getValueType();
27855 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27856
27857 SDLoc DL(Op);
27858 SDValue Val = Op.getOperand(0);
27859 SDValue Pg = getPredicateForVector(DAG, DL, VT);
27860 EVT SrcVT = Val.getValueType();
27861 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
27862 EVT ExtendVT = ContainerVT.changeVectorElementType(
27863 SrcVT.getVectorElementType());
27864
27865 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
27866 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
27867
27868 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
27869 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
27870 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
27871 Pg, Val, DAG.getUNDEF(ContainerVT));
27872
27873 return convertFromScalableVector(DAG, VT, Val);
27874}
27875
27876SDValue
27877AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
27878 SelectionDAG &DAG) const {
27879 EVT VT = Op.getValueType();
27880 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27881
27882 SDLoc DL(Op);
27883 SDValue Val = Op.getOperand(0);
27884 EVT SrcVT = Val.getValueType();
27885 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27886 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
27887 VT.getVectorElementType());
27888 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
27889
27890 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27891 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
27892 Op.getOperand(1), DAG.getUNDEF(RoundVT));
27893 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
27894 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
27895
27896 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27897 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27898}
27899
27900SDValue
27901AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
27902 SelectionDAG &DAG) const {
27903 EVT VT = Op.getValueType();
27904 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
27905
27906 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
27907 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
27908 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
27909
27910 SDLoc DL(Op);
27911 SDValue Val = Op.getOperand(0);
27912 EVT SrcVT = Val.getValueType();
27913 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
27914 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
27915
27916 if (VT.bitsGE(SrcVT)) {
27917 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
27918
27919 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
27920 VT.changeTypeToInteger(), Val);
27921
27922 // Safe to use a larger than specified operand because by promoting the
27923 // value nothing has changed from an arithmetic point of view.
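// For example (a sketch): a v4i16 -> v4f32 sitofp can be widened to
// sign_extend v4i16 -> v4i32 followed by an i32 -> f32 convert; the
// extended integer has the same numeric value, so the result is identical.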
27924 Val =
27925 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
27926 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
27927 DAG.getUNDEF(ContainerDstVT));
27928 return convertFromScalableVector(DAG, VT, Val);
27929 } else {
27930 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
27931 ContainerDstVT.getVectorElementType());
27932 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
27933
27934 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
27935 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
27936 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
27937 Val = convertFromScalableVector(DAG, SrcVT, Val);
27938
27939 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
27940 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
27941 }
27942}
27943
27944SDValue
27945AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
27946 SelectionDAG &DAG) const {
27947 SDLoc DL(Op);
27948 EVT OpVT = Op.getValueType();
27949 assert(OpVT.isScalableVector() &&
27950 "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
27951 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
27952 Op.getOperand(1));
27953 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
27954 Op.getOperand(1));
27955 return DAG.getMergeValues({Even, Odd}, DL);
27956}
27957
27958SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
27959 SelectionDAG &DAG) const {
27960 SDLoc DL(Op);
27961 EVT OpVT = Op.getValueType();
27962 assert(OpVT.isScalableVector() &&
27963 "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
27964
27965 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
27966 Op.getOperand(1));
27967 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
27968 Op.getOperand(1));
27969 return DAG.getMergeValues({Lo, Hi}, DL);
27970}
27971
27972SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
27973 SelectionDAG &DAG) const {
27974 // FIXME: Maybe share some code with LowerMGather/Scatter?
27975 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
27976 SDLoc DL(HG);
27977 SDValue Chain = HG->getChain();
27978 SDValue Inc = HG->getInc();
27979 SDValue Mask = HG->getMask();
27980 SDValue Ptr = HG->getBasePtr();
27981 SDValue Index = HG->getIndex();
27982 SDValue Scale = HG->getScale();
27983 SDValue IntID = HG->getIntID();
27984
27985 // The Intrinsic ID determines the type of update operation.
27986 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
27987 // Right now, we only support 'add' as an update.
27988 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
27989 "Unexpected histogram update operation");
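// Roughly (a sketch of the semantics being lowered): for each active lane L,
// the bucket at Ptr + Index[L]*Scale is incremented by Inc once for every
// active lane that shares the same index. The sequence below implements this
// as: gather the buckets, HISTCNT the per-lane match counts, multiply by
// Inc, add, then scatter the results back.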
27990
27991 EVT IncVT = Inc.getValueType();
27992 EVT IndexVT = Index.getValueType();
27993 EVT MemVT = EVT::getVectorVT(*DAG.getContext(), IncVT,
27994 IndexVT.getVectorElementCount());
27995 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
27996 SDValue PassThru = DAG.getSplatVector(MemVT, DL, Zero);
27997 SDValue IncSplat = DAG.getSplatVector(MemVT, DL, Inc);
27998 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
27999
28000 MachineMemOperand *MMO = HG->getMemOperand();
28001 // Create an MMO for the gather, without load|store flags.
28002 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
28003 MMO->getPointerInfo(), MachineMemOperand::MOLoad, MMO->getSize(),
28004 MMO->getAlign(), MMO->getAAInfo());
28005 ISD::MemIndexType IndexType = HG->getIndexType();
28006 SDValue Gather =
28007 DAG.getMaskedGather(DAG.getVTList(MemVT, MVT::Other), MemVT, DL, Ops,
28008 GMMO, IndexType, ISD::NON_EXTLOAD);
28009
28010 SDValue GChain = Gather.getValue(1);
28011
28012 // Perform the histcnt, multiply by inc, add to bucket data.
28013 SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncVT);
28014 SDValue HistCnt =
28015 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
28016 SDValue Mul = DAG.getNode(ISD::MUL, DL, MemVT, HistCnt, IncSplat);
28017 SDValue Add = DAG.getNode(ISD::ADD, DL, MemVT, Gather, Mul);
28018
28019 // Create an MMO for the scatter, without load|store flags.
28020 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
28021 MMO->getPointerInfo(), MachineMemOperand::MOStore, MMO->getSize(),
28022 MMO->getAlign(), MMO->getAAInfo());
28023
28024 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
28025 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
28026 ScatterOps, SMMO, IndexType, false);
28027 return Scatter;
28028}
28029
28030SDValue
28031AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
28032 SelectionDAG &DAG) const {
28033 EVT VT = Op.getValueType();
28034 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28035
28036 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
28037 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
28038 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
28039
28040 SDLoc DL(Op);
28041 SDValue Val = Op.getOperand(0);
28042 EVT SrcVT = Val.getValueType();
28043 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
28044 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
28045
28046 if (VT.bitsGT(SrcVT)) {
28047 EVT CvtVT = ContainerDstVT.changeVectorElementType(
28048 ContainerSrcVT.getVectorElementType());
28049 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
28050
28051 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
28052 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
28053
28054 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
28055 Val = getSVESafeBitCast(CvtVT, Val, DAG);
28056 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
28057 DAG.getUNDEF(ContainerDstVT));
28058 return convertFromScalableVector(DAG, VT, Val);
28059 } else {
28060 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
28061 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
28062
28063 // Safe to use a larger than specified result since an fp_to_int where the
28064 // result doesn't fit into the destination is undefined.
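// For example (a sketch): a v4f32 -> v4i16 fptosi can be lowered as a
// predicated FCVTZS producing i32 results followed by a truncate to i16;
// any value too large for i16 yields a poison value anyway, so the detour
// through the wider integer type is safe.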
28065 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
28066 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
28067 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
28068
28069 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
28070 }
28071}
28072
28073static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
28074 ArrayRef<int> ShuffleMask, EVT VT,
28075 EVT ContainerVT, SelectionDAG &DAG) {
28076 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
28077 SDLoc DL(Op);
28078 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
28079 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
28080 bool IsSingleOp =
28081 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
28082
28083 if (!Subtarget.isNeonAvailable() && !MinSVESize)
28084 MinSVESize = 128;
28085
28086 // Bail out on two-operand shuffles if SVE2 is unavailable or not all of
28087 // the index values can be represented.
28088 if (!IsSingleOp && !Subtarget.hasSVE2())
28089 return SDValue();
28090
28091 EVT VTOp1 = Op.getOperand(0).getValueType();
28092 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
28093 unsigned IndexLen = MinSVESize / BitsPerElt;
28094 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
28095 uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
28096 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
28097 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
28098 bool MinMaxEqual = (MinSVESize == MaxSVESize);
28099 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
28100 "Incorrectly legalised shuffle operation");
28101
28103 // If MinSVESize is not equal to MaxSVESize then we need to know which
28104 // TBL mask element needs adjustment.
28105 SmallVector<SDValue, 8> AddRuntimeVLMask;
28106
28107 // Bail out for 8-bit element types, because with a 2048-bit SVE register
28108 // size 8 bits is only sufficient to index into the first source vector.
28109 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
28110 return SDValue();
28111
28112 for (int Index : ShuffleMask) {
28113 // Treat a poison index value as lane 0.
28114 if (Index < 0)
28115 Index = 0;
28116 // If the mask refers to elements in the second operand, then we have to
28117 // offset the index by the number of elements in a vector. If this number
28118 // is not known at compile time, we need to maintain a mask with 'VL'
28119 // values to add at runtime.
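// For example (a sketch, assuming v4i32 sources and MinSVESize == 256):
// IndexLen = 256/32 = 8 and ElementsPerVectorReg = 4, so a mask value of 5
// (lane 1 of the second operand) becomes 5 + (8 - 4) = 9, i.e. the second
// slot of the second TBL source register.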
28120 if ((unsigned)Index >= ElementsPerVectorReg) {
28121 if (MinMaxEqual) {
28122 Index += IndexLen - ElementsPerVectorReg;
28123 } else {
28124 Index = Index - ElementsPerVectorReg;
28125 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
28126 }
28127 } else if (!MinMaxEqual)
28128 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
28129 // For 8-bit elements and a 1024-bit SVE register, MaxOffset equals 255 and
28130 // the adjusted index might point at the last element of the second operand
28131 // of the shufflevector; such an index cannot be used, so reject the transform.
28132 if ((unsigned)Index >= MaxOffset)
28133 return SDValue();
28134 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
28135 }
28136
28137 // Pad the remaining mask slots with an out-of-range index so that those
28138 // lanes are zeroed, rather than with index zero, which would duplicate the
28139 // first lane into the padding elements. Note that for i8 elements an
28140 // out-of-range index can still be a valid index with a 2048-bit register.
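// For example (a sketch, assuming v8i16 sources and MinSVESize == 256):
// IndexLen = 16 and ElementsPerVectorReg = 8, so the trailing 8 mask slots
// are filled with MaxOffset (0xFFFF) and the corresponding TBL result lanes
// read as zero.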
28141 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
28142 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
28143 if (!MinMaxEqual)
28144 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
28145 }
28146
28147 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
28148 SDValue VecMask =
28149 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
28150 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
28151
28152 SDValue Shuffle;
28153 if (IsSingleOp)
28154 Shuffle =
28155 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
28156 DAG.getConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32),
28157 Op1, SVEMask);
28158 else if (Subtarget.hasSVE2()) {
28159 if (!MinMaxEqual) {
28160 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
28161 SDValue VScale = (BitsPerElt == 64)
28162 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
28163 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
28164 SDValue VecMask =
28165 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
28166 SDValue MulByMask = DAG.getNode(
28167 ISD::MUL, DL, MaskType,
28168 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
28169 DAG.getBuildVector(MaskType, DL,
28170 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
28171 SDValue UpdatedVecMask =
28172 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
28173 SVEMask = convertToScalableVector(
28174 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
28175 }
28176 Shuffle =
28177 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
28178 DAG.getConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32),
28179 Op1, Op2, SVEMask);
28180 }
28181 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
28182 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
28183}
28184
28185SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
28186 SDValue Op, SelectionDAG &DAG) const {
28187 EVT VT = Op.getValueType();
28188 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
28189
28190 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
28191 auto ShuffleMask = SVN->getMask();
28192
28193 SDLoc DL(Op);
28194 SDValue Op1 = Op.getOperand(0);
28195 SDValue Op2 = Op.getOperand(1);
28196
28197 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
28198 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
28199 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
28200
28201 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
28202 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
28203 return MVT::i32;
28204 return ScalarTy;
28205 };
28206
28207 if (SVN->isSplat()) {
28208 unsigned Lane = std::max(0, SVN->getSplatIndex());
28209 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
28210 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
28211 DAG.getConstant(Lane, DL, MVT::i64));
28212 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
28213 return convertFromScalableVector(DAG, VT, Op);
28214 }
28215
28216 bool ReverseEXT = false;
28217 unsigned Imm;
28218 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
28219 Imm == VT.getVectorNumElements() - 1) {
28220 if (ReverseEXT)
28221 std::swap(Op1, Op2);
28222 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
28223 SDValue Scalar = DAG.getNode(
28224 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
28225 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
28226 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
28227 return convertFromScalableVector(DAG, VT, Op);
28228 }
28229
28230 unsigned EltSize = VT.getScalarSizeInBits();
28231 for (unsigned LaneSize : {64U, 32U, 16U}) {
28232 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), LaneSize)) {
28233 EVT NewVT =
28234 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
28235 unsigned RevOp;
28236 if (EltSize == 8)
28237 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
28238 else if (EltSize == 16)
28239 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
28240 else
28241 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
28242
28243 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
28244 Op = LowerToPredicatedOp(Op, DAG, RevOp);
28245 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
28246 return convertFromScalableVector(DAG, VT, Op);
28247 }
28248 }
28249
28250 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
28251 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
28252 if (!VT.isFloatingPoint())
28253 return LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
28254
28255 EVT NewVT = getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), 64));
28256 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
28257 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::REVD_MERGE_PASSTHRU);
28258 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
28259 return convertFromScalableVector(DAG, VT, Op);
28260 }
28261
28262 unsigned WhichResult;
28263 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
28264 WhichResult == 0)
28265 return convertFromScalableVector(
28266 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
28267
28268 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
28269 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
28270 return convertFromScalableVector(
28271 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
28272 }
28273
28274 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
28275 return convertFromScalableVector(
28276 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
28277
28278 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
28279 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
28280 return convertFromScalableVector(
28281 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
28282 }
28283
28284 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
28285 // represents the same logical operation as performed by a ZIP instruction. In
28286 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
28287 // equivalent to an AArch64 instruction. There's the extra component of
28288 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
28289 // only operated on 64/128-bit vector types that have a direct mapping to a
28290 // target register and so an exact mapping is implied.
28291 // However, when using SVE for fixed length vectors, most legal vector types
28292 // are actually sub-vectors of a larger SVE register. When mapping
28293 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
28294 // how the mask's indices translate. Specifically, when the mapping requires
28295 // an exact meaning for a specific vector index (e.g. Index X is the last
28296 // vector element in the register) then such mappings are often only safe when
28297 // the exact SVE register size is known. The main exception to this is when
28298 // indices are logically relative to the first element of either
28299 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
28300 // when converting from fixed-length to scalable vector types (i.e. the start
28301 // of a fixed length vector is always the start of a scalable vector).
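// For example (a sketch, assuming v4i32 operands on a 256-bit SVE machine):
// the fixed-length vector occupies only the low 128 bits of a Z register, so
// "the last element of the register" is not element 3 of the v4i32 value;
// masks that rely on such positions are only handled below when the exact
// register size is known (MinSVESize == MaxSVESize == VT size).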
28302 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
28303 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
28304 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
28305 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
28306 Op2.isUndef()) {
28307 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
28308 return convertFromScalableVector(DAG, VT, Op);
28309 }
28310
28311 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
28312 WhichResult != 0)
28313 return convertFromScalableVector(
28314 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
28315
28316 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
28317 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28318 return convertFromScalableVector(
28319 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
28320 }
28321
28322 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
28323 return convertFromScalableVector(
28324 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
28325
28326 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
28327 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
28328 return convertFromScalableVector(
28329 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
28330 }
28331 }
28332
28333 // Avoid producing TBL instruction if we don't know SVE register minimal size,
28334 // unless NEON is not available and we can assume minimal SVE register size is
28335 // 128-bits.
28336 if (MinSVESize || !Subtarget->isNeonAvailable())
28337 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
28338 DAG);
28339
28340 return SDValue();
28341}
28342
28343SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
28344 SelectionDAG &DAG) const {
28345 SDLoc DL(Op);
28346 EVT InVT = Op.getValueType();
28347
28348 assert(VT.isScalableVector() && isTypeLegal(VT) &&
28349 InVT.isScalableVector() && isTypeLegal(InVT) &&
28350 "Only expect to cast between legal scalable vector types!");
28351 assert(VT.getVectorElementType() != MVT::i1 &&
28352 InVT.getVectorElementType() != MVT::i1 &&
28353 "For predicate bitcasts, use getSVEPredicateBitCast");
28354
28355 if (InVT == VT)
28356 return Op;
28357
28358 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
28359 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
28360
28361 // Safe bitcasting between unpacked vector types of different element counts
28362 // is currently unsupported because the following is missing the necessary
28363 // work to ensure the result's elements live where they're supposed to within
28364 // an SVE register.
28365 // 01234567
28366 // e.g. nxv2i32 = XX??XX??
28367 // nxv4f16 = X?X?X?X?
28368 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
28369 VT == PackedVT || InVT == PackedInVT) &&
28370 "Unexpected bitcast!");
28371
28372 // Pack input if required.
28373 if (InVT != PackedInVT)
28374 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
28375
28376 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
28377
28378 // Unpack result if required.
28379 if (VT != PackedVT)
28380 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
28381
28382 return Op;
28383}
28384
28385bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
28386 SDValue N) const {
28387 return ::isAllActivePredicate(DAG, N);
28388}
28389
28390EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
28391 return ::getPromotedVTForPredicate(VT);
28392}
28393
28394bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
28395 SDValue Op, const APInt &OriginalDemandedBits,
28396 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
28397 unsigned Depth) const {
28398
28399 unsigned Opc = Op.getOpcode();
28400 switch (Opc) {
28401 case AArch64ISD::VSHL: {
28402 // Match (VSHL (VLSHR Val X) X)
28403 SDValue ShiftL = Op;
28404 SDValue ShiftR = Op->getOperand(0);
28405 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
28406 return false;
28407
28408 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
28409 return false;
28410
28411 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
28412 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
28413
28414 // Other cases can be handled as well, but this is not
28415 // implemented.
28416 if (ShiftRBits != ShiftLBits)
28417 return false;
28418
28419 unsigned ScalarSize = Op.getScalarValueSizeInBits();
28420 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
28421
28422 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
28423 APInt UnusedBits = ~OriginalDemandedBits;
28424
28425 if ((ZeroBits & UnusedBits) != ZeroBits)
28426 return false;
28427
28428 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
28429 // used - simplify to just Val.
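// For example (a sketch): (VSHL (VLSHR x, 3), 3) only clears the low 3 bits
// of each element; if the caller never demands those bits, the shift pair
// can be replaced with x itself.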
28430 return TLO.CombineTo(Op, ShiftR->getOperand(0));
28431 }
28432 case AArch64ISD::BICi: {
28433 // Fold BICi if all destination bits already known to be zeroed
28434 SDValue Op0 = Op.getOperand(0);
28435 KnownBits KnownOp0 =
28436 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
28437 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
28438 uint64_t BitsToClear = Op->getConstantOperandVal(1)
28439 << Op->getConstantOperandVal(2);
28440 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
28441 if (APInt(Known.getBitWidth(), BitsToClear)
28442 .isSubsetOf(AlreadyZeroedBitsToClear))
28443 return TLO.CombineTo(Op, Op0);
28444
28445 Known = KnownOp0 &
28446 KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear));
28447
28448 return false;
28449 }
28450 case ISD::INTRINSIC_WO_CHAIN: {
28451 if (auto ElementSize = IsSVECntIntrinsic(Op)) {
28452 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
28453 if (!MaxSVEVectorSizeInBits)
28454 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
28455 unsigned MaxElements = MaxSVEVectorSizeInBits / *ElementSize;
28456 // The SVE count intrinsics don't support the multiplier immediate so we
28457 // don't have to account for that here. The value returned may be slightly
28458 // over the true required bits, as this is based on the "ALL" pattern. The
28459 // other patterns are also exposed by these intrinsics, but they all
28460 // return a value that's strictly less than "ALL".
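// For example (a sketch): with a 2048-bit maximum vector length, CNTB
// (ElementSize == 8) can return at most 2048 / 8 == 256, so only
// bit_width(256) == 9 low bits can ever be set and the remaining high bits
// of the i64 result are known to be zero.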
28461 unsigned RequiredBits = llvm::bit_width(MaxElements);
28462 unsigned BitWidth = Known.Zero.getBitWidth();
28463 if (RequiredBits < BitWidth)
28464 Known.Zero.setHighBits(BitWidth - RequiredBits);
28465 return false;
28466 }
28467 }
28468 }
28469
28470 return TargetLowering::SimplifyDemandedBitsForTargetNode(
28471 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
28472}
28473
28474bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
28475 return Op.getOpcode() == AArch64ISD::DUP ||
28476 Op.getOpcode() == AArch64ISD::MOVI ||
28477 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
28478 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
28479 TargetLowering::isTargetCanonicalConstantNode(Op);
28480}
28481
28482bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
28483 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
28484 Subtarget->hasComplxNum();
28485}
28486
28487bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
28488 ComplexDeinterleavingOperation Operation, Type *Ty) const {
28489 auto *VTy = dyn_cast<VectorType>(Ty);
28490 if (!VTy)
28491 return false;
28492
28493 // If the vector is scalable, SVE is enabled, implying support for complex
28494 // numbers. Otherwise, we need to ensure complex number support is available.
28495 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
28496 return false;
28497
28498 auto *ScalarTy = VTy->getScalarType();
28499 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
28500
28501 // We can only process vectors that have a bit size of 128 or higher (or
28502 // exactly 64 bits for NEON). Additionally, these vectors must have a
28503 // power-of-2 size, as we later split them into the smallest supported size
28504 // and merge them back together after applying the complex operation.
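// For example (a sketch): a 256-bit <8 x float> complex multiply is split
// into two <4 x float> halves in createComplexDeinterleavingIR below, each
// half is lowered individually, and the two results are concatenated again.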
28505 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
28506 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
28507 !llvm::isPowerOf2_32(VTyWidth))
28508 return false;
28509
28510 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
28511 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
28512 return 8 <= ScalarWidth && ScalarWidth <= 64;
28513 }
28514
28515 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
28516 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
28517}
28518
28519Value *AArch64TargetLowering::createComplexDeinterleavingIR(
28520 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
28521 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
28522 Value *Accumulator) const {
28523 VectorType *Ty = cast<VectorType>(InputA->getType());
28524 bool IsScalable = Ty->isScalableTy();
28525 bool IsInt = Ty->getElementType()->isIntegerTy();
28526
28527 unsigned TyWidth =
28528 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
28529
28530 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
28531 "Vector type must be either 64 or a power of 2 that is at least 128");
28532
28533 if (TyWidth > 128) {
28534 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
28535 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
28536 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
28537 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
28538 auto *UpperSplitA =
28539 B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
28540 auto *UpperSplitB =
28541 B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
28542 Value *LowerSplitAcc = nullptr;
28543 Value *UpperSplitAcc = nullptr;
28544 if (Accumulator) {
28545 LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
28546 UpperSplitAcc =
28547 B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
28548 }
28549 auto *LowerSplitInt = createComplexDeinterleavingIR(
28550 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
28551 auto *UpperSplitInt = createComplexDeinterleavingIR(
28552 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
28553
28554 auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
28555 B.getInt64(0));
28556 return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
28557 }
28558
28559 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
28560 if (Accumulator == nullptr)
28561 Accumulator = Constant::getNullValue(Ty);
28562
28563 if (IsScalable) {
28564 if (IsInt)
28565 return B.CreateIntrinsic(
28566 Intrinsic::aarch64_sve_cmla_x, Ty,
28567 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28568
28569 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28570 return B.CreateIntrinsic(
28571 Intrinsic::aarch64_sve_fcmla, Ty,
28572 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
28573 }
28574
28575 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
28576 Intrinsic::aarch64_neon_vcmla_rot90,
28577 Intrinsic::aarch64_neon_vcmla_rot180,
28578 Intrinsic::aarch64_neon_vcmla_rot270};
28579
28580
28581 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
28582 {Accumulator, InputA, InputB});
28583 }
28584
28585 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
28586 if (IsScalable) {
28587 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
28588 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
28589 if (IsInt)
28590 return B.CreateIntrinsic(
28591 Intrinsic::aarch64_sve_cadd_x, Ty,
28592 {InputA, InputB, B.getInt32((int)Rotation * 90)});
28593
28594 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
28595 return B.CreateIntrinsic(
28596 Intrinsic::aarch64_sve_fcadd, Ty,
28597 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
28598 }
28599 return nullptr;
28600 }
28601
28602 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
28603 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
28604 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
28605 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
28606 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
28607
28608 if (IntId == Intrinsic::not_intrinsic)
28609 return nullptr;
28610
28611 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
28612 }
28613
28614 return nullptr;
28615}
28616
28617bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
28618 unsigned Opc = N->getOpcode();
28619 if (ISD::isExtOpcode(Opc)) {
28620 if (any_of(N->uses(),
28621 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
28622 return false;
28623 }
28624 return true;
28625}
28626
28627unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
28628 return Subtarget->getMinimumJumpTableEntries();
28629}
28630
28631MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
28632 CallingConv::ID CC,
28633 EVT VT) const {
28634 bool NonUnitFixedLengthVector =
28635 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28636 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28637 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
28638
28639 EVT VT1;
28640 MVT RegisterVT;
28641 unsigned NumIntermediates;
28642 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
28643 RegisterVT);
28644 return RegisterVT;
28645}
28646
28647unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
28648 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
28649 bool NonUnitFixedLengthVector =
28650 VT.isFixedLengthVector() && !VT.getVectorElementCount().isScalar();
28651 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
28652 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
28653
28654 EVT VT1;
28655 MVT VT2;
28656 unsigned NumIntermediates;
28657 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
28658 NumIntermediates, VT2);
28659}
28660
28661unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
28662 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
28663 unsigned &NumIntermediates, MVT &RegisterVT) const {
28664 int NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
28665 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
28666 if (!RegisterVT.isFixedLengthVector() ||
28667 RegisterVT.getFixedSizeInBits() <= 128)
28668 return NumRegs;
28669
28670 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
28671 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
28672 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
28673
28674 // A size mismatch here implies either type promotion or widening and would
28675 // have resulted in scalarisation if larger vectors had not been available.
28676 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
28677 EVT EltTy = VT.getVectorElementType();
28678 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
28679 if (!isTypeLegal(NewVT))
28680 NewVT = EltTy;
28681
28682 IntermediateVT = NewVT;
28683 NumIntermediates = VT.getVectorNumElements();
28684 RegisterVT = getRegisterType(Context, NewVT);
28685 return NumIntermediates;
28686 }
28687
28688 // SVE VLS support does not introduce a new ABI so we should use NEON sized
28689 // types for vector arguments and returns.
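// For example (a sketch): with -msve-vector-bits=512, a fixed-length
// v16i32 argument has a 512-bit RegisterVT, so NumSubRegs = 512/128 = 4
// and the value is passed as four NEON-sized v4i32 registers instead.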
28690
28691 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
28692 NumIntermediates *= NumSubRegs;
28693 NumRegs *= NumSubRegs;
28694
28695 switch (RegisterVT.getVectorElementType().SimpleTy) {
28696 default:
28697 llvm_unreachable("unexpected element type for vector");
28698 case MVT::i8:
28699 IntermediateVT = RegisterVT = MVT::v16i8;
28700 break;
28701 case MVT::i16:
28702 IntermediateVT = RegisterVT = MVT::v8i16;
28703 break;
28704 case MVT::i32:
28705 IntermediateVT = RegisterVT = MVT::v4i32;
28706 break;
28707 case MVT::i64:
28708 IntermediateVT = RegisterVT = MVT::v2i64;
28709 break;
28710 case MVT::f16:
28711 IntermediateVT = RegisterVT = MVT::v8f16;
28712 break;
28713 case MVT::f32:
28714 IntermediateVT = RegisterVT = MVT::v4f32;
28715 break;
28716 case MVT::f64:
28717 IntermediateVT = RegisterVT = MVT::v2f64;
28718 break;
28719 case MVT::bf16:
28720 IntermediateVT = RegisterVT = MVT::v8bf16;
28721 break;
28722 }
28723
28724 return NumRegs;
28725}
28726
28727bool AArch64TargetLowering::hasInlineStackProbe(
28728 const MachineFunction &MF) const {
28729 return !Subtarget->isTargetWindows() &&
28730 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
28731}
28732
28733#ifndef NDEBUG
28734void AArch64TargetLowering::verifyTargetSDNode(const SDNode *N) const {
28735 switch (N->getOpcode()) {
28736 default:
28737 break;
28738 case AArch64ISD::SUNPKLO:
28739 case AArch64ISD::SUNPKHI:
28740 case AArch64ISD::UUNPKLO:
28741 case AArch64ISD::UUNPKHI: {
28742 assert(N->getNumValues() == 1 && "Expected one result!");
28743 assert(N->getNumOperands() == 1 && "Expected one operand!");
28744 EVT VT = N->getValueType(0);
28745 EVT OpVT = N->getOperand(0).getValueType();
28746 assert(OpVT.isVector() && VT.isVector() && OpVT.isInteger() &&
28747 VT.isInteger() && "Expected integer vectors!");
28748 assert(OpVT.getSizeInBits() == VT.getSizeInBits() &&
28749 "Expected vectors of equal size!");
28750 // TODO: Enable assert once bogus creations have been fixed.
28751 // assert(OpVT.getVectorElementCount() == VT.getVectorElementCount()*2 &&
28752 // "Expected result vector with half the lanes of its input!");
28753 break;
28754 }
28755 case AArch64ISD::TRN1:
28756 case AArch64ISD::TRN2:
28757 case AArch64ISD::UZP1:
28758 case AArch64ISD::UZP2:
28759 case AArch64ISD::ZIP1:
28760 case AArch64ISD::ZIP2: {
28761 assert(N->getNumValues() == 1 && "Expected one result!");
28762 assert(N->getNumOperands() == 2 && "Expected two operands!");
28763 EVT VT = N->getValueType(0);
28764 EVT Op0VT = N->getOperand(0).getValueType();
28765 EVT Op1VT = N->getOperand(1).getValueType();
28766 assert(VT.isVector() && Op0VT.isVector() && Op1VT.isVector() &&
28767 "Expected vectors!");
28768 // TODO: Enable assert once bogus creations have been fixed.
28769 // assert(VT == Op0VT && VT == Op1VT && "Expected matching vectors!");
28770 break;
28771 }
28772 }
28773}
28774#endif
unsigned const MachineRegisterInfo * MRI
static MCRegister MatchRegisterName(StringRef Name)
static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc, uint64_t &Imm)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static bool isIntImmediate(const SDNode *N, uint64_t &Imm)
isIntImmediate - This method tests to see if the node is a constant operand.
static void CustomNonLegalBITCASTResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, EVT ExtendVT, EVT CastVT)
static bool isConcatMask(ArrayRef< int > Mask, EVT VT, bool SplitLHS)
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG)
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG)
can be transformed to: not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) (and (not (setCA (cmp A))...
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert)
changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC usable with the vector...
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isSingletonEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG)
static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex)
static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG)
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of zeros to a vector store by scalar stores of WZR/XZR.
static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG)
static SDValue performLastTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue GenerateTBL(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static std::optional< PredicateConstraint > parsePredicateConstraint(StringRef Constraint)
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static void analyzeCallOperands(const AArch64TargetLowering &TLI, const AArch64Subtarget *Subtarget, const TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo)
static std::optional< unsigned > IsSVECntIntrinsic(SDValue S)
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo)
Check whether or not Op is a SET_CC operation, either a generic or an AArch64 lowered one.
static bool isLegalArithImmed(uint64_t C)
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT)
static ScalableVectorType * getSVEContainerIRType(FixedVectorType *VTy)
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG)
unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend)
static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG)
static SDValue convertFixedMaskToScalableVector(SDValue Mask, SelectionDAG &DAG)
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static bool isZeroingInactiveLanes(SDValue Op)
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG)
static SDValue tryCombineMULLWithUZP1(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG)
static bool isZerosVector(const SDNode *N)
isZerosVector - Check whether SDNode N is a zero-filled vector.
static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth=0)
static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG)
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG)
Get rid of unnecessary NVCASTs (that don't change the type).
static const TargetRegisterClass * getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset, SDLoc DL, unsigned BitWidth)
static bool isPredicateCCSettingOp(SDValue N)
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG)
static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType)
static SDValue performSVEAndCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, ArrayRef< int > ShuffleMask, EVT VT, EVT ContainerVT, SelectionDAG &DAG)
static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static MVT getSVEContainerType(EVT ContentTy)
static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG)
static bool isMergePassthruOpcode(unsigned Opc)
static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG, SDLoc DL, bool &IsMLA)
static SDValue performFADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Target-specific DAG combine function for NEON load/store intrinsics to merge base address updates.
static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG)
static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG)
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget, const AArch64TargetLowering &TLI)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with the compare-mask instruct...
static bool isCheapToExtend(const SDValue &N)
static cl::opt< bool > EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true))
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG)
static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes, unsigned ScalarSizeInBytes)
Check if the value of OffsetInBytes can be used as an immediate for the gather load/prefetch and scat...
static bool isUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of "vector_shuffle v,...
static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
#define LCALLNAME4(A, B)
static unsigned getDUPLANEOp(EVT EltType)
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM)
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT, SelectionDAG &DAG, unsigned &ShiftValue, SDValue &RShOperand)
static bool isExtendOrShiftOperand(SDValue N)
static bool isLanes1toNKnownZero(SDValue Op)
static bool setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL, AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI)
Set the IntrinsicInfo for the aarch64_sve_st<N> intrinsics.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG)
static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N, SelectionDAG &DAG)
static std::tuple< SDValue, SDValue > extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG)
static EVT getPackedSVEVectorVT(EVT VT)
static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performVecReduceBitwiseCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performFlagSettingCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned GenericOpcode)
static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performCSELCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static void ReplaceReductionResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp)
static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant)
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static bool isCMP(SDValue Op)
static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG)
static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static Function * getStructuredLoadFunction(Module *M, unsigned Factor, bool Scalable, Type *LDVTy, Type *PtrTy)
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static SDValue foldCSELOfCSEL(SDNode *Op, SelectionDAG &DAG)
static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc, SelectionDAG &DAG, bool UnpredOp=false, bool SwapOperands=false)
static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad)
static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate)
Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain of CCMP/CFCMP ops.
static SDValue performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated)
static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian)
static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT, unsigned Opcode, SelectionDAG &DAG)
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG)
static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint)
static bool isINSMask(ArrayRef< int > M, int NumInputElements, bool &DstIsLeft, int &Anomaly)
static bool callConvSupportsVarArgs(CallingConv::ID CC)
Return true if the call convention supports varargs Currently only those that pass varargs like the C...
static const MCPhysReg GPRArgRegs[]
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits)
static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG)
static SDValue performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool isPassedInFPR(EVT VT)
static unsigned getIntrinsicID(const SDNode *N)
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG)
static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG)
static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth=0)
Returns true if Val is a tree of AND/OR/SETCC operations that can be expressed as a conjunction.
static bool isWideDUPMask(ArrayRef< int > M, EVT VT, unsigned BlockSize, unsigned &DupLaneOp)
Check if a vector shuffle corresponds to a DUP instructions with a larger element width than the vect...
static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static cl::opt< bool > EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden, cl::desc("Combine ext and trunc to TBL"), cl::init(true))
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts)
static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue removeRedundantInsertVectorElt(SDNode *N)
static std::optional< AArch64CC::CondCode > getCSETCondCode(SDValue Op)
static SDValue optimizeIncrementingWhile(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsEqual)
static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG)
Legalize the gather prefetch (scalar + vector addressing mode) when the offset vector is an unpacked ...
static bool isNegatedInteger(SDValue Op)
static SDValue performFirstTrueTestVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static bool isLoadOrMultipleLoads(SDValue B, SmallVector< LoadSDNode * > &Loads)
static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc)
static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16)
static Function * getStructuredStoreFunction(Module *M, unsigned Factor, bool Scalable, Type *STVTy, Type *PtrTy)
static SDValue performVectorShiftCombine(SDNode *N, const AArch64TargetLowering &TLI, TargetLowering::DAGCombinerInfo &DCI)
Optimize a vector shift instruction and its operand if shifted out bits are not used.
static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG)
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, unsigned ScalarSizeInBytes)
Combines a node carrying the intrinsic aarch64_sve_prf<T>_gather_scalar_offset into a node that uses ...
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St)
Replace a splat of a scalar to a vector store by scalar stores of the scalar value.
unsigned getSignExtendedGatherOpcode(unsigned Opcode)
static bool isOrXorChain(SDValue N, unsigned &Num, SmallVector< std::pair< SDValue, SDValue >, 16 > &WorkList)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC)
changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC
static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd)
static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG)
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static unsigned getCmpOperandFoldingProfit(SDValue Op)
Returns how profitable it is to fold a comparison's operand's shift and/or extension operations.
static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG)
#define MAKE_CASE(V)
static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG)
static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG)
Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern into sext/zext(buildvecto...
static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits)
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
static Value * createTblShuffleForZExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *ZExtTy, FixedVectorType *DstTy, bool IsLittleEndian)
static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG)
static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG)
static const MCPhysReg FPRArgRegs[]
static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL, SelectionDAG &DAG)
Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR, WZR, invert(<cond>)'.
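A minimal scalar model of the CSET/CSINC equivalence noted above (illustrative C++ only; csinc and cset are hypothetical helpers mirroring the instruction semantics, not LLVM APIs):

#include <cstdint>

// CSINC Rd, Rn, Rm, cond: pick Rn if cond holds, otherwise Rm + 1.
static uint32_t csinc(bool Cond, uint32_t Rn, uint32_t Rm) {
  return Cond ? Rn : Rm + 1;
}

// CSET Rd, cond is the alias CSINC Rd, WZR, WZR, invert(cond),
// i.e. it materializes (cond ? 1 : 0).
static uint32_t cset(bool Cond) { return csinc(!Cond, /*WZR*/ 0, /*WZR*/ 0); }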
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static void replaceBoolVectorBitcast(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG)
static SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT, int Pattern)
static bool isEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseEXT, unsigned &Imm)
static std::optional< ReducedGprConstraint > parseReducedGprConstraint(StringRef Constraint)
static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT)
static SDValue performSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG)
Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup)) making use of the vector SExt/ZE...
static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG)
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
Fold a floating-point multiply by power of two into floating-point to fixed-point conversion.
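A rough sketch of the arithmetic behind this fold (assuming a power-of-two multiplier; plain C++, not the DAG combine itself): converting x * 2^Bits to a signed integer is a fixed-point conversion with Bits fractional bits, which AArch64 can express as a single FCVTZS with an #fbits operand.

#include <cmath>
#include <cstdint>

// Hypothetical helper: models fptosi(fmul X, 2^Bits), the pattern the
// combine rewrites into one fixed-point convert.
static int32_t fpToFixed(float X, unsigned Bits) {
  return static_cast<int32_t>(X * std::ldexp(1.0f, static_cast<int>(Bits)));
}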
static EVT calculatePreExtendType(SDValue Extend)
Calculates what the pre-extend type is, based on the extension operation node provided by Extend.
static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG)
static EVT getPromotedVTForPredicate(EVT VT)
static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2)
Convert a DAG fp condition code to an AArch64 CC.
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
Turn vector tests of the signbit in the form of: xor (sra X, elt_size(X)-1), -1 into: cmge X,...
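A per-lane scalar model of that rewrite (illustrative C++, assuming i32 lanes and arithmetic right shift of signed values, which mainstream compilers provide): both forms yield all-ones when the lane is non-negative and zero otherwise.

#include <cstdint>

static int32_t signTestViaXorShift(int32_t X) {
  return ~(X >> 31);            // xor (sra X, 31), -1
}

static int32_t signTestViaCmp(int32_t X) {
  return X >= 0 ? -1 : 0;       // per-lane result of cmge X, #0
}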
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG)
static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal)
static SDValue performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG)
static Value * UseTlsOffset(IRBuilderBase &IRB, unsigned Offset)
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG)
WidenVector - Given a value in the V64 register class, produce the equivalent value in the V128 regis...
static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST)
static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, AArch64CC::CondCode Cond)
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info)
cl::opt< bool > EnableAArch64ELFLocalDynamicTLSGeneration("aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false))
static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG)
static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG)
static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG)
static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG)
static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode, AtomicOrdering Ordering)
static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
cl::opt< bool > EnableSVEGISel("aarch64-enable-gisel-sve", cl::Hidden, cl::desc("Enable / disable SVE scalable vectors in Global ISel"), cl::init(false))
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp)
Target-specific DAG combine function for post-increment LD1 (lane) and post-increment LD1R.
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
std::pair< SDValue, uint64_t > lookThroughSignExtension(SDValue Val)
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL)
static SDValue performMSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG)
static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale, SDLoc DL, SelectionDAG &DAG)
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG)
static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG)
static bool isSplatShuffle(Value *V)
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT)
static SDValue performAddCombineForShiftedOperands(SDNode *N, SelectionDAG &DAG)
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V)
static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isPackedVectorType(EVT VT, SelectionDAG &DAG)
Returns true if VT's elements occupy the lowest bit positions of its associated register class withou...
static bool isTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of "vector_shuffle v,...
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static SDValue tryCombineWhileLo(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performMaskedGatherScatterCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64TargetLowering &TLI)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static unsigned getSMCondition(const SMEAttrs &CallerAttrs, const SMEAttrs &CalleeAttrs)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
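The identity this combine relies on, as a small illustrative C++ check (csel and csinc are hypothetical helpers mirroring the instruction semantics):

#include <cstdint>

static int64_t csel(bool CC, int64_t T, int64_t F)  { return CC ? T : F; }
static int64_t csinc(bool CC, int64_t T, int64_t F) { return CC ? T : F + 1; }

// csel(CC, C, 1) + B == csinc(CC, B + C, B) for any B, C.
static bool checkCselPlusConstant(bool CC, int64_t B, int64_t C) {
  return csel(CC, C, 1) + B == csinc(CC, B + C, B);
}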
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static bool cannotBeIntMin(SDValue CheckedVal, SelectionDAG &DAG)
static SDValue performLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, StoreSDNode *Store)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
#define LCALLNAME5(A, B)
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static const MVT MVT_CC
Value type used for condition codes.
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N)
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl)
static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
Simplify Addr given that the top byte of it is ignored by HW during address translation.
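A hedged model of the TBI property this simplification relies on: with top-byte-ignore enabled, bits [63:56] of a pointer do not take part in address translation, so a mask that only clears those bits is redundant and can be dropped.

#include <cstdint>

// Illustrative only: the address bits that effectively reach translation
// when TBI is enabled for the corresponding address range.
static uint64_t tbiVisibleBits(uint64_t Addr) {
  return Addr & 0x00FFFFFFFFFFFFFFULL; // top byte ignored by the MMU
}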
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static bool isWideTypeMask(ArrayRef< int > M, EVT VT, SmallVectorImpl< int > &NewMask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static bool isConstant(const MachineInstr &MI)
static const LLT S1
static const LLT F32
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
@ OP_VEXT3
@ OP_VTRNR
@ OP_VDUP1
@ OP_VZIPR
@ OP_VUZPR
@ OP_VREV
@ OP_VZIPL
@ OP_VTRNL
@ OP_COPY
@ OP_VEXT1
@ OP_VDUP0
@ OP_VEXT2
@ OP_VUZPL
@ OP_VDUP3
@ OP_VDUP2
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
return RetTy
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Symbol * Sym
Definition: ELF_riscv.cpp:479
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static Function * getFunction(Constant *C)
Definition: Evaluator.cpp:236
static bool isSigned(unsigned int Opcode)
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define im(i)
const HexagonInstrInfo * TII
This defines the Use class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
#define G(x, y, z)
Definition: MD5.cpp:56
unsigned const TargetRegisterInfo * TRI
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
uint64_t IntrinsicInst * II
This file defines ARC utility functions which are used by various parts of the compiler.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
const char LLVMTargetMachineRef TM
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallSet class.
This file defines the SmallVector class.
static bool Enabled
Definition: Statistic.cpp:46
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
static const int BlockSize
Definition: TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
static constexpr int Concat[]
Value * RHS
Value * LHS
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setHasSwiftAsyncContext(bool HasContext)
void setJumpTableEntryInfo(int Idx, unsigned Size, MCSymbol *PCRelSym)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isLegalAddressingMode(unsigned NumBytes, int64_t Offset, unsigned Scale) const
void UpdateCustomCalleeSavedRegs(MachineFunction &MF) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMinimumJumpTableEntries() const
const AArch64InstrInfo * getInstrInfo() const override
const char * getSecurityCheckCookieName() const
unsigned getMaximumJumpTableSize() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
std::optional< uint16_t > getPtrAuthBlockAddressDiscriminatorIfEnabled(const Function &ParentFn) const
Compute the integer discriminator for a given BlockAddress constant, if blockaddress signing is enabl...
unsigned classifyGlobalFunctionReference(const GlobalValue *GV, const TargetMachine &TM) const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
const Triple & getTargetTriple() const
const char * getChkStkName() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
bool hasCustomCallingConv() const
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldInsertTrailingFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a trailing fence without reducing the ordering f...
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
MachineBasicBlock * EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: 'sub y, (xor x, -1)' and 'add (add x, 1), y'. The variant with two add's is IR...
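The two's-complement identity behind that equivalence, as a small illustrative C++ check (unsigned arithmetic is used to model the modular wraparound):

#include <cstdint>

// y - (x ^ -1) == y - ~x == y + x + 1 == (x + 1) + y  (mod 2^32)
static bool incAddEqualsSubOfNot(uint32_t X, uint32_t Y) {
  return Y - (X ^ ~0u) == (X + 1) + Y;
}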
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
bool lowerInterleavedLoad(LoadInst *LI, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor) const override
Lower an interleaved load into a ldN intrinsic.
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
bool fallBackToDAGISel(const Instruction &Inst) const override
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Function * getSSPStackGuardCheck(const Module &M) const override
If the target has a standard stack protection check function that performs validation and error handl...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool useLoadStackGuardNode() const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return whether the target supports combining a chain of an 'and' with a mask followed by a compare against zero into a single instruction.
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override
Return true if it is profitable to reduce a load to a smaller type.
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override
Lower an interleaved store into a stN intrinsic.
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
void verifyTargetSDNode(const SDNode *N) const override
Check the given SDNode. Aborts if it is invalid.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
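One direction of that pattern, modelled on 32-bit scalars (illustrative C++; the hook itself only decides profitability): testing X against a left-shifted constant mask is equivalent to logically shifting X right and testing against the unshifted constant, assuming the shift amount Y is less than the bit width.

#include <cstdint>

static bool testShiftedMask(uint32_t X, uint32_t C, unsigned Y) {
  return (X & (C << Y)) != 0;       // (X & (C << Y)) != 0, Y < 32
}

static bool testShiftedValue(uint32_t X, uint32_t C, unsigned Y) {
  return ((X >> Y) & C) != 0;       // ((X l>> Y) & C) != 0, Y < 32
}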
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
MachineBasicBlock * EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const
bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI, LoadInst *LI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Value * getIRStackGuard(IRBuilderBase &IRB) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isOpSuitableForLDPSTP(const Instruction *I) const
bool shouldFoldConstantShiftPairToMask(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to fold a pair of shifts into a mask.
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to a strict FP call.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II, StoreInst *SI) const override
Lower an interleave intrinsic to a target specific store intrinsic.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
Value * getSDagStackGuard(const Module &M) const override
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, SDValue PStateSM=SDValue()) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:214
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:429
APInt zext(unsigned width) const
Zero extend to a new width.
Definition: APInt.cpp:981
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition: APInt.h:209
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition: APInt.h:403
uint64_t getZExtValue() const
Get zero extended value.
Definition: APInt.h:1500
static void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition: APInt.cpp:1860
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1372
APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition: APInt.cpp:1002
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:351
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1448
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition: APInt.h:189
bool isNegative() const
Determine sign of this APInt.
Definition: APInt.h:309
APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1898
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition: APInt.h:1146
APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition: APInt.cpp:1905
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1598
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition: APInt.h:199
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1719
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:807
bool isMask(unsigned numBits) const
Definition: APInt.h:468
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition: APInt.h:314
APInt sext(unsigned width) const
Sign extend to a new width.
Definition: APInt.cpp:954
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition: APInt.h:1237
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:420
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition: APInt.h:286
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:276
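A brief usage sketch of the two mask constructors above (assumes an LLVM build that provides llvm/ADT/APInt.h):

#include "llvm/ADT/APInt.h"

static void apintMaskExample() {
  llvm::APInt Low = llvm::APInt::getLowBitsSet(32, 8);   // 0x000000FF
  llvm::APInt High = llvm::APInt::getHighBitsSet(32, 8); // 0xFF000000
  (void)Low;
  (void)High;
}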
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1217
bool isOne() const
Determine if this is a value of 1.
Definition: APInt.h:369
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
an instruction to allocate memory on the stack
Definition: Instructions.h:61
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ And
*p = old & v
Definition: Instructions.h:716
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
bool isFloatingPointOperation() const
Definition: Instructions.h:864
BinOp getOperation() const
Definition: Instructions.h:787
This is an SDNode representing atomic operations.
bool hasFnAttr(Attribute::AttrKind Kind) const
Return true if the attribute exists for the function.
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:209
const BlockAddress * getBlockAddress() const
The address of a basic block.
Definition: Constants.h:890
Function * getFunction() const
Definition: Constants.h:918
A "pseudo-class" with methods for operating on BUILD_VECTORs.
ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
bool isMemLoc() const
int64_t getLocMemOffset() const
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1594
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1399
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
bool isLittleEndian() const
Layout endianness...
Definition: DataLayout.h:238
IntegerType * getIntPtrType(LLVMContext &C, unsigned AddressSpace=0) const
Returns an integer type with size at least as big as that of a pointer in the given address space.
Definition: DataLayout.cpp:878
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition: FastISel.h:66
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
static FixedVectorType * getInteger(FixedVectorType *VTy)
Definition: DerivedTypes.h:551
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition: Function.h:698
bool empty() const
Definition: Function.h:822
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:207
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition: Function.h:695
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:274
Constant * getPersonalityFn() const
Get the personality function associated with this function.
Definition: Function.cpp:1963
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:350
arg_iterator arg_end()
Definition: Function.h:840
arg_iterator arg_begin()
Definition: Function.h:831
size_t size() const
Definition: Function.h:821
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition: Function.cpp:719
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
Definition: GlobalValue.h:263
bool hasExternalWeakLinkage() const
Definition: GlobalValue.h:529
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition: Globals.cpp:124
Type * getValueType() const
Definition: GlobalValue.h:296
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreateZExtOrBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1042
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2477
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition: IRBuilder.h:1884
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2528
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1050
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2099
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2175
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
Definition: IRBuilder.cpp:1193
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2521
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:463
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2072
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1442
ConstantInt * getInt8(uint8_t C)
Get a constant 8-bit value.
Definition: IRBuilder.h:473
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2086
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1421
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2026
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2499
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2012
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1502
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2417
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1461
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
This instruction inserts a single (scalar) element into a VectorType value.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:66
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:70
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Definition: Instruction.cpp:74
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition: LowLevelType.h:100
constexpr TypeSize getSizeInBytes() const
Returns the total size of the type in bytes, i.e.
Definition: LowLevelType.h:203
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
Definition: Instructions.h:174
Value * getPointerOperand()
Definition: Instructions.h:253
Type * getPointerOperandType() const
Definition: Instructions.h:256
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
@ INVALID_SIMPLE_VALUE_TYPE
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:230
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
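A minimal sketch (a hypothetical predicate, not taken from this file) of the MVT queries listed above, using only the accessors shown in this index.

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

// True for fixed-length integer vector types that fit in a 64-bit register.
static bool isNarrowIntVector(MVT VT) {
  return VT.isVector() && VT.isInteger() && VT.isFixedLengthVector() &&
         VT.getFixedSizeInBits() <= 64;
}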
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MachineInstr * remove_instr(MachineInstr *I)
Remove the possibly bundled instruction from the instruction list without deleting it.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
int getStackProtectorIndex() const
Return the index for the stack protector object.
int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
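A minimal sketch (a hypothetical helper; the opcode and registers are supplied by the caller rather than taken from this file) showing how the MachineInstrBuilder operand helpers above are chained when emitting a machine instruction, for example from a custom inserter.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Emit "Dst = Src op Imm" for a caller-supplied register/immediate opcode.
static void emitRegImmOp(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                         const DebugLoc &DL, const TargetInstrInfo &TII,
                         unsigned Opc, Register Dst, Register Src, int64_t Imm) {
  BuildMI(MBB, I, DL, TII.get(Opc), Dst)
      .addReg(Src)
      .addImm(Imm);
}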
Representation of each machine instruction.
Definition: MachineInstr.h:69
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition: MapVector.h:60
This class is used to represent an MGATHER node.
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getIndex() const
const SDValue & getScale() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
This class is used to represent an MSCATTER node.
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isVolatile() const
Align getOriginalAlign() const
Returns alignment and volatility of the memory access.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition: Module.cpp:700
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
void dump() const
Definition: Pass.cpp:136
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
void dump() const
Dump this node, for debugging.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
iterator_range< use_iterator > uses()
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Return true if the type of the node is undefined.
void setFlags(SDNodeFlags NewFlags)
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
void dump() const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
bool hasZAState() const
Class to represent scalable SIMD vectors.
Definition: DerivedTypes.h:586
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:227
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:737
unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
Definition: SelectionDAG.h:490
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold=true)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDNode * isConstantIntBuildVectorOrConstantInt(SDValue N) const
Test whether the given value is a constant int or similar node.
SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:494
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
Definition: SelectionDAG.h:747
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:843
bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:488
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
Definition: SelectionDAG.h:496
bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
Definition: SelectionDAG.h:676
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
Definition: SelectionDAG.h:877
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool SignBitIsZero(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero.
SDValue getRegister(unsigned Reg, EVT VT)
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:489
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:788
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:691
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:783
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:483
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:814
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:860
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
LLVMContext * getContext() const
Definition: SelectionDAG.h:501
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags)
Get the specified node if it's already available, or else return NULL.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:754
SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:571
SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
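A minimal sketch (a hypothetical lowering helper, not a routine from this file) of the SelectionDAG node-construction APIs listed above: constants, splats, and generic nodes are all created through the DAG.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Add a splatted constant to every lane of a vector SDValue.
static SDValue addSplatConstant(SelectionDAG &DAG, const SDLoc &DL, SDValue V,
                                uint64_t Imm) {
  EVT VT = V.getValueType();
  SDValue C = DAG.getConstant(Imm, DL, VT.getScalarType());
  SDValue Splat = DAG.getSplatBuildVector(VT, DL, C);
  return DAG.getNode(ISD::ADD, DL, VT, V, Splat);
}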
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
static bool isSplatMask(const int *Mask, EVT VT)
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
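A minimal sketch (hypothetical) of the IR-level shuffle-mask predicates listed above, checking whether a shufflevector simply reverses its single source vector.

#include "llvm/IR/Instructions.h"
using namespace llvm;

// True when the shuffle reverses the lanes of one source vector.
static bool isVectorReverse(const ShuffleVectorInst *SVI) {
  ArrayRef<int> Mask = SVI->getShuffleMask();
  return ShuffleVectorInst::isReverseMask(Mask, static_cast<int>(Mask.size()));
}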
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:344
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:479
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void push_back(const T &Elt)
Definition: SmallVector.h:426
pointer data()
Return a pointer to the vector's buffer, even if empty().
Definition: SmallVector.h:299
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
An instruction for storing to memory.
Definition: Instructions.h:290
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition: StringRef.h:455
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition: StringRef.h:669
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
StringRef save(const char *S)
Definition: StringSaver.h:30
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
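A minimal sketch (hypothetical; the mapping itself is illustrative) of the StringSwitch pattern listed above, as typically used when matching constraint or mnemonic strings.

#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

// Map a shift mnemonic to a small integer code, with ~0u meaning "unknown".
static unsigned parseShiftKind(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("lsl", 0)
      .Case("lsr", 1)
      .Case("asr", 2)
      .Default(~0u);
}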
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition: Type.cpp:373
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB) const
Returns the target-specific address of the unsafe stack pointer.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual Value * getSDagStackGuard(const Module &M) const
Return the variable that's previously inserted by insertSSPDeclarations, if any, otherwise return nul...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
virtual Function * getSSPStackGuardCheck(const Module &M) const
If the target has a standard stack protection check function that performs validation and error handl...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
virtual Value * getIRStackGuard(IRBuilderBase &IRB) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
void setLibcallName(RTLIB::Libcall Call, const char *Name)
Rename the default libcall routine name for the specified libcall.
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is a legal add immediate, that is, the target has add instruction...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const
Return true if it is profitable to reduce a load to a smaller type.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal or custom on this target.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
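A minimal sketch (a toy target, not the AArch64 configuration itself; MyTargetLowering, the register class, and the chosen opcodes are assumptions) of how the TargetLoweringBase configuration hooks listed above are typically invoked from a target's TargetLowering constructor.

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Toy subclass whose constructor demonstrates the configuration hooks;
// the register class and register info are supplied by the (hypothetical) caller.
class MyTargetLowering : public TargetLowering {
public:
  MyTargetLowering(const TargetMachine &TM, const TargetRegisterClass *GPR64RC,
                   const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    addRegisterClass(MVT::i64, GPR64RC);                    // i64 lives in GPRs
    setOperationAction(ISD::SDIV, MVT::i64, Expand);        // no native sdiv here
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, MVT::i8, Legal);
    setTargetDAGCombine(ISD::ADD);                          // request ADD combines
    computeRegisterProperties(TRI);                         // finalize derived info
  }
};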
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
virtual bool useLoadStackGuardNode() const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned NoNaNsFPMath
NoNaNsFPMath - This flag is enabled when the -enable-no-nans-fp-math flag is specified on the command...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isWindowsMSVCEnvironment() const
Checks if the environment could be MSVC.
Definition: Triple.h:636
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
static Type * getHalfTy(LLVMContext &C)
static Type * getDoubleTy(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition: Type.h:252
static Type * getBFloatTy(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
@ FloatTyID
32-bit floating point type
Definition: Type.h:58
@ DoubleTyID
64-bit floating point type
Definition: Type.h:59
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
static Type * getFloatTy(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getContainedType(unsigned i) const
This method is used to implement the type iterator (defined at the end of the file).
Definition: Type.h:377
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
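A minimal sketch (an illustrative predicate, not a rule taken from this file) of the llvm::Type queries listed above.

#include "llvm/IR/Type.h"
using namespace llvm;

// Integer or floating-point scalars of at most 64 bits.
static bool isNarrowScalarTy(Type *Ty) {
  return (Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
         !Ty->isVectorTy() && Ty->getScalarSizeInBits() <= 64;
}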
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
const Use & getOperandUse(unsigned i) const
Definition: User.h:182
Value * getOperand(unsigned i) const
Definition: User.h:169
unsigned getNumOperands() const
Definition: User.h:191
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
Definition: DerivedTypes.h:454
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
Definition: DerivedTypes.h:472
Type * getElementType() const
Definition: DerivedTypes.h:436
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition: TypeSize.h:254
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define UINT64_MAX
Definition: DataTypes.h:77
#define INT64_MAX
Definition: DataTypes.h:71
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
@ NVCAST
Natural vector cast.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
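A minimal sketch (a hypothetical wrapper, not from this file; the include path assumes a source file inside the AArch64 target directory) of the logical-immediate helpers listed above: the encoding is only requested once the value is known to be encodable.

#include "MCTargetDesc/AArch64AddressingModes.h"

// Return true and set Enc if Mask is encodable as an AArch64 logical
// immediate for the given register size (32 or 64 bits).
static bool tryEncodeLogicalImm(uint64_t Mask, unsigned RegSize, uint64_t &Enc) {
  if (!llvm::AArch64_AM::isLogicalImmediate(Mask, RegSize))
    return false;
  Enc = llvm::AArch64_AM::encodeLogicalImmediate(Mask, RegSize);
  return true;
}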
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
ArrayRef< MCPhysReg > getFPRArgRegs()
int getSMEPseudoMap(uint16_t Opcode)
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
Key
PAL metadata keys.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ Entry
Definition: COFF.h:811
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
Definition: CallingConv.h:265
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
Definition: CallingConv.h:221
@ Swift
Calling convention for Swift.
Definition: CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
Definition: CallingConv.h:224
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.

Definition: CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition: CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:241
@ CXX_FAST_TLS
Used for access functions.
Definition: CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
Definition: CallingConv.h:238
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition: CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition: CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition: CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition: CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
Definition: CallingConv.h:159
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition: CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
Definition: CallingConv.h:255
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
Definition: CallingConv.h:260
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition: ISDOpcodes.h:40
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:779
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:243
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to it returns an output chain.
Definition: ISDOpcodes.h:1169
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1165
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition: ISDOpcodes.h:490
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
Definition: ISDOpcodes.h:1382
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
Definition: ISDOpcodes.h:1330
@ VECREDUCE_SMIN
Definition: ISDOpcodes.h:1415
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:257
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:573
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:743
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition: ISDOpcodes.h:1198
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1284
@ STRICT_FCEIL
Definition: ISDOpcodes.h:440
@ STRICT_FTANH
Definition: ISDOpcodes.h:430
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1074
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1064
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:813
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:497
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:205
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ STRICT_FMINIMUM
Definition: ISDOpcodes.h:450
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:820
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:557
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
Definition: ISDOpcodes.h:1400
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
Definition: ISDOpcodes.h:1404
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:716
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
Definition: ISDOpcodes.h:1068
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ VECREDUCE_SMAX
Definition: ISDOpcodes.h:1414
@ STRICT_FSETCCS
Definition: ISDOpcodes.h:491
@ STRICT_FLOG2
Definition: ISDOpcodes.h:435
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1310
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:933
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:236
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1311
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
Definition: ISDOpcodes.h:1242
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:976
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition: ISDOpcodes.h:418
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1455
@ GlobalTLSAddress
Definition: ISDOpcodes.h:79
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:915
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:804
@ STRICT_FASIN
Definition: ISDOpcodes.h:425
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition: ISDOpcodes.h:684
@ STRICT_UINT_TO_FP
Definition: ISDOpcodes.h:464
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:634
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition: ISDOpcodes.h:107
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
Definition: ISDOpcodes.h:1397
@ STRICT_FATAN
Definition: ISDOpcodes.h:427
@ WRITE_REGISTER
Definition: ISDOpcodes.h:125
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1264
@ VECREDUCE_FMIN
Definition: ISDOpcodes.h:1401
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1031
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition: ISDOpcodes.h:787
@ STRICT_LROUND
Definition: ISDOpcodes.h:445
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:960
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1120
@ SSUBO
Same for subtraction.
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1095
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1099
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the same...
Definition: ISDOpcodes.h:600
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition: ISDOpcodes.h:660
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:756
@ STRICT_FPOWI
Definition: ISDOpcodes.h:420
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1280
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:218
@ VECREDUCE_UMAX
Definition: ISDOpcodes.h:1416
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition: ISDOpcodes.h:641
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition: ISDOpcodes.h:1194
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ STRICT_FTRUNC
Definition: ISDOpcodes.h:444
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
Definition: ISDOpcodes.h:1409
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition: ISDOpcodes.h:910
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1059
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:734
@ ATOMIC_LOAD_CLR
Definition: ISDOpcodes.h:1309
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:614
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition: ISDOpcodes.h:90
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1308
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:587
@ STRICT_FMAXIMUM
Definition: ISDOpcodes.h:449
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition: ISDOpcodes.h:47
@ STRICT_FMAXNUM
Definition: ISDOpcodes.h:438
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition: ISDOpcodes.h:124
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:549
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:209
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:810
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1254
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:886
@ STRICT_FMINNUM
Definition: ISDOpcodes.h:439
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:771
@ STRICT_FSINH
Definition: ISDOpcodes.h:428
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
Definition: ISDOpcodes.h:1372
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1291
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1008
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
Definition: ISDOpcodes.h:1258
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:338
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1084
@ STRICT_LRINT
Definition: ISDOpcodes.h:447
@ ConstantPool
Definition: ISDOpcodes.h:82
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:828
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:696
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition: ISDOpcodes.h:605
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:918
@ STRICT_FROUND
Definition: ISDOpcodes.h:442
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:765
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:310
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition: ISDOpcodes.h:463
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
Definition: ISDOpcodes.h:1342
@ VECREDUCE_UMIN
Definition: ISDOpcodes.h:1417
@ STRICT_FFLOOR
Definition: ISDOpcodes.h:441
@ STRICT_FROUNDEVEN
Definition: ISDOpcodes.h:443
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition: ISDOpcodes.h:100
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1306
@ STRICT_FP_TO_UINT
Definition: ISDOpcodes.h:457
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:479
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:456
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1027
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1307
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:866
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1225
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition: ISDOpcodes.h:164
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:484
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1251
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:190
@ STRICT_FCOSH
Definition: ISDOpcodes.h:429
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition: ISDOpcodes.h:679
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition: ISDOpcodes.h:407
@ STRICT_FLOG10
Definition: ISDOpcodes.h:434
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:538
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ STRICT_LLRINT
Definition: ISDOpcodes.h:448
@ VECTOR_SPLICE
VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as VEC1/VEC2 from CONCAT_VECTOR...
Definition: ISDOpcodes.h:626
@ STRICT_FEXP2
Definition: ISDOpcodes.h:432
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1305
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:981
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:899
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition: ISDOpcodes.h:112
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:421
@ STRICT_LLROUND
Definition: ISDOpcodes.h:446
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition: ISDOpcodes.h:861
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Definition: ISDOpcodes.h:1446
@ STRICT_FNEARBYINT
Definition: ISDOpcodes.h:437
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:885
@ VECREDUCE_FMINIMUM
Definition: ISDOpcodes.h:1405
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:816
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition: ISDOpcodes.h:1189
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1113
@ BlockAddress
Definition: ISDOpcodes.h:84
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:793
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:507
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ AssertZext
Definition: ISDOpcodes.h:62
@ STRICT_FRINT
Definition: ISDOpcodes.h:436
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2) - Returns two vectors with all input and output vectors having the sa...
Definition: ISDOpcodes.h:594
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition: ISDOpcodes.h:691
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
Definition: ISDOpcodes.h:1248
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:320
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:198
@ STRICT_FACOS
Definition: ISDOpcodes.h:426
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:529
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
Definition: ISDOpcodes.h:1649
bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
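A brief sketch showing the intended algebra of these two ISD condition-code helpers on an integer comparison; the resulting values noted in comments follow the usual ISD::CondCode semantics.

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"

using namespace llvm;

static void condCodeAlgebraExample() {
  EVT VT = MVT::i32;
  // !(X < Y)  ==>  X >= Y
  ISD::CondCode Inv = ISD::getSetCCInverse(ISD::SETLT, VT);     // ISD::SETGE
  // (Y < X)   ==>  X > Y
  ISD::CondCode Swp = ISD::getSetCCSwappedOperands(ISD::SETLT); // ISD::SETGT
  (void)Inv;
  (void)Swp;
}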
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
Definition: ISDOpcodes.h:1540
bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1527
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1578
bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1558
static const int LAST_INDEXED_MODE
Definition: ISDOpcodes.h:1529
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1513
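A hedged sketch of requesting an overloaded intrinsic declaration with this helper; the specific intrinsic chosen here (llvm.ctlz on i32) is just an example, not something this file requires.

#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Fetch (or create) the declaration of llvm.ctlz.i32 in module M.
static Function *getCtlzI32(Module &M) {
  Type *I32 = Type::getInt32Ty(M.getContext());
  return Intrinsic::getDeclaration(&M, Intrinsic::ctlz, {I32});
}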
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
VScaleVal_match m_VScale()
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
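A self-contained sketch of the IR-level matchers listed above, detecting (zext A) & B with the operands in either order; the wrapper name is illustrative only.

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true and binds A/B if V is (zext A) & B, commutatively.
static bool matchZExtAnd(Value *V, Value *&A, Value *&B) {
  return match(V, m_c_And(m_ZExt(m_Value(A)), m_Value(B)));
}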
Libcall
RTLIB::Libcall enum - This enum defines all of the runtime library calls the backend can emit.
@ Define
Register definition.
@ GeneralDynamic
Definition: CodeGen.h:46
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition: ObjCARCUtil.h:43
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition: ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
@ FalseVal
Definition: TGLexer.h:59
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool isPackedVectorType(EVT SomeVT)
Definition: VECustomDAG.cpp:22
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition: Utils.cpp:295
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:255
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition: MathExtras.h:359
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResult)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> or <1,...
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition: Utils.cpp:1528
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:346
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:285
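To make the relationship between these bit-manipulation predicates concrete, a short sketch; all helpers come from llvm/Support/MathExtras.h and llvm/ADT/bit.h.

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

static void bitPredicateExamples() {
  uint64_t V = 0x000000F000000000ULL;    // a single contiguous run of ones
  assert(llvm::isShiftedMask_64(V));     // ones are contiguous
  assert(!llvm::isMask_64(V));           // ...but do not start at bit 0
  assert(llvm::isPowerOf2_64(0x40));     // exactly one bit set
  assert(llvm::Log2_64(0x40) == 6);      // floor log base 2
  assert(llvm::countr_zero(V) == 36);    // trailing zero count
}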
unsigned M1(unsigned Val)
Definition: VE.h:376
bool isReleaseOrStronger(AtomicOrdering AO)
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition: MathExtras.h:273
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
constexpr int PoisonMaskElem
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
CombineLevel
Definition: DAGCombine.h:15
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
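A hedged sketch of querying one of these AArch64 shuffle-mask classifiers for a zip1-style interleave; the header that normally provides the declaration (assumed to be AArch64PerfectShuffle.h) and the exact WhichResult convention are assumptions here.

#include "llvm/ADT/ArrayRef.h"

namespace llvm {
// Declaration as listed above; normally provided by the AArch64 backend.
bool isZIPMask(ArrayRef<int> M, unsigned NumElts, unsigned &WhichResultOut);
} // namespace llvm

// Check an 8-element zip1 interleave pattern: <0, 8, 1, 9, 2, 10, 3, 11>.
static bool isZip1MaskExample() {
  int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult = 0; // distinguishes zip1 from zip2 on success
  return llvm::isZIPMask(Mask, /*NumElts=*/8, WhichResult);
}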
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition: VE.h:375
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:79
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
gep_type_iterator gep_type_begin(const User *GEP)
bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition: STLExtras.h:2051
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
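A small sketch of the range-based STLExtras helpers listed on this page (all_of, any_of, find_if, is_contained), using a SmallVector as the range.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

static void rangeHelperExamples() {
  llvm::SmallVector<int, 4> Vals = {2, 4, 6, 7};
  assert(llvm::any_of(Vals, [](int V) { return V % 2 != 0; })); // 7 is odd
  assert(!llvm::all_of(Vals, [](int V) { return V % 2 == 0; }));
  assert(llvm::is_contained(Vals, 6));
  auto It = llvm::find_if(Vals, [](int V) { return V > 5; });
  assert(It != Vals.end() && *It == 6);
}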
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
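A short sketch of the alignment arithmetic these helpers provide; both live in llvm/Support/Alignment.h.

#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignmentExamples() {
  llvm::Align A(16);
  // Round 21 up to the next multiple of 16.
  assert(llvm::alignTo(/*Size=*/21, A) == 32);
  // A 16-byte-aligned base plus an offset of 8 is only 8-byte aligned.
  assert(llvm::commonAlignment(A, /*Offset=*/8) == llvm::Align(8));
}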
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition: STLExtras.h:2039
static const MachineMemOperand::Flags MOStridedAccess
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
@ Enable
Enable colors.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
Helper structure to be able to read SetCC information.
static unsigned int semanticsPrecision(const fltSemantics &)
Definition: APFloat.cpp:323
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition: ValueTypes.h:34
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition: ValueTypes.h:93
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
uint64_t getScalarStoreSize() const
Definition: ValueTypes.h:387
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:448
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
EVT changeElementType(EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition: ValueTypes.h:112
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition: ValueTypes.h:349
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition: ValueTypes.h:415
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition: ValueTypes.h:455
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition: ValueTypes.h:203
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition: ValueTypes.h:366
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition: ValueTypes.h:429
bool isScalableVT() const
Return true if the type is a scalable type.
Definition: ValueTypes.h:183
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition: ValueTypes.h:58
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition: ValueTypes.h:282
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition: ValueTypes.h:208
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition: ValueTypes.h:101
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition: ValueTypes.h:438
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition: ValueTypes.h:198
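A compact sketch exercising a few of the EVT queries listed above on a fixed-width v4f32 type; the enclosing function is illustrative only.

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

using namespace llvm;

static void evtExamples(LLVMContext &Ctx) {
  EVT VT = EVT::getVectorVT(Ctx, MVT::f32, 4); // v4f32
  assert(VT.isVector() && VT.is128BitVector());
  assert(VT.getVectorNumElements() == 4);
  assert(VT.getScalarSizeInBits() == 32);
  // Same shape, but with integer elements: v4i32.
  EVT IntVT = VT.changeVectorElementTypeToInteger();
  assert(IntVT.getVectorElementType() == MVT::i32);
}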
Describes a register that needs to be forwarded from the prologue to a musttail call.
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition: KnownBits.h:290
static KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
Definition: KnownBits.cpp:428
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition: KnownBits.h:150
unsigned getBitWidth() const
Get the bit width of this value.
Definition: KnownBits.h:40
static KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
Definition: KnownBits.cpp:370
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:285
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition: KnownBits.h:300
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition: KnownBits.h:124
static KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
Definition: KnownBits.cpp:285
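A minimal sketch of the KnownBits queries referenced above, from llvm/Support/KnownBits.h; it shifts a fully-known constant by a fully-known amount, so the result stays a constant.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/KnownBits.h"
#include <cassert>

using namespace llvm;

static void knownBitsExamples() {
  // A fully-known 32-bit constant 0xF0.
  KnownBits K = KnownBits::makeConstant(APInt(32, 0xF0));
  assert(K.getBitWidth() == 32);
  // 0xF0 lshr 4 == 0x0F, and every bit of the result is known.
  KnownBits Sh = KnownBits::lshr(K, KnownBits::makeConstant(APInt(32, 4)));
  assert(Sh.isConstant() && Sh.getConstant() == 0x0F);
  // Intersecting known bits with themselves changes nothing.
  assert(K.intersectWith(K).isConstant());
}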
Structure used to represent pair of argument number after call lowering and register used to transfer...
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
Definition: PredicateInfo.h:74
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
bool CombineTo(SDValue O, SDValue N)
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64