AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130// in future when both implementations will be based off MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
 142// All of the XOR, OR and CMP use ALU ports, and the data dependency becomes the
 143// bottleneck after this transform on high-end CPUs. So this maximum leaf-node
 144// limit guards that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
 148// By turning this on, we will not fall back to DAG ISel when encountering
 149// scalable vector types for any instruction, even though SVE support is still
 150// incomplete for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
179
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207static inline EVT getPackedSVEVectorVT(ElementCount EC) {
 208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222static inline EVT getPromotedVTForPredicate(EVT VT) {
 223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
 248 return VT.isFixedLengthVector() ||
            VT.getSizeInBits().getKnownMinValue() == AArch64::SVEBitsPerBlock;
249}
250
251static inline bool isPackedPredicateType(EVT VT, SelectionDAG &DAG) {
253 "Expected legal type!");
254 return VT == MVT::nxv16i1;
255}
256
257/// Returns true if the conceptual representation for \p VT does not map
258/// directly to its physical register representation, meaning there are gaps
259/// between elements in the register. In practice, the vector elements will be
260/// strided by a power of two and placed starting from lane 0. For example,
261/// nxv8i1 or nxv2f32 are unpacked types.
262///
263///\pre VT is a legal type.
264static inline bool isUnpackedType(EVT VT, SelectionDAG &DAG) {
265 bool Res = !isPackedVectorType(VT, DAG) && !isPackedPredicateType(VT, DAG);
266 assert((!Res || VT.isScalableVector()) &&
267 "Unexpected fixed-size unpacked type.");
268 return Res;
269}
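// Editorial illustration (not part of the upstream source): how the three
// classification helpers above behave for a few SVE types, assuming the
// minimum 128-bit SVE register granule described in the comments:
//   isPackedVectorType(MVT::nxv8f16, DAG)     // true  - fills every lane
//   isUnpackedType(MVT::nxv4f16, DAG)         // true  - every other lane used
//   isPackedPredicateType(MVT::nxv16i1, DAG)  // true  - densest predicate form
//   isPackedPredicateType(MVT::nxv8i1, DAG)   // false - strided predicate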
270
 271// Returns true for *_MERGE_PASSTHRU opcodes, whose operands have a leading
 272// predicate and end with a passthru value matching the result type.
273static bool isMergePassthruOpcode(unsigned Opc) {
274 switch (Opc) {
275 default:
276 return false;
277 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
278 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
279 case AArch64ISD::REVH_MERGE_PASSTHRU:
280 case AArch64ISD::REVW_MERGE_PASSTHRU:
281 case AArch64ISD::REVD_MERGE_PASSTHRU:
282 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
283 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
284 case AArch64ISD::DUP_MERGE_PASSTHRU:
285 case AArch64ISD::ABS_MERGE_PASSTHRU:
286 case AArch64ISD::NEG_MERGE_PASSTHRU:
287 case AArch64ISD::FNEG_MERGE_PASSTHRU:
288 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
289 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
290 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
291 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
292 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
293 case AArch64ISD::FRINT_MERGE_PASSTHRU:
294 case AArch64ISD::FRINT32_MERGE_PASSTHRU:
295 case AArch64ISD::FRINT64_MERGE_PASSTHRU:
296 case AArch64ISD::FROUND_MERGE_PASSTHRU:
297 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
298 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
299 case AArch64ISD::FTRUNC32_MERGE_PASSTHRU:
300 case AArch64ISD::FTRUNC64_MERGE_PASSTHRU:
301 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
302 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
303 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
304 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
305 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
306 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
307 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
308 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
309 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
310 case AArch64ISD::FABS_MERGE_PASSTHRU:
311 return true;
312 }
313}
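// Editorial note: a concrete reading of the operand convention above. For
// example, an AArch64ISD::FCEIL_MERGE_PASSTHRU node carries
// (Predicate, Source, Passthru); active lanes produce ceil(Source) and
// inactive lanes are taken from Passthru, whose type matches the result.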
314
315// Returns true if inactive lanes are known to be zeroed by construction.
316static bool isZeroingInactiveLanes(SDValue Op) {
 317 switch (Op.getOpcode()) {
318 default:
319 return false;
320 // We guarantee i1 splat_vectors to zero the other lanes
323 case AArch64ISD::PTRUE:
324 case AArch64ISD::SETCC_MERGE_ZERO:
325 return true;
 326 case ISD::INTRINSIC_WO_CHAIN:
 327 switch (Op.getConstantOperandVal(0)) {
328 default:
329 return false;
330 case Intrinsic::aarch64_sve_ptrue:
331 case Intrinsic::aarch64_sve_pnext:
332 case Intrinsic::aarch64_sve_cmpeq:
333 case Intrinsic::aarch64_sve_cmpne:
334 case Intrinsic::aarch64_sve_cmpge:
335 case Intrinsic::aarch64_sve_cmpgt:
336 case Intrinsic::aarch64_sve_cmphs:
337 case Intrinsic::aarch64_sve_cmphi:
338 case Intrinsic::aarch64_sve_cmpeq_wide:
339 case Intrinsic::aarch64_sve_cmpne_wide:
340 case Intrinsic::aarch64_sve_cmpge_wide:
341 case Intrinsic::aarch64_sve_cmpgt_wide:
342 case Intrinsic::aarch64_sve_cmplt_wide:
343 case Intrinsic::aarch64_sve_cmple_wide:
344 case Intrinsic::aarch64_sve_cmphs_wide:
345 case Intrinsic::aarch64_sve_cmphi_wide:
346 case Intrinsic::aarch64_sve_cmplo_wide:
347 case Intrinsic::aarch64_sve_cmpls_wide:
348 case Intrinsic::aarch64_sve_fcmpeq:
349 case Intrinsic::aarch64_sve_fcmpne:
350 case Intrinsic::aarch64_sve_fcmpge:
351 case Intrinsic::aarch64_sve_fcmpgt:
352 case Intrinsic::aarch64_sve_fcmpuo:
353 case Intrinsic::aarch64_sve_facgt:
354 case Intrinsic::aarch64_sve_facge:
355 case Intrinsic::aarch64_sve_whilege:
356 case Intrinsic::aarch64_sve_whilegt:
357 case Intrinsic::aarch64_sve_whilehi:
358 case Intrinsic::aarch64_sve_whilehs:
359 case Intrinsic::aarch64_sve_whilele:
360 case Intrinsic::aarch64_sve_whilelo:
361 case Intrinsic::aarch64_sve_whilels:
362 case Intrinsic::aarch64_sve_whilelt:
363 case Intrinsic::aarch64_sve_match:
364 case Intrinsic::aarch64_sve_nmatch:
365 case Intrinsic::aarch64_sve_whilege_x2:
366 case Intrinsic::aarch64_sve_whilegt_x2:
367 case Intrinsic::aarch64_sve_whilehi_x2:
368 case Intrinsic::aarch64_sve_whilehs_x2:
369 case Intrinsic::aarch64_sve_whilele_x2:
370 case Intrinsic::aarch64_sve_whilelo_x2:
371 case Intrinsic::aarch64_sve_whilels_x2:
372 case Intrinsic::aarch64_sve_whilelt_x2:
373 return true;
374 }
375 }
376}
377
378static std::tuple<SDValue, SDValue>
379extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
 380 SDLoc DL(Disc);
381 SDValue AddrDisc;
382 SDValue ConstDisc;
383
384 // If this is a blend, remember the constant and address discriminators.
385 // Otherwise, it's either a constant discriminator, or a non-blended
386 // address discriminator.
387 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
388 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
389 AddrDisc = Disc->getOperand(1);
390 ConstDisc = Disc->getOperand(2);
391 } else {
392 ConstDisc = Disc;
393 }
394
395 // If the constant discriminator (either the blend RHS, or the entire
396 // discriminator value) isn't a 16-bit constant, bail out, and let the
397 // discriminator be computed separately.
398 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
399 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
400 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
401
402 // If there's no address discriminator, use NoRegister, which we'll later
403 // replace with XZR, or directly use a Z variant of the inst. when available.
404 if (!AddrDisc)
405 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
406
407 return std::make_tuple(
408 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
409 AddrDisc);
410}
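// Editorial sketch of the decomposition performed above (IR names are
// illustrative, not taken from this file):
//   %d = call i64 @llvm.ptrauth.blend(i64 %addr, i64 1234)
//     -> { TargetConstant(1234), %addr }
//   %d = i64 1234                       ; constant that fits in 16 bits
//     -> { TargetConstant(1234), NoRegister (later XZR or a Z-variant) }
//   %d = any other discriminator        ; e.g. a constant wider than 16 bits
//     -> { TargetConstant(0), %d }      ; discriminator computed separately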
411
412AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 413 const AArch64Subtarget &STI)
414 : TargetLowering(TM, STI), Subtarget(&STI) {
415 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
416 // we have to make something up. Arbitrarily, choose ZeroOrOne.
418 // When comparing vectors the result sets the different elements in the
419 // vector to all-one or all-zero.
421
422 // Set up the register classes.
423 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
424 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
425
426 if (Subtarget->hasLS64()) {
427 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
428 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
430 }
431
432 if (Subtarget->hasFPARMv8()) {
433 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
434 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
435 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
436 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
437 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
438 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
439 }
440
441 if (Subtarget->hasNEON()) {
442 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
443 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
444
445 addDRType(MVT::v2f32);
446 addDRType(MVT::v8i8);
447 addDRType(MVT::v4i16);
448 addDRType(MVT::v2i32);
449 addDRType(MVT::v1i64);
450 addDRType(MVT::v1f64);
451 addDRType(MVT::v4f16);
452 addDRType(MVT::v4bf16);
453
454 addQRType(MVT::v4f32);
455 addQRType(MVT::v2f64);
456 addQRType(MVT::v16i8);
457 addQRType(MVT::v8i16);
458 addQRType(MVT::v4i32);
459 addQRType(MVT::v2i64);
460 addQRType(MVT::v8f16);
461 addQRType(MVT::v8bf16);
462 }
463
464 if (Subtarget->isSVEorStreamingSVEAvailable()) {
465 // Add legal sve predicate types
466 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
467 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
468 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
469 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
470 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
471
472 // Add sve predicate as counter type
473 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
474
475 // Add legal sve data types
476 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
477 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
478 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
479 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
480
481 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
482 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
483 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
484 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
485 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
486 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
487
488 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
489 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
490 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
491
492 if (Subtarget->useSVEForFixedLengthVectors()) {
495 addRegisterClass(VT, &AArch64::ZPRRegClass);
496
499 addRegisterClass(VT, &AArch64::ZPRRegClass);
500 }
501 }
502
503 // Compute derived properties from the register classes
504 computeRegisterProperties(Subtarget->getRegisterInfo());
505
506 // Provide all sorts of operation actions
534 if (Subtarget->hasFPARMv8()) {
537 }
550
552
556
559
561
562 // Custom lowering hooks are needed for XOR
563 // to fold it into CSINC/CSINV.
566
569
570 // Virtually no operation on f128 is legal, but LLVM can't expand them when
571 // there's a valid register class, so we need custom operations in most cases.
596 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
597 // aren't handled.
598
599 // Lowering for many of the conversions is actually specified by the non-f128
600 // type. The LowerXXX function will be trivial when f128 isn't involved.
625 if (Subtarget->hasFPARMv8()) {
628 }
631 if (Subtarget->hasFPARMv8()) {
634 }
637
642
643 // Variable arguments.
648
649 // Variable-sized objects.
652
653 // Lowering Funnel Shifts to EXTR
658
660
661 // Constant pool entries
663
664 // BlockAddress
666
667 // AArch64 lacks both left-rotate and popcount instructions.
673 }
674
675 // AArch64 doesn't have i32 MULH{S|U}.
678
679 // AArch64 doesn't have {U|S}MUL_LOHI.
684
685 if (Subtarget->hasCSSC()) {
689
691
695
698
703
708 } else {
712
715
718 }
719
725 }
732
733 // Custom lower Add/Sub/Mul with overflow.
746
755
764 if (Subtarget->hasFullFP16()) {
767 } else {
770 }
771
772 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
786 setOperationAction(Op, MVT::f16, Promote);
787 setOperationAction(Op, MVT::v4f16, Expand);
788 setOperationAction(Op, MVT::v8f16, Expand);
789 setOperationAction(Op, MVT::bf16, Promote);
790 setOperationAction(Op, MVT::v4bf16, Expand);
791 setOperationAction(Op, MVT::v8bf16, Expand);
792 }
793
794 // Legalize fcanonicalize to circumvent default expansion
795 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
796 if (Subtarget->hasFullFP16()) {
798 }
799
800 // fpextend from f16 or bf16 to f32 is legal
805 // fpextend from bf16 to f64 needs to be split into two fpextends
808
809 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
810 for (auto Op : {
814 ISD::FADD,
815 ISD::FSUB,
816 ISD::FMUL,
817 ISD::FDIV,
818 ISD::FMA,
851 })
852 setOperationAction(Op, ScalarVT, Promote);
853
854 for (auto Op : {ISD::FNEG, ISD::FABS})
855 setOperationAction(Op, ScalarVT, Legal);
856
 857 // Round-to-integer operations need custom lowering for fp16, as Promote
 858 // doesn't work because the result type is integer.
862 setOperationAction(Op, ScalarVT, Custom);
863
864 // promote v4f16 to v4f32 when that is known to be safe.
865 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
866 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
867 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
868 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
869 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
870 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
871 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
872 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
873 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
874 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
875 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
876 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
877 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
878 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
879
888
889 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
890 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
891 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
892 setOperationPromotedToType(ISD::VECREDUCE_FADD, V8Narrow, MVT::v8f32);
893 setOperationPromotedToType(ISD::VECREDUCE_FMUL, V8Narrow, MVT::v8f32);
894
915 };
916
917 if (!Subtarget->hasFullFP16()) {
918 LegalizeNarrowFP(MVT::f16);
919 }
920 LegalizeNarrowFP(MVT::bf16);
923
924 // AArch64 has implementations of a lot of rounding-like FP operations.
925 // clang-format off
926 for (auto Op :
938 for (MVT Ty : {MVT::f32, MVT::f64})
940 if (Subtarget->hasFullFP16())
941 setOperationAction(Op, MVT::f16, Legal);
942 }
943 // clang-format on
944
945 // Basic strict FP operations are legal
948 for (MVT Ty : {MVT::f32, MVT::f64})
950 if (Subtarget->hasFullFP16())
951 setOperationAction(Op, MVT::f16, Legal);
952 }
953
955
961
963 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
966 } else {
969 }
972
973 // Generate outline atomics library calls only if LSE was not specified for
974 // subtarget
975 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
1001 }
1002
1003 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
1008
1013
1018
1023
1028 }
1029
1030 if (Subtarget->hasLSE128()) {
1031 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1032 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1036 }
1037
1038 // 128-bit loads and stores can be done without expanding
1039 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1041
1042 // Aligned 128-bit loads and stores are single-copy atomic according to the
1043 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1044 if (Subtarget->hasLSE2()) {
1047 }
1048
1049 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1050 // custom lowering, as there are no un-paired non-temporal stores and
1051 // legalization will break up 256 bit inputs.
1052 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1053 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1054 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1055 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1056 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1057 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1058 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1059 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1060
 1061 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
 1062 // custom lowering, as there are no un-paired non-temporal loads and
 1063 // legalization will break up 256 bit inputs.
1064 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1065 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1066 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1067 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1068 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1069 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1070 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1071 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1072
1073 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1075
1076 // Issue __sincos_stret if available.
1079
1080 // Make floating-point constants legal for the large code model, so they don't
1081 // become loads from the constant pool.
1082 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1085 }
1086
1087 // AArch64 does not have floating-point extending loads, i1 sign-extending
1088 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1089 for (MVT VT : MVT::fp_valuetypes()) {
1090 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1091 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1092 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1093 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1094 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1095 }
1096 for (MVT VT : MVT::integer_valuetypes())
1097 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1098
1099 for (MVT WideVT : MVT::fp_valuetypes()) {
1100 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1101 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1102 setTruncStoreAction(WideVT, NarrowVT, Expand);
1103 }
1104 }
1105 }
1106
1107 if (Subtarget->hasFPARMv8()) {
1111 }
1112
1113 // Indexed loads and stores are supported.
1114 for (unsigned im = (unsigned)ISD::PRE_INC;
1116 setIndexedLoadAction(im, MVT::i8, Legal);
1117 setIndexedLoadAction(im, MVT::i16, Legal);
1118 setIndexedLoadAction(im, MVT::i32, Legal);
1119 setIndexedLoadAction(im, MVT::i64, Legal);
1120 setIndexedLoadAction(im, MVT::f64, Legal);
1121 setIndexedLoadAction(im, MVT::f32, Legal);
1122 setIndexedLoadAction(im, MVT::f16, Legal);
1123 setIndexedLoadAction(im, MVT::bf16, Legal);
1124 setIndexedStoreAction(im, MVT::i8, Legal);
1125 setIndexedStoreAction(im, MVT::i16, Legal);
1126 setIndexedStoreAction(im, MVT::i32, Legal);
1127 setIndexedStoreAction(im, MVT::i64, Legal);
1128 setIndexedStoreAction(im, MVT::f64, Legal);
1129 setIndexedStoreAction(im, MVT::f32, Legal);
1130 setIndexedStoreAction(im, MVT::f16, Legal);
1131 setIndexedStoreAction(im, MVT::bf16, Legal);
1132 }
1133
1134 // Trap.
1135 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1138
1139 // We combine OR nodes for ccmp operations.
1141 // Try to create BICs for vector ANDs.
1143
1144 // llvm.init.trampoline and llvm.adjust.trampoline
1147
1148 // Vector add and sub nodes may conceal a high-half opportunity.
1149 // Also, try to fold ADD into CSINC/CSINV..
1152
1155
1156 // Try and combine setcc/select_cc with csel and bool-vector bitcasts.
1159
1161
1169
1171
1173
1175
1179
1182
1184
1186
1188
1190
1196
1198
1202
1203 // In case of strict alignment, avoid an excessive number of byte wide stores.
1206 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1207
1211 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1212
1215 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1216
1219 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1220
1222
1224
1225 EnableExtLdPromotion = true;
1226
1227 // Set required alignment.
1229 // Set preferred alignments.
1230
1231 // Don't align loops on Windows. The SEH unwind info generation needs to
1232 // know the exact length of functions before the alignments have been
1233 // expanded.
1234 if (!Subtarget->isTargetWindows())
1238
1239 // Only change the limit for entries in a jump table if specified by
1240 // the sub target, but not at the command line.
1241 unsigned MaxJT = STI.getMaximumJumpTableSize();
1242 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1244
1246
1248
1250 if (Subtarget->hasSME())
1252
1253 if (Subtarget->isNeonAvailable()) {
1254 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1255 // silliness like this:
1256 // clang-format off
1257 for (auto Op :
1278 setOperationAction(Op, MVT::v1f64, Expand);
1279 // clang-format on
1280
1281 for (auto Op :
1286 setOperationAction(Op, MVT::v1i64, Expand);
1287
1288 // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1289 // elements smaller than i32, so promote the input to i32 first.
1290 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1291 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1292
1293 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
 1294 // Nor a direct i32 -> f16 vector conversion. Set it to Custom, so the
1295 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1298 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1300
1301 if (Subtarget->hasFullFP16()) {
1304
1313 } else {
1314 // when AArch64 doesn't have fullfp16 support, promote the input
1315 // to i32 first.
1316 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1317 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1318 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1319 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1320 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1321 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1322 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1323 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1324 }
1325
1326 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1327 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
 1328 // CTLS (Count Leading Sign bits) is legal for B/H/S types (8/16/32-bit
 1329 // elements); there is no hardware support for 64-bit element vectors.
1330 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1331 MVT::v4i32})
1339 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1344 }
1345
1346 // Custom handling for some quad-vector types to detect MULL.
1347 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1348 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1349 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1350 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1351 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1352 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1353
1354 // Saturates
1355 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1356 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1361 }
1362
1363 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1364 MVT::v4i32}) {
1371 }
1372
1373 // Vector reductions
1374 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1375 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1376 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1381
1383 }
1384 }
1385 if (Subtarget->hasFullFP16())
1387
1388 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1389 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1398 }
1403
1405 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1406 // Likewise, narrowing and extending vector loads/stores aren't handled
1407 // directly.
1410
1411 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1414 } else {
1417 }
1420
1421 if (VT == MVT::v4i16 || VT == MVT::v8i16 || VT == MVT::v2i32 ||
1422 VT == MVT::v4i32 || VT == MVT::v2i64)
1424 else
1426
1427 if (VT == MVT::v8i8 || VT == MVT::v16i8 || VT == MVT::v8i16 ||
1428 VT == MVT::v4i16 || VT == MVT::v2i32 || VT == MVT::v4i32)
1430 else
1432
1433 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1434 setTruncStoreAction(VT, InnerVT, Expand);
1435 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1436 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1437 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1438 }
1439 }
1440
1441 for (auto Op :
1447 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1449 if (Subtarget->hasFullFP16())
1450 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1452 }
1453
1454 // LRINT and LLRINT.
1455 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1456 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1458 if (Subtarget->hasFullFP16())
1459 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1461 }
1462
1463 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1464
1469
1473
1474 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1475 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1476 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom);
1477 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1478 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1479 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom);
1480 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1481 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1482 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1483 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1484 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1485 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1486 setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1487 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1488 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom);
1489 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1490 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1491 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom);
1492
1493 // ADDP custom lowering
1494 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1496 // FADDP custom lowering
1497 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1499
1500 if (Subtarget->hasDotProd()) {
1501 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1503
1504 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1505 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1506 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1507 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1508
1509 if (Subtarget->hasMatMulInt8()) {
1511 MVT::v16i8, Legal);
1513 MVT::v16i8, Custom);
1514
1516 MVT::v8i8, Legal);
1517 }
1518 }
1519
1520 if (Subtarget->hasF16F32DOT() || Subtarget->hasFP16FML()) {
1522 MVT::v4f16, Legal);
1524 MVT::v8f16, Legal);
1525 }
1526
1527 if (Subtarget->hasBF16())
1529 MVT::v8bf16, Legal);
1530
1532 setOperationAction(ISD::CLMUL, {MVT::v8i8, MVT::v16i8}, Legal);
1533 if (Subtarget->hasAES()) {
1534 setOperationAction(ISD::CLMUL, {MVT::i16, MVT::i32, MVT::i64}, Custom);
1535 setOperationAction(ISD::CLMUL, {MVT::v1i64, MVT::v2i64}, Legal);
1536 setOperationAction(ISD::CLMULH, {MVT::v1i64, MVT::v2i64}, Legal);
1537 }
1538
1539 } else /* !isNeonAvailable */ {
1541 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1543
1544 if (VT.is128BitVector() || VT.is64BitVector()) {
1548 Subtarget->isLittleEndian() ? Legal : Expand);
1549 }
1550 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1551 setTruncStoreAction(VT, InnerVT, Expand);
1552 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1553 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1554 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1555 }
1556 }
1557 }
1558
1559 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1563 }
1564
1565 if (Subtarget->hasSME()) {
1567 }
1568
1569 // FIXME: Move lowering for more nodes here if those are common between
1570 // SVE and SME.
1571 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1572 for (auto VT :
1573 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1578 }
1579 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1581 Custom);
1584 }
1585
1586 if (Subtarget->isSVEorStreamingSVEAvailable() &&
1587 (Subtarget->hasSVE2p1() || Subtarget->hasSME2()))
1589
1590 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1592
1593 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v2f64})
1595 }
1596
1597 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1598 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1642
1648
1657
1662
1666
1667 if (!Subtarget->isLittleEndian())
1669
1670 if (Subtarget->hasSVE2() ||
1671 (Subtarget->hasSME() && Subtarget->isStreaming()))
1672 // For SLI/SRI.
1674 }
1675
1676 for (auto VT : {MVT::nxv4i32, MVT::nxv2i64}) {
1679 }
1680
1681 // Illegal unpacked integer vector types.
1682 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1685 }
1686
1687 // Type legalize unpacked bitcasts.
1688 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1690
1691 for (auto VT :
1692 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1693 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1695
1696 // Promote predicate as counter load/stores to standard predicates.
1697 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1698 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1699
1700 // Predicate as counter legalization actions.
1701 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1702 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1703
1704 for (auto VT :
1705 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1713
1717
1718 // There are no legal MVT::nxv16f## based types.
1719 if (VT != MVT::nxv16i1) {
1724 }
1725 }
1726
1727 // NEON doesn't support masked loads/stores, but SME and SVE do.
1728 for (auto VT :
1729 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1730 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1731 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1734 }
1735
 1736 // Firstly, exclude all scalable vector extending loads/truncating stores,
 1737 // including both integer and floating-point scalable vectors.
1739 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1740 setTruncStoreAction(VT, InnerVT, Expand);
1741 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1742 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1743 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1744 }
1745 }
1746
1747 // Then, selectively enable those which we directly support.
1748 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1749 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1750 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1751 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1752 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1753 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1754 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1755 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1756 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1757 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1758 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1759 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1760 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1761 }
1762
1763 // SVE supports truncating stores of 64 and 128-bit vectors
1764 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1765 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1766 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1767 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1768 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1769
1770 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1771 MVT::nxv4f32, MVT::nxv2f64}) {
1816
1839
1851 }
1852
1853 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1870 }
1871
1872 if (Subtarget->hasSVEB16B16() &&
1873 Subtarget->isNonStreamingSVEorSME2Available()) {
1874 // Note: Use SVE for bfloat16 operations when +sve-b16b16 is available.
1875 for (auto VT : {MVT::v4bf16, MVT::v8bf16, MVT::nxv2bf16, MVT::nxv4bf16,
1876 MVT::nxv8bf16}) {
1885 }
1886 }
1887
1888 for (auto Opcode :
1893 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1894 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1895 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1896 }
1897
1898 if (!Subtarget->hasSVEB16B16() ||
1899 !Subtarget->isNonStreamingSVEorSME2Available()) {
1900 for (MVT VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1901 MVT PromotedVT = VT.changeVectorElementType(MVT::f32);
1902 setOperationPromotedToType(ISD::FADD, VT, PromotedVT);
1903 setOperationPromotedToType(ISD::FMA, VT, PromotedVT);
1908 setOperationPromotedToType(ISD::FSUB, VT, PromotedVT);
1909
1910 if (VT != MVT::nxv2bf16 && Subtarget->hasBF16())
1912 else
1913 setOperationPromotedToType(ISD::FMUL, VT, PromotedVT);
1914 }
1915
1916 if (Subtarget->hasBF16() && Subtarget->isNeonAvailable())
1917 setOperationAction(ISD::FMUL, MVT::v8bf16, Custom);
1918 }
1919
1922
1923 // A number of operations like MULH and integer divides are not supported by
1924 // NEON but are available in SVE.
1925 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1926 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1933 }
1934
1935 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1936 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1937 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1938
1939 // With SVE2 we can try lowering these to pairwise operations (e.g. smaxp).
1940 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
1945 }
1946
1947 // NOTE: Currently this has to happen after computeRegisterProperties rather
1948 // than the preferred option of combining it with the addRegisterClass call.
1949 if (Subtarget->useSVEForFixedLengthVectors()) {
1952 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1953 addTypeForFixedLengthSVE(VT);
1954 }
1957 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1958 addTypeForFixedLengthSVE(VT);
1959 }
1960
 1961 // 64-bit results can mean a bigger-than-NEON input.
1962 for (auto VT : {MVT::v8i8, MVT::v4i16})
1965
 1966 // 128-bit results imply a bigger-than-NEON input.
1967 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1969 for (auto VT : {MVT::v8f16, MVT::v4f32, MVT::v8bf16})
1971
1972 // These operations are not supported on NEON but SVE can do them.
1974 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1975 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1976 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1977 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1978 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1979 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1980 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1981 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1982 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1983 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1984 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1989
1990 // Int operations with no NEON support.
1991 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1992 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1998 }
1999
2000 // Use SVE for vectors with more than 2 elements.
2001 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
2003 }
2004
2006 MVT::nxv2i64);
2008 MVT::nxv2i64);
2010 MVT::nxv4i32);
2012 MVT::nxv4i32);
2014 MVT::nxv8i16);
2016 MVT::nxv8i16);
2018 MVT::nxv16i8);
2020 MVT::nxv16i8);
2021
2023
2024 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
2027 VT, Custom);
2028 }
2029
2030 // Handle partial reduction operations
2031 if (Subtarget->isSVEorStreamingSVEAvailable()) {
2032 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
2033 // Other pairs will default to 'Expand'.
2034 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2036 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
2037 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
2038
2039 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
2040
2041 if (Subtarget->hasMatMulInt8()) {
2043 MVT::nxv16i8, Legal);
2045 MVT::nxv16i8, Custom);
2046 }
2047
2048 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
2049 // Wide add types
2050 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
2051 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
2052 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
2053
2054 setOperationAction(ISD::CLMUL, {MVT::nxv16i8, MVT::nxv4i32}, Legal);
2055
2057 MVT::nxv8f16, Legal);
2058
2059 // We can use SVE2p1 fdot or SVE2 fmlalb/t to emulate the fixed-length
2060 // variant (unless NEON fdot is natively available).
2061 if (!Subtarget->isNeonAvailable() ||
2062 (!Subtarget->hasF16F32DOT() && !Subtarget->hasFP16FML())) {
2064 MVT::v8f16, Custom);
2066 MVT::v4f16, Custom);
2067 }
2068 }
2069
2070 if (Subtarget->hasBF16())
2072 MVT::nxv8bf16, Legal);
2073 }
2074
2075 if (Subtarget->hasSVEAES() &&
2076 (Subtarget->isSVEAvailable() || Subtarget->hasSSVE_AES()))
2077 setOperationAction(ISD::CLMUL, MVT::nxv2i64, Legal);
2078
2079 // Handle non-aliasing elements mask
2080 if (Subtarget->hasSVE2() ||
2081 (Subtarget->hasSME() && Subtarget->isStreaming())) {
2082 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
2083 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
2086 }
2087 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
2090 }
2091 }
2092
2093 // Handle operations that are only available in non-streaming SVE mode.
2094 if (Subtarget->isSVEAvailable()) {
2095 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
2096 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2097 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
2098 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
2099 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
2100 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2101 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
2104 }
2105
2106 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
2107 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
2108 MVT::v2f32, MVT::v4f32, MVT::v2f64})
2110
2111 // We can lower types that have <vscale x {2|4}> elements to compact.
2112 for (auto VT :
2113 {MVT::nxv4i32, MVT::nxv2i64, MVT::nxv2f32, MVT::nxv4f32, MVT::nxv2f64})
2115
2116 // If we have SVE, we can use SVE logic for legal NEON vectors in the lowest
2117 // bits of the SVE register.
2118 for (auto VT : {MVT::v2i32, MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32,
2119 MVT::v2f64})
2121
 2122 // Promote v4i16/f16 to v4i32/f32 as the SVE container for v4i16 is nxv8i16,
 2123 // which is not supported for compact (with only +sve).
2124 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4bf16, MVT::v4i16);
2125 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4f16, MVT::v4i16);
2126 setOperationPromotedToType(ISD::VECTOR_COMPRESS, MVT::v4i16, MVT::v4i32);
2127
2128 for (auto VT : {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64,
2129 MVT::nxv2f32, MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16,
2130 MVT::nxv4i32, MVT::nxv4f32}) {
2131 // Use a custom lowering for masked stores that could be a supported
2132 // compressing store. Note: These types still use the normal (Legal)
2133 // lowering for non-compressing masked stores.
2135 }
2136
2137 // Histcnt is SVE2 only
2138 if (Subtarget->hasSVE2()) {
2140 Custom);
2142 Custom);
2143
2144 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2146 // Must be lowered to SVE instructions.
2147 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
2148 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
2149 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
2150 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
2151 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
2152 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
2153 }
2154 }
2155
2156 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
2157 // Only required for llvm.aarch64.mops.memset.tag
2159 }
2160
2162
2163 if (Subtarget->hasSVE()) {
2168 }
2169
2170 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2171
2172 IsStrictFPEnabled = true;
2174
2175 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2176 // it, but it's just a wrapper around ldexp.
2177 if (Subtarget->isTargetWindows()) {
2179 if (isOperationExpand(Op, MVT::f32))
2180 setOperationAction(Op, MVT::f32, Promote);
2181 }
2182
2183 // LegalizeDAG currently can't expand fp16/bf16 LDEXP/FREXP on targets where
2184 // i16 isn't legal.
2186 if (isOperationExpand(Op, MVT::f16))
2187 setOperationAction(Op, MVT::f16, Promote);
2188 if (isOperationExpand(Op, MVT::bf16))
2189 setOperationAction(Op, MVT::bf16, Promote);
2190 }
2191}
2192
2194 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2195}
2196
2197void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2198 assert(VT.isVector() && "VT should be a vector type");
2199
2200 if (VT.isFloatingPoint()) {
2202 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2203 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2204 }
2205
2206 // Mark vector float intrinsics as expand.
2207 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2226 }
2227
2228 // But we do support custom-lowering for FCOPYSIGN.
2229 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2230 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2231 VT == MVT::v8f16) &&
2232 Subtarget->hasFullFP16()))
2234
2249
2253 for (MVT InnerVT : MVT::all_valuetypes())
2254 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2255
 2256 // CNT supports only B element sizes, so use UADDLP to widen.
2257 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2259
2265
2266 for (unsigned Opcode :
2269 setOperationAction(Opcode, VT, Custom);
2270
2271 if (!VT.isFloatingPoint())
2273
2274 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2275 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2276 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2277 setOperationAction(Opcode, VT, Legal);
2278
2279 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2280 // NEON types.
2281 if (VT.isFloatingPoint() &&
2282 VT.getVectorElementType() != MVT::bf16 &&
2283 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2284 for (unsigned Opcode :
2290 setOperationAction(Opcode, VT, Legal);
2291
2292 // Strict fp extend and trunc are legal
2293 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2295 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2297
2298 // FIXME: We could potentially make use of the vector comparison instructions
 2299 // for STRICT_FSETCC and STRICT_FSETCCS, but there's a number of
2300 // complications:
2301 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2302 // so we would need to expand when the condition code doesn't match the
2303 // kind of comparison.
2304 // * Some kinds of comparison require more than one FCMXY instruction so
2305 // would need to be expanded instead.
2306 // * The lowering of the non-strict versions involves target-specific ISD
2307 // nodes so we would likely need to add strict versions of all of them and
2308 // handle them appropriately.
2311
2312 // When little-endian we can use ordinary d and q register loads/stores for
2313 // vector types, but when big-endian we need to use structure load/store which
2314 // only allow post-index addressing.
2315 if (Subtarget->isLittleEndian()) {
2316 for (unsigned im = (unsigned)ISD::PRE_INC;
2317 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2320 }
2321 } else {
2324 }
2325
2326 if (Subtarget->hasD128()) {
2329 }
2330
2331 if (VT.isInteger()) {
2332 // Let common code emit inverted variants of compares we do support.
2338 }
2339}
2340
2341bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
 2342 EVT OpVT) const {
2343 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2344 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2345 ResVT.getVectorElementType() != MVT::i1)
2346 return true;
2347
2348 // Only support illegal types if the result is scalable and min elements > 1.
2349 if (ResVT.getVectorMinNumElements() == 1 ||
2350 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2351 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2352 return true;
2353
2354 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2355 // but anything larger should be expanded.
2356 if (OpVT.getFixedSizeInBits() > 64)
2357 return true;
2358
2359 return false;
2360}
2361
2362bool AArch64TargetLowering::shouldExpandCttzElements(EVT VT) const {
 2363 if (!Subtarget->isSVEorStreamingSVEAvailable())
2364 return true;
2365
2366 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2367 // also support fixed-width predicates.
2368 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2369 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2370 VT != MVT::v4i1 && VT != MVT::v2i1;
2371}
2372
2373bool AArch64TargetLowering::shouldExpandVectorMatch(EVT VT,
 2374 unsigned SearchSize) const {
2375 // MATCH is SVE2 and only available in non-streaming mode.
2376 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2377 return true;
2378 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2379 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2380 return SearchSize != 8;
2381 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2382 return SearchSize != 8 && SearchSize != 16;
2383 return true;
2384}
2385
2386void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2387 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2388
2389 // By default everything must be expanded.
2390 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2392
2393 if (VT.isFloatingPoint()) {
2403 }
2404
2406 VT == MVT::v1f64 ? Expand : Custom;
2407
2408 // Mark integer truncating stores/extending loads as having custom lowering
2409 if (VT.isInteger()) {
2410 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2411 while (InnerVT != VT) {
2412 setTruncStoreAction(VT, InnerVT, Default);
2413 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2414 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2415 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2416 InnerVT = InnerVT.changeVectorElementType(
2417 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2418 }
2419 }
2420
2421 // Mark floating-point truncating stores/extending loads as having custom
2422 // lowering
2423 if (VT.isFloatingPoint()) {
2424 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2425 while (InnerVT != VT) {
2426 setTruncStoreAction(VT, InnerVT, Custom);
2427 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2428 InnerVT = InnerVT.changeVectorElementType(
2430 }
2431 }
2432
2433 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2434 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2435
2436 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2438 unsigned NumElts = VT.getVectorNumElements();
2439 if (VT.getVectorElementType() == MVT::i64) {
2440 setPartialReduceMLAAction(MLAOps, VT,
2441 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2442 setPartialReduceMLAAction(MLAOps, VT,
2443 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2444 setPartialReduceMLAAction(MLAOps, VT,
2445 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2446 } else if (VT.getVectorElementType() == MVT::i32) {
2447 setPartialReduceMLAAction(MLAOps, VT,
2448 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2449 setPartialReduceMLAAction(MLAOps, VT,
2450 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2451 } else if (VT.getVectorElementType() == MVT::i16) {
2452 setPartialReduceMLAAction(MLAOps, VT,
2453 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2454 }
2455 if (Subtarget->hasMatMulInt8()) {
2456 if (VT.getVectorElementType() == MVT::i32)
2458 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2459 else if (VT.getVectorElementType() == MVT::i64)
2461 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2462 }
2463
2464 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2466 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2467 }
2468
2469 // Lower fixed length vector operations to scalable equivalents.
2476 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2517 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2520 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2522 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2541 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2568}
2569
2570void AArch64TargetLowering::addDRType(MVT VT) {
2571 addRegisterClass(VT, &AArch64::FPR64RegClass);
2572 if (Subtarget->isNeonAvailable())
2573 addTypeForNEON(VT);
2574}
2575
2576void AArch64TargetLowering::addQRType(MVT VT) {
2577 addRegisterClass(VT, &AArch64::FPR128RegClass);
2578 if (Subtarget->isNeonAvailable())
2579 addTypeForNEON(VT);
2580}
2581
2583 LLVMContext &C, EVT VT) const {
2584 if (!VT.isVector())
2585 return MVT::i32;
2586 if (VT.isScalableVector())
2587 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
 2588 return VT.changeVectorElementTypeToInteger();
 2589}
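// Editorial examples of the mapping implemented above (assuming the usual
// legal types on AArch64):
//   getSetCCResultType(DL, Ctx, MVT::f64)     -> i32     (scalar setcc)
//   getSetCCResultType(DL, Ctx, MVT::nxv4f32) -> nxv4i1  (scalable predicate)
//   getSetCCResultType(DL, Ctx, MVT::v4f32)   -> v4i32   (per-element mask)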
2590
2591// isIntImmediate - This method tests to see if the node is a constant
2592// operand. If so Imm will receive the value.
2593static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
 2594 if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
 2595 Imm = C->getZExtValue();
2596 return true;
2597 }
2598 return false;
2599}
2600
2601bool isVectorizedBinOp(unsigned Opcode) {
2602 switch (Opcode) {
2603 case AArch64ISD::SQDMULH:
2604 return true;
2605 default:
2606 return false;
2607 }
2608}
2609
2610// isOpcWithIntImmediate - This method tests to see if the node is a specific
 2611// opcode and that it has an immediate integer right operand.
2612// If so Imm will receive the value.
2613static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2614 uint64_t &Imm) {
2615 return N->getOpcode() == Opc &&
2616 isIntImmediate(N->getOperand(1).getNode(), Imm);
2617}
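// Editorial usage sketch for the two helpers above (N and the variable names
// are illustrative):
//   uint64_t Imm;
//   if (isOpcWithIntImmediate(N, ISD::AND, Imm)) {
//     // N is (and X, #constant); Imm now holds the constant right operand.
//   }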
2618
2619static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2620 const APInt &Demanded,
 2621 TargetLowering::TargetLoweringOpt &TLO,
 2622 unsigned NewOpc) {
2623 uint64_t OldImm = Imm, NewImm, Enc;
2624 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2625
2626 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2627 // bimm64.
2628 if (Imm == 0 || Imm == Mask ||
 2629 AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
 2630 return false;
2631
2632 unsigned EltSize = Size;
2633 uint64_t DemandedBits = Demanded.getZExtValue();
2634
2635 // Clear bits that are not demanded.
2636 Imm &= DemandedBits;
2637
2638 while (true) {
2639 // The goal here is to set the non-demanded bits in a way that minimizes
2640 // the number of transitions between 0 and 1. In order to achieve this goal,
2641 // we set the non-demanded bits to the value of the preceding demanded bits.
2642 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2643 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2644 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2645 // The final result is 0b11000011.
2646 uint64_t NonDemandedBits = ~DemandedBits;
2647 uint64_t InvertedImm = ~Imm & DemandedBits;
2648 uint64_t RotatedImm =
2649 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2650 NonDemandedBits;
2651 uint64_t Sum = RotatedImm + NonDemandedBits;
2652 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2653 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2654 NewImm = (Imm | Ones) & Mask;
2655
2656 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2657 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2658 // we halve the element size and continue the search.
2659 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2660 break;
2661
2662 // We cannot shrink the element size any further if it is 2-bits.
2663 if (EltSize == 2)
2664 return false;
2665
2666 EltSize /= 2;
2667 Mask >>= EltSize;
2668 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2669
2670 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2671 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2672 return false;
2673
2674 // Merge the upper and lower halves of Imm and DemandedBits.
2675 Imm |= Hi;
2676 DemandedBits |= DemandedBitsHi;
2677 }
2678
2679 ++NumOptimizedImms;
2680
2681 // Replicate the element across the register width.
2682 while (EltSize < Size) {
2683 NewImm |= NewImm << EltSize;
2684 EltSize *= 2;
2685 }
2686
2687 (void)OldImm;
2688 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2689 "demanded bits should never be altered");
2690 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2691
2692 // Create the new constant immediate node.
2693 EVT VT = Op.getValueType();
2694 SDLoc DL(Op);
2695 SDValue New;
2696
2697 // If the new constant immediate is all-zeros or all-ones, let the target
2698 // independent DAG combine optimize this node.
2699 if (NewImm == 0 || NewImm == OrigMask) {
2700 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2701 TLO.DAG.getConstant(NewImm, DL, VT));
2702 // Otherwise, create a machine node so that target independent DAG combine
2703 // doesn't undo this optimization.
2704 } else {
2705 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
2706 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2707 New = SDValue(
2708 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2709 }
2710
2711 return TLO.CombineTo(Op, New);
2712}
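// Illustrative only (not part of the lowering path): the replication loop in
// optimizeLogicalImm() doubles the element until it fills the register, so the
// 8-bit pattern 0b11000011 from the example above becomes 0xC3C3C3C3 in a
// 32-bit register (and 0xC3C3C3C3C3C3C3C3 in a 64-bit one).
static_assert(((0xC3ULL | (0xC3ULL << 8)) |
               ((0xC3ULL | (0xC3ULL << 8)) << 16)) == 0xC3C3C3C3ULL,
              "8-bit element replicated to 32 bits");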
2713
2714bool AArch64TargetLowering::targetShrinkDemandedConstant(
2715 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2716 TargetLoweringOpt &TLO) const {
2717 // Delay this optimization to as late as possible.
2718 if (!TLO.LegalOps)
2719 return false;
2720
2721 if (!EnableOptimizeLogicalImm)
2722 return false;
2723
2724 EVT VT = Op.getValueType();
2725 if (VT.isVector())
2726 return false;
2727
2728 unsigned Size = VT.getSizeInBits();
2729
2730 if (Size != 32 && Size != 64)
2731 return false;
2732
2733 // Exit early if we demand all bits.
2734 if (DemandedBits.isAllOnes())
2735 return false;
2736
2737 unsigned NewOpc;
2738 switch (Op.getOpcode()) {
2739 default:
2740 return false;
2741 case ISD::AND:
2742 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2743 break;
2744 case ISD::OR:
2745 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2746 break;
2747 case ISD::XOR:
2748 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2749 break;
2750 }
2751 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2752 if (!C)
2753 return false;
2754 uint64_t Imm = C->getZExtValue();
2755 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2756}
2757
2758/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2759/// Mask are known to be either zero or one and return them in Known.
2760void AArch64TargetLowering::computeKnownBitsForTargetNode(
2761 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2762 const SelectionDAG &DAG, unsigned Depth) const {
2763 switch (Op.getOpcode()) {
2764 default:
2765 break;
2766 case AArch64ISD::DUP: {
2767 SDValue SrcOp = Op.getOperand(0);
2768 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2769 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2770 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2771 "Expected DUP implicit truncation");
2772 Known = Known.trunc(Op.getScalarValueSizeInBits());
2773 }
2774 break;
2775 }
2776 case AArch64ISD::CSEL: {
2777 KnownBits Known2;
2778 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2779 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2780 Known = Known.intersectWith(Known2);
2781 break;
2782 }
2783 case AArch64ISD::CSNEG:
2784 case AArch64ISD::CSINC:
2785 case AArch64ISD::CSINV: {
2786 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2787 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2788
2789 // The result is either:
2790 // CSINC: KnownOp0 or KnownOp1 + 1
2791 // CSINV: KnownOp0 or ~KnownOp1
2792 // CSNEG: KnownOp0 or KnownOp1 * -1
2793 if (Op.getOpcode() == AArch64ISD::CSINC)
2794 KnownOp1 = KnownBits::add(
2795 KnownOp1,
2796 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2797 else if (Op.getOpcode() == AArch64ISD::CSINV)
2798 std::swap(KnownOp1.Zero, KnownOp1.One);
2799 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2800 KnownOp1 =
2801 KnownBits::mul(KnownOp1, KnownBits::makeConstant(APInt::getAllOnes(
2802 Op.getScalarValueSizeInBits())));
2803
2804 Known = KnownOp0.intersectWith(KnownOp1);
2805 break;
2806 }
2807 case AArch64ISD::BICi: {
2808 // Compute the bit cleared value.
2809 APInt Mask =
2810 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2811 .trunc(Known.getBitWidth());
2812 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2813 Known &= KnownBits::makeConstant(Mask);
2814 break;
2815 }
2816 case AArch64ISD::VLSHR: {
2817 KnownBits Known2;
2818 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2819 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2820 Known = KnownBits::lshr(Known, Known2);
2821 break;
2822 }
2823 case AArch64ISD::VASHR: {
2824 KnownBits Known2;
2825 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2826 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2827 Known = KnownBits::ashr(Known, Known2);
2828 break;
2829 }
2830 case AArch64ISD::VSHL: {
2831 KnownBits Known2;
2832 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2833 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2834 Known = KnownBits::shl(Known, Known2);
2835 break;
2836 }
2837 case AArch64ISD::MOVI: {
2838 Known = KnownBits::makeConstant(
2839 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2840 break;
2841 }
2842 case AArch64ISD::MOVIshift: {
2843 Known = KnownBits::makeConstant(
2844 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2845 << Op->getConstantOperandVal(1)));
2846 break;
2847 }
2848 case AArch64ISD::MOVImsl: {
2849 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2850 Known = KnownBits::makeConstant(APInt(
2851 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2852 break;
2853 }
2854 case AArch64ISD::MOVIedit: {
2855 Known = KnownBits::makeConstant(APInt(
2856 Known.getBitWidth(),
2857 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2858 break;
2859 }
2860 case AArch64ISD::MVNIshift: {
2861 Known = KnownBits::makeConstant(
2862 APInt(Known.getBitWidth(),
2863 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2864 /*isSigned*/ false, /*implicitTrunc*/ true));
2865 break;
2866 }
2867 case AArch64ISD::MVNImsl: {
2868 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2869 Known = KnownBits::makeConstant(
2870 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2871 /*isSigned*/ false, /*implicitTrunc*/ true));
2872 break;
2873 }
2874 case AArch64ISD::LOADgot:
2875 case AArch64ISD::ADDlow: {
2876 if (!Subtarget->isTargetILP32())
2877 break;
2878 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2879 Known.Zero = APInt::getHighBitsSet(64, 32);
2880 break;
2881 }
2882 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2883 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2884 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2885 break;
2886 }
2887 case ISD::INTRINSIC_W_CHAIN: {
2888 Intrinsic::ID IntID =
2889 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2890 switch (IntID) {
2891 default: return;
2892 case Intrinsic::aarch64_ldaxr:
2893 case Intrinsic::aarch64_ldxr: {
2894 unsigned BitWidth = Known.getBitWidth();
2895 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2896 unsigned MemBits = VT.getScalarSizeInBits();
2897 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2898 return;
2899 }
2900 }
2901 break;
2902 }
2903 case ISD::INTRINSIC_WO_CHAIN:
2904 case ISD::INTRINSIC_VOID: {
2905 unsigned IntNo = Op.getConstantOperandVal(0);
2906 switch (IntNo) {
2907 default:
2908 break;
2909 case Intrinsic::aarch64_neon_uaddlv: {
2910 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2911 unsigned BitWidth = Known.getBitWidth();
2912 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2913 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
2914 assert(BitWidth >= Bound && "Unexpected width!");
2915 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - Bound);
2916 Known.Zero |= Mask;
2917 }
2918 break;
2919 }
2920 case Intrinsic::aarch64_neon_umaxv:
2921 case Intrinsic::aarch64_neon_uminv: {
2922 // Figure out the datatype of the vector operand. The UMINV instruction
2923 // will zero extend the result, so we can mark as known zero all the
2924 // bits larger than the element datatype. 32-bit or larger doesn't need
2925 // this as those are legal types and will be handled by isel directly.
2926 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2927 unsigned BitWidth = Known.getBitWidth();
2928 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2929 assert(BitWidth >= 8 && "Unexpected width!");
2930 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
2931 Known.Zero |= Mask;
2932 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2933 assert(BitWidth >= 16 && "Unexpected width!");
2934 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
2935 Known.Zero |= Mask;
2936 }
2937 break;
2938 } break;
2939 }
2940 }
2941 }
2942}
2943
2944unsigned AArch64TargetLowering::ComputeNumSignBitsForTargetNode(
2945 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2946 unsigned Depth) const {
2947 EVT VT = Op.getValueType();
2948 unsigned VTBits = VT.getScalarSizeInBits();
2949 unsigned Opcode = Op.getOpcode();
2950 switch (Opcode) {
2951 case AArch64ISD::FCMEQ:
2952 case AArch64ISD::FCMGE:
2953 case AArch64ISD::FCMGT:
2954 // Compares return either 0 or all-ones
2955 return VTBits;
2956 case AArch64ISD::VASHR: {
2957 unsigned Tmp =
2958 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2959 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2960 }
2961 }
2962
2963 return 1;
2964}
2965
2966MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
2967 EVT) const {
2968 return MVT::i64;
2969}
2970
2971bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2972 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2973 unsigned *Fast) const {
2974
2975 // Allow SVE loads/stores where the alignment >= the size of the element type,
2976 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2977 // for stores that come from IR, only require element-size alignment (even if
2978 // unaligned accesses are disabled). Without this, these will be forced to
2979 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2980 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2981 if (VT.isScalableVector()) {
2982 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2983 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2984 return true;
2985 }
2986
2987 if (Subtarget->requiresStrictAlign())
2988 return false;
2989
2990 if (Fast) {
2991 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2992 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2993 // See comments in performSTORECombine() for more details about
2994 // these conditions.
2995
2996 // Code that uses clang vector extensions can mark that it
2997 // wants unaligned accesses to be treated as fast by
2998 // underspecifying alignment to be 1 or 2.
2999 Alignment <= 2 ||
3000
3001 // Disregard v2i64. Memcpy lowering produces those and splitting
3002 // them regresses performance on micro-benchmarks and olden/bh.
3003 VT == MVT::v2i64;
3004 }
3005 return true;
3006}
3007
3008// Same as above but handling LLTs instead.
3009bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
3010 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
3011 unsigned *Fast) const {
3012 if (Subtarget->requiresStrictAlign())
3013 return false;
3014
3015 if (Fast) {
3016 // Some CPUs are fine with unaligned stores except for 128-bit ones.
3017 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
3018 Ty.getSizeInBytes() != 16 ||
3019 // See comments in performSTORECombine() for more details about
3020 // these conditions.
3021
3022 // Code that uses clang vector extensions can mark that it
3023 // wants unaligned accesses to be treated as fast by
3024 // underspecifying alignment to be 1 or 2.
3025 Alignment <= 2 ||
3026
3027 // Disregard v2i64. Memcpy lowering produces those and splitting
3028 // them regresses performance on micro-benchmarks and olden/bh.
3029 Ty == LLT::fixed_vector(2, 64);
3030 }
3031 return true;
3032}
3033
3035 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
3036 const LibcallLoweringInfo *libcallLowering) const {
3037 return AArch64::createFastISel(funcInfo, libInfo, libcallLowering);
3038}
3039
3040MachineBasicBlock *
3041AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
3042 MachineBasicBlock *MBB) const {
3043 // We materialise the F128CSEL pseudo-instruction as some control flow and a
3044 // phi node:
3045
3046 // OrigBB:
3047 // [... previous instrs leading to comparison ...]
3048 // b.ne TrueBB
3049 // b EndBB
3050 // TrueBB:
3051 // ; Fallthrough
3052 // EndBB:
3053 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
3054
3055 MachineFunction *MF = MBB->getParent();
3056 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3057 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3058 DebugLoc DL = MI.getDebugLoc();
3059 MachineFunction::iterator It = ++MBB->getIterator();
3060
3061 Register DestReg = MI.getOperand(0).getReg();
3062 Register IfTrueReg = MI.getOperand(1).getReg();
3063 Register IfFalseReg = MI.getOperand(2).getReg();
3064 unsigned CondCode = MI.getOperand(3).getImm();
3065 bool NZCVKilled = MI.getOperand(4).isKill();
3066
3067 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
3068 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
3069 MF->insert(It, TrueBB);
3070 MF->insert(It, EndBB);
3071
3072 // Transfer rest of current basic-block to EndBB
3073 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
3074 MBB->end());
3075 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
3076
3077 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
3078 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
3079 MBB->addSuccessor(TrueBB);
3080 MBB->addSuccessor(EndBB);
3081
3082 // TrueBB falls through to the end.
3083 TrueBB->addSuccessor(EndBB);
3084
3085 if (!NZCVKilled) {
3086 TrueBB->addLiveIn(AArch64::NZCV);
3087 EndBB->addLiveIn(AArch64::NZCV);
3088 }
3089
3090 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
3091 .addReg(IfTrueReg)
3092 .addMBB(TrueBB)
3093 .addReg(IfFalseReg)
3094 .addMBB(MBB);
3095
3096 MI.eraseFromParent();
3097 return EndBB;
3098}
3099
3107
3108MachineBasicBlock *
3109AArch64TargetLowering::EmitDynamicProbedAlloc(MachineInstr &MI,
3110 MachineBasicBlock *MBB) const {
3111 MachineFunction &MF = *MBB->getParent();
3112 MachineBasicBlock::iterator MBBI = MI.getIterator();
3113 const AArch64InstrInfo &TII =
3114 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
3115 Register TargetReg = MI.getOperand(0).getReg();
3116 MachineBasicBlock::iterator NextInst =
3117 TII.probedStackAlloc(MBBI, TargetReg, false);
3118
3119 MI.eraseFromParent();
3120 return NextInst->getParent();
3121}
3122
3123MachineBasicBlock *
3124AArch64TargetLowering::EmitCheckMatchingVL(MachineInstr &MI,
3125 MachineBasicBlock *MBB) const {
3126 MachineFunction *MF = MBB->getParent();
3127 MachineRegisterInfo &MRI = MF->getRegInfo();
3128
3129 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
3130 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
3131
3132 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
3133 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
3134 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
3135 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
3136
3137 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3138 DebugLoc DL = MI.getDebugLoc();
3139
3140 // RDVL requires GPR64, ADDSVL requires GPR64sp
3141 // We need to insert COPY instructions, these will later be removed by the
3142 // RegisterCoalescer
3143 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
3144 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
3145 .addReg(RegVL_GPR);
3146
3147 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
3148 .addReg(RegVL_GPRsp)
3149 .addImm(-1);
3150 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
3151 .addReg(RegSVL_GPRsp);
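  // Sketch of the intent (assuming the usual RDVL/ADDSVL semantics): RegVL_GPR
  // now holds the vector length in bytes, and RegSVL_GPR holds VL - SVL, so a
  // zero difference means the vector and streaming-vector lengths match.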
3152
3153 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
3154 MachineFunction::iterator It = ++MBB->getIterator();
3155 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
3156 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
3157 MF->insert(It, TrapBB);
3158 MF->insert(It, PassBB);
3159
3160 // Continue if vector lengths match
3161 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
3162 .addReg(RegSVL_GPR)
3163 .addMBB(PassBB);
3164
3165 // Transfer rest of current BB to PassBB
3166 PassBB->splice(PassBB->begin(), MBB,
3167 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
3168 PassBB->transferSuccessorsAndUpdatePHIs(MBB);
3169
3170 // Trap if vector lengths mismatch
3171 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
3172
3173 MBB->addSuccessor(TrapBB);
3174 MBB->addSuccessor(PassBB);
3175
3176 MI.eraseFromParent();
3177 return PassBB;
3178}
3179
3180MachineBasicBlock *
3181AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3182 MachineInstr &MI,
3183 MachineBasicBlock *BB) const {
3184 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3185 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3186
3187 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3188 MIB.add(MI.getOperand(1)); // slice index register
3189 MIB.add(MI.getOperand(2)); // slice index offset
3190 MIB.add(MI.getOperand(3)); // pg
3191 MIB.add(MI.getOperand(4)); // base
3192 MIB.add(MI.getOperand(5)); // offset
3193
3194 MI.eraseFromParent(); // The pseudo is gone now.
3195 return BB;
3196}
3197
3200 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3202 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3203
3204 MIB.addReg(AArch64::ZA, RegState::Define);
3205 MIB.add(MI.getOperand(0)); // Vector select register
3206 MIB.add(MI.getOperand(1)); // Vector select offset
3207 MIB.add(MI.getOperand(2)); // Base
3208 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3209
3210 MI.eraseFromParent(); // The pseudo is gone now.
3211 return BB;
3212}
3213
3216 unsigned Opcode,
3217 bool Op0IsDef) const {
3218 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3220
3221 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3222 .addReg(MI.getOperand(0).getReg(), getDefRegState(Op0IsDef));
3223 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3224 MIB.add(MI.getOperand(I));
3225
3226 MI.eraseFromParent(); // The pseudo is gone now.
3227 return BB;
3228}
3229
3230MachineBasicBlock *
3231AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3232 MachineInstr &MI,
3233 MachineBasicBlock *BB) const {
3234 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3235 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3236 unsigned StartIdx = 0;
3237
3238 bool HasTile = BaseReg != AArch64::ZA;
3239 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3240 if (HasZPROut) {
3241 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3242 ++StartIdx;
3243 }
3244 if (HasTile) {
3245 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3246 RegState::Define); // Output ZA Tile
3247 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3248 StartIdx++;
3249 } else {
3250 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm].
3251 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3252 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3253 ++StartIdx;
3254 }
3255 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3256 }
3257 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3258 MIB.add(MI.getOperand(I));
3259
3260 MI.eraseFromParent(); // The pseudo is gone now.
3261 return BB;
3262}
3263
3266 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3268 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3269 MIB.add(MI.getOperand(0)); // Mask
3270
3271 unsigned Mask = MI.getOperand(0).getImm();
3272 for (unsigned I = 0; I < 8; I++) {
3273 if (Mask & (1 << I))
3274 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3275 }
3276
3277 MI.eraseFromParent(); // The pseudo is gone now.
3278 return BB;
3279}
3280
3283 MachineBasicBlock *BB) const {
3284 MachineFunction *MF = BB->getParent();
3285 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3286 const DebugLoc &DL = MI.getDebugLoc();
3287 Register ResultReg = MI.getOperand(0).getReg();
3288 if (MF->getRegInfo().use_empty(ResultReg)) {
3289 // Nothing to do. Pseudo erased below.
3290 } else if (Subtarget->hasSME()) {
3291 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3292 .addImm(AArch64SysReg::SVCR)
3293 .addReg(AArch64::VG, RegState::Implicit);
3294 } else {
3295 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3296 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3297 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3298 .addExternalSymbol(getLibcallName(LC))
3299 .addReg(AArch64::X0, RegState::ImplicitDefine)
3300 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3301 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3302 .addReg(AArch64::X0);
3303 }
3304 MI.eraseFromParent();
3305 return BB;
3306}
3307
3308// Helper function to find the instruction that defined a virtual register.
3309// If unable to find such instruction, returns nullptr.
3310static const MachineInstr *stripVRegCopies(const MachineRegisterInfo &MRI,
3311 Register Reg) {
3312 while (Reg.isVirtual()) {
3313 const MachineInstr *DefMI = MRI.getVRegDef(Reg);
3314 assert(DefMI && "Virtual register definition not found");
3315 unsigned Opcode = DefMI->getOpcode();
3316
3317 if (Opcode == AArch64::COPY) {
3318 Reg = DefMI->getOperand(1).getReg();
3319 // Vreg is defined by copying from physreg.
3320 if (Reg.isPhysical())
3321 return DefMI;
3322 continue;
3323 }
3324 if (Opcode == AArch64::SUBREG_TO_REG) {
3325 Reg = DefMI->getOperand(1).getReg();
3326 continue;
3327 }
3328
3329 return DefMI;
3330 }
3331 return nullptr;
3332}
3333
3334void AArch64TargetLowering::fixupPtrauthDiscriminator(
3335 MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp,
3336 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3337 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3338 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3339 const DebugLoc &DL = MI.getDebugLoc();
3340
3341 Register AddrDisc = AddrDiscOp.getReg();
3342 int64_t IntDisc = IntDiscOp.getImm();
3343 assert(IntDisc == 0 && "Blend components are already expanded");
3344
3345 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3346 if (DiscMI) {
3347 switch (DiscMI->getOpcode()) {
3348 case AArch64::MOVKXi:
3349 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3350 // #imm should be an immediate and not a global symbol, for example.
3351 if (DiscMI->getOperand(2).isImm() &&
3352 DiscMI->getOperand(3).getImm() == 48) {
3353 AddrDisc = DiscMI->getOperand(1).getReg();
3354 IntDisc = DiscMI->getOperand(2).getImm();
3355 }
3356 break;
3357 case AArch64::MOVi32imm:
3358 case AArch64::MOVi64imm:
3359 // Small immediate integer constant passed via VReg.
3360 if (DiscMI->getOperand(1).isImm() &&
3361 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3362 AddrDisc = AArch64::NoRegister;
3363 IntDisc = DiscMI->getOperand(1).getImm();
3364 }
3365 break;
3366 }
3367 }
3368
3369 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3370 // in the requested register class.
3371 if (AddrDisc == AArch64::XZR)
3372 AddrDisc = AArch64::NoRegister;
3373
3374 // Make sure AddrDisc operand respects the register class imposed by MI.
3375 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3376 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3377 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3378 AddrDisc = TmpReg;
3379 }
3380
3381 AddrDiscOp.setReg(AddrDisc);
3382 IntDiscOp.setImm(IntDisc);
3383}
3384
3385MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3386 MachineInstr &MI, MachineBasicBlock *BB) const {
3387
3388 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3389 if (SMEOrigInstr != -1) {
3390 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3391 uint64_t SMEMatrixType =
3392 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3393 switch (SMEMatrixType) {
3395 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3397 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3399 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3401 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3403 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3405 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3406 }
3407 }
3408
3409 switch (MI.getOpcode()) {
3410 default:
3411#ifndef NDEBUG
3412 MI.dump();
3413#endif
3414 llvm_unreachable("Unexpected instruction for custom inserter!");
3415 case AArch64::EntryPStateSM:
3416 return EmitEntryPStateSM(MI, BB);
3417 case AArch64::F128CSEL:
3418 return EmitF128CSEL(MI, BB);
3419 case TargetOpcode::STATEPOINT:
3420 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3421 // while bl call instruction (where statepoint will be lowered at the end)
3422 // has implicit def. This def is early-clobber as it will be set at
3423 // the moment of the call and earlier than any use is read.
3424 // Add this implicit dead def here as a workaround.
3425 MI.addOperand(*MI.getMF(),
3426 MachineOperand::CreateReg(
3427 AArch64::LR, /*isDef*/ true,
3428 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3429 /*isUndef*/ false, /*isEarlyClobber*/ true));
3430 [[fallthrough]];
3431 case TargetOpcode::STACKMAP:
3432 case TargetOpcode::PATCHPOINT:
3433 return emitPatchPoint(MI, BB);
3434
3435 case TargetOpcode::PATCHABLE_EVENT_CALL:
3436 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3437 return BB;
3438
3439 case AArch64::CATCHRET:
3440 return EmitLoweredCatchRet(MI, BB);
3441
3442 case AArch64::PROBED_STACKALLOC_DYN:
3443 return EmitDynamicProbedAlloc(MI, BB);
3444
3445 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3446 return EmitCheckMatchingVL(MI, BB);
3447
3448 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3449 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3450 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3451 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3452 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3453 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3454 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3455 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3456 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3457 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3458 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3459 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3460 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3461 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3462 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3463 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3464 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3465 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3466 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3467 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3468 case AArch64::LDR_ZA_PSEUDO:
3469 return EmitFill(MI, BB);
3470 case AArch64::LDR_TX_PSEUDO:
3471 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3472 case AArch64::STR_TX_PSEUDO:
3473 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3474 case AArch64::ZERO_M_PSEUDO:
3475 return EmitZero(MI, BB);
3476 case AArch64::ZERO_T_PSEUDO:
3477 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3478 case AArch64::MOVT_TIZ_PSEUDO:
3479 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3480
3481 case AArch64::PAC:
3482 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3483 &AArch64::GPR64noipRegClass);
3484 return BB;
3485 }
3486}
3487
3488//===----------------------------------------------------------------------===//
3489// AArch64 Lowering private implementation.
3490//===----------------------------------------------------------------------===//
3491
3492//===----------------------------------------------------------------------===//
3493// Lowering Code
3494//===----------------------------------------------------------------------===//
3495
3496// Forward declarations of SVE fixed length lowering helpers
3501 SelectionDAG &DAG);
3504 EVT VT);
3506
3507/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3508static bool isZerosVector(const SDNode *N) {
3509 // Look through a bit convert.
3510 while (N->getOpcode() == ISD::BITCAST)
3511 N = N->getOperand(0).getNode();
3512
3513 if (ISD::isConstantSplatVectorAllZeros(N))
3514 return true;
3515
3516 if (N->getOpcode() != AArch64ISD::DUP)
3517 return false;
3518
3519 auto Opnd0 = N->getOperand(0);
3520 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3521}
3522
3523static bool isOneVector(SDValue V) {
3524 return isOneOrOneSplat(V) ||
3525 (V.getOpcode() == AArch64ISD::DUP && isOneConstant(V.getOperand(0)));
3526}
3527
3528/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3529/// CC
3530static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC,
3531 SDValue RHS = {}) {
3532 switch (CC) {
3533 default:
3534 llvm_unreachable("Unknown condition code!");
3535 case ISD::SETNE:
3536 return AArch64CC::NE;
3537 case ISD::SETEQ:
3538 return AArch64CC::EQ;
3539 case ISD::SETGT:
3540 return AArch64CC::GT;
3541 case ISD::SETGE:
3542 return AArch64CC::GE;
3543 case ISD::SETLT:
3544 return AArch64CC::LT;
3545 case ISD::SETLE:
3546 return AArch64CC::LE;
3547 case ISD::SETUGT:
3548 return AArch64CC::HI;
3549 case ISD::SETUGE:
3550 return AArch64CC::HS;
3551 case ISD::SETULT:
3552 return AArch64CC::LO;
3553 case ISD::SETULE:
3554 return AArch64CC::LS;
3555 }
3556}
3557
3558/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3560 AArch64CC::CondCode &CondCode,
3561 AArch64CC::CondCode &CondCode2) {
3562 CondCode2 = AArch64CC::AL;
3563 switch (CC) {
3564 default:
3565 llvm_unreachable("Unknown FP condition!");
3566 case ISD::SETEQ:
3567 case ISD::SETOEQ:
3568 CondCode = AArch64CC::EQ;
3569 break;
3570 case ISD::SETGT:
3571 case ISD::SETOGT:
3572 CondCode = AArch64CC::GT;
3573 break;
3574 case ISD::SETGE:
3575 case ISD::SETOGE:
3576 CondCode = AArch64CC::GE;
3577 break;
3578 case ISD::SETOLT:
3579 CondCode = AArch64CC::MI;
3580 break;
3581 case ISD::SETOLE:
3582 CondCode = AArch64CC::LS;
3583 break;
3584 case ISD::SETONE:
3585 CondCode = AArch64CC::MI;
3586 CondCode2 = AArch64CC::GT;
3587 break;
3588 case ISD::SETO:
3589 CondCode = AArch64CC::VC;
3590 break;
3591 case ISD::SETUO:
3592 CondCode = AArch64CC::VS;
3593 break;
3594 case ISD::SETUEQ:
3595 CondCode = AArch64CC::EQ;
3596 CondCode2 = AArch64CC::VS;
3597 break;
3598 case ISD::SETUGT:
3599 CondCode = AArch64CC::HI;
3600 break;
3601 case ISD::SETUGE:
3602 CondCode = AArch64CC::PL;
3603 break;
3604 case ISD::SETLT:
3605 case ISD::SETULT:
3606 CondCode = AArch64CC::LT;
3607 break;
3608 case ISD::SETLE:
3609 case ISD::SETULE:
3610 CondCode = AArch64CC::LE;
3611 break;
3612 case ISD::SETNE:
3613 case ISD::SETUNE:
3614 CondCode = AArch64CC::NE;
3615 break;
3616 }
3617}
3618
3619/// Convert a DAG fp condition code to an AArch64 CC.
3620/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3621/// should be AND'ed instead of OR'ed.
3623 AArch64CC::CondCode &CondCode,
3624 AArch64CC::CondCode &CondCode2) {
3625 CondCode2 = AArch64CC::AL;
3626 switch (CC) {
3627 default:
3628 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3629 assert(CondCode2 == AArch64CC::AL);
3630 break;
3631 case ISD::SETONE:
3632 // (a one b)
3633 // == ((a olt b) || (a ogt b))
3634 // == ((a ord b) && (a une b))
3635 CondCode = AArch64CC::VC;
3636 CondCode2 = AArch64CC::NE;
3637 break;
3638 case ISD::SETUEQ:
3639 // (a ueq b)
3640 // == ((a uno b) || (a oeq b))
3641 // == ((a ule b) && (a uge b))
3642 CondCode = AArch64CC::PL;
3643 CondCode2 = AArch64CC::LE;
3644 break;
3645 }
3646}
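// Worked example (illustrative only) of the AND-form decomposition above: for
// SETONE the pair {VC, NE} is tested, i.e. "ordered AND not-equal". With
// LHS = 1.0, RHS = NaN the operands are unordered, so VC fails and the overall
// result is false; with LHS = 1.0, RHS = 2.0 both VC and NE hold, so it is
// true, matching the semantics of an ordered not-equal compare.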
3647
3648/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3649/// CC usable with the vector instructions. Fewer operations are available
3650/// without a real NZCV register, so we have to use less efficient combinations
3651/// to get the same effect.
3653 AArch64CC::CondCode &CondCode,
3654 AArch64CC::CondCode &CondCode2,
3655 bool &Invert) {
3656 Invert = false;
3657 switch (CC) {
3658 default:
3659 // Mostly the scalar mappings work fine.
3660 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3661 break;
3662 case ISD::SETUO:
3663 Invert = true;
3664 [[fallthrough]];
3665 case ISD::SETO:
3666 CondCode = AArch64CC::MI;
3667 CondCode2 = AArch64CC::GE;
3668 break;
3669 case ISD::SETLE:
3670 CondCode = AArch64CC::LS;
3671 CondCode2 = AArch64CC::AL;
3672 break;
3673 case ISD::SETLT:
3674 CondCode = AArch64CC::MI;
3675 CondCode2 = AArch64CC::AL;
3676 break;
3677 case ISD::SETUEQ:
3678 case ISD::SETULT:
3679 case ISD::SETULE:
3680 case ISD::SETUGT:
3681 case ISD::SETUGE:
3682 // All of the compare-mask comparisons are ordered, but we can switch
3683 // between the two by a double inversion. E.g. ULE == !OGT.
3684 Invert = true;
3685 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3686 CondCode, CondCode2);
3687 break;
3688 }
3689}
3690
3691/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3692static SDValue getCondCode(SelectionDAG &DAG, AArch64CC::CondCode CC) {
3693 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3694 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3695}
3696
3697static bool isLegalArithImmed(uint64_t C) {
3698 // Matches AArch64DAGToDAGISel::SelectArithImmed().
3699 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3700 LLVM_DEBUG(dbgs() << "Is imm " << C
3701 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3702 return IsLegal;
3703}
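// A minimal compile-time illustration of the rule above (not part of the
// lowering path): an arithmetic immediate is either a plain 12-bit value or a
// 12-bit value shifted left by 12.
static_assert((4095ULL >> 12) == 0, "4095 fits in 12 bits");
static_assert(((0x7FF000ULL & 0xFFFULL) == 0) && ((0x7FF000ULL >> 24) == 0),
              "0x7ff000 encodes as 0x7ff, LSL #12");
static_assert(!(((4097ULL >> 12) == 0) ||
                (((4097ULL & 0xFFFULL) == 0) && ((4097ULL >> 24) == 0))),
              "4097 is not a legal arithmetic immediate");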
3704
3705static bool isLegalCmpImmed(const APInt &C) {
3706 // Works for negative immediates too, as it can be written as an ADDS
3707 // instruction with a negated immediate.
3708 return isLegalArithImmed(C.abs().getZExtValue());
3709}
3710
3711static unsigned numberOfInstrToLoadImm(const APInt &C) {
3712 uint64_t Imm = C.getZExtValue();
3713 SmallVector<AArch64_IMM::ImmInsnModel> Insn;
3714 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3715 return Insn.size();
3716}
3717
3717static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) {
3718 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3720 if (Op->getFlags().hasNoSignedWrap())
3721 return true;
3722
3723 // We can still figure out if the second operand is safe to use
3724 // in a CMN instruction by checking if it is known to be not the minimum
3725 // signed value. If it is not, then we can safely use CMN.
3726 // Note: We can eventually remove this check and simply rely on
3727 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3728 // consistently sets them appropriately when making said nodes.
3729
3730 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3731 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3732}
3733
3734// Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
3735// the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
3736// can be set differently by this operation. It comes down to whether
3737// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3738// everything is fine. If not then the optimization is wrong. Thus general
3739// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3740//
3741// So, finally, the only LLVM-native comparisons that don't mention C or V
3742// are the ones that aren't unsigned comparisons. They're the only ones we can
3743// safely use CMN for in the absence of information about op2.
3744static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG) {
3745 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3746 (isIntEqualitySetCC(CC) ||
3747 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3748 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3749}
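// Illustrative only: the rewrite this enables in emitComparison() below turns
//   cmp  w0, w1      ; where w1 = (sub 0, w2)
// into
//   cmn  w0, w2
// For equality tests this is always sound; for unsigned orderings it also
// needs w2 known non-zero, and for signed orderings w2 must not be INT_MIN
// (see isSafeSignedCMN above).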
3750
3752 SelectionDAG &DAG, SDValue Chain,
3753 bool IsSignaling) {
3754 EVT VT = LHS.getValueType();
3755 assert(VT != MVT::f128);
3756
3757 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3758
3759 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3760 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3761 {Chain, LHS});
3762 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3763 {LHS.getValue(1), RHS});
3764 Chain = RHS.getValue(1);
3765 }
3766 unsigned Opcode =
3767 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3768 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3769}
3770
3771static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3772 const SDLoc &DL, SelectionDAG &DAG) {
3773 EVT VT = LHS.getValueType();
3774 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3775
3776 if (VT.isFloatingPoint()) {
3777 assert(VT != MVT::f128);
3778 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3779 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3780 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3781 }
3782 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3783 }
3784
3785 // The CMP instruction is just an alias for SUBS, and representing it as
3786 // SUBS means that it's possible to get CSE with subtract operations.
3787 // A later phase can perform the optimization of setting the destination
3788 // register to WZR/XZR if it ends up being unused.
3789 unsigned Opcode = AArch64ISD::SUBS;
3790
3791 if (isCMN(RHS, CC, DAG)) {
3792 // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
3793 Opcode = AArch64ISD::ADDS;
3794 RHS = RHS.getOperand(1);
3795 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3796 isIntEqualitySetCC(CC)) {
3797 // As we are looking for EQ/NE compares, the operands can be commuted; can
3798 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
3799 Opcode = AArch64ISD::ADDS;
3800 LHS = LHS.getOperand(1);
3801 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3802 if (LHS.getOpcode() == ISD::AND) {
3803 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3804 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3805 // of the signed comparisons.
3806 const SDValue ANDSNode =
3807 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3808 LHS.getOperand(0), LHS.getOperand(1));
3809 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3810 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3811 return ANDSNode.getValue(1);
3812 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3813 // Use result of ANDS
3814 return LHS.getValue(1);
3815 }
3816 }
3817
3818 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3819 .getValue(1);
3820}
3821
3822/// \defgroup AArch64CCMP CMP;CCMP matching
3823///
3824/// These functions deal with the formation of CMP;CCMP;... sequences.
3825/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3826/// a comparison. They set the NZCV flags to a predefined value if their
3827/// predicate is false. This allows to express arbitrary conjunctions, for
3828/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3829/// expressed as:
3830/// cmp A
3831/// ccmp B, inv(CB), CA
3832/// check for CB flags
3833///
3834/// This naturally lets us implement chains of AND operations with SETCC
3835/// operands. And we can even implement some other situations by transforming
3836/// them:
3837/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3838/// negating the flags used in a CCMP/FCCMP operations.
3839/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3840/// by negating the flags we test for afterwards. i.e.
3841/// NEG (CMP CCMP CCCMP ...) can be implemented.
3842/// - Note that we can only ever negate all previously processed results.
3843/// What we can not implement by flipping the flags to test is a negation
3844/// of two sub-trees (because the negation affects all sub-trees emitted so
3845/// far, so the 2nd sub-tree we emit would also affect the first).
3846/// With those tools we can implement some OR operations:
3847/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3848/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3849/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3850/// elimination rules from earlier to implement the whole thing as a
3851/// CCMP/FCCMP chain.
3852///
3853/// As complete example:
3854/// or (or (setCA (cmp A)) (setCB (cmp B)))
3855/// (and (setCC (cmp C)) (setCD (cmp D)))"
3856/// can be reassociated to:
3857/// or (and (setCC (cmp C)) setCD (cmp D))
3858// (or (setCA (cmp A)) (setCB (cmp B)))
3859/// can be transformed to:
3860/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3861/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
3862/// which can be implemented as:
3863/// cmp C
3864/// ccmp D, inv(CD), CC
3865/// ccmp A, CA, inv(CD)
3866/// ccmp B, CB, inv(CA)
3867/// check for CB flags
3868///
3869/// A counterexample is "or (and A B) (and C D)" which translates to
3870/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3871/// can only implement 1 of the inner (not) operations, but not both!
3872/// @{
3873
3874/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3875static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3876 ISD::CondCode CC, SDValue CCOp,
3877 AArch64CC::CondCode Predicate,
3878 AArch64CC::CondCode OutCC,
3879 const SDLoc &DL, SelectionDAG &DAG) {
3880 unsigned Opcode = 0;
3881 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3882
3883 if (LHS.getValueType().isFloatingPoint()) {
3884 assert(LHS.getValueType() != MVT::f128);
3885 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3886 LHS.getValueType() == MVT::bf16) {
3887 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3888 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3889 }
3890 Opcode = AArch64ISD::FCCMP;
3891 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3892 APInt Imm = Const->getAPIntValue();
3893 if (Imm.isNegative() && Imm.sgt(-32)) {
3894 Opcode = AArch64ISD::CCMN;
3895 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3896 }
3897 } else if (isCMN(RHS, CC, DAG)) {
3898 Opcode = AArch64ISD::CCMN;
3899 RHS = RHS.getOperand(1);
3900 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3901 isIntEqualitySetCC(CC)) {
3902 // As we are looking for EQ/NE compares, the operands can be commuted; can
3903 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction ?
3904 Opcode = AArch64ISD::CCMN;
3905 LHS = LHS.getOperand(1);
3906 }
3907 if (Opcode == 0)
3908 Opcode = AArch64ISD::CCMP;
3909
3910 SDValue Condition = getCondCode(DAG, Predicate);
3911 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3912 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3913 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3914 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3915}
3916
3917/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3918/// expressed as a conjunction. See \ref AArch64CCMP.
3919/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3920/// changing the conditions on the SETCC tests.
3921/// (this means we can call emitConjunctionRec() with
3922/// Negate==true on this sub-tree)
3923/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3924/// cannot do the negation naturally. We are required to
3925/// emit the subtree first in this case.
3926/// \param PreferFirst Set to true if processing this subtree first may
3927/// result in more efficient code.
3928/// \param WillNegate Is true if we are called when the result of this
3929/// subexpression must be negated. This happens when the
3930/// outer expression is an OR. We can use this fact to know
3931/// that we have a double negation (or (or ...) ...) that
3932/// can be implemented for free.
3933static bool canEmitConjunction(SelectionDAG &DAG, const SDValue Val,
3934 bool &CanNegate, bool &MustBeFirst,
3935 bool &PreferFirst, bool WillNegate,
3936 unsigned Depth = 0) {
3937 if (!Val.hasOneUse())
3938 return false;
3939 unsigned Opcode = Val->getOpcode();
3940 if (Opcode == ISD::SETCC) {
3941 EVT VT = Val->getOperand(0).getValueType();
3942 if (VT == MVT::f128)
3943 return false;
3944 CanNegate = true;
3945 MustBeFirst = false;
3946 // Designate this operation as a preferred first operation if the result
3947 // of a SUB operation can be reused.
3948 PreferFirst = DAG.doesNodeExist(ISD::SUB, DAG.getVTList(VT),
3949 {Val->getOperand(0), Val->getOperand(1)});
3950 return true;
3951 }
3952 // Protect against exponential runtime and stack overflow.
3953 if (Depth > 6)
3954 return false;
3955 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3956 bool IsOR = Opcode == ISD::OR;
3957 SDValue O0 = Val->getOperand(0);
3958 SDValue O1 = Val->getOperand(1);
3959 bool CanNegateL;
3960 bool MustBeFirstL;
3961 bool PreferFirstL;
3962 if (!canEmitConjunction(DAG, O0, CanNegateL, MustBeFirstL, PreferFirstL,
3963 IsOR, Depth + 1))
3964 return false;
3965 bool CanNegateR;
3966 bool MustBeFirstR;
3967 bool PreferFirstR;
3968 if (!canEmitConjunction(DAG, O1, CanNegateR, MustBeFirstR, PreferFirstR,
3969 IsOR, Depth + 1))
3970 return false;
3971
3972 if (MustBeFirstL && MustBeFirstR)
3973 return false;
3974
3975 if (IsOR) {
3976 // For an OR expression we need to be able to naturally negate at least
3977 // one side or we cannot do the transformation at all.
3978 if (!CanNegateL && !CanNegateR)
3979 return false;
3980 // If the result of the OR will be negated and we can naturally negate
3981 // the leaves, then this sub-tree as a whole negates naturally.
3982 CanNegate = WillNegate && CanNegateL && CanNegateR;
3983 // If we cannot naturally negate the whole sub-tree, then this must be
3984 // emitted first.
3985 MustBeFirst = !CanNegate;
3986 } else {
3987 assert(Opcode == ISD::AND && "Must be OR or AND");
3988 // We cannot naturally negate an AND operation.
3989 CanNegate = false;
3990 MustBeFirst = MustBeFirstL || MustBeFirstR;
3991 }
3992 PreferFirst = PreferFirstL || PreferFirstR;
3993 return true;
3994 }
3995 return false;
3996}
3997
3998/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3999/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
4000/// Tries to transform the given i1 producing node @p Val to a series of
4001/// compare and conditional compare operations. @returns an NZCV flags
4002/// producing node and sets @p OutCC to the flags that should be tested or
4003/// returns SDValue() if the transformation was not possible.
4004/// \p Negate is true if we want this sub-tree being negated just by changing
4005/// SETCC conditions.
4006static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
4007 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
4008 AArch64CC::CondCode Predicate) {
4009 // We're at a tree leaf, produce a conditional comparison operation.
4010 unsigned Opcode = Val->getOpcode();
4011 if (Opcode == ISD::SETCC) {
4012 SDValue LHS = Val->getOperand(0);
4013 SDValue RHS = Val->getOperand(1);
4014 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
4015 bool isInteger = LHS.getValueType().isInteger();
4016 if (Negate)
4017 CC = getSetCCInverse(CC, LHS.getValueType());
4018 SDLoc DL(Val);
4019 // Determine OutCC and handle FP special case.
4020 if (isInteger) {
4021 OutCC = changeIntCCToAArch64CC(CC, RHS);
4022 } else {
4023 assert(LHS.getValueType().isFloatingPoint());
4024 AArch64CC::CondCode ExtraCC;
4025 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4026 // Some floating point conditions can't be tested with a single condition
4027 // code. Construct an additional comparison in this case.
4028 if (ExtraCC != AArch64CC::AL) {
4029 SDValue ExtraCmp;
4030 if (!CCOp.getNode())
4031 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
4032 else
4033 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
4034 ExtraCC, DL, DAG);
4035 CCOp = ExtraCmp;
4036 Predicate = ExtraCC;
4037 }
4038 }
4039
4040 // Produce a normal comparison if we are first in the chain
4041 if (!CCOp)
4042 return emitComparison(LHS, RHS, CC, DL, DAG);
4043 // Otherwise produce a ccmp.
4044 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
4045 DAG);
4046 }
4047 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
4048
4049 bool IsOR = Opcode == ISD::OR;
4050
4051 SDValue LHS = Val->getOperand(0);
4052 bool CanNegateL;
4053 bool MustBeFirstL;
4054 bool PreferFirstL;
4055 bool ValidL = canEmitConjunction(DAG, LHS, CanNegateL, MustBeFirstL,
4056 PreferFirstL, IsOR);
4057 assert(ValidL && "Valid conjunction/disjunction tree");
4058 (void)ValidL;
4059
4060 SDValue RHS = Val->getOperand(1);
4061 bool CanNegateR;
4062 bool MustBeFirstR;
4063 bool PreferFirstR;
4064 bool ValidR = canEmitConjunction(DAG, RHS, CanNegateR, MustBeFirstR,
4065 PreferFirstR, IsOR);
4066 assert(ValidR && "Valid conjunction/disjunction tree");
4067 (void)ValidR;
4068
4069 bool ShouldFirstL = PreferFirstL && !PreferFirstR && !MustBeFirstR;
4070
4071 // Swap sub-tree that must or should come first to the right side.
4072 if (MustBeFirstL || ShouldFirstL) {
4073 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4074 std::swap(LHS, RHS);
4075 std::swap(CanNegateL, CanNegateR);
4076 std::swap(MustBeFirstL, MustBeFirstR);
4077 }
4078
4079 bool NegateR;
4080 bool NegateAfterR;
4081 bool NegateL;
4082 bool NegateAfterAll;
4083 if (Opcode == ISD::OR) {
4084 // Swap the sub-tree that we can negate naturally to the left.
4085 if (!CanNegateL) {
4086 assert(CanNegateR && "at least one side must be negatable");
4087 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4088 assert(!Negate);
4089 std::swap(LHS, RHS);
4090 NegateR = false;
4091 NegateAfterR = true;
4092 } else {
4093 // Negate the left sub-tree if possible, otherwise negate the result.
4094 NegateR = CanNegateR;
4095 NegateAfterR = !CanNegateR;
4096 }
4097 NegateL = true;
4098 NegateAfterAll = !Negate;
4099 } else {
4100 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4101 assert(!Negate && "Valid conjunction/disjunction tree");
4102
4103 NegateL = false;
4104 NegateR = false;
4105 NegateAfterR = false;
4106 NegateAfterAll = false;
4107 }
4108
4109 // Emit sub-trees.
4110 AArch64CC::CondCode RHSCC;
4111 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4112 if (NegateAfterR)
4113 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4114 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4115 if (NegateAfterAll)
4116 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4117 return CmpL;
4118}
4119
4120/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4121/// In some cases this is even possible with OR operations in the expression.
4122/// See \ref AArch64CCMP.
4123/// \see emitConjunctionRec().
4124static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4125 AArch64CC::CondCode &OutCC) {
4126 bool DummyCanNegate;
4127 bool DummyMustBeFirst;
4128 bool DummyPreferFirst;
4129 if (!canEmitConjunction(DAG, Val, DummyCanNegate, DummyMustBeFirst,
4130 DummyPreferFirst, false))
4131 return SDValue();
4132
4133 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4134}
4135
4136/// @}
4137
4138/// Returns how profitable it is to fold a comparison's operand's shift and/or
4139/// extension operations.
4141 auto isSupportedExtend = [&](SDValue V) {
4142 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4143 return true;
4144
4145 if (V.getOpcode() == ISD::AND)
4146 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4147 uint64_t Mask = MaskCst->getZExtValue();
4148 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4149 }
4150
4151 return false;
4152 };
4153
4154 if (!Op.hasOneUse())
4155 return 0;
4156
4157 if (isSupportedExtend(Op))
4158 return 1;
4159
4160 unsigned Opc = Op.getOpcode();
4161 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4162 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4163 uint64_t Shift = ShiftCst->getZExtValue();
4164 if (isSupportedExtend(Op.getOperand(0)))
4165 return (Shift <= 4) ? 2 : 1;
4166 EVT VT = Op.getValueType();
4167 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4168 return 1;
4169 }
4170
4171 return 0;
4172}
4173
4174// emitComparison() converts comparison with one or negative one to comparison
4175// with 0. Note that this only works for signed comparisons because of how ANDS
4176// works.
4177static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4178 // Only works for ANDS and AND.
4179 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4180 return false;
4181
4182 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4183 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4184 return true;
4185 }
4186
4187 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4188 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4189 return true;
4190 }
4191
4192 return false;
4193}
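// Compile-time sanity check (illustrative only) of the signed equivalences
// used above: for any AND result m, "m < 1" matches "m <= 0" and "m > -1"
// matches "m >= 0".
static_assert(((5 & 4) < 1) == ((5 & 4) <= 0), "strictly less than one");
static_assert(((5 & 2) < 1) == ((5 & 2) <= 0), "also when the AND result is 0");
static_assert(((5 & 4) > -1) == ((5 & 4) >= 0), "greater than minus one");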
4194
4195static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4196 SDValue &AArch64cc, SelectionDAG &DAG,
4197 const SDLoc &DL) {
4198 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4199 EVT VT = RHS.getValueType();
4200 APInt C = RHSC->getAPIntValue();
4201 // shouldBeAdjustedToZero is a special case to better fold with
4202 // emitComparison().
4203 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4204 // Adjust the constant to zero.
4205 // CC has already been adjusted.
4206 RHS = DAG.getConstant(0, DL, VT);
4207 } else if (!isLegalCmpImmed(C)) {
4208 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4209 // Constant does not fit, try adjusting it by one?
4210 switch (CC) {
4211 default:
4212 break;
4213 case ISD::SETLT:
4214 case ISD::SETGE:
4215 if (!C.isMinSignedValue()) {
4216 APInt CMinusOne = C - 1;
4217 if (isLegalCmpImmed(CMinusOne) ||
4218 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4219 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4220 RHS = DAG.getConstant(CMinusOne, DL, VT);
4221 }
4222 }
4223 break;
4224 case ISD::SETULT:
4225 case ISD::SETUGE: {
4226 // C is not 0 because it is a legal immediate.
4227 assert(!C.isZero() && "C should not be zero here");
4228 APInt CMinusOne = C - 1;
4229 if (isLegalCmpImmed(CMinusOne) ||
4230 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4231 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4232 RHS = DAG.getConstant(CMinusOne, DL, VT);
4233 }
4234 break;
4235 }
4236 case ISD::SETLE:
4237 case ISD::SETGT:
4238 if (!C.isMaxSignedValue()) {
4239 APInt CPlusOne = C + 1;
4240 if (isLegalCmpImmed(CPlusOne) ||
4241 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4242 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4243 RHS = DAG.getConstant(CPlusOne, DL, VT);
4244 }
4245 }
4246 break;
4247 case ISD::SETULE:
4248 case ISD::SETUGT: {
4249 if (!C.isAllOnes()) {
4250 APInt CPlusOne = C + 1;
4251 if (isLegalCmpImmed(CPlusOne) ||
4252 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4253 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4254 RHS = DAG.getConstant(CPlusOne, DL, VT);
4255 }
4256 }
4257 break;
4258 }
4259 }
4260 }
4261 }
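  // Worked example (illustrative only): for a signed "x < 4097" the constant
  // 4097 is not a legal compare immediate, but 4096 (encoded as 1, LSL #12)
  // is, so the SETLT case above rewrites the comparison to "x <= 4096".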
4262
4263 // Comparisons are canonicalized so that the RHS operand is simpler than the
4264 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4265 // can fold some shift+extend operations on the RHS operand, so swap the
4266 // operands if that can be done.
4267 //
4268 // For example:
4269 // lsl w13, w11, #1
4270 // cmp w13, w12
4271 // can be turned into:
4272 // cmp w12, w11, lsl #1
4273 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4274 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4275 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4276 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4277 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4278
4279 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4280 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4281 std::swap(LHS, RHS);
4282 CC = ISD::getSetCCSwappedOperands(CC);
4283 }
4284 }
4285
4286 SDValue Cmp;
4287 AArch64CC::CondCode AArch64CC;
4288 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4289 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4290
4291 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4292 // For the i8 operand, the largest immediate is 255, so this can be easily
4293 // encoded in the compare instruction. For the i16 operand, however, the
4294 // largest immediate cannot be encoded in the compare.
4295 // Therefore, use a sign extending load and cmn to avoid materializing the
4296 // -1 constant. For example,
4297 // movz w1, #65535
4298 // ldrh w0, [x0, #0]
4299 // cmp w0, w1
4300 // >
4301 // ldrsh w0, [x0, #0]
4302 // cmn w0, #1
4303 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4304 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4305 // ensure both the LHS and RHS are truly zero extended and to make sure the
4306 // transformation is profitable.
4307 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4308 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4309 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4310 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4311 int16_t ValueofRHS = RHS->getAsZExtVal();
4312 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4313 SDValue SExt =
4314 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4315 DAG.getValueType(MVT::i16));
4316 Cmp = emitComparison(
4317 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4318 DL, DAG);
4319 AArch64CC = changeIntCCToAArch64CC(CC);
4320 }
4321 }
4322
4323 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4324 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4325 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4326 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4327 }
4328 }
4329 }
4330
4331 if (!Cmp) {
4332 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4333 AArch64CC = changeIntCCToAArch64CC(CC, RHS);
4334 }
4335 AArch64cc = getCondCode(DAG, AArch64CC);
4336 return Cmp;
4337}
4338
4339static std::pair<SDValue, SDValue>
4340 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4341 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4342 "Unsupported value type");
4343 SDValue Value, Overflow;
4344 SDLoc DL(Op);
4345 SDValue LHS = Op.getOperand(0);
4346 SDValue RHS = Op.getOperand(1);
4347 unsigned Opc = 0;
4348 switch (Op.getOpcode()) {
4349 default:
4350 llvm_unreachable("Unknown overflow instruction!");
4351 case ISD::SADDO:
4352 Opc = AArch64ISD::ADDS;
4353 CC = AArch64CC::VS;
4354 break;
4355 case ISD::UADDO:
4356 Opc = AArch64ISD::ADDS;
4357 CC = AArch64CC::HS;
4358 break;
4359 case ISD::SSUBO:
4360 Opc = AArch64ISD::SUBS;
4361 CC = AArch64CC::VS;
4362 break;
4363 case ISD::USUBO:
4364 Opc = AArch64ISD::SUBS;
4365 CC = AArch64CC::LO;
4366 break;
4367 // Multiply needs a little bit of extra work.
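 // There is no flag-setting multiply instruction, so overflow is detected by
 // computing a wider product (for i32) or the high half of the product (for
 // i64) and comparing it against what a non-overflowing result would produce.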
4368 case ISD::SMULO:
4369 case ISD::UMULO: {
4370 CC = AArch64CC::NE;
4371 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4372 if (Op.getValueType() == MVT::i32) {
4373 // Extend to 64-bits, then perform a 64-bit multiply.
4374 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4375 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4376 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4377 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4378 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4379
4380 // Check that the result fits into a 32-bit integer.
4381 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4382 if (IsSigned) {
4383 // cmp xreg, wreg, sxtw
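 // The i32 multiply overflows exactly when the 64-bit product differs from
 // the sign extension of its low 32 bits, so the SUBS below sets NE on
 // overflow.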
4384 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4385 Overflow =
4386 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4387 } else {
4388 // tst xreg, #0xffffffff00000000
4389 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4390 Overflow =
4391 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4392 }
4393 break;
4394 }
4395 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4396 // For the 64-bit multiply, overflow is detected from the high half of the product.
4397 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4398 if (IsSigned) {
4399 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4400 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4401 DAG.getConstant(63, DL, MVT::i64));
4402 // It is important that LowerBits is last, otherwise the arithmetic
4403 // shift will not be folded into the compare (SUBS).
4404 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4405 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4406 .getValue(1);
4407 } else {
4408 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4409 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4410 Overflow =
4411 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4412 DAG.getConstant(0, DL, MVT::i64),
4413 UpperBits).getValue(1);
4414 }
4415 break;
4416 }
4417 } // switch (...)
4418
4419 if (Opc) {
4420 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4421
4422 // Emit the AArch64 operation with overflow check.
4423 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4424 Overflow = Value.getValue(1);
4425 }
4426 return std::make_pair(Value, Overflow);
4427}
4428
4429SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4430 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4431 !Subtarget->isNeonAvailable()))
4432 return LowerToScalableOp(Op, DAG);
4433
4434 SDValue Sel = Op.getOperand(0);
4435 SDValue Other = Op.getOperand(1);
4436 SDLoc DL(Sel);
4437
4438 // If the operand is an overflow checking operation, invert the condition
4439 // code and kill the Not operation. I.e., transform:
4440 // (xor (overflow_op_bool, 1))
4441 // -->
4442 // (csel 1, 0, invert(cc), overflow_op_bool)
4443 // ... which later gets transformed to just a cset instruction with an
4444 // inverted condition code, rather than a cset + eor sequence.
4445 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
4446 // Only lower legal XALUO ops.
4447 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4448 return SDValue();
4449
4450 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4451 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4452 AArch64CC::CondCode CC;
4453 SDValue Value, Overflow;
4454 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4455 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4456 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4457 CCVal, Overflow);
4458 }
4459 // If neither operand is a SELECT_CC, give up.
4460 if (Sel.getOpcode() != ISD::SELECT_CC)
4461 std::swap(Sel, Other);
4462 if (Sel.getOpcode() != ISD::SELECT_CC)
4463 return Op;
4464
4465 // The folding we want to perform is:
4466 // (xor x, (select_cc a, b, cc, 0, -1) )
4467 // -->
4468 // (csel x, (xor x, -1), cc ...)
4469 //
4470 // The latter will get matched to a CSINV instruction.
4471
4472 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4473 SDValue LHS = Sel.getOperand(0);
4474 SDValue RHS = Sel.getOperand(1);
4475 SDValue TVal = Sel.getOperand(2);
4476 SDValue FVal = Sel.getOperand(3);
4477
4478 // FIXME: This could be generalized to non-integer comparisons.
4479 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4480 return Op;
4481
4482 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4483 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4484
4485 // The values aren't constants, this isn't the pattern we're looking for.
4486 if (!CFVal || !CTVal)
4487 return Op;
4488
4489 // We can commute the SELECT_CC by inverting the condition. This
4490 // might be needed to make this fit into a CSINV pattern.
4491 if (CTVal->isAllOnes() && CFVal->isZero()) {
4492 std::swap(TVal, FVal);
4493 std::swap(CTVal, CFVal);
4494 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4495 }
4496
4497 // If the constants line up, perform the transform!
4498 if (CTVal->isZero() && CFVal->isAllOnes()) {
4499 SDValue CCVal;
4500 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4501
4502 FVal = Other;
4503 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4504 DAG.getAllOnesConstant(DL, Other.getValueType()));
4505
4506 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4507 CCVal, Cmp);
4508 }
4509
4510 return Op;
4511}
4512
4513// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4514// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4515// sets 'C' bit to 0.
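 // In both cases this is done with a SUBS whose value result is ignored: for
 // !Invert it emits SUBS Value, #1 (C is set iff Value is non-zero, i.e. no
 // borrow), and for Invert it emits SUBS 0, Value (C is set iff Value is 0).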
4516 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4517 SDLoc DL(Value);
4518 EVT VT = Value.getValueType();
4519 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4520 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4521 SDValue Cmp =
4522 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4523 return Cmp.getValue(1);
4524}
4525
4526// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4527// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4528 static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4529 bool Invert) {
4530 assert(Glue.getResNo() == 1);
4531 SDLoc DL(Glue);
4532 SDValue Zero = DAG.getConstant(0, DL, VT);
4533 SDValue One = DAG.getConstant(1, DL, VT);
4534 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4535 SDValue CC = getCondCode(DAG, Cond);
4536 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4537}
4538
4539// Value is 1 if 'V' bit of NZCV is 1, else 0
4540 static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4541 assert(Glue.getResNo() == 1);
4542 SDLoc DL(Glue);
4543 SDValue Zero = DAG.getConstant(0, DL, VT);
4544 SDValue One = DAG.getConstant(1, DL, VT);
4545 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4546 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4547}
4548
4549// This lowering is inefficient, but it will get cleaned up by
4550// `foldOverflowCheck`
4551 static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4552 unsigned Opcode, bool IsSigned) {
4553 EVT VT0 = Op.getValue(0).getValueType();
4554 EVT VT1 = Op.getValue(1).getValueType();
4555
4556 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4557 return SDValue();
4558
4559 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4560 SDValue OpLHS = Op.getOperand(0);
4561 SDValue OpRHS = Op.getOperand(1);
4562 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4563
4564 SDLoc DL(Op);
4565
4566 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4567 OpRHS, OpCarryIn);
4568
4569 SDValue OutFlag =
4570 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4571 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4572
4573 return DAG.getMergeValues({Sum, OutFlag}, DL);
4574}
4575
4576static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
4577 SelectionDAG &DAG,
4578 bool LastOperandIsImm = false) {
4579 if (Op.getValueType().isVector())
4580 return SDValue();
4581
4582 SDLoc DL(Op);
4583 SmallVector<SDValue, 4> NewOps;
4584 const unsigned NumOperands = Op.getNumOperands();
4585 auto getFloatVT = [](EVT VT) {
4586 assert((VT == MVT::i32 || VT == MVT::i64) && "Unexpected VT");
4587 return VT == MVT::i32 ? MVT::f32 : MVT::f64;
4588 };
4589 auto bitcastToFloat = [&](SDValue Val) {
4590 return DAG.getBitcast(getFloatVT(Val.getValueType()), Val);
4591 };
4592
4593 // Skip first operand as it is intrinsic ID.
4594 for (unsigned I = 1; I < NumOperands; ++I) {
4595 SDValue Val = Op.getOperand(I);
4596 const bool KeepInt = LastOperandIsImm && (I == NumOperands - 1);
4597 NewOps.push_back(KeepInt ? Val : bitcastToFloat(Val));
4598 }
4599 EVT OrigVT = Op.getValueType();
4600 SDValue OpNode = DAG.getNode(Opcode, DL, getFloatVT(OrigVT), NewOps);
4601 return DAG.getBitcast(OrigVT, OpNode);
4602}
4603
4604 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4605 // Let legalize expand this if it isn't a legal type yet.
4606 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4607 return SDValue();
4608
4609 SDLoc DL(Op);
4610 AArch64CC::CondCode CC;
4611 // The actual operation that sets the overflow or carry flag.
4612 SDValue Value, Overflow;
4613 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4614
4615 // We use 0 and 1 as false and true values.
4616 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4617 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4618
4619 // We use an inverted condition, because the conditional select is inverted
4620 // too. This will allow it to be selected to a single instruction:
4621 // CSINC Wd, WZR, WZR, invert(cond).
4622 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4623 Overflow =
4624 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4625
4626 return DAG.getMergeValues({Value, Overflow}, DL);
4627}
4628
4629// Prefetch operands are:
4630// 1: Address to prefetch
4631// 2: bool isWrite
4632// 3: int locality (0 = no locality ... 3 = extreme locality)
4633// 4: bool isDataCache
4634 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4635 SDLoc DL(Op);
4636 unsigned IsWrite = Op.getConstantOperandVal(2);
4637 unsigned Locality = Op.getConstantOperandVal(3);
4638 unsigned IsData = Op.getConstantOperandVal(4);
4639
4640 bool IsStream = !Locality;
4641 // When the locality number is set
4642 if (Locality) {
4643 // The front-end should have filtered out the out-of-range values
4644 assert(Locality <= 3 && "Prefetch locality out-of-range");
4645 // The locality degree is the opposite of the cache speed.
4646 // Put the number the other way around.
4647 // The encoding starts at 0 for level 1
4648 Locality = 3 - Locality;
4649 }
4650
4651 // Build the mask value encoding the expected behavior.
4652 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4653 (!IsData << 3) | // IsDataCache bit
4654 (Locality << 1) | // Cache level bits
4655 (unsigned)IsStream; // Stream bit
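 // For example, a read prefetch of the data cache with locality 3 encodes as
 // PrfOp = 0b00000 (PLDL1KEEP), and locality 1 encodes as 0b00100 (PLDL3KEEP).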
4656 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4657 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4658 Op.getOperand(1));
4659}
4660
4661 // Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4662 // is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4663 // SUBS (AND X Y) Z, which produces better code via emitComparison.
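 // For example, "(x & 0xff) u< 0x10" becomes "(x & 0xf0) == 0".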
4664 static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS,
4665 SelectionDAG &DAG, const SDLoc DL) {
4666 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4667 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4668 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4669 if (LHSConstOp && RHSConst) {
4670 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4671 uint64_t RHSConstant = RHSConst->getZExtValue();
4672 if (isPowerOf2_64(RHSConstant)) {
4673 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4674 LHS =
4675 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4676 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4677 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4678 CC = ISD::SETEQ;
4679 }
4680 }
4681 }
4682}
4683
4684SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4685 SelectionDAG &DAG) const {
4686 EVT VT = Op.getValueType();
4687 if (VT.isScalableVector()) {
4688 SDValue SrcVal = Op.getOperand(0);
4689
4690 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4691 // Break conversion in two with the first part converting to f32 and the
4692 // second using native f32->VT instructions.
4693 SDLoc DL(Op);
4694 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4695 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4696 }
4697
4698 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4699 }
4700
4701 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4702 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4703
4704 bool IsStrict = Op->isStrictFPOpcode();
4705 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4706 EVT Op0VT = Op0.getValueType();
4707 if (VT == MVT::f64) {
4708 // f32->f64 and f16->f64 extends are legal.
4709 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4710 return Op;
4711 // Split bf16->f64 extends into two fpextends.
4712 if (Op0VT == MVT::bf16 && IsStrict) {
4713 SDValue Ext1 =
4714 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4715 {Op0, Op.getOperand(0)});
4716 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4717 {Ext1, Ext1.getValue(1)});
4718 }
4719 if (Op0VT == MVT::bf16)
4720 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4721 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4722 return SDValue();
4723 }
4724
4725 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4726 return SDValue();
4727}
4728
4729SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4730 SelectionDAG &DAG) const {
4731 EVT VT = Op.getValueType();
4732 bool IsStrict = Op->isStrictFPOpcode();
4733 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4734 EVT SrcVT = SrcVal.getValueType();
4735 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4736 SDNodeFlags Flags = Op->getFlags();
4737
4738 if (VT.isScalableVector()) {
4739 // Let common code split the operation.
4740 if (SrcVT == MVT::nxv8f32)
4741 return Op;
4742
4743 if (VT.getScalarType() != MVT::bf16)
4744 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4745
4746 SDLoc DL(Op);
4747 constexpr EVT I32 = MVT::nxv4i32;
4748 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4749
4750 SDValue NaN;
4751 SDValue Narrow;
4752
4753 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4754 if (Subtarget->hasBF16())
4755 return LowerToPredicatedOp(Op, DAG,
4756 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4757
4758 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4759
4760 // Set the quiet bit.
4761 if (!DAG.isKnownNeverSNaN(SrcVal) && !Flags.hasNoNaNs())
4762 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4763 } else if (SrcVT == MVT::nxv2f64 &&
4764 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4765 // Round to float without introducing rounding errors and try again.
4766 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4767 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4768 Pg, SrcVal, DAG.getPOISON(MVT::nxv2f32));
4769
4771 if (IsStrict)
4772 NewOps.push_back(Op.getOperand(0));
4773 NewOps.push_back(Narrow);
4774 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4775 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4776 } else
4777 return SDValue();
4778
4779 if (!Trunc) {
4780 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4781 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4782 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4783 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4784 }
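 // The bias of 0x7fff plus the low bit of the truncated result implements
 // round-to-nearest-even on bit 16: values exactly halfway round towards the
 // result whose retained low bit is zero.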
4785
4786 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4787 // 0x80000000.
4788 if (NaN) {
4789 EVT I1 = I32.changeElementType(*DAG.getContext(), MVT::i1);
4790 EVT CondVT = VT.changeElementType(*DAG.getContext(), MVT::i1);
4791 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4792 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4793 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4794 }
4795
4796 // Now that we have rounded, shift the bits into position.
4797 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4798 return getSVESafeBitCast(VT, Narrow, DAG);
4799 }
4800
4801 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4802 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4803
4804 // Expand cases where the result type is BF16 but we don't have hardware
4805 // instructions to lower it.
4806 if (VT.getScalarType() == MVT::bf16 &&
4807 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4808 Subtarget->hasBF16())) {
4809 SDLoc DL(Op);
4810 SDValue Narrow = SrcVal;
4811 SDValue NaN;
4812 EVT I32 = SrcVT.changeElementType(*DAG.getContext(), MVT::i32);
4813 EVT F32 = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
4814 if (SrcVT.getScalarType() == MVT::f32) {
4815 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4816 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4817 if (!NeverSNaN) {
4818 // Set the quiet bit.
4819 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4820 DAG.getConstant(0x400000, DL, I32));
4821 }
4822 } else if (SrcVT.getScalarType() == MVT::f64) {
4823 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4824 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4825 } else {
4826 return SDValue();
4827 }
4828 if (!Trunc) {
4829 SDValue One = DAG.getConstant(1, DL, I32);
4830 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4831 DAG.getShiftAmountConstant(16, I32, DL));
4832 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4833 SDValue RoundingBias =
4834 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4835 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4836 }
4837
4838 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4839 // 0x80000000.
4840 if (NaN) {
4841 SDValue IsNaN = DAG.getSetCC(
4842 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4843 SrcVal, SrcVal, ISD::SETUO);
4844 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4845 }
4846
4847 // Now that we have rounded, shift the bits into position.
4848 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4849 DAG.getShiftAmountConstant(16, I32, DL));
4850 if (VT.isVector()) {
4851 EVT I16 = I32.changeVectorElementType(*DAG.getContext(), MVT::i16);
4852 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4853 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4854 }
4855 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4856 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4857 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4858 : Result;
4859 }
4860
4861 if (SrcVT != MVT::f128) {
4862 // Expand cases where the input is a vector bigger than NEON.
4863 if (useSVEForFixedLengthVectorVT(SrcVT))
4864 return SDValue();
4865
4866 // It's legal except when f128 is involved
4867 return Op;
4868 }
4869
4870 return SDValue();
4871}
4872
4873SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4874 SelectionDAG &DAG) const {
4875 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4876 // Any additional optimization in this function should be recorded
4877 // in the cost tables.
4878 bool IsStrict = Op->isStrictFPOpcode();
4879 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4880 EVT VT = Op.getValueType();
4881
4882 assert(!(IsStrict && VT.isScalableVector()) &&
4883 "Unimplemented SVE support for STRICT_FP_to_INT!");
4884
4885 // f16 conversions are promoted to f32 when full fp16 is not supported.
4886 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4887 InVT.getVectorElementType() == MVT::bf16) {
4888 EVT NewVT = VT.changeElementType(*DAG.getContext(), MVT::f32);
4889 SDLoc DL(Op);
4890 if (IsStrict) {
4891 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4892 {Op.getOperand(0), Op.getOperand(1)});
4893 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4894 {Ext.getValue(1), Ext.getValue(0)});
4895 }
4896 return DAG.getNode(
4897 Op.getOpcode(), DL, Op.getValueType(),
4898 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4899 }
4900
4901 if (VT.isScalableVector()) {
4902 if (VT.getVectorElementType() == MVT::i1) {
4903 SDLoc DL(Op);
4904 EVT CvtVT = getPromotedVTForPredicate(VT);
4905 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4906 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4907 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4908 }
4909
4910 // Let common code split the operation.
4911 if (InVT == MVT::nxv8f32)
4912 return Op;
4913
4914 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4915 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4916 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4917 return LowerToPredicatedOp(Op, DAG, Opcode);
4918 }
4919
4920 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4921 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4922 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4923
4924 uint64_t VTSize = VT.getFixedSizeInBits();
4925 uint64_t InVTSize = InVT.getFixedSizeInBits();
4926 if (VTSize < InVTSize) {
4927 SDLoc DL(Op);
4928 if (IsStrict) {
4929 InVT = InVT.changeVectorElementTypeToInteger();
4930 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4931 {Op.getOperand(0), Op.getOperand(1)});
4932 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4933 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4934 }
4935 SDValue Cv =
4936 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4937 Op.getOperand(0));
4938 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4939 }
4940
4941 if (VTSize > InVTSize) {
4942 SDLoc DL(Op);
4943 MVT ExtVT =
4944 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4945 VT.getVectorNumElements());
4946 if (IsStrict) {
4947 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4948 {Op.getOperand(0), Op.getOperand(1)});
4949 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4950 {Ext.getValue(1), Ext.getValue(0)});
4951 }
4952 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4953 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4954 }
4955
4956 // Use a scalar operation for conversions between single-element vectors of
4957 // the same size.
4958 if (InVT.getVectorNumElements() == 1) {
4959 SDLoc DL(Op);
4960 SDValue Extract = DAG.getNode(
4961 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getVectorElementType(),
4962 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4963 EVT ScalarVT = VT.getScalarType();
4964 if (IsStrict)
4965 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4966 {Op.getOperand(0), Extract});
4967 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4968 }
4969
4970 // Type changing conversions are illegal.
4971 return Op;
4972}
4973
4974SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4975 SelectionDAG &DAG) const {
4976 bool IsStrict = Op->isStrictFPOpcode();
4977 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4978
4979 if (SrcVal.getValueType().isVector())
4980 return LowerVectorFP_TO_INT(Op, DAG);
4981
4982 // f16 conversions are promoted to f32 when full fp16 is not supported.
4983 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4984 SrcVal.getValueType() == MVT::bf16) {
4985 SDLoc DL(Op);
4986 if (IsStrict) {
4987 SDValue Ext =
4988 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4989 {Op.getOperand(0), SrcVal});
4990 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4991 {Ext.getValue(1), Ext.getValue(0)});
4992 }
4993 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4994 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4995 }
4996
4997 if (SrcVal.getValueType() != MVT::f128) {
4998 // It's legal except when f128 is involved
4999 return Op;
5000 }
5001
5002 return SDValue();
5003}
5004
5005SDValue
5006AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
5007 SelectionDAG &DAG) const {
5008 // AArch64 FP-to-int conversions saturate to the destination element size, so
5009 // we can lower common saturating conversions to simple instructions.
5010 SDValue SrcVal = Op.getOperand(0);
5011 EVT SrcVT = SrcVal.getValueType();
5012 EVT DstVT = Op.getValueType();
5013 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5014
5015 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
5016 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
5017 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5018 assert(SatWidth <= DstElementWidth &&
5019 "Saturation width cannot exceed result width");
5020
5021 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
5022 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
5023 // types, so this is hard to reach.
5024 if (DstVT.isScalableVector())
5025 return SDValue();
5026
5027 EVT SrcElementVT = SrcVT.getVectorElementType();
5028
5029 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5030 SDLoc DL(Op);
5031 SDValue SrcVal2;
5032 if ((SrcElementVT == MVT::f16 &&
5033 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
5034 SrcElementVT == MVT::bf16) {
5035 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
5036 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
5037 // If we are extending to a v8f32, split into two v4f32 to produce legal
5038 // types.
5039 if (F32VT.getSizeInBits() > 128) {
5040 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
5041 F32VT = F32VT.getHalfNumVectorElementsVT();
5042 }
5043 SrcVT = F32VT;
5044 SrcElementVT = MVT::f32;
5045 SrcElementWidth = 32;
5046 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
5047 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
5048 return SDValue();
5049
5050 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
5051 // width and produce a fcvtzu.
5052 if (SatWidth == 64 && SrcElementWidth < 64) {
5053 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
5054 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
5055 SrcVT = F64VT;
5056 SrcElementVT = MVT::f64;
5057 SrcElementWidth = 64;
5058 }
5059 // Cases that we can emit directly.
5060 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
5061 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5062 DAG.getValueType(DstVT.getScalarType()));
5063 if (SrcVal2) {
5064 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
5065 DAG.getValueType(DstVT.getScalarType()));
5066 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
5067 }
5068 return Res;
5069 }
5070
5071 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5072 // result. This is only valid if the legal cvt is larger than the saturate
5073 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
5074 // (at least until sqxtn is selected).
5075 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
5076 return SDValue();
5077
5078 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
5079 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
5080 DAG.getValueType(IntVT.getScalarType()));
5081 SDValue NativeCvt2 =
5082 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
5083 DAG.getValueType(IntVT.getScalarType()))
5084 : SDValue();
5085 SDValue Sat, Sat2;
5086 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5087 SDValue MinC = DAG.getConstant(
5088 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5089 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
5090 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5091 SDValue MaxC = DAG.getConstant(
5092 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
5093 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
5094 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
5095 } else {
5096 SDValue MinC = DAG.getConstant(
5097 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
5098 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5099 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5100 }
5101
5102 if (SrcVal2)
5103 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5105 Sat, Sat2);
5106
5107 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5108}
5109
5110SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5111 SelectionDAG &DAG) const {
5112 // AArch64 FP-to-int conversions saturate to the destination register size, so
5113 // we can lower common saturating conversions to simple instructions.
5114 SDValue SrcVal = Op.getOperand(0);
5115 EVT SrcVT = SrcVal.getValueType();
5116
5117 if (SrcVT.isVector())
5118 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5119
5120 EVT DstVT = Op.getValueType();
5121 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5122 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5123 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5124 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5125
5126 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5127 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5128 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5129 SrcVT = MVT::f32;
5130 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5131 SrcVT != MVT::bf16)
5132 return SDValue();
5133
5134 SDLoc DL(Op);
5135 // Cases that we can emit directly.
5136 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5137 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5138 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5139 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5140 DAG.getValueType(DstVT));
5141
5142 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5143 // result. This is only valid if the legal cvt is larger than the saturate
5144 // width.
5145 if (DstWidth < SatWidth)
5146 return SDValue();
5147
5148 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5149 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5150 SDValue CVTf32 =
5151 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5152 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5153 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5154 DAG.getValueType(SatVT));
5155 }
5156 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5157 return DAG.getBitcast(DstVT, CVTf32);
5158 }
5159
5160 SDValue NativeCvt =
5161 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5162 SDValue Sat;
5163 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5164 SDValue MinC = DAG.getConstant(
5165 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5166 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5167 SDValue MaxC = DAG.getConstant(
5168 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5169 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5170 } else {
5171 SDValue MinC = DAG.getConstant(
5172 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5173 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5174 }
5175
5176 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5177}
5178
5179SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5180 SelectionDAG &DAG) const {
5181 EVT VT = Op.getValueType();
5182 SDValue Src = Op.getOperand(0);
5183 SDLoc DL(Op);
5184
5185 assert(VT.isVector() && "Expected vector type");
5186
5187 EVT CastVT = VT.changeVectorElementType(
5188 *DAG.getContext(), Src.getValueType().getVectorElementType());
5189
5190 // Round the floating-point value into a floating-point register with the
5191 // current rounding mode.
5192 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5193
5194 // Truncate the rounded floating point to an integer.
5195 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5197}
5198
5199SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5200 SelectionDAG &DAG) const {
5201 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5202 // Any additional optimization in this function should be recorded
5203 // in the cost tables.
5204 bool IsStrict = Op->isStrictFPOpcode();
5205 EVT VT = Op.getValueType();
5206 SDLoc DL(Op);
5207 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5208 EVT InVT = In.getValueType();
5209 unsigned Opc = Op.getOpcode();
5210 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5211
5212 assert(!(IsStrict && VT.isScalableVector()) &&
5213 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5214
5215 // NOTE: i1->bf16 does not require promotion to f32.
5216 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5217 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5218 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5219 : DAG.getConstantFP(1.0, DL, VT);
5220 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5221 }
5222
5223 // Promote bf16 conversions to f32.
5224 if (VT.getVectorElementType() == MVT::bf16) {
5225 EVT F32 = VT.changeElementType(*DAG.getContext(), MVT::f32);
5226 if (IsStrict) {
5227 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5228 {Op.getOperand(0), In});
5229 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5230 {Op.getValueType(), MVT::Other},
5231 {Val.getValue(1), Val.getValue(0),
5232 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5233 }
5234 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5235 DAG.getNode(Op.getOpcode(), DL, F32, In),
5236 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5237 }
5238
5239 if (VT.isScalableVector()) {
5240 // Let common code split the operation.
5241 if (VT == MVT::nxv8f32)
5242 return Op;
5243
5244 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5245 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5246 return LowerToPredicatedOp(Op, DAG, Opcode);
5247 }
5248
5249 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5250 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5251 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5252
5253 uint64_t VTSize = VT.getFixedSizeInBits();
5254 uint64_t InVTSize = InVT.getFixedSizeInBits();
5255 if (VTSize < InVTSize) {
5256 // AArch64 doesn't have a direct vector instruction to convert
5257 // fixed point to floating point AND narrow it at the same time.
5258 // Additional rounding when the target is f32/f64 causes double
5259 // rounding issues. Conversion to f16 is fine due to narrow width.
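 // For example, converting i64 (2^53 + 2^29 + 1) via f64 and an FP_ROUND to
 // f32 yields 2^53, whereas a single correctly-rounded i64 -> f32 conversion
 // yields 2^53 + 2^30.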
5260 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5261 bool IsTargetf16 = false;
5262 if (Op.hasOneUse() &&
5263 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5264 // Some vector types are split during legalization into half, followed by
5265 // concatenation, followed by rounding to the original vector type. If we
5266 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5267 SDNode *U = *Op->user_begin();
5268 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5269 EVT TmpVT = U->user_begin()->getValueType(0);
5270 if (TmpVT.getScalarType() == MVT::f16)
5271 IsTargetf16 = true;
5272 }
5273 }
5274
5275 if (IsTargetf32 && !IsTargetf16) {
5276 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5277 }
5278
5279 MVT CastVT =
5280 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5281 InVT.getVectorNumElements());
5282 if (IsStrict) {
5283 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5284 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5285 {In.getValue(1), In.getValue(0),
5286 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5287 }
5288 In = DAG.getNode(Opc, DL, CastVT, In);
5289 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5290 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5291 }
5292
5293 if (VTSize > InVTSize) {
5294 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5295 EVT CastVT = VT.changeVectorElementTypeToInteger();
5296 In = DAG.getNode(CastOpc, DL, CastVT, In);
5297 if (IsStrict)
5298 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5299 return DAG.getNode(Opc, DL, VT, In);
5300 }
5301
5302 // Use a scalar operation for conversions between single-element vectors of
5303 // the same size.
5304 if (VT.getVectorNumElements() == 1) {
5305 SDValue Extract =
5306 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5307 DAG.getConstant(0, DL, MVT::i64));
5308 EVT ScalarVT = VT.getScalarType();
5309 if (IsStrict)
5310 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5311 {Op.getOperand(0), Extract});
5312 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5313 }
5314
5315 return Op;
5316}
5317
5318SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5319 SelectionDAG &DAG) const {
5320 if (Op.getValueType().isVector())
5321 return LowerVectorINT_TO_FP(Op, DAG);
5322
5323 bool IsStrict = Op->isStrictFPOpcode();
5324 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5325
5326 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5327 Op->getOpcode() == ISD::SINT_TO_FP;
5328
5329 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5330 SDLoc DL(Op);
5331 if (IsStrict) {
5332 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5333 {Op.getOperand(0), SrcVal});
5334 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5335 {Op.getValueType(), MVT::Other},
5336 {Val.getValue(1), Val.getValue(0),
5337 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5338 }
5339 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5340 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5341 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5342 };
5343
5344 if (Op.getValueType() == MVT::bf16) {
5345 unsigned MaxWidth = IsSigned
5346 ? DAG.ComputeMaxSignificantBits(SrcVal)
5347 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
5348 // bf16 conversions are promoted to f32 when converting from i16.
5349 if (MaxWidth <= 24) {
5350 return IntToFpViaPromotion(MVT::f32);
5351 }
5352
5353 // bf16 conversions are promoted to f64 when converting from i32.
5354 if (MaxWidth <= 53) {
5355 return IntToFpViaPromotion(MVT::f64);
5356 }
5357
5358 // We need to be careful about i64 -> bf16.
5359 // Consider the value 22216703 (it fits in an i32, which keeps the example small).
5360 // This number cannot be represented exactly as an f32, so an itofp will
5361 // turn it into 22216704.0; an fptrunc to bf16 then turns this into 22282240.0.
5362 // However, the correct bf16 result is 22151168.0.
5363 // We need to use sticky rounding to get this correct.
5364 if (SrcVal.getValueType() == MVT::i64) {
5365 SDLoc DL(Op);
5366 // This algorithm is equivalent to the following:
5367 // uint64_t SrcHi = SrcVal & ~0xfffull;
5368 // uint64_t SrcLo = SrcVal & 0xfffull;
5369 // uint64_t Highest = SrcVal >> 53;
5370 // bool HasHighest = Highest != 0;
5371 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5372 // double Rounded = static_cast<double>(ToRound);
5373 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5374 // uint64_t HasLo = SrcLo != 0;
5375 // bool NeedsAdjustment = HasHighest & HasLo;
5376 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5377 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5378 // return static_cast<__bf16>(Adjusted);
5379 //
5380 // Essentially, what happens is that SrcVal either fits perfectly in a
5381 // double-precision value or it is too big. If it is sufficiently small,
5382 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5383 // ensure that u64 -> double has no rounding error by only using the 52
5384 // MSB of the input. The low order bits will get merged into a sticky bit
5385 // which will avoid issues incurred by double rounding.
5386
5387 // Signed conversion is more or less like so:
5388 // copysign((__bf16)abs(SrcVal), SrcVal)
5389 SDValue SignBit;
5390 if (IsSigned) {
5391 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5392 DAG.getConstant(1ull << 63, DL, MVT::i64));
5393 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5394 }
5395 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5396 DAG.getConstant(~0xfffull, DL, MVT::i64));
5397 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5398 DAG.getConstant(0xfffull, DL, MVT::i64));
5399 SDValue Highest =
5400 DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5401 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5402 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5403 SDValue ToRound =
5404 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5405 SDValue Rounded =
5406 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5407 {Op.getOperand(0), ToRound})
5408 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5409
5410 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5411 if (SignBit) {
5412 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5413 }
5414
5415 SDValue HasHighest = DAG.getSetCC(
5416 DL,
5417 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5418 Highest, Zero64, ISD::SETNE);
5419
5420 SDValue HasLo = DAG.getSetCC(
5421 DL,
5422 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5423 SrcLo, Zero64, ISD::SETNE);
5424
5425 SDValue NeedsAdjustment =
5426 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5427 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5428
5429 SDValue AdjustedBits =
5430 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5431 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5432 return IsStrict
5433 ? DAG.getNode(
5434 ISD::STRICT_FP_ROUND, DL,
5435 {Op.getValueType(), MVT::Other},
5436 {Rounded.getValue(1), Adjusted,
5437 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5438 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5439 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5440 }
5441 }
5442
5443 // f16 conversions are promoted to f32 when full fp16 is not supported.
5444 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5445 return IntToFpViaPromotion(MVT::f32);
5446 }
5447
5448 // i128 conversions are libcalls.
5449 if (SrcVal.getValueType() == MVT::i128)
5450 return SDValue();
5451
5452 // Other conversions are legal, unless it's to the completely software-based
5453 // fp128.
5454 if (Op.getValueType() != MVT::f128)
5455 return Op;
5456 return SDValue();
5457}
5458
5459static MVT getSVEContainerType(EVT ContentTy);
5460
5461SDValue
5462AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5463 SelectionDAG &DAG) const {
5464 assert((Subtarget->hasSVE2() ||
5465 (Subtarget->hasSME() && Subtarget->isStreaming())) &&
5466 "Lowering loop_dependence_raw_mask or loop_dependence_war_mask "
5467 "requires SVE or SME");
5468
5469 SDLoc DL(Op);
5470 EVT VT = Op.getValueType();
5471 unsigned LaneOffset = Op.getConstantOperandVal(3);
5472 unsigned NumElements = VT.getVectorMinNumElements();
5473 uint64_t EltSizeInBytes = Op.getConstantOperandVal(2);
5474 EVT AddrTy = Op->getOperand(0).getValueType();
5475
5476 // Lane offsets and other element sizes are not supported by whilewr/rw.
5477 if (LaneOffset != 0 || !is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes))
5478 return SDValue();
5479
5480 EVT EltVT = MVT::getIntegerVT(EltSizeInBytes * 8);
5481 EVT PredVT =
5482 getPackedSVEVectorVT(EltVT).changeElementType(*DAG.getContext(), MVT::i1);
5483
5484 if (PredVT == VT) {
5485 // Legal whilewr/rw (lowered by tablegen matcher).
5486 if (AddrTy == MVT::i64)
5487 return Op;
5488
5489 // Almost legal whilewr/rw (addresses must be promoted to i64).
5490 assert(AddrTy == MVT::i32 && "Only expected i32 to be legal!");
5491 return DAG.getNode(
5492 Op.getOpcode(), DL, VT,
5493 DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Op->getOperand(0)),
5494 DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Op->getOperand(1)),
5495 DAG.getConstant(EltSizeInBytes, DL, MVT::i64), Op->getOperand(3));
5496 }
5497
5498 // Expand if this mask needs splitting (this will produce a whilelo).
5499 if (NumElements > PredVT.getVectorMinNumElements())
5500 return SDValue();
5501
5502 SDValue Mask =
5503 DAG.getNode(Op.getOpcode(), DL, PredVT, to_vector(Op->op_values()));
5504
5505 if (VT.isFixedLengthVector()) {
5506 EVT WidePredVT =
5507 PredVT.changeElementType(*DAG.getContext(), VT.getScalarType());
5508 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, WidePredVT, Mask);
5509 return convertFromScalableVector(DAG, VT, MaskAsInt);
5510 }
5511
5512 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Mask,
5513 DAG.getConstant(0, DL, MVT::i64));
5514}
5515
5516SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5517 SelectionDAG &DAG) const {
5518 EVT OpVT = Op.getValueType();
5519 EVT ArgVT = Op.getOperand(0).getValueType();
5520
5521 if (useSVEForFixedLengthVectorVT(OpVT))
5522 return LowerFixedLengthBitcastToSVE(Op, DAG);
5523
5524 if (OpVT.isScalableVector()) {
5525 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5526
5527 // Handle type legalisation first.
5528 if (!isTypeLegal(ArgVT)) {
5529 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5530 "Expected int->fp bitcast!");
5531
5532 // Bitcasting between unpacked vector types of different element counts is
5533 // not a NOP because the live elements are laid out differently.
5534 // 01234567
5535 // e.g. nxv2i32 = XX??XX??
5536 // nxv4f16 = X?X?X?X?
5537 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5538 return SDValue();
5539
5540 SDValue ExtResult =
5541 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5542 Op.getOperand(0));
5543 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5544 }
5545
5546 // Bitcasts between legal types with the same element count are legal.
5547 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5548 return Op;
5549
5550 // getSVESafeBitCast does not support casting between unpacked types.
5551 if (!isPackedVectorType(OpVT, DAG))
5552 return SDValue();
5553
5554 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5555 }
5556
5557 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5558 return SDValue();
5559
5560 // Bitcasts between f16 and bf16 are legal.
5561 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5562 return Op;
5563
5564 SDValue Src = Op.getOperand(0);
5565 SDLoc DL(Op);
5566 if (ArgVT.isVector() && ArgVT.getSizeInBits() == 16) {
5567 Src = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Src);
5568 ArgVT = MVT::i16;
5569 }
5570
5571 assert(ArgVT == MVT::i16);
5572
5573 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
5574 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5575 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5576}
5577
5578// Returns lane if Op extracts from a two-element vector and lane is constant
5579// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5580static std::optional<uint64_t>
5582 SDNode *OpNode = Op.getNode();
5583 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5584 return std::nullopt;
5585
5586 EVT VT = OpNode->getOperand(0).getValueType();
5587 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5588 if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5589 return std::nullopt;
5590
5591 return C->getZExtValue();
5592}
5593
5594 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5595 bool isSigned) {
5596 EVT VT = N.getValueType();
5597
5598 if (N.getOpcode() != ISD::BUILD_VECTOR)
5599 return false;
5600
5601 for (const SDValue &Elt : N->op_values()) {
5602 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5603 unsigned EltSize = VT.getScalarSizeInBits();
5604 unsigned HalfSize = EltSize / 2;
5605 if (isSigned) {
5606 if (!isIntN(HalfSize, C->getSExtValue()))
5607 return false;
5608 } else {
5609 if (!isUIntN(HalfSize, C->getZExtValue()))
5610 return false;
5611 }
5612 continue;
5613 }
5614 return false;
5615 }
5616
5617 return true;
5618}
5619
5621 EVT VT = N.getValueType();
5622 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5623 EVT HalfVT = EVT::getVectorVT(
5624 *DAG.getContext(),
5627 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5628}
5629
5630 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5631 return N.getOpcode() == ISD::SIGN_EXTEND ||
5632 N.getOpcode() == ISD::ANY_EXTEND ||
5633 isExtendedBUILD_VECTOR(N, DAG, true);
5634}
5635
5636 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5637 return N.getOpcode() == ISD::ZERO_EXTEND ||
5638 N.getOpcode() == ISD::ANY_EXTEND ||
5639 isExtendedBUILD_VECTOR(N, DAG, false);
5640}
5641
5642 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5643 unsigned Opcode = N.getOpcode();
5644 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5645 SDValue N0 = N.getOperand(0);
5646 SDValue N1 = N.getOperand(1);
5647 return N0->hasOneUse() && N1->hasOneUse() &&
5648 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5649 }
5650 return false;
5651}
5652
5653 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5654 unsigned Opcode = N.getOpcode();
5655 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5656 SDValue N0 = N.getOperand(0);
5657 SDValue N1 = N.getOperand(1);
5658 return N0->hasOneUse() && N1->hasOneUse() &&
5659 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5660 }
5661 return false;
5662}
5663
5664SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5665 SelectionDAG &DAG) const {
5666 // The rounding mode is in bits 23:22 of the FPCR.
5667 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
5668 // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5669 // so that the shift + and get folded into a bitfield extract.
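 // For example, FPCR.RMode == 0b01 (round towards plus infinity) gives
 // ((1 + 1) & 3) == 2, which is the FLT_ROUNDS value for upward rounding.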
5670 SDLoc DL(Op);
5671
5672 SDValue Chain = Op.getOperand(0);
5673 SDValue FPCR_64 =
5674 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5675 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5676 MVT::i64)});
5677 Chain = FPCR_64.getValue(1);
5678 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5679 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5680 DAG.getConstant(1U << 22, DL, MVT::i32));
5681 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5682 DAG.getConstant(22, DL, MVT::i32));
5683 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5684 DAG.getConstant(3, DL, MVT::i32));
5685 return DAG.getMergeValues({AND, Chain}, DL);
5686}
5687
5688SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5689 SelectionDAG &DAG) const {
5690 SDLoc DL(Op);
5691 SDValue Chain = Op->getOperand(0);
5692 SDValue RMValue = Op->getOperand(1);
5693
5694 // The rounding mode is in bits 23:22 of the FPCR.
5695 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
5696 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
5697 // (((arg - 1) & 3) << 22).
5698 //
5699 // The argument of llvm.set.rounding must be within the segment [0, 3], so
5700 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5701 // code that generates llvm.set.rounding to ensure this condition.
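 // For example, llvm.set.rounding(0) (round towards zero) computes
 // ((0 - 1) & 3) == 3, the FPCR RMode encoding for round-towards-zero.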
5702
5703 // Calculate new value of FPCR[23:22].
5704 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5705 DAG.getConstant(1, DL, MVT::i32));
5706 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5707 DAG.getConstant(0x3, DL, MVT::i32));
5708 RMValue =
5709 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5710 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5711 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5712
5713 // Get current value of FPCR.
5714 SDValue Ops[] = {
5715 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5716 SDValue FPCR =
5717 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5718 Chain = FPCR.getValue(1);
5719 FPCR = FPCR.getValue(0);
5720
5721 // Put new rounding mode into FPCR[23:22].
5722 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5723 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5724 DAG.getConstant(RMMask, DL, MVT::i64));
5725 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5726 SDValue Ops2[] = {
5727 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5728 FPCR};
5729 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5730}
5731
5732SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5733 SelectionDAG &DAG) const {
5734 SDLoc DL(Op);
5735 SDValue Chain = Op->getOperand(0);
5736
5737 // Get current value of FPCR.
5738 SDValue Ops[] = {
5739 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5740 SDValue FPCR =
5741 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5742 Chain = FPCR.getValue(1);
5743 FPCR = FPCR.getValue(0);
5744
5745 // Truncate FPCR to 32 bits.
5746 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5747
5748 return DAG.getMergeValues({Result, Chain}, DL);
5749}
5750
5751SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5752 SelectionDAG &DAG) const {
5753 SDLoc DL(Op);
5754 SDValue Chain = Op->getOperand(0);
5755 SDValue Mode = Op->getOperand(1);
5756
5757 // Extend the specified value to 64 bits.
5758 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5759
5760 // Set new value of FPCR.
5761 SDValue Ops2[] = {
5762 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5763 FPCR};
5764 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5765}
5766
5767SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5768 SelectionDAG &DAG) const {
5769 SDLoc DL(Op);
5770 SDValue Chain = Op->getOperand(0);
5771
5772 // Get current value of FPCR.
5773 SDValue Ops[] = {
5774 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5775 SDValue FPCR =
5776 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5777 Chain = FPCR.getValue(1);
5778 FPCR = FPCR.getValue(0);
5779
5780 // Clear bits that are not reserved.
5781 SDValue FPSCRMasked = DAG.getNode(
5782 ISD::AND, DL, MVT::i64, FPCR,
5783 DAG.getConstant(AArch64::ReservedFPControlBits, DL, MVT::i64));
5784 
5785 // Set new value of FPCR.
5786 SDValue Ops2[] = {
5787 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5788 FPSCRMasked};
5789 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5790}
5791
5792static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5793 SDLoc DL, bool &IsMLA) {
5794 bool IsN0SExt = isSignExtended(N0, DAG);
5795 bool IsN1SExt = isSignExtended(N1, DAG);
5796 if (IsN0SExt && IsN1SExt)
5797 return AArch64ISD::SMULL;
5798
5799 bool IsN0ZExt = isZeroExtended(N0, DAG);
5800 bool IsN1ZExt = isZeroExtended(N1, DAG);
5801
5802 if (IsN0ZExt && IsN1ZExt)
5803 return AArch64ISD::UMULL;
5804
5805 // Select UMULL if we can replace the other operand with an extend.
5806 EVT VT = N0.getValueType();
5807 unsigned EltSize = VT.getScalarSizeInBits();
5808 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5809 if (IsN0ZExt || IsN1ZExt) {
5810 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5811 return AArch64ISD::UMULL;
5812 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5813 DAG.MaskedValueIsZero(N1, Mask)) {
5814 // For v2i64 we look more aggressively for both operands having their top
5815 // halves zero, to avoid scalarization.
5816 return AArch64ISD::UMULL;
5817 }
5818
5819 if (IsN0SExt || IsN1SExt) {
5820 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5821 return AArch64ISD::SMULL;
5822 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5823 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5824 return AArch64ISD::SMULL;
5825 }
5826
5827 if (!IsN1SExt && !IsN1ZExt)
5828 return 0;
5829
5830 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5831 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5832 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5833 IsMLA = true;
5834 return AArch64ISD::SMULL;
5835 }
5836 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5837 IsMLA = true;
5838 return AArch64ISD::UMULL;
5839 }
5840 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5841 std::swap(N0, N1);
5842 IsMLA = true;
5843 return AArch64ISD::UMULL;
5844 }
5845 return 0;
5846}
5847
5848// Transform mul<v2i64, splat(const)> into a SHL and ADD/SUB
5849// (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
5850// mul x, (2^N + 1) --> add (shl x, N), x
5851// mul x, (2^N - 1) --> sub (shl x, N), x
5852// Examples: x * 33 --> (x << 5) + x
5853// x * 15 --> (x << 4) - x
5854// x * -33 --> -((x << 5) + x)
5855// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
5856// (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
5857// mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
5858// mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
5859// Examples: x * 0x8800 --> (x << 15) + (x << 11)
5860// x * 0xf800 --> (x << 16) - (x << 11)
5861// x * -0x8800 --> -((x << 15) + (x << 11))
5862// x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
5863static SDValue convertMulToShlAdd(SDNode *N, SelectionDAG &DAG) {
5864  const SDNode *Operand = N->getOperand(1).getNode();
5865 APInt SplatValue;
5866
5867  // Not a non-zero constant splat, so leave it as a plain multiplication.
5868 if (!ISD::isConstantSplatVector(Operand, SplatValue) ||
5869 !SplatValue.getBoolValue())
5870 return SDValue();
5871
5872 bool IsNegative = SplatValue.isNegative();
5873 SplatValue = SplatValue.abs();
5874 // Placeholder for MathOp
5875 unsigned MathOp = ISD::DELETED_NODE;
5876 unsigned TZeros = SplatValue.countr_zero();
5877
5878  // Shift out the trailing zeros so the remaining value is odd; this keeps the
5879  // (2^N +/- 1) check valid, and the zeros are folded back into the shift amounts.
5880 SplatValue.lshrInPlace(TZeros);
5881
5882 if ((SplatValue - 1).isPowerOf2())
5883 MathOp = ISD::ADD;
5884 else if ((SplatValue + 1).isPowerOf2())
5885 MathOp = ISD::SUB;
5886
5887 // If the constant is not (2^n + 1) or (2^n - 1), it would require
5888 // more than one addition/subtraction. For v2i64, the cost of
5889 // multiple vector adds/shifts often exceeds the cost of
5890 // scalarization (moving to GPRs to use a single MUL).
5891 if (MathOp != ISD::DELETED_NODE) {
5892 SDLoc DL(N);
5893 EVT VT = N->getValueType(0);
5894 SDValue LHS = N->getOperand(0);
5895
5896 unsigned ShiftAmt = MathOp == ISD::ADD ? (SplatValue - 1).logBase2()
5897 : (SplatValue + 1).logBase2();
5898 ShiftAmt += TZeros;
5899
5900 SDValue Shl =
5901 DAG.getNode(ISD::SHL, DL, VT, LHS, DAG.getConstant(ShiftAmt, DL, VT));
5902
5903 SDValue NewLHS = TZeros ? DAG.getNode(ISD::SHL, DL, VT, LHS,
5904 DAG.getConstant(TZeros, DL, VT))
5905 : LHS;
5906 SDValue Combined = DAG.getNode(MathOp, DL, VT, Shl, NewLHS);
5907 if (IsNegative)
5908 Combined = DAG.getNegative(Combined, DL, VT);
5909 return Combined;
5910 }
5911 return SDValue();
5912}
5913
5914SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5915 EVT VT = Op.getValueType();
5916
5917 bool OverrideNEON = !Subtarget->isNeonAvailable();
5918 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5919 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5920
5921 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5922 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5923 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5924 "unexpected type for custom-lowering ISD::MUL");
5925 SDValue N0 = Op.getOperand(0);
5926 SDValue N1 = Op.getOperand(1);
5927 bool isMLA = false;
5928 EVT OVT = VT;
5929 if (VT.is64BitVector()) {
5930 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5931        isNullConstant(N0.getOperand(1)) &&
5932        N0.getOperand(0).getValueType().is128BitVector() &&
5933        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5934        isNullConstant(N1.getOperand(1)) &&
5935        N1.getOperand(0).getValueType().is128BitVector()) {
5936      N0 = N0.getOperand(0);
5937 N1 = N1.getOperand(0);
5938 VT = N0.getValueType();
5939 } else {
5940 if (VT == MVT::v1i64) {
5941 if (Subtarget->hasSVE())
5942 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5943 // Fall through to expand this. It is not legal.
5944 return SDValue();
5945 } else
5946 // Other vector multiplications are legal.
5947 return Op;
5948 }
5949 }
5950
5951 SDLoc DL(Op);
5952 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5953
5954 if (!NewOpc) {
5955 if (VT.getVectorElementType() == MVT::i64) {
5956 // If SVE is available then i64 vector multiplications can also be made
5957 // legal.
5958 if (Subtarget->hasSVE())
5959 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5960 // Try to optimize the mul to a shift left and add instead of scalarizing.
5961 if (SDValue ShlAdd = convertMulToShlAdd(Op.getNode(), DAG))
5962 return ShlAdd;
5963 // Fall through to expanding as the mul is not legal.
5964 return SDValue();
5965 } else
5966 // Other vector multiplications are legal.
5967 return Op;
5968 }
5969
5970 // Legalize to a S/UMULL instruction
5971 SDValue Op0;
5972 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5973 if (!isMLA) {
5974 Op0 = skipExtensionForVectorMULL(N0, DAG);
5975    assert(Op0.getValueType().is64BitVector() &&
5976           Op1.getValueType().is64BitVector() &&
5977 "unexpected types for extended operands to VMULL");
5978 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5979 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5980 DAG.getConstant(0, DL, MVT::i64));
5981 }
5982 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5983 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5984 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5985  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5986  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5987  EVT Op1VT = Op1.getValueType();
5988 return DAG.getNode(
5989      ISD::EXTRACT_SUBVECTOR, DL, OVT,
5990      DAG.getNode(N0.getOpcode(), DL, VT,
5991 DAG.getNode(NewOpc, DL, VT,
5992 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5993 DAG.getNode(NewOpc, DL, VT,
5994 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5995 DAG.getConstant(0, DL, MVT::i64));
5996}
5997
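// Materialize an SVE predicate for the given ptrue pattern. Patterns known to
// cover every element of VT (including the explicit ALL pattern) are emitted
// as a constant all-active splat; other patterns become a PTRUE node.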
5998static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5999 int Pattern) {
6000 if (Pattern == AArch64SVEPredPattern::all)
6001 return DAG.getConstant(1, DL, VT);
6002
6003 // When the number of active elements of a pattern matches the scalable vector
6004 // length, we can upgrade the pattern to ALL and emit a splat instead.
6005 if (unsigned PatNumElts = getNumElementsFromSVEPredPattern(Pattern)) {
6006 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
6007 unsigned NumElts = VT.getVectorMinNumElements();
6008 unsigned VScale = Subtarget.getSVEVectorSizeInBits() / 128;
6009 if (PatNumElts == (NumElts * VScale))
6010 return DAG.getConstant(1, DL, VT);
6011 }
6012
6013 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
6014 DAG.getTargetConstant(Pattern, DL, MVT::i32));
6015}
6016
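// Try to fold an incrementing WHILE comparison with constant operands into a
// PTRUE of a fixed pattern, or into an all-active predicate when the
// comparison can never fail.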
6017static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
6018                                         bool IsSigned, bool IsEqual) {
6019 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
6020 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
6021
6022 if (!N->getValueType(0).isScalableVector() ||
6023 !isa<ConstantSDNode>(N->getOperand(Op1)))
6024 return SDValue();
6025
6026 SDLoc DL(N);
6027 APInt Y = N->getConstantOperandAPInt(Op1);
6028
6029 // When the second operand is the maximum value, comparisons that include
6030 // equality can never fail and thus we can return an all active predicate.
6031 if (IsEqual)
6032 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
6033 return DAG.getConstant(1, DL, N->getValueType(0));
6034
6035 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
6036 return SDValue();
6037
6038 APInt X = N->getConstantOperandAPInt(Op0);
6039
6040 bool Overflow;
6041 APInt NumActiveElems =
6042 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
6043
6044 if (Overflow)
6045 return SDValue();
6046
6047 if (IsEqual) {
6048 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
6049 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
6050 : NumActiveElems.uadd_ov(One, Overflow);
6051 if (Overflow)
6052 return SDValue();
6053 }
6054
6055 std::optional<unsigned> PredPattern =
6056      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
6057  unsigned MinSVEVectorSize = std::max(
6058      DAG.getSubtarget<AArch64Subtarget>().getMinSVEVectorSizeInBits(), 128u);
6059 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
6060 if (PredPattern != std::nullopt &&
6061 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
6062 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
6063
6064 return SDValue();
6065}
6066
6067// Match get.active.lane.mask(0, cttz.elts(x)) -> brkb(x)
6068// Match get.active.lane.mask(0, add(cttz.elts(x), 1)) -> brka(x)
6070 SDLoc DL(N);
6071 EVT VT = N->getValueType(0);
6072 // Lower bound must be 0.
6073 if (!isZeroOrZeroSplat(N->getOperand(0)))
6074 return SDValue();
6075
6076 SDValue Upper = N->getOperand(1);
6077
6078 // Default to brkb, switch to brka if we find a +1.
6079 unsigned BrkID = Intrinsic::aarch64_sve_brkb_z;
6080 if (Upper->getOpcode() == ISD::ADD && isOneOrOneSplat(Upper.getOperand(1))) {
6081 Upper = Upper.getOperand(0);
6082 BrkID = Intrinsic::aarch64_sve_brka_z;
6083 }
6084
6085 // We're looking for an upper bound based on CTTZ_ELTS; this would be selected
6086 // as a cntp(brk(Pg, Mask)), but if we're just going to make a whilelo based
6087 // on that then we just need the brk.
6088 if (Upper.getOpcode() != AArch64ISD::CTTZ_ELTS || !VT.isScalableVector() ||
6089 Upper.getOperand(0).getValueType() != VT)
6090 return SDValue();
6091
6092 SDValue Pg = Upper->getOperand(0);
6093 SDValue Mask = Upper->getOperand(1);
6094
6095 // brk{a,b} only support .b forms, so cast to make sure all our p regs match.
6096 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
6097 SDValue MaskR =
6098 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Mask);
6099 SDValue ID = DAG.getTargetConstant(BrkID, DL, MVT::i64);
6100 SDValue Brk =
6101 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv16i1, ID, Pg, MaskR);
6102 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Brk);
6103}
6104
6105// Returns a safe bitcast between two scalable vector predicates, where
6106// any newly created lanes from a widening bitcast are defined as zero.
6107static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
6108  SDLoc DL(Op);
6109 EVT InVT = Op.getValueType();
6110
6111 assert(InVT.getVectorElementType() == MVT::i1 &&
6112 VT.getVectorElementType() == MVT::i1 &&
6113 "Expected a predicate-to-predicate bitcast");
6114  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
6115         InVT.isScalableVector() &&
6116 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
6117 "Only expect to cast between legal scalable predicate types!");
6118
6119  // Return the operand if the cast isn't changing type.
6120 if (InVT == VT)
6121 return Op;
6122
6123 // Look through casts to <vscale x 16 x i1> when their input has more lanes
6124 // than VT. This will increase the chances of removing casts that introduce
6125 // new lanes, which have to be explicitly zero'd.
6126 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6127 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
6128 Op.getOperand(1).getValueType().bitsGT(VT))
6129 Op = Op.getOperand(1);
6130
6131 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
6132
6133 // We only have to zero the lanes if new lanes are being defined, e.g. when
6134 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
6135 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
6136 // we can return here.
6137 if (InVT.bitsGT(VT))
6138 return Reinterpret;
6139
6140 // Check if the other lanes are already known to be zeroed by
6141 // construction.
6142  if (isZeroingInactiveLanes(Op))
6143    return Reinterpret;
6144
6145 // Zero the newly introduced lanes.
6146 SDValue Mask = DAG.getConstant(1, DL, InVT);
6147 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
6148 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
6149}
6150
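// Determine the runtime streaming mode by calling the SME state routine and
// masking off bit 0 (PSTATE.SM) of the first returned value.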
6151SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
6152 SDValue Chain, SDLoc DL,
6153 EVT VT) const {
6154 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
6155 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
6156 SDValue Callee =
6157 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
6158 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
6159 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
6160 TargetLowering::CallLoweringInfo CLI(DAG);
6161  ArgListTy Args;
6162  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
6163 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
6164 std::move(Args));
6165 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
6166 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
6167 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
6168 Mask);
6169}
6170
6171// Lower an SME LDR/STR ZA intrinsic
6172// Case 1: If the vector number (vecnum) is an immediate in range, it gets
6173// folded into the instruction
6174// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
6175// Case 2: If the vecnum is not an immediate, then it is used to modify the base
6176// and tile slice registers
6177// ldr(%tileslice, %ptr, %vecnum)
6178// ->
6179// %svl = rdsvl
6180// %ptr2 = %ptr + %svl * %vecnum
6181// %tileslice2 = %tileslice + %vecnum
6182// ldr [%tileslice2, 0], [%ptr2, 0]
6183// Case 3: If the vecnum is an immediate out of range, then the same is done as
6184// case 2, but the base and slice registers are modified by the greatest
6185// multiple of 15 lower than the vecnum and the remainder is folded into the
6186// instruction. This means that successive loads and stores that are offset from
6187// each other can share the same base and slice register updates.
6188// ldr(%tileslice, %ptr, 22)
6189// ldr(%tileslice, %ptr, 23)
6190// ->
6191// %svl = rdsvl
6192// %ptr2 = %ptr + %svl * 15
6193// %tileslice2 = %tileslice + 15
6194// ldr [%tileslice2, 7], [%ptr2, 7]
6195// ldr [%tileslice2, 8], [%ptr2, 8]
6196// Case 4: If the vecnum is an add of an immediate, then the non-immediate
6197// operand and the immediate can be folded into the instruction, like case 2.
6198// ldr(%tileslice, %ptr, %vecnum + 7)
6199// ldr(%tileslice, %ptr, %vecnum + 8)
6200// ->
6201// %svl = rdsvl
6202// %ptr2 = %ptr + %svl * %vecnum
6203// %tileslice2 = %tileslice + %vecnum
6204// ldr [%tileslice2, 7], [%ptr2, 7]
6205// ldr [%tileslice2, 8], [%ptr2, 8]
6206// Case 5: The vecnum being an add of an immediate out of range is also handled,
6207// in which case the same remainder logic as case 3 is used.
6208static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
6209  SDLoc DL(N);
6210
6211 SDValue TileSlice = N->getOperand(2);
6212 SDValue Base = N->getOperand(3);
6213 SDValue VecNum = N->getOperand(4);
6214 int32_t ConstAddend = 0;
6215 SDValue VarAddend = VecNum;
6216
6217 // If the vnum is an add of an immediate, we can fold it into the instruction
6218 if (VecNum.getOpcode() == ISD::ADD &&
6219 isa<ConstantSDNode>(VecNum.getOperand(1))) {
6220 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
6221 VarAddend = VecNum.getOperand(0);
6222 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
6223 ConstAddend = ImmNode->getSExtValue();
6224 VarAddend = SDValue();
6225 }
6226
6227 int32_t ImmAddend = ConstAddend % 16;
6228 if (int32_t C = (ConstAddend - ImmAddend)) {
6229 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
6230 VarAddend = VarAddend
6231 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6232 : CVal;
6233 }
6234
6235 if (VarAddend) {
6236 // Get the vector length that will be multiplied by vnum
6237 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6238 DAG.getConstant(1, DL, MVT::i32));
6239
6240 // Multiply SVL and vnum then add it to the base
6241 SDValue Mul = DAG.getNode(
6242 ISD::MUL, DL, MVT::i64,
6243 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6244 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6245 // Just add vnum to the tileslice
6246 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6247 }
6248
6249 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6250 DL, MVT::Other,
6251 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6252 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6253}
6254
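// Lower @llvm.experimental.vector.match to the SVE MATCH instruction.
// Fixed-length operands and masks are wrapped in scalable containers first,
// and a fixed-length result is extracted back out afterwards.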
6255static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6256  SDLoc DL(Op);
6257 SDValue ID =
6258 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6259
6260 auto Op1 = Op.getOperand(1);
6261 auto Op2 = Op.getOperand(2);
6262 auto Mask = Op.getOperand(3);
6263
6264 EVT Op1VT = Op1.getValueType();
6265 EVT Op2VT = Op2.getValueType();
6266 EVT ResVT = Op.getValueType();
6267
6268 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6269 Op1VT.getVectorElementType() == MVT::i16) &&
6270 "Expected 8-bit or 16-bit characters.");
6271
6272 // Scalable vector type used to wrap operands.
6273 // A single container is enough for both operands because ultimately the
6274 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6275 EVT OpContainerVT = Op1VT.isScalableVector()
6276                              ? Op1VT
6277                              : getContainerForFixedLengthVector(DAG, Op1VT);
6278
6279 if (Op2VT.is128BitVector()) {
6280 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6281 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6282 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6283 if (ResVT.isScalableVector())
6284 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6285 DAG.getTargetConstant(0, DL, MVT::i64));
6286 } else {
6287 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
6288 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6289 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6290 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6291 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6292 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6293 DAG.getConstant(0, DL, MVT::i64));
6294 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6295 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6296 }
6297
6298 // If the result is scalable, we just need to carry out the MATCH.
6299 if (ResVT.isScalableVector())
6300 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6301
6302 // If the result is fixed, we can still use MATCH but we need to wrap the
6303 // first operand and the mask in scalable vectors before doing so.
6304
6305 // Wrap the operands.
6306 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6307 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6308 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6309
6310 // Carry out the match.
6311 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6312 ID, Mask, Op1, Op2);
6313
6314 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6315 // (v16i8/v8i8).
6316 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6317 Match = convertFromScalableVector(DAG, Op1VT, Match);
6318 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6319}
6320
6321SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6322 SelectionDAG &DAG) const {
6323 unsigned IntNo = Op.getConstantOperandVal(1);
6324 SDLoc DL(Op);
6325 switch (IntNo) {
6326 default:
6327 return SDValue(); // Don't custom lower most intrinsics.
6328 case Intrinsic::aarch64_prefetch: {
6329 SDValue Chain = Op.getOperand(0);
6330 SDValue Addr = Op.getOperand(2);
6331
6332 unsigned IsWrite = Op.getConstantOperandVal(3);
6333 unsigned Locality = Op.getConstantOperandVal(4);
6334 unsigned IsStream = Op.getConstantOperandVal(5);
6335 unsigned IsData = Op.getConstantOperandVal(6);
6336 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6337 (!IsData << 3) | // IsDataCache bit
6338 (Locality << 1) | // Cache level bits
6339 (unsigned)IsStream; // Stream bit
6340
6341 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6342 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6343 }
6344 case Intrinsic::aarch64_range_prefetch: {
6345 SDValue Chain = Op.getOperand(0);
6346 SDValue Addr = Op.getOperand(2);
6347
6348 unsigned IsWrite = Op.getConstantOperandVal(3);
6349 unsigned IsStream = Op.getConstantOperandVal(4);
6350 unsigned PrfOp = (IsStream << 2) | IsWrite;
6351
6352 SDValue Metadata = Op.getOperand(5);
6353 return DAG.getNode(AArch64ISD::RANGE_PREFETCH, DL, MVT::Other, Chain,
6354 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr,
6355 Metadata);
6356 }
6357 case Intrinsic::aarch64_prefetch_ir:
6358 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other,
6359 Op.getOperand(0), // Chain
6360 DAG.getTargetConstant(24, DL, MVT::i32), // Rt
6361 Op.getOperand(2)); // Addr
6362 case Intrinsic::aarch64_sme_str:
6363 case Intrinsic::aarch64_sme_ldr: {
6364 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6365 }
6366 case Intrinsic::aarch64_sme_za_enable:
6367 return DAG.getNode(
6368 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6369 Op->getOperand(0), // Chain
6370 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6371 case Intrinsic::aarch64_sme_za_disable:
6372 return DAG.getNode(
6373 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6374 Op->getOperand(0), // Chain
6375 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6376 }
6377}
6378
6379SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6380 SelectionDAG &DAG) const {
6381 unsigned IntNo = Op.getConstantOperandVal(1);
6382 SDLoc DL(Op);
6383 switch (IntNo) {
6384 default:
6385 return SDValue(); // Don't custom lower most intrinsics.
6386 case Intrinsic::aarch64_mops_memset_tag: {
6387 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6388 SDValue Chain = Node->getChain();
6389 SDValue Dst = Op.getOperand(2);
6390 SDValue Val = Op.getOperand(3);
6391 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6392 SDValue Size = Op.getOperand(4);
6393 auto Alignment = Node->getMemOperand()->getAlign();
6394 bool IsVol = Node->isVolatile();
6395 auto DstPtrInfo = Node->getPointerInfo();
6396
6397 const auto &SDI =
6398 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6399 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6400 Chain, Dst, Val, Size, Alignment, IsVol,
6401 DstPtrInfo, MachinePointerInfo{});
6402
6403 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6404 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6405 // LowerOperationWrapper will complain that the number of results has
6406 // changed.
6407 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6408 }
6409 }
6410}
6411
6412SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6413 SelectionDAG &DAG) const {
6414 unsigned IntNo = Op.getConstantOperandVal(0);
6415 SDLoc DL(Op);
6416 switch (IntNo) {
6417 default: return SDValue(); // Don't custom lower most intrinsics.
6418 case Intrinsic::thread_pointer: {
6419 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6420 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6421 }
6422 case Intrinsic::aarch64_sve_whilewr_b:
6423 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6424 Op.getOperand(1), Op.getOperand(2),
6425 DAG.getConstant(1, DL, MVT::i64),
6426 DAG.getConstant(0, DL, MVT::i64));
6427 case Intrinsic::aarch64_sve_whilewr_h:
6428 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6429 Op.getOperand(1), Op.getOperand(2),
6430 DAG.getConstant(2, DL, MVT::i64),
6431 DAG.getConstant(0, DL, MVT::i64));
6432 case Intrinsic::aarch64_sve_whilewr_s:
6433 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6434 Op.getOperand(1), Op.getOperand(2),
6435 DAG.getConstant(4, DL, MVT::i64),
6436 DAG.getConstant(0, DL, MVT::i64));
6437 case Intrinsic::aarch64_sve_whilewr_d:
6438 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6439 Op.getOperand(1), Op.getOperand(2),
6440 DAG.getConstant(8, DL, MVT::i64),
6441 DAG.getConstant(0, DL, MVT::i64));
6442 case Intrinsic::aarch64_sve_whilerw_b:
6443 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6444 Op.getOperand(1), Op.getOperand(2),
6445 DAG.getConstant(1, DL, MVT::i64),
6446 DAG.getConstant(0, DL, MVT::i64));
6447 case Intrinsic::aarch64_sve_whilerw_h:
6448 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6449 Op.getOperand(1), Op.getOperand(2),
6450 DAG.getConstant(2, DL, MVT::i64),
6451 DAG.getConstant(0, DL, MVT::i64));
6452 case Intrinsic::aarch64_sve_whilerw_s:
6453 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6454 Op.getOperand(1), Op.getOperand(2),
6455 DAG.getConstant(4, DL, MVT::i64),
6456 DAG.getConstant(0, DL, MVT::i64));
6457 case Intrinsic::aarch64_sve_whilerw_d:
6458 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6459 Op.getOperand(1), Op.getOperand(2),
6460 DAG.getConstant(8, DL, MVT::i64),
6461 DAG.getConstant(0, DL, MVT::i64));
6462 case Intrinsic::aarch64_neon_abs: {
6463 EVT Ty = Op.getValueType();
6464 if (Ty == MVT::i64) {
6465 SDValue Result =
6466 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op.getOperand(1));
6467 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Result,
6469 DAG.getConstant(0, DL, MVT::i64));
6470 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6471 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6472 } else {
6473 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6474 }
6475 }
6476 case Intrinsic::aarch64_neon_pmull64: {
6477 SDValue LHS = Op.getOperand(1);
6478 SDValue RHS = Op.getOperand(2);
6479
6480 std::optional<uint64_t> LHSLane =
6481        getConstantLaneNumOfExtractHalfOperand(LHS);
6482    std::optional<uint64_t> RHSLane =
6483        getConstantLaneNumOfExtractHalfOperand(RHS);
6484
6485 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6486 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6487
6488 // 'aarch64_neon_pmull64' takes i64 parameters; while pmull/pmull2
6489 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6490 // which ISel recognizes better. For example, generate a ldr into d*
6491 // registers as opposed to a GPR load followed by a fmov.
6492 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6493 std::optional<uint64_t> OtherLane,
6494 const SDLoc &DL,
6495 SelectionDAG &DAG) -> SDValue {
6496      // If the operand is a higher half itself, rewrite it to
6497      // extract_high_v2i64; this way aarch64_neon_pmull64 can
6498      // reuse the dag-combiner function shared with aarch64_neon_{pmull,smull,umull}.
6499 if (NLane == 1)
6500 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6501 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6502
6503 // Operand N is not a higher half but the other operand is.
6504 if (OtherLane == 1) {
6505 // If this operand is a lower half, rewrite it to
6506 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6507 // align lanes of two operands. A roundtrip sequence (to move from lane
6508 // 1 to lane 0) is like this:
6509 // mov x8, v0.d[1]
6510 // fmov d0, x8
6511 if (NLane == 0)
6512 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6513 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6514 N.getOperand(0),
6515 DAG.getConstant(0, DL, MVT::i64)),
6516 DAG.getConstant(1, DL, MVT::i64));
6517
6518 // Otherwise just dup from main to all lanes.
6519 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6520 }
6521
6522      // Neither operand is an extract of the higher half, so codegen may just use
6523      // the non-high version of the PMULL instruction. Use v1i64 to represent i64.
6524 assert(N.getValueType() == MVT::i64 &&
6525 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6526 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6527 };
6528
6529 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6530 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6531
6532 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6533 }
6534 case Intrinsic::aarch64_neon_smax:
6535 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6536 Op.getOperand(2));
6537 case Intrinsic::aarch64_neon_umax:
6538 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6539 Op.getOperand(2));
6540 case Intrinsic::aarch64_neon_smin:
6541 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6542 Op.getOperand(2));
6543 case Intrinsic::aarch64_neon_umin:
6544 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6545 Op.getOperand(2));
6546 case Intrinsic::aarch64_neon_scalar_sqxtn:
6547 case Intrinsic::aarch64_neon_scalar_sqxtun:
6548 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6549 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6550 if (Op.getValueType() == MVT::i32)
6551 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6552 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6553 Op.getOperand(0),
6554 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6555 Op.getOperand(1))));
6556 return SDValue();
6557 }
6558 case Intrinsic::aarch64_neon_sqxtn:
6559 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6560 Op.getOperand(1));
6561 case Intrinsic::aarch64_neon_sqxtun:
6562 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6563 Op.getOperand(1));
6564 case Intrinsic::aarch64_neon_uqxtn:
6565 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6566 Op.getOperand(1));
6567 case Intrinsic::aarch64_neon_sqshrn:
6568 if (Op.getValueType().isVector())
6569 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6570 DAG.getNode(AArch64ISD::VASHR, DL,
6571 Op.getOperand(1).getValueType(),
6572 Op.getOperand(1), Op.getOperand(2)));
6573 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRN, DAG,
6574 /*LastOperandIsImm=*/true);
6575 case Intrinsic::aarch64_neon_sqshrun:
6576 if (Op.getValueType().isVector())
6577 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6578 DAG.getNode(AArch64ISD::VASHR, DL,
6579 Op.getOperand(1).getValueType(),
6580 Op.getOperand(1), Op.getOperand(2)));
6581 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHRUN, DAG,
6582 /*LastOperandIsImm=*/true);
6583 case Intrinsic::aarch64_neon_uqshrn:
6584 if (Op.getValueType().isVector())
6585 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6586 DAG.getNode(AArch64ISD::VLSHR, DL,
6587 Op.getOperand(1).getValueType(),
6588 Op.getOperand(1), Op.getOperand(2)));
6589 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHRN, DAG,
6590 /*LastOperandIsImm=*/true);
6591 case Intrinsic::aarch64_neon_sqrshrn:
6592 if (Op.getValueType().isVector())
6593 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6594 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6595 Op.getOperand(1).getValueType(),
6596 Op.getOperand(1), Op.getOperand(2)));
6597 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRN, DAG,
6598 /*LastOperandIsImm=*/true);
6599 case Intrinsic::aarch64_neon_sqrshrun:
6600 if (Op.getValueType().isVector())
6601 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6602 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6603 Op.getOperand(1).getValueType(),
6604 Op.getOperand(1), Op.getOperand(2)));
6605 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHRUN, DAG,
6606 /*LastOperandIsImm=*/true);
6607 case Intrinsic::aarch64_neon_uqrshrn:
6608 if (Op.getValueType().isVector())
6609 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6610 DAG.getNode(AArch64ISD::URSHR_I, DL,
6611 Op.getOperand(1).getValueType(),
6612 Op.getOperand(1), Op.getOperand(2)));
6613 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHRN, DAG,
6614 /*LastOperandIsImm=*/true);
6615 case Intrinsic::aarch64_neon_sqdmulh:
6616 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULH, DAG);
6617 case Intrinsic::aarch64_neon_sqrdmulh:
6618 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMULH, DAG);
6619 case Intrinsic::aarch64_neon_sqrdmlah:
6620 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLAH, DAG);
6621 case Intrinsic::aarch64_neon_sqrdmlsh:
6622 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRDMLSH, DAG);
6623 case Intrinsic::aarch64_neon_sqrshl:
6624 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
6625 case Intrinsic::aarch64_neon_sqshl:
6626 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
6627 case Intrinsic::aarch64_neon_uqrshl:
6628 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
6629 case Intrinsic::aarch64_neon_uqshl:
6630 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
6631 case Intrinsic::aarch64_neon_sqadd:
6632 if (Op.getValueType().isVector())
6633 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6634 Op.getOperand(2));
6635 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
6636
6637 case Intrinsic::aarch64_neon_sqsub:
6638 if (Op.getValueType().isVector())
6639 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6640 Op.getOperand(2));
6641 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
6642
6643 case Intrinsic::aarch64_neon_uqadd:
6644 if (Op.getValueType().isVector())
6645 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6646 Op.getOperand(2));
6647 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
6648 case Intrinsic::aarch64_neon_suqadd:
6649 return lowerIntNeonIntrinsic(Op, AArch64ISD::SUQADD, DAG);
6650 case Intrinsic::aarch64_neon_usqadd:
6651 return lowerIntNeonIntrinsic(Op, AArch64ISD::USQADD, DAG);
6652 case Intrinsic::aarch64_neon_uqsub:
6653 if (Op.getValueType().isVector())
6654 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6655 Op.getOperand(2));
6656 return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
6657 case Intrinsic::aarch64_neon_sqdmulls_scalar:
6658 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQDMULL, DAG);
6659 case Intrinsic::aarch64_neon_sqabs:
6660 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQABS, DAG);
6661 case Intrinsic::aarch64_neon_sqneg:
6662 return lowerIntNeonIntrinsic(Op, AArch64ISD::SQNEG, DAG);
6663 case Intrinsic::aarch64_sve_whilelt:
6664 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6665 /*IsEqual=*/false);
6666 case Intrinsic::aarch64_sve_whilels:
6667 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6668 /*IsEqual=*/true);
6669 case Intrinsic::aarch64_sve_whilele:
6670 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6671 /*IsEqual=*/true);
6672 case Intrinsic::aarch64_sve_sunpkhi:
6673 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6674 Op.getOperand(1));
6675 case Intrinsic::aarch64_sve_sunpklo:
6676 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6677 Op.getOperand(1));
6678 case Intrinsic::aarch64_sve_uunpkhi:
6679 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6680 Op.getOperand(1));
6681 case Intrinsic::aarch64_sve_uunpklo:
6682 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6683 Op.getOperand(1));
6684 case Intrinsic::aarch64_sve_clasta_n:
6685 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6686 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6687 case Intrinsic::aarch64_sve_clastb_n:
6688 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6689 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6690 case Intrinsic::aarch64_sve_lasta:
6691 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6692 Op.getOperand(1), Op.getOperand(2));
6693 case Intrinsic::aarch64_sve_lastb:
6694 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6695 Op.getOperand(1), Op.getOperand(2));
6696 case Intrinsic::aarch64_sve_tbl:
6697 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6698 Op.getOperand(2));
6699 case Intrinsic::aarch64_sve_trn1:
6700 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6701 Op.getOperand(1), Op.getOperand(2));
6702 case Intrinsic::aarch64_sve_trn2:
6703 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6704 Op.getOperand(1), Op.getOperand(2));
6705 case Intrinsic::aarch64_sve_uzp1:
6706 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6707 Op.getOperand(1), Op.getOperand(2));
6708 case Intrinsic::aarch64_sve_uzp2:
6709 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6710 Op.getOperand(1), Op.getOperand(2));
6711 case Intrinsic::aarch64_sve_zip1:
6712 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6713 Op.getOperand(1), Op.getOperand(2));
6714 case Intrinsic::aarch64_sve_zip2:
6715 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6716 Op.getOperand(1), Op.getOperand(2));
6717 case Intrinsic::aarch64_sve_splice:
6718 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6719 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6720 case Intrinsic::aarch64_sve_ptrue:
6721 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6722 case Intrinsic::aarch64_sve_clz:
6723 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6724 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6725 case Intrinsic::aarch64_sme_cntsd: {
6726 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6727 DAG.getConstant(1, DL, MVT::i32));
6728 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6729 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6730 }
6731 case Intrinsic::aarch64_sve_cnt: {
6732 SDValue Data = Op.getOperand(3);
6733 // CTPOP only supports integer operands.
6734 if (Data.getValueType().isFloatingPoint())
6735 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6736 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6737 Op.getOperand(2), Data, Op.getOperand(1));
6738 }
6739 case Intrinsic::aarch64_sve_dupq_lane:
6740 return LowerDUPQLane(Op, DAG);
6741 case Intrinsic::aarch64_sve_convert_from_svbool:
6742 if (Op.getValueType() == MVT::aarch64svcount)
6743 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6744 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6745 case Intrinsic::aarch64_sve_convert_to_svbool:
6746 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6747 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6748 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6749 case Intrinsic::aarch64_sve_fneg:
6750 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6751 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6752 case Intrinsic::aarch64_sve_frintp:
6753 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6754 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6755 case Intrinsic::aarch64_sve_frintm:
6756 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6757 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6758 case Intrinsic::aarch64_sve_frinti:
6759 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6760 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6761 Op.getOperand(1));
6762 case Intrinsic::aarch64_sve_frintx:
6763 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6764 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6765 case Intrinsic::aarch64_sve_frint32x:
6766 return DAG.getNode(AArch64ISD::FRINT32_MERGE_PASSTHRU, DL,
6767 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6768 Op.getOperand(1));
6769 case Intrinsic::aarch64_sve_frint64x:
6770 return DAG.getNode(AArch64ISD::FRINT64_MERGE_PASSTHRU, DL,
6771 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6772 Op.getOperand(1));
6773 case Intrinsic::aarch64_sve_frinta:
6774 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6775 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6776 case Intrinsic::aarch64_sve_frintn:
6777 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6778 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6779 Op.getOperand(1));
6780 case Intrinsic::aarch64_sve_frintz:
6781 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6782 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6783 case Intrinsic::aarch64_sve_frint32z:
6784 return DAG.getNode(AArch64ISD::FTRUNC32_MERGE_PASSTHRU, DL,
6785 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6786 Op.getOperand(1));
6787 case Intrinsic::aarch64_sve_frint64z:
6788 return DAG.getNode(AArch64ISD::FTRUNC64_MERGE_PASSTHRU, DL,
6789 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6790 Op.getOperand(1));
6791 case Intrinsic::aarch64_sve_ucvtf:
6792 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6793 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6794 Op.getOperand(1));
6795 case Intrinsic::aarch64_sve_scvtf:
6796 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6797 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6798 Op.getOperand(1));
6799 case Intrinsic::aarch64_sve_fcvtzu:
6800 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6801 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6802 case Intrinsic::aarch64_sve_fcvtzs:
6803 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6804 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6805 case Intrinsic::aarch64_sve_fsqrt:
6806 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6807 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6808 case Intrinsic::aarch64_sve_frecpx:
6809 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6810 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6811 case Intrinsic::aarch64_sve_frecpe_x:
6812 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6813 Op.getOperand(1));
6814 case Intrinsic::aarch64_sve_frecps_x:
6815 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6816 Op.getOperand(1), Op.getOperand(2));
6817 case Intrinsic::aarch64_sve_frsqrte_x:
6818 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6819 Op.getOperand(1));
6820 case Intrinsic::aarch64_sve_frsqrts_x:
6821 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6822 Op.getOperand(1), Op.getOperand(2));
6823 case Intrinsic::aarch64_sve_fabs:
6824 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6825 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6826 case Intrinsic::aarch64_sve_abs:
6827 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6828 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6829 case Intrinsic::aarch64_sve_neg:
6830 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6831 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6832 case Intrinsic::aarch64_sve_insr: {
6833 SDValue Scalar = Op.getOperand(2);
6834 EVT ScalarTy = Scalar.getValueType();
6835 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6836 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6837
6838 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6839 Op.getOperand(1), Scalar);
6840 }
6841 case Intrinsic::aarch64_sve_rbit:
6842 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6843 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6844 Op.getOperand(1));
6845 case Intrinsic::aarch64_sve_revb:
6846 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6847 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6848 case Intrinsic::aarch64_sve_revh:
6849 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6850 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6851 case Intrinsic::aarch64_sve_revw:
6852 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6853 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6854 case Intrinsic::aarch64_sve_revd:
6855 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6856 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6857 case Intrinsic::aarch64_sve_sxtb:
6858 return DAG.getNode(
6859 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6860 Op.getOperand(2), Op.getOperand(3),
6861 DAG.getValueType(Op.getValueType().changeVectorElementType(
6862 *DAG.getContext(), MVT::i8)),
6863 Op.getOperand(1));
6864 case Intrinsic::aarch64_sve_sxth:
6865 return DAG.getNode(
6866 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6867 Op.getOperand(2), Op.getOperand(3),
6868 DAG.getValueType(Op.getValueType().changeVectorElementType(
6869 *DAG.getContext(), MVT::i16)),
6870 Op.getOperand(1));
6871 case Intrinsic::aarch64_sve_sxtw:
6872 return DAG.getNode(
6873 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6874 Op.getOperand(2), Op.getOperand(3),
6875 DAG.getValueType(Op.getValueType().changeVectorElementType(
6876 *DAG.getContext(), MVT::i32)),
6877 Op.getOperand(1));
6878 case Intrinsic::aarch64_sve_uxtb:
6879 return DAG.getNode(
6880 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6881 Op.getOperand(2), Op.getOperand(3),
6882 DAG.getValueType(Op.getValueType().changeVectorElementType(
6883 *DAG.getContext(), MVT::i8)),
6884 Op.getOperand(1));
6885 case Intrinsic::aarch64_sve_uxth:
6886 return DAG.getNode(
6887 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6888 Op.getOperand(2), Op.getOperand(3),
6889 DAG.getValueType(Op.getValueType().changeVectorElementType(
6890 *DAG.getContext(), MVT::i16)),
6891 Op.getOperand(1));
6892 case Intrinsic::aarch64_sve_uxtw:
6893 return DAG.getNode(
6894 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6895 Op.getOperand(2), Op.getOperand(3),
6896 DAG.getValueType(Op.getValueType().changeVectorElementType(
6897 *DAG.getContext(), MVT::i32)),
6898 Op.getOperand(1));
6899 case Intrinsic::localaddress: {
6900 const auto &MF = DAG.getMachineFunction();
6901 const auto *RegInfo = Subtarget->getRegisterInfo();
6902 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6903 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6904 Op.getSimpleValueType());
6905 }
6906
6907 case Intrinsic::eh_recoverfp: {
6908 // FIXME: This needs to be implemented to correctly handle highly aligned
6909 // stack objects. For now we simply return the incoming FP. Refer D53541
6910 // for more details.
6911 SDValue FnOp = Op.getOperand(1);
6912 SDValue IncomingFPOp = Op.getOperand(2);
6913 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6914 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6915 if (!Fn)
6916      report_fatal_error(
6917          "llvm.eh.recoverfp must take a function as the first argument");
6918 return IncomingFPOp;
6919 }
6920 case Intrinsic::aarch64_neon_vsri:
6921 case Intrinsic::aarch64_neon_vsli:
6922 case Intrinsic::aarch64_sve_sri:
6923 case Intrinsic::aarch64_sve_sli: {
6924 EVT Ty = Op.getValueType();
6925
6926 if (!Ty.isVector())
6927 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6928
6929 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6930
6931 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6932 IntNo == Intrinsic::aarch64_sve_sri;
6933 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6934 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6935 Op.getOperand(3));
6936 }
6937
6938 case Intrinsic::aarch64_neon_srhadd:
6939 case Intrinsic::aarch64_neon_urhadd:
6940 case Intrinsic::aarch64_neon_shadd:
6941 case Intrinsic::aarch64_neon_uhadd: {
6942 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6943 IntNo == Intrinsic::aarch64_neon_shadd);
6944 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6945 IntNo == Intrinsic::aarch64_neon_urhadd);
6946 unsigned Opcode = IsSignedAdd
6947 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6948 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6949 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6950 Op.getOperand(2));
6951 }
6952 case Intrinsic::aarch64_neon_saddlp:
6953 case Intrinsic::aarch64_neon_uaddlp: {
6954 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6955 ? AArch64ISD::UADDLP
6956 : AArch64ISD::SADDLP;
6957 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6958 }
6959 case Intrinsic::aarch64_neon_sdot:
6960 case Intrinsic::aarch64_neon_udot:
6961 case Intrinsic::aarch64_sve_sdot:
6962 case Intrinsic::aarch64_sve_udot: {
6963 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6964 IntNo == Intrinsic::aarch64_sve_udot)
6965 ? AArch64ISD::UDOT
6966 : AArch64ISD::SDOT;
6967 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6968 Op.getOperand(2), Op.getOperand(3));
6969 }
6970 case Intrinsic::aarch64_neon_usdot:
6971 case Intrinsic::aarch64_sve_usdot: {
6972 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6973 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6974 }
6975 case Intrinsic::aarch64_neon_saddlv:
6976 case Intrinsic::aarch64_neon_uaddlv: {
6977 EVT OpVT = Op.getOperand(1).getValueType();
6978 EVT ResVT = Op.getValueType();
6979 assert(
6980 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6981 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6982 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6983 "Unexpected aarch64_neon_u/saddlv type");
6984 (void)OpVT;
6985 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6986 SDValue ADDLV = DAG.getNode(
6987 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6988 : AArch64ISD::SADDLV,
6989 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6990 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6991 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6992 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6993 return EXTRACT_VEC_ELT;
6994 }
6995 case Intrinsic::experimental_vector_match: {
6996 return LowerVectorMatch(Op, DAG);
6997 }
6998 case Intrinsic::aarch64_cls:
6999 case Intrinsic::aarch64_cls64: {
7000 SDValue Res = DAG.getNode(ISD::CTLS, DL, Op.getOperand(1).getValueType(),
7001 Op.getOperand(1));
7002 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
7003 }
7004 case Intrinsic::aarch64_neon_cls: {
7005 // Lower NEON CLS intrinsic to ISD::CTLS
7006 return DAG.getNode(ISD::CTLS, DL, Op.getValueType(), Op.getOperand(1));
7007 }
7008 case Intrinsic::aarch64_sve_pmul:
7009 case Intrinsic::aarch64_neon_pmul:
7010 return DAG.getNode(ISD::CLMUL, DL, Op.getValueType(), Op.getOperand(1),
7011 Op.getOperand(2));
7012 }
7013}
7014
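// Gather/scatter indices with i8 or i16 elements are not supported directly,
// so request that they be extended to i32.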
7015bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
7016 if (VT.getVectorElementType() == MVT::i8 ||
7017 VT.getVectorElementType() == MVT::i16) {
7018 EltTy = MVT::i32;
7019 return true;
7020 }
7021 return false;
7022}
7023
7024bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
7025 EVT DataVT) const {
7026 const EVT IndexVT = Extend.getOperand(0).getValueType();
7027 // SVE only supports implicit extension of 32-bit indices.
7028 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
7029 return false;
7030
7031 // Indices cannot be smaller than the main data type.
7032 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
7033 return false;
7034
7035 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
7036 // element container type, which would violate the previous clause.
7037 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
7038}
7039
7040/// Check whether a small (v2i8/v4i8/v2i16) vector load can be extended in-register.
7041static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD,
7042                                            const AArch64Subtarget &Subtarget) {
7043 if (!Subtarget.isNeonAvailable())
7044 return false;
7045 if (LD->isVolatile())
7046 return false;
7047
7048 EVT MemVT = LD->getMemoryVT();
7049 if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16)
7050 return false;
7051
7052 Align Alignment = LD->getAlign();
7053 Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue());
7054 if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment)
7055 return false;
7056
7057 return true;
7058}
7059
7060bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
7061 EVT ExtVT = ExtVal.getValueType();
7062 // Small, illegal vectors can be extended inreg.
7063 if (auto *Load = dyn_cast<LoadSDNode>(ExtVal.getOperand(0))) {
7064 if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 &&
7065 isEligibleForSmallVectorLoadOpt(Load, *Subtarget))
7066 return true;
7067 }
7068 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
7069 return false;
7070
7071 // It may be worth creating extending masked loads if there are multiple
7072 // masked loads using the same predicate. That way we'll end up creating
7073 // extending masked loads that may then get split by the legaliser. This
7074 // results in just one set of predicate unpacks at the start, instead of
7075 // multiple sets of vector unpacks after each load.
7076 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
7077 if (!isLoadLegalOrCustom(ExtVT, Ld->getValueType(0), Ld->getAlign(),
7078 Ld->getAddressSpace(), ISD::ZEXTLOAD, false)) {
7079 // Disable extending masked loads for fixed-width for now, since the code
7080 // quality doesn't look great.
7081 if (!ExtVT.isScalableVector())
7082 return false;
7083
7084 unsigned NumExtMaskedLoads = 0;
7085 for (auto *U : Ld->getMask()->users())
7086 if (isa<MaskedLoadSDNode>(U))
7087 NumExtMaskedLoads++;
7088
7089 if (NumExtMaskedLoads <= 1)
7090 return false;
7091 }
7092 }
7093
7094 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
7095 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
7096 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
7097}
7098
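// Map the (scaled, signed, extend) addressing mode of a masked gather onto
// the corresponding GLD1 node. Note that unextended 64-bit indices use the
// same opcode whether they are signed or unsigned; only the 32-bit forms
// need the SXTW/UXTW variants.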
7099unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
7100 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
7101 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
7102 AArch64ISD::GLD1_MERGE_ZERO},
7103 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
7104 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
7105 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
7106 AArch64ISD::GLD1_MERGE_ZERO},
7107 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
7108 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
7109 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
7110 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7111 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
7112 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
7113 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
7114 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
7115 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
7116 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
7117 };
7118 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
7119 return AddrModes.find(Key)->second;
7120}
7121
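// Return the sign-extending (GLD1S*) counterpart of the given gather opcode.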
7122unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
7123 switch (Opcode) {
7124 default:
7125 llvm_unreachable("unimplemented opcode");
7126 return Opcode;
7127 case AArch64ISD::GLD1_MERGE_ZERO:
7128 return AArch64ISD::GLD1S_MERGE_ZERO;
7129 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
7130 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
7131 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
7132 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
7133 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
7134 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
7135 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
7136 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
7137 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
7138 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
7139 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
7140 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
7141 }
7142}
7143
7144SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
7145 SelectionDAG &DAG) const {
7146 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
7147
7148 SDLoc DL(Op);
7149 SDValue Chain = MGT->getChain();
7150 SDValue PassThru = MGT->getPassThru();
7151 SDValue Mask = MGT->getMask();
7152 SDValue BasePtr = MGT->getBasePtr();
7153 SDValue Index = MGT->getIndex();
7154 SDValue Scale = MGT->getScale();
7155 EVT VT = Op.getValueType();
7156 EVT MemVT = MGT->getMemoryVT();
7157 ISD::LoadExtType ExtType = MGT->getExtensionType();
7158 ISD::MemIndexType IndexType = MGT->getIndexType();
7159
7160 // SVE supports zero (and so undef) passthrough values only, everything else
7161 // must be handled manually by an explicit select on the load's output.
7162 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
7163 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
7164 SDValue Load =
7165 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7166 MGT->getMemOperand(), IndexType, ExtType);
7167 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7168 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
7169 }
7170
7171 bool IsScaled = MGT->isIndexScaled();
7172 bool IsSigned = MGT->isIndexSigned();
7173
7174 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
7175 // must be calculated beforehand.
7176 uint64_t ScaleVal = Scale->getAsZExtVal();
7177 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7178 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7179 EVT IndexVT = Index.getValueType();
7180 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7181 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7182 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7183
7184 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7185 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
7186 MGT->getMemOperand(), IndexType, ExtType);
7187 }
7188
7189 // Lower fixed length gather to a scalable equivalent.
7190 if (VT.isFixedLengthVector()) {
7191 assert(Subtarget->useSVEForFixedLengthVectors() &&
7192 "Cannot lower when not using SVE for fixed vectors!");
7193
7194 // NOTE: Handle floating-point as if integer then bitcast the result.
7195 EVT DataVT = VT.changeVectorElementTypeToInteger();
7196 MemVT = MemVT.changeVectorElementTypeToInteger();
7197
7198 // Find the smallest integer fixed length vector we can use for the gather.
7199 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7200 if (DataVT.getVectorElementType() == MVT::i64 ||
7201 Index.getValueType().getVectorElementType() == MVT::i64 ||
7202 Mask.getValueType().getVectorElementType() == MVT::i64)
7203 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7204
7205 // Promote vector operands except for passthrough, which we know is either
7206 // undef or zero, and thus best constructed directly.
7207 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7208 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7209 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7210
7211 // A promoted result type forces the need for an extending load.
7212 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
7213 ExtType = ISD::EXTLOAD;
7214
7215 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7216
7217 // Convert fixed length vector operands to scalable.
7218 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7219 MemVT.getVectorElementType());
7220 Index = convertToScalableVector(DAG, ContainerVT, Index);
7222 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
7223 : DAG.getConstant(0, DL, ContainerVT);
7224
7225 // Emit equivalent scalable vector gather.
7226 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
7227 SDValue Load =
7228 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
7229 Ops, MGT->getMemOperand(), IndexType, ExtType);
7230
7231 // Extract fixed length data then convert to the required result type.
7232 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
7233 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
7234 if (VT.isFloatingPoint())
7235 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
7236
7237 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7238 }
7239
7240 // Everything else is legal.
7241 return Op;
7242}
7243
7244SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
7245 SelectionDAG &DAG) const {
7246 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
7247
7248 SDLoc DL(Op);
7249 SDValue Chain = MSC->getChain();
7250 SDValue StoreVal = MSC->getValue();
7251 SDValue Mask = MSC->getMask();
7252 SDValue BasePtr = MSC->getBasePtr();
7253 SDValue Index = MSC->getIndex();
7254 SDValue Scale = MSC->getScale();
7255 EVT VT = StoreVal.getValueType();
7256 EVT MemVT = MSC->getMemoryVT();
7257 ISD::MemIndexType IndexType = MSC->getIndexType();
7258 bool Truncating = MSC->isTruncatingStore();
7259
7260 bool IsScaled = MSC->isIndexScaled();
7261 bool IsSigned = MSC->isIndexSigned();
7262
7263 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
7264 // must be calculated beforehand.
7265 uint64_t ScaleVal = Scale->getAsZExtVal();
7266 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
7267 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
7268 EVT IndexVT = Index.getValueType();
7269 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
7270 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
7271 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
7272
7273 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7274 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7275 MSC->getMemOperand(), IndexType, Truncating);
7276 }
7277
7278 // Lower fixed length scatter to a scalable equivalent.
7279 if (VT.isFixedLengthVector()) {
7280 assert(Subtarget->useSVEForFixedLengthVectors() &&
7281 "Cannot lower when not using SVE for fixed vectors!");
7282
7283 // Once bitcast we treat floating-point scatters as if integer.
7284 if (VT.isFloatingPoint()) {
7286 MemVT = MemVT.changeVectorElementTypeToInteger();
7287 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
7288 }
7289
7290 // Find the smallest integer fixed length vector we can use for the scatter.
7291 EVT PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i32);
7292 if (VT.getVectorElementType() == MVT::i64 ||
7293 Index.getValueType().getVectorElementType() == MVT::i64 ||
7294 Mask.getValueType().getVectorElementType() == MVT::i64)
7295 PromotedVT = VT.changeVectorElementType(*DAG.getContext(), MVT::i64);
7296
7297 // Promote vector operands.
7298 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7299 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
7300 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
7301 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
7302
7303 // A promoted value type forces the need for a truncating store.
7304 if (PromotedVT != VT)
7305 Truncating = true;
7306
7307 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
7308
7309 // Convert fixed length vector operands to scalable.
7310 MemVT = ContainerVT.changeVectorElementType(*DAG.getContext(),
7311 MemVT.getVectorElementType());
7312 Index = convertToScalableVector(DAG, ContainerVT, Index);
7314 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
7315
7316 // Emit equivalent scalable vector scatter.
7317 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
7318 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
7319 MSC->getMemOperand(), IndexType, Truncating);
7320 }
7321
7322 // Everything else is legal.
7323 return Op;
7324}
7325
7326SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
7327 SDLoc DL(Op);
7328 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
7329 assert(LoadNode && "Expected custom lowering of a masked load node");
7330 EVT VT = Op->getValueType(0);
7331
7332 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
7333 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
7334
7335 SDValue PassThru = LoadNode->getPassThru();
7336 SDValue Mask = LoadNode->getMask();
7337
7338 if (!LoadNode->isExpandingLoad()) {
7339 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7340 return Op;
7341
7343 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7344 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7345 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7346 LoadNode->getExtensionType());
7347
7348 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7349 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7350 }
7351
7352 // Return if EXPAND instruction is not available.
7353 if ((!Subtarget->isSVEAvailable() || !Subtarget->hasSVE2p2()) &&
7354 (!Subtarget->isSVEorStreamingSVEAvailable() || !Subtarget->hasSME2p2()))
7355 return SDValue();
7356
7357 // Create mask using the number of active lanes in the predicate.
7358 SDValue CntActive = DAG.getNode(
7359 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7360 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7361 Mask);
7362
7363 SDValue ActiveMask =
7364 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, Mask->getValueType(0),
7365 DAG.getConstant(0, DL, MVT::i64), CntActive);
7366
7367 // Contiguous load of elements using the active lane mask above.
7368 SDValue Load = DAG.getMaskedLoad(
7369 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7370 LoadNode->getOffset(), ActiveMask, DAG.getUNDEF(VT),
7371 LoadNode->getMemoryVT(), LoadNode->getMemOperand(),
7372 LoadNode->getAddressingMode(), LoadNode->getExtensionType());
7373
7374 // Expand instruction copies the low-numbered elements to active elements
7375 // in the original predicate and zeros all other lanes.
7376 SDValue Result = DAG.getNode(
7377 ISD::INTRINSIC_WO_CHAIN, DL, VT,
7378 DAG.getTargetConstant(Intrinsic::aarch64_sve_expand, DL, MVT::i64), Mask,
7379 Load);
7380
7381 // Copy the passthrough value unless zero/undef.
7382 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode()))
7383 Result = DAG.getSelect(DL, VT, Mask, Result, PassThru);
7384
7385 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7386}
7387
7388// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7389 static SDValue LowerTruncateVectorStore(const SDLoc &DL, StoreSDNode *ST,
7390 EVT VT, EVT MemVT,
7391 SelectionDAG &DAG) {
7392 assert(VT.isVector() && "VT should be a vector type");
7393 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7394
7395 SDValue Value = ST->getValue();
7396
7397 // It first extends the promoted v4i16 to v8i16, truncates it to v8i8, and
7398 // extracts the word lane which represents the v4i8 subvector. It optimizes
7399 // the store to:
7400 //
7401 // xtn v0.8b, v0.8h
7402 // str s0, [x0]
7403
7404 SDValue Poison = DAG.getPOISON(MVT::i16);
7405 SDValue PoisonVec =
7406 DAG.getBuildVector(MVT::v4i16, DL, {Poison, Poison, Poison, Poison});
7407
7408 SDValue TruncExt =
7409 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, PoisonVec);
7410 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7411
7412 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7413 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7414 Trunc, DAG.getConstant(0, DL, MVT::i64));
7415
7416 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7417 ST->getBasePtr(), ST->getMemOperand());
7418}
7419
7420 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7421 SDLoc DL(Op);
7422 SDValue Src = Op.getOperand(0);
7423 MVT DestVT = Op.getSimpleValueType();
7424 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7426 auto *N = cast<AddrSpaceCastSDNode>(Op);
7427 unsigned SrcAS = N->getSrcAddressSpace();
7428 unsigned DestAS = N->getDestAddressSpace();
7429 assert(SrcAS != DestAS &&
7430 "addrspacecast must be between different address spaces");
7431 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7432 TLI.getTargetMachine().getPointerSize(DestAS) &&
7433 "addrspacecast must be between different ptr sizes");
7434 (void)TLI;
7435
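 // Widening from a 32-bit pointer address space sign- or zero-extends the
 // value depending on whether the source is PTR32_SPTR or PTR32_UPTR;
 // narrowing to either 32-bit pointer space truncates and then clears the
 // upper 32 bits via a zero-extend-in-register.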
7436 if (SrcAS == ARM64AS::PTR32_SPTR) {
7437 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7438 DAG.getTargetConstant(0, DL, DestVT));
7439 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7440 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7441 DAG.getTargetConstant(0, DL, DestVT));
7442 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7443 (DestAS == ARM64AS::PTR32_UPTR)) {
7444 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7445 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7446 return Trunc;
7447 } else {
7448 return Src;
7449 }
7450}
7451
7452// Coordinated with STNP handling in
7453// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
7454// `LowerNTStore`
7455static bool isLegalNTStore(Type *DataType, Align Alignment,
7456 const DataLayout &DL) {
7457 // Currently we only support NT stores lowering for little-endian targets.
7458 if (!DL.isLittleEndian())
7459 return false;
7460
7461 // The backend can lower to STNPWi in this case
7462 if (DataType->isIntegerTy(64))
7463 return true;
7464
7465 auto *DataTypeTy = dyn_cast<FixedVectorType>(DataType);
7466 if (!DataTypeTy)
7467 return false;
7468
7469 // Check fixed vector legality
7470 unsigned NumElements = DataTypeTy->getNumElements();
7471 unsigned EltSizeBits = DataTypeTy->getElementType()->getScalarSizeInBits();
7472
7473 // Currently only power-of-2 vectors are supported
7474 if (!isPowerOf2_64(NumElements) || !isPowerOf2_64(EltSizeBits))
7475 return false;
7476
7477 unsigned TotalSizeBits = DataTypeTy->getPrimitiveSizeInBits().getFixedValue();
7478
7479 // The backend can lower to STNPSi or STNPDi in this case
7480 // via `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
7481 if (TotalSizeBits == 64u || TotalSizeBits == 128u)
7482 return true;
7483
7484 // The backend can lower to STNPQi in this case via `LowerNTStore`
7485 if (TotalSizeBits == 256u && (EltSizeBits == 8u || EltSizeBits == 16u ||
7486 EltSizeBits == 32u || EltSizeBits == 64u))
7487 return true;
7488
7489 return false;
7490}
7491
7492// Lower non-temporal stores that would otherwise be broken by legalization.
7493//
7494// Coordinated with STNP constraints in
7495// `llvm/lib/Target/AArch64/AArch64InstrInfo.td` and
7496// `isLegalNTStore`
7497static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT,
7498 const SDLoc &DL, SelectionDAG &DAG) {
7499 assert(StoreNode && "Expected a store operation");
7500 assert(StoreNode->isNonTemporal() && "Expected a non-temporal store");
7501
7502 // Currently, STNP lowering can only either keep or increase code size, thus
7503 // we predicate it to not apply when optimizing for code size.
7504 if (DAG.shouldOptForSize())
7505 return SDValue();
7506
7507 // Currently we only support NT stores lowering for little-endian targets.
7508 if (!DAG.getDataLayout().isLittleEndian())
7509 return SDValue();
7510
7511 if (VT.isVector()) {
7512 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7513 // the custom lowering, as there are no un-paired non-temporal stores and
7514 // legalization will break up 256 bit inputs.
7515 ElementCount EC = MemVT.getVectorElementCount();
7516 if (VT.isVector() && MemVT.getSizeInBits() == 256u && EC.isKnownEven() &&
7517 (MemVT.getScalarSizeInBits() == 8u ||
7518 MemVT.getScalarSizeInBits() == 16u ||
7519 MemVT.getScalarSizeInBits() == 32u ||
7520 MemVT.getScalarSizeInBits() == 64u)) {
7521 SDValue Lo =
7522 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
7523 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7524 StoreNode->getValue(), DAG.getConstant(0, DL, MVT::i64));
7525 SDValue Hi =
7526 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
7527 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7528 StoreNode->getValue(),
7529 DAG.getConstant(EC.getKnownMinValue() / 2, DL, MVT::i64));
7530 SDValue Result = DAG.getMemIntrinsicNode(
7531 AArch64ISD::STNP, DL, DAG.getVTList(MVT::Other),
7532 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7533 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7534 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7535 assert(isLegalNTStore(MemVT.getTypeForEVT(*DAG.getContext()),
7536 StoreNode->getAlign(), DAG.getDataLayout()) &&
7537 "Lowering should be consistent with legality");
7538 return Result;
7539 }
7540 }
7541 return SDValue();
7542}
7543
7544 // Custom lowering for any store, vector or scalar, with or without a
7545 // truncate. Currently we only custom lower truncating stores from vector
7546 // v4i16 to v4i8 and volatile stores of i128.
7547SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7548 SelectionDAG &DAG) const {
7549 SDLoc Dl(Op);
7550 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7551 assert (StoreNode && "Can only custom lower store nodes");
7552
7553 SDValue Value = StoreNode->getValue();
7554
7555 EVT VT = Value.getValueType();
7556 EVT MemVT = StoreNode->getMemoryVT();
7557
7558 if (StoreNode->isNonTemporal()) {
7559 if (auto MaybeSTNP = LowerNTStore(StoreNode, VT, MemVT, Dl, DAG))
7560 return MaybeSTNP;
7561 }
7562
7563 if (VT.isVector()) {
7564 if (useSVEForFixedLengthVectorVT(
7565 VT,
7566 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7567 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7568
7569 unsigned AS = StoreNode->getAddressSpace();
7570 Align Alignment = StoreNode->getAlign();
7571 if (Alignment < MemVT.getStoreSize() &&
7572 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7573 StoreNode->getMemOperand()->getFlags(),
7574 nullptr)) {
7575 return scalarizeVectorStore(StoreNode, DAG);
7576 }
7577
7578 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7579 MemVT == MVT::v4i8) {
7580 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7581 }
7582 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7583 return LowerStore128(Op, DAG);
7584 } else if (MemVT == MVT::i64x8) {
7585 SDValue Value = StoreNode->getValue();
7586 assert(Value->getValueType(0) == MVT::i64x8);
7587 SDValue Chain = StoreNode->getChain();
7588 SDValue Base = StoreNode->getBasePtr();
7589 EVT PtrVT = Base.getValueType();
7590 for (unsigned i = 0; i < 8; i++) {
7591 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64, Value,
7592 DAG.getConstant(i, Dl, MVT::i32));
7593 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7594 DAG.getConstant(i * 8, Dl, PtrVT));
7595 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7596 StoreNode->getBaseAlign());
7597 }
7598 return Chain;
7599 }
7600
7601 return SDValue();
7602}
7603
7604/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7605SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7606 SelectionDAG &DAG) const {
7607 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7608 assert(StoreNode->getMemoryVT() == MVT::i128);
7609 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7610
7611 bool IsStoreRelease =
7612 StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7613 if (StoreNode->isAtomic())
7614 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7615 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7616 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7617 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7618
7619 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7620 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7621 ? StoreNode->getOperand(1)
7622 : StoreNode->getOperand(2);
7623 SDLoc DL(Op);
7624 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7625 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7626 if (DAG.getDataLayout().isBigEndian())
7627 std::swap(StoreValue.first, StoreValue.second);
7628 SDValue Result = DAG.getMemIntrinsicNode(
7629 Opcode, DL, DAG.getVTList(MVT::Other),
7630 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7631 StoreNode->getBasePtr()},
7632 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7633 return Result;
7634}
7635
7636/// Helper function to optimize loads of extended small vectors.
7637/// These patterns would otherwise get scalarized into inefficient sequences.
7638 static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) {
7639 const AArch64Subtarget &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
7640 if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget))
7641 return SDValue();
7642
7643 EVT MemVT = Load->getMemoryVT();
7644 EVT ResVT = Load->getValueType(0);
7645 unsigned NumElts = ResVT.getVectorNumElements();
7646 unsigned DstEltBits = ResVT.getScalarSizeInBits();
7647 unsigned SrcEltBits = MemVT.getScalarSizeInBits();
7648
7649 unsigned ExtOpcode;
7650 switch (Load->getExtensionType()) {
7651 case ISD::EXTLOAD:
7652 case ISD::ZEXTLOAD:
7653 ExtOpcode = ISD::ZERO_EXTEND;
7654 break;
7655 case ISD::SEXTLOAD:
7656 ExtOpcode = ISD::SIGN_EXTEND;
7657 break;
7658 case ISD::NON_EXTLOAD:
7659 return SDValue();
7660 }
7661
7662 SDLoc DL(Load);
7663 SDValue Chain = Load->getChain();
7664 SDValue BasePtr = Load->getBasePtr();
7665 const MachinePointerInfo &PtrInfo = Load->getPointerInfo();
7666 Align Alignment = Load->getAlign();
7667
7668 // Load the data as an FP scalar to avoid issues with integer loads.
7669 unsigned LoadBits = MemVT.getStoreSizeInBits();
7670 MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits);
7671 SDValue ScalarLoad =
7672 DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment);
7673
7674 MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits);
7675 SDValue ScalarToVec =
7676 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad);
7677 MVT BitcastTy =
7678 MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits);
7679 SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec);
7680
7681 SDValue Res = Bitcast;
7682 unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits();
7683 unsigned CurrentNumElts = Res.getValueType().getVectorNumElements();
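 // Worked example (zero-extending load of v4i8 into v4i32): the 32-bit memory
 // value is loaded as an f32 scalar, placed in a v4f32 and bitcast to v16i8;
 // the loop below then extracts v8i8 and zero-extends it to v8i16, extracts
 // v4i16 and zero-extends it to v4i32, at which point the element width and
 // lane count match the result type.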
7684 while (CurrentEltBits < DstEltBits) {
7685 if (Res.getValueSizeInBits() >= 128) {
7686 CurrentNumElts = CurrentNumElts / 2;
7687 MVT ExtractVT =
7688 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7689 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res,
7690 DAG.getConstant(0, DL, MVT::i64));
7691 }
7692 CurrentEltBits = CurrentEltBits * 2;
7693 MVT ExtVT =
7694 MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts);
7695 Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res);
7696 }
7697
7698 if (CurrentNumElts != NumElts) {
7699 MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts);
7700 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res,
7701 DAG.getConstant(0, DL, MVT::i64));
7702 }
7703
7704 return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL);
7705}
7706
7707SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7708 SelectionDAG &DAG) const {
7709 SDLoc DL(Op);
7710 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7711 assert(LoadNode && "Expected custom lowering of a load node");
7712
7713 if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG))
7714 return Result;
7715
7716 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7717 SmallVector<SDValue, 8> Ops;
7718 SDValue Base = LoadNode->getBasePtr();
7719 SDValue Chain = LoadNode->getChain();
7720 EVT PtrVT = Base.getValueType();
7721 for (unsigned i = 0; i < 8; i++) {
7722 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7723 DAG.getConstant(i * 8, DL, PtrVT));
7724 SDValue Part =
7725 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7726 LoadNode->getBaseAlign());
7727 Ops.push_back(Part);
7728 Chain = SDValue(Part.getNode(), 1);
7729 }
7730 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7731 return DAG.getMergeValues({Loaded, Chain}, DL);
7732 }
7733
7734 return SDValue();
7735}
7736
7737SDValue AArch64TargetLowering::LowerFixedLengthVectorCompressToSVE(
7738 SDValue Op, SelectionDAG &DAG) const {
7739 SDLoc DL(Op);
7740 EVT VT = Op.getValueType();
7741
7742 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
7743 SDValue Vec = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
7744 SDValue Mask = convertFixedMaskToScalableVector(Op.getOperand(1), DAG);
7745 SDValue Passthru =
7746 convertToScalableVector(DAG, ContainerVT, Op.getOperand(2));
7747
7748 SDValue Result =
7749 DAG.getNode(ISD::VECTOR_COMPRESS, DL, ContainerVT, Vec, Mask, Passthru);
7750 return convertFromScalableVector(DAG, VT, Result);
7751}
7752
7753SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7754 SelectionDAG &DAG) const {
7755 EVT VT = Op.getValueType();
7756 if (!Subtarget->isSVEAvailable())
7757 return SDValue();
7758
7759 if (VT.isFixedLengthVector())
7760 return LowerFixedLengthVectorCompressToSVE(Op, DAG);
7761
7762 SDLoc DL(Op);
7763 SDValue Vec = Op.getOperand(0);
7764 SDValue Mask = Op.getOperand(1);
7765 SDValue Passthru = Op.getOperand(2);
7766 EVT MaskVT = Mask.getValueType();
7767
7768 SDValue Compressed = DAG.getNode(
7769 ISD::INTRINSIC_WO_CHAIN, DL, VT,
7770 DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7771 Vec);
7772
7773 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7774 if (Passthru.isUndef() ||
7775 ISD::isConstantSplatVectorAllZeros(Passthru.getNode()))
7776 return Compressed;
7777
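 // Otherwise build a predicate that covers exactly the lanes holding
 // compacted data: CNTP counts the active lanes of the original mask and
 // GET_ACTIVE_LANE_MASK(0, count) sets that many leading lanes, so the final
 // VSELECT keeps the compacted elements and takes the passthru elsewhere.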
7778 SDValue CntActive = DAG.getNode(
7779 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7780 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7781 Mask);
7782
7783 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7784 SDValue CompressedMask =
7785 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
7786
7787 return DAG.getNode(ISD::VSELECT, DL, VT, CompressedMask, Compressed,
7788 Passthru);
7789}
7790
7791// Generate SUBS and CSEL for integer abs.
7792SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7793 MVT VT = Op.getSimpleValueType();
7794
7795 if (VT.isVector())
7796 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7797
7798 SDLoc DL(Op);
7799 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7800
7801 // Generate SUBS & CSEL.
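 // i.e. abs(x) = (x >= 0) ? x : 0 - x, where the PL (non-negative) condition
 // is read from the flags produced by SUBS x, #0.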
7802 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7803 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7804 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7805 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7806}
7807
7808 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
7809 SDValue Chain = Op.getOperand(0);
7810 SDValue Cond = Op.getOperand(1);
7811 SDValue Dest = Op.getOperand(2);
7812
7813 AArch64CC::CondCode CC;
7814 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7815 SDLoc DL(Op);
7816 SDValue CCVal = getCondCode(DAG, CC);
7817 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7818 Cmp);
7819 }
7820
7821 return SDValue();
7822}
7823
7824 // Treat FSHR with a constant shift amount as a legal operation; otherwise it
7825 // is expanded. FSHL is converted to FSHR before deciding what to do with it.
7826 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7827 SDValue Shifts = Op.getOperand(2);
7828 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7829 // If opcode is FSHL, convert it to FSHR
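 // (FSHL(a, b, s) is equivalent to FSHR(a, b, BitWidth - s) whenever
 // s % BitWidth != 0; e.g. for i64, fshl(a, b, 8) becomes fshr(a, b, 56).)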
7830 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7831 SDLoc DL(Op);
7832 MVT VT = Op.getSimpleValueType();
7833 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7834
7835 if (Op.getOpcode() == ISD::FSHL) {
7836 if (NewShiftNo == 0)
7837 return Op.getOperand(0);
7838
7839 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7840 return DAG.getNode(
7841 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7842 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7843 }
7844
7845 if (Op.getOpcode() == ISD::FSHR) {
7846 if (NewShiftNo == 0)
7847 return Op.getOperand(1);
7848
7849 if (ShiftNo->getZExtValue() == NewShiftNo)
7850 return Op;
7851
7852 // Rewrite using the normalised shift amount.
7853 return DAG.getNode(
7854 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7855 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7856 }
7857 }
7858
7859 return SDValue();
7860}
7861
7862 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7863 SDValue X = Op.getOperand(0);
7864 EVT XScalarTy = X.getValueType();
7865 SDValue Exp = Op.getOperand(1);
7866
7867 SDLoc DL(Op);
7868 EVT XVT, ExpVT;
7869 switch (Op.getSimpleValueType().SimpleTy) {
7870 default:
7871 return SDValue();
7872 case MVT::bf16:
7873 case MVT::f16:
7874 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7875 [[fallthrough]];
7876 case MVT::f32:
7877 XVT = MVT::nxv4f32;
7878 ExpVT = MVT::nxv4i32;
7879 break;
7880 case MVT::f64:
7881 XVT = MVT::nxv2f64;
7882 ExpVT = MVT::nxv2i64;
7883 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7884 break;
7885 }
7886
7887 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7888 SDValue VX =
7889 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getPOISON(XVT), X, Zero);
7890 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7891 DAG.getPOISON(ExpVT), Exp, Zero);
7892 SDValue VPg = DAG.getConstant(
7893 1, DL, XVT.changeVectorElementType(*DAG.getContext(), MVT::i1));
7894 SDValue FScale = DAG.getNode(
7896 DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7897 VX, VExp);
7898 SDValue Final =
7899 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7900 if (X.getValueType() != XScalarTy)
7901 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7902 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7903 return Final;
7904}
7905
7906SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7907 SelectionDAG &DAG) const {
7908 return Op.getOperand(0);
7909}
7910
7911SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7912 SelectionDAG &DAG) const {
7913 SDValue Chain = Op.getOperand(0);
7914 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7915 SDValue FPtr = Op.getOperand(2); // nested function
7916 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7917
7918 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7919
7920 // ldr NestReg, .+16
7921 // ldr x17, .+20
7922 // br x17
7923 // .word 0
7924 // .nest: .qword nest
7925 // .fptr: .qword fptr
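 //
 // For reference, the 32-bit constants stored below are the A64 encodings of
 // those instructions: 0x58000080 | NestReg encodes "ldr x<NestReg>, .+16"
 // (LDR literal), 0x580000b0 | 0x11 encodes "ldr x17, .+20", and 0xd61f0220
 // encodes "br x17". The two literal loads read the .nest and .fptr
 // doublewords stored at offsets 16 and 24 below.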
7926 SDValue OutChains[5];
7927
7928 const Function *Func =
7929 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7930 CallingConv::ID CC = Func->getCallingConv();
7931 unsigned NestReg;
7932
7933 switch (CC) {
7934 default:
7935 NestReg = 0x0f; // X15
7936 break;
7938 // Must be kept in sync with AArch64CallingConv.td
7939 NestReg = 0x04; // X4
7940 break;
7941 }
7942
7943 const char FptrReg = 0x11; // X17
7944
7945 SDValue Addr = Trmp;
7946
7947 SDLoc DL(Op);
7948 OutChains[0] = DAG.getStore(
7949 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7950 MachinePointerInfo(TrmpAddr));
7951
7952 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7953 DAG.getConstant(4, DL, MVT::i64));
7954 OutChains[1] = DAG.getStore(
7955 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7956 MachinePointerInfo(TrmpAddr, 4));
7957
7958 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7959 DAG.getConstant(8, DL, MVT::i64));
7960 OutChains[2] =
7961 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7962 MachinePointerInfo(TrmpAddr, 8));
7963
7964 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7965 DAG.getConstant(16, DL, MVT::i64));
7966 OutChains[3] =
7967 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7968
7969 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7970 DAG.getConstant(24, DL, MVT::i64));
7971 OutChains[4] =
7972 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7973
7974 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7975
7976 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7977 DAG.getConstant(12, DL, MVT::i64));
7978
7979 // Call clear cache on the trampoline instructions.
7980 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7981 EndOfTrmp);
7982}
7983
7984SDValue AArch64TargetLowering::LowerFMUL(SDValue Op, SelectionDAG &DAG) const {
7985 SDLoc DL(Op);
7986 EVT VT = Op.getValueType();
7987 if (VT.getScalarType() != MVT::bf16 ||
7988 (Subtarget->hasSVEB16B16() &&
7989 Subtarget->isNonStreamingSVEorSME2Available()))
7990 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7991
7992 assert(Subtarget->hasBF16() && "Expected +bf16 for custom FMUL lowering");
7993 assert((VT == MVT::nxv4bf16 || VT == MVT::nxv8bf16 || VT == MVT::v8bf16) &&
7994 "Unexpected FMUL VT");
7995
7996 auto MakeGetIntrinsic = [&](Intrinsic::ID IID) {
7997 return [&, IID](EVT VT, auto... Ops) {
7998 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
7999 DAG.getConstant(IID, DL, MVT::i32), Ops...);
8000 };
8001 };
8002
8003 auto Reinterpret = [&](SDValue Value, EVT VT) {
8004 EVT SrcVT = Value.getValueType();
8005 if (VT == SrcVT)
8006 return Value;
8007 if (SrcVT.isFixedLengthVector())
8008 return convertToScalableVector(DAG, VT, Value);
8009 if (VT.isFixedLengthVector())
8010 return convertFromScalableVector(DAG, VT, Value);
8011 return DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Value);
8012 };
8013
8014 bool UseSVEBFMLAL = VT.isScalableVector();
8015 auto FCVT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvt_bf16f32_v2);
8016 auto FCVTNT = MakeGetIntrinsic(Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2);
8017
8018 // Note: The NEON BFMLAL[BT] reads even/odd lanes like the SVE variant.
8019 // This does not match BFCVTN[2], so we use SVE to convert back to bf16.
8020 auto BFMLALB =
8021 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalb
8022 : Intrinsic::aarch64_neon_bfmlalb);
8023 auto BFMLALT =
8024 MakeGetIntrinsic(UseSVEBFMLAL ? Intrinsic::aarch64_sve_bfmlalt
8025 : Intrinsic::aarch64_neon_bfmlalt);
8026
8027 EVT AccVT = UseSVEBFMLAL ? MVT::nxv4f32 : MVT::v4f32;
8028 bool IgnoreZeroSign = DAG.canIgnoreSignBitOfZero(Op);
8029 SDValue Zero = DAG.getConstantFP(IgnoreZeroSign ? +0.0F : -0.0F, DL, AccVT);
8030 SDValue Pg = getPredicateForVector(DAG, DL, AccVT);
8031
8032 // Lower bf16 FMUL as a pair (VT == [nx]v8bf16) of BFMLAL top/bottom
8033 // instructions. These result in two f32 vectors, which can be converted back
8034 // to bf16 with FCVT and FCVTNT.
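 // In effect BFMLALB/BFMLALT widen the even/odd bf16 lanes of both operands to
 // f32 and multiply-accumulate them into a zero accumulator (a widening
 // multiply), after which FCVT writes the bottom products into the even bf16
 // lanes and FCVTNT inserts the top products into the odd lanes.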
8035 SDValue LHS = Op.getOperand(0);
8036 SDValue RHS = Op.getOperand(1);
8037
8038 // All SVE intrinsics expect to operate on full bf16 vector types.
8039 if (UseSVEBFMLAL) {
8040 LHS = Reinterpret(LHS, MVT::nxv8bf16);
8041 RHS = Reinterpret(RHS, MVT::nxv8bf16);
8042 }
8043
8044 SDValue BottomF32 = Reinterpret(BFMLALB(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
8045 SDValue BottomBF16 =
8046 FCVT(MVT::nxv8bf16, DAG.getPOISON(MVT::nxv8bf16), Pg, BottomF32);
8047 // Note: nxv4bf16 only uses even lanes.
8048 if (VT == MVT::nxv4bf16)
8049 return Reinterpret(BottomBF16, VT);
8050
8051 SDValue TopF32 = Reinterpret(BFMLALT(AccVT, Zero, LHS, RHS), MVT::nxv4f32);
8052 SDValue TopBF16 = FCVTNT(MVT::nxv8bf16, BottomBF16, Pg, TopF32);
8053 return Reinterpret(TopBF16, VT);
8054}
8055
8056SDValue AArch64TargetLowering::LowerFMA(SDValue Op, SelectionDAG &DAG) const {
8057 SDValue OpA = Op->getOperand(0);
8058 SDValue OpB = Op->getOperand(1);
8059 SDValue OpC = Op->getOperand(2);
8060 EVT VT = Op.getValueType();
8061 SDLoc DL(Op);
8062
8063 assert(VT.isVector() && "Scalar fma lowering should be handled by patterns");
8064
8065 // Bail early if we're definitely not looking to merge FNEGs into the FMA.
8066 if (VT != MVT::v8f16 && VT != MVT::v4f32 && VT != MVT::v2f64)
8067 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
8068
8069 if (OpC.getOpcode() != ISD::FNEG)
8070 return useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())
8071 ? LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED)
8072 : Op; // Fallback to NEON lowering.
8073
8074 // Convert FMA/FNEG nodes to SVE to enable the following patterns:
8075 // fma(a, b, neg(c)) -> fnmls(a, b, c)
8076 // fma(neg(a), b, neg(c)) -> fnmla(a, b, c)
8077 // fma(a, neg(b), neg(c)) -> fnmla(a, b, c)
8078 SDValue Pg = getPredicateForVector(DAG, DL, VT);
8079 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
8080
8081 auto ConvertToScalableFnegMt = [&](SDValue Op) {
8082 if (Op.getOpcode() == ISD::FNEG)
8083 Op = LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8084 return convertToScalableVector(DAG, ContainerVT, Op);
8085 };
8086
8087 OpA = ConvertToScalableFnegMt(OpA);
8088 OpB = ConvertToScalableFnegMt(OpB);
8089 OpC = ConvertToScalableFnegMt(OpC);
8090
8091 SDValue ScalableRes =
8092 DAG.getNode(AArch64ISD::FMA_PRED, DL, ContainerVT, Pg, OpA, OpB, OpC);
8093 return convertFromScalableVector(DAG, VT, ScalableRes);
8094}
8095
8097 EVT VT = Op.getValueType();
8098 assert(
8099 (VT == MVT::i64 || VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) &&
8100 "Unexpected Type");
8101 SDLoc DL(Op);
8102 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 64 / VT.getSizeInBits());
8103 EVT CLMULTy = VT == MVT::i8 ? MVT::v8i8 : MVT::v1i64;
8104 EVT ExtractTy = VT == MVT::i64 ? MVT::i64 : MVT::i32;
8105 SDValue VecOp0 =
8106 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op.getOperand(0));
8107 SDValue VecOp1 =
8108 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op.getOperand(1));
8109
8110 if (VecVT != CLMULTy) {
8111 VecOp0 = DAG.getNode(ISD::BITCAST, DL, CLMULTy, VecOp0);
8112 VecOp1 = DAG.getNode(ISD::BITCAST, DL, CLMULTy, VecOp1);
8113 }
8114 SDValue CLMUL = DAG.getNode(ISD::CLMUL, DL, CLMULTy, VecOp0, VecOp1);
8115 if (ExtractTy == MVT::i32)
8116 CLMUL = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, CLMUL);
8117 SDValue ExtractVecElt =
8118 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy, CLMUL,
8119 DAG.getTargetConstant(0, DL, MVT::i64));
8120 if (ExtractTy != VT)
8121 ExtractVecElt = DAG.getNode(ISD::TRUNCATE, DL, VT, ExtractVecElt);
8122 return ExtractVecElt;
8123}
8124
8125 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
8126 SelectionDAG &DAG) const {
8127 LLVM_DEBUG(dbgs() << "Custom lowering: ");
8128 LLVM_DEBUG(Op.dump());
8129
8130 switch (Op.getOpcode()) {
8131 default:
8132 llvm_unreachable("unimplemented operand");
8133 return SDValue();
8134 case ISD::LOOP_DEPENDENCE_WAR_MASK:
8135 case ISD::LOOP_DEPENDENCE_RAW_MASK:
8136 return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
8137 case ISD::BITCAST:
8138 return LowerBITCAST(Op, DAG);
8139 case ISD::GlobalAddress:
8140 return LowerGlobalAddress(Op, DAG);
8141 case ISD::GlobalTLSAddress:
8142 return LowerGlobalTLSAddress(Op, DAG);
8143 case ISD::PtrAuthGlobalAddress:
8144 return LowerPtrAuthGlobalAddress(Op, DAG);
8145 case ISD::ADJUST_TRAMPOLINE:
8146 return LowerADJUST_TRAMPOLINE(Op, DAG);
8147 case ISD::INIT_TRAMPOLINE:
8148 return LowerINIT_TRAMPOLINE(Op, DAG);
8149 case ISD::SETCC:
8150 case ISD::STRICT_FSETCC:
8151 case ISD::STRICT_FSETCCS:
8152 return LowerSETCC(Op, DAG);
8153 case ISD::SETCCCARRY:
8154 return LowerSETCCCARRY(Op, DAG);
8155 case ISD::BRCOND:
8156 return LowerBRCOND(Op, DAG);
8157 case ISD::BR_CC:
8158 return LowerBR_CC(Op, DAG);
8159 case ISD::SELECT:
8160 return LowerSELECT(Op, DAG);
8161 case ISD::SELECT_CC:
8162 return LowerSELECT_CC(Op, DAG);
8163 case ISD::JumpTable:
8164 return LowerJumpTable(Op, DAG);
8165 case ISD::BR_JT:
8166 return LowerBR_JT(Op, DAG);
8167 case ISD::BRIND:
8168 return LowerBRIND(Op, DAG);
8169 case ISD::ConstantPool:
8170 return LowerConstantPool(Op, DAG);
8171 case ISD::BlockAddress:
8172 return LowerBlockAddress(Op, DAG);
8173 case ISD::VASTART:
8174 return LowerVASTART(Op, DAG);
8175 case ISD::VACOPY:
8176 return LowerVACOPY(Op, DAG);
8177 case ISD::VAARG:
8178 return LowerVAARG(Op, DAG);
8179 case ISD::UADDO_CARRY:
8180 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
8181 case ISD::USUBO_CARRY:
8182 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
8183 case ISD::SADDO_CARRY:
8184 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
8185 case ISD::SSUBO_CARRY:
8186 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
8187 case ISD::SADDO:
8188 case ISD::UADDO:
8189 case ISD::SSUBO:
8190 case ISD::USUBO:
8191 case ISD::SMULO:
8192 case ISD::UMULO:
8193 return LowerXALUO(Op, DAG);
8194 case ISD::FADD:
8195 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
8196 case ISD::FSUB:
8197 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
8198 case ISD::FMUL:
8199 return LowerFMUL(Op, DAG);
8200 case ISD::FMA:
8201 return LowerFMA(Op, DAG);
8202 case ISD::FDIV:
8203 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
8204 case ISD::FNEG:
8205 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
8206 case ISD::FCEIL:
8207 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
8208 case ISD::FFLOOR:
8209 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
8210 case ISD::FNEARBYINT:
8211 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
8212 case ISD::FRINT:
8213 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
8214 case ISD::FROUND:
8215 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
8216 case ISD::FROUNDEVEN:
8217 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
8218 case ISD::FTRUNC:
8219 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
8220 case ISD::FSQRT:
8221 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
8222 case ISD::FABS:
8223 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
8224 case ISD::FP_ROUND:
8225 case ISD::STRICT_FP_ROUND:
8226 return LowerFP_ROUND(Op, DAG);
8227 case ISD::FP_EXTEND:
8228 case ISD::STRICT_FP_EXTEND:
8229 return LowerFP_EXTEND(Op, DAG);
8230 case ISD::FRAMEADDR:
8231 return LowerFRAMEADDR(Op, DAG);
8232 case ISD::SPONENTRY:
8233 return LowerSPONENTRY(Op, DAG);
8234 case ISD::RETURNADDR:
8235 return LowerRETURNADDR(Op, DAG);
8236 case ISD::ADDROFRETURNADDR:
8237 return LowerADDROFRETURNADDR(Op, DAG);
8238 case ISD::CONCAT_VECTORS:
8239 return LowerCONCAT_VECTORS(Op, DAG);
8240 case ISD::INSERT_VECTOR_ELT:
8241 return LowerINSERT_VECTOR_ELT(Op, DAG);
8242 case ISD::EXTRACT_VECTOR_ELT:
8243 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
8244 case ISD::BUILD_VECTOR:
8245 return LowerBUILD_VECTOR(Op, DAG);
8246 case ISD::ANY_EXTEND_VECTOR_INREG:
8247 case ISD::SIGN_EXTEND_VECTOR_INREG:
8248 return LowerEXTEND_VECTOR_INREG(Op, DAG);
8249 case ISD::ZERO_EXTEND_VECTOR_INREG:
8250 return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
8251 case ISD::VECTOR_SHUFFLE:
8252 return LowerVECTOR_SHUFFLE(Op, DAG);
8253 case ISD::SPLAT_VECTOR:
8254 return LowerSPLAT_VECTOR(Op, DAG);
8255 case ISD::EXTRACT_SUBVECTOR:
8256 return LowerEXTRACT_SUBVECTOR(Op, DAG);
8257 case ISD::INSERT_SUBVECTOR:
8258 return LowerINSERT_SUBVECTOR(Op, DAG);
8259 case ISD::MASKED_SDIV:
8260 case ISD::MASKED_UDIV:
8261 case ISD::SDIV:
8262 case ISD::UDIV:
8263 return LowerDIV(Op, DAG);
8264 case ISD::SMIN:
8265 case ISD::UMIN:
8266 case ISD::SMAX:
8267 case ISD::UMAX:
8268 return LowerMinMax(Op, DAG);
8269 case ISD::SRA:
8270 case ISD::SRL:
8271 case ISD::SHL:
8272 return LowerVectorSRA_SRL_SHL(Op, DAG);
8273 case ISD::SHL_PARTS:
8274 case ISD::SRL_PARTS:
8275 case ISD::SRA_PARTS:
8276 return LowerShiftParts(Op, DAG);
8277 case ISD::CTPOP:
8278 case ISD::PARITY:
8279 return LowerCTPOP_PARITY(Op, DAG);
8280 case ISD::FCOPYSIGN:
8281 return LowerFCOPYSIGN(Op, DAG);
8282 case ISD::OR:
8283 return LowerVectorOR(Op, DAG);
8284 case ISD::XOR:
8285 return LowerXOR(Op, DAG);
8286 case ISD::PREFETCH:
8287 return LowerPREFETCH(Op, DAG);
8288 case ISD::SINT_TO_FP:
8289 case ISD::UINT_TO_FP:
8290 case ISD::STRICT_SINT_TO_FP:
8291 case ISD::STRICT_UINT_TO_FP:
8292 return LowerINT_TO_FP(Op, DAG);
8293 case ISD::FP_TO_SINT:
8294 case ISD::FP_TO_UINT:
8295 case ISD::STRICT_FP_TO_SINT:
8296 case ISD::STRICT_FP_TO_UINT:
8297 return LowerFP_TO_INT(Op, DAG);
8298 case ISD::FP_TO_SINT_SAT:
8299 case ISD::FP_TO_UINT_SAT:
8300 return LowerFP_TO_INT_SAT(Op, DAG);
8301 case ISD::GET_ROUNDING:
8302 return LowerGET_ROUNDING(Op, DAG);
8303 case ISD::SET_ROUNDING:
8304 return LowerSET_ROUNDING(Op, DAG);
8305 case ISD::GET_FPMODE:
8306 return LowerGET_FPMODE(Op, DAG);
8307 case ISD::SET_FPMODE:
8308 return LowerSET_FPMODE(Op, DAG);
8309 case ISD::RESET_FPMODE:
8310 return LowerRESET_FPMODE(Op, DAG);
8311 case ISD::MUL:
8312 return LowerMUL(Op, DAG);
8313 case ISD::MULHS:
8314 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
8315 case ISD::MULHU:
8316 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
8317 case ISD::INTRINSIC_W_CHAIN:
8318 return LowerINTRINSIC_W_CHAIN(Op, DAG);
8319 case ISD::INTRINSIC_WO_CHAIN:
8320 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
8321 case ISD::INTRINSIC_VOID:
8322 return LowerINTRINSIC_VOID(Op, DAG);
8323 case ISD::ATOMIC_STORE:
8324 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
8325 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
8326 return LowerStore128(Op, DAG);
8327 }
8328 return SDValue();
8329 case ISD::STORE:
8330 return LowerSTORE(Op, DAG);
8331 case ISD::MSTORE:
8332 return LowerMSTORE(Op, DAG);
8333 case ISD::MGATHER:
8334 return LowerMGATHER(Op, DAG);
8335 case ISD::MSCATTER:
8336 return LowerMSCATTER(Op, DAG);
8337 case ISD::VECREDUCE_SEQ_FADD:
8338 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
8339 case ISD::VECREDUCE_ADD:
8340 case ISD::VECREDUCE_AND:
8341 case ISD::VECREDUCE_OR:
8342 case ISD::VECREDUCE_XOR:
8352 return LowerVECREDUCE(Op, DAG);
8353 case ISD::VECREDUCE_MUL:
8354 case ISD::VECREDUCE_FMUL:
8355 return LowerVECREDUCE_MUL(Op, DAG);
8356 case ISD::ATOMIC_LOAD_AND:
8357 return LowerATOMIC_LOAD_AND(Op, DAG);
8358 case ISD::DYNAMIC_STACKALLOC:
8359 return LowerDYNAMIC_STACKALLOC(Op, DAG);
8360 case ISD::VSCALE:
8361 return LowerVSCALE(Op, DAG);
8362 case ISD::VECTOR_COMPRESS:
8363 return LowerVECTOR_COMPRESS(Op, DAG);
8364 case ISD::ANY_EXTEND:
8365 case ISD::SIGN_EXTEND:
8366 case ISD::ZERO_EXTEND:
8367 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
8368 case ISD::ADDRSPACECAST:
8369 return LowerADDRSPACECAST(Op, DAG);
8370 case ISD::SIGN_EXTEND_INREG: {
8371 // Only custom lower when ExtraVT has a legal byte based element type.
8372 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
8373 EVT ExtraEltVT = ExtraVT.getVectorElementType();
8374 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
8375 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
8376 return SDValue();
8377
8378 return LowerToPredicatedOp(Op, DAG,
8379 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
8380 }
8381 case ISD::TRUNCATE:
8382 return LowerTRUNCATE(Op, DAG);
8383 case ISD::MLOAD:
8384 return LowerMLOAD(Op, DAG);
8385 case ISD::LOAD:
8386 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
8387 !Subtarget->isNeonAvailable()))
8388 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
8389 return LowerLOAD(Op, DAG);
8390 case ISD::ADD:
8391 case ISD::AND:
8392 case ISD::SUB:
8393 return LowerToScalableOp(Op, DAG);
8394 case ISD::FMAXIMUM:
8395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
8396 case ISD::FMAXNUM:
8397 case ISD::FMAXNUM_IEEE:
8398 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
8399 case ISD::FMINIMUM:
8400 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
8401 case ISD::FMINNUM:
8402 case ISD::FMINNUM_IEEE:
8403 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
8404 case ISD::VSELECT:
8405 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
8406 case ISD::ABS:
8407 return LowerABS(Op, DAG);
8408 case ISD::ABDS:
8409 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
8410 case ISD::ABDU:
8411 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
8412 case ISD::AVGFLOORS:
8413 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
8414 case ISD::AVGFLOORU:
8415 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
8416 case ISD::AVGCEILS:
8417 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
8418 case ISD::AVGCEILU:
8419 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
8420 case ISD::BITREVERSE:
8421 return LowerBitreverse(Op, DAG);
8422 case ISD::BSWAP:
8423 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
8424 case ISD::CTLZ:
8425 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
8426 case ISD::CTTZ:
8427 return LowerCTTZ(Op, DAG);
8430 return LowerVECTOR_SPLICE(Op, DAG);
8431 case ISD::VECTOR_DEINTERLEAVE:
8432 return LowerVECTOR_DEINTERLEAVE(Op, DAG);
8433 case ISD::VECTOR_INTERLEAVE:
8434 return LowerVECTOR_INTERLEAVE(Op, DAG);
8435 case ISD::GET_ACTIVE_LANE_MASK:
8436 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
8437 case ISD::LRINT:
8438 case ISD::LLRINT:
8439 if (Op.getValueType().isVector())
8440 return LowerVectorXRINT(Op, DAG);
8441 [[fallthrough]];
8442 case ISD::LROUND:
8443 case ISD::LLROUND: {
8444 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
8445 Op.getOperand(0).getValueType() == MVT::bf16) &&
8446 "Expected custom lowering of rounding operations only for f16");
8447 SDLoc DL(Op);
8448 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
8449 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
8450 }
8451 case ISD::STRICT_LROUND:
8452 case ISD::STRICT_LLROUND:
8453 case ISD::STRICT_LRINT:
8454 case ISD::STRICT_LLRINT: {
8455 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
8456 Op.getOperand(1).getValueType() == MVT::bf16) &&
8457 "Expected custom lowering of rounding operations only for f16");
8458 SDLoc DL(Op);
8459 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
8460 {Op.getOperand(0), Op.getOperand(1)});
8461 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
8462 {Ext.getValue(1), Ext.getValue(0)});
8463 }
8464 case ISD::WRITE_REGISTER: {
8465 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
8466 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
8467 SDLoc DL(Op);
8468
8469 SDValue Chain = Op.getOperand(0);
8470 SDValue SysRegName = Op.getOperand(1);
8471 std::pair<SDValue, SDValue> Pair =
8472 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
8473
8474 // chain = MSRR(chain, sysregname, lo, hi)
8475 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
8476 SysRegName, Pair.first, Pair.second);
8477
8478 return Result;
8479 }
8480 case ISD::FSHL:
8481 case ISD::FSHR:
8482 return LowerFunnelShift(Op, DAG);
8483 case ISD::FLDEXP:
8484 return LowerFLDEXP(Op, DAG);
8485 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
8486 return LowerVECTOR_HISTOGRAM(Op, DAG);
8491 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
8492 case ISD::CLMUL:
8493 return LowerCLMUL(Op, DAG);
8494 case ISD::FCANONICALIZE:
8495 return LowerFCANONICALIZE(Op, DAG);
8496 case ISD::CTTZ_ELTS:
8498 SDLoc DL(Op);
8499 SDValue CttzOp = Op.getOperand(0);
8500 EVT VT = CttzOp.getValueType();
8501 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
8502
8503 if (VT.isFixedLengthVector()) {
8504 // We can use SVE instructions to lower this intrinsic by first creating
8505 // an SVE predicate register mask from the fixed-width vector.
8506 VT = getTypeToTransformTo(*DAG.getContext(), VT);
8507 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, CttzOp);
8508 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
8509 }
8510
8511 SDValue Pg = getPredicateForVector(DAG, DL, VT);
8512 SDValue NewCttzElts =
8513 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, Pg, CttzOp);
8514 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
8515 }
8516 }
8517}
8518
8520 return !Subtarget->useSVEForFixedLengthVectors();
8521}
8522
8524 EVT VT, bool OverrideNEON) const {
8525 if (!VT.isFixedLengthVector() || !VT.isSimple())
8526 return false;
8527
8528 // Don't use SVE for vectors we cannot scalarize if required.
8529 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
8530 // Fixed length predicates should be promoted to i8.
8531 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
8532 case MVT::i1:
8533 default:
8534 return false;
8535 case MVT::i8:
8536 case MVT::i16:
8537 case MVT::i32:
8538 case MVT::i64:
8539 case MVT::f16:
8540 case MVT::f32:
8541 case MVT::f64:
8542 break;
8543 }
8544
8545 // NEON-sized vectors can be emulated using SVE instructions.
8546 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
8547 return Subtarget->isSVEorStreamingSVEAvailable();
8548
8549 // Ensure NEON MVTs only belong to a single register class.
8550 if (VT.getFixedSizeInBits() <= 128)
8551 return false;
8552
8553 // Ensure wider than NEON code generation is enabled.
8554 if (!Subtarget->useSVEForFixedLengthVectors())
8555 return false;
8556
8557 // Don't use SVE for types that don't fit.
8558 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
8559 return false;
8560
8561 // TODO: Perhaps an artificial restriction, but worth having whilst getting
8562 // the base fixed length SVE support in place.
8563 if (!VT.isPow2VectorType())
8564 return false;
8565
8566 return true;
8567}
8568
8569//===----------------------------------------------------------------------===//
8570// Calling Convention Implementation
8571//===----------------------------------------------------------------------===//
8572
8573static unsigned getIntrinsicID(const SDNode *N) {
8574 unsigned Opcode = N->getOpcode();
8575 switch (Opcode) {
8576 default:
8577 return Intrinsic::not_intrinsic;
8578 case ISD::INTRINSIC_WO_CHAIN: {
8579 unsigned IID = N->getConstantOperandVal(0);
8580 if (IID < Intrinsic::num_intrinsics)
8581 return IID;
8582 return Intrinsic::not_intrinsic;
8583 }
8584 }
8585}
8586
8588 SDValue N1) const {
8589 if (!N0.hasOneUse())
8590 return false;
8591
8592 unsigned IID = getIntrinsicID(N1.getNode());
8593 // Avoid reassociating expressions that can be lowered to smlal/umlal.
8594 if (IID == Intrinsic::aarch64_neon_umull ||
8595 N1.getOpcode() == AArch64ISD::UMULL ||
8596 IID == Intrinsic::aarch64_neon_smull ||
8597 N1.getOpcode() == AArch64ISD::SMULL)
8598 return N0.getOpcode() != ISD::ADD;
8599
8600 return true;
8601}
8602
8603/// Selects the correct CCAssignFn for a given CallingConvention value.
8605 bool IsVarArg) const {
8606 switch (CC) {
8607 default:
8608 reportFatalUsageError("unsupported calling convention");
8609 case CallingConv::GHC:
8610 return CC_AArch64_GHC;
8611 case CallingConv::PreserveNone:
8612 // The VarArg implementation makes assumptions about register
8613 // argument passing that do not hold for preserve_none, so we
8614 // instead fall back to C argument passing.
8615 // The non-vararg case is handled in the CC function itself.
8616 if (!IsVarArg)
8617 return CC_AArch64_Preserve_None;
8618 [[fallthrough]];
8619 case CallingConv::C:
8620 case CallingConv::Fast:
8621 case CallingConv::PreserveMost:
8622 case CallingConv::PreserveAll:
8623 case CallingConv::CXX_FAST_TLS:
8624 case CallingConv::Swift:
8625 case CallingConv::SwiftTail:
8626 case CallingConv::Tail:
8627 case CallingConv::GRAAL:
8628 if (Subtarget->isTargetWindows()) {
8629 if (IsVarArg) {
8630 if (Subtarget->isWindowsArm64EC())
8631 return CC_AArch64_Arm64EC_VarArg;
8632 return CC_AArch64_Win64_VarArg;
8633 }
8634 return CC_AArch64_Win64PCS;
8635 }
8636 if (!Subtarget->isTargetDarwin())
8637 return CC_AArch64_AAPCS;
8638 if (!IsVarArg)
8639 return CC_AArch64_DarwinPCS;
8640 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8641 : CC_AArch64_DarwinPCS_VarArg;
8642 case CallingConv::Win64:
8643 if (IsVarArg) {
8644 if (Subtarget->isWindowsArm64EC())
8645 return CC_AArch64_Arm64EC_VarArg;
8646 return CC_AArch64_Win64_VarArg;
8647 }
8648 return CC_AArch64_Win64PCS;
8650 if (Subtarget->isWindowsArm64EC())
8658 return CC_AArch64_AAPCS;
8663 }
8664}
8665
8666CCAssignFn *
8668 switch (CC) {
8669 default:
8670 return RetCC_AArch64_AAPCS;
8674 if (Subtarget->isWindowsArm64EC())
8676 return RetCC_AArch64_AAPCS;
8677 }
8678}
8679
8680static bool isPassedInFPR(EVT VT) {
8681 return VT.isFixedLengthVector() ||
8682 (VT.isFloatingPoint() && !VT.isScalableVector());
8683}
8684
8685SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8686 SelectionDAG &DAG) const {
8687 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8688 SDValue Glue = Chain.getValue(1);
8689
8690 MachineFunction &MF = DAG.getMachineFunction();
8691 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8692
8693 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8694
8695 // The following conditions are true on entry to an exception handler:
8696 // - PSTATE.SM is 0.
8697 // - PSTATE.ZA is 0.
8698 // - TPIDR2_EL0 is null.
8699 // See:
8700 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8701 //
8702 // Therefore, if the function that contains this exception handler is a
8703 // streaming[-compatible] function, we must re-enable streaming mode.
8704 //
8705 // These mode changes are usually optimized away in catch blocks as they
8706 // occur before the __cxa_begin_catch (which is a non-streaming function),
8707 // but are necessary in some cases (such as for cleanups).
8708 //
8709 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8710
8711 // [COND_]SMSTART SM
8712 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8713 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8714 /*Glue*/ Glue, AArch64SME::Always);
8715 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8716 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8717 AArch64SME::IfCallerIsStreaming);
8718 return Chain;
8719}
8720
8721SDValue AArch64TargetLowering::LowerFormalArguments(
8722 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8723 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8724 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8725 MachineFunction &MF = DAG.getMachineFunction();
8726 const Function &F = MF.getFunction();
8727 MachineFrameInfo &MFI = MF.getFrameInfo();
8728 bool IsWin64 =
8729 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8730 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8731 (isVarArg && Subtarget->isWindowsArm64EC());
8732 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8733
8734 SmallVector<ISD::OutputArg, 4> Outs;
8735 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8736 DAG.getTargetLoweringInfo(), MF.getDataLayout());
8737 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8738 FuncInfo->setIsSVECC(true);
8739
8740 // Assign locations to all of the incoming arguments.
8741 SmallVector<CCValAssign, 16> ArgLocs;
8742 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8743
8744 // At this point, Ins[].VT may already be promoted to i32. To correctly
8745 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8746 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8747 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8748 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8749 // LocVT.
8750 unsigned NumArgs = Ins.size();
8751 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8752 unsigned CurArgIdx = 0;
8753 bool UseVarArgCC = false;
8754 if (IsWin64)
8755 UseVarArgCC = isVarArg;
8756
8757 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8758
8759 for (unsigned i = 0; i != NumArgs; ++i) {
8760 MVT ValVT = Ins[i].VT;
8761 if (Ins[i].isOrigArg()) {
8762 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8763 CurArgIdx = Ins[i].getOrigArgIndex();
8764
8765 // Get type of the original argument.
8766 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8767 /*AllowUnknown*/ true);
8768 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8769 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8770 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8771 ValVT = MVT::i8;
8772 else if (ActualMVT == MVT::i16)
8773 ValVT = MVT::i16;
8774 }
8775 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8776 Ins[i].OrigTy, CCInfo);
8777 assert(!Res && "Call operand has unhandled type");
8778 (void)Res;
8779 }
8780
8781 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8782 bool IsLocallyStreaming =
8783 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8784 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8785 SDValue Glue = Chain.getValue(1);
8786
8787 unsigned ExtraArgLocs = 0;
8788 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8789 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8790
8791 if (Ins[i].Flags.isByVal()) {
8792 // Byval is used for HFAs in the PCS, but the system should work in a
8793 // non-compliant manner for larger structs.
8794 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8795 int Size = Ins[i].Flags.getByValSize();
8796 unsigned NumRegs = (Size + 7) / 8;
8797
8798 // FIXME: This works on big-endian for composite byvals, which are the common
8799 // case. It should also work for fundamental types.
8800 unsigned FrameIdx =
8801 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8802 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8803 InVals.push_back(FrameIdxN);
8804
8805 continue;
8806 }
8807
8808 if (Ins[i].Flags.isSwiftAsync())
8809 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8810
8811 SDValue ArgValue;
8812 if (VA.isRegLoc()) {
8813 // Arguments stored in registers.
8814 EVT RegVT = VA.getLocVT();
8815 const TargetRegisterClass *RC;
8816
8817 if (RegVT == MVT::i32)
8818 RC = &AArch64::GPR32RegClass;
8819 else if (RegVT == MVT::i64)
8820 RC = &AArch64::GPR64RegClass;
8821 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8822 RC = &AArch64::FPR16RegClass;
8823 else if (RegVT == MVT::f32)
8824 RC = &AArch64::FPR32RegClass;
8825 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8826 RC = &AArch64::FPR64RegClass;
8827 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8828 RC = &AArch64::FPR128RegClass;
8829 else if (RegVT.isScalableVector() &&
8830 RegVT.getVectorElementType() == MVT::i1) {
8831 FuncInfo->setIsSVECC(true);
8832 RC = &AArch64::PPRRegClass;
8833 } else if (RegVT == MVT::aarch64svcount) {
8834 FuncInfo->setIsSVECC(true);
8835 RC = &AArch64::PPRRegClass;
8836 } else if (RegVT.isScalableVector()) {
8837 FuncInfo->setIsSVECC(true);
8838 RC = &AArch64::ZPRRegClass;
8839 } else
8840 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8841
8842 // Transform the arguments in physical registers into virtual ones.
8843 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8844
8845 if (IsLocallyStreaming) {
8846 // LocallyStreamingFunctions must insert the SMSTART in the correct
8847 // position, so we use Glue to ensure no instructions can be scheduled
8848 // between the chain of:
8849 // t0: ch,glue = EntryNode
8850 // t1: res,ch,glue = CopyFromReg
8851 // ...
8852 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8853 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8854 // ^^^^^^
8855 // This will be the new Chain/Root node.
8856 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8857 Glue = ArgValue.getValue(2);
8858 if (isPassedInFPR(ArgValue.getValueType())) {
8859 ArgValue =
8860 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8861 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8862 {ArgValue, Glue});
8863 Glue = ArgValue.getValue(1);
8864 }
8865 } else
8866 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8867
8868 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8869 // to 64 bits. Insert an assert[sz]ext to capture this, then
8870 // truncate to the right size.
8871 switch (VA.getLocInfo()) {
8872 default:
8873 llvm_unreachable("Unknown loc info!");
8874 case CCValAssign::Full:
8875 break;
8876 case CCValAssign::Indirect:
8877 assert(
8878 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8879 "Indirect arguments should be scalable on most subtargets");
8880 break;
8881 case CCValAssign::BCvt:
8882 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8883 break;
8884 case CCValAssign::AExt:
8885 case CCValAssign::SExt:
8886 case CCValAssign::ZExt:
8887 break;
8888 case CCValAssign::AExtUpper:
8889 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8890 DAG.getConstant(32, DL, RegVT));
8891 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8892 break;
8893 }
8894 } else { // VA.isRegLoc()
8895 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8896 unsigned ArgOffset = VA.getLocMemOffset();
8897 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8898 ? VA.getLocVT().getSizeInBits()
8899 : VA.getValVT().getSizeInBits()) / 8;
8900
8901 uint32_t BEAlign = 0;
8902 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8903 !Ins[i].Flags.isInConsecutiveRegs())
8904 BEAlign = 8 - ArgSize;
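// For example, a 4-byte argument stored in an 8-byte slot on a big-endian
// target occupies the high-address half of the slot, so the load address is
// bumped by BEAlign = 8 - 4 = 4 bytes.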
8905
8906 SDValue FIN;
8907 MachinePointerInfo PtrInfo;
8908 if (StackViaX4) {
8909 // In both the ARM64EC varargs convention and the thunk convention,
8910 // arguments on the stack are accessed relative to x4, not sp. In
8911 // the thunk convention, there's an additional offset of 32 bytes
8912 // to account for the shadow store.
8913 unsigned ObjOffset = ArgOffset + BEAlign;
8914 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8915 ObjOffset += 32;
8916 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8917 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8918 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8919 DAG.getConstant(ObjOffset, DL, MVT::i64));
8920 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8921 } else {
8922 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8923
8924 // Create load nodes to retrieve arguments from the stack.
8925 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8926 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8927 }
8928
8929 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
8930 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8931 MVT MemVT = VA.getValVT();
8932
8933 switch (VA.getLocInfo()) {
8934 default:
8935 break;
8936 case CCValAssign::Trunc:
8937 case CCValAssign::BCvt:
8938 MemVT = VA.getLocVT();
8939 break;
8940 case CCValAssign::Indirect:
8941 assert(
8942 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8943 "Indirect arguments should be scalable on most subtargets");
8944 MemVT = VA.getLocVT();
8945 break;
8946 case CCValAssign::SExt:
8947 ExtType = ISD::SEXTLOAD;
8948 break;
8949 case CCValAssign::ZExt:
8950 ExtType = ISD::ZEXTLOAD;
8951 break;
8952 case CCValAssign::AExt:
8953 ExtType = ISD::EXTLOAD;
8954 break;
8955 }
8956
8957 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8958 MemVT);
8959 }
8960
8961 if (VA.getLocInfo() == CCValAssign::Indirect) {
8962 assert((VA.getValVT().isScalableVT() ||
8963 Subtarget->isWindowsArm64EC()) &&
8964 "Indirect arguments should be scalable on most subtargets");
8965
8966 TypeSize PartSize = VA.getValVT().getStoreSize();
8967 unsigned NumParts = 1;
8968 if (Ins[i].Flags.isInConsecutiveRegs()) {
8969 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8970 ++NumParts;
8971 }
8972
8973 MVT PartLoad = VA.getValVT();
8974 SDValue Ptr = ArgValue;
8975
8976 // Ensure we generate all loads for each tuple part, whilst updating the
8977 // pointer after each load correctly using vscale.
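// Each part is a scalable type, so the step between consecutive parts is
// PartSize scaled by vscale (materialised by getTypeSize below) rather than
// a compile-time constant.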
8978 while (NumParts > 0) {
8979 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8980 InVals.push_back(ArgValue);
8981 NumParts--;
8982 if (NumParts > 0) {
8983 SDValue BytesIncrement =
8984 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
8985 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8986 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8987 ExtraArgLocs++;
8988 i++;
8989 }
8990 }
8991 } else {
8992 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8993 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8994 ArgValue, DAG.getValueType(MVT::i32));
8995
8996 // i1 arguments are zero-extended to i8 by the caller. Emit a
8997 // hint to reflect this.
8998 if (Ins[i].isOrigArg()) {
8999 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
9000 if (OrigArg->getType()->isIntegerTy(1)) {
9001 if (!Ins[i].Flags.isZExt()) {
9002 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
9003 ArgValue.getValueType(), ArgValue);
9004 }
9005 }
9006 }
9007
9008 InVals.push_back(ArgValue);
9009 }
9010 }
9011 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
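// ExtraArgLocs counts the extra Ins entries consumed above when one
// indirectly-passed argument is split into several loaded parts that all
// share a single CCValAssign.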
9012
9013 if (Attrs.hasStreamingCompatibleInterface()) {
9014 SDValue EntryPStateSM =
9015 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
9016 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
9017
9018 // Copy the value to a virtual register, and save that in FuncInfo.
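// The saved PSTATE.SM value is what the conditional SMSTART/SMSTOP nodes
// later read back (the non-Always path in changeStreamingMode), so it only
// needs to be computed once per function.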
9019 Register EntryPStateSMReg =
9020 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9021 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
9022 EntryPStateSM);
9023 FuncInfo->setPStateSMReg(EntryPStateSMReg);
9024 }
9025
9026 // Insert the SMSTART if this is a locally streaming function and
9027 // make sure it is Glued to the last CopyFromReg value.
9028 if (IsLocallyStreaming) {
9029 if (Attrs.hasStreamingCompatibleInterface())
9030 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
9031 AArch64SME::IfCallerIsNonStreaming);
9032 else
9033 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
9034 AArch64SME::Always);
9035
9036 // Ensure that the SMSTART happens after the CopyWithChain such that its
9037 // chain result is used.
9038 for (unsigned I=0; I<InVals.size(); ++I) {
9039 Register Reg = MF.getRegInfo().createVirtualRegister(
9040 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
9041 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
9042 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
9043 InVals[I].getValueType());
9044 }
9045 }
9046
9047 // varargs
9048 if (isVarArg) {
9050 if (!Subtarget->isTargetDarwin() || IsWin64) {
9051 // The AAPCS variadic function ABI is identical to the non-variadic
9052 // one. As a result there may be more arguments in registers and we
9053 // should save them for future reference.
9054 // Win64 variadic functions also pass arguments in registers, but all
9055 // float arguments are passed in integer registers.
9056 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
9057 }
9058
9059 // This will point to the next argument passed via stack.
9060 unsigned VarArgsOffset = CCInfo.getStackSize();
9061 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
9062 VarArgsOffset =
9063 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
9064 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
9065 FuncInfo->setVarArgsStackIndex(
9066 MFI.CreateFixedObject(4, VarArgsOffset, true));
9067 }
9068
9069 if (MFI.hasMustTailInVarArgFunc()) {
9070 SmallVector<MVT, 2> RegParmTypes;
9071 RegParmTypes.push_back(MVT::i64);
9072 RegParmTypes.push_back(MVT::f128);
9073 // Compute the set of forwarded registers. The rest are scratch.
9074 SmallVectorImpl<ForwardedRegister> &Forwards =
9075 FuncInfo->getForwardedMustTailRegParms();
9076 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
9077 CC_AArch64_AAPCS);
9078
9079 // Conservatively forward X8, since it might be used for aggregate return.
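// (X8 is the AAPCS64 indirect-result location register, so a musttail
// forwarder must keep it live even though it is not a normal argument
// register.)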
9080 if (!CCInfo.isAllocated(AArch64::X8)) {
9081 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
9082 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
9083 }
9084 }
9085 }
9086
9087 // On Windows, InReg pointers must be returned, so record the pointer in a
9088 // virtual register at the start of the function so it can be returned in the
9089 // epilogue.
9090 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
9091 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
9092 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
9093 Ins[I].Flags.isInReg()) &&
9094 Ins[I].Flags.isSRet()) {
9095 assert(!FuncInfo->getSRetReturnReg());
9096
9097 MVT PtrTy = getPointerTy(DAG.getDataLayout());
9098 Register Reg =
9099 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
9100 FuncInfo->setSRetReturnReg(Reg);
9101
9102 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
9103 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
9104 break;
9105 }
9106 }
9107 }
9108
9109 unsigned StackArgSize = CCInfo.getStackSize();
9110 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9111 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
9112 // This is a non-standard ABI so by fiat I say we're allowed to make full
9113 // use of the stack area to be popped, which must be aligned to 16 bytes in
9114 // any case:
9115 StackArgSize = alignTo(StackArgSize, 16);
9116
9117 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
9118 // a multiple of 16.
9119 FuncInfo->setArgumentStackToRestore(StackArgSize);
9120
9121 // This realignment carries over to the available bytes below. Our own
9122 // callers will guarantee the space is free by giving an aligned value to
9123 // CALLSEQ_START.
9124 }
9125 // Even if we're not expected to free up the space, it's useful to know how
9126 // much is there while considering tail calls (because we can reuse it).
9127 FuncInfo->setBytesInStackArgArea(StackArgSize);
9128
9129 if (Subtarget->hasCustomCallingConv())
9130 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
9131
9132 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
9133 SDValue Size;
9134 if (Attrs.hasZAState()) {
9135 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
9136 DAG.getConstant(1, DL, MVT::i32));
9137 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
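// RDSVL #1 returns the streaming vector length in bytes (SVL); ZA holds SVL
// rows of SVL bytes, so SVL * SVL bytes are needed to spill the whole array.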
9138 } else if (Attrs.hasAgnosticZAInterface()) {
9139 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
9140 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9141
9142 SDValue Callee =
9143 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
9144 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
9145 TargetLowering::CallLoweringInfo CLI(DAG);
9146 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
9147 DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy, Callee,
9148 {});
9149 std::tie(Size, Chain) = LowerCallTo(CLI);
9150 }
9151 if (Size) {
9152 SDValue Buffer = DAG.getNode(
9153 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
9154 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
9155 Chain = Buffer.getValue(1);
9156
9157 Register BufferPtr =
9158 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
9159 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
9160 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
9161 DAG.getVTList(MVT::Other), Chain);
9162 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
9163 MFI.CreateVariableSizedObject(Align(16), nullptr);
9164 }
9165 }
9166
9167 if (CallConv == CallingConv::PreserveNone) {
9168 for (const ISD::InputArg &I : Ins) {
9169 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
9170 I.Flags.isSwiftAsync()) {
9171 MachineFunction &MF = DAG.getMachineFunction();
9172 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9173 MF.getFunction(),
9174 "Swift attributes can't be used with preserve_none",
9175 DL.getDebugLoc()));
9176 break;
9177 }
9178 }
9179 }
9180
9181 return Chain;
9182}
9183
9184void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
9185 SelectionDAG &DAG,
9186 const SDLoc &DL,
9187 SDValue &Chain) const {
9188 MachineFunction &MF = DAG.getMachineFunction();
9189 MachineFrameInfo &MFI = MF.getFrameInfo();
9190 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9191 auto PtrVT = getPointerTy(DAG.getDataLayout());
9192 Function &F = MF.getFunction();
9193 bool IsWin64 =
9194 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
9195
9196 SmallVector<SDValue, 8> MemOps;
9197
9198 ArrayRef<MCPhysReg> GPRArgRegs = AArch64::getGPRArgRegs();
9199 unsigned NumGPRArgRegs = GPRArgRegs.size();
9200 if (Subtarget->isWindowsArm64EC()) {
9201 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
9202 // functions.
9203 NumGPRArgRegs = 4;
9204 }
9205 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
9206
9207 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
9208 int GPRIdx = 0;
9209 if (GPRSaveSize != 0) {
9210 if (IsWin64) {
9211 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
9212 if (GPRSaveSize & 15)
9213 // The extra size here, if triggered, will always be 8.
9214 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
9215 } else
9216 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
9217
9218 SDValue FIN;
9219 if (Subtarget->isWindowsArm64EC()) {
9220 // With the Arm64EC ABI, we reserve the save area as usual, but we
9221 // compute its address relative to x4. For a normal AArch64->AArch64
9222 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
9223 // different address.
9224 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
9225 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9226 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
9227 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
9228 } else {
9229 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
9230 }
9231
9232 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
9233 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
9234 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
9235 SDValue Store =
9236 DAG.getStore(Val.getValue(1), DL, Val, FIN,
9237 IsWin64 ? MachinePointerInfo::getFixedStack(
9238 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
9239 : MachinePointerInfo::getStack(MF, i * 8));
9240 MemOps.push_back(Store);
9241 FIN =
9242 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
9243 }
9244 }
9245 FuncInfo->setVarArgsGPRIndex(GPRIdx);
9246 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
9247
9248 if (Subtarget->hasFPARMv8() && !IsWin64) {
9249 ArrayRef<MCPhysReg> FPRArgRegs = AArch64::getFPRArgRegs();
9250 const unsigned NumFPRArgRegs = FPRArgRegs.size();
9251 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
9252
9253 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
9254 int FPRIdx = 0;
9255 if (FPRSaveSize != 0) {
9256 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
9257
9258 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
9259
9260 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
9261 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
9262 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
9263
9264 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
9265 MachinePointerInfo::getStack(MF, i * 16));
9266 MemOps.push_back(Store);
9267 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
9268 DAG.getConstant(16, DL, PtrVT));
9269 }
9270 }
9271 FuncInfo->setVarArgsFPRIndex(FPRIdx);
9272 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
9273 }
9274
9275 if (!MemOps.empty()) {
9276 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
9277 }
9278}
9279
9280/// LowerCallResult - Lower the result values of a call into the
9281/// appropriate copies out of appropriate physical registers.
9282SDValue AArch64TargetLowering::LowerCallResult(
9283 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
9284 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
9285 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
9286 SDValue ThisVal, bool RequiresSMChange) const {
9287 DenseMap<unsigned, SDValue> CopiedRegs;
9288 // Copy all of the result registers out of their specified physreg.
9289 for (unsigned i = 0; i != RVLocs.size(); ++i) {
9290 CCValAssign VA = RVLocs[i];
9291
9292 // Pass 'this' value directly from the argument to return value, to avoid
9293 // reg unit interference
9294 if (i == 0 && isThisReturn) {
9295 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
9296 "unexpected return calling convention register assignment");
9297 InVals.push_back(ThisVal);
9298 continue;
9299 }
9300
9301 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
9302 // allows one use of a physreg per block.
9303 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
9304 if (!Val) {
9305 Val =
9306 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
9307 Chain = Val.getValue(1);
9308 InGlue = Val.getValue(2);
9309 CopiedRegs[VA.getLocReg()] = Val;
9310 }
9311
9312 switch (VA.getLocInfo()) {
9313 default:
9314 llvm_unreachable("Unknown loc info!");
9315 case CCValAssign::Full:
9316 break;
9317 case CCValAssign::BCvt:
9318 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
9319 break;
9320 case CCValAssign::AExtUpper:
9321 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
9322 DAG.getConstant(32, DL, VA.getLocVT()));
9323 [[fallthrough]];
9324 case CCValAssign::AExt:
9325 [[fallthrough]];
9326 case CCValAssign::ZExt:
9327 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
9328 break;
9329 }
9330
9331 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
9332 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9333 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
9334
9335 InVals.push_back(Val);
9336 }
9337
9338 return Chain;
9339}
9340
9341/// Return true if the calling convention is one that we can guarantee TCO for.
9342static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
9343 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
9344 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
9345}
9346
9347/// Return true if we might ever do TCO for calls with this calling convention.
9348 static bool mayTailCallThisCC(CallingConv::ID CC) {
9349 switch (CC) {
9350 case CallingConv::C:
9351 case CallingConv::AArch64_SVE_VectorCall:
9352 case CallingConv::PreserveMost:
9353 case CallingConv::PreserveAll:
9354 case CallingConv::PreserveNone:
9355 case CallingConv::Swift:
9356 case CallingConv::SwiftTail:
9357 case CallingConv::Tail:
9358 case CallingConv::Fast:
9359 return true;
9360 default:
9361 return false;
9362 }
9363}
9364
9365/// Return true if the call convention supports varargs
9366/// Currently only those that pass varargs like the C
9367/// calling convention does are eligible
9368/// Calling conventions listed in this function must also
9369/// be properly handled in AArch64Subtarget::isCallingConvWin64
9370 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
9371 switch (CC) {
9372 case CallingConv::C:
9374 // SVE vector call is only partially supported, but it should
9375 // support named arguments being passed. Any arguments being passed
9376 // as varargs, are still unsupported.
9377 case CallingConv::AArch64_SVE_VectorCall:
9378 return true;
9379 default:
9380 return false;
9381 }
9382}
9383
9385 const AArch64Subtarget *Subtarget,
9387 CCState &CCInfo) {
9388 const SelectionDAG &DAG = CLI.DAG;
9389 CallingConv::ID CalleeCC = CLI.CallConv;
9390 bool IsVarArg = CLI.IsVarArg;
9391 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9392 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
9393
9394 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
9395 // for the shadow store.
9396 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
9397 CCInfo.AllocateStack(32, Align(16));
9398
9399 unsigned NumArgs = Outs.size();
9400 for (unsigned i = 0; i != NumArgs; ++i) {
9401 MVT ArgVT = Outs[i].VT;
9402 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
9403
9404 bool UseVarArgCC = false;
9405 if (IsVarArg) {
9406 // On Windows, the fixed arguments in a vararg call are passed in GPRs
9407 // too, so use the vararg CC to force them to integer registers.
9408 if (IsCalleeWin64) {
9409 UseVarArgCC = true;
9410 } else {
9411 UseVarArgCC = ArgFlags.isVarArg();
9412 }
9413 }
9414
9415 if (!UseVarArgCC) {
9416 // Get type of the original argument.
9417 EVT ActualVT =
9418 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
9419 /*AllowUnknown*/ true);
9420 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
9421 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
9422 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
9423 ArgVT = MVT::i8;
9424 else if (ActualMVT == MVT::i16)
9425 ArgVT = MVT::i16;
9426 }
9427
9428 // FIXME: CCAssignFnForCall should be called once, for the call and not per
9429 // argument. This logic should exactly mirror LowerFormalArguments.
9430 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
9431 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
9432 Outs[i].OrigTy, CCInfo);
9433 assert(!Res && "Call operand has unhandled type");
9434 (void)Res;
9435 }
9436}
9437
9438static SMECallAttrs
9439 getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
9440 const TargetLowering::CallLoweringInfo &CLI) {
9441 if (CLI.CB)
9442 return SMECallAttrs(*CLI.CB, &RTLCI);
9443 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9444 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
9446}
9447
9448bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9449 const CallLoweringInfo &CLI) const {
9450 CallingConv::ID CalleeCC = CLI.CallConv;
9451 if (!mayTailCallThisCC(CalleeCC))
9452 return false;
9453
9454 SDValue Callee = CLI.Callee;
9455 bool IsVarArg = CLI.IsVarArg;
9456 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9457 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9458 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9459 const SelectionDAG &DAG = CLI.DAG;
9460 MachineFunction &MF = DAG.getMachineFunction();
9461 const Function &CallerF = MF.getFunction();
9462 CallingConv::ID CallerCC = CallerF.getCallingConv();
9463
9464 // SME Streaming functions are not eligible for TCO as they may require
9465 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9466 SMECallAttrs CallAttrs =
9467 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9468 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9469 CallAttrs.requiresPreservingAllZAState() ||
9470 CallAttrs.requiresPreservingZT0() ||
9471 CallAttrs.caller().hasStreamingBody() || CallAttrs.caller().isNewZA() ||
9472 CallAttrs.caller().isNewZT0())
9473 return false;
9474
9475 // Functions using the C or Fast calling convention that have an SVE signature
9476 // preserve more registers and should assume the SVE_VectorCall CC.
9477 // The check for matching callee-saved regs will determine whether it is
9478 // eligible for TCO.
9479 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9480 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9481 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9482
9483 bool CCMatch = CallerCC == CalleeCC;
9484
9485 // When using the Windows calling convention on a non-windows OS, we want
9486 // to back up and restore X18 in such functions; we can't do a tail call
9487 // from those functions.
9488 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9489 CalleeCC != CallingConv::Win64)
9490 return false;
9491
9492 // Byval parameters hand the function a pointer directly into the stack area
9493 // we want to reuse during a tail call. Working around this *is* possible (see
9494 // X86) but less efficient and uglier in LowerCall.
9495 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9496 e = CallerF.arg_end();
9497 i != e; ++i) {
9498 if (i->hasByValAttr())
9499 return false;
9500
9501 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9502 // In this case, it is necessary to save X0/X1 in the callee and return it
9503 // in X0. Tail call opt may interfere with this, so we disable tail call
9504 // opt when the caller has an "inreg" attribute -- except if the callee
9505 // also has that attribute on the same argument, and the same value is
9506 // passed.
9507 if (i->hasInRegAttr()) {
9508 unsigned ArgIdx = i - CallerF.arg_begin();
9509 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9510 return false;
9511 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9512 if (!Attrs.hasAttribute(Attribute::InReg) ||
9513 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9514 CLI.CB->getArgOperand(ArgIdx) != i) {
9515 return false;
9516 }
9517 }
9518 }
9519
9520 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9521 return CCMatch;
9522
9523 // Externally-defined functions with weak linkage should not be
9524 // tail-called on AArch64 when the OS does not support dynamic
9525 // pre-emption of symbols, as the AAELF spec requires normal calls
9526 // to undefined weak functions to be replaced with a NOP or jump to the
9527 // next instruction. The behaviour of branch instructions in this
9528 // situation (as used for tail calls) is implementation-defined, so we
9529 // cannot rely on the linker replacing the tail call with a return.
9530 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9531 const GlobalValue *GV = G->getGlobal();
9532 const Triple &TT = getTargetMachine().getTargetTriple();
9533 if (GV->hasExternalWeakLinkage() &&
9534 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9535 return false;
9536 }
9537
9538 // Now we search for cases where we can use a tail call without changing the
9539 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9540 // concept.
9541
9542 // I want anyone implementing a new calling convention to think long and hard
9543 // about this assert.
9544 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9545 report_fatal_error("Unsupported variadic calling convention");
9546
9547 LLVMContext &C = *DAG.getContext();
9548 // Check that the call results are passed in the same way.
9549 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9550 CCAssignFnForCall(CalleeCC, IsVarArg),
9551 CCAssignFnForCall(CallerCC, IsVarArg)))
9552 return false;
9553 // The callee has to preserve all registers the caller needs to preserve.
9554 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9555 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9556 if (!CCMatch) {
9557 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9558 if (Subtarget->hasCustomCallingConv()) {
9559 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9560 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9561 }
9562 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9563 return false;
9564 }
9565
9566 // Nothing more to check if the callee is taking no arguments
9567 if (Outs.empty())
9568 return true;
9569
9570 SmallVector<CCValAssign, 16> ArgLocs;
9571 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9572
9573 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9574
9575 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9576 // When we are musttail, additional checks have been done and we can safely ignore this check
9577 // At least two cases here: if caller is fastcc then we can't have any
9578 // memory arguments (we'd be expected to clean up the stack afterwards). If
9579 // caller is C then we could potentially use its argument area.
9580
9581 // FIXME: for now we take the most conservative of these in both cases:
9582 // disallow all variadic memory operands.
9583 for (const CCValAssign &ArgLoc : ArgLocs)
9584 if (!ArgLoc.isRegLoc())
9585 return false;
9586 }
9587
9588 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9589
9590 // If any of the arguments is passed indirectly, it must be SVE, so the
9591 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9592 // allocate space on the stack. That is why we determine this explicitly
9593 // here: if any argument is passed indirectly, the call cannot be a tailcall.
9594 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9595 assert((A.getLocInfo() != CCValAssign::Indirect ||
9596 A.getValVT().isScalableVector() ||
9597 Subtarget->isWindowsArm64EC()) &&
9598 "Expected value to be scalable");
9599 return A.getLocInfo() == CCValAssign::Indirect;
9600 }))
9601 return false;
9602
9603 // If the stack arguments for this call do not fit into our own save area then
9604 // the call cannot be made tail.
9605 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9606 return false;
9607
9608 const MachineRegisterInfo &MRI = MF.getRegInfo();
9609 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9610 return false;
9611
9612 return true;
9613}
9614
9615SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9616 SelectionDAG &DAG,
9617 MachineFrameInfo &MFI,
9618 int ClobberedFI) const {
9619 SmallVector<SDValue, 8> ArgChains;
9620 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9621 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9622
9623 // Include the original chain at the beginning of the list. When this is
9624 // used by target LowerCall hooks, this helps legalize find the
9625 // CALLSEQ_BEGIN node.
9626 ArgChains.push_back(Chain);
9627
9628 // Add a chain value for each stack argument load that overlaps the clobbered slot.
9629 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9630 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9631 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9632 if (FI->getIndex() < 0) {
9633 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9634 int64_t InLastByte = InFirstByte;
9635 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9636
9637 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9638 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9639 ArgChains.push_back(SDValue(L, 1));
9640 }
9641
9642 // Build a tokenfactor for all the chains.
9643 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9644}
9645
9646bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9647 bool TailCallOpt) const {
9648 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9649 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9650}
9651
9652// Check if the value is zero-extended from i1 to i8
9653static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9654 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9655 if (SizeInBits < 8)
9656 return false;
9657
9658 APInt RequiredZero(SizeInBits, 0xFE);
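// 0xFE == 0b1111'1110: if bits [7:1] are known zero, the low byte can only
// be 0 or 1, i.e. the value already looks like an i1 zero-extended to i8.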
9659 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9660 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9661 return ZExtBool;
9662}
9663
9664void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9665 SDNode *Node) const {
9666 // Live-in physreg copies that are glued to SMSTART are applied as
9667 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9668 // register allocator to pass call args in callee saved regs, without extra
9669 // copies to avoid these fake clobbers of actually-preserved GPRs.
9670 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9671 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9672 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9673 if (MachineOperand &MO = MI.getOperand(I);
9674 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9675 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9676 AArch64::GPR64RegClass.contains(MO.getReg())))
9677 MI.removeOperand(I);
9678
9679 // The SVE vector length can change when entering/leaving streaming mode.
9680 // FPMR is set to 0 when entering/leaving streaming mode.
9681 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9682 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9683 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9684 /*IsImplicit=*/true));
9685 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9686 /*IsImplicit=*/true));
9687 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9688 /*IsImplicit=*/true));
9689 }
9690 }
9691
9692 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9693 // have nothing to do with VG, were it not that they are used to materialise a
9694 // frame-address. If they contain a frame-index to a scalable vector, this
9695 // will likely require an ADDVL instruction to materialise the address, thus
9696 // reading VG.
9697 const MachineFunction &MF = *MI.getMF();
9698 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9699 (MI.getOpcode() == AArch64::ADDXri ||
9700 MI.getOpcode() == AArch64::SUBXri)) {
9701 const MachineOperand &MO = MI.getOperand(1);
9702 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9703 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9704 /*IsImplicit=*/true));
9705 }
9706}
9707
9708 SDValue AArch64TargetLowering::changeStreamingMode(
9709 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9710 unsigned Condition, bool InsertVectorLengthCheck) const {
9711 MachineFunction &MF = DAG.getMachineFunction();
9712 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9713 FuncInfo->setHasStreamingModeChanges(true);
9714
9715 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9716 SmallVector<SDValue, 2> Ops = {Chain};
9717 if (InGlue)
9718 Ops.push_back(InGlue);
9719 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9720 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9721 };
9722
9723 if (InsertVectorLengthCheck && Enable) {
9724 // Non-streaming -> Streaming
9725 // Insert vector length check before smstart
9726 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9727 Chain = CheckVL.getValue(0);
9728 InGlue = CheckVL.getValue(1);
9729 }
9730
9731 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9732 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9733 SDValue MSROp =
9734 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9735 SmallVector<SDValue> Ops = {Chain, MSROp};
9736 unsigned Opcode;
9737 if (Condition != AArch64SME::Always) {
9738 Register PStateReg = FuncInfo->getPStateSMReg();
9739 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9740 SDValue PStateSM =
9741 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9742 // Use chain and glue from the CopyFromReg.
9743 Ops[0] = PStateSM.getValue(1);
9744 InGlue = PStateSM.getValue(2);
9745 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9746 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9747 Ops.push_back(ConditionOp);
9748 Ops.push_back(PStateSM);
9749 } else {
9750 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9751 }
9752 Ops.push_back(RegMask);
9753
9754 if (InGlue)
9755 Ops.push_back(InGlue);
9756
9757 SDValue SMChange =
9758 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9759
9760 if (!InsertVectorLengthCheck || Enable)
9761 return SMChange;
9762
9763 // Streaming -> Non-streaming
9764 // Insert vector length check after smstop since we cannot read VL
9765 // in streaming mode
9766 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9767}
9768
9769 static AArch64SME::ToggleCondition
9770 getSMToggleCondition(const SMECallAttrs &CallAttrs) {
9771 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9772 CallAttrs.caller().hasStreamingBody())
9773 return AArch64SME::Always;
9774 if (CallAttrs.callee().hasNonStreamingInterface())
9775 return AArch64SME::IfCallerIsStreaming;
9776 if (CallAttrs.callee().hasStreamingInterface())
9777 return AArch64SME::IfCallerIsNonStreaming;
9778
9779 llvm_unreachable("Unsupported attributes");
9780}
9781
9782/// Check whether a stack argument requires lowering in a tail call.
9783 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9784 const CCValAssign &VA, SDValue Arg,
9785 ISD::ArgFlagsTy Flags, int CallOffset) {
9786 // FIXME: We should be able to handle this case, but it's not clear how to.
9787 if (Flags.isZExt() || Flags.isSExt())
9788 return true;
9789
9790 for (;;) {
9791 // Look through nodes that don't alter the bits of the incoming value.
9792 unsigned Op = Arg.getOpcode();
9793 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9794 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9795 Arg = Arg.getOperand(0);
9796 continue;
9797 }
9798 break;
9799 }
9800
9801 // If the argument is a load from the same immutable stack slot, we can reuse
9802 // it.
9803 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9804 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9805 const MachineFrameInfo &MFI = MF.getFrameInfo();
9806 int FI = FINode->getIndex();
9807 if (!MFI.isImmutableObjectIndex(FI))
9808 return true;
9809 if (CallOffset != MFI.getObjectOffset(FI))
9810 return true;
9811 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9812 if (SizeInBits != VA.getValVT().getSizeInBits())
9813 return true;
9814 return false;
9815 }
9816 }
9817
9818 return true;
9819}
9820
9821/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9822/// and add input and output parameter nodes.
9823SDValue
9824AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9825 SmallVectorImpl<SDValue> &InVals) const {
9826 SelectionDAG &DAG = CLI.DAG;
9827 SDLoc &DL = CLI.DL;
9828 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9829 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9830 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9831 SDValue Chain = CLI.Chain;
9832 SDValue Callee = CLI.Callee;
9833 bool &IsTailCall = CLI.IsTailCall;
9834 CallingConv::ID &CallConv = CLI.CallConv;
9835 bool IsVarArg = CLI.IsVarArg;
9836 const CallBase *CB = CLI.CB;
9837
9838 MachineFunction &MF = DAG.getMachineFunction();
9839 MachineFunction::CallSiteInfo CSInfo;
9840 bool IsThisReturn = false;
9841
9842 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9843 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9844 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9845 bool IsSibCall = false;
9846 bool GuardWithBTI = false;
9847
9848 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9849 !Subtarget->noBTIAtReturnTwice()) {
9850 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9851 }
9852
9853 // Analyze operands of the call, assigning locations to each operand.
9854 SmallVector<CCValAssign, 16> ArgLocs;
9855 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9856
9857 if (IsVarArg) {
9858 unsigned NumArgs = Outs.size();
9859
9860 for (unsigned i = 0; i != NumArgs; ++i) {
9861 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9862 report_fatal_error("Passing SVE types to variadic functions is "
9863 "currently not supported");
9864 }
9865 }
9866
9867 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9868
9869 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9870 // Assign locations to each value returned by this call.
9871 SmallVector<CCValAssign, 16> RVLocs;
9872 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9873 *DAG.getContext());
9874 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9875
9876 // Set type id for call site info.
9877 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
9878
9879 // Check callee args/returns for SVE registers and set calling convention
9880 // accordingly.
9881 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9882 auto HasSVERegLoc = [](CCValAssign &Loc) {
9883 if (!Loc.isRegLoc())
9884 return false;
9885 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9886 AArch64::PPRRegClass.contains(Loc.getLocReg());
9887 };
9888 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9889 CallConv = CallingConv::AArch64_SVE_VectorCall;
9890 }
9891
9892 // Determine whether we need any streaming mode changes.
9893 SMECallAttrs CallAttrs =
9894 getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
9895
9896 std::optional<unsigned> ZAMarkerNode;
9897 if (CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState())
9898 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9899 else if (CallAttrs.requiresPreservingZT0())
9900 ZAMarkerNode = AArch64ISD::REQUIRES_ZT0_SAVE;
9901 else if (CallAttrs.caller().hasZAState() || CallAttrs.caller().hasZT0State())
9902 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
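// The marker chosen here encodes how the callee interacts with ZA/ZT0: a
// full ZA save, a ZT0-only save, or a plain use of live ZA state; later SME
// ABI lowering uses it to insert any required save/restore sequences.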
9903
9904 if (IsTailCall) {
9905 // Check if it's really possible to do a tail call.
9906 IsTailCall = isEligibleForTailCallOptimization(CLI);
9907
9908 // A sibling call is one where we're under the usual C ABI and not planning
9909 // to change that but can still do a tail call:
9910 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9911 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9912 IsSibCall = true;
9913
9914 if (IsTailCall)
9915 ++NumTailCalls;
9916 }
9917
9918 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9919 report_fatal_error("failed to perform tail call elimination on a call "
9920 "site marked musttail");
9921
9922 // Get a count of how many bytes are to be pushed on the stack.
9923 unsigned NumBytes = CCInfo.getStackSize();
9924
9925 if (IsSibCall) {
9926 // Since we're not changing the ABI to make this a tail call, the memory
9927 // operands are already available in the caller's incoming argument space.
9928 NumBytes = 0;
9929 }
9930
9931 // FPDiff is the byte offset of the call's argument area from the callee's.
9932 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9933 // by this amount for a tail call. In a sibling call it must be 0 because the
9934 // caller will deallocate the entire stack and the callee still expects its
9935 // arguments to begin at SP+0. Completely unused for non-tail calls.
9936 int FPDiff = 0;
9937
9938 if (IsTailCall && !IsSibCall) {
9939 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9940
9941 // Since callee will pop argument stack as a tail call, we must keep the
9942 // popped size 16-byte aligned.
9943 NumBytes = alignTo(NumBytes, 16);
9944
9945 // FPDiff will be negative if this tail call requires more space than we
9946 // would automatically have in our incoming argument space. Positive if we
9947 // can actually shrink the stack.
9948 FPDiff = NumReusableBytes - NumBytes;
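// For example, with 32 bytes of incoming stack arguments and a tail call
// needing 48 bytes, FPDiff is -16 and the extra 16 bytes are recorded via
// setTailCallReservedStack below.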
9949
9950 // Update the required reserved area if this is the tail call requiring the
9951 // most argument stack space.
9952 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9953 FuncInfo->setTailCallReservedStack(-FPDiff);
9954
9955 // The stack pointer must be 16-byte aligned at all times it's used for a
9956 // memory operation, which in practice means at *all* times and in
9957 // particular across call boundaries. Therefore our own arguments started at
9958 // a 16-byte aligned SP and the delta applied for the tail call should
9959 // satisfy the same constraint.
9960 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9961 }
9962
9963 auto DescribeCallsite =
9964 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9965 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9966 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9967 R << ore::NV("Callee", ES->getSymbol());
9968 else if (CLI.CB && CLI.CB->getCalledFunction())
9969 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9970 else
9971 R << "unknown callee";
9972 R << "'";
9973 return R;
9974 };
9975
9976 bool RequiresSMChange = CallAttrs.requiresSMChange();
9977 if (RequiresSMChange) {
9978 OptimizationRemarkEmitter ORE(&MF.getFunction());
9979 ORE.emit([&]() {
9980 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9981 CLI.CB)
9982 : OptimizationRemarkAnalysis("sme", "SMETransition",
9983 &MF.getFunction());
9984 DescribeCallsite(R) << " requires a streaming mode transition";
9985 return R;
9986 });
9987 }
9988
9989 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9990 // These operations are automatically eliminated by the prolog/epilog pass
9991 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9992 if (!IsSibCall) {
9993 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9994 if (ZAMarkerNode) {
9995 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9996 // using a chain can result in incorrect scheduling. The markers refer to
9997 // the position just before the CALLSEQ_START (though occur after as
9998 // CALLSEQ_START lacks in-glue).
9999 Chain =
10000 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
10001 {Chain, Chain.getValue(1)});
10002 }
10003 }
10004
10005 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
10006 getPointerTy(DAG.getDataLayout()));
10007
10008 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
10009 SmallSet<unsigned, 8> RegsUsed;
10010 SmallVector<SDValue, 8> MemOpChains;
10011 auto PtrVT = getPointerTy(DAG.getDataLayout());
10012
10013 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
10014 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
10015 for (const auto &F : Forwards) {
10016 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
10017 RegsToPass.emplace_back(F.PReg, Val);
10018 }
10019 }
10020
10021 // Walk the register/memloc assignments, inserting copies/loads.
10022 unsigned ExtraArgLocs = 0;
10023 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
10024 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
10025 SDValue Arg = OutVals[i];
10026 ISD::ArgFlagsTy Flags = Outs[i].Flags;
10027
10028 // Promote the value if needed.
10029 switch (VA.getLocInfo()) {
10030 default:
10031 llvm_unreachable("Unknown loc info!");
10032 case CCValAssign::Full:
10033 break;
10034 case CCValAssign::SExt:
10035 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
10036 break;
10037 case CCValAssign::ZExt:
10038 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10039 break;
10040 case CCValAssign::AExt:
10041 if (Outs[i].ArgVT == MVT::i1) {
10042 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
10043 //
10044 // Check if we actually have to do this, because the value may
10045 // already be zero-extended.
10046 //
10047 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
10048 // and rely on DAGCombiner to fold this, because the following
10049 // (anyext i32) is combined with (zext i8) in DAG.getNode:
10050 //
10051 // (ext (zext x)) -> (zext x)
10052 //
10053 // This will give us (zext i32), which we cannot remove, so
10054 // try to check this beforehand.
10055 if (!checkZExtBool(Arg, DAG)) {
10056 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10057 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
10058 }
10059 }
10060 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10061 break;
10062 case CCValAssign::AExtUpper:
10063 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10064 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
10065 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10066 DAG.getConstant(32, DL, VA.getLocVT()));
10067 break;
10068 case CCValAssign::BCvt:
10069 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
10070 break;
10071 case CCValAssign::Trunc:
10072 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10073 break;
10074 case CCValAssign::FPExt:
10075 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
10076 break;
10077 case CCValAssign::Indirect: {
10078 bool isScalable = VA.getValVT().isScalableVT();
10079 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
10080 "Indirect arguments should be scalable on most subtargets");
10081
10082 TypeSize StoreSize = VA.getValVT().getStoreSize();
10083 TypeSize PartSize = StoreSize;
10084 unsigned NumParts = 1;
10085 if (Outs[i].Flags.isInConsecutiveRegs()) {
10086 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
10087 ++NumParts;
10088 StoreSize *= NumParts;
10089 }
10090
10091 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
10092 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
10093 MachineFrameInfo &MFI = MF.getFrameInfo();
10094 int FI =
10095 MFI.CreateStackObject(StoreSize.getKnownMinValue(), Alignment, false);
10096 if (isScalable) {
10097 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
10098 VA.getValVT().getVectorElementType() == MVT::i1;
10099 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
10100 : TargetStackID::ScalableVector);
10101 }
10102
10103 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
10104 SDValue Ptr = DAG.getFrameIndex(
10105 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
10106 SDValue SpillSlot = Ptr;
10107
10108 // Ensure we generate all stores for each tuple part, whilst updating the
10109 // pointer after each store correctly using vscale.
10110 while (NumParts) {
10111 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
10112 MemOpChains.push_back(Store);
10113
10114 NumParts--;
10115 if (NumParts > 0) {
10116 SDValue BytesIncrement =
10117 DAG.getTypeSize(DL, Ptr.getValueType(), PartSize);
10118 MPI = MachinePointerInfo(MPI.getAddrSpace());
10119 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
10120 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
10121 ExtraArgLocs++;
10122 i++;
10123 }
10124 }
10125
10126 Arg = SpillSlot;
10127 break;
10128 }
10129
10130 if (VA.isRegLoc()) {
10131 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
10132 Outs[0].VT == MVT::i64) {
10133 assert(VA.getLocVT() == MVT::i64 &&
10134 "unexpected calling convention register assignment");
10135 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
10136 "unexpected use of 'returned'");
10137 IsThisReturn = true;
10138 }
10139 if (RegsUsed.count(VA.getLocReg())) {
10140 // If this register has already been used then we're trying to pack
10141 // parts of an [N x i32] into an X-register. The extension type will
10142 // take care of putting the two halves in the right place but we have to
10143 // combine them.
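// For instance, two i32 values sharing one X register arrive as a
// zero-extended low half and an AExtUpper half shifted into bits [63:32];
// the OR below reassembles the packed 64-bit value.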
10144 SDValue &Bits =
10145 llvm::find_if(RegsToPass,
10146 [=](const std::pair<unsigned, SDValue> &Elt) {
10147 return Elt.first == VA.getLocReg();
10148 })
10149 ->second;
10150 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10151 // Call site info is used for function's parameter entry value
10152 // tracking. For now we track only simple cases when parameter
10153 // is transferred through whole register.
10154 llvm::erase_if(CSInfo.ArgRegPairs,
10155 [&VA](MachineFunction::ArgRegPair ArgReg) {
10156 return ArgReg.Reg == VA.getLocReg();
10157 });
10158 } else {
10159 // Add an extra level of indirection for streaming mode changes by
10160 // using a pseudo copy node that cannot be rematerialised between a
10161 // smstart/smstop and the call by the simple register coalescer.
10162 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
10163 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10164 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
10165 RegsToPass.emplace_back(VA.getLocReg(), Arg);
10166 RegsUsed.insert(VA.getLocReg());
10167 const TargetOptions &Options = DAG.getTarget().Options;
10168 if (Options.EmitCallSiteInfo)
10169 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
10170 }
10171 } else {
10172 assert(VA.isMemLoc());
10173
10174 SDValue DstAddr;
10175 MachinePointerInfo DstInfo;
10176
10177 // FIXME: This works on big-endian for composite byvals, which are the
10178 // common case. It should also work for fundamental types.
10179 uint32_t BEAlign = 0;
10180 unsigned OpSize;
10181 if (VA.getLocInfo() == CCValAssign::Indirect ||
10182 VA.getValVT().isScalableVector())
10183 OpSize = VA.getLocVT().getFixedSizeInBits();
10184 else
10185 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
10186 : VA.getValVT().getSizeInBits();
10187 OpSize = (OpSize + 7) / 8;
10188 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
10189 !Flags.isInConsecutiveRegs()) {
10190 if (OpSize < 8)
10191 BEAlign = 8 - OpSize;
10192 }
10193 unsigned LocMemOffset = VA.getLocMemOffset();
10194 int32_t Offset = LocMemOffset + BEAlign;
10195
10196 if (IsTailCall) {
10197 // When the frame pointer is perfectly aligned for the tail call and the
10198 // same stack argument is passed down intact, we can reuse it.
10199 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
10200 continue;
10201
10202 Offset = Offset + FPDiff;
10203 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
10204
10205 DstAddr = DAG.getFrameIndex(FI, PtrVT);
10206 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
10207
10208 // Make sure any stack arguments overlapping with where we're storing
10209 // are loaded before this eventual operation. Otherwise they'll be
10210 // clobbered.
10211 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
10212 } else {
10213 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
10214
10215 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
10216 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
10217 }
10218
10219 if (Outs[i].Flags.isByVal()) {
10220 SDValue SizeNode =
10221 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
10222 SDValue Cpy = DAG.getMemcpy(
10223 Chain, DL, DstAddr, Arg, SizeNode,
10224 Outs[i].Flags.getNonZeroByValAlign(),
10225 /*isVol = */ false, /*AlwaysInline = */ false,
10226 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
10227
10228 MemOpChains.push_back(Cpy);
10229 } else {
10230 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
10231 // promoted to a legal register type i32, we should truncate Arg back to
10232 // i1/i8/i16.
10233 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
10234 VA.getValVT() == MVT::i16)
10235 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
10236
10237 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
10238 MemOpChains.push_back(Store);
10239 }
10240 }
10241 }
10242
10243 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
10244 !(CLI.CB && CLI.CB->isMustTailCall())) {
10245 SDValue ParamPtr = StackPtr;
10246 if (IsTailCall) {
10247 // Create a dummy object at the top of the stack that can be used to get
10248 // the SP after the epilogue
10249 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
10250 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
10251 }
10252
10253 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
10254 // describing the argument list. x4 contains the address of the
10255 // first stack parameter. x5 contains the size in bytes of all parameters
10256 // passed on the stack.
10257 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
10258 RegsToPass.emplace_back(AArch64::X5,
10259 DAG.getConstant(NumBytes, DL, MVT::i64));
10260 }
10261
10262 if (!MemOpChains.empty())
10263 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
10264
10265 SDValue InGlue;
10266 if (RequiresSMChange) {
10267 bool InsertVectorLengthCheck =
10269 Chain = changeStreamingMode(
10270 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
10271 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
10272 InGlue = Chain.getValue(1);
10273 }
10274
10275 // Build a sequence of copy-to-reg nodes chained together with token chain
10276 // and flag operands which copy the outgoing args into the appropriate regs.
10277 for (auto &RegToPass : RegsToPass) {
10278 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
10279 RegToPass.second, InGlue);
10280 InGlue = Chain.getValue(1);
10281 }
10282
10283 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
10284 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
10285 // node so that legalize doesn't hack it.
10286 const GlobalValue *CalledGlobal = nullptr;
10287 unsigned OpFlags = 0;
10288 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
10289 CalledGlobal = G->getGlobal();
10290 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
10291 getTargetMachine());
10292 if (OpFlags & AArch64II::MO_GOT) {
10293 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
10294 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10295 } else if (!CLI.PAI || !IsTailCall) {
10296 const GlobalValue *GV = G->getGlobal();
10297 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
10298 }
10299 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
10300 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
10301 Subtarget->isTargetMachO()) ||
10302 MF.getFunction().getParent()->getRtLibUseGOT();
10303 const char *Sym = S->getSymbol();
10304 if (UseGot) {
10305 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
10306 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
10307 } else {
10308 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
10309 }
10310 }
10311
10312 // We don't usually want to end the call-sequence here because we would tidy
10313 // the frame up *after* the call, however in the ABI-changing tail-call case
10314 // we've carefully laid out the parameters so that when sp is reset they'll be
10315 // in the correct location.
10316 if (IsTailCall && !IsSibCall) {
10317 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
10318 InGlue = Chain.getValue(1);
10319 }
10320
10321 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
10322
10323 std::vector<SDValue> Ops;
10324 Ops.push_back(Chain);
10325 Ops.push_back(Callee);
10326
10327 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
10328 // be expanded to the call, directly followed by a special marker sequence and
10329 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
10330 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
10331 assert(!IsTailCall &&
10332 "tail calls cannot be marked with clang.arc.attachedcall");
10333 Opc = AArch64ISD::CALL_RVMARKER;
10334
10335 // Add a target global address for the retainRV/claimRV runtime function
10336 // just before the call target.
10337 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
10338 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
10339 Ops.insert(Ops.begin() + 1, GA);
10340
10341 // We may or may not need to emit both the marker and the retain/claim call.
10342 // Tell the pseudo expansion using an additional boolean op.
10343 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
10344 SDValue DoEmitMarker =
10345 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
10346 Ops.insert(Ops.begin() + 2, DoEmitMarker);
10347 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10348 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
10349 } else if (GuardWithBTI) {
10350 Opc = AArch64ISD::CALL_BTI;
10351 }
10352
10353 if (IsTailCall) {
10354 // Each tail call may have to adjust the stack by a different amount, so
10355 // this information must travel along with the operation for eventual
10356 // consumption by emitEpilogue.
10357 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
10358 }
10359
10360 if (CLI.PAI) {
10361 const uint64_t Key = CLI.PAI->Key;
10362 assert((Key == AArch64PACKey::IA || Key == AArch64PACKey::IB) &&
10363 "Invalid auth call key");
10364
10365 // Split the discriminator into address/integer components.
10366 SDValue AddrDisc, IntDisc;
10367 std::tie(IntDisc, AddrDisc) =
10368 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
10369
10370 if (Opc == AArch64ISD::CALL_RVMARKER)
10371 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
10372 else
10373 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
10374 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
10375 Ops.push_back(IntDisc);
10376 Ops.push_back(AddrDisc);
10377 }
10378
10379 // Add argument registers to the end of the list so that they are known live
10380 // into the call.
10381 for (auto &RegToPass : RegsToPass)
10382 Ops.push_back(DAG.getRegister(RegToPass.first,
10383 RegToPass.second.getValueType()));
10384
10385 // Add a register mask operand representing the call-preserved registers.
10386 const uint32_t *Mask;
10387 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10388 if (IsThisReturn) {
10389 // For 'this' returns, use the X0-preserving mask if applicable
10390 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10391 if (!Mask) {
10392 IsThisReturn = false;
10393 Mask = TRI->getCallPreservedMask(MF, CallConv);
10394 }
10395 } else
10396 Mask = TRI->getCallPreservedMask(MF, CallConv);
10397
10398 if (Subtarget->hasCustomCallingConv())
10399 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10400
10401 if (TRI->isAnyArgRegReserved(MF))
10402 TRI->emitReservedArgRegCallError(MF);
10403
10404 assert(Mask && "Missing call preserved mask for calling convention");
10405 Ops.push_back(DAG.getRegisterMask(Mask));
10406
10407 if (InGlue.getNode())
10408 Ops.push_back(InGlue);
10409
10410 if (CLI.DeactivationSymbol)
10411 Ops.push_back(DAG.getDeactivationSymbol(CLI.DeactivationSymbol));
10412
10413 // If we're doing a tail call, use a TC_RETURN here rather than an
10414 // actual call instruction.
10415 if (IsTailCall) {
10416 MF.getFrameInfo().setHasTailCall();
10417 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10418 if (IsCFICall)
10419 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10420
10421 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10422 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10423 if (CalledGlobal &&
10424 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10425 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10426 return Ret;
10427 }
10428
10429 // Returns a chain and a flag for retval copy to use.
10430 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10431 if (IsCFICall)
10432 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10433
10434 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10435 InGlue = Chain.getValue(1);
10436 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10437 if (CalledGlobal &&
10438 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10439 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10440
10441 uint64_t CalleePopBytes =
10442 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10443
10444 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10445 InGlue = Chain.getValue(1);
10446
10447 // Handle result values, copying them out of physregs into vregs that we
10448 // return.
10449 SDValue Result = LowerCallResult(
10450 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10451 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10452
10453 if (!Ins.empty())
10454 InGlue = Result.getValue(Result->getNumValues() - 1);
10455
10456 if (RequiresSMChange) {
10457 Result = changeStreamingMode(
10458 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10459 getSMToggleCondition(CallAttrs));
10460 }
10461
10462 if (RequiresSMChange) {
10463 for (unsigned I = 0; I < InVals.size(); ++I) {
10464 // The smstart/smstop is chained as part of the call, but when the
10465 // resulting chain is discarded (which happens when the call is not part
10466 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10467 // smstart/smstop is chained to the result value. We can do that by doing
10468 // a vreg -> vreg copy.
10469 Register Reg = MF.getRegInfo().createVirtualRegister(
10470 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10471 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10472 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10473 InVals[I].getValueType());
10474 }
10475 }
10476
10477 if (CallConv == CallingConv::PreserveNone) {
10478 for (const ISD::OutputArg &O : Outs) {
10479 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10480 O.Flags.isSwiftAsync()) {
10481 MachineFunction &MF = DAG.getMachineFunction();
10482 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10483 MF.getFunction(),
10484 "Swift attributes can't be used with preserve_none",
10485 DL.getDebugLoc()));
10486 break;
10487 }
10488 }
10489 }
10490
10491 return Result;
10492}
10493
10494bool AArch64TargetLowering::CanLowerReturn(
10495 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10496 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10497 const Type *RetTy) const {
10498 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10499 SmallVector<CCValAssign, 16> RVLocs;
10500 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10501 return CCInfo.CheckReturn(Outs, RetCC);
10502}
10503
10504SDValue
10505AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10506 bool isVarArg,
10507 const SmallVectorImpl<ISD::OutputArg> &Outs,
10508 const SmallVectorImpl<SDValue> &OutVals,
10509 const SDLoc &DL, SelectionDAG &DAG) const {
10510 auto &MF = DAG.getMachineFunction();
10511 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10512
10513 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10514 SmallVector<CCValAssign, 16> RVLocs;
10515 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10516 CCInfo.AnalyzeReturn(Outs, RetCC);
10517
10518 // Copy the result values into the output registers.
10519 SDValue Glue;
10520 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10521 SmallSet<unsigned, 4> RegsUsed;
10522 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10523 ++i, ++realRVLocIdx) {
10524 CCValAssign &VA = RVLocs[i];
10525 assert(VA.isRegLoc() && "Can only return in registers!");
10526 SDValue Arg = OutVals[realRVLocIdx];
10527
10528 switch (VA.getLocInfo()) {
10529 default:
10530 llvm_unreachable("Unknown loc info!");
10531 case CCValAssign::Full:
10532 if (Outs[i].ArgVT == MVT::i1) {
10533 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10534 // value. This is strictly redundant on Darwin (which uses "zeroext
10535 // i1"), but will be optimised out before ISel.
10536 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10537 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10538 }
10539 break;
10540 case CCValAssign::BCvt:
10541 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10542 break;
10543 case CCValAssign::AExt:
10544 case CCValAssign::ZExt:
10545 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10546 break;
10547 case CCValAssign::AExtUpper:
10548 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10549 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10550 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10551 DAG.getConstant(32, DL, VA.getLocVT()));
10552 break;
10553 }
10554
10555 if (RegsUsed.count(VA.getLocReg())) {
10556 SDValue &Bits =
10557 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10558 return Elt.first == VA.getLocReg();
10559 })->second;
10560 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10561 } else {
10562 RetVals.emplace_back(VA.getLocReg(), Arg);
10563 RegsUsed.insert(VA.getLocReg());
10564 }
10565 }
10566
10567 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10568
10569 // Emit SMSTOP before returning from a locally streaming function
10570 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10571 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10572 if (FuncAttrs.hasStreamingCompatibleInterface())
10573 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10574 /*Glue*/ SDValue(),
10575 AArch64SME::IfCallerIsNonStreaming);
10576 else
10577 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10578 /*Glue*/ SDValue(), AArch64SME::Always);
10579 Glue = Chain.getValue(1);
10580 }
10581
10582 SmallVector<SDValue, 4> RetOps(1, Chain);
10583 for (auto &RetVal : RetVals) {
10584 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10585 isPassedInFPR(RetVal.second.getValueType()))
10586 RetVal.second =
10587 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10588 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10589 RetVal.second);
10590 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10591 Glue = Chain.getValue(1);
10592 RetOps.push_back(
10593 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10594 }
10595
10596 // Windows AArch64 ABIs require that for returning structs by value we copy
10597 // the sret argument into X0 for the return.
10598 // We saved the argument into a virtual register in the entry block,
10599 // so now we copy the value out and into X0.
10600 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10601 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10602 getPointerTy(MF.getDataLayout()));
10603
10604 unsigned RetValReg = AArch64::X0;
10605 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10606 RetValReg = AArch64::X8;
10607 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10608 Glue = Chain.getValue(1);
10609
10610 RetOps.push_back(
10611 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10612 }
10613
10614 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10615 if (I) {
10616 for (; *I; ++I) {
10617 if (AArch64::GPR64RegClass.contains(*I))
10618 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10619 else if (AArch64::FPR64RegClass.contains(*I))
10620 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10621 else
10622 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10623 }
10624 }
10625
10626 RetOps[0] = Chain; // Update chain.
10627
10628 // Add the glue if we have it.
10629 if (Glue.getNode())
10630 RetOps.push_back(Glue);
10631
10632 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10633 // ARM64EC entry thunks use a special return sequence: instead of a regular
10634 // "ret" instruction, they need to explicitly call the emulator.
10635 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10636 SDValue Arm64ECRetDest =
10637 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10638 Arm64ECRetDest =
10639 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10640 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10641 MachinePointerInfo());
10642 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10643 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10644 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10645 }
10646
10647 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10648}
10649
10650//===----------------------------------------------------------------------===//
10651// Other Lowering Code
10652//===----------------------------------------------------------------------===//
10653
10654SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10655 SelectionDAG &DAG,
10656 unsigned Flag) const {
10657 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10658 N->getOffset(), Flag);
10659}
10660
10661SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10662 SelectionDAG &DAG,
10663 unsigned Flag) const {
10664 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10665}
10666
10667SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10668 SelectionDAG &DAG,
10669 unsigned Flag) const {
10670 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10671 N->getOffset(), Flag);
10672}
10673
10674SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
10675 SelectionDAG &DAG,
10676 unsigned Flag) const {
10677 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10678}
10679
10680SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10681 SelectionDAG &DAG,
10682 unsigned Flag) const {
10683 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10684}
10685
10686// (loadGOT sym)
10687template <class NodeTy>
10688SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10689 unsigned Flags) const {
10690 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10691 SDLoc DL(N);
10692 EVT Ty = getPointerTy(DAG.getDataLayout());
10693 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10694 // FIXME: Once remat is capable of dealing with instructions with register
10695 // operands, expand this into two nodes instead of using a wrapper node.
10696 if (DAG.getMachineFunction()
10697 .getInfo<AArch64FunctionInfo>()
10698 ->hasELFSignedGOT())
10699 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10700 0);
10701 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10702}
10703
10704// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10705template <class NodeTy>
10706SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10707 unsigned Flags) const {
10708 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10709 SDLoc DL(N);
10710 EVT Ty = getPointerTy(DAG.getDataLayout());
10711 const unsigned char MO_NC = AArch64II::MO_NC;
10712 return DAG.getNode(
10713 AArch64ISD::WrapperLarge, DL, Ty,
10714 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10715 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10716 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10717 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10718}
10719
10720// (addlow (adrp %hi(sym)) %lo(sym))
10721template <class NodeTy>
10722SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10723 unsigned Flags) const {
10724 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10725 SDLoc DL(N);
10726 EVT Ty = getPointerTy(DAG.getDataLayout());
10727 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10728 SDValue Lo = getTargetNode(N, Ty, DAG,
10729 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10730 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10731 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10732}
10733
10734// (adr sym)
10735template <class NodeTy>
10736SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10737 unsigned Flags) const {
10738 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10739 SDLoc DL(N);
10740 EVT Ty = getPointerTy(DAG.getDataLayout());
10741 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10742 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10743}
10744
10745SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10746 SelectionDAG &DAG) const {
10747 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10748 const GlobalValue *GV = GN->getGlobal();
10749 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10750
10751 if (OpFlags != AArch64II::MO_NO_FLAG)
10752 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
10753 "unexpected offset in global node");
10754
10755 // This also catches the large code model case for Darwin, and tiny code
10756 // model with got relocations.
10757 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10758 return getGOT(GN, DAG, OpFlags);
10759 }
10760
10761 SDValue Result;
10762 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10763 !getTargetMachine().isPositionIndependent()) {
10764 Result = getAddrLarge(GN, DAG, OpFlags);
10765 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10766 Result = getAddrTiny(GN, DAG, OpFlags);
10767 } else {
10768 Result = getAddr(GN, DAG, OpFlags);
10769 }
10770 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10771 SDLoc DL(GN);
10772 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10773 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10774 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10775 return Result;
10776}
10777
10778/// Convert a TLS address reference into the correct sequence of loads
10779/// and calls to compute the variable's address (for Darwin, currently) and
10780/// return an SDValue containing the final node.
10781
10782/// Darwin only has one TLS scheme which must be capable of dealing with the
10783/// fully general situation, in the worst case. This means:
10784/// + "extern __thread" declaration.
10785/// + Defined in a possibly unknown dynamic library.
10786///
10787/// The general system is that each __thread variable has a [3 x i64] descriptor
10788/// which contains information used by the runtime to calculate the address. The
10789/// only part of this the compiler needs to know about is the first xword, which
10790/// contains a function pointer that must be called with the address of the
10791/// entire descriptor in "x0".
10792///
10793/// Since this descriptor may be in a different unit, in general even the
10794/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10795/// is:
10796/// adrp x0, _var@TLVPPAGE
10797/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10798/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10799/// ; the function pointer
10800/// blr x1 ; Uses descriptor address in x0
10801/// ; Address of _var is now in x0.
10802///
10803/// If the address of _var's descriptor *is* known to the linker, then it can
10804/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10805/// a slight efficiency gain.
10806SDValue
10807AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10808 SelectionDAG &DAG) const {
10809 assert(Subtarget->isTargetDarwin() &&
10810 "This function expects a Darwin target");
10811
10812 SDLoc DL(Op);
10813 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10814 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10815 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10816
10817 SDValue TLVPAddr =
10818 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10819 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10820
10821 // The first entry in the descriptor is a function pointer that we must call
10822 // to obtain the address of the variable.
10823 SDValue Chain = DAG.getEntryNode();
10824 SDValue FuncTLVGet = DAG.getLoad(
10825 PtrMemVT, DL, Chain, DescAddr,
10826 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10827 Align(PtrMemVT.getSizeInBits() / 8),
10828 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10829 Chain = FuncTLVGet.getValue(1);
10830
10831 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10832 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10833
10834 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10835 MFI.setAdjustsStack(true);
10836
10837 // TLS calls preserve all registers except those that absolutely must be
10838 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10839 // silly).
10840 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10841 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10842 if (Subtarget->hasCustomCallingConv())
10843 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10844
10845 // Finally, we can make the call. This is just a degenerate version of a
10846 // normal AArch64 call node: x0 takes the address of the descriptor, and
10847 // returns the address of the variable in this thread.
10848 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10849
10850 unsigned Opcode = AArch64ISD::CALL;
10851 SmallVector<SDValue, 8> Ops;
10852 Ops.push_back(Chain);
10853 Ops.push_back(FuncTLVGet);
10854
10855 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10856 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10857 Opcode = AArch64ISD::AUTH_CALL;
10858 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10859 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10860 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10861 }
10862
10863 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10864 Ops.push_back(DAG.getRegisterMask(Mask));
10865 Ops.push_back(Chain.getValue(1));
10866 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10867 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10868}
10869
10870/// Convert a thread-local variable reference into a sequence of instructions to
10871/// compute the variable's address for the local exec TLS model of ELF targets.
10872/// The sequence depends on the maximum TLS area size.
10873SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10874 SDValue ThreadBase,
10875 const SDLoc &DL,
10876 SelectionDAG &DAG) const {
10877 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10878 SDValue TPOff, Addr;
10879
10880 switch (DAG.getTarget().Options.TLSSize) {
10881 default:
10882 llvm_unreachable("Unexpected TLS size");
10883
10884 case 12: {
10885 // mrs x0, TPIDR_EL0
10886 // add x0, x0, :tprel_lo12:a
10887 SDValue Var = DAG.getTargetGlobalAddress(
10888 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10889 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10890 Var,
10891 DAG.getTargetConstant(0, DL, MVT::i32)),
10892 0);
10893 }
10894
10895 case 24: {
10896 // mrs x0, TPIDR_EL0
10897 // add x0, x0, :tprel_hi12:a
10898 // add x0, x0, :tprel_lo12_nc:a
10899 SDValue HiVar = DAG.getTargetGlobalAddress(
10900 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10901 SDValue LoVar = DAG.getTargetGlobalAddress(
10902 GV, DL, PtrVT, 0,
10903 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10904 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10905 HiVar,
10906 DAG.getTargetConstant(0, DL, MVT::i32)),
10907 0);
10908 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10909 LoVar,
10910 DAG.getTargetConstant(0, DL, MVT::i32)),
10911 0);
10912 }
10913
10914 case 32: {
10915 // mrs x1, TPIDR_EL0
10916 // movz x0, #:tprel_g1:a
10917 // movk x0, #:tprel_g0_nc:a
10918 // add x0, x1, x0
10919 SDValue HiVar = DAG.getTargetGlobalAddress(
10920 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10921 SDValue LoVar = DAG.getTargetGlobalAddress(
10922 GV, DL, PtrVT, 0,
10923 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10924 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10925 DAG.getTargetConstant(16, DL, MVT::i32)),
10926 0);
10927 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10928 DAG.getTargetConstant(0, DL, MVT::i32)),
10929 0);
10930 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10931 }
10932
10933 case 48: {
10934 // mrs x1, TPIDR_EL0
10935 // movz x0, #:tprel_g2:a
10936 // movk x0, #:tprel_g1_nc:a
10937 // movk x0, #:tprel_g0_nc:a
10938 // add x0, x1, x0
10939 SDValue HiVar = DAG.getTargetGlobalAddress(
10940 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10941 SDValue MiVar = DAG.getTargetGlobalAddress(
10942 GV, DL, PtrVT, 0,
10943 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10944 SDValue LoVar = DAG.getTargetGlobalAddress(
10945 GV, DL, PtrVT, 0,
10946 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10947 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10948 DAG.getTargetConstant(32, DL, MVT::i32)),
10949 0);
10950 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10951 DAG.getTargetConstant(16, DL, MVT::i32)),
10952 0);
10953 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10954 DAG.getTargetConstant(0, DL, MVT::i32)),
10955 0);
10956 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10957 }
10958 }
10959}
10960
10961/// When accessing thread-local variables under either the general-dynamic or
10962/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10963/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10964/// is a function pointer to carry out the resolution.
10965///
10966/// The sequence is:
10967/// adrp x0, :tlsdesc:var
10968/// ldr x1, [x0, #:tlsdesc_lo12:var]
10969/// add x0, x0, #:tlsdesc_lo12:var
10970/// .tlsdesccall var
10971/// blr x1
10972/// (TPIDR_EL0 offset now in x0)
10973///
10974/// The above sequence must be produced unscheduled, to enable the linker to
10975/// optimize/relax this sequence.
10976/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10977/// above sequence, and expanded really late in the compilation flow, to ensure
10978/// the sequence is produced as per above.
10979SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10980 const SDLoc &DL,
10981 SelectionDAG &DAG) const {
10982 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10983 auto &MF = DAG.getMachineFunction();
10984 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10985
10986 SDValue Glue;
10987 SDValue Chain = DAG.getEntryNode();
10988 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10989
10990 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10991 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10992
10993 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10994 return {Chain, Chain.getValue(1)};
10995 };
10996
10997 if (RequiresSMChange)
10998 std::tie(Chain, Glue) =
10999 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
11000 getSMToggleCondition(TLSCallAttrs)));
11001
11002 unsigned Opcode =
11003 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
11004 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
11005 : AArch64ISD::TLSDESC_CALLSEQ;
11006 SDValue Ops[] = {Chain, SymAddr, Glue};
11007 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11008 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
11009
11010 if (TLSCallAttrs.requiresLazySave())
11011 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
11012 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
11013
11014 if (RequiresSMChange)
11015 std::tie(Chain, Glue) =
11016 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
11017 getSMToggleCondition(TLSCallAttrs)));
11018
11019 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
11020}
11021
11022SDValue
11023AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
11024 SelectionDAG &DAG) const {
11025 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
11026
11027 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11028 AArch64FunctionInfo *MFI =
11029 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
11030
11031 TLSModel::Model Model = MFI->hasELFSignedGOT()
11032 ? TLSModel::GeneralDynamic
11033 : getTargetMachine().getTLSModel(GA->getGlobal());
11034
11035 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
11036 if (Model == TLSModel::LocalDynamic)
11037 Model = TLSModel::GeneralDynamic;
11038 }
11039
11040 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
11041 Model != TLSModel::LocalExec)
11042 report_fatal_error("ELF TLS only supported in small memory model or "
11043 "in local exec TLS model");
11044 // Different choices can be made for the maximum size of the TLS area for a
11045 // module. For the small address model, the default TLS size is 16MiB and the
11046 // maximum TLS size is 4GiB.
11047 // FIXME: add tiny and large code model support for TLS access models other
11048 // than local exec. We currently generate the same code as small for tiny,
11049 // which may be larger than needed.
11050
11051 SDValue TPOff;
11052 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11053 SDLoc DL(Op);
11054 const GlobalValue *GV = GA->getGlobal();
11055
11056 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
11057
11058 if (Model == TLSModel::LocalExec) {
11059 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
11060 } else if (Model == TLSModel::InitialExec) {
11061 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11062 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
11063 } else if (Model == TLSModel::LocalDynamic) {
11064 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
11065 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
11066 // the beginning of the module's TLS region, followed by a DTPREL offset
11067 // calculation.
11068
11069 // These accesses will need deduplicating if there's more than one.
11070 MFI->incNumLocalDynamicTLSAccesses();
11071
11072 // The call needs a relocation too for linker relaxation. It doesn't make
11073 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11074 // the address.
11075 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
11076 AArch64II::MO_TLS);
11077
11078 // Now we can calculate the offset from TPIDR_EL0 to this module's
11079 // thread-local area.
11080 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11081
11082 // Now use :dtprel_whatever: operations to calculate this variable's offset
11083 // in its thread-storage area.
11084 SDValue HiVar = DAG.getTargetGlobalAddress(
11085 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11086 SDValue LoVar = DAG.getTargetGlobalAddress(
11087 GV, DL, MVT::i64, 0,
11088 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11089
11090 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
11091 DAG.getTargetConstant(0, DL, MVT::i32)),
11092 0);
11093 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
11094 DAG.getTargetConstant(0, DL, MVT::i32)),
11095 0);
11096 } else if (Model == TLSModel::GeneralDynamic) {
11097 // The call needs a relocation too for linker relaxation. It doesn't make
11098 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
11099 // the address.
11100 SDValue SymAddr =
11101 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
11102
11103 // Finally we can make a call to calculate the offset from tpidr_el0.
11104 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
11105 } else
11106 llvm_unreachable("Unsupported ELF TLS access model");
11107
11108 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
11109}
11110
11111SDValue
11112AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
11113 SelectionDAG &DAG) const {
11114 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
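// Roughly, the emitted sequence looks like the following (an illustrative
// sketch only: registers are chosen by the allocator and the relocation
// spellings here are an assumption, not taken from this code):
//   ldr  x8, [x18, #0x58]            // TEB->ThreadLocalStoragePointer
//   adrp x9, _tls_index
//   ldr  w9, [x9, :lo12:_tls_index]
//   ldr  x8, [x8, x9, lsl #3]        // this module's TLS block
//   add  x0, x8, :secrel_hi12:var
//   add  x0, x0, :secrel_lo12:var    // address of 'var'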
11115
11116 SDValue Chain = DAG.getEntryNode();
11117 EVT PtrVT = getPointerTy(DAG.getDataLayout());
11118 SDLoc DL(Op);
11119
11120 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
11121
11122 // Load the ThreadLocalStoragePointer from the TEB
11123 // A pointer to the TLS array is located at offset 0x58 from the TEB.
11124 SDValue TLSArray =
11125 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
11126 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
11127 Chain = TLSArray.getValue(1);
11128
11129 // Load the TLS index from the C runtime;
11130 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
11131 // This also does the same as LOADgot, but using a generic i32 load,
11132 // while LOADgot only loads i64.
11133 SDValue TLSIndexHi =
11134 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
11135 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
11136 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11137 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
11138 SDValue TLSIndex =
11139 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
11140 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
11141 Chain = TLSIndex.getValue(1);
11142
11143 // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
11144 // offset into the TLSArray.
11145 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
11146 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
11147 DAG.getConstant(3, DL, PtrVT));
11148 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
11149 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
11150 MachinePointerInfo());
11151 Chain = TLS.getValue(1);
11152
11153 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11154 const GlobalValue *GV = GA->getGlobal();
11155 SDValue TGAHi = DAG.getTargetGlobalAddress(
11156 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
11157 SDValue TGALo = DAG.getTargetGlobalAddress(
11158 GV, DL, PtrVT, 0,
11159 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
11160
11161 // Add the offset from the start of the .tls section (section base).
11162 SDValue Addr =
11163 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
11164 DAG.getTargetConstant(0, DL, MVT::i32)),
11165 0);
11166 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
11167 return Addr;
11168}
11169
11170SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
11171 SelectionDAG &DAG) const {
11172 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
11173 if (DAG.getTarget().useEmulatedTLS())
11174 return LowerToTLSEmulatedModel(GA, DAG);
11175
11176 if (Subtarget->isTargetDarwin())
11177 return LowerDarwinGlobalTLSAddress(Op, DAG);
11178 if (Subtarget->isTargetELF())
11179 return LowerELFGlobalTLSAddress(Op, DAG);
11180 if (Subtarget->isTargetWindows())
11181 return LowerWindowsGlobalTLSAddress(Op, DAG);
11182
11183 llvm_unreachable("Unexpected platform trying to use TLS");
11184}
11185
11186//===----------------------------------------------------------------------===//
11187// PtrAuthGlobalAddress lowering
11188//
11189// We have 3 lowering alternatives to choose from:
11190// - MOVaddrPAC: similar to MOVaddr, with added PAC.
11191// If the GV doesn't need a GOT load (i.e., is locally defined)
11192// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
11193//
11194// - LOADgotPAC: similar to LOADgot, with added PAC.
11195// If the GV needs a GOT load, materialize the pointer using the usual
11196// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
11197// section is assumed to be read-only (for example, via relro mechanism). See
11198// LowerMOVaddrPAC.
11199//
11200// - LOADauthptrstatic: similar to LOADgot, but use a
11201// special stub slot instead of a GOT slot.
11202// Load a signed pointer for symbol 'sym' from a stub slot named
11203// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
11204// resolving. This usually lowers to adrp+ldr, but also emits an entry into
11205// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
11206//
11207// All 3 are pseudos that are expand late to longer sequences: this lets us
11208// provide integrity guarantees on the to-be-signed intermediate values.
11209//
11210// LOADauthptrstatic is undesirable because it requires a large section filled
11211// with often similarly-signed pointers, making it a good harvesting target.
11212// Thus, it's only used for ptrauth references to extern_weak to avoid null
11213// checks.
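// As an illustrative sketch only (the authoritative expansions live in the
// pseudo-expansion pass and may differ in registers and details), MOVaddrPAC
// for a locally-defined "sym" signed with key IA and constant discriminator
// "disc" expands roughly to:
//   adrp x16, sym
//   add  x16, x16, :lo12:sym
//   mov  x17, #disc
//   pacia x16, x17
// whereas LOADgotPAC first loads the raw pointer from the (read-only) GOT:
//   adrp x16, :got:sym
//   ldr  x16, [x16, :got_lo12:sym]
//   mov  x17, #disc
//   pacia x16, x17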
11214
11215 static SDValue LowerPtrAuthGlobalAddressStatically(
11216 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
11217 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
11218 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
11219 assert(TGN->getGlobal()->hasExternalWeakLinkage());
11220
11221 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
11222 // offset alone as a pointer if the symbol wasn't available, which would
11223 // probably break null checks in users. Ptrauth complicates things further:
11224 // error out.
11225 if (TGN->getOffset() != 0)
11227 "unsupported non-zero offset in weak ptrauth global reference");
11228
11229 if (!isNullConstant(AddrDiscriminator))
11230 report_fatal_error("unsupported weak addr-div ptrauth global");
11231
11232 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11233 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
11234 {TGA, Key, Discriminator}),
11235 0);
11236}
11237
11238SDValue
11239AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
11240 SelectionDAG &DAG) const {
11241 SDValue Ptr = Op.getOperand(0);
11242 uint64_t KeyC = Op.getConstantOperandVal(1);
11243 SDValue AddrDiscriminator = Op.getOperand(2);
11244 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
11245 EVT VT = Op.getValueType();
11246 SDLoc DL(Op);
11247
11248 if (KeyC > AArch64PACKey::LAST)
11249 report_fatal_error("key in ptrauth global out of range [0, " +
11250 Twine((int)AArch64PACKey::LAST) + "]");
11251
11252 // Blend only works if the integer discriminator is 16-bit wide.
11253 if (!isUInt<16>(DiscriminatorC))
11255 "constant discriminator in ptrauth global out of range [0, 0xffff]");
11256
11257 // Choosing between 3 lowering alternatives is target-specific.
11258 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
11259 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
11260
11261 int64_t PtrOffsetC = 0;
11262 if (Ptr.getOpcode() == ISD::ADD) {
11263 PtrOffsetC = Ptr.getConstantOperandVal(1);
11264 Ptr = Ptr.getOperand(0);
11265 }
11266 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
11267 const GlobalValue *PtrGV = PtrN->getGlobal();
11268
11269 // Classify the reference to determine whether it needs a GOT load.
11270 const unsigned OpFlags =
11271 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
11272 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
11273 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
11274 "unsupported non-GOT op flags on ptrauth global reference");
11275
11276 // Fold any offset into the GV; our pseudos expect it there.
11277 PtrOffsetC += PtrN->getOffset();
11278 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
11279 /*TargetFlags=*/0);
11280 assert(PtrN->getTargetFlags() == 0 &&
11281 "unsupported target flags on ptrauth global");
11282
11283 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
11284 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
11285 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
11286 ? AddrDiscriminator
11287 : DAG.getRegister(AArch64::XZR, MVT::i64);
11288
11289 // No GOT load needed -> MOVaddrPAC
11290 if (!NeedsGOTLoad) {
11291 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
11292 return SDValue(
11293 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
11294 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11295 0);
11296 }
11297
11298 // GOT load -> LOADgotPAC
11299 // Note that we disallow extern_weak refs to avoid null checks later.
11300 if (!PtrGV->hasExternalWeakLinkage())
11301 return SDValue(
11302 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
11303 {TPtr, Key, TAddrDiscriminator, Discriminator}),
11304 0);
11305
11306 // extern_weak ref -> LOADauthptrstatic
11307 return LowerPtrAuthGlobalAddressStatically(
11308 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
11309 DAG);
11310}
11311
11312// Looks through \param Val to determine the bit that can be used to
11313// check the sign of the value. It returns the unextended value and
11314// the sign bit position.
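// For example, for (sign_extend_inreg X, i8) this returns {X, 7}, and for a
// plain i32 value it returns {Val, 31}.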
11315std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
11316 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
11317 return {Val.getOperand(0),
11318 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
11319 1};
11320
11321 if (Val.getOpcode() == ISD::SIGN_EXTEND)
11322 return {Val.getOperand(0),
11323 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
11324
11325 return {Val, Val.getValueSizeInBits() - 1};
11326}
11327
11328// Op is an SDValue that is being compared to 0. If the comparison is a bit
11329// test, optimize it to a TBZ or TBNZ.
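// For example, (brcond (seteq (and X, 8), 0), dest) becomes (TBZ X, #3, dest);
// the setne form uses TBNZ instead.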
11330 static SDValue optimizeBitTest(const SDLoc &DL, SDValue Op, SDValue Chain,
11331 SDValue Dest, unsigned Opcode,
11332 SelectionDAG &DAG) {
11333 if (Op.getOpcode() != ISD::AND)
11334 return SDValue();
11335
11336 // See if we can use a TBZ to fold in an AND as well.
11337 // TBZ has a smaller branch displacement than CBZ. If the offset is
11338 // out of bounds, a late MI-layer pass rewrites branches.
11339 // 403.gcc is an example that hits this case.
11340 if (isa<ConstantSDNode>(Op.getOperand(1)) &&
11341 isPowerOf2_64(Op.getConstantOperandVal(1))) {
11342 SDValue Test = Op.getOperand(0);
11343 uint64_t Mask = Op.getConstantOperandVal(1);
11344 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Test,
11345 DAG.getConstant(Log2_64(Mask), DL, MVT::i64), Dest);
11346 }
11347
11348 if (Op.getOperand(0).getOpcode() == ISD::SHL) {
11349 auto Op00 = Op.getOperand(0).getOperand(0);
11350 if (isa<ConstantSDNode>(Op00) && Op00->getAsZExtVal() == 1) {
11351 auto Shr = DAG.getNode(ISD::SRL, DL, Op00.getValueType(),
11352 Op.getOperand(1), Op.getOperand(0).getOperand(1));
11353 return DAG.getNode(Opcode, DL, MVT::Other, Chain, Shr,
11354 DAG.getConstant(0, DL, MVT::i64), Dest);
11355 }
11356 }
11357
11358 return SDValue();
11359}
11360
11361SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
11362 SDValue Chain = Op.getOperand(0);
11363 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
11364 SDValue LHS = Op.getOperand(2);
11365 SDValue RHS = Op.getOperand(3);
11366 SDValue Dest = Op.getOperand(4);
11367 SDLoc DL(Op);
11368
11369 MachineFunction &MF = DAG.getMachineFunction();
11370 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
11371 // will not be produced, as they are conditional branch instructions that do
11372 // not set flags.
11373 bool ProduceNonFlagSettingCondBr =
11374 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
11375
11376 // Handle f128 first, since lowering it will result in comparing the return
11377 // value of a libcall against zero, which is just what the rest of LowerBR_CC
11378 // is expecting to deal with.
11379 if (LHS.getValueType() == MVT::f128) {
11380 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11381
11382 // If softenSetCCOperands returned a scalar, we need to compare the result
11383 // against zero to select between true and false values.
11384 if (!RHS.getNode()) {
11385 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11386 CC = ISD::SETNE;
11387 }
11388 }
11389
11390 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
11391 // instruction.
11392 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
11393 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
11394 // Only lower legal XALUO ops.
11395 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
11396 return SDValue();
11397
11398 // The actual operation with overflow check.
11399 AArch64CC::CondCode OFCC;
11400 SDValue Value, Overflow;
11401 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
11402
11403 if (CC == ISD::SETNE)
11404 OFCC = getInvertedCondCode(OFCC);
11405 SDValue CCVal = getCondCode(DAG, OFCC);
11406
11407 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11408 Overflow);
11409 }
11410
11411 if (LHS.getValueType().isInteger()) {
11412 assert((LHS.getValueType() == RHS.getValueType()) &&
11413 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11414
11415 // Normalize (LHS CC 1) -> (LHS NewCC 0) when LHS is known to be 0 or 1.
11416 // This enables the CBZ/CBNZ matching below.
11417 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11418 if (RHSC && RHSC->getZExtValue() == 1 && ProduceNonFlagSettingCondBr &&
11419 // Don't do this when LHS is an overflow/carry result (resNo == 1)
11420 // because we can fold cset + cmp #1 + b.cc into a direct
11421 // flag-consuming branch, which CBZ/CBNZ would prevent.
11422 LHS.getResNo() == 0 &&
11423 // This is true only when we somehow know that it's either 0 or 1.
11424 DAG.computeKnownBits(LHS).getMaxValue().ule(1)) {
11425 // Output params unused; we only care whether it returns true.
11426 bool CanNegate, MustBeFirst, PreferFirst;
11427 // Also skip when LHS is a conjunction tree (AND/OR of SETCCs) --
11428 // emitConjunction will lower it as a CCMP chain, which is better
11429 // than materializing the boolean for CBZ.
11430 if (!canEmitConjunction(DAG, LHS, CanNegate, MustBeFirst, PreferFirst,
11431 false)) {
11432 ISD::CondCode NewCC = ISD::SETCC_INVALID;
11433 switch (CC) {
11434 // SETLT/SETGE are canonicalized away before reaching here, but
11435 // handle them defensively.
11436 case ISD::SETNE:
11437 case ISD::SETULT:
11438 case ISD::SETLT:
11439 NewCC = ISD::SETEQ;
11440 break;
11441 case ISD::SETEQ:
11442 case ISD::SETUGE:
11443 case ISD::SETGE:
11444 NewCC = ISD::SETNE;
11445 break;
11446 default:
11447 break;
11448 }
11449 if (NewCC != ISD::SETCC_INVALID) {
11450 CC = NewCC;
11451 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11452 RHSC = cast<ConstantSDNode>(RHS);
11453 }
11454 }
11455 }
11456
11457 // If the RHS of the comparison is zero, we can potentially fold this
11458 // to a specialized branch.
11459 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11460 if (CC == ISD::SETEQ) {
11461 if (SDValue Result =
11462 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBZ, DAG))
11463 return Result;
11464
11465 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11466 } else if (CC == ISD::SETNE) {
11467 if (SDValue Result =
11468 optimizeBitTest(DL, LHS, Chain, Dest, AArch64ISD::TBNZ, DAG))
11469 return Result;
11470
11471 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11472 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11473 // Don't combine AND since emitComparison converts the AND to an ANDS
11474 // (a.k.a. TST) and the test in the test bit and branch instruction
11475 // becomes redundant. This would also increase register pressure.
11476 uint64_t SignBitPos;
11477 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11478 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11479 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11480 }
11481 }
11482 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11483 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11484 // Don't combine AND since emitComparison converts the AND to an ANDS
11485 // (a.k.a. TST) and the test in the test bit and branch instruction
11486 // becomes redundant. This would also increase register pressure.
11487 uint64_t SignBitPos;
11488 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11489 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11490 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11491 }
11492
11493 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11494 // larger branch displacement but do prefer CB over cmp + br.
11495 if (Subtarget->hasCMPBR() &&
11497 ProduceNonFlagSettingCondBr) {
11498 SDValue Cond =
11500 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11501 Dest);
11502 }
11503
11504 SDValue CCVal;
11505 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11506 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11507 Cmp);
11508 }
11509
11510 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11511 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11512
11513 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11514 // clean. Some of them require two branches to implement.
11515 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11516 AArch64CC::CondCode CC1, CC2;
11517 changeFPCCToAArch64CC(CC, CC1, CC2);
11518 SDValue CC1Val = getCondCode(DAG, CC1);
11519 SDValue BR1 =
11520 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11521 if (CC2 != AArch64CC::AL) {
11522 SDValue CC2Val = getCondCode(DAG, CC2);
11523 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11524 Cmp);
11525 }
11526
11527 return BR1;
11528}
11529
11530SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11531 SelectionDAG &DAG) const {
11532 if (!Subtarget->isNeonAvailable() &&
11533 !Subtarget->useSVEForFixedLengthVectors())
11534 return SDValue();
11535
11536 EVT VT = Op.getValueType();
11537 EVT IntVT = VT.changeTypeToInteger();
11538 SDLoc DL(Op);
11539
11540 SDValue In1 = Op.getOperand(0);
11541 SDValue In2 = Op.getOperand(1);
11542 EVT SrcVT = In2.getValueType();
11543
11544 if (!SrcVT.bitsEq(VT))
11545 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11546
11547 if (VT.isScalableVector())
11548 IntVT =
11549 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11550
11551 if (VT.isFixedLengthVector() &&
11552 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11553 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11554
11555 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11556 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11557
11558 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11559 return convertFromScalableVector(DAG, VT, Res);
11560 }
11561
11562 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11563 // a SVE FCOPYSIGN.
11564 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11565 Subtarget->isSVEorStreamingSVEAvailable()) {
11566 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11567 return SDValue();
11568 EVT SVT = getPackedSVEVectorVT(VT);
11569
11570 SDValue Poison = DAG.getPOISON(SVT);
11571 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11572 SDValue Ins1 =
11573 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In1, Zero);
11574 SDValue Ins2 =
11575 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, Poison, In2, Zero);
11576 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11577 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS, Zero);
11578 }
11579
11580 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11581 if (VT.isScalableVector())
11582 return getSVESafeBitCast(VT, Op, DAG);
11583
11584 return DAG.getBitcast(VT, Op);
11585 };
11586
11587 SDValue VecVal1, VecVal2;
11588 EVT VecVT;
11589 auto SetVecVal = [&](int Idx = -1) {
11590 if (!VT.isVector()) {
11591 SDValue Poison = DAG.getPOISON(VecVT);
11592 VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In1);
11593 VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, Poison, In2);
11594 } else {
11595 VecVal1 = BitCast(VecVT, In1, DAG);
11596 VecVal2 = BitCast(VecVT, In2, DAG);
11597 }
11598 };
11599 if (VT.isVector()) {
11600 VecVT = IntVT;
11601 SetVecVal();
11602 } else if (VT == MVT::f64) {
11603 VecVT = MVT::v2i64;
11604 SetVecVal(AArch64::dsub);
11605 } else if (VT == MVT::f32) {
11606 VecVT = MVT::v4i32;
11607 SetVecVal(AArch64::ssub);
11608 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11609 VecVT = MVT::v8i16;
11610 SetVecVal(AArch64::hsub);
11611 } else {
11612 llvm_unreachable("Invalid type for copysign!");
11613 }
11614
11615 unsigned BitWidth = In1.getScalarValueSizeInBits();
11616 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11617
11618 // We want to materialize a mask with every bit but the high bit set, but the
11619 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11620 // 64-bit elements. Instead, materialize all bits set and then negate that.
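// For example, for v2i64 the mask is built roughly as (illustrative sketch):
//   movi v1.2d, #0xffffffffffffffff   // all bits set
//   fneg v1.2d, v1.2d                 // flips only the sign bit of each lane
// leaving 0x7fffffffffffffff in every lane.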
11621 if (VT == MVT::f64 || VT == MVT::v2f64) {
11622 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11623 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11624 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11625 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11626 }
11627
11628 SDValue BSP =
11629 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11630 if (VT == MVT::f16 || VT == MVT::bf16)
11631 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11632 if (VT == MVT::f32)
11633 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11634 if (VT == MVT::f64)
11635 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11636
11637 return BitCast(VT, BSP, DAG);
11638}
11639
11640SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11641 SelectionDAG &DAG) const {
11642 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11643 Attribute::NoImplicitFloat))
11644 return SDValue();
11645
11646 EVT VT = Op.getValueType();
11647 if (VT.isScalableVector() ||
11648 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11649 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11650
11651 bool IsParity = Op.getOpcode() == ISD::PARITY;
11652 SDValue Val = Op.getOperand(0);
11653 SDLoc DL(Op);
11654
11655 // for i32, general parity function using EORs is more efficient compared to
11656 // using floating point
11657 if (VT == MVT::i32 && IsParity)
11658 return SDValue();
11659
11660 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11661 if (VT == MVT::i32 || VT == MVT::i64) {
11662 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11663 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11664 DAG.getPOISON(ContainerVT), Val,
11665 DAG.getVectorIdxConstant(0, DL));
11666 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11667 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11668 DAG.getVectorIdxConstant(0, DL));
11669 if (IsParity)
11670 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11671 return Val;
11672 }
11673
11674 if (VT == MVT::i128) {
11675 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11676 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11677 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11678 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11679 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11680 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11681 if (IsParity)
11682 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11683 return Val;
11684 }
11685 }
11686
11687 if (!Subtarget->isNeonAvailable())
11688 return SDValue();
11689
11690 // When no GPR CNT instruction is available, popcount can
11691 // be efficiently lowered to the following sequence that uses
11692 // AdvSIMD registers/instructions, as long as the copies to/from
11693 // the AdvSIMD registers are cheap.
11694 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11695 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11696 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11697 // FMOV X0, D0 // copy result back to integer reg
11698 if (VT == MVT::i32 || VT == MVT::i64) {
11699 if (VT == MVT::i32)
11700 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11701 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11702
11703 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11704 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11705 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11706 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11707 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11708 DAG.getConstant(0, DL, MVT::i64));
11709 if (IsParity)
11710 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11711 return AddV;
11712 } else if (VT == MVT::i128) {
11713 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11714
11715 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11716 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11717 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11718 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11719 DAG.getConstant(0, DL, MVT::i64));
11720 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11721 if (IsParity)
11722 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11723 return AddV;
11724 }
11725
11726 assert(!IsParity && "ISD::PARITY of vector types not supported");
11727
11728 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11729 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11730 "Unexpected type for custom ctpop lowering");
11731
11732 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11733 Val = DAG.getBitcast(VT8Bit, Val);
11734 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11735
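  // With the dot product instructions, sum the per-byte counts with a UDOT
  // against an all-ones vector instead of a chain of pairwise additions.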
11736 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11737 VT.getVectorNumElements() >= 2) {
11738 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11739 SDValue Zeros = DAG.getConstant(0, DL, DT);
11740 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11741
11742 if (VT == MVT::v2i64) {
11743 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11744 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11745 } else if (VT == MVT::v2i32) {
11746 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11747 } else if (VT == MVT::v4i32) {
11748 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11749 } else {
11750 llvm_unreachable("Unexpected type for custom ctpop lowering");
11751 }
11752
11753 return Val;
11754 }
11755
11756 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11757 unsigned EltSize = 8;
11758 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11759 while (EltSize != VT.getScalarSizeInBits()) {
11760 EltSize *= 2;
11761 NumElts /= 2;
11762 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11763 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11764 }
11765
11766 return Val;
11767}
11768
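// Lower CTTZ as CTLZ(BITREVERSE(x)): AArch64 provides RBIT and CLZ but has no
// native count-trailing-zeros instruction.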
11769SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11770 EVT VT = Op.getValueType();
11771 SDLoc DL(Op);
11772 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11773 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11774}
11775
11776SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11777 SelectionDAG &DAG) const {
11778
11779 EVT VT = Op.getValueType();
11780 SDLoc DL(Op);
11781 unsigned Opcode = Op.getOpcode();
11782 ISD::CondCode CC;
11783 switch (Opcode) {
11784 default:
11785 llvm_unreachable("Wrong instruction");
11786 case ISD::SMAX:
11787 CC = ISD::SETGT;
11788 break;
11789 case ISD::SMIN:
11790 CC = ISD::SETLT;
11791 break;
11792 case ISD::UMAX:
11793 CC = ISD::SETUGT;
11794 break;
11795 case ISD::UMIN:
11796 CC = ISD::SETULT;
11797 break;
11798 }
11799
11800 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11801 // prefer using SVE if available.
11802 if (VT.isScalableVector() ||
11803 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11804 switch (Opcode) {
11805 default:
11806 llvm_unreachable("Wrong instruction");
11807 case ISD::SMAX:
11808 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11809 case ISD::SMIN:
11810 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11811 case ISD::UMAX:
11812 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11813 case ISD::UMIN:
11814 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11815 }
11816 }
11817
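  // Otherwise expand to a compare and a select using the condition code chosen
  // above.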
11818 SDValue Op0 = Op.getOperand(0);
11819 SDValue Op1 = Op.getOperand(1);
11820 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11821 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11822}
11823
11824SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11825 SelectionDAG &DAG) const {
11826 EVT VT = Op.getValueType();
11827
11828 if (VT.isScalableVector() ||
11829 useSVEForFixedLengthVectorVT(
11830 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11831 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11832
11833 SDLoc DL(Op);
11834 SDValue REVB;
11835 MVT VST;
11836
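  // Reverse the bytes within each element using REV32/REV64, then bit-reverse
  // each byte with an i8-element BITREVERSE (RBIT).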
11837 switch (VT.getSimpleVT().SimpleTy) {
11838 default:
11839 llvm_unreachable("Invalid type for bitreverse!");
11840
11841 case MVT::v2i32: {
11842 VST = MVT::v8i8;
11843 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11844
11845 break;
11846 }
11847
11848 case MVT::v4i32: {
11849 VST = MVT::v16i8;
11850 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11851
11852 break;
11853 }
11854
11855 case MVT::v1i64: {
11856 VST = MVT::v8i8;
11857 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11858
11859 break;
11860 }
11861
11862 case MVT::v2i64: {
11863 VST = MVT::v16i8;
11864 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11865
11866 break;
11867 }
11868 }
11869
11870 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11871 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11872}
11873
11874// Check whether this is a continuous comparison sequence (a chain of ORs of XORs).
11875static bool
11876isOrXorChain(SDValue N, unsigned &Num,
11877 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11878 if (Num == MaxXors)
11879 return false;
11880
11881 // Skip the one-use zext
11882 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11883 N = N->getOperand(0);
11884
11885 // The leaf node must be XOR
11886 if (N->getOpcode() == ISD::XOR) {
11887 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11888 Num++;
11889 return true;
11890 }
11891
11892 // All the non-leaf nodes must be OR.
11893 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11894 return false;
11895
11896 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11897 isOrXorChain(N->getOperand(1), Num, WorkList))
11898 return true;
11899 return false;
11900}
11901
11902// Transform chains of ORs and XORs, which are usually outlined by memcmp/bcmp.
11903static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11904 SDValue LHS = N->getOperand(0);
11905 SDValue RHS = N->getOperand(1);
11906 SDLoc DL(N);
11907 EVT VT = N->getValueType(0);
11908 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11909
11910 // Only handle integer compares.
11911 if (N->getOpcode() != ISD::SETCC)
11912 return SDValue();
11913
11914 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11915 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11916 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11917 unsigned NumXors = 0;
11918 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11919 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11920 isOrXorChain(LHS, NumXors, WorkList)) {
11921 SDValue XOR0, XOR1;
11922 std::tie(XOR0, XOR1) = WorkList[0];
11923 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11924 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11925 for (unsigned I = 1; I < WorkList.size(); I++) {
11926 std::tie(XOR0, XOR1) = WorkList[I];
11927 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11928 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11929 }
11930
11931 // Exit early by inverting the condition, which helps reduce indentation.
11932 return Cmp;
11933 }
11934
11935 return SDValue();
11936}
11937
11938SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11939
11940 if (Op.getValueType().isVector())
11941 return LowerVSETCC(Op, DAG);
11942
11943 bool IsStrict = Op->isStrictFPOpcode();
11944 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11945 unsigned OpNo = IsStrict ? 1 : 0;
11946 SDValue Chain;
11947 if (IsStrict)
11948 Chain = Op.getOperand(0);
11949 SDValue LHS = Op.getOperand(OpNo + 0);
11950 SDValue RHS = Op.getOperand(OpNo + 1);
11951 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11952 SDLoc DL(Op);
11953
11954 // We chose ZeroOrOneBooleanContents, so use zero and one.
11955 EVT VT = Op.getValueType();
11956 SDValue TVal = DAG.getConstant(1, DL, VT);
11957 SDValue FVal = DAG.getConstant(0, DL, VT);
11958
11959 // Handle f128 first, since one possible outcome is a normal integer
11960 // comparison which gets picked up by the next if statement.
11961 if (LHS.getValueType() == MVT::f128) {
11962 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11963 IsSignaling);
11964
11965 // If softenSetCCOperands returned a scalar, use it.
11966 if (!RHS.getNode()) {
11967 assert(LHS.getValueType() == Op.getValueType() &&
11968 "Unexpected setcc expansion!");
11969 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11970 }
11971 }
11972
11973 if (LHS.getValueType().isInteger()) {
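    // With CSSC, (x != 0) can be computed as umin(x, 1) without using the
    // flags register.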
11974 if (Subtarget->hasCSSC() && CC == ISD::SETNE && isNullConstant(RHS)) {
11975 SDValue One = DAG.getConstant(1, DL, LHS.getValueType());
11976 SDValue UMin = DAG.getNode(ISD::UMIN, DL, LHS.getValueType(), LHS, One);
11977 SDValue Res = DAG.getZExtOrTrunc(UMin, DL, VT);
11978 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11979 }
11980 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11981
11982 SDValue CCVal;
11983 SDValue Cmp = getAArch64Cmp(
11984 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11985
11986 // Note that we inverted the condition above, so we reverse the order of
11987 // the true and false operands here. This will allow the setcc to be
11988 // matched to a single CSINC instruction.
11989 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11990 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11991 }
11992
11993 // Now we know we're dealing with FP values.
11994 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11995 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11996
11997 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11998 // and do the comparison.
11999 SDValue Cmp;
12000 if (IsStrict)
12001 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
12002 else
12003 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12004
12005 AArch64CC::CondCode CC1, CC2;
12006 changeFPCCToAArch64CC(CC, CC1, CC2);
12007 SDValue Res;
12008 if (CC2 == AArch64CC::AL) {
12009 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
12010 CC2);
12011 SDValue CC1Val = getCondCode(DAG, CC1);
12012
12013 // Note that we inverted the condition above, so we reverse the order of
12014 // the true and false operands here. This will allow the setcc to be
12015 // matched to a single CSINC instruction.
12016 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
12017 } else {
12018 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
12019 // totally clean. Some of them require two CSELs to implement. As in
12020 // this case, we emit the first CSEL and then emit a second using the output
12021 // of the first as the RHS. We're effectively OR'ing the two CC's together.
12022
12023 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
12024 SDValue CC1Val = getCondCode(DAG, CC1);
12025 SDValue CS1 =
12026 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12027
12028 SDValue CC2Val = getCondCode(DAG, CC2);
12029 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12030 }
12031 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
12032}
12033
12034SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
12035 SelectionDAG &DAG) const {
12036
12037 SDValue LHS = Op.getOperand(0);
12038 SDValue RHS = Op.getOperand(1);
12039 EVT VT = LHS.getValueType();
12040 if (VT != MVT::i32 && VT != MVT::i64)
12041 return SDValue();
12042
12043 SDLoc DL(Op);
12044 SDValue Carry = Op.getOperand(2);
12045 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
12046 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
12047 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
12048 LHS, RHS, InvCarry);
12049
12050 EVT OpVT = Op.getValueType();
12051 SDValue TVal = DAG.getConstant(1, DL, OpVT);
12052 SDValue FVal = DAG.getConstant(0, DL, OpVT);
12053
12054 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
12055 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
12056 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
12057 // Inputs are swapped because the condition is inverted. This will allow
12058 // matching with a single CSINC instruction.
12059 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
12060 Cmp.getValue(1));
12061}
12062
12063/// Emit vector comparison for floating-point values, producing a mask.
12064static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
12065 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12066 const SDLoc &DL, SelectionDAG &DAG) {
12067 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
12068 "function only supposed to emit natural comparisons");
12069
12070 switch (CC) {
12071 default:
12072 return SDValue();
12073 case AArch64CC::NE: {
12074 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12075 // Use vector semantics for the inversion to potentially save a copy between
12076 // SIMD and regular registers.
12077 if (!LHS.getValueType().isVector()) {
12078 EVT VecVT =
12079 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12080 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12081 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
12082 DAG.getPOISON(VecVT), Fcmeq, Zero);
12083 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
12084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
12085 }
12086 return DAG.getNOT(DL, Fcmeq, VT);
12087 }
12088 case AArch64CC::EQ:
12089 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
12090 case AArch64CC::GE:
12091 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
12092 case AArch64CC::GT:
12093 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
12094 case AArch64CC::LE:
12095 if (!NoNans)
12096 return SDValue();
12097 // If we ignore NaNs then we can use the LS implementation.
12098 [[fallthrough]];
12099 case AArch64CC::LS:
12100 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
12101 case AArch64CC::LT:
12102 if (!NoNans)
12103 return SDValue();
12104 // If we ignore NaNs then we can use the MI implementation.
12105 [[fallthrough]];
12106 case AArch64CC::MI:
12107 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
12108 }
12109}
12110
12111/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
12112/// values are scalars, try to emit a mask generating vector instruction.
12113static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
12114 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
12115 const SDLoc &DL, SelectionDAG &DAG) {
12116 assert(!LHS.getValueType().isVector());
12117 assert(!RHS.getValueType().isVector());
12118
12119 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
12120 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
12121 if (!CTVal || !CFVal)
12122 return {};
12123 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
12124 !(CTVal->isZero() && CFVal->isAllOnes()))
12125 return {};
12126
12127 if (CTVal->isZero())
12128 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12129
12130 EVT VT = TVal.getValueType();
12131 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
12132 return {};
12133
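  // For SETO/SETUO, if one operand is known not to be NaN the check reduces to
  // an ordered-equal or unordered-not-equal compare of the other operand with
  // itself.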
12134 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
12135 bool OneNaN = false;
12136 if (LHS == RHS) {
12137 OneNaN = true;
12138 } else if (DAG.isKnownNeverNaN(RHS)) {
12139 OneNaN = true;
12140 RHS = LHS;
12141 } else if (DAG.isKnownNeverNaN(LHS)) {
12142 OneNaN = true;
12143 LHS = RHS;
12144 }
12145 if (OneNaN)
12146 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
12147 }
12148
12149 AArch64CC::CondCode CC1;
12150 AArch64CC::CondCode CC2;
12151 bool ShouldInvert = false;
12152 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12153 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
12154 SDValue Cmp2;
12155 if (CC2 != AArch64CC::AL) {
12156 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
12157 if (!Cmp2)
12158 return {};
12159 }
12160 if (!Cmp2 && !ShouldInvert)
12161 return Cmp;
12162
12163 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
12164 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
12165 SDValue Poison = DAG.getPOISON(VecVT);
12166 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp, Zero);
12167 if (Cmp2) {
12168 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, Poison, Cmp2, Zero);
12169 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
12170 }
12171 if (ShouldInvert)
12172 Cmp = DAG.getNOT(DL, Cmp, VecVT);
12173 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
12174 return Cmp;
12175}
12176
12177SDValue AArch64TargetLowering::LowerSELECT_CC(
12178 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
12179 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
12180 const SDLoc &DL, SelectionDAG &DAG) const {
12181 // Handle f128 first, because it will result in a comparison of some RTLIB
12182 // call result against zero.
12183 if (LHS.getValueType() == MVT::f128) {
12184 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
12185
12186 // If softenSetCCOperands returned a scalar, we need to compare the result
12187 // against zero to select between true and false values.
12188 if (!RHS.getNode()) {
12189 RHS = DAG.getConstant(0, DL, LHS.getValueType());
12190 CC = ISD::SETNE;
12191 }
12192 }
12193
12194 // Also handle f16, for which we need to do a f32 comparison.
12195 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
12196 LHS.getValueType() == MVT::bf16) {
12197 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
12198 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
12199 }
12200
12201 // Next, handle integers.
12202 if (LHS.getValueType().isInteger()) {
12203 assert((LHS.getValueType() == RHS.getValueType()) &&
12204 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
12205
12206 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
12207 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
12208 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
12209
12210 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
12211 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
12212 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
12213 // Both require fewer instructions than compare and conditional select.
12214 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
12215 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
12216 LHS.getValueType() == RHS.getValueType()) {
12217 EVT VT = LHS.getValueType();
12218 SDValue Shift =
12219 DAG.getNode(ISD::SRA, DL, VT, LHS,
12220 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
12221
12222 if (CC == ISD::SETGT)
12223 Shift = DAG.getNOT(DL, Shift, VT);
12224
12225 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
12226 }
12227
12228 // Check for sign bit test patterns that can use TST optimization.
12229 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
12230 // -> TST %operand, sign_bit; CSEL
12231 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
12232 // -> TST %operand, sign_bit; CSEL
12233 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
12234 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
12235 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
12236
12237 uint64_t SignBitPos;
12238 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
12239 EVT TestVT = LHS.getValueType();
12240 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
12241 SDValue TST =
12242 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
12243 LHS, SignBitConst);
12244
12245 SDValue Flags = TST.getValue(1);
12246 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
12247 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
12248 }
12249
12250 // Canonicalise absolute difference patterns:
12251 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
12252 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
12253 //
12254 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
12255 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
12256 // The second forms can be matched into subs+cneg.
12257 // NOTE: Drop poison generating flags from the negated operand to avoid
12258 // inadvertently propagating poison after the canonicalisation.
12259 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
12260 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
12261 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
12263 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
12264 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
12265 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
12267 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
12268 }
12269 }
12270
12271 unsigned Opcode = AArch64ISD::CSEL;
12272
12273 // If both the TVal and the FVal are constants, see if we can swap them in
12274 // order to form a CSINV or CSINC out of them.
12275 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
12276 std::swap(TVal, FVal);
12277 std::swap(CTVal, CFVal);
12278 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12279 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
12280 std::swap(TVal, FVal);
12281 std::swap(CTVal, CFVal);
12282 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12283 } else if (TVal.getOpcode() == ISD::XOR) {
12284 // If TVal is a NOT we want to swap TVal and FVal so that we can match
12285 // with a CSINV rather than a CSEL.
12286 if (isAllOnesConstant(TVal.getOperand(1))) {
12287 std::swap(TVal, FVal);
12288 std::swap(CTVal, CFVal);
12289 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12290 }
12291 } else if (TVal.getOpcode() == ISD::SUB) {
12292 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
12293 // that we can match with a CSNEG rather than a CSEL.
12294 if (isNullConstant(TVal.getOperand(0))) {
12295 std::swap(TVal, FVal);
12296 std::swap(CTVal, CFVal);
12297 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12298 }
12299 } else if (CTVal && CFVal) {
12300 const int64_t TrueVal = CTVal->getSExtValue();
12301 const int64_t FalseVal = CFVal->getSExtValue();
12302 bool Swap = false;
12303
12304 // If both TVal and FVal are constants, see if FVal is the
12305 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
12306 // instead of a CSEL in that case.
12307 if (TrueVal == ~FalseVal) {
12308 Opcode = AArch64ISD::CSINV;
12309 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
12310 TrueVal == -FalseVal) {
12311 Opcode = AArch64ISD::CSNEG;
12312 } else if (TVal.getValueType() == MVT::i32) {
12313 // If our operands are only 32-bit wide, make sure we use 32-bit
12314 // arithmetic for the check whether we can use CSINC. This ensures that
12315 // the addition in the check will wrap around properly in case there is
12316 // an overflow (which would not be the case if we do the check with
12317 // 64-bit arithmetic).
12318 const uint32_t TrueVal32 = CTVal->getZExtValue();
12319 const uint32_t FalseVal32 = CFVal->getZExtValue();
12320
12321 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
12322 Opcode = AArch64ISD::CSINC;
12323
12324 if (TrueVal32 > FalseVal32) {
12325 Swap = true;
12326 }
12327 }
12328 } else {
12329 // 64-bit check whether we can use CSINC.
12330 const uint64_t TrueVal64 = TrueVal;
12331 const uint64_t FalseVal64 = FalseVal;
12332
12333 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
12334 Opcode = AArch64ISD::CSINC;
12335
12336 if (TrueVal > FalseVal) {
12337 Swap = true;
12338 }
12339 }
12340 }
12341
12342 // Swap TVal and FVal if necessary.
12343 if (Swap) {
12344 std::swap(TVal, FVal);
12345 std::swap(CTVal, CFVal);
12346 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
12347 }
12348
12349 if (Opcode != AArch64ISD::CSEL) {
12350 // Drop FVal since we can get its value by simply inverting/negating
12351 // TVal.
12352 FVal = TVal;
12353 }
12354 }
12355
12356 // Avoid materializing a constant when possible by reusing a known value in
12357 // a register. However, don't perform this optimization if the known value
12358 // is one, zero or negative one in the case of a CSEL. We can always
12359 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
12360 // FVal, respectively.
12361 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
12362 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
12363 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
12365 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
12366 // "a != C ? x : a" to avoid materializing C.
12367 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
12368 TVal = LHS;
12369 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
12370 FVal = LHS;
12371 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
12372 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
12373 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
12374 // avoid materializing C.
12376 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
12377 Opcode = AArch64ISD::CSINV;
12378 TVal = LHS;
12379 FVal = DAG.getConstant(0, DL, FVal.getValueType());
12380 }
12381 }
12382
12383 SDValue CCVal;
12384 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
12385 EVT VT = TVal.getValueType();
12386 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
12387 }
12388
12389 // Now we know we're dealing with FP values.
12390 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
12391 LHS.getValueType() == MVT::f64);
12392 assert(LHS.getValueType() == RHS.getValueType());
12393 EVT VT = TVal.getValueType();
12394
12395 // If the purpose of the comparison is to select between all ones
12396 // or all zeros, try to use a vector comparison because the operands are
12397 // already stored in SIMD registers.
12398 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
12399 switch (U->getOpcode()) {
12400 default:
12401 return false;
12404 case AArch64ISD::DUP:
12405 return true;
12406 }
12407 })) {
12408 bool NoNaNs = Flags.hasNoNaNs();
12409 SDValue VectorCmp =
12410 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
12411 if (VectorCmp)
12412 return VectorCmp;
12413 }
12414
12415 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
12416
12417 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12418 // clean. Some of them require two CSELs to implement.
12419 AArch64CC::CondCode CC1, CC2;
12420 changeFPCCToAArch64CC(CC, CC1, CC2);
12421
12422 if (Flags.hasNoSignedZeros()) {
12423 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12424 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12425 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12426 if (RHSVal && RHSVal->isZero()) {
12427 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12428 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12429
12430 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12431 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12432 TVal = LHS;
12433 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12434 CFVal && CFVal->isZero() &&
12435 FVal.getValueType() == LHS.getValueType())
12436 FVal = LHS;
12437 }
12438 }
12439
12440 // Emit first, and possibly only, CSEL.
12441 SDValue CC1Val = getCondCode(DAG, CC1);
12442 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12443
12444 // If we need a second CSEL, emit it, using the output of the first as the
12445 // RHS. We're effectively OR'ing the two CC's together.
12446 if (CC2 != AArch64CC::AL) {
12447 SDValue CC2Val = getCondCode(DAG, CC2);
12448 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12449 }
12450
12451 // Otherwise, return the output of the first CSEL.
12452 return CS1;
12453}
12454
12455SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12456 SelectionDAG &DAG) const {
12457 EVT Ty = Op.getValueType();
12458 if (!isa<ConstantSDNode>(Op.getOperand(2)))
12459 return SDValue();
12460 auto Idx = Op.getConstantOperandAPInt(2);
12461 int64_t IdxVal = Idx.getSExtValue();
12462 assert(Ty.isScalableVector() &&
12463 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12464
12465 // We can use the splice instruction for certain index values where we are
12466 // able to efficiently generate the correct predicate. The index will be
12467 // inverted and used directly as the input to the ptrue instruction, i.e.
12468 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12469 // splice predicate. However, we can only do this if we can guarantee that
12470 // there are enough elements in the vector, hence we check the index <= min
12471 // number of elements.
12472 std::optional<unsigned> PredPattern;
12473 if (Ty.isScalableVector() && Op.getOpcode() == ISD::VECTOR_SPLICE_RIGHT &&
12474 (PredPattern = getSVEPredPatternFromNumElements(IdxVal)) !=
12475 std::nullopt) {
12476 SDLoc DL(Op);
12477
12478 // Create a predicate where all but the last -IdxVal elements are false.
12479 EVT PredVT = Ty.changeVectorElementType(*DAG.getContext(), MVT::i1);
12480 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12481 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12482
12483 // Now splice the two inputs together using the predicate.
12484 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12485 Op.getOperand(1));
12486 }
12487
12488 // We can select to an EXT instruction when indexing the first 256 bytes.
12489 unsigned BlockSize = AArch64::SVEBitsPerBlock;
12490 if (Op.getOpcode() == ISD::VECTOR_SPLICE_LEFT &&
12491 (IdxVal * BlockSize / 8) < 256)
12492 return Op;
12493
12494 return SDValue();
12495}
12496
12497SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12498 SelectionDAG &DAG) const {
12499 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12500 SDValue LHS = Op.getOperand(0);
12501 SDValue RHS = Op.getOperand(1);
12502 SDValue TVal = Op.getOperand(2);
12503 SDValue FVal = Op.getOperand(3);
12504 SDNodeFlags Flags = Op->getFlags();
12505 SDLoc DL(Op);
12506 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12507}
12508
12509SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12510 SelectionDAG &DAG) const {
12511 SDValue CCVal = Op->getOperand(0);
12512 SDValue TVal = Op->getOperand(1);
12513 SDValue FVal = Op->getOperand(2);
12514 SDLoc DL(Op);
12515
12516 EVT Ty = Op.getValueType();
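  // aarch64svcount has no native SELECT: bitcast to the nxv16i1 predicate
  // type, select there, and bitcast the result back.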
12517 if (Ty == MVT::aarch64svcount) {
12518 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12519 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12520 SDValue Sel =
12521 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12522 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12523 }
12524
12525 if (Ty.isScalableVector()) {
12526 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12527 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12528 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12529 }
12530
12531 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12532 // FIXME: Ideally this would be the same as above using i1 types, however
12533 // for the moment we can't deal with fixed i1 vector types properly, so
12534 // instead extend the predicate to a result type sized integer vector.
12535 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12536 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12537 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12538 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12539 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12540 }
12541
12542 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12543 // instruction.
12544 if (ISD::isOverflowIntrOpRes(CCVal)) {
12545 // Only lower legal XALUO ops.
12546 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12547 return SDValue();
12548
12549 AArch64CC::CondCode OFCC;
12550 SDValue Value, Overflow;
12551 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12552 SDValue CCVal = getCondCode(DAG, OFCC);
12553
12554 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12555 CCVal, Overflow);
12556 }
12557
12558 // Lower it the same way as we would lower a SELECT_CC node.
12559 ISD::CondCode CC;
12560 SDValue LHS, RHS;
12561 if (CCVal.getOpcode() == ISD::SETCC) {
12562 LHS = CCVal.getOperand(0);
12563 RHS = CCVal.getOperand(1);
12564 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12565 } else {
12566 LHS = CCVal;
12567 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12568 CC = ISD::SETNE;
12569 }
12570
12571 // If we are lowering an f16 and we do not have FullFP16, convert to an f32 in
12572 // order to use FCSELSrrr
12573 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12574 SDValue Poison = DAG.getPOISON(MVT::f32);
12575 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, TVal);
12576 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32, Poison, FVal);
12577 }
12578
12579 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12580 Op->getFlags(), DL, DAG);
12581
12582 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12583 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12584 }
12585
12586 return Res;
12587}
12588
12589SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12590 SelectionDAG &DAG) const {
12591 // Jump table entries are PC-relative offsets. No additional tweaking
12592 // is necessary here. Just get the address of the jump table.
12593 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12594
12595 CodeModel::Model CM = getTargetMachine().getCodeModel();
12596 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12597 !Subtarget->isTargetMachO())
12598 return getAddrLarge(JT, DAG);
12599 if (CM == CodeModel::Tiny)
12600 return getAddrTiny(JT, DAG);
12601 return getAddr(JT, DAG);
12602}
12603
12604SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12605 SelectionDAG &DAG) const {
12606 // Jump table entries are PC-relative offsets. No additional tweaking
12607 // is necessary here. Just get the address of the jump table.
12608 SDLoc DL(Op);
12609 SDValue JT = Op.getOperand(1);
12610 SDValue Entry = Op.getOperand(2);
12611 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12612
12613 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12614 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12615
12616 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12617 // sequence later, to guarantee the integrity of the intermediate values.
12619 "aarch64-jump-table-hardening")) {
12620 CodeModel::Model CM = getTargetMachine().getCodeModel();
12621 if (Subtarget->isTargetMachO()) {
12622 if (CM != CodeModel::Small && CM != CodeModel::Large)
12623 report_fatal_error("Unsupported code-model for hardened jump-table");
12624 } else {
12625 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12626 assert(Subtarget->isTargetELF() &&
12627 "jump table hardening only supported on MachO/ELF");
12628 if (CM != CodeModel::Small)
12629 report_fatal_error("Unsupported code-model for hardened jump-table");
12630 }
12631
12632 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12633 Entry, SDValue());
12634 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12635 DAG.getTargetJumpTable(JTI, MVT::i32),
12636 X16Copy.getValue(0), X16Copy.getValue(1));
12637 return SDValue(B, 0);
12638 }
12639
12640 SDNode *Dest =
12641 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12642 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12643 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12644 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12645}
12646
12647SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12648 SDValue Chain = Op.getOperand(0);
12649 SDValue Dest = Op.getOperand(1);
12650
12651 // BR_JT is lowered to BRIND, but the later lowering is specific to indirectbr
12652 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12653 if (Dest->isMachineOpcode() &&
12654 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12655 return SDValue();
12656
12657 const MachineFunction &MF = DAG.getMachineFunction();
12658 std::optional<uint16_t> BADisc =
12659 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12660 if (!BADisc)
12661 return SDValue();
12662
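  // Emit an authenticating indirect branch (BRA pseudo) using the
  // block-address discriminator; no address discriminator is used (XZR).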
12663 SDLoc DL(Op);
12664
12665 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12666 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12667 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12668
12669 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12670 {Dest, Key, Disc, AddrDisc, Chain});
12671 return SDValue(BrA, 0);
12672}
12673
12674SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12675 SelectionDAG &DAG) const {
12676 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12677 CodeModel::Model CM = getTargetMachine().getCodeModel();
12678 if (CM == CodeModel::Large) {
12679 // Use the GOT for the large code model on iOS.
12680 if (Subtarget->isTargetMachO()) {
12681 return getGOT(CP, DAG);
12682 }
12684 return getAddrLarge(CP, DAG);
12685 } else if (CM == CodeModel::Tiny) {
12686 return getAddrTiny(CP, DAG);
12687 }
12688 return getAddr(CP, DAG);
12689}
12690
12691SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12692 SelectionDAG &DAG) const {
12693 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12694 const BlockAddress *BA = BAN->getBlockAddress();
12695
12696 if (std::optional<uint16_t> BADisc =
12697 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12698 *BA->getFunction())) {
12699 SDLoc DL(Op);
12700
12701 // This isn't cheap, but BRIND is rare.
12702 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12703
12704 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12705
12706 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12707 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12708
12709 SDNode *MOV =
12710 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12711 {TargetBA, Key, AddrDisc, Disc});
12712 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12713 SDValue(MOV, 1));
12714 }
12715
12716 CodeModel::Model CM = getTargetMachine().getCodeModel();
12717 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12719 return getAddrLarge(BAN, DAG);
12720 } else if (CM == CodeModel::Tiny) {
12721 return getAddrTiny(BAN, DAG);
12722 }
12723 return getAddr(BAN, DAG);
12724}
12725
12726SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12727 SelectionDAG &DAG) const {
12728 AArch64FunctionInfo *FuncInfo =
12729 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12730
12731 SDLoc DL(Op);
12732 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12733 getPointerTy(DAG.getDataLayout()));
12734 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12735 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12736 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12737 MachinePointerInfo(SV));
12738}
12739
12740SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12741 SelectionDAG &DAG) const {
12742 MachineFunction &MF = DAG.getMachineFunction();
12743 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12744
12745 SDLoc DL(Op);
12746 SDValue FR;
12747 if (Subtarget->isWindowsArm64EC()) {
12748 // With the Arm64EC ABI, we compute the address of the varargs save area
12749 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12750 // but calls from an entry thunk can pass in a different address.
12751 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12752 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12753 uint64_t StackOffset;
12754 if (FuncInfo->getVarArgsGPRSize() > 0)
12755 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12756 else
12757 StackOffset = FuncInfo->getVarArgsStackOffset();
12758 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12759 DAG.getConstant(StackOffset, DL, MVT::i64));
12760 } else {
12761 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12762 ? FuncInfo->getVarArgsGPRIndex()
12763 : FuncInfo->getVarArgsStackIndex(),
12764 getPointerTy(DAG.getDataLayout()));
12765 }
12766 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12767 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12768 MachinePointerInfo(SV));
12769}
12770
12771SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12772 SelectionDAG &DAG) const {
12773 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12774 // Standard, section B.3.
12775 MachineFunction &MF = DAG.getMachineFunction();
12776 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12777 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12778 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12779 auto PtrVT = getPointerTy(DAG.getDataLayout());
12780 SDLoc DL(Op);
12781
12782 SDValue Chain = Op.getOperand(0);
12783 SDValue VAList = Op.getOperand(1);
12784 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12785 SmallVector<SDValue, 4> MemOps;
12786
12787 // void *__stack at offset 0
12788 unsigned Offset = 0;
12789 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12790 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12791 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12792 MachinePointerInfo(SV), Align(PtrSize)));
12793
12794 // void *__gr_top at offset 8 (4 on ILP32)
12795 Offset += PtrSize;
12796 int GPRSize = FuncInfo->getVarArgsGPRSize();
12797 if (GPRSize > 0) {
12798 SDValue GRTop, GRTopAddr;
12799
12800 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12801 DAG.getConstant(Offset, DL, PtrVT));
12802
12803 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12804 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12805 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12806 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12807
12808 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12809 MachinePointerInfo(SV, Offset),
12810 Align(PtrSize)));
12811 }
12812
12813 // void *__vr_top at offset 16 (8 on ILP32)
12814 Offset += PtrSize;
12815 int FPRSize = FuncInfo->getVarArgsFPRSize();
12816 if (FPRSize > 0) {
12817 SDValue VRTop, VRTopAddr;
12818 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12819 DAG.getConstant(Offset, DL, PtrVT));
12820
12821 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12822 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12823 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12824 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12825
12826 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12827 MachinePointerInfo(SV, Offset),
12828 Align(PtrSize)));
12829 }
12830
12831 // int __gr_offs at offset 24 (12 on ILP32)
12832 Offset += PtrSize;
12833 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12834 DAG.getConstant(Offset, DL, PtrVT));
12835 MemOps.push_back(
12836 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12837 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12838
12839 // int __vr_offs at offset 28 (16 on ILP32)
12840 Offset += 4;
12841 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12842 DAG.getConstant(Offset, DL, PtrVT));
12843 MemOps.push_back(
12844 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12845 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12846
12847 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12848}
12849
12850SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12851 SelectionDAG &DAG) const {
12852 MachineFunction &MF = DAG.getMachineFunction();
12853 Function &F = MF.getFunction();
12854
12855 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12856 return LowerWin64_VASTART(Op, DAG);
12857 else if (Subtarget->isTargetDarwin())
12858 return LowerDarwin_VASTART(Op, DAG);
12859 else
12860 return LowerAAPCS_VASTART(Op, DAG);
12861}
12862
12863SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12864 SelectionDAG &DAG) const {
12865 // AAPCS has three pointers and two ints (= 32 bytes), Darwin has a single
12866 // pointer.
12867 SDLoc DL(Op);
12868 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12869 unsigned VaListSize =
12870 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12871 ? PtrSize
12872 : Subtarget->isTargetILP32() ? 20 : 32;
12873 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12874 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12875
12876 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12877 DAG.getConstant(VaListSize, DL, MVT::i32),
12878 Align(PtrSize), false, false, /*CI=*/nullptr,
12879 std::nullopt, MachinePointerInfo(DestSV),
12880 MachinePointerInfo(SrcSV));
12881}
12882
12883SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12884 assert(Subtarget->isTargetDarwin() &&
12885 "automatic va_arg instruction only works on Darwin");
12886
12887 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12888 EVT VT = Op.getValueType();
12889 SDLoc DL(Op);
12890 SDValue Chain = Op.getOperand(0);
12891 SDValue Addr = Op.getOperand(1);
12892 MaybeAlign Align(Op.getConstantOperandVal(3));
12893 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12894 auto PtrVT = getPointerTy(DAG.getDataLayout());
12895 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12896 SDValue VAList =
12897 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12898 Chain = VAList.getValue(1);
12899 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12900
12901 if (VT.isScalableVector())
12902 report_fatal_error("Passing SVE types to variadic functions is "
12903 "currently not supported");
12904
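  // If the argument is over-aligned, round the va_list pointer up to the
  // required alignment before loading from it.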
12905 if (Align && *Align > MinSlotSize) {
12906 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12907 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12908 VAList =
12909 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12910 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12911 }
12912
12913 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12914 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12915
12916 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12917 // up to 64 bits. At the very least, we have to increase the striding of the
12918 // vaargs list to match this, and for FP values we need to introduce
12919 // FP_ROUND nodes as well.
12920 if (VT.isInteger() && !VT.isVector())
12921 ArgSize = std::max(ArgSize, MinSlotSize);
12922 bool NeedFPTrunc = false;
12923 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12924 ArgSize = 8;
12925 NeedFPTrunc = true;
12926 }
12927
12928 // Increment the pointer, VAList, to the next vaarg
12929 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12930 DAG.getConstant(ArgSize, DL, PtrVT));
12931 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12932
12933 // Store the incremented VAList to the legalized pointer
12934 SDValue APStore =
12935 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12936
12937 // Load the actual argument out of the pointer VAList
12938 if (NeedFPTrunc) {
12939 // Load the value as an f64.
12940 SDValue WideFP =
12941 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12942 // Round the value down to an f32.
12943 SDValue NarrowFP =
12944 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12945 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12946 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12947 // Merge the rounded value with the chain output of the load.
12948 return DAG.getMergeValues(Ops, DL);
12949 }
12950
12951 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12952}
12953
12954SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12955 SelectionDAG &DAG) const {
12956 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12957 MFI.setFrameAddressIsTaken(true);
12958
12959 EVT VT = Op.getValueType();
12960 SDLoc DL(Op);
12961 unsigned Depth = Op.getConstantOperandVal(0);
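  // Walk up the frame chain 'Depth' times; each frame record stores the
  // caller's frame pointer at offset 0.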
12962 SDValue FrameAddr =
12963 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12964 while (Depth--)
12965 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12966 MachinePointerInfo());
12967
12968 if (Subtarget->isTargetILP32())
12969 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12970 DAG.getValueType(VT));
12971
12972 return FrameAddr;
12973}
12974
12975SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12976 SelectionDAG &DAG) const {
12977 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12978
12979 EVT VT = getPointerTy(DAG.getDataLayout());
12980 int FI = MFI.CreateFixedObject(4, 0, false);
12981 return DAG.getFrameIndex(FI, VT);
12982}
12983
12984#define GET_REGISTER_MATCHER
12985#include "AArch64GenAsmMatcher.inc"
12986
12987// FIXME? Maybe this could be a TableGen attribute on some registers and
12988// this table could be generated automatically from RegInfo.
12989Register AArch64TargetLowering::
12990getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12991 Register Reg = MatchRegisterName(RegName);
12992 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12993 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12994 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12995 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12996 !MRI->isReservedReg(MF, Reg))
12997 Reg = Register();
12998 }
12999 return Reg;
13000}
13001
13002SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
13003 SelectionDAG &DAG) const {
13004 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
13005
13006 EVT VT = Op.getValueType();
13007 SDLoc DL(Op);
13008
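  // In the standard {FP, LR} frame record the return address slot lives at
  // FP + 8, so return that address.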
13009 SDValue FrameAddr =
13010 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
13011 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
13012
13013 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
13014}
13015
13016SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
13017 SelectionDAG &DAG) const {
13018 MachineFunction &MF = DAG.getMachineFunction();
13019 MachineFrameInfo &MFI = MF.getFrameInfo();
13020 MFI.setReturnAddressIsTaken(true);
13021
13022 EVT VT = Op.getValueType();
13023 SDLoc DL(Op);
13024 unsigned Depth = Op.getConstantOperandVal(0);
13025 SDValue ReturnAddress;
13026 if (Depth) {
13027 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
13028 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
13029 ReturnAddress = DAG.getLoad(
13030 VT, DL, DAG.getEntryNode(),
13031 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
13032 } else {
13033 // Return LR, which contains the return address. Mark it an implicit
13034 // live-in.
13035 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
13036 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
13037 }
13038
13039 // The XPACLRI instruction assembles to a hint-space instruction before
13040 // Armv8.3-A, therefore it can be safely used on any pre-
13041 // Armv8.3-A architecture. On Armv8.3-A and onwards XPACI is available, so use
13042 // that instead.
13043 SDNode *St;
13044 if (Subtarget->hasPAuth()) {
13045 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
13046 } else {
13047 // XPACLRI operates on LR therefore we must move the operand accordingly.
13048 SDValue Chain =
13049 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
13050 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
13051 }
13052 return SDValue(St, 0);
13053}
13054
13055/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
13056/// i32 values and take a 2 x i32 value to shift plus a shift amount.
13057SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
13058 SelectionDAG &DAG) const {
13059 SDValue Lo, Hi;
13060 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
13061 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
13062}
13063
13064bool AArch64TargetLowering::isOffsetFoldingLegal(
13065 const GlobalAddressSDNode *GA) const {
13066 // Offsets are folded in the DAG combine rather than here so that we can
13067 // intelligently choose an offset based on the uses.
13068 return false;
13069}
13070
13071bool AArch64TargetLowering::isFPImmLegalAsFMov(const APFloat &Imm,
13072 EVT VT) const {
13073 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
13074 // 16-bit case when target has full fp16 support.
13075 // We encode bf16 bit patterns as if they were fp16. This results in very
13076 // strange looking assembly but should populate the register with appropriate
13077 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
13078 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
13079 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
13080 // FIXME: We should be able to handle f128 as well with a clever lowering.
13081 const APInt ImmInt = Imm.bitcastToAPInt();
13082
13083 if (VT == MVT::f64)
13084 return AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
13085
13086 if (VT == MVT::f32)
13087 return AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
13088
13089 if (VT == MVT::f16 || VT == MVT::bf16)
13090 return (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
13091 Imm.isPosZero();
13092
13093 return false;
13094}
13095
13096bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
13097 bool OptForSize) const {
13098 bool IsLegal = isFPImmLegalAsFMov(Imm, VT);
13099 const APInt ImmInt = Imm.bitcastToAPInt();
13100
13101 // If we cannot materialize the immediate in an FMOV field, check if the
13102 // value can be encoded as the immediate operand of a logical instruction.
13103 // The immediate value will be created with either MOVZ, MOVN, or ORR.
13104 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
13105 // generate that fmov.
13106 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
13107 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
13108 // however the mov+fmov sequence is always better because of the reduced
13109 // cache pressure. The timings are still the same if you consider
13110 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
13111 // movw+movk is fused). So by default we limit up to 2 instructions
13112 // or 4 with hasFuseLiterals.
13115 assert(Insn.size() <= 4 &&
13116 "Should be able to build any value with at most 4 moves");
13117 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
13118 IsLegal = Insn.size() <= Limit;
13119 }
13120
13121 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
13122 << " imm value: "; Imm.dump(););
13123 return IsLegal;
13124}
13125
13126//===----------------------------------------------------------------------===//
13127// AArch64 Optimization Hooks
13128//===----------------------------------------------------------------------===//
13129
13130static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
13131 SDValue Operand, SelectionDAG &DAG,
13132 int &ExtraSteps) {
13133 EVT VT = Operand.getValueType();
13134 if ((ST->hasNEON() &&
13135 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
13136 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
13137 VT == MVT::v4f32)) ||
13138 (ST->hasSVE() &&
13139 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
13140 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
13141 // For the reciprocal estimates, convergence is quadratic, so the number
13142 // of digits is doubled after each iteration. In ARMv8, the accuracy of
13143 // the initial estimate is 2^-8. Thus the number of extra steps to refine
13144 // the result for float (23 mantissa bits) is 2 and for double (52
13145 // mantissa bits) is 3.
13146 constexpr unsigned AccurateBits = 8;
13147 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
13148 ExtraSteps = DesiredBits <= AccurateBits
13149 ? 0
13150 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
13151 }
13152
13153 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
13154 }
13155
13156 return SDValue();
13157}
13158
13159SDValue AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
13160 const DenormalMode &Mode,
13161 SDNodeFlags Flags) const {
13162 SDLoc DL(Op);
13163 EVT VT = Op.getValueType();
13164 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
13165 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
13166 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ, /*Chain=*/{},
13167 /*Signaling=*/false, Flags);
13168}
13169
13170SDValue
13171AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
13172 SelectionDAG &DAG) const {
13173 return Op;
13174}
13175
13176SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
13177 SelectionDAG &DAG, int Enabled,
13178 int &ExtraSteps,
13179 bool &UseOneConst,
13180 bool Reciprocal) const {
13181 if (Enabled == ReciprocalEstimate::Enabled ||
13182 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
13183 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
13184 DAG, ExtraSteps)) {
13185 SDLoc DL(Operand);
13186 EVT VT = Operand.getValueType();
13187
13188 // Ensure nodes can be recognized by isAssociativeAndCommutative.
13189 SDNodeFlags Flags =
13191
13192 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
13193 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
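 // Each loop iteration therefore computes E' = E * 0.5 * (3 - X * E^2): the
 // first FMUL forms E^2, FRSQRTS supplies 0.5 * (3 - X * E^2), and the second
 // FMUL applies that factor to the running estimate.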
13194 for (int i = ExtraSteps; i > 0; --i) {
13195 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
13196 Flags);
13197 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
13198 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13199 }
13200 if (!Reciprocal)
13201 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
13202
13203 ExtraSteps = 0;
13204 return Estimate;
13205 }
13206
13207 return SDValue();
13208}
13209
13210SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
13211 SelectionDAG &DAG, int Enabled,
13212 int &ExtraSteps) const {
13213 if (Enabled == ReciprocalEstimate::Enabled)
13214 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
13215 DAG, ExtraSteps)) {
13216 SDLoc DL(Operand);
13217 EVT VT = Operand.getValueType();
13218
13220
13221 // Newton reciprocal iteration: E * (2 - X * E)
13222 // AArch64 reciprocal iteration instruction: (2 - M * N)
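 // Each loop iteration computes E' = E * (2 - X * E): FRECPS supplies the
 // (2 - X * E) factor and the FMUL applies it to the running estimate.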
13223 for (int i = ExtraSteps; i > 0; --i) {
13224 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
13225 Estimate, Flags);
13226 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
13227 }
13228
13229 ExtraSteps = 0;
13230 return Estimate;
13231 }
13232
13233 return SDValue();
13234}
13235
13236//===----------------------------------------------------------------------===//
13237// AArch64 Inline Assembly Support
13238//===----------------------------------------------------------------------===//
13239
13240// Table of Constraints
13241// TODO: This is the current set of constraints supported by ARM for the
13242// compiler, not all of them may make sense.
13243//
13244// r - A general register
13245// w - An FP/SIMD register of some size in the range v0-v31
13246// x - An FP/SIMD register of some size in the range v0-v15
13247// I - Constant that can be used with an ADD instruction
13248// J - Constant that can be used with a SUB instruction
13249// K - Constant that can be used with a 32-bit logical instruction
13250// L - Constant that can be used with a 64-bit logical instruction
13251// M - Constant that can be used as a 32-bit MOV immediate
13252// N - Constant that can be used as a 64-bit MOV immediate
13253// Q - A memory reference with base register and no offset
13254// S - A symbolic address
13255// Y - Floating point constant zero
13256// Z - Integer constant zero
13257//
13258// Note that general register operands will be output using their 64-bit x
13259// register name, whatever the size of the variable, unless the asm operand
13260// is prefixed by the %w modifier. Floating-point and SIMD register operands
13261// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
13262// %q modifier.
13263const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
13264 // At this point, we have to lower this constraint to something else, so we
13265 // lower it to an "r" or "w". However, by doing this we will force the result
13266 // to be in register, while the X constraint is much more permissive.
13267 //
13268 // Although we are correct (we are free to emit anything, without
13269 // constraints), we might break use cases that would expect us to be more
13270 // efficient and emit something else.
13271 if (!Subtarget->hasFPARMv8())
13272 return "r";
13273
13274 if (ConstraintVT.isFloatingPoint())
13275 return "w";
13276
13277 if (ConstraintVT.isVector() &&
13278 (ConstraintVT.getSizeInBits() == 64 ||
13279 ConstraintVT.getSizeInBits() == 128))
13280 return "w";
13281
13282 return "r";
13283}
13284
13285enum class PredicateConstraint { Uph, Upl, Upa };
13286
13287// Returns a {Reg, RegisterClass} tuple if the constraint is
13288// a specific predicate register.
13289//
13290// For some constraint like "{pn3}" the default path in
13291// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
13292// suitable register class for this register is "PPRorPNR", after which it
13293// determines that nxv16i1 is an appropriate type for the constraint, which is
13294// not what we want. The code here pre-empts this by matching the register
13295// explicitly.
13296static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
13297parseSVERegAsConstraint(StringRef Constraint) {
13298 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
13299 (Constraint[1] != 'p' && Constraint[1] != 'z'))
13300 return std::nullopt;
13301
13302 bool IsPredicate = Constraint[1] == 'p';
13303 Constraint = Constraint.substr(2, Constraint.size() - 3);
13304 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
13305 if (IsPredicateAsCount)
13306 Constraint = Constraint.drop_front(1);
13307
13308 unsigned V;
13309 if (Constraint.getAsInteger(10, V) || V > 31)
13310 return std::nullopt;
13311
13312 if (IsPredicateAsCount)
13313 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
13314 if (IsPredicate)
13315 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
13316 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
13317}
13318
13319static std::optional<PredicateConstraint>
13320parsePredicateConstraint(StringRef Constraint) {
13321 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
13322 .Case("Uph", PredicateConstraint::Uph)
13323 .Case("Upl", PredicateConstraint::Upl)
13324 .Case("Upa", PredicateConstraint::Upa)
13325 .Default(std::nullopt);
13326}
13327
13328static const TargetRegisterClass *
13329getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
13330 if (VT != MVT::aarch64svcount &&
13331 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
13332 return nullptr;
13333
13334 switch (Constraint) {
13335 case PredicateConstraint::Uph:
13336 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
13337 : &AArch64::PPR_p8to15RegClass;
13338 case PredicateConstraint::Upl:
13339 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
13340 : &AArch64::PPR_3bRegClass;
13341 case PredicateConstraint::Upa:
13342 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
13343 : &AArch64::PPRRegClass;
13344 }
13345
13346 llvm_unreachable("Missing PredicateConstraint!");
13347}
13348
13349enum class ReducedGprConstraint { Uci, Ucj };
13350
13351static std::optional<ReducedGprConstraint>
13352parseReducedGprConstraint(StringRef Constraint) {
13353 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
13354 .Case("Uci", ReducedGprConstraint::Uci)
13355 .Case("Ucj", ReducedGprConstraint::Ucj)
13356 .Default(std::nullopt);
13357}
13358
13359static const TargetRegisterClass *
13360getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
13361 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
13362 return nullptr;
13363
13364 switch (Constraint) {
13365 case ReducedGprConstraint::Uci:
13366 return &AArch64::MatrixIndexGPR32_8_11RegClass;
13367 case ReducedGprConstraint::Ucj:
13368 return &AArch64::MatrixIndexGPR32_12_15RegClass;
13369 }
13370
13371 llvm_unreachable("Missing ReducedGprConstraint!");
13372}
13373
13374// The set of cc code supported is from
13375// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
13376static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
13377 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
13378 .Case("{@cchi}", AArch64CC::HI)
13379 .Case("{@cccs}", AArch64CC::HS)
13380 .Case("{@cclo}", AArch64CC::LO)
13381 .Case("{@ccls}", AArch64CC::LS)
13382 .Case("{@cccc}", AArch64CC::LO)
13383 .Case("{@cceq}", AArch64CC::EQ)
13384 .Case("{@ccgt}", AArch64CC::GT)
13385 .Case("{@ccge}", AArch64CC::GE)
13386 .Case("{@cclt}", AArch64CC::LT)
13387 .Case("{@ccle}", AArch64CC::LE)
13388 .Case("{@cchs}", AArch64CC::HS)
13389 .Case("{@ccne}", AArch64CC::NE)
13390 .Case("{@ccvc}", AArch64CC::VC)
13391 .Case("{@ccpl}", AArch64CC::PL)
13392 .Case("{@ccvs}", AArch64CC::VS)
13393 .Case("{@ccmi}", AArch64CC::MI)
13394 .Default(AArch64CC::Invalid);
13395 return Cond;
13396}
13397
13398/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
13399/// WZR, invert(<cond>)'.
13400static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
13401 SelectionDAG &DAG) {
13402 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
13403 DAG.getConstant(0, DL, MVT::i32),
13404 DAG.getConstant(0, DL, MVT::i32),
13405 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
13406}
13407
13408// Lower @cc flag output via getSETCC.
13409SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
13410 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
13411 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
13412 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
13413 if (Cond == AArch64CC::Invalid)
13414 return SDValue();
13415 // The output variable should be a scalar integer.
13416 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
13417 OpInfo.ConstraintVT.getSizeInBits() < 8)
13418 report_fatal_error("Flag output operand is of invalid type");
13419
13420 // Get NZCV register. Only update chain when copyfrom is glued.
13421 if (Glue.getNode()) {
13422 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
13423 Chain = Glue.getValue(1);
13424 } else
13425 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
13426 // Extract CC code.
13427 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
13428
13429 SDValue Result;
13430
13431 // Truncate or ZERO_EXTEND based on value types.
13432 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
13433 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
13434 else
13435 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13436
13437 return Result;
13438}
13439
13440/// getConstraintType - Given a constraint letter, return the type of
13441/// constraint it is for this target.
13442AArch64TargetLowering::ConstraintType
13443AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13444 if (Constraint.size() == 1) {
13445 switch (Constraint[0]) {
13446 default:
13447 break;
13448 case 'x':
13449 case 'w':
13450 case 'y':
13451 return C_RegisterClass;
13452 // An address with a single base register. Due to the way we
13453 // currently handle addresses it is the same as 'r'.
13454 case 'Q':
13455 return C_Memory;
13456 case 'I':
13457 case 'J':
13458 case 'K':
13459 case 'L':
13460 case 'M':
13461 case 'N':
13462 case 'Y':
13463 case 'Z':
13464 return C_Immediate;
13465 case 'z':
13466 case 'S': // A symbol or label reference with a constant offset
13467 return C_Other;
13468 }
13469 } else if (parsePredicateConstraint(Constraint))
13470 return C_RegisterClass;
13471 else if (parseReducedGprConstraint(Constraint))
13472 return C_RegisterClass;
13473 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13474 return C_Other;
13475 return TargetLowering::getConstraintType(Constraint);
13476}
13477
13478/// Examine constraint type and operand type and determine a weight value.
13479/// This object must already have been set up with the operand type
13480/// and the current alternative constraint selected.
13481TargetLowering::ConstraintWeight
13482 AArch64TargetLowering::getSingleConstraintMatchWeight(
13483 AsmOperandInfo &info, const char *constraint) const {
13484 ConstraintWeight weight = CW_Invalid;
13485 Value *CallOperandVal = info.CallOperandVal;
13486 // If we don't have a value, we can't do a match,
13487 // but allow it at the lowest weight.
13488 if (!CallOperandVal)
13489 return CW_Default;
13490 Type *type = CallOperandVal->getType();
13491 // Look at the constraint type.
13492 switch (*constraint) {
13493 default:
13494 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13495 break;
13496 case 'x':
13497 case 'w':
13498 case 'y':
13499 if (type->isFloatingPointTy() || type->isVectorTy())
13500 weight = CW_Register;
13501 break;
13502 case 'z':
13503 weight = CW_Constant;
13504 break;
13505 case 'U':
13506 if (parsePredicateConstraint(constraint) ||
13507 parseReducedGprConstraint(constraint))
13508 weight = CW_Register;
13509 break;
13510 }
13511 return weight;
13512}
13513
13514std::pair<unsigned, const TargetRegisterClass *>
13515AArch64TargetLowering::getRegForInlineAsmConstraint(
13516 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13517 if (Constraint.size() == 1) {
13518 switch (Constraint[0]) {
13519 case 'r':
13520 if (VT.isScalableVector())
13521 return std::make_pair(0U, nullptr);
13522 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13523 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13524 if (VT.getFixedSizeInBits() == 64)
13525 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13526 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13527 case 'w': {
13528 if (!Subtarget->hasFPARMv8())
13529 break;
13530 if (VT.isScalableVector()) {
13531 if (VT.getVectorElementType() != MVT::i1)
13532 return std::make_pair(0U, &AArch64::ZPRRegClass);
13533 return std::make_pair(0U, nullptr);
13534 }
13535 if (VT == MVT::Other)
13536 break;
13537 uint64_t VTSize = VT.getFixedSizeInBits();
13538 if (VTSize == 16)
13539 return std::make_pair(0U, &AArch64::FPR16RegClass);
13540 if (VTSize == 32)
13541 return std::make_pair(0U, &AArch64::FPR32RegClass);
13542 if (VTSize == 64)
13543 return std::make_pair(0U, &AArch64::FPR64RegClass);
13544 if (VTSize == 128)
13545 return std::make_pair(0U, &AArch64::FPR128RegClass);
13546 break;
13547 }
13548 // The instructions that this constraint is designed for can
13549 // only take 128-bit registers so just use that regclass.
13550 case 'x':
13551 if (!Subtarget->hasFPARMv8())
13552 break;
13553 if (VT.isScalableVector())
13554 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13555 if (VT.getSizeInBits() == 128)
13556 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13557 break;
13558 case 'y':
13559 if (!Subtarget->hasFPARMv8())
13560 break;
13561 if (VT.isScalableVector())
13562 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13563 break;
13564 }
13565 } else {
13566 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13567 // SME functions that are not in streaming mode, should
13568 // still observe clobbers of Z-registers by clobbering
13569 // the lower 128bits of those registers.
13570 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13571 !Subtarget->isSVEorStreamingSVEAvailable())
13572 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13573 &AArch64::FPR128RegClass);
13574 return *P;
13575 }
13576 if (const auto PC = parsePredicateConstraint(Constraint))
13577 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13578 return std::make_pair(0U, RegClass);
13579
13580 if (const auto RGC = parseReducedGprConstraint(Constraint))
13581 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13582 return std::make_pair(0U, RegClass);
13583 }
13584 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13585 parseConstraintCode(Constraint) != AArch64CC::Invalid)
13586 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13587
13588 if (Constraint == "{za}") {
13589 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13590 }
13591
13592 if (Constraint == "{zt0}") {
13593 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13594 }
13595
13596 // Use the default implementation in TargetLowering to convert the register
13597 // constraint into a member of a register class.
13598 std::pair<unsigned, const TargetRegisterClass *> Res;
13599 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13600
13601 // Not found as a standard register?
13602 if (!Res.second) {
13603 unsigned Size = Constraint.size();
13604 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13605 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13606 int RegNo;
13607 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13608 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13609 // v0 - v31 are aliases of q0/d0/s0/h0 - ...31 depending on size.
13610 // By default we'll emit v0-v31 for this unless a modifier is given, in
13611 // which case we'll emit the appropriately sized register.
13612 if (VT != MVT::Other) {
13613 switch (VT.getSizeInBits()) {
13614 case 16:
13615 Res.first = AArch64::FPR16RegClass.getRegister(RegNo);
13616 Res.second = &AArch64::FPR16RegClass;
13617 break;
13618 case 32:
13619 Res.first = AArch64::FPR32RegClass.getRegister(RegNo);
13620 Res.second = &AArch64::FPR32RegClass;
13621 break;
13622 case 64:
13623 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13624 Res.second = &AArch64::FPR64RegClass;
13625 break;
13626 case 128:
13627 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13628 Res.second = &AArch64::FPR128RegClass;
13629 break;
13630 default:
13631 return std::make_pair(0U, nullptr);
13632 }
13633 } else {
13634 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13635 Res.second = &AArch64::FPR128RegClass;
13636 }
13637 }
13638 }
13639 }
13640
13641 if (Res.second && !Subtarget->hasFPARMv8() &&
13642 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13643 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13644 return std::make_pair(0U, nullptr);
13645
13646 return Res;
13647}
13648
13649EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13650 llvm::Type *Ty,
13651 bool AllowUnknown) const {
13652 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13653 return EVT(MVT::i64x8);
13654
13655 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13656}
13657
13658/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13659/// vector. If it is invalid, don't add anything to Ops.
13660void AArch64TargetLowering::LowerAsmOperandForConstraint(
13661 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13662 SelectionDAG &DAG) const {
13663 SDValue Result;
13664
13665 // Currently only support length 1 constraints.
13666 if (Constraint.size() != 1)
13667 return;
13668
13669 char ConstraintLetter = Constraint[0];
13670 switch (ConstraintLetter) {
13671 default:
13672 break;
13673
13674 // This set of constraints deal with valid constants for various instructions.
13675 // Validate and return a target constant for them if we can.
13676 case 'z': {
13677 // 'z' maps to xzr or wzr so it needs an input of 0.
13678 if (!isNullConstant(Op))
13679 return;
13680
13681 if (Op.getValueType() == MVT::i64)
13682 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13683 else
13684 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13685 break;
13686 }
13687 case 'S':
13688 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13689 // supported for PIC while "s" isn't, making "s" less useful. We implement
13690 // "S" but not "s".
13692 break;
13693
13694 case 'I':
13695 case 'J':
13696 case 'K':
13697 case 'L':
13698 case 'M':
13699 case 'N':
13700 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13701 if (!C)
13702 return;
13703
13704 // Grab the value and do some validation.
13705 uint64_t CVal = C->getZExtValue();
13706 switch (ConstraintLetter) {
13707 // The I constraint applies only to simple ADD or SUB immediate operands:
13708 // i.e. 0 to 4095 with optional shift by 12
13709 // The J constraint applies only to ADD or SUB immediates that would be
13710 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13711 // instruction [or vice versa], in other words -1 to -4095 with optional
13712 // left shift by 12.
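 // For example, 4095 and 0xFFF000 satisfy 'I', while -1 and -4095 (optionally
 // shifted left by 12) satisfy 'J'.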
13713 case 'I':
13714 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13715 break;
13716 return;
13717 case 'J': {
13718 uint64_t NVal = -C->getSExtValue();
13719 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13720 CVal = C->getSExtValue();
13721 break;
13722 }
13723 return;
13724 }
13725 // The K and L constraints apply *only* to logical immediates, including
13726 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13727 // been removed and MOV should be used). So these constraints have to
13728 // distinguish between bit patterns that are valid 32-bit or 64-bit
13729 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13730 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13731 // versa.
13732 case 'K':
13733 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13734 break;
13735 return;
13736 case 'L':
13737 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13738 break;
13739 return;
13740 // The M and N constraints are a superset of K and L respectively, for use
13741 // with the MOV (immediate) alias. As well as the logical immediates they
13742 // also match 32 or 64-bit immediates that can be loaded either using a
13743 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13744 // (M) or 64-bit 0x1234000000000000 (N) etc.
13745 // As a note some of this code is liberally stolen from the asm parser.
13746 case 'M': {
13747 if (!isUInt<32>(CVal))
13748 return;
13749 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13750 break;
13751 if ((CVal & 0xFFFF) == CVal)
13752 break;
13753 if ((CVal & 0xFFFF0000ULL) == CVal)
13754 break;
13755 uint64_t NCVal = ~(uint32_t)CVal;
13756 if ((NCVal & 0xFFFFULL) == NCVal)
13757 break;
13758 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13759 break;
13760 return;
13761 }
13762 case 'N': {
13763 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13764 break;
13765 if ((CVal & 0xFFFFULL) == CVal)
13766 break;
13767 if ((CVal & 0xFFFF0000ULL) == CVal)
13768 break;
13769 if ((CVal & 0xFFFF00000000ULL) == CVal)
13770 break;
13771 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13772 break;
13773 uint64_t NCVal = ~CVal;
13774 if ((NCVal & 0xFFFFULL) == NCVal)
13775 break;
13776 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13777 break;
13778 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13779 break;
13780 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13781 break;
13782 return;
13783 }
13784 default:
13785 return;
13786 }
13787
13788 // All assembler immediates are 64-bit integers.
13789 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13790 break;
13791 }
13792
13793 if (Result.getNode()) {
13794 Ops.push_back(Result);
13795 return;
13796 }
13797
13798 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13799}
13800
13801//===----------------------------------------------------------------------===//
13802// AArch64 Advanced SIMD Support
13803//===----------------------------------------------------------------------===//
13804
13805/// WidenVector - Given a value in the V64 register class, produce the
13806/// equivalent value in the V128 register class.
13807static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13808 EVT VT = V64Reg.getValueType();
13809 unsigned NarrowSize = VT.getVectorNumElements();
13810 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13811 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13812 SDLoc DL(V64Reg);
13813
13814 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getPOISON(WideTy),
13815 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13816}
13817
13818/// getExtFactor - Determine the adjustment factor for the position when
13819/// generating an "extract from vector registers" instruction.
13820static unsigned getExtFactor(SDValue &V) {
13821 EVT EltType = V.getValueType().getVectorElementType();
13822 return EltType.getSizeInBits() / 8;
13823}
13824
13825// Check if a vector is built from one vector via extracted elements of
13826// another together with an AND mask, ensuring that all elements fit
13827// within range. This can be reconstructed using AND and NEON's TBL1.
13829 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13830 SDLoc DL(Op);
13831 EVT VT = Op.getValueType();
13832 assert(!VT.isScalableVector() &&
13833 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13834
13835 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13836 // directly to TBL1.
13837 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13838 return SDValue();
13839
13840 unsigned NumElts = VT.getVectorNumElements();
13841 assert((NumElts == 8 || NumElts == 16) &&
13842 "Need to have exactly 8 or 16 elements in vector.");
13843
13844 SDValue SourceVec;
13845 SDValue MaskSourceVec;
13846 SmallVector<SDValue, 16> AndMaskConstants;
13847
13848 for (unsigned i = 0; i < NumElts; ++i) {
13849 SDValue V = Op.getOperand(i);
13850 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13851 return SDValue();
13852
13853 SDValue OperandSourceVec = V.getOperand(0);
13854 if (!SourceVec)
13855 SourceVec = OperandSourceVec;
13856 else if (SourceVec != OperandSourceVec)
13857 return SDValue();
13858
13859 // This only looks at shuffles with elements that are
13860 // a) truncated by a constant AND mask extracted from a mask vector, or
13861 // b) extracted directly from a mask vector.
13862 SDValue MaskSource = V.getOperand(1);
13863 if (MaskSource.getOpcode() == ISD::AND) {
13864 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13865 return SDValue();
13866
13867 AndMaskConstants.push_back(MaskSource.getOperand(1));
13868 MaskSource = MaskSource->getOperand(0);
13869 } else if (!AndMaskConstants.empty()) {
13870 // Either all or no operands should have an AND mask.
13871 return SDValue();
13872 }
13873
13874 // An ANY_EXTEND may be inserted between the AND and the source vector
13875 // extraction. We don't care about that, so we can just skip it.
13876 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13877 MaskSource = MaskSource.getOperand(0);
13878
13879 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13880 return SDValue();
13881
13882 SDValue MaskIdx = MaskSource.getOperand(1);
13883 if (!isa<ConstantSDNode>(MaskIdx) ||
13884 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13885 return SDValue();
13886
13887 // We only apply this if all elements come from the same vector with the
13888 // same vector type.
13889 if (!MaskSourceVec) {
13890 MaskSourceVec = MaskSource->getOperand(0);
13891 if (MaskSourceVec.getValueType() != VT)
13892 return SDValue();
13893 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13894 return SDValue();
13895 }
13896 }
13897
13898 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13899 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13900 // insert, we know that the index in the mask must be smaller than the number
13901 // of elements in the source, or we would have an out-of-bounds access.
13902 if (NumElts == 8)
13903 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13904 DAG.getPOISON(VT));
13905
13906 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13907 if (!AndMaskConstants.empty())
13908 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13909 DAG.getBuildVector(VT, DL, AndMaskConstants));
13910
13911 return DAG.getNode(
13912 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13913 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13914 SourceVec, MaskSourceVec);
13915}
13916
13917// Gather data to see if the operation can be modelled as a
13918// shuffle in combination with VEXTs.
13919SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13920 SelectionDAG &DAG) const {
13921 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13922 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13923 SDLoc DL(Op);
13924 EVT VT = Op.getValueType();
13925 assert(!VT.isScalableVector() &&
13926 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13927 unsigned NumElts = VT.getVectorNumElements();
13928
13929 struct ShuffleSourceInfo {
13930 SDValue Vec;
13931 unsigned MinElt;
13932 unsigned MaxElt;
13933
13934 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13935 // be compatible with the shuffle we intend to construct. As a result
13936 // ShuffleVec will be some sliding window into the original Vec.
13937 SDValue ShuffleVec;
13938
13939 // Code should guarantee that element i in Vec starts at element "WindowBase
13940 // + i * WindowScale in ShuffleVec".
13941 int WindowBase;
13942 int WindowScale;
13943
13944 ShuffleSourceInfo(SDValue Vec)
13945 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13946 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13947
13948 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13949 };
13950
13951 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13952 // node.
13953 SmallVector<ShuffleSourceInfo, 2> Sources;
13954 for (unsigned i = 0; i < NumElts; ++i) {
13955 SDValue V = Op.getOperand(i);
13956 if (V.isUndef())
13957 continue;
13958 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13959 !isa<ConstantSDNode>(V.getOperand(1)) ||
13960 V.getOperand(0).getValueType().isScalableVector()) {
13961 LLVM_DEBUG(
13962 dbgs() << "Reshuffle failed: "
13963 "a shuffle can only come from building a vector from "
13964 "various elements of other fixed-width vectors, provided "
13965 "their indices are constant\n");
13966 return SDValue();
13967 }
13968
13969 // Add this element source to the list if it's not already there.
13970 SDValue SourceVec = V.getOperand(0);
13971 auto Source = find(Sources, SourceVec);
13972 if (Source == Sources.end())
13973 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13974
13975 // Update the minimum and maximum lane number seen.
13976 unsigned EltNo = V.getConstantOperandVal(1);
13977 Source->MinElt = std::min(Source->MinElt, EltNo);
13978 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13979 }
13980
13981 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13982 // better than moving to/from gpr registers for larger vectors.
13983 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13984 // Construct a mask for the tbl. We may need to adjust the index for types
13985 // larger than i8.
13986 SmallVector<unsigned, 16> Mask;
13987 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13988 for (unsigned I = 0; I < NumElts; ++I) {
13989 SDValue V = Op.getOperand(I);
13990 if (V.isUndef()) {
13991 for (unsigned OF = 0; OF < OutputFactor; OF++)
13992 Mask.push_back(-1);
13993 continue;
13994 }
13995 // Set the Mask lanes adjusted for the size of the input and output
13996 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13997 // output element, adjusted in their positions per input and output types.
13998 unsigned Lane = V.getConstantOperandVal(1);
13999 for (unsigned S = 0; S < Sources.size(); S++) {
14000 if (V.getOperand(0) == Sources[S].Vec) {
14001 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
14002 unsigned InputBase = 16 * S + Lane * InputSize / 8;
14003 for (unsigned OF = 0; OF < OutputFactor; OF++)
14004 Mask.push_back(InputBase + OF);
14005 break;
14006 }
14007 }
14008 }
14009
14010 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
14011 // v16i8, and the TBLMask
14012 SmallVector<SDValue, 16> TBLOperands;
14013 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
14014 ? Intrinsic::aarch64_neon_tbl3
14015 : Intrinsic::aarch64_neon_tbl4,
14016 DL, MVT::i32));
14017 for (unsigned i = 0; i < Sources.size(); i++) {
14018 SDValue Src = Sources[i].Vec;
14019 EVT SrcVT = Src.getValueType();
14020 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
14021 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
14022 "Expected a legally typed vector");
14023 if (SrcVT.is64BitVector())
14024 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
14025 DAG.getPOISON(MVT::v8i8));
14026 TBLOperands.push_back(Src);
14027 }
14028
14029 SmallVector<SDValue, 16> TBLMask;
14030 for (unsigned i = 0; i < Mask.size(); i++)
14031 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
14032 assert((Mask.size() == 8 || Mask.size() == 16) &&
14033 "Expected a v8i8 or v16i8 Mask");
14034 TBLOperands.push_back(DAG.getBuildVector(
14035 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
14036
14037 SDValue Shuffle =
14038 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
14039 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
14040 return DAG.getBitcast(VT, Shuffle);
14041 }
14042
14043 if (Sources.size() > 2) {
14044 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
14045 << "sensible when at most two source vectors are "
14046 << "involved\n");
14047 return SDValue();
14048 }
14049
14050 // Find the smallest element size among the result and the two sources, and
14051 // use it as the element size when building the shuffle_vector.
14052 EVT SmallestEltTy = VT.getVectorElementType();
14053 for (auto &Source : Sources) {
14054 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
14055 if (SrcEltTy.bitsLT(SmallestEltTy)) {
14056 SmallestEltTy = SrcEltTy;
14057 }
14058 }
14059 unsigned ResMultiplier =
14060 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14061 uint64_t VTSize = VT.getFixedSizeInBits();
14062 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
14063 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
14064
14065 // If the source vector is too wide or too narrow, we may nevertheless be able
14066 // to construct a compatible shuffle either by concatenating it with UNDEF or
14067 // extracting a suitable range of elements.
14068 for (auto &Src : Sources) {
14069 EVT SrcVT = Src.ShuffleVec.getValueType();
14070
14071 TypeSize SrcVTSize = SrcVT.getSizeInBits();
14072 if (SrcVTSize == TypeSize::getFixed(VTSize))
14073 continue;
14074
14075 // This stage of the search produces a source with the same element type as
14076 // the original, but with a total width matching the BUILD_VECTOR output.
14077 EVT EltVT = SrcVT.getVectorElementType();
14078 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
14079 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
14080
14081 if (SrcVTSize.getFixedValue() < VTSize) {
14082 assert(2 * SrcVTSize == VTSize);
14083 // We can pad out the smaller vector for free by concatenating it with a
14084 // poison upper half.
14085 Src.ShuffleVec =
14086 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
14087 DAG.getPOISON(Src.ShuffleVec.getValueType()));
14088 continue;
14089 }
14090
14091 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
14092 LLVM_DEBUG(
14093 dbgs() << "Reshuffle failed: result vector too small to extract\n");
14094 return SDValue();
14095 }
14096
14097 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
14098 LLVM_DEBUG(
14099 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
14100 return SDValue();
14101 }
14102
14103 if (Src.MinElt >= NumSrcElts) {
14104 // The extraction can just take the second half
14105 Src.ShuffleVec =
14106 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14107 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14108 Src.WindowBase = -NumSrcElts;
14109 } else if (Src.MaxElt < NumSrcElts) {
14110 // The extraction can just take the first half
14111 Src.ShuffleVec =
14112 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14113 DAG.getConstant(0, DL, MVT::i64));
14114 } else {
14115 // An actual VEXT is needed
14116 SDValue VEXTSrc1 =
14117 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14118 DAG.getConstant(0, DL, MVT::i64));
14119 SDValue VEXTSrc2 =
14120 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
14121 DAG.getConstant(NumSrcElts, DL, MVT::i64));
14122 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
14123
14124 if (!SrcVT.is64BitVector()) {
14125 LLVM_DEBUG(
14126 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
14127 "for SVE vectors.");
14128 return SDValue();
14129 }
14130
14131 Src.ShuffleVec =
14132 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
14133 DAG.getConstant(Imm, DL, MVT::i32));
14134 Src.WindowBase = -Src.MinElt;
14135 }
14136 }
14137
14138 // Another possible incompatibility occurs from the vector element types. We
14139 // can fix this by bitcasting the source vectors to the same type we intend
14140 // for the shuffle.
14141 for (auto &Src : Sources) {
14142 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
14143 if (SrcEltTy == SmallestEltTy)
14144 continue;
14145 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
14146 if (DAG.getDataLayout().isBigEndian()) {
14147 Src.ShuffleVec =
14148 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
14149 } else {
14150 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
14151 }
14152 Src.WindowScale =
14153 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
14154 Src.WindowBase *= Src.WindowScale;
14155 }
14156
14157 // Final check before we try to actually produce a shuffle.
14158 LLVM_DEBUG({
14159 for (auto Src : Sources)
14160 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
14161 });
14162
14163 // The stars all align, our next step is to produce the mask for the shuffle.
14164 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
14165 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
14166 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
14167 SDValue Entry = Op.getOperand(i);
14168 if (Entry.isUndef())
14169 continue;
14170
14171 auto Src = find(Sources, Entry.getOperand(0));
14172 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
14173
14174 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
14175 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
14176 // segment.
14177 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
14178 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
14179 VT.getScalarSizeInBits());
14180 int LanesDefined = BitsDefined / BitsPerShuffleLane;
14181
14182 // This source is expected to fill ResMultiplier lanes of the final shuffle,
14183 // starting at the appropriate offset.
14184 int *LaneMask = &Mask[i * ResMultiplier];
14185
14186 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
14187 ExtractBase += NumElts * (Src - Sources.begin());
14188 for (int j = 0; j < LanesDefined; ++j)
14189 LaneMask[j] = ExtractBase + j;
14190 }
14191
14192 // Final check before we try to produce nonsense...
14193 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
14194 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
14195 return SDValue();
14196 }
14197
14198 SDValue Poison = DAG.getPOISON(ShuffleVT);
14199 SDValue ShuffleOps[] = {Poison, Poison};
14200 for (unsigned i = 0; i < Sources.size(); ++i)
14201 ShuffleOps[i] = Sources[i].ShuffleVec;
14202
14203 SDValue Shuffle =
14204 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
14205 SDValue V;
14206 if (DAG.getDataLayout().isBigEndian()) {
14207 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
14208 } else {
14209 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
14210 }
14211
14212 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
14213 dbgs() << "Reshuffle, creating node: "; V.dump(););
14214
14215 return V;
14216}
14217
14218// check if an EXT instruction can handle the shuffle mask when the
14219// vector sources of the shuffle are the same.
14220static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
14221 unsigned NumElts = VT.getVectorNumElements();
14222
14223 // Assume that the first shuffle index is not UNDEF. Fail if it is.
14224 if (M[0] < 0)
14225 return false;
14226
14227 Imm = M[0];
14228
14229 // If this is a VEXT shuffle, the immediate value is the index of the first
14230 // element. The other shuffle indices must be the successive elements after
14231 // the first one.
14232 unsigned ExpectedElt = Imm;
14233 for (unsigned i = 1; i < NumElts; ++i) {
14234 // Increment the expected index. If it wraps around, just follow it
14235 // back to index zero and keep going.
14236 ++ExpectedElt;
14237 if (ExpectedElt == NumElts)
14238 ExpectedElt = 0;
14239
14240 if (M[i] < 0)
14241 continue; // ignore UNDEF indices
14242 if (ExpectedElt != static_cast<unsigned>(M[i]))
14243 return false;
14244 }
14245
14246 return true;
14247}
14248
14249// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
14250// v4i32s. This is really a truncate, which we can construct out of (legal)
14251// concats and truncate nodes.
14252static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
14253 if (V.getValueType() != MVT::v16i8)
14254 return SDValue();
14255 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
14256
14257 for (unsigned X = 0; X < 4; X++) {
14258 // Check the first item in each group is an extract from lane 0 of a v4i32
14259 // or v4i16.
14260 SDValue BaseExt = V.getOperand(X * 4);
14261 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14262 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
14263 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
14264 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
14265 BaseExt.getConstantOperandVal(1) != 0)
14266 return SDValue();
14267 SDValue Base = BaseExt.getOperand(0);
14268 // And check the other items are extracts from the same vector.
14269 for (unsigned Y = 1; Y < 4; Y++) {
14270 SDValue Ext = V.getOperand(X * 4 + Y);
14271 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14272 Ext.getOperand(0) != Base ||
14273 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
14274 Ext.getConstantOperandVal(1) != Y)
14275 return SDValue();
14276 }
14277 }
14278
14279 // Turn the buildvector into a series of truncates and concats, which will
14280 // become uzp1 instructions. Any v4i32s we found get truncated to v4i16 and
14281 // concatenated together to produce 2 v8i16s. These are both truncated and
14282 // concatenated together.
14283 SDLoc DL(V);
14284 SDValue Trunc[4] = {
14285 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
14286 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
14287 for (SDValue &V : Trunc)
14288 if (V.getValueType() == MVT::v4i32)
14289 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
14290 SDValue Concat0 =
14291 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
14292 SDValue Concat1 =
14293 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
14294 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
14295 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
14296 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
14297}
14298
14299/// Check if a vector shuffle corresponds to a DUP instructions with a larger
14300/// element width than the vector lane type. If that is the case the function
14301/// returns true and writes the value of the DUP instruction lane operand into
14302/// DupLaneOp
14303static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
14304 unsigned &DupLaneOp) {
14305 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
14306 "Only possible block sizes for wide DUP are: 16, 32, 64");
14307
14308 if (BlockSize <= VT.getScalarSizeInBits())
14309 return false;
14310 if (BlockSize % VT.getScalarSizeInBits() != 0)
14311 return false;
14312 if (VT.getSizeInBits() % BlockSize != 0)
14313 return false;
14314
14315 size_t SingleVecNumElements = VT.getVectorNumElements();
14316 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
14317 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
14318
14319 // We are looking for masks like
14320 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
14321 // might be replaced by 'undefined'. BlockElts will eventually contain the
14322 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
14323 // for the above examples).
14324 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
14325 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
14326 for (size_t I = 0; I < NumEltsPerBlock; I++) {
14327 int Elt = M[BlockIndex * NumEltsPerBlock + I];
14328 if (Elt < 0)
14329 continue;
14330 // For now we don't support shuffles that use the second operand
14331 if ((unsigned)Elt >= SingleVecNumElements)
14332 return false;
14333 if (BlockElts[I] < 0)
14334 BlockElts[I] = Elt;
14335 else if (BlockElts[I] != Elt)
14336 return false;
14337 }
14338
14339 // We found a candidate block (possibly with some undefs). It must be a
14340 // sequence of consecutive integers starting with a value divisible by
14341 // NumEltsPerBlock with some values possibly replaced by undef-s.
14342
14343 // Find first non-undef element
14344 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
14345 assert(FirstRealEltIter != BlockElts.end() &&
14346 "Shuffle with all-undefs must have been caught by previous cases, "
14347 "e.g. isSplat()");
14348 if (FirstRealEltIter == BlockElts.end()) {
14349 DupLaneOp = 0;
14350 return true;
14351 }
14352
14353 // Index of FirstRealElt in BlockElts
14354 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
14355
14356 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
14357 return false;
14358 // BlockElts[0] must have the following value if it isn't undef:
14359 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
14360
14361 // Check the first element
14362 if (Elt0 % NumEltsPerBlock != 0)
14363 return false;
14364 // Check that the sequence indeed consists of consecutive integers (modulo
14365 // undefs)
14366 for (size_t I = 0; I < NumEltsPerBlock; I++)
14367 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
14368 return false;
14369
14370 DupLaneOp = Elt0 / NumEltsPerBlock;
14371 return true;
14372}
14373
14374// check if an EXT instruction can handle the shuffle mask when the
14375// vector sources of the shuffle are different.
14376static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
14377 unsigned &Imm) {
14378 // Look for the first non-undef element.
14379 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
14380
14381 // Benefit from APInt to handle overflow when calculating expected element.
14382 unsigned NumElts = VT.getVectorNumElements();
14383 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
14384 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
14385 /*implicitTrunc=*/true);
14386 // The following shuffle indices must be the successive elements after the
14387 // first real element.
14388 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
14389 return Elt != ExpectedElt++ && Elt >= 0;
14390 });
14391 if (FoundWrongElt)
14392 return false;
14393
14394 // The index of an EXT is the first element if it is not UNDEF.
14395 // Watch out for the beginning UNDEFs. The EXT index should be the expected
14396 // value of the first element. E.g.
14397 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
14398 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
14399 // ExpectedElt is the last mask index plus 1.
14400 Imm = ExpectedElt.getZExtValue();
14401
14402 // There are two different cases that require reversing the input vectors.
14403 // For example, for vector <4 x i32> we have the following cases,
14404 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
14405 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
14406 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
14407 // to reverse two input vectors.
14408 if (Imm < NumElts)
14409 ReverseEXT = true;
14410 else
14411 Imm -= NumElts;
14412
14413 return true;
14414}
14415
14416// Check if an EXT instruction can handle the shuffle mask when one source is a
14417// splat. This matches shuffles where the splat occupies either a prefix or a
14418// suffix and the remaining lanes are a contiguous slice from the non-splat
14419// source.
14420static bool isEXTMaskWithSplat(ArrayRef<int> M, EVT VT, unsigned SplatOperand,
14421 bool &ReverseEXT, unsigned &Imm) {
14422 unsigned NumElts = VT.getVectorNumElements();
14423 unsigned OtherBase = SplatOperand == 0 ? NumElts : 0;
14424 auto IsSplatElt = [=](int Elt) {
14425 return Elt < 0 ||
14426 (SplatOperand == 0 ? Elt < (int)NumElts : Elt >= (int)NumElts);
14427 };
14428
14429 unsigned PrefixSplatElts = 0;
14430 while (PrefixSplatElts != NumElts && IsSplatElt(M[PrefixSplatElts]))
14431 ++PrefixSplatElts;
14432
14433 if (PrefixSplatElts > 0 && PrefixSplatElts < NumElts) {
14434 bool Match = true;
14435 for (unsigned I = PrefixSplatElts; I != NumElts; ++I) {
14436 int Expected = OtherBase + I - PrefixSplatElts;
14437 if (M[I] >= 0 && M[I] != Expected) {
14438 Match = false;
14439 break;
14440 }
14441 }
14442
14443 if (Match) {
14444 ReverseEXT = SplatOperand == 1;
14445 Imm = NumElts - PrefixSplatElts;
14446 return true;
14447 }
14448 }
14449
14450 unsigned SuffixSplatElts = 0;
14451 while (SuffixSplatElts != NumElts &&
14452 IsSplatElt(M[NumElts - 1 - SuffixSplatElts]))
14453 ++SuffixSplatElts;
14454
14455 if (0 < SuffixSplatElts && SuffixSplatElts < NumElts) {
14456 bool Match = true;
14457 for (unsigned I = 0; I != NumElts - SuffixSplatElts; ++I) {
14458 int Expected = OtherBase + I + SuffixSplatElts;
14459 if (M[I] >= 0 && M[I] != Expected) {
14460 Match = false;
14461 break;
14462 }
14463 }
14464
14465 if (Match) {
14466 ReverseEXT = SplatOperand == 0;
14467 Imm = SuffixSplatElts;
14468 return true;
14469 }
14470 }
14471
14472 return false;
14473}
14474
14475/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
14476/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14477/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
14478static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14479 unsigned NumElts = VT.getVectorNumElements();
14480 if (NumElts % 2 != 0)
14481 return false;
14482 WhichResult = (M[0] == 0 ? 0 : 1);
14483 unsigned Idx = WhichResult * NumElts / 2;
14484 for (unsigned i = 0; i != NumElts; i += 2) {
14485 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
14486 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
14487 return false;
14488 Idx += 1;
14489 }
14490
14491 return true;
14492}
14493
14494/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
14495/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14496/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
14497static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14498 unsigned Half = VT.getVectorNumElements() / 2;
14499 WhichResult = (M[0] == 0 ? 0 : 1);
14500 for (unsigned j = 0; j != 2; ++j) {
14501 unsigned Idx = WhichResult;
14502 for (unsigned i = 0; i != Half; ++i) {
14503 int MIdx = M[i + j * Half];
14504 if (MIdx >= 0 && (unsigned)MIdx != Idx)
14505 return false;
14506 Idx += 2;
14507 }
14508 }
14509
14510 return true;
14511}
14512
14513/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14514/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14515/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14516static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14517 unsigned NumElts = VT.getVectorNumElements();
14518 if (NumElts % 2 != 0)
14519 return false;
14520 WhichResult = (M[0] == 0 ? 0 : 1);
14521 for (unsigned i = 0; i < NumElts; i += 2) {
14522 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14523 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14524 return false;
14525 }
14526 return true;
14527}
14528
14529static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14530 bool &DstIsLeft, int &Anomaly) {
14531 if (M.size() != static_cast<size_t>(NumInputElements))
14532 return false;
14533
14534 int NumLHSMatch = 0, NumRHSMatch = 0;
14535 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14536
14537 for (int i = 0; i < NumInputElements; ++i) {
14538 if (M[i] == -1) {
14539 ++NumLHSMatch;
14540 ++NumRHSMatch;
14541 continue;
14542 }
14543
14544 if (M[i] == i)
14545 ++NumLHSMatch;
14546 else
14547 LastLHSMismatch = i;
14548
14549 if (M[i] == i + NumInputElements)
14550 ++NumRHSMatch;
14551 else
14552 LastRHSMismatch = i;
14553 }
14554
14555 if (NumLHSMatch == NumInputElements - 1) {
14556 DstIsLeft = true;
14557 Anomaly = LastLHSMismatch;
14558 return true;
14559 } else if (NumRHSMatch == NumInputElements - 1) {
14560 DstIsLeft = false;
14561 Anomaly = LastRHSMismatch;
14562 return true;
14563 }
14564
14565 return false;
14566}
14567
14568static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14569 if (VT.getSizeInBits() != 128)
14570 return false;
14571
14572 unsigned NumElts = VT.getVectorNumElements();
14573
14574 for (int I = 0, E = NumElts / 2; I != E; I++) {
14575 if (Mask[I] != I)
14576 return false;
14577 }
14578
14579 int Offset = NumElts / 2;
14580 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14581 if (Mask[I] != I + SplitLHS * Offset)
14582 return false;
14583 }
14584
14585 return true;
14586}
14587
14588static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
14589 SDLoc DL(Op);
14590 EVT VT = Op.getValueType();
14591 SDValue V0 = Op.getOperand(0);
14592 SDValue V1 = Op.getOperand(1);
14593 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14594
14595 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14596 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14597 return SDValue();
14598
14599 bool SplitV0 = V0.getValueSizeInBits() == 128;
14600
14601 if (!isConcatMask(Mask, VT, SplitV0))
14602 return SDValue();
14603
14604 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14605 if (SplitV0) {
14606 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14607 DAG.getConstant(0, DL, MVT::i64));
14608 }
14609 if (V1.getValueSizeInBits() == 128) {
14610 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14611 DAG.getConstant(0, DL, MVT::i64));
14612 }
14613 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14614}
14615
14616/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14617/// the specified operations to build the shuffle. ID is the perfect-shuffle
14618/// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14619/// table entry and LHS/RHS are the immediate inputs for this stage of the
14620/// shuffle.
14621static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14622 unsigned PFEntry, SDValue LHS,
14623 SDValue RHS, SelectionDAG &DAG,
14624 const SDLoc &DL) {
14625 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14626 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14627 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14628
14629 enum {
14630 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14631 OP_VREV,
14632 OP_VDUP0,
14633 OP_VDUP1,
14634 OP_VDUP2,
14635 OP_VDUP3,
14636 OP_VEXT1,
14637 OP_VEXT2,
14638 OP_VEXT3,
14639 OP_VUZPL, // VUZP, left result
14640 OP_VUZPR, // VUZP, right result
14641 OP_VZIPL, // VZIP, left result
14642 OP_VZIPR, // VZIP, right result
14643 OP_VTRNL, // VTRN, left result
14644 OP_VTRNR, // VTRN, right result
14645 OP_MOVLANE // Move lane. RHSID is the lane to move into
14646 };
14647
14648 if (OpNum == OP_COPY) {
14649 if (LHSID == (1 * 9 + 2) * 9 + 3)
14650 return LHS;
14651 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14652 return RHS;
14653 }
14654
14655 if (OpNum == OP_MOVLANE) {
14656 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14657 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14658 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14659 Elt = 3 - Elt;
14660 while (Elt > 0) {
14661 ID /= 9;
14662 Elt--;
14663 }
14664 return (ID % 9 == 8) ? -1 : ID % 9;
14665 };
14666
14667 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14668 // get the lane to move from the PFID, which is always from the
14669 // original vectors (V1 or V2).
14670 SDValue OpLHS = GeneratePerfectShuffle(
14671 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14672 EVT VT = OpLHS.getValueType();
14673 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14674 unsigned ExtLane = 0;
14675 SDValue Input;
14676
14677 // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
14678 // convert into a higher type.
14679 if (RHSID & 0x4) {
14680 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14681 if (MaskElt == -1)
14682 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14683 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14684 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14685 Input = MaskElt < 2 ? V1 : V2;
14686 if (VT.getScalarSizeInBits() == 16) {
14687 Input = DAG.getBitcast(MVT::v2f32, Input);
14688 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14689 } else {
14690 assert(VT.getScalarSizeInBits() == 32 &&
14691 "Expected 16 or 32 bit shuffle elements");
14692 Input = DAG.getBitcast(MVT::v2f64, Input);
14693 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14694 }
14695 } else {
14696 int MaskElt = getPFIDLane(ID, RHSID);
14697 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14698 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14699 Input = MaskElt < 4 ? V1 : V2;
14700 // Be careful about creating illegal types. Use f16 instead of i16.
14701 if (VT == MVT::v4i16) {
14702 Input = DAG.getBitcast(MVT::v4f16, Input);
14703 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14704 }
14705 }
14706 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14707 Input.getValueType().getVectorElementType(),
14708 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14709 SDValue Ins =
14710 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14711 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14712 return DAG.getBitcast(VT, Ins);
14713 }
14714
14715 SDValue OpLHS, OpRHS;
14716 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14717 RHS, DAG, DL);
14718 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14719 RHS, DAG, DL);
14720 EVT VT = OpLHS.getValueType();
14721
14722 switch (OpNum) {
14723 default:
14724 llvm_unreachable("Unknown shuffle opcode!");
14725 case OP_VREV: {
14726 // VREV divides the vector in half and swaps within the half.
14727 if (VT.getVectorElementType() == MVT::i32 ||
14728 VT.getVectorElementType() == MVT::f32)
14729 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14730 // vrev <4 x i16> -> REV32
14731 if (VT.getVectorElementType() == MVT::i16 ||
14732 VT.getVectorElementType() == MVT::f16 ||
14733 VT.getVectorElementType() == MVT::bf16)
14734 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14735 // vrev <4 x i8> -> BSWAP which is REV16
14736 assert(VT == MVT::v8i8 || VT == MVT::v16i8);
14737 EVT BSVT = VT == MVT::v8i8 ? MVT::v4i16 : MVT::v8i16;
14738 return DAG.getNode(
14739 AArch64ISD::NVCAST, DL, VT,
14740 DAG.getNode(ISD::BSWAP, DL, BSVT,
14741 DAG.getNode(AArch64ISD::NVCAST, DL, BSVT, OpLHS)));
14742 }
14743 case OP_VDUP0:
14744 case OP_VDUP1:
14745 case OP_VDUP2:
14746 case OP_VDUP3: {
14747 EVT EltTy = VT.getVectorElementType();
14748 unsigned Opcode;
14749 if (EltTy == MVT::i8)
14750 Opcode = AArch64ISD::DUPLANE8;
14751 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14752 Opcode = AArch64ISD::DUPLANE16;
14753 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14754 Opcode = AArch64ISD::DUPLANE32;
14755 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14756 Opcode = AArch64ISD::DUPLANE64;
14757 else
14758 llvm_unreachable("Invalid vector element type?");
14759
14760 if (VT.getSizeInBits() == 64)
14761 OpLHS = WidenVector(OpLHS, DAG);
14762 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14763 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14764 }
14765 case OP_VEXT1:
14766 case OP_VEXT2:
14767 case OP_VEXT3: {
14768 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14769 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14770 DAG.getConstant(Imm, DL, MVT::i32));
14771 }
14772 case OP_VUZPL:
14773 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14774 case OP_VUZPR:
14775 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14776 case OP_VZIPL:
14777 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14778 case OP_VZIPR:
14779 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14780 case OP_VTRNL:
14781 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14782 case OP_VTRNR:
14783 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14784 }
14785}
14786
14787static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14788 SelectionDAG &DAG) {
14789 // Check to see if we can use the TBL instruction.
14790 SDValue V1 = Op.getOperand(0);
14791 SDValue V2 = Op.getOperand(1);
14792 SDLoc DL(Op);
14793
14794 EVT EltVT = Op.getValueType().getVectorElementType();
14795 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14796
14797 bool Swap = false;
14798 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14799 std::swap(V1, V2);
14800 Swap = true;
14801 }
14802
14803 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14804 // out of range values with 0s. We do need to make sure that any out-of-range
14805 // values are really out-of-range for a v16i8 vector.
14806 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14807 MVT IndexVT = MVT::v8i8;
14808 unsigned IndexLen = 8;
14809 if (Op.getValueSizeInBits() == 128) {
14810 IndexVT = MVT::v16i8;
14811 IndexLen = 16;
14812 }
14813
14814 SmallVector<SDValue, 8> TBLMask;
14815 for (int Val : ShuffleMask) {
14816 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14817 unsigned Offset = Byte + Val * BytesPerElt;
14818 if (Swap)
14819 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14820 if (IsUndefOrZero && Offset >= IndexLen)
14821 Offset = 255;
14822 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14823 }
14824 }
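// Illustrative example: a v4i16 shuffle with mask <0,4,1,5> and
// BytesPerElt == 2 expands to the byte-index mask <0,1,8,9,2,3,10,11>;
// indices >= IndexLen address bytes of V2 once the two inputs are
// concatenated for the TBL lookup below.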
14825
14826 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14827 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14828
14829 SDValue Shuffle;
14830 if (IsUndefOrZero) {
14831 if (IndexLen == 8)
14832 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14833 Shuffle = DAG.getNode(
14834 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14835 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14836 V1Cst,
14837 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14838 } else {
14839 if (IndexLen == 8) {
14840 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14841 Shuffle = DAG.getNode(
14842 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14843 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14844 V1Cst,
14845 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14846 } else {
14847 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14848 // cannot currently represent the register constraints on the input
14849 // table registers.
14850 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14851 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14852 // IndexLen));
14853 Shuffle = DAG.getNode(
14854 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14855 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14856 V1Cst, V2Cst,
14857 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14858 }
14859 }
14860 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14861}
14862
14863static unsigned getDUPLANEOp(EVT EltType) {
14864 if (EltType == MVT::i8)
14865 return AArch64ISD::DUPLANE8;
14866 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14867 return AArch64ISD::DUPLANE16;
14868 if (EltType == MVT::i32 || EltType == MVT::f32)
14869 return AArch64ISD::DUPLANE32;
14870 if (EltType == MVT::i64 || EltType == MVT::f64)
14871 return AArch64ISD::DUPLANE64;
14872
14873 llvm_unreachable("Invalid vector element type?");
14874}
14875
14876static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14877 unsigned Opcode, SelectionDAG &DAG) {
14878 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14879 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14880 // Match: dup (bitcast (extract_subv X, C)), LaneC
14881 if (BitCast.getOpcode() != ISD::BITCAST ||
14882 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14883 return false;
14884
14885 // The extract index must align in the destination type. That may not
14886 // happen if the bitcast is from narrow to wide type.
14887 SDValue Extract = BitCast.getOperand(0);
14888 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14889 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14890 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14891 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14892 if (ExtIdxInBits % CastedEltBitWidth != 0)
14893 return false;
14894
14895 // Can't handle cases where vector size is not 128-bit
14896 if (!Extract.getOperand(0).getValueType().is128BitVector())
14897 return false;
14898
14899 // Update the lane value by offsetting with the scaled extract index.
14900 LaneC += ExtIdxInBits / CastedEltBitWidth;
14901
14902 // Determine the casted vector type of the wide vector input.
14903 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14904 // Examples:
14905 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14906 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14907 unsigned SrcVecNumElts =
14908 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14909 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14910 SrcVecNumElts);
14911 return true;
14912 };
14913 MVT CastVT;
14914 if (getScaledOffsetDup(V, Lane, CastVT)) {
14915 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14916 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14917 V.getOperand(0).getValueType().is128BitVector()) {
14918 // The lane is incremented by the index of the extract.
14919 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14920 Lane += V.getConstantOperandVal(1);
14921 V = V.getOperand(0);
14922 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14923 // The lane is decremented if we are splatting from the 2nd operand.
14924 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14925 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14926 Lane -= Idx * VT.getVectorNumElements() / 2;
14927 V = WidenVector(V.getOperand(Idx), DAG);
14928 } else if (VT.getSizeInBits() == 64) {
14929 // Widen the operand to 128-bit register with undef.
14930 V = WidenVector(V, DAG);
14931 }
14932 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14933}
14934
14935// Try to widen element type to get a new mask value for a better permutation
14936// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14937// UZP1/2, TRN1/2, REV, INS, etc.
14938// For example:
14939// shufflevector <4 x i32> %a, <4 x i32> %b,
14940// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14941// is equivalent to:
14942// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14943// Finally, we can get:
14944// mov v0.d[0], v1.d[1]
14945static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14946 SDLoc DL(Op);
14947 EVT VT = Op.getValueType();
14948 EVT ScalarVT = VT.getVectorElementType();
14949 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14950 SDValue V0 = Op.getOperand(0);
14951 SDValue V1 = Op.getOperand(1);
14952 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14953
14954 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14955 // We need to make sure the wider element type is legal. Thus, ElementSize
14956 // should be not larger than 32 bits, and i1 type should also be excluded.
14957 if (ElementSize > 32 || ElementSize == 1)
14958 return SDValue();
14959
14960 SmallVector<int, 8> NewMask;
14961 if (widenShuffleMaskElts(Mask, NewMask)) {
14962 MVT NewEltVT = VT.isFloatingPoint()
14963 ? MVT::getFloatingPointVT(ElementSize * 2)
14964 : MVT::getIntegerVT(ElementSize * 2);
14965 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14966 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14967 V0 = DAG.getBitcast(NewVT, V0);
14968 V1 = DAG.getBitcast(NewVT, V1);
14969 return DAG.getBitcast(VT,
14970 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14971 }
14972 }
14973
14974 return SDValue();
14975}
14976
14977// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14978static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14979 ArrayRef<int> ShuffleMask,
14980 SelectionDAG &DAG) {
14981 SDValue Tbl1 = Op->getOperand(0);
14982 SDValue Tbl2 = Op->getOperand(1);
14983 SDLoc DL(Op);
14984 SDValue Tbl2ID =
14985 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14986
14987 EVT VT = Op.getValueType();
14988 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14989 Tbl1.getOperand(0) != Tbl2ID ||
14990 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14991 Tbl2.getOperand(0) != Tbl2ID)
14992 return SDValue();
14993
14994 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14995 return SDValue();
14996
14997 SDValue Mask1 = Tbl1.getOperand(3);
14998 SDValue Mask2 = Tbl2.getOperand(3);
14999 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
15000 Mask2.getOpcode() != ISD::BUILD_VECTOR)
15001 return SDValue();
15002
15003 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
15004 for (unsigned I = 0; I < 16; I++) {
15005 if (ShuffleMask[I] < 16)
15006 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
15007 else {
15008 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
15009 if (!C)
15010 return SDValue();
15011 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
15012 }
15013 }
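// Note on the +32 above: Mask2's indices refer to Tbl2's own two table
// registers; in the combined tbl4 those registers become the third and
// fourth sources, i.e. two 16-byte registers further along, hence the
// adjustment by 32.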
15014
15015 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
15016 SDValue ID =
15017 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
15018
15019 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
15020 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
15021 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
15022}
15023
15024SDValue
15025AArch64TargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
15026 SelectionDAG &DAG) const {
15027 SDLoc DL(Op);
15028 EVT VT = Op.getValueType();
15029 assert(VT.isScalableVector() && "Unexpected result type!");
15030
15031 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;
15032 unsigned UnpackOpcode = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
15033
15034 // Repeatedly unpack Val until the result is of the desired type.
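// For example, extending nxv16i8 to nxv4i32 takes two unpack steps:
// nxv16i8 -> nxv8i16 -> nxv4i32.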
15035 SDValue Val = Op.getOperand(0);
15036 switch (Val.getSimpleValueType().SimpleTy) {
15037 default:
15038 return SDValue();
15039 case MVT::nxv16i8:
15040 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv8i16, Val);
15041 if (VT == MVT::nxv8i16)
15042 break;
15043 [[fallthrough]];
15044 case MVT::nxv8i16:
15045 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv4i32, Val);
15046 if (VT == MVT::nxv4i32)
15047 break;
15048 [[fallthrough]];
15049 case MVT::nxv4i32:
15050 Val = DAG.getNode(UnpackOpcode, DL, MVT::nxv2i64, Val);
15051 assert(VT == MVT::nxv2i64 && "Unexpected result type!");
15052 break;
15053 }
15054
15055 return Val;
15056}
15057
15058// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
15059// but we don't have an appropriate instruction,
15060// so custom-lower it as ZIP1-with-zeros.
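// For example (illustrative), zero-extending the low eight i8 lanes of a
// v16i8 into v8i16 becomes (v8i16 bitcast (zip1 v16i8 Src, Zeros)), which
// interleaves each source byte with a zero byte.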
15061SDValue
15062AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
15063 SelectionDAG &DAG) const {
15064 SDLoc DL(Op);
15065 EVT VT = Op.getValueType();
15066
15067 if (VT.isScalableVector())
15068 return LowerEXTEND_VECTOR_INREG(Op, DAG);
15069
15070 SDValue SrcOp = Op.getOperand(0);
15071 EVT SrcVT = SrcOp.getValueType();
15072 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
15073 "Unexpected extension factor.");
15074 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
15075 // FIXME: support multi-step zipping?
15076 if (Scale != 2)
15077 return SDValue();
15078 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
15079 return DAG.getBitcast(VT,
15080 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
15081}
15082
15083SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
15084 SelectionDAG &DAG) const {
15085 SDLoc DL(Op);
15086 EVT VT = Op.getValueType();
15087
15088 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
15089
15090 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15091 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
15092
15093 // Convert shuffles that are directly supported on NEON to target-specific
15094 // DAG nodes, instead of keeping them as shuffles and matching them again
15095 // during code selection. This is more efficient and avoids the possibility
15096 // of inconsistencies between legalization and selection.
15097 ArrayRef<int> ShuffleMask = SVN->getMask();
15098
15099 SDValue V1 = Op.getOperand(0);
15100 SDValue V2 = Op.getOperand(1);
15101
15102 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
15103 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
15104 "Unexpected VECTOR_SHUFFLE mask size!");
15105
15106 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
15107 return Res;
15108
15109 if (SVN->isSplat()) {
15110 int Lane = SVN->getSplatIndex();
15111 // If this is undef splat, generate it via "just" vdup, if possible.
15112 if (Lane == -1)
15113 Lane = 0;
15114
15115 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
15116 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
15117 V1.getOperand(0));
15118 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
15119 // constant. If so, we can just reference the lane's definition directly.
15120 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
15121 !isa<ConstantSDNode>(V1.getOperand(Lane)))
15122 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
15123
15124 // Otherwise, duplicate from the lane of the input vector.
15125 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
15126 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
15127 }
15128
15129 // Check if the mask matches a DUP for a wider element
15130 for (unsigned LaneSize : {64U, 32U, 16U}) {
15131 unsigned Lane = 0;
15132 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
15133 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
15134 : LaneSize == 32 ? AArch64ISD::DUPLANE32
15135 : AArch64ISD::DUPLANE16;
15136 // Cast V1 to an integer vector with required lane size
15137 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
15138 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
15139 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
15140 V1 = DAG.getBitcast(NewVecTy, V1);
15141 // Construct the DUP instruction
15142 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
15143 // Cast back to the original type
15144 return DAG.getBitcast(VT, V1);
15145 }
15146 }
15147
15148 unsigned NumElts = VT.getVectorNumElements();
15149 unsigned EltSize = VT.getScalarSizeInBits();
15150 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
15151 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
15152 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
15153 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
15154 if (isREVMask(ShuffleMask, EltSize, NumElts, 16)) {
15155 EVT VT = V1.getValueType();
15156 assert(VT == MVT::v8i8 || VT == MVT::v16i8);
15157 EVT BSVT = VT == MVT::v8i8 ? MVT::v4i16 : MVT::v8i16;
15158 return DAG.getNode(
15159 AArch64ISD::NVCAST, DL, VT,
15160 DAG.getNode(ISD::BSWAP, DL, BSVT,
15161 DAG.getNode(AArch64ISD::NVCAST, DL, BSVT, V1)));
15162 }
15163
15164 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
15165 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
15166 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
15167 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
15168 DAG.getConstant(8, DL, MVT::i32));
15169 }
15170
15171 bool IsSplat1 =
15172 V1.getValueType() == VT && DAG.isSplatValue(V1, /*AllowUndefs=*/false);
15173 bool IsSplat2 =
15174 V2.getValueType() == VT && DAG.isSplatValue(V2, /*AllowUndefs=*/false);
15175 for (unsigned SplatOperand : {0U, 1U}) {
15176 if ((SplatOperand == 0 && !IsSplat1) || (SplatOperand == 1 && !IsSplat2))
15177 continue;
15178
15179 bool ReverseSplatEXT = false;
15180 unsigned SplatImm;
15181 if (isEXTMaskWithSplat(ShuffleMask, VT, SplatOperand, ReverseSplatEXT,
15182 SplatImm)) {
15183 SDValue ExtOp1 = V1;
15184 SDValue ExtOp2 = V2;
15185 if (ReverseSplatEXT)
15186 std::swap(ExtOp1, ExtOp2);
15187 SplatImm *= getExtFactor(ExtOp1);
15188 return DAG.getNode(AArch64ISD::EXT, DL, VT, ExtOp1, ExtOp2,
15189 DAG.getConstant(SplatImm, DL, MVT::i32));
15190 }
15191 }
15192
15193 bool ReverseEXT = false;
15194 unsigned Imm;
15195 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
15196 if (ReverseEXT)
15197 std::swap(V1, V2);
15198 Imm *= getExtFactor(V1);
15199 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
15200 DAG.getConstant(Imm, DL, MVT::i32));
15201 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
15202 Imm *= getExtFactor(V1);
15203 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
15204 DAG.getConstant(Imm, DL, MVT::i32));
15205 }
15206
15207 unsigned WhichResult;
15208 unsigned OperandOrder;
15209 if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15210 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15211 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15212 OperandOrder == 0 ? V2 : V1);
15213 }
15214 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
15215 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15216 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
15217 }
15218 if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
15219 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15220 return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
15221 OperandOrder == 0 ? V2 : V1);
15222 }
15223
15224 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15225 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
15226 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15227 }
15228 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15229 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
15230 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15231 }
15232 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
15233 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
15234 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
15235 }
15236
15237 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
15238 return Concat;
15239
15240 bool DstIsLeft;
15241 int Anomaly;
15242 int NumInputElements = V1.getValueType().getVectorNumElements();
15243 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
15244 SDValue DstVec = DstIsLeft ? V1 : V2;
15245 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
15246
15247 SDValue SrcVec = V1;
15248 int SrcLane = ShuffleMask[Anomaly];
15249 if (SrcLane >= NumInputElements) {
15250 SrcVec = V2;
15251 SrcLane -= NumElts;
15252 }
15253 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
15254
15255 EVT ScalarVT = VT.getVectorElementType();
15256
15257 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
15258 ScalarVT = MVT::i32;
15259
15260 return DAG.getNode(
15261 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
15262 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
15263 DstLaneV);
15264 }
15265
15266 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
15267 return NewSD;
15268
15269 // If the shuffle is not directly supported and it has 4 elements, use
15270 // the PerfectShuffle-generated table to synthesize it from other shuffles.
15271 if (NumElts == 4) {
15272 unsigned PFIndexes[4];
15273 for (unsigned i = 0; i != 4; ++i) {
15274 if (ShuffleMask[i] < 0)
15275 PFIndexes[i] = 8;
15276 else
15277 PFIndexes[i] = ShuffleMask[i];
15278 }
15279
15280 // Compute the index in the perfect shuffle table.
15281 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
15282 PFIndexes[2] * 9 + PFIndexes[3];
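// For example, the mask <1,1,3,3> yields 1*729 + 1*81 + 3*9 + 3 == 840.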
15283 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
15284 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
15285 DL);
15286 }
15287
15288 // Check for a "select shuffle", generating a BSL to pick between lanes in
15289 // V1/V2.
15290 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
15291 assert(VT.getScalarSizeInBits() <= 32 &&
15292 "Expected larger vector element sizes to be handled already");
15293 SmallVector<SDValue> MaskElts;
15294 for (int M : ShuffleMask)
15295 MaskElts.push_back(DAG.getConstant(
15296 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
15297 EVT IVT = VT.changeVectorElementTypeToInteger();
15298 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
15299 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
15300 DAG.getBitcast(IVT, V1),
15301 DAG.getBitcast(IVT, V2)));
15302 }
15303
15304 // Fall back to generating a TBL
15305 return GenerateTBL(Op, ShuffleMask, DAG);
15306}
15307
15308SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
15309 SelectionDAG &DAG) const {
15310 EVT VT = Op.getValueType();
15311
15312 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
15313 return LowerToScalableOp(Op, DAG);
15314
15315 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
15316 "Unexpected vector type!");
15317
15318 // We can handle the constant cases during isel.
15319 if (isa<ConstantSDNode>(Op.getOperand(0)))
15320 return Op;
15321
15322 // There isn't a natural way to handle the general i1 case, so we use some
15323 // trickery with whilelo.
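// whilelo(0, X) makes lane i active iff i <u X, so the sign-extended splat
// value (-1 for true, 0 for false) yields an all-active or all-inactive
// predicate respectively.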
15324 SDLoc DL(Op);
15325 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
15326 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
15327 DAG.getValueType(MVT::i1));
15328 SDValue ID =
15329 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
15330 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
15331 if (VT == MVT::nxv1i1)
15332 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
15333 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
15334 Zero, SplatVal),
15335 Zero);
15336 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
15337}
15338
15339SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
15340 SelectionDAG &DAG) const {
15341 SDLoc DL(Op);
15342
15343 EVT VT = Op.getValueType();
15344 if (!isTypeLegal(VT) || !VT.isScalableVector())
15345 return SDValue();
15346
15347 // Current lowering only supports the SVE-ACLE types.
15348 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
15349 return SDValue();
15350
15351 // The DUPQ operation is independent of element type so normalise to i64s.
15352 SDValue Idx128 = Op.getOperand(2);
15353
15354 // DUPQ can be used when idx is in range.
15355 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
15356 if (CIdx && (CIdx->getZExtValue() <= 3)) {
15357 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
15358 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
15359 }
15360
15361 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
15362
15363 // The ACLE says this must produce the same result as:
15364 // svtbl(data, svadd_x(svptrue_b64(),
15365 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
15366 // index * 2))
15367 SDValue One = DAG.getConstant(1, DL, MVT::i64);
15368 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
15369
15370 // create the vector 0,1,0,1,...
15371 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
15372 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
15373
15374 // create the vector idx64,idx64+1,idx64,idx64+1,...
15375 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
15376 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
15377 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
15378
15379 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
15380 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
15381 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
15382}
15383
15384
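// Collects the defined and undef bits of a constant-splat BUILD_VECTOR,
// replicated across the full width of the vector (e.g. a v4i32 splat is
// repeated four times into the 128-bit CnstBits/UndefBits values).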
15385static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
15386 APInt &UndefBits) {
15387 EVT VT = BVN->getValueType(0);
15388 APInt SplatBits, SplatUndef;
15389 unsigned SplatBitSize;
15390 bool HasAnyUndefs;
15391 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
15392 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
15393
15394 for (unsigned i = 0; i < NumSplats; ++i) {
15395 CnstBits <<= SplatBitSize;
15396 UndefBits <<= SplatBitSize;
15397 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
15398 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
15399 }
15400
15401 return true;
15402 }
15403
15404 return false;
15405}
15406
15407// Try 64-bit splatted SIMD immediate.
15408static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15409 const APInt &Bits) {
15410 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15411 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15412 EVT VT = Op.getValueType();
15413 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
15414
15415 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
15416 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
15417
15418 SDLoc DL(Op);
15419 SDValue Mov =
15420 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15421 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15422 }
15423 }
15424
15425 return SDValue();
15426}
15427
15428// Try 32-bit splatted SIMD immediate.
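// That is, an 8-bit immediate placed in each 32-bit lane, optionally
// left-shifted by 0, 8, 16 or 24 bits (the Shift values selected below).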
15429static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15430 const APInt &Bits,
15431 const SDValue *LHS = nullptr) {
15432 EVT VT = Op.getValueType();
15433 if (VT.isFixedLengthVector() &&
15434 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
15435 return SDValue();
15436
15437 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15438 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15439 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15440 bool isAdvSIMDModImm = false;
15441 uint64_t Shift;
15442
15443 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
15444 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
15445 Shift = 0;
15446 }
15447 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
15448 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
15449 Shift = 8;
15450 }
15451 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
15452 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
15453 Shift = 16;
15454 }
15455 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
15456 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
15457 Shift = 24;
15458 }
15459
15460 if (isAdvSIMDModImm) {
15461 SDLoc DL(Op);
15462 SDValue Mov;
15463
15464 if (LHS)
15465 Mov = DAG.getNode(NewOp, DL, MovTy,
15466 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15467 DAG.getConstant(Value, DL, MVT::i32),
15468 DAG.getConstant(Shift, DL, MVT::i32));
15469 else
15470 Mov =
15471 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15472 DAG.getConstant(Shift, DL, MVT::i32));
15473
15474 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15475 }
15476 }
15477
15478 return SDValue();
15479}
15480
15481// Try 16-bit splatted SIMD immediate.
15482static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15483 const APInt &Bits,
15484 const SDValue *LHS = nullptr) {
15485 EVT VT = Op.getValueType();
15486 if (VT.isFixedLengthVector() &&
15487 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
15488 return SDValue();
15489
15490 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15491 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15492 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
15493 bool isAdvSIMDModImm = false;
15494 uint64_t Shift;
15495
15496 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
15497 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
15498 Shift = 0;
15499 }
15500 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
15501 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
15502 Shift = 8;
15503 }
15504
15505 if (isAdvSIMDModImm) {
15506 SDLoc DL(Op);
15507 SDValue Mov;
15508
15509 if (LHS)
15510 Mov = DAG.getNode(NewOp, DL, MovTy,
15511 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
15512 DAG.getConstant(Value, DL, MVT::i32),
15513 DAG.getConstant(Shift, DL, MVT::i32));
15514 else
15515 Mov =
15516 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15517 DAG.getConstant(Shift, DL, MVT::i32));
15518
15519 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15520 }
15521 }
15522
15523 return SDValue();
15524}
15525
15526// Try 32-bit splatted SIMD immediate with shifted ones.
15527static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
15528 SelectionDAG &DAG, const APInt &Bits) {
15529 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15530 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15531 EVT VT = Op.getValueType();
15532 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
15533 bool isAdvSIMDModImm = false;
15534 uint64_t Shift;
15535
15536 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
15537 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
15538 Shift = 264;
15539 }
15540 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
15541 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
15542 Shift = 272;
15543 }
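// The Shift values 264 and 272 are AArch64_AM shifter-immediate encodings
// corresponding to MSL #8 and MSL #16, the "shifted ones" forms.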
15544
15545 if (isAdvSIMDModImm) {
15546 SDLoc DL(Op);
15547 SDValue Mov =
15548 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
15549 DAG.getConstant(Shift, DL, MVT::i32));
15550 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15551 }
15552 }
15553
15554 return SDValue();
15555}
15556
15557// Try 8-bit splatted SIMD immediate.
15558static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15559 const APInt &Bits) {
15560 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15561 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15562 EVT VT = Op.getValueType();
15563 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
15564
15567
15568 SDLoc DL(Op);
15569 SDValue Mov =
15570 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15571 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15572 }
15573 }
15574
15575 return SDValue();
15576}
15577
15578// Try FP splatted SIMD immediate.
15579static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
15580 const APInt &Bits) {
15581 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
15582 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
15583 EVT VT = Op.getValueType();
15584 bool isWide = (VT.getSizeInBits() == 128);
15585 MVT MovTy;
15586 bool isAdvSIMDModImm = false;
15587
15588 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15589 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
15590 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15591 }
15592 else if (isWide &&
15593 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15594 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
15595 MovTy = MVT::v2f64;
15596 }
15597
15598 if (isAdvSIMDModImm) {
15599 SDLoc DL(Op);
15600 SDValue Mov =
15601 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15602 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15603 }
15604 }
15605
15606 return SDValue();
15607}
15608
15609// Specialized code to quickly find if PotentialBVec is a BuildVector that
15610// consists of only the same constant int value, returned in reference arg
15611// ConstVal
15612static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15613 uint64_t &ConstVal) {
15614 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15615 if (!Bvec)
15616 return false;
15617 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15618 if (!FirstElt)
15619 return false;
15620 EVT VT = Bvec->getValueType(0);
15621 unsigned NumElts = VT.getVectorNumElements();
15622 for (unsigned i = 1; i < NumElts; ++i)
15623 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15624 return false;
15625 ConstVal = FirstElt->getZExtValue();
15626 return true;
15627}
15628
15629static bool isAllInactivePredicate(SDValue N) {
15630 // Look through cast.
15631 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15632 N = N.getOperand(0);
15633
15634 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15635}
15636
15637static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15638 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15639
15640 // Look through cast.
15641 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15642 N = N.getOperand(0);
15643 // When reinterpreting from a type with fewer elements the "new" elements
15644 // are not active, so bail if they're likely to be used.
15645 if (N.getValueType().getVectorMinNumElements() < NumElts)
15646 return false;
15647 }
15648
15649 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15650 return true;
15651
15652 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15653 // or smaller than the implicit element type represented by N.
15654 // NOTE: A larger element count implies a smaller element type.
15655 if (N.getOpcode() == AArch64ISD::PTRUE &&
15656 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15657 return N.getValueType().getVectorMinNumElements() >= NumElts;
15658
15659 return false;
15660}
15661
15662// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
15663// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15664// BUILD_VECTORs with constant element C1, C2 is a constant, and:
15665// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15666// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15667// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
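// For example, with 32-bit elements, (or (and X, 0x00ffffff), (shl Y, 24))
// matches the SLI form with C2 == 24, since ~(Ones(32) << 24) == 0x00ffffff:
// the AND keeps exactly the bits the shifted-left insert does not overwrite.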
15668static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15669 EVT VT = N->getValueType(0);
15670
15671 if (!VT.isVector())
15672 return SDValue();
15673
15674 SDLoc DL(N);
15675
15676 SDValue And;
15677 SDValue Shift;
15678
15679 SDValue FirstOp = N->getOperand(0);
15680 unsigned FirstOpc = FirstOp.getOpcode();
15681 SDValue SecondOp = N->getOperand(1);
15682 unsigned SecondOpc = SecondOp.getOpcode();
15683
15684 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15685 // a BICi in order to use an immediate instead of a register.
15686 // Is the other operand a shl or lshr? This will have been turned into:
15687 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15688 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15689 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15690 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15691 SecondOpc == AArch64ISD::SHL_PRED ||
15692 SecondOpc == AArch64ISD::SRL_PRED)) {
15693 And = FirstOp;
15694 Shift = SecondOp;
15695
15696 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15697 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15698 FirstOpc == AArch64ISD::SHL_PRED ||
15699 FirstOpc == AArch64ISD::SRL_PRED)) {
15700 And = SecondOp;
15701 Shift = FirstOp;
15702 } else
15703 return SDValue();
15704
15705 bool IsAnd = And.getOpcode() == ISD::AND;
15706 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15707 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15708 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15709 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15710
15711 // Is the shift amount constant and are all lanes active?
15712 uint64_t C2;
15713 if (ShiftHasPredOp) {
15714 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15715 return SDValue();
15716 APInt C;
15717 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15718 return SDValue();
15719 C2 = C.getZExtValue();
15720 } else if (ConstantSDNode *C2node =
15721 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15722 C2 = C2node->getZExtValue();
15723 else
15724 return SDValue();
15725
15726 APInt C1AsAPInt;
15727 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15728 if (IsAnd) {
15729 // Is the and mask vector all constant?
15730 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15731 return SDValue();
15732 } else {
15733 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15734 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15735 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15736 assert(C1nodeImm && C1nodeShift);
15737 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15738 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15739 }
15740
15741 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15742 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15743 // how much one can shift elements of a particular size?
15744 if (C2 > ElemSizeInBits)
15745 return SDValue();
15746
15747 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15748 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15749 if (C1AsAPInt != RequiredC1)
15750 return SDValue();
15751
15752 SDValue X = And.getOperand(0);
15753 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15754 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15755 : Shift.getOperand(1);
15756
15757 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15758 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15759}
15760
15761static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15762 EVT VT = N->getValueType(0);
15763 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15764 SDLoc DL(N);
15765 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15766
15767 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15768 return SDValue();
15769
15770 SDValue N0 = N->getOperand(0);
15771 if (N0.getOpcode() != ISD::AND)
15772 return SDValue();
15773
15774 SDValue N1 = N->getOperand(1);
15775 if (N1.getOpcode() != ISD::AND)
15776 return SDValue();
15777
15778 // InstCombine does (not (neg a)) => (add a -1).
15779 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15780 // Loop over all combinations of AND operands.
15781 for (int i = 1; i >= 0; --i) {
15782 for (int j = 1; j >= 0; --j) {
15783 SDValue O0 = N0->getOperand(i);
15784 SDValue O1 = N1->getOperand(j);
15785 SDValue Sub, Add, SubSibling, AddSibling;
15786
15787 // Find a SUB and an ADD operand, one from each AND.
15788 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15789 Sub = O0;
15790 Add = O1;
15791 SubSibling = N0->getOperand(1 - i);
15792 AddSibling = N1->getOperand(1 - j);
15793 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15794 Add = O0;
15795 Sub = O1;
15796 AddSibling = N0->getOperand(1 - i);
15797 SubSibling = N1->getOperand(1 - j);
15798 } else
15799 continue;
15800
15801 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15802 continue;
15803
15804 // The all-ones constant is always the right-hand operand of the Add.
15805 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15806 continue;
15807
15808 if (Sub.getOperand(1) != Add.getOperand(0))
15809 continue;
15810
15811 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15812 }
15813 }
15814
15815 // (or (and a b) (and (not a) c)) => (bsl a b c)
15816 // We only have to look for constant vectors here since the general, variable
15817 // case can be handled in TableGen.
15818 unsigned Bits = VT.getScalarSizeInBits();
15819 for (int i = 1; i >= 0; --i)
15820 for (int j = 1; j >= 0; --j) {
15821 APInt Val1, Val2;
15822
15823 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15824 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15825 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15826 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15827 N0->getOperand(1 - i), N1->getOperand(1 - j));
15828 }
15829 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15830 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15831 if (!BVN0 || !BVN1)
15832 continue;
15833
15834 bool FoundMatch = true;
15835 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15836 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15837 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15838 if (!CN0 || !CN1 ||
15839 CN0->getAPIntValue().trunc(Bits) !=
15840 ~CN1->getAsAPIntVal().trunc(Bits)) {
15841 FoundMatch = false;
15842 break;
15843 }
15844 }
15845 if (FoundMatch)
15846 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15847 N0->getOperand(1 - i), N1->getOperand(1 - j));
15848 }
15849
15850 return SDValue();
15851}
15852
15853SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15854 SelectionDAG &DAG) const {
15855 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15856 !Subtarget->isNeonAvailable()))
15857 return LowerToScalableOp(Op, DAG);
15858
15859 if (SDValue Res = tryLowerToBSL(Op, DAG))
15860 return Res;
15861
15862 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15863 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15864 return Res;
15865
15866 EVT VT = Op.getValueType();
15867 if (VT.isScalableVector())
15868 return Op;
15869
15870 SDValue LHS = Op.getOperand(0);
15871 BuildVectorSDNode *BVN =
15872 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15873 if (!BVN) {
15874 // OR commutes, so try swapping the operands.
15875 LHS = Op.getOperand(1);
15876 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15877 }
15878 if (!BVN)
15879 return Op;
15880
15881 APInt DefBits(VT.getSizeInBits(), 0);
15882 APInt UndefBits(VT.getSizeInBits(), 0);
15883 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15884 SDValue NewOp;
15885
15886 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15887 DefBits, &LHS)) ||
15888 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15889 DefBits, &LHS)))
15890 return NewOp;
15891
15892 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15893 UndefBits, &LHS)) ||
15894 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15895 UndefBits, &LHS)))
15896 return NewOp;
15897 }
15898
15899 // We can always fall back to a non-immediate OR.
15900 return Op;
15901}
15902
15903// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15904// be truncated to fit element width.
15905static SDValue NormalizeBuildVector(SDValue Op,
15906 SelectionDAG &DAG) {
15907 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15908 SDLoc DL(Op);
15909 EVT VT = Op.getValueType();
15910 EVT EltTy= VT.getVectorElementType();
15911
15912 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15913 return Op;
15914
15915 SmallVector<SDValue, 16> Ops;
15916 for (SDValue Lane : Op->ops()) {
15917 // For integer vectors, type legalization would have promoted the
15918 // operands already. Otherwise, if Op is a floating-point splat
15919 // (with operands cast to integers), then the only possibilities
15920 // are constants and UNDEFs.
15921 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15922 Lane = DAG.getConstant(
15923 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15924 DL, MVT::i32);
15925 } else if (Lane.getOpcode() == ISD::POISON) {
15926 Lane = DAG.getPOISON(MVT::i32);
15927 } else if (Lane.getOpcode() == ISD::UNDEF) {
15928 Lane = DAG.getUNDEF(MVT::i32);
15929 } else {
15930 assert(Lane.getValueType() == MVT::i32 &&
15931 "Unexpected BUILD_VECTOR operand type");
15932 }
15933 Ops.push_back(Lane);
15934 }
15935 return DAG.getBuildVector(VT, DL, Ops);
15936}
15937
15938static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15939 const AArch64Subtarget *ST, APInt &DefBits) {
15940 EVT VT = Op.getValueType();
15941 // TODO: We should be able to support 64-bit destinations too
15942 if (!ST->hasSVE() || !VT.is128BitVector() ||
15943 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15944 return SDValue();
15945
15946 // See if we can make use of the SVE dup instruction.
15947 APInt Val64 = DefBits.trunc(64);
15948 int32_t ImmVal, ShiftVal;
15949 uint64_t Encoding;
15950 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal) &&
15951 !AArch64_AM::isSVELogicalImm(64, Val64.getZExtValue(), Encoding))
15952 return SDValue();
15953
15954 SDLoc DL(Op);
15955 SDValue SplatVal = DAG.getNode(AArch64ISD::DUP, DL, MVT::v2i64,
15956 DAG.getConstant(Val64, DL, MVT::i64));
15957 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, SplatVal);
15958}
15959
15960static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15961 const AArch64Subtarget *ST) {
15962 EVT VT = Op.getValueType();
15963 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15964 "Expected a legal NEON vector");
15965
15966 APInt DefBits(VT.getSizeInBits(), 0);
15967 APInt UndefBits(VT.getSizeInBits(), 0);
15968 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15969 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15970 auto TryMOVIWithBits = [&](APInt DefBits) {
15971 SDValue NewOp;
15972 if ((NewOp =
15973 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15974 (NewOp =
15975 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15976 (NewOp =
15977 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15978 (NewOp =
15979 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15980 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15981 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15982 return NewOp;
15983
15984 APInt NotDefBits = ~DefBits;
15985 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15986 NotDefBits)) ||
15987 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15988 NotDefBits)) ||
15989 (NewOp =
15990 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15991 return NewOp;
15992 return SDValue();
15993 };
15994 if (SDValue R = TryMOVIWithBits(DefBits))
15995 return R;
15996 if (SDValue R = TryMOVIWithBits(UndefBits))
15997 return R;
15998
15999 // Try to materialise the constant using SVE when available.
16000 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
16001 return R;
16002
16003 // See if a fneg of the constant can be materialized with a MOVI, etc
16004 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
16005 // FNegate each sub-element of the constant
16006 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
16007 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
16008 .zext(VT.getSizeInBits());
16009 APInt NegBits(VT.getSizeInBits(), 0);
16010 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
16011 for (unsigned i = 0; i < NumElts; i++)
16012 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
16013 NegBits = DefBits ^ NegBits;
16014
16015 // Try to create the new constants with MOVI, and if so generate a fneg
16016 // for it.
16017 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
16018 SDLoc DL(Op);
16019 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
16020 return DAG.getNode(
16021 AArch64ISD::NVCAST, DL, VT,
16022 DAG.getNode(ISD::FNEG, DL, VFVT,
16023 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
16024 }
16025 return SDValue();
16026 };
16027 SDValue R;
16028 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
16029 (R = TryWithFNeg(DefBits, MVT::f64)) ||
16030 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
16031 return R;
16032 }
16033
16034 return SDValue();
16035}
16036
16037SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
16038 SDValue Op, SelectionDAG &DAG) const {
16039 EVT VT = Op.getValueType();
16040 SDLoc DL(Op);
16041 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
16042 auto *BVN = cast<BuildVectorSDNode>(Op);
16043
16044 if (auto SeqInfo = BVN->isArithmeticSequence()) {
16045 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
16046 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
16047 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
16048 return convertFromScalableVector(DAG, VT, Seq);
16049 }
16050
16051 unsigned NumElems = VT.getVectorNumElements();
16052 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
16053 NumElems <= 1 || BVN->isConstant())
16054 return SDValue();
16055
16056 auto IsExtractElt = [](SDValue Op) {
16057 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
16058 };
16059
16060 // For integer types that are not already in vectors, limit to at most four
16061 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
16062 if (VT.getScalarType().isInteger() &&
16063 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
16064 return SDValue();
16065
16066 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
16067 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
16068 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
16069 Op->op_values(), [&, Poison = DAG.getPOISON(ContainerVT)](SDValue Op) {
16070 return Op.isUndef() ? Poison
16071 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
16072 ContainerVT, Poison, Op, ZeroI64);
16073 });
16074
16075 ElementCount ZipEC = ContainerVT.getVectorElementCount();
16076 while (Intermediates.size() > 1) {
16077 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
16078
16079 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
16080 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
16081 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
16082 Intermediates[I / 2] =
16083 Op1.isUndef() ? Op0
16084 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
16085 }
16086
16087 Intermediates.resize(Intermediates.size() / 2);
16088 ZipEC = ZipEC.divideCoefficientBy(2);
16089 }
16090
16091 assert(Intermediates.size() == 1);
16092 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
16093 return convertFromScalableVector(DAG, VT, Vec);
16094}
16095
16096SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
16097 SelectionDAG &DAG) const {
16098 EVT VT = Op.getValueType();
16099
16100 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16101 cast<BuildVectorSDNode>(Op)->isArithmeticSequence();
16102 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
16103 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
16104
16105 // Try to build a simple constant vector.
16106 Op = NormalizeBuildVector(Op, DAG);
16107 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
16108 // abort.
16109 if (Op.getOpcode() != ISD::BUILD_VECTOR)
16110 return SDValue();
16111
16112 // Certain vector constants, used to express things like logical NOT and
16113 // arithmetic NEG, are passed through unmodified. This allows special
16114 // patterns for these operations to match, which will lower these constants
16115 // to whatever is proven necessary.
16116 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
16117 if (BVN->isConstant()) {
16118 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
16119 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
16120 APInt Val(BitSize,
16121 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
16122 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
16123 return Op;
16124 }
16125 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
16126 if (Const->isZero() && !Const->isNegative())
16127 return Op;
16128 }
16129
16130 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
16131 return V;
16132
16133 // Scan through the operands to find some interesting properties we can
16134 // exploit:
16135 // 1) If only one value is used, we can use a DUP, or
16136 // 2) if only the low element is not undef, we can just insert that, or
16137 // 3) if only one constant value is used (w/ some non-constant lanes),
16138 // we can splat the constant value into the whole vector then fill
16139 // in the non-constant lanes.
16140 // 4) FIXME: If different constant values are used, but we can intelligently
16141 // select the values we'll be overwriting for the non-constant
16142 // lanes such that we can directly materialize the vector
16143 // some other way (MOVI, e.g.), we can be sneaky.
16144 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
16145 SDLoc DL(Op);
16146 unsigned NumElts = VT.getVectorNumElements();
16147 bool isOnlyLowElement = true;
16148 bool usesOnlyOneValue = true;
16149 bool usesOnlyOneConstantValue = true;
16150 bool isConstant = true;
16151 bool AllLanesExtractElt = true;
16152 unsigned NumConstantLanes = 0;
16153 unsigned NumDifferentLanes = 0;
16154 unsigned NumUndefLanes = 0;
16155 SDValue Value;
16156 SDValue ConstantValue;
16157 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
16158 unsigned ConsecutiveValCount = 0;
16159 SDValue PrevVal;
16160 auto IsZero = [&](SDValue V) {
16161 return isNullConstant(V) || isNullFPConstant(V);
16162 };
16163 bool MaybeLowHalfZeroHigh =
16164 VT.isFixedLengthVector() && VT.getSizeInBits() == 128;
16165 unsigned HalfElts = MaybeLowHalfZeroHigh ? (NumElts >> 1) : 0;
16166 SDValue LowHalfFirstVal = MaybeLowHalfZeroHigh ? Op.getOperand(0) : SDValue();
16167 for (unsigned i = 0; i < NumElts; ++i) {
16168 SDValue V = Op.getOperand(i);
16169 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16170 AllLanesExtractElt = false;
16171 if (V.isUndef()) {
16172 ++NumUndefLanes;
16173 MaybeLowHalfZeroHigh = false;
16174 continue;
16175 }
16176 if (i > 0)
16177 isOnlyLowElement = false;
16178 if (!isIntOrFPConstant(V))
16179 isConstant = false;
16180
16181 if (isIntOrFPConstant(V)) {
16182 ++NumConstantLanes;
16183 if (!ConstantValue.getNode())
16184 ConstantValue = V;
16185 else if (ConstantValue != V)
16186 usesOnlyOneConstantValue = false;
16187 }
16188
16189 if (!Value.getNode())
16190 Value = V;
16191 else if (V != Value) {
16192 usesOnlyOneValue = false;
16193 ++NumDifferentLanes;
16194 }
16195
16196 if (PrevVal != V) {
16197 ConsecutiveValCount = 0;
16198 PrevVal = V;
16199 }
16200 if (MaybeLowHalfZeroHigh) {
16201 if (i < HalfElts) {
16202 if (V != LowHalfFirstVal)
16203 MaybeLowHalfZeroHigh = false;
16204 } else if (!IsZero(V)) {
16205 MaybeLowHalfZeroHigh = false;
16206 }
16207 }
16208
16209 // Keep each different value and its latest consecutive count. For example,
16210 //
16211 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16212 // t24, t24, t24, t24, t24, t24, t24, t24
16213 // t23 = consecutive count 8
16214 // t24 = consecutive count 8
16215 // ------------------------------------------------------------------
16216 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
16217 // t24, t24, t24, t24, t24, t24, t24, t24
16218 // t23 = consecutive count 5
16219 // t24 = consecutive count 9
16220 DifferentValueMap[V] = ++ConsecutiveValCount;
16221 }
16222
16223 if (!Value.getNode()) {
16224 LLVM_DEBUG(
16225 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
16226 return DAG.getUNDEF(VT);
16227 }
16228
16229 // Convert BUILD_VECTOR where all elements but the lowest are undef into
16230 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
16231 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
16232 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
16233 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
16234 "SCALAR_TO_VECTOR node\n");
16235 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
16236 }
16237
16238 if (MaybeLowHalfZeroHigh && LowHalfFirstVal.getNode() &&
16239 !LowHalfFirstVal.isUndef() && !isIntOrFPConstant(LowHalfFirstVal)) {
16240 EVT LaneVT = VT.getVectorElementType();
16241 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16242
16243 SDValue HiZero = LaneVT.isInteger() ? DAG.getConstant(0, DL, HalfVT)
16244 : DAG.getConstantFP(0.0, DL, HalfVT);
16245
16246 SDValue LoHalf =
16247 LaneVT.getSizeInBits() == 64
16248 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, HalfVT, LowHalfFirstVal)
16249 : DAG.getNode(AArch64ISD::DUP, DL, HalfVT, LowHalfFirstVal);
16250
16251 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoHalf, HiZero);
16252 }
16253
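 // Illustrative example (added comment, not in the upstream source): when every
 // lane is an extract from the same wider vector at even or odd indices, the
 // build_vector is a de-interleave of that vector's halves, e.g.
 //   t10: v4i16 = build_vector (extract t0, 0), (extract t0, 2),
 //                             (extract t0, 4), (extract t0, 6)
 //   ==> uzp1 (low half of t0), (high half of t0)
 // and the odd-index form maps to uzp2 instead.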
16254 if (AllLanesExtractElt) {
16255 SDNode *Vector = nullptr;
16256 bool Even = false;
16257 bool Odd = false;
16258 // Check whether the extract elements match the Even pattern <0,2,4,...> or
16259 // the Odd pattern <1,3,5,...>.
16260 for (unsigned i = 0; i < NumElts; ++i) {
16261 SDValue V = Op.getOperand(i);
16262 const SDNode *N = V.getNode();
16263 if (!isa<ConstantSDNode>(N->getOperand(1))) {
16264 Even = false;
16265 Odd = false;
16266 break;
16267 }
16268 SDValue N0 = N->getOperand(0);
16269
16270 // All elements are extracted from the same vector.
16271 if (!Vector) {
16272 Vector = N0.getNode();
16273 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
16274 // BUILD_VECTOR.
16275 if (VT.getVectorElementType() !=
16276 N0.getValueType().getVectorElementType())
16277 break;
16278 } else if (Vector != N0.getNode()) {
16279 Odd = false;
16280 Even = false;
16281 break;
16282 }
16283
16284 // Extracted values are either at Even indices <0,2,4,...> or at Odd
16285 // indices <1,3,5,...>.
16286 uint64_t Val = N->getConstantOperandVal(1);
16287 if (Val == 2 * i) {
16288 Even = true;
16289 continue;
16290 }
16291 if (Val - 1 == 2 * i) {
16292 Odd = true;
16293 continue;
16294 }
16295
16296 // Something does not match: abort.
16297 Odd = false;
16298 Even = false;
16299 break;
16300 }
16301 if (Even || Odd) {
16302 SDValue LHS =
16303 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16304 DAG.getConstant(0, DL, MVT::i64));
16305 SDValue RHS =
16306 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
16307 DAG.getConstant(NumElts, DL, MVT::i64));
16308
16309 if (Even && !Odd)
16310 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
16311 if (Odd && !Even)
16312 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
16313 }
16314 }
16315
16316 // Use DUP for non-constant splats. For f32 constant splats, reduce to
16317 // i32 and try again.
16318 if (usesOnlyOneValue) {
16319 if (!isConstant) {
16320 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
16321 Value.getValueType() != VT) {
16322 LLVM_DEBUG(
16323 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
16324 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
16325 }
16326
16327 // This is actually a DUPLANExx operation, which keeps everything vectory.
16328
16329 SDValue Lane = Value.getOperand(1);
16330 Value = Value.getOperand(0);
16331 if (Value.getValueSizeInBits() == 64) {
16332 LLVM_DEBUG(
16333 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
16334 "widening it\n");
16335 Value = WidenVector(Value, DAG);
16336 }
16337
16338 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
16339 return DAG.getNode(Opcode, DL, VT, Value, Lane);
16340 }
16341
16342 if (VT.getVectorElementType().isFloatingPoint()) {
16343 SmallVector<SDValue, 8> Ops;
16344 EVT EltTy = VT.getVectorElementType();
16345 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
16346 EltTy == MVT::f64) && "Unsupported floating-point vector type");
16347 LLVM_DEBUG(
16348 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
16349 "BITCASTS, and try again\n");
16350 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
16351 for (unsigned i = 0; i < NumElts; ++i)
16352 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
16353 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
16354 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
16355 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
16356 Val.dump(););
16357 Val = LowerBUILD_VECTOR(Val, DAG);
16358 if (Val.getNode())
16359 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
16360 }
16361 }
16362
16363 // If we need to insert a small number of different non-constant elements and
16364 // the vector width is sufficiently large, prefer using DUP with the common
16365 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
16366 // skip the constant lane handling below.
16367 bool PreferDUPAndInsert =
16368 !isConstant && NumDifferentLanes >= 1 &&
16369 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
16370 NumDifferentLanes >= NumConstantLanes;
16371
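 // Illustrative example (added comment): for something like
 //   t10: v8i8 = build_vector t0, t0, t0, t0, t0, t0, t0, t1
 // with non-constant operands, PreferDUPAndInsert is set, so t0 is splatted with
 // DUP and only the single differing lane is patched with INSERT_VECTOR_ELT,
 // rather than inserting all eight lanes one by one.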
16372 // If only one constant value was used, and it was used for more than one lane,
16373 // start by splatting that value, then replace the non-constant lanes. This
16374 // is better than the default, which will perform a separate initialization
16375 // for each lane.
16376 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
16377 // Firstly, try to materialize the splat constant.
16378 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
16379 unsigned BitSize = VT.getScalarSizeInBits();
16380 APInt ConstantValueAPInt(1, 0);
16381 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
16382 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
16383 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
16384 !ConstantValueAPInt.isAllOnes()) {
16385 Val = ConstantBuildVector(Val, DAG, Subtarget);
16386 if (!Val)
16387 // Otherwise, materialize the constant and splat it.
16388 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
16389 }
16390
16391 // Now insert the non-constant lanes.
16392 for (unsigned i = 0; i < NumElts; ++i) {
16393 SDValue V = Op.getOperand(i);
16394 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16395 if (!isIntOrFPConstant(V) && !V.isUndef())
16396 // Note that type legalization likely mucked about with the VT of the
16397 // source operand, so we may have to convert it here before inserting.
16398 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
16399 }
16400 return Val;
16401 }
16402
16403 // Handle 64-bit constant BUILD_VECTORs by packing them into an i64 immediate.
16404 // This is cheaper than a load if the immediate can be materialized in a few
16405 // mov instructions. This optimization is disabled for big-endian targets for
16406 // now.
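 // Illustrative example (added comment): a v4i16 constant build_vector
 // <1, 2, 3, 4> packs little-endian into the i64 0x0004000300020001; if
 // AArch64_IMM::expandMOVImm can materialize that value in at most two
 // instructions, the vector is emitted as that scalar constant followed by a
 // bitcast back to v4i16.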
16407 if (BVN->isConstant() && VT.isFixedLengthVector() &&
16408 VT.getSizeInBits() == 64 && !DAG.getDataLayout().isBigEndian()) {
16409 const SDLoc DL(Op);
16410 APInt PackedVal(64, 0);
16411 unsigned BitPos = 0;
16412
16413 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16414 for (unsigned i = 0, e = BVN->getNumOperands(); i != e; ++i) {
16415 const SDValue &LaneOp = BVN->getOperand(i);
16416 APInt LaneBits;
16417 if (LaneOp.getOpcode() == ISD::UNDEF)
16418 LaneBits = APInt(EltSizeInBits, 0);
16419 else if (auto *C = dyn_cast<ConstantSDNode>(LaneOp))
16420 LaneBits = C->getAPIntValue();
16421 else if (auto *CFP = dyn_cast<ConstantFPSDNode>(LaneOp))
16422 LaneBits = CFP->getValueAPF().bitcastToAPInt();
16423 else
16424 return SDValue();
16425
16426 PackedVal |= LaneBits.trunc(VT.getScalarSizeInBits()).zext(64) << BitPos;
16427 BitPos += EltSizeInBits;
16428 }
16429
16430 // This optimization only kicks in if materializing the packed immediate
16431 // needs at most 2 mov instructions.
16432 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insns;
16433 AArch64_IMM::expandMOVImm(PackedVal.getZExtValue(), 64, Insns);
16434 if (Insns.size() > 2)
16435 return SDValue();
16436
16437 SDValue ScalarConst = DAG.getConstant(PackedVal, DL, MVT::i64);
16438 // Use BITCAST to reinterpret the scalar constant's bits as a vector.
16439 return DAG.getNode(ISD::BITCAST, DL, VT, ScalarConst);
16440 }
16441
16442 // This will generate a load from the constant pool.
16443 if (isConstant) {
16444 LLVM_DEBUG(
16445 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
16446 "expansion\n");
16447 return SDValue();
16448 }
16449
16450 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
16451 // v4i32s. This is really a truncate, which we can construct out of (legal)
16452 // concats and truncate nodes.
16453 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
16454 return M;
16455
16456 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
16457 if (NumElts >= 4) {
16458 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
16459 return Shuffle;
16460
16461 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
16462 return Shuffle;
16463 }
16464
16465 if (PreferDUPAndInsert) {
16466 // First, build a constant vector with the common element.
16467 SmallVector<SDValue, 8> Ops(NumElts, Value);
16468 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
16469 // Next, insert the elements that do not match the common value.
16470 for (unsigned I = 0; I < NumElts; ++I)
16471 if (Op.getOperand(I) != Value)
16472 NewVector =
16473 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
16474 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
16475
16476 return NewVector;
16477 }
16478
16479 // If vector consists of two different values, try to generate two DUPs and
16480 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
16481 if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
16482 SmallVector<SDValue, 2> Vals;
16483 // Check that each value's consecutive count is half the number of vector
16484 // elements. In this case, we can use CONCAT_VECTORS. For example,
16485 //
16486 // canUseVECTOR_CONCAT = true;
16487 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
16488 // t24, t24, t24, t24, t24, t24, t24, t24
16489 //
16490 // canUseVECTOR_CONCAT = false;
16491 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
16492 // t24, t24, t24, t24, t24, t24, t24, t24
16493 bool canUseVECTOR_CONCAT = true;
16494 for (auto Pair : DifferentValueMap) {
16495 // Check different values have same length which is NumElts / 2.
16496 if (Pair.second != NumElts / 2)
16497 canUseVECTOR_CONCAT = false;
16498 Vals.push_back(Pair.first);
16499 }
16500
16501 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
16502 // CONCAT_VECTORs. For example,
16503 //
16504 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
16505 // t24, t24, t24, t24, t24, t24, t24, t24
16506 // ==>
16507 // t26: v8i8 = AArch64ISD::DUP t23
16508 // t28: v8i8 = AArch64ISD::DUP t24
16509 // t29: v16i8 = concat_vectors t26, t28
16510 if (canUseVECTOR_CONCAT) {
16511 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16512 if (isTypeLegal(SubVT) && SubVT.isVector() &&
16513 SubVT.getVectorNumElements() >= 2) {
16514 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
16515 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
16516 SDValue DUP1 =
16517 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
16518 SDValue DUP2 =
16519 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
16520 SDValue CONCAT_VECTORS =
16521 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
16522 return CONCAT_VECTORS;
16523 }
16524 }
16525
16526 // Let's try to generate VECTOR_SHUFFLE. For example,
16527 //
16528 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
16529 // ==>
16530 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
16531 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
16532 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
16533 if (NumElts >= 8) {
16534 SmallVector<int, 16> MaskVec;
16535 // Build mask for VECTOR_SHUFFLE.
16536 SDValue FirstLaneVal = Op.getOperand(0);
16537 for (unsigned i = 0; i < NumElts; ++i) {
16538 SDValue Val = Op.getOperand(i);
16539 if (FirstLaneVal == Val)
16540 MaskVec.push_back(i);
16541 else
16542 MaskVec.push_back(i + NumElts);
16543 }
16544
16545 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
16546 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
16547 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
16548 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
16549 SDValue VECTOR_SHUFFLE =
16550 DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
16551 return VECTOR_SHUFFLE;
16552 }
16553 }
16554
16555 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
16556 // know the default expansion would otherwise fall back on something even
16557 // worse. For a vector with one or two non-undef values, that's
16558 // scalar_to_vector for the elements followed by a shuffle (provided the
16559 // shuffle is valid for the target) and materialization element by element
16560 // on the stack followed by a load for everything else.
16561 if (!isConstant && !usesOnlyOneValue) {
16562 LLVM_DEBUG(
16563 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
16564 "of INSERT_VECTOR_ELT\n");
16565
16566 SDValue Vec = DAG.getPOISON(VT);
16567 SDValue Op0 = Op.getOperand(0);
16568 unsigned i = 0;
16569
16570 // Use SCALAR_TO_VECTOR for lane zero to
16571 // a) Avoid a RMW dependency on the full vector register, and
16572 // b) Allow the register coalescer to fold away the copy if the
16573 // value is already in an S or D register, and we're forced to emit an
16574 // INSERT_SUBREG that we can't fold anywhere.
16575 //
16576 // We also allow types like i8 and i16 which are illegal scalar but legal
16577 // vector element types. After type-legalization the inserted value is
16578 // extended (i32) and it is safe to cast them to the vector type by ignoring
16579 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
16580 if (!Op0.isUndef()) {
16581 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
16582 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
16583 ++i;
16584 }
16585 LLVM_DEBUG({
16586 if (i < NumElts)
16587 dbgs() << "Creating nodes for the other vector elements:\n";
16588 });
16589 for (; i < NumElts; ++i) {
16590 SDValue V = Op.getOperand(i);
16591 if (V.isUndef())
16592 continue;
16593 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
16594 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
16595 }
16596 return Vec;
16597 }
16598
16599 LLVM_DEBUG(
16600 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
16601 "better alternative\n");
16602 return SDValue();
16603}
16604
16605SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
16606 SelectionDAG &DAG) const {
16607 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16608 !Subtarget->isNeonAvailable()))
16609 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
16610
16611 assert(Op.getValueType().isScalableVector() &&
16612 isTypeLegal(Op.getValueType()) &&
16613 "Expected legal scalable vector type!");
16614
16615 if (isTypeLegal(Op.getOperand(0).getValueType())) {
16616 unsigned NumOperands = Op->getNumOperands();
16617 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
16618 "Unexpected number of operands in CONCAT_VECTORS");
16619
16620 if (NumOperands == 2)
16621 return Op;
16622
16623 // Concat each pair of subvectors and pack into the lower half of the array.
16624 SmallVector<SDValue> ConcatOps(Op->ops());
16625 while (ConcatOps.size() > 1) {
16626 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
16627 SDValue V1 = ConcatOps[I];
16628 SDValue V2 = ConcatOps[I + 1];
16629 EVT SubVT = V1.getValueType();
16630 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
16631 ConcatOps[I / 2] =
16632 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
16633 }
16634 ConcatOps.resize(ConcatOps.size() / 2);
16635 }
16636 return ConcatOps[0];
16637 }
16638
16639 return SDValue();
16640}
16641
16642SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
16643 SelectionDAG &DAG) const {
16644 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
16645
16646 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
16647 !Subtarget->isNeonAvailable()))
16648 return LowerFixedLengthInsertVectorElt(Op, DAG);
16649
16650 EVT VT = Op.getValueType();
16651 SDValue Vec = Op.getOperand(0);
16652 SDValue Elt = Op.getOperand(1);
16653 SDValue Idx = Op.getOperand(2);
16654
16655 if (VT.getScalarType() == MVT::i1) {
16656 SDLoc DL(Op);
16657
16658 // MVT::nxv1i8 is not a legal type so widen->insert->shrink instead.
16659 if (VT == MVT::nxv1i1) {
16660 SDValue WidenVec =
16661 DAG.getInsertSubvector(DL, DAG.getPOISON(MVT::nxv2i1), Vec, 0);
16662 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::nxv2i1,
16663 WidenVec, Elt, Idx);
16664 return DAG.getExtractSubvector(DL, MVT::nxv1i1, Insert, 0);
16665 }
16666
16667 EVT PromoteVT = getPromotedVTForPredicate(VT);
16668 SDValue PromoteVec = DAG.getNode(ISD::ANY_EXTEND, DL, PromoteVT, Vec);
16669 if (PromoteVT.getVectorElementType() == MVT::i64)
16670 Elt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Elt);
16671 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, PromoteVT,
16672 PromoteVec, Elt, Idx);
16673 return DAG.getNode(ISD::TRUNCATE, DL, VT, Insert);
16674 }
16675
16676 // Check for non-constant or out of range lane.
16677 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Idx);
16678 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16679 return SDValue();
16680
16681 return Op;
16682}
16683
16684SDValue
16685AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16686 SelectionDAG &DAG) const {
16687 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16688 EVT VT = Op.getOperand(0).getValueType();
16689
16690 if (VT.getScalarType() == MVT::i1) {
16691 SDLoc DL(Op);
16692 // There are no operations to extend an nxv1i1 predicate to an nxv1i128 vector.
16693 // An easy lowering is widening the input predicate to nxv2i1.
16694 if (VT == MVT::nxv1i1) {
16695 SDValue WidenedPred = DAG.getInsertSubvector(
16696 DL, DAG.getPOISON(MVT::nxv2i1), Op->getOperand(0), 0);
16697 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
16698 WidenedPred, Op.getOperand(1));
16699 }
16700 // We can't directly extract from an SVE predicate; extend it first.
16701 // (This isn't the only possible lowering, but it's straightforward.)
16702 EVT VectorVT = getPromotedVTForPredicate(VT);
16703 SDValue Extend =
16704 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16705 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16706 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16707 Extend, Op.getOperand(1));
16708 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16709 }
16710
16711 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16712 return LowerFixedLengthExtractVectorElt(Op, DAG);
16713
16714 // Check for non-constant or out of range lane.
16715 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16716 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16717 return SDValue();
16718
16719 // Insertion/extraction are legal for V128 types.
16720 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16721 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16722 VT == MVT::v8f16 || VT == MVT::v8bf16)
16723 return Op;
16724
16725 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16726 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16727 VT != MVT::v4bf16)
16728 return SDValue();
16729
16730 // For V64 types, we perform extraction by expanding the value
16731 // to a V128 type and perform the extraction on that.
16732 SDLoc DL(Op);
16733 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16734 EVT WideTy = WideVec.getValueType();
16735
16736 EVT ExtrTy = WideTy.getVectorElementType();
16737 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16738 ExtrTy = MVT::i32;
16739
16740 // For extractions, we just return the result directly.
16741 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16742 Op.getOperand(1));
16743}
16744
16745SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16746 SelectionDAG &DAG) const {
16747 EVT VT = Op.getValueType();
16748 assert(VT.isFixedLengthVector() &&
16749 "Only cases that extract a fixed length vector are supported!");
16750 EVT InVT = Op.getOperand(0).getValueType();
16751
16752 // If we don't have legal types yet, do nothing
16753 if (!isTypeLegal(InVT))
16754 return SDValue();
16755
16756 if (InVT.is128BitVector()) {
16757 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16758 unsigned Idx = Op.getConstantOperandVal(1);
16759
16760 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16761 if (Idx == 0)
16762 return Op;
16763
16764 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16765 // that directly.
16766 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16767 return Op;
16768 }
16769
16770 if (InVT.isScalableVector() ||
16771 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16772 SDLoc DL(Op);
16773 SDValue Vec = Op.getOperand(0);
16774 SDValue Idx = Op.getOperand(1);
16775
16776 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16777 if (PackedVT != InVT) {
16778 // Pack input into the bottom part of an SVE register and try again.
16779 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16780 DAG.getPOISON(PackedVT), Vec,
16781 DAG.getVectorIdxConstant(0, DL));
16782 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16783 }
16784
16785 // This will get matched by custom code during ISelDAGToDAG.
16786 if (isNullConstant(Idx))
16787 return Op;
16788
16789 assert(InVT.isScalableVector() && "Unexpected vector type!");
16790 // Move requested subvector to the start of the vector and try again.
16791 SDValue Splice =
16792 DAG.getNode(ISD::VECTOR_SPLICE_LEFT, DL, InVT, Vec, Vec, Idx);
16793 return convertFromScalableVector(DAG, VT, Splice);
16794 }
16795
16796 return SDValue();
16797}
16798
16799SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16800 SelectionDAG &DAG) const {
16801 assert(Op.getValueType().isScalableVector() &&
16802 "Only expect to lower inserts into scalable vectors!");
16803
16804 EVT InVT = Op.getOperand(1).getValueType();
16805 unsigned Idx = Op.getConstantOperandVal(2);
16806
16807 SDValue Vec0 = Op.getOperand(0);
16808 SDValue Vec1 = Op.getOperand(1);
16809 SDLoc DL(Op);
16810 EVT VT = Op.getValueType();
16811
16812 if (InVT.isScalableVector()) {
16813 if (!isTypeLegal(VT))
16814 return SDValue();
16815
16816 // Break down insert_subvector into simpler parts.
16817 if (VT.getVectorElementType() == MVT::i1) {
16818 unsigned NumElts = VT.getVectorMinNumElements();
16819 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16820
16821 SDValue Lo, Hi;
16822 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16823 DAG.getVectorIdxConstant(0, DL));
16824 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16825 DAG.getVectorIdxConstant(NumElts / 2, DL));
16826 if (Idx < (NumElts / 2))
16827 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16828 DAG.getVectorIdxConstant(Idx, DL));
16829 else
16830 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16831 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16832
16833 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16834 }
16835
16836 // We can select these directly.
16837 if (isTypeLegal(InVT) && Vec0.isUndef())
16838 return Op;
16839
16840 // Ensure the subvector is half the size of the main vector.
16841 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16842 return SDValue();
16843
16844 // Here narrow and wide refers to the vector element types. After "casting"
16845 // both vectors must have the same bit length and so because the subvector
16846 // has fewer elements, those elements need to be bigger.
16847 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16848 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16849
16850 // NOP cast operands to the largest legal vector of the same element count.
16851 if (VT.isFloatingPoint()) {
16852 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16853 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16854 } else {
16855 // Legal integer vectors are already their largest so Vec0 is fine as is.
16856 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16857 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16858 }
16859
16860 // To replace the top/bottom half of vector V with vector SubV we widen the
16861 // preserved half of V, concatenate this to SubV (the order depending on the
16862 // half being replaced) and then narrow the result.
16863 SDValue Narrow;
16864 if (Idx == 0) {
16865 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16866 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16867 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16868 } else {
16869 assert(Idx == InVT.getVectorMinNumElements() &&
16870 "Invalid subvector index!");
16871 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16872 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16873 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16874 }
16875
16876 return getSVESafeBitCast(VT, Narrow, DAG);
16877 }
16878
16879 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16880 // This will be matched by custom code during ISelDAGToDAG.
16881 if (Vec0.isUndef())
16882 return Op;
16883
16884 std::optional<unsigned> PredPattern =
16885 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16886 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
16887 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16888 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16889 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16890 }
16891
16892 return SDValue();
16893}
16894
16895static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16896 if (Op.getOpcode() != AArch64ISD::DUP &&
16897 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16898 Op.getOpcode() != ISD::BUILD_VECTOR)
16899 return false;
16900
16901 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16902 !isAllConstantBuildVector(Op, SplatVal))
16903 return false;
16904
16905 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16906 !isa<ConstantSDNode>(Op->getOperand(0)))
16907 return false;
16908
16909 SplatVal = Op->getConstantOperandVal(0);
16910 if (Op.getValueType().getVectorElementType() != MVT::i64)
16911 SplatVal = (int32_t)SplatVal;
16912
16913 Negated = false;
16914 if (isPowerOf2_64(SplatVal))
16915 return true;
16916
16917 Negated = true;
16918 if (isPowerOf2_64(-SplatVal)) {
16919 SplatVal = -SplatVal;
16920 return true;
16921 }
16922
16923 return false;
16924}
16925
16926SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16927 EVT VT = Op.getValueType();
16928 SDLoc DL(Op);
16929
16930 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16931 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16932
16933 unsigned Opc = Op.getOpcode();
16934 assert((Opc == ISD::SDIV || Opc == ISD::UDIV) && "Expected a DIV opcode.");
16935 assert(VT.isScalableVector() && "Expected a scalable vector.");
16936 bool Signed = Opc == ISD::SDIV;
16937
16938 bool Negated;
16939 uint64_t SplatVal;
16940 // NOTE: SRAD cannot be used to represent sdiv-by-one.
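 // Illustrative example (added comment): an nxv4i32 sdiv by a splat of 4 becomes
 // ASRD #2 under an all-active predicate; dividing by -4 uses the same ASRD
 // followed by a subtraction from zero to negate the result.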
16941 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16942 SplatVal > 1) {
16943 SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16944 SDValue Res =
16945 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16946 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16947 if (Negated)
16948 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16949
16950 return Res;
16951 }
16952
16953 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64) {
16954 unsigned MaskedOpcode = Signed ? ISD::MASKED_SDIV : ISD::MASKED_UDIV;
16955 return DAG.getNode(MaskedOpcode, DL, VT, Op.getOperand(0), Op.getOperand(1),
16956 getPredicateForVector(DAG, DL, VT));
16957 }
16958
16959 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16960 // operations, and truncate the result.
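 // Illustrative example (added comment): an nxv16i8 division is unpacked with
 // [S|U]UNPKLO/HI into two nxv8i16 halves, each half is divided (recursively
 // widened again until a legal nxv4i32 DIV is reached), and the two results are
 // narrowed back together with UZP1.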
16961 EVT WidenedVT;
16962 if (VT == MVT::nxv16i8)
16963 WidenedVT = MVT::nxv8i16;
16964 else if (VT == MVT::nxv8i16)
16965 WidenedVT = MVT::nxv4i32;
16966 else
16967 llvm_unreachable("Unexpected Custom DIV operation");
16968
16969 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16970 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16971 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16972 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16973 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16974 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16975 SDValue ResultLo = DAG.getNode(Opc, DL, WidenedVT, Op0Lo, Op1Lo);
16976 SDValue ResultHi = DAG.getNode(Opc, DL, WidenedVT, Op0Hi, Op1Hi);
16977 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16978 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16979 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16980}
16981
16982bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16983 EVT VT, unsigned DefinedValues) const {
16984 if (!Subtarget->isNeonAvailable())
16985 return false;
16986 return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
16987}
16988
16989 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16990 // Currently no fixed length shuffles that require SVE are legal.
16991 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16992 return false;
16993
16994 if (VT.getVectorNumElements() == 4 &&
16995 (VT.is128BitVector() || VT.is64BitVector())) {
16996 unsigned Cost = getPerfectShuffleCost(M);
16997 if (Cost <= 1)
16998 return true;
16999 }
17000
17001 bool DummyBool;
17002 int DummyInt;
17003 unsigned DummyUnsigned;
17004
17005 unsigned EltSize = VT.getScalarSizeInBits();
17006 unsigned NumElts = VT.getVectorNumElements();
17007 return (ShuffleVectorSDNode::isSplatMask(M) ||
17008 isREVMask(M, EltSize, NumElts, 64) ||
17009 isREVMask(M, EltSize, NumElts, 32) ||
17010 isREVMask(M, EltSize, NumElts, 16) ||
17011 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
17012 isSingletonEXTMask(M, VT, DummyUnsigned) ||
17013 isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
17014 isUZPMask(M, NumElts, DummyUnsigned) ||
17015 isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
17016 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
17017 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
17018 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
17019 isINSMask(M, NumElts, DummyBool, DummyInt) ||
17020 isConcatMask(M, VT, VT.getSizeInBits() == 128));
17021}
17022
17023 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
17024 EVT VT) const {
17025 // Just delegate to the generic legality, clear masks aren't special.
17026 return isShuffleMaskLegal(M, VT);
17027}
17028
17029/// getVShiftImm - Check if this is a valid build_vector for the immediate
17030/// operand of a vector shift operation, where all the elements of the
17031/// build_vector must have the same constant integer value.
17032static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
17033 // Ignore bit_converts.
17034 while (Op.getOpcode() == ISD::BITCAST)
17035 Op = Op.getOperand(0);
17036 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
17037 APInt SplatBits, SplatUndef;
17038 unsigned SplatBitSize;
17039 bool HasAnyUndefs;
17040 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
17041 HasAnyUndefs, ElementBits) ||
17042 SplatBitSize > ElementBits)
17043 return false;
17044 Cnt = SplatBits.getSExtValue();
17045 return true;
17046}
17047
17048/// isVShiftLImm - Check if this is a valid build_vector for the immediate
17049/// operand of a vector shift left operation. That value must be in the range:
17050/// 0 <= Value < ElementBits for a left shift; or
17051/// 0 <= Value <= ElementBits for a long left shift.
17052static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
17053 assert(VT.isVector() && "vector shift count is not a vector type");
17054 int64_t ElementBits = VT.getScalarSizeInBits();
17055 if (!getVShiftImm(Op, ElementBits, Cnt))
17056 return false;
17057 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
17058}
17059
17060/// isVShiftRImm - Check if this is a valid build_vector for the immediate
17061/// operand of a vector shift right operation. The value must be in the range:
17062 /// 1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
17063static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
17064 assert(VT.isVector() && "vector shift count is not a vector type");
17065 int64_t ElementBits = VT.getScalarSizeInBits();
17066 if (!getVShiftImm(Op, ElementBits, Cnt))
17067 return false;
17068 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
17069}
17070
17071SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
17072 SelectionDAG &DAG) const {
17073 EVT VT = Op.getValueType();
17074
17075 if (VT.getScalarType() == MVT::i1) {
17076 // Lower i1 truncate to `(x & 1) != 0`.
17077 SDLoc DL(Op);
17078 EVT OpVT = Op.getOperand(0).getValueType();
17079 SDValue Zero = DAG.getConstant(0, DL, OpVT);
17080 SDValue One = DAG.getConstant(1, DL, OpVT);
17081 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
17082 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
17083 }
17084
17085 if (!VT.isVector() || VT.isScalableVector())
17086 return SDValue();
17087
17088 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
17089 !Subtarget->isNeonAvailable()))
17090 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
17091
17092 // We can select these directly.
17093 if (VT.is64BitVector() && Op.getOperand(0).getValueType().is128BitVector())
17094 return Op;
17095
17096 return SDValue();
17097}
17098
17099 // Check if we can lower this SRL to a rounding shift instruction. ResVT is
17100 // possibly a truncated type; it tells how many bits of the value are to be
17101 // used.
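 // Illustrative example (added comment): the pattern recognized below is
 //   srl (add X, 1 << (Shift - 1)), Shift
 // i.e. an add of the rounding constant followed by the shift, which maps to a
 // single rounding shift right (URSHR) as long as the add cannot overflow the
 // bits that the (possibly truncating) result actually uses.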
17102 static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
17103 SelectionDAG &DAG,
17104 unsigned &ShiftValue,
17105 SDValue &RShOperand) {
17106 if (Shift->getOpcode() != ISD::SRL)
17107 return false;
17108
17109 EVT VT = Shift.getValueType();
17110 assert(VT.isScalableVT());
17111
17112 auto ShiftOp1 =
17113 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
17114 if (!ShiftOp1)
17115 return false;
17116
17117 ShiftValue = ShiftOp1->getZExtValue();
17118 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
17119 return false;
17120
17121 SDValue Add = Shift->getOperand(0);
17122 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
17123 return false;
17124
17125 assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
17126 "ResVT must be truncated or same type as the shift.");
17127 // Check if an overflow can lead to incorrect results.
17128 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
17129 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
17130 return false;
17131
17132 auto AddOp1 =
17133 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
17134 if (!AddOp1)
17135 return false;
17136 uint64_t AddValue = AddOp1->getZExtValue();
17137 if (AddValue != 1ULL << (ShiftValue - 1))
17138 return false;
17139
17140 RShOperand = Add->getOperand(0);
17141 return true;
17142}
17143
17144SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
17145 SelectionDAG &DAG) const {
17146 EVT VT = Op.getValueType();
17147 SDLoc DL(Op);
17148 int64_t Cnt;
17149
17150 if (!Op.getOperand(1).getValueType().isVector())
17151 return Op;
17152 unsigned EltSize = VT.getScalarSizeInBits();
17153
17154 switch (Op.getOpcode()) {
17155 case ISD::SHL:
17156 if (VT.isScalableVector() ||
17157 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
17158 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
17159
17160 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
17161 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
17162 DAG.getTargetConstant(Cnt, DL, MVT::i32));
17163 return DAG.getNode(
17164 ISD::INTRINSIC_WO_CHAIN, DL, VT,
17165 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
17166 Op.getOperand(0), Op.getOperand(1));
17167 case ISD::SRA:
17168 case ISD::SRL:
17169 if (VT.isScalableVector() &&
17170 (Subtarget->hasSVE2() ||
17171 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
17172 SDValue RShOperand;
17173 unsigned ShiftValue;
17174 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
17175 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
17176 getPredicateForVector(DAG, DL, VT), RShOperand,
17177 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
17178 }
17179
17180 if (VT.isScalableVector() ||
17181 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
17182 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
17183 : AArch64ISD::SRL_PRED;
17184 return LowerToPredicatedOp(Op, DAG, Opc);
17185 }
17186
17187 // Right shift immediate
17188 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
17189 unsigned Opc =
17190 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
17191 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
17192 DAG.getTargetConstant(Cnt, DL, MVT::i32),
17193 Op->getFlags());
17194 }
17195
17196 // Right shift register. Note, there is not a shift right register
17197 // instruction, but the shift left register instruction takes a signed
17198 // value, where negative numbers specify a right shift.
17199 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
17200 : Intrinsic::aarch64_neon_ushl;
17201 // negate the shift amount
17202 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17203 Op.getOperand(1));
17204 SDValue NegShiftLeft =
17205 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
17206 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
17207 NegShift);
17208 return NegShiftLeft;
17209 }
17210
17211 llvm_unreachable("unexpected shift opcode");
17212}
17213
17214SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
17215 SelectionDAG &DAG) const {
17216 if (Op.getValueType().isScalableVector())
17217 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
17218
17219 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
17220 !Subtarget->isNeonAvailable()))
17221 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
17222
17223 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17224 SDValue LHS = Op.getOperand(0);
17225 SDValue RHS = Op.getOperand(1);
17226 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
17227 SDLoc DL(Op);
17228
17229 if (LHS.getValueType().getVectorElementType().isInteger())
17230 return Op;
17231
17232 assert(((!Subtarget->hasFullFP16() &&
17233 LHS.getValueType().getVectorElementType() != MVT::f16) ||
17234 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
17235 LHS.getValueType().getVectorElementType() != MVT::f128) &&
17236 "Unexpected type!");
17237
17238 // Lower isnan(x) | isnan(never-nan) to x != x.
17239 // Lower !isnan(x) & !isnan(never-nan) to x == x.
17240 if (CC == ISD::SETUO || CC == ISD::SETO) {
17241 bool OneNaN = false;
17242 if (LHS == RHS) {
17243 OneNaN = true;
17244 } else if (DAG.isKnownNeverNaN(RHS)) {
17245 OneNaN = true;
17246 RHS = LHS;
17247 } else if (DAG.isKnownNeverNaN(LHS)) {
17248 OneNaN = true;
17249 LHS = RHS;
17250 }
17251 if (OneNaN) {
17252 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
17253 }
17254 }
17255
17256 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
17257 // clean. Some of them require two branches to implement.
17258 AArch64CC::CondCode CC1, CC2;
17259 bool ShouldInvert;
17260 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
17261
17262 bool NoNaNs = Op->getFlags().hasNoNaNs();
17263 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
17264 if (!Cmp.getNode())
17265 return SDValue();
17266
17267 if (CC2 != AArch64CC::AL) {
17268 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
17269 if (!Cmp2.getNode())
17270 return SDValue();
17271
17272 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
17273 }
17274
17275 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
17276
17277 if (ShouldInvert)
17278 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
17279
17280 return Cmp;
17281}
17282
17283static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
17284 SelectionDAG &DAG) {
17285 SDValue VecOp = ScalarOp.getOperand(0);
17286 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
17287 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
17288 DAG.getConstant(0, DL, MVT::i64));
17289}
17290
17291static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
17292 SDLoc DL, SelectionDAG &DAG) {
17293 unsigned ScalarOpcode;
17294 switch (Opcode) {
17295 case ISD::VECREDUCE_AND:
17296 ScalarOpcode = ISD::AND;
17297 break;
17298 case ISD::VECREDUCE_OR:
17299 ScalarOpcode = ISD::OR;
17300 break;
17301 case ISD::VECREDUCE_XOR:
17302 ScalarOpcode = ISD::XOR;
17303 break;
17304 default:
17305 llvm_unreachable("Expected bitwise vector reduction");
17306 return SDValue();
17307 }
17308
17309 EVT VecVT = Vec.getValueType();
17310 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
17311 "Expected power-of-2 length vector");
17312
17313 EVT ElemVT = VecVT.getVectorElementType();
17314
17315 SDValue Result;
17316 unsigned NumElems = VecVT.getVectorNumElements();
17317
17318 // Special case for boolean reductions
17319 if (ElemVT == MVT::i1) {
17320 // Split large vectors into smaller ones
17321 if (NumElems > 16) {
17322 SDValue Lo, Hi;
17323 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17324 EVT HalfVT = Lo.getValueType();
17325 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
17326 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
17327 }
17328
17329 // Results of setcc operations get widened to 128 bits if their input
17330 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
17331 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
17332 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
17333 // size leads to the best codegen, since e.g. setcc results might need to be
17334 // truncated otherwise.
17335 unsigned ExtendedWidth = 64;
17336 if (Vec.getOpcode() == ISD::SETCC &&
17337 Vec.getOperand(0).getValueSizeInBits() >= 128) {
17338 ExtendedWidth = 128;
17339 }
17340 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
17341
17342 // any_ext doesn't work with umin/umax, so only use it for uadd.
17343 unsigned ExtendOp =
17344 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
17345 SDValue Extended = DAG.getNode(
17346 ExtendOp, DL,
17347 VecVT.changeVectorElementType(*DAG.getContext(), ExtendedVT), Vec);
17348 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
17349 // in that case we bitcast the sign extended values from v2i64 to v4i32
17350 // before reduction for optimal code generation.
17351 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
17352 NumElems == 2 && ExtendedWidth == 128) {
17353 Extended = DAG.getBitcast(MVT::v4i32, Extended);
17354 ExtendedVT = MVT::i32;
17355 }
17356 switch (ScalarOpcode) {
17357 case ISD::AND:
17358 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
17359 break;
17360 case ISD::OR:
17361 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
17362 break;
17363 case ISD::XOR:
17364 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
17365 break;
17366 default:
17367 llvm_unreachable("Unexpected Opcode");
17368 }
17369
17370 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
17371 } else {
17372 // Iteratively split the vector in half and combine using the bitwise
17373 // operation until it fits in a 64 bit register.
17374 while (VecVT.getSizeInBits() > 64) {
17375 SDValue Lo, Hi;
17376 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
17377 VecVT = Lo.getValueType();
17378 NumElems = VecVT.getVectorNumElements();
17379 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
17380 }
17381
17382 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
17383
17384 // Do the remaining work on a scalar since it allows the code generator to
17385 // combine the shift and bitwise operation into one instruction and since
17386 // integer instructions can have higher throughput than vector instructions.
17387 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
17388
17389 // Iteratively combine the lower and upper halves of the scalar using the
17390 // bitwise operation, halving the relevant region of the scalar in each
17391 // iteration, until the relevant region is just one element of the original
17392 // vector.
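 // Illustrative example (added comment): a v8i8 XOR reduction bitcast to an i64
 // scalar is folded as
 //   x ^= x >> 32;  x ^= x >> 16;  x ^= x >> 8;
 // after which the low byte holds the reduced value.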
17393 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
17394 SDValue ShiftAmount =
17395 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
17396 SDValue Shifted =
17397 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
17398 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
17399 }
17400
17401 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
17402 }
17403
17404 return DAG.getAnyExtOrTrunc(Result, DL, VT);
17405}
17406
17407SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
17408 SelectionDAG &DAG) const {
17409 SDLoc DL(Op);
17410 SDValue Src = Op.getOperand(0);
17411 EVT SrcVT = Src.getValueType();
17412
17413 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
17414 // widening by inserting zeroes.
17415 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
17416 SrcVT == MVT::v2f16) {
17417 return DAG.getNode(ISD::FADD, DL, MVT::f16,
17418 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
17419 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
17420 }
17421
17422 // Try lowering the reduction to SVE. This will fail for NEON reductions where
17423 // SVE is not preferred.
17424 if (SDValue Result = LowerReductionToSVE(Op, DAG))
17425 return Result;
17426
17427 // Lower NEON reductions.
17428 switch (Op.getOpcode()) {
17429 case ISD::VECREDUCE_AND:
17430 case ISD::VECREDUCE_OR:
17431 case ISD::VECREDUCE_XOR:
17432 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
17433 Op.getValueType(), DL, DAG);
17434 case ISD::VECREDUCE_ADD:
17435 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
17436 case ISD::VECREDUCE_SMAX:
17437 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
17438 case ISD::VECREDUCE_SMIN:
17439 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
17440 case ISD::VECREDUCE_UMAX:
17441 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
17442 case ISD::VECREDUCE_UMIN:
17443 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
17444 default:
17445 llvm_unreachable("Unhandled reduction");
17446 }
17447}
17448
17449SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
17450 SelectionDAG &DAG) const {
17451 SDLoc DL(Op);
17452 SDValue Src = Op.getOperand(0);
17453 EVT SrcVT = Src.getValueType();
17454 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
17455
17456 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
17457 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
17458 SDValue Identity = DAG.getIdentityElement(BaseOpc, DL, SrcVT, Op->getFlags());
17459
17460 // Whilst we don't know the size of the vector we do know the maximum size, so
17461 // we can perform a tree reduction with an identity vector, which means that
17462 // once we arrive at the result the remaining stages (when the vector is
17463 // smaller than the maximum) have no effect.
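 // Illustrative note (added comment; the stage count assumes the architectural
 // maximum of 2048-bit SVE vectors): for an nxv4i32 input this runs
 // log2(16 * 4) = 6 deinterleave-and-combine stages; lanes brought in from the
 // identity vector contribute the identity element, so stages beyond the actual
 // runtime vector length leave the result unchanged.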
17464
17465 unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
17466 unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
17467
17468 for (unsigned I = 0; I < Stages; ++I) {
17469 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
17470 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
17471 }
17472
17473 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
17474}
17475
17476SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
17477 SelectionDAG &DAG) const {
17478 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17479 // No point replacing if we don't have the relevant instruction/libcall anyway
17480 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
17481 return SDValue();
17482
17483 // LSE has an atomic load-clear instruction, but not a load-and.
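 // Illustrative example (added comment): an atomic "load-and" with mask M is
 // rewritten as an atomic load-clear of ~M, e.g. an AND with 0x0f becomes
 // ATOMIC_LOAD_CLR with the inverted mask, which selects to LDCLR.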
17484 SDLoc DL(Op);
17485 MVT VT = Op.getSimpleValueType();
17486 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
17487 SDValue RHS = Op.getOperand(2);
17488 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
17489 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
17490 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
17491 Op.getOperand(0), Op.getOperand(1), RHS,
17492 AN->getMemOperand());
17493}
17494
17495SDValue
17496AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
17497 SelectionDAG &DAG) const {
17498
17499 SDLoc DL(Op);
17500 // Get the inputs.
17501 SDNode *Node = Op.getNode();
17502 SDValue Chain = Op.getOperand(0);
17503 SDValue Size = Op.getOperand(1);
17504 MaybeAlign Align =
17505 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17506 EVT VT = Node->getValueType(0);
17507
17508 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
17509 "no-stack-arg-probe")) {
17510 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17511 Chain = SP.getValue(1);
17512 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17513 if (Align)
17514 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17515 DAG.getSignedConstant(-Align->value(), DL, VT));
17516 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17517 SDValue Ops[2] = {SP, Chain};
17518 return DAG.getMergeValues(Ops, DL);
17519 }
17520
17521 RTLIB::LibcallImpl ChkStkImpl = getLibcallImpl(RTLIB::STACK_PROBE);
17522 if (ChkStkImpl == RTLIB::Unsupported)
17523 return SDValue();
17524
17525 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
17526
17527 EVT PtrVT = getPointerTy(DAG.getDataLayout());
17528 SDValue Callee = DAG.getTargetExternalSymbol(
17529 getLibcallImplName(ChkStkImpl).data(), PtrVT, 0);
17530
17531 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
17532 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
17533 if (Subtarget->hasCustomCallingConv())
17534 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
17535
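 // Note (added comment): the Windows stack probe helper expects the allocation
 // size in X15 measured in 16-byte units, so e.g. a 256-byte allocation passes
 // 16; the SRL here does that conversion and the SHL after the call converts
 // back to bytes.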
17536 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
17537 DAG.getConstant(4, DL, MVT::i64));
17538 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
17539 Chain =
17540 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
17541 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
17542 DAG.getRegisterMask(Mask), Chain.getValue(1));
17543 // To match the actual intent better, we should read the output from X15 here
17544 // again (instead of potentially spilling it to the stack), but rereading Size
17545 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
17546 // here.
17547
17548 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
17549 DAG.getConstant(4, DL, MVT::i64));
17550
17551 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17552 Chain = SP.getValue(1);
17553 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17554 if (Align)
17555 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17556 DAG.getSignedConstant(-Align->value(), DL, VT));
17557 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
17558
17559 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
17560
17561 SDValue Ops[2] = {SP, Chain};
17562 return DAG.getMergeValues(Ops, DL);
17563}
17564
17565SDValue
17566AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
17567 SelectionDAG &DAG) const {
17568 // Get the inputs.
17569 SDNode *Node = Op.getNode();
17570 SDValue Chain = Op.getOperand(0);
17571 SDValue Size = Op.getOperand(1);
17572
17573 MaybeAlign Align =
17574 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
17575 SDLoc DL(Op);
17576 EVT VT = Node->getValueType(0);
17577
17578 // Construct the new SP value in a GPR.
17579 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
17580 Chain = SP.getValue(1);
17581 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
17582 if (Align)
17583 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
17584 DAG.getSignedConstant(-Align->value(), DL, VT));
17585
17586 // Set the real SP to the new value with a probing loop.
17587 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
17588 SDValue Ops[2] = {SP, Chain};
17589 return DAG.getMergeValues(Ops, DL);
17590}
17591
17592SDValue
17593AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17594 SelectionDAG &DAG) const {
17595 MachineFunction &MF = DAG.getMachineFunction();
17596
17597 if (Subtarget->isTargetWindows())
17598 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
17599 else if (hasInlineStackProbe(MF))
17600 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
17601 else
17602 return SDValue();
17603}
17604
17605SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
17606 unsigned NewOp) const {
17607 if (Subtarget->hasSVE2())
17608 return LowerToPredicatedOp(Op, DAG, NewOp);
17609
17610 // Default to expand.
17611 return SDValue();
17612}
17613
17614SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
17615 SelectionDAG &DAG) const {
17616 EVT VT = Op.getValueType();
17617 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
17618
17619 SDLoc DL(Op);
17620 APInt MulImm = Op.getConstantOperandAPInt(0);
17621 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
17622 VT);
17623}
17624
17625/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
17626template <unsigned NumVecs>
17627static void
17628 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
17629 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
17630 Info.opc = ISD::INTRINSIC_VOID;
17631 // Retrieve EC from first vector argument.
17632 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
17633 const ElementCount EC = VT.getVectorElementCount();
17634 #ifndef NDEBUG
17635 // Check the assumption that all input vectors are the same type.
17636 for (unsigned I = 0; I < NumVecs; ++I)
17637 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17638 "Invalid type.");
17639#endif
17640 // memVT is `NumVecs * VT`.
17641 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
17642 EC * NumVecs);
17643 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17644 Info.offset = 0;
17645 Info.align.reset();
17646 Info.flags = MachineMemOperand::MOStore;
17647}
17648
17649/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17650/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17651/// specified in the intrinsic calls.
17652 void AArch64TargetLowering::getTgtMemIntrinsic(
17653 SmallVectorImpl<IntrinsicInfo> &Infos, const CallInst &I,
17654 MachineFunction &MF, unsigned Intrinsic) const {
17655 IntrinsicInfo Info;
17656 auto &DL = I.getDataLayout();
17657 switch (Intrinsic) {
17658 case Intrinsic::aarch64_sve_st2:
17659 setInfoSVEStN<2>(*this, DL, Info, I);
17660 Infos.push_back(Info);
17661 return;
17662 case Intrinsic::aarch64_sve_st3:
17663 setInfoSVEStN<3>(*this, DL, Info, I);
17664 Infos.push_back(Info);
17665 return;
17666 case Intrinsic::aarch64_sve_st4:
17667 setInfoSVEStN<4>(*this, DL, Info, I);
17668 Infos.push_back(Info);
17669 return;
17670 case Intrinsic::aarch64_neon_ld2:
17671 case Intrinsic::aarch64_neon_ld3:
17672 case Intrinsic::aarch64_neon_ld4:
17673 case Intrinsic::aarch64_neon_ld1x2:
17674 case Intrinsic::aarch64_neon_ld1x3:
17675 case Intrinsic::aarch64_neon_ld1x4: {
17676 Info.opc = ISD::INTRINSIC_W_CHAIN;
17677 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17678 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17679 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17680 Info.offset = 0;
17681 Info.align.reset();
17682 // volatile loads with NEON intrinsics not supported
17683 Info.flags = MachineMemOperand::MOLoad;
17684 Infos.push_back(Info);
17685 return;
17686 }
17687 case Intrinsic::aarch64_neon_ld2lane:
17688 case Intrinsic::aarch64_neon_ld3lane:
17689 case Intrinsic::aarch64_neon_ld4lane:
17690 case Intrinsic::aarch64_neon_ld2r:
17691 case Intrinsic::aarch64_neon_ld3r:
17692 case Intrinsic::aarch64_neon_ld4r: {
17693 Info.opc = ISD::INTRINSIC_W_CHAIN;
17694 // These ld<N> intrinsics return a struct of vectors that all have the same type.
17695 Type *RetTy = I.getType();
17696 auto *StructTy = cast<StructType>(RetTy);
17697 unsigned NumElts = StructTy->getNumElements();
17698 Type *VecTy = StructTy->getElementType(0);
17699 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17700 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17701 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17702 Info.offset = 0;
17703 Info.align.reset();
17704 // volatile loads with NEON intrinsics not supported
17705 Info.flags = MachineMemOperand::MOLoad;
17706 Infos.push_back(Info);
17707 return;
17708 }
17709 case Intrinsic::aarch64_neon_st2:
17710 case Intrinsic::aarch64_neon_st3:
17711 case Intrinsic::aarch64_neon_st4:
17712 case Intrinsic::aarch64_neon_st1x2:
17713 case Intrinsic::aarch64_neon_st1x3:
17714 case Intrinsic::aarch64_neon_st1x4: {
17715 Info.opc = ISD::INTRINSIC_VOID;
17716 unsigned NumElts = 0;
17717 for (const Value *Arg : I.args()) {
17718 Type *ArgTy = Arg->getType();
17719 if (!ArgTy->isVectorTy())
17720 break;
17721 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17722 }
17723 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17724 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17725 Info.offset = 0;
17726 Info.align.reset();
17727 // volatile stores with NEON intrinsics not supported
17728 Info.flags = MachineMemOperand::MOStore;
17729 Infos.push_back(Info);
17730 return;
17731 }
17732 case Intrinsic::aarch64_neon_st2lane:
17733 case Intrinsic::aarch64_neon_st3lane:
17734 case Intrinsic::aarch64_neon_st4lane: {
17735 Info.opc = ISD::INTRINSIC_VOID;
17736 unsigned NumElts = 0;
17737 // All of the vector arguments have the same type.
17738 Type *VecTy = I.getArgOperand(0)->getType();
17739 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17740
17741 for (const Value *Arg : I.args()) {
17742 Type *ArgTy = Arg->getType();
17743 if (!ArgTy->isVectorTy())
17744 break;
17745 NumElts += 1;
17746 }
17747
17748 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17749 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17750 Info.offset = 0;
17751 Info.align.reset();
17752 // volatile stores with NEON intrinsics not supported
17753 Info.flags = MachineMemOperand::MOStore;
17754 Infos.push_back(Info);
17755 return;
17756 }
17757 case Intrinsic::aarch64_ldaxr:
17758 case Intrinsic::aarch64_ldxr: {
17759 Type *ValTy = I.getParamElementType(0);
17760 Info.opc = ISD::INTRINSIC_W_CHAIN;
17761 Info.memVT = MVT::getVT(ValTy);
17762 Info.ptrVal = I.getArgOperand(0);
17763 Info.offset = 0;
17764 Info.align = DL.getABITypeAlign(ValTy);
17765 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17766 Infos.push_back(Info);
17767 return;
17768 }
17769 case Intrinsic::aarch64_stlxr:
17770 case Intrinsic::aarch64_stxr: {
17771 Type *ValTy = I.getParamElementType(1);
17772 Info.opc = ISD::INTRINSIC_W_CHAIN;
17773 Info.memVT = MVT::getVT(ValTy);
17774 Info.ptrVal = I.getArgOperand(1);
17775 Info.offset = 0;
17776 Info.align = DL.getABITypeAlign(ValTy);
17777 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17778 Infos.push_back(Info);
17779 return;
17780 }
17781 case Intrinsic::aarch64_ldaxp:
17782 case Intrinsic::aarch64_ldxp:
17783 Info.opc = ISD::INTRINSIC_W_CHAIN;
17784 Info.memVT = MVT::i128;
17785 Info.ptrVal = I.getArgOperand(0);
17786 Info.offset = 0;
17787 Info.align = Align(16);
17788 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17789 Infos.push_back(Info);
17790 return;
17791 case Intrinsic::aarch64_stlxp:
17792 case Intrinsic::aarch64_stxp:
17793 Info.opc = ISD::INTRINSIC_W_CHAIN;
17794 Info.memVT = MVT::i128;
17795 Info.ptrVal = I.getArgOperand(2);
17796 Info.offset = 0;
17797 Info.align = Align(16);
17798 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17799 Infos.push_back(Info);
17800 return;
17801 case Intrinsic::aarch64_sve_ldnt1: {
17802 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17803 Info.opc = ISD::INTRINSIC_W_CHAIN;
17804 Info.memVT = MVT::getVT(I.getType());
17805 Info.ptrVal = I.getArgOperand(1);
17806 Info.offset = 0;
17807 Info.align = DL.getABITypeAlign(ElTy);
17808 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
17809 Infos.push_back(Info);
17810 return;
17811 }
17812 case Intrinsic::aarch64_sve_stnt1: {
17813 Type *ElTy =
17814 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17815 Info.opc = ISD::INTRINSIC_W_CHAIN;
17816 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17817 Info.ptrVal = I.getArgOperand(2);
17818 Info.offset = 0;
17819 Info.align = DL.getABITypeAlign(ElTy);
17820 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
17821 Infos.push_back(Info);
17822 return;
17823 }
17824 case Intrinsic::aarch64_mops_memset_tag: {
17825 Value *Dst = I.getArgOperand(0);
17826 Value *Val = I.getArgOperand(1);
17827 Info.opc = ISD::INTRINSIC_W_CHAIN;
17828 Info.memVT = MVT::getVT(Val->getType());
17829 Info.ptrVal = Dst;
17830 Info.offset = 0;
17831 Info.align = I.getParamAlign(0).valueOrOne();
17832 Info.flags = MachineMemOperand::MOStore;
17833 // The size of the memory being operated on is unknown at this point
17834 Info.size = MemoryLocation::UnknownSize;
17835 Infos.push_back(Info);
17836 return;
17837 }
17838 default:
17839 break;
17840 }
17841}
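// Illustrative sketch (values assumed, not part of the original source): for
//   %s = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %p)
// the aarch64_neon_ld2 case above would record roughly
//   Info.opc    = ISD::INTRINSIC_W_CHAIN
//   Info.memVT  = v4i64   (the 256-bit result viewed as i64 elements)
//   Info.ptrVal = %p      (the last argument)
//   Info.flags  = MachineMemOperand::MOLoad
// so later passes see the full 32 bytes touched by the intrinsic.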
17842
17844 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17845 std::optional<unsigned> ByteOffset) const {
17846 // TODO: This may be worth removing. Check regression tests for diffs.
17847 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17848 ByteOffset))
17849 return false;
17850
17851 // If we're reducing the load width in order to avoid having to use an extra
17852 // instruction to do extension then it's probably a good idea.
17853 if (ExtTy != ISD::NON_EXTLOAD)
17854 return true;
17855 // Don't reduce load width if it would prevent us from combining a shift into
17856 // the offset.
17857 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17858 assert(Mem);
17859 const SDValue &Base = Mem->getBasePtr();
17860 if (Base.getOpcode() == ISD::ADD &&
17861 Base.getOperand(1).getOpcode() == ISD::SHL &&
17862 Base.getOperand(1).hasOneUse() &&
17863 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17864 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17865 if (Mem->getMemoryVT().isScalableVector())
17866 return false;
17867 // The shift can be combined if it matches the size of the value being
17868 // loaded (and so reducing the width would make it not match).
17869 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17870 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17871 if (ShiftAmount == Log2_32(LoadBytes))
17872 return false;
17873 }
17874 // We have no reason to disallow reducing the load width, so allow it.
17875 return true;
17876}
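// For illustration (an assumed DAG, not taken from a test): given
//   t0 = shl t1, 3
//   t2 = add tBase, t0
//   t3 = load i64, t2
// the shift amount 3 equals log2 of the 8 loaded bytes, so the shl folds into
// the scaled addressing mode (ldr xN, [xBase, xIdx, lsl #3]). Reducing the
// load to i32 would break that fold, so the hook above refuses the reduction.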
17877
17878// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17880 EVT VT = Extend.getValueType();
17881 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17882 SDValue Extract = Extend.getOperand(0);
17883 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17884 Extract = Extract.getOperand(0);
17885 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17886 EVT VecVT = Extract.getOperand(0).getValueType();
17887 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17888 return false;
17889 }
17890 }
17891 return true;
17892}
17893
17894// Truncations from a 64-bit GPR to a 32-bit GPR are free.
17896 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17897 return false;
17898 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17899 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17900 return NumBits1 > NumBits2;
17901}
17903 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17904 return false;
17905 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17906 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17907 return NumBits1 > NumBits2;
17908}
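// A rough example of why this is free (for illustration): for
//   %t = trunc i64 %x to i32
// no instruction is emitted; users of %t simply read wN, the low half of the
// xN register that already holds %x.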
17909
17910/// Check if it is profitable to hoist an instruction in then/else to if.
17911/// Not profitable if I and its user can form an FMA instruction
17912/// because we prefer FMSUB/FMADD.
17914 if (I->getOpcode() != Instruction::FMul)
17915 return true;
17916
17917 if (!I->hasOneUse())
17918 return true;
17919
17920 Instruction *User = I->user_back();
17921
17922 if (!(User->getOpcode() == Instruction::FSub ||
17923 User->getOpcode() == Instruction::FAdd))
17924 return true;
17925
17927 const Function *F = I->getFunction();
17928 const DataLayout &DL = F->getDataLayout();
17929 Type *Ty = User->getOperand(0)->getType();
17930
17931 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17933 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17934 I->getFastMathFlags().allowContract()));
17935}
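// Illustrative case (assumed IR): with contraction allowed,
//   %m = fmul double %a, %b
//   %r = fsub double %m, %c
// selects to a single FMSUB, so hoisting the fmul away from its only user
// would lose the fusion and the hook above returns false for that pattern.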
17936
17937// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17938// 64-bit GPR.
17940 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17941 return false;
17942 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17943 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17944 return NumBits1 == 32 && NumBits2 == 64;
17945}
17947 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17948 return false;
17949 unsigned NumBits1 = VT1.getSizeInBits();
17950 unsigned NumBits2 = VT2.getSizeInBits();
17951 return NumBits1 == 32 && NumBits2 == 64;
17952}
17953
17955 EVT VT1 = Val.getValueType();
17956 if (isZExtFree(VT1, VT2)) {
17957 return true;
17958 }
17959
17960 if (Val.getOpcode() != ISD::LOAD)
17961 return false;
17962
17963 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17964 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17965 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17966 VT1.getSizeInBits() <= 32);
17967}
17968
17969bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17970 if (isa<FPExtInst>(Ext))
17971 return false;
17972
17973 // Vector types are not free.
17974 if (Ext->getType()->isVectorTy())
17975 return false;
17976
17977 for (const Use &U : Ext->uses()) {
17978 // The extension is free if we can fold it with a left shift in an
17979 // addressing mode or an arithmetic operation: add, sub, and cmp.
17980
17981 // Is there a shift?
17982 const Instruction *Instr = cast<Instruction>(U.getUser());
17983
17984 // Is this a constant shift?
17985 switch (Instr->getOpcode()) {
17986 case Instruction::Shl:
17987 if (!isa<ConstantInt>(Instr->getOperand(1)))
17988 return false;
17989 break;
17990 case Instruction::GetElementPtr: {
17991 gep_type_iterator GTI = gep_type_begin(Instr);
17992 auto &DL = Ext->getDataLayout();
17993 std::advance(GTI, U.getOperandNo()-1);
17994 Type *IdxTy = GTI.getIndexedType();
17995 // This extension will end up with a shift because of the scaling factor.
17996 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17997 // Get the shift amount based on the scaling factor:
17998 // log2(sizeof(IdxTy)) - log2(8).
17999 if (IdxTy->isScalableTy())
18000 return false;
18001 uint64_t ShiftAmt =
18002 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
18003 3;
18004 // Is the constant foldable in the shift of the addressing mode?
18005 // I.e., shift amount is between 1 and 4 inclusive.
18006 if (ShiftAmt == 0 || ShiftAmt > 4)
18007 return false;
18008 break;
18009 }
18010 case Instruction::Trunc:
18011 // Check if this is a noop.
18012 // trunc(sext ty1 to ty2) to ty1.
18013 if (Instr->getType() == Ext->getOperand(0)->getType())
18014 continue;
18015 [[fallthrough]];
18016 default:
18017 return false;
18018 }
18019
18020 // At this point we can use the bfm family, so this extension is free
18021 // for that use.
18022 }
18023 return true;
18024}
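// A sketch of the GetElementPtr case (types assumed for illustration): for
//   %idx = zext i32 %i to i64
//   %p   = getelementptr i64, ptr %base, i64 %idx
// the i64 element implies a scaling factor of 8, i.e. a shift amount of 3,
// which lies in the [1, 4] range, so the access can use
//   ldr xN, [xBase, wI, uxtw #3]
// and the zext itself costs nothing.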
18025
18026static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
18027 unsigned NumElts, bool IsLittleEndian,
18028 SmallVectorImpl<int> &Mask) {
18029 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
18030 return false;
18031
18032 assert(DstWidth % SrcWidth == 0 &&
18033 "TBL lowering is not supported for a conversion instruction with this "
18034 "source and destination element type.");
18035
18036 unsigned Factor = DstWidth / SrcWidth;
18037 unsigned MaskLen = NumElts * Factor;
18038
18039 Mask.clear();
18040 Mask.resize(MaskLen, NumElts);
18041
18042 unsigned SrcIndex = 0;
18043 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
18044 Mask[I] = SrcIndex++;
18045
18046 return true;
18047}
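// Worked example (values assumed): SrcWidth = 8, DstWidth = 32, NumElts = 4
// and little-endian give Factor = 4, MaskLen = 16 and the mask
//   { 0, 4, 4, 4,  1, 4, 4, 4,  2, 4, 4, 4,  3, 4, 4, 4 }
// where index 4 (== NumElts) selects the known-zero lane 0 of the second
// shuffle operand, so every i8 source element ends up in the low byte of a
// zero-filled 32-bit lane.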
18048
18050 FixedVectorType *ZExtTy,
18051 FixedVectorType *DstTy,
18052 bool IsLittleEndian) {
18053 auto *SrcTy = cast<FixedVectorType>(Op->getType());
18054 unsigned NumElts = SrcTy->getNumElements();
18055 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18056 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18057
18058 SmallVector<int> Mask;
18059 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
18060 return nullptr;
18061
18062 auto *FirstEltZero = Builder.CreateInsertElement(
18063 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
18064 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
18065 Result = Builder.CreateBitCast(Result, DstTy);
18066 if (DstTy != ZExtTy)
18067 Result = Builder.CreateZExt(Result, ZExtTy);
18068 return Result;
18069}
18070
18072 FixedVectorType *DstTy,
18073 bool IsLittleEndian) {
18074 auto *SrcTy = cast<FixedVectorType>(Op->getType());
18075 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18076 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18077
18078 SmallVector<int> Mask;
18079 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
18080 !IsLittleEndian, Mask))
18081 return nullptr;
18082
18083 auto *FirstEltZero = Builder.CreateInsertElement(
18084 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
18085
18086 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
18087}
18088
18089static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
18090 IRBuilder<> Builder(TI);
18092 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
18093 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
18094 auto *DstTy = cast<FixedVectorType>(TI->getType());
18095 assert(SrcTy->getElementType()->isIntegerTy() &&
18096 "Non-integer type source vector element is not supported");
18097 assert(DstTy->getElementType()->isIntegerTy(8) &&
18098 "Unsupported destination vector element type");
18099 unsigned SrcElemTySz =
18100 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
18101 unsigned DstElemTySz =
18102 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
18103 assert((SrcElemTySz % DstElemTySz == 0) &&
18104 "Cannot lower truncate to tbl instructions for a source element size "
18105 "that is not divisible by the destination element size");
18106 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
18107 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
18108 "Unsupported source vector element type size");
18109 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
18110
18111 // Create a mask to choose every nth byte from the source vector table of
18112 // bytes to create the truncated destination vector, where 'n' is the truncate
18113 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose bytes
18114 // 0, 8, 16, ..., (Y-1)*8 in the little-endian case.
18116 for (int Itr = 0; Itr < 16; Itr++) {
18117 if (Itr < NumElements)
18118 MaskConst.push_back(Builder.getInt8(
18119 IsLittleEndian ? Itr * TruncFactor
18120 : Itr * TruncFactor + (TruncFactor - 1)));
18121 else
18122 MaskConst.push_back(Builder.getInt8(255));
18123 }
18124
18125 int MaxTblSz = 128 * 4;
18126 int MaxSrcSz = SrcElemTySz * NumElements;
18127 int ElemsPerTbl =
18128 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
18129 assert(ElemsPerTbl <= 16 &&
18130 "Maximum elements selected using TBL instruction cannot exceed 16!");
18131
18132 int ShuffleCount = 128 / SrcElemTySz;
18133 SmallVector<int> ShuffleLanes;
18134 for (int i = 0; i < ShuffleCount; ++i)
18135 ShuffleLanes.push_back(i);
18136
18137 // Create TBL's table of bytes in 1, 2, 3 or 4 FP/SIMD registers using shuffles
18138 // over the source vector. If TBL's maximum of 4 FP/SIMD registers is saturated,
18139 // call TBL and save the result in a vector of TBL results for combining later.
18141 while (ShuffleLanes.back() < NumElements) {
18142 Parts.push_back(Builder.CreateBitCast(
18143 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
18144
18145 if (Parts.size() == 4) {
18146 Parts.push_back(ConstantVector::get(MaskConst));
18147 Results.push_back(
18148 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
18149 Parts.clear();
18150 }
18151
18152 for (int i = 0; i < ShuffleCount; ++i)
18153 ShuffleLanes[i] += ShuffleCount;
18154 }
18155
18156 assert((Parts.empty() || Results.empty()) &&
18157 "Lowering trunc for vectors requiring different TBL instructions is "
18158 "not supported!");
18159 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
18160 // registers
18161 if (!Parts.empty()) {
18162 Intrinsic::ID TblID;
18163 switch (Parts.size()) {
18164 case 1:
18165 TblID = Intrinsic::aarch64_neon_tbl1;
18166 break;
18167 case 2:
18168 TblID = Intrinsic::aarch64_neon_tbl2;
18169 break;
18170 case 3:
18171 TblID = Intrinsic::aarch64_neon_tbl3;
18172 break;
18173 }
18174
18175 Parts.push_back(ConstantVector::get(MaskConst));
18176 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
18177 }
18178
18179 // Extract the destination vector from TBL result(s) after combining them
18180 // where applicable. Currently, at most two TBLs are supported.
18181 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
18182 "more than 2 tbl instructions!");
18183 Value *FinalResult = Results[0];
18184 if (Results.size() == 1) {
18185 if (ElemsPerTbl < 16) {
18186 SmallVector<int> FinalMask(ElemsPerTbl);
18187 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18188 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
18189 }
18190 } else {
18191 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
18192 if (ElemsPerTbl < 16) {
18193 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
18194 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
18195 } else {
18196 std::iota(FinalMask.begin(), FinalMask.end(), 0);
18197 }
18198 FinalResult =
18199 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
18200 }
18201
18202 TI->replaceAllUsesWith(FinalResult);
18203 TI->eraseFromParent();
18204}
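// Illustrative sketch (assumed types): for
//   %t = trunc <8 x i32> %x to <8 x i8>
// the source is split into two 128-bit shuffles that form the tbl2 table, and
// on little-endian the byte-select mask is
//   0, 4, 8, 12, 16, 20, 24, 28, 255, 255, 255, 255, 255, 255, 255, 255
// picking the low byte of each i32 lane; the final shuffle then keeps the
// first 8 bytes of the single tbl result.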
18205
18207 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
18208 // shuffle_vector instructions are serialized when targeting SVE,
18209 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
18210 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
18211 return false;
18212
18213 // Try to optimize conversions using tbl. This requires materializing constant
18214 // index vectors, which can increase code size and add loads. Skip the
18215 // transform unless the conversion is in a loop block guaranteed to execute
18216 // and we are not optimizing for size.
18217 Function *F = I->getParent()->getParent();
18218 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
18219 return false;
18220
18221 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
18222 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
18223 if (!SrcTy || !DstTy)
18224 return false;
18225
18226 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
18227 // lowered to tbl instructions to insert the original i8 elements
18228 // into i8x lanes. This is enabled for cases where it is beneficial.
18229 auto *ZExt = dyn_cast<ZExtInst>(I);
18230 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
18231 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
18232 if (DstWidth % 8 != 0)
18233 return false;
18234
18235 auto *TruncDstType =
18237 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
18238 // the remaining ZExt folded into the user, don't use tbl lowering.
18239 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
18240 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
18243 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
18244 return false;
18245
18246 DstTy = TruncDstType;
18247 }
18248
18249 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
18250 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
18251 // most one extra extend step is needed and using tbl is not profitable.
18252 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
18253 // udot instruction.
18254 if (SrcWidth * 4 <= DstWidth) {
18255 if (all_of(I->users(), [&](auto *U) {
18256 using namespace llvm::PatternMatch;
18257 auto *SingleUser = cast<Instruction>(&*U);
18258 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
18259 return true;
18260 if (match(SingleUser,
18261 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
18262 m_Value(), m_Specific(I))))
18263 return true;
18264 return false;
18265 }))
18266 return false;
18267 }
18268
18269 if (DstTy->getScalarSizeInBits() >= 64)
18270 return false;
18271
18272 IRBuilder<> Builder(ZExt);
18274 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
18275 DstTy, Subtarget->isLittleEndian());
18276 if (!Result)
18277 return false;
18278 ZExt->replaceAllUsesWith(Result);
18279 ZExt->eraseFromParent();
18280 return true;
18281 }
18282
18283 auto *UIToFP = dyn_cast<UIToFPInst>(I);
18284 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
18285 DstTy->getElementType()->isFloatTy()) ||
18286 (SrcTy->getElementType()->isIntegerTy(16) &&
18287 DstTy->getElementType()->isDoubleTy()))) {
18288 IRBuilder<> Builder(I);
18290 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
18291 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
18292 assert(ZExt && "Cannot fail for the i8 to float conversion");
18293 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
18294 I->replaceAllUsesWith(UI);
18295 I->eraseFromParent();
18296 return true;
18297 }
18298
18299 auto *SIToFP = dyn_cast<SIToFPInst>(I);
18300 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
18301 DstTy->getElementType()->isFloatTy()) {
18302 IRBuilder<> Builder(I);
18303 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
18305 Subtarget->isLittleEndian());
18306 assert(Shuffle && "Cannot fail for the i8 to float conversion");
18307 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
18308 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
18309 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
18310 I->replaceAllUsesWith(SI);
18311 I->eraseFromParent();
18312 return true;
18313 }
18314
18315 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
18316 // followed by a truncate lowered to using tbl.4.
18317 auto *FPToUI = dyn_cast<FPToUIInst>(I);
18318 if (FPToUI &&
18319 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
18320 SrcTy->getElementType()->isFloatTy() &&
18321 DstTy->getElementType()->isIntegerTy(8)) {
18322 IRBuilder<> Builder(I);
18323 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
18324 VectorType::getInteger(SrcTy));
18325 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
18326 I->replaceAllUsesWith(TruncI);
18327 I->eraseFromParent();
18328 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
18329 return true;
18330 }
18331
18332 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
18333 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
18334 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
18335 // registers
18336 auto *TI = dyn_cast<TruncInst>(I);
18337 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
18338 ((SrcTy->getElementType()->isIntegerTy(32) ||
18339 SrcTy->getElementType()->isIntegerTy(64)) &&
18340 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
18341 createTblForTrunc(TI, Subtarget->isLittleEndian());
18342 return true;
18343 }
18344
18345 return false;
18346}
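// Illustrative example of the zext path above (assumed IR): inside a loop
// header of a function not optimized for size,
//   %e = zext <8 x i8> %v to <8 x i32>
// is rewritten into a shuffle scattering the eight i8 elements into
// zero-filled i32 lanes plus a bitcast, which later selects to a tbl with a
// constant index vector rather than a chain of lengthening shifts.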
18347
18349 Align &RequiredAlignment) const {
18350 if (!LoadedType.isSimple() ||
18351 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
18352 return false;
18353 // Cyclone supports unaligned accesses.
18354 RequiredAlignment = Align(1);
18355 unsigned NumBits = LoadedType.getSizeInBits();
18356 return NumBits == 32 || NumBits == 64;
18357}
18358
18359/// A helper function for determining the number of interleaved accesses we
18360/// will generate when lowering accesses of the given type.
18362 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
18363 unsigned VecSize = 128;
18364 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18365 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
18366 if (UseScalable && isa<FixedVectorType>(VecTy))
18367 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18368 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
18369}
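// Worked example (sizes assumed): a fixed <16 x i32> with NEON has
// MinElts * ElSize = 512 bits and VecSize = 128, giving 4 interleaved
// accesses; if scalable lowering is used and the minimum SVE vector size is
// 256 bits, the same type needs only 2.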
18370
18373 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
18374 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
18375 return MOStridedAccess;
18377}
18378
18380 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
18381 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
18382 auto EC = VecTy->getElementCount();
18383 unsigned MinElts = EC.getKnownMinValue();
18384
18385 UseScalable = false;
18386
18387 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
18388 (!Subtarget->useSVEForFixedLengthVectors() ||
18390 return false;
18391
18392 if (isa<ScalableVectorType>(VecTy) &&
18393 !Subtarget->isSVEorStreamingSVEAvailable())
18394 return false;
18395
18396 // Ensure the number of vector elements is greater than 1.
18397 if (MinElts < 2)
18398 return false;
18399
18400 // Ensure the element type is legal.
18401 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
18402 return false;
18403
18404 if (EC.isScalable()) {
18405 UseScalable = true;
18406 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
18407 }
18408
18409 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
18410 if (Subtarget->useSVEForFixedLengthVectors()) {
18411 unsigned MinSVEVectorSize =
18412 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
18413 if (VecSize % MinSVEVectorSize == 0 ||
18414 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
18415 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
18416 UseScalable = true;
18417 return true;
18418 }
18419 }
18420
18421 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
18422 // 128 will be split into multiple interleaved accesses.
18423 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
18424}
18425
18427 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
18428 return ScalableVectorType::get(VTy->getElementType(), 2);
18429
18430 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
18431 return ScalableVectorType::get(VTy->getElementType(), 4);
18432
18433 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
18434 return ScalableVectorType::get(VTy->getElementType(), 8);
18435
18436 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
18437 return ScalableVectorType::get(VTy->getElementType(), 8);
18438
18439 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
18440 return ScalableVectorType::get(VTy->getElementType(), 2);
18441
18442 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
18443 return ScalableVectorType::get(VTy->getElementType(), 4);
18444
18445 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
18446 return ScalableVectorType::get(VTy->getElementType(), 8);
18447
18448 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
18449 return ScalableVectorType::get(VTy->getElementType(), 16);
18450
18451 llvm_unreachable("Cannot handle input vector type");
18452}
18453
18454static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
18455 bool Scalable, Type *LDVTy,
18456 Type *PtrTy) {
18457 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18458 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
18459 Intrinsic::aarch64_sve_ld3_sret,
18460 Intrinsic::aarch64_sve_ld4_sret};
18461 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
18462 Intrinsic::aarch64_neon_ld3,
18463 Intrinsic::aarch64_neon_ld4};
18464 if (Scalable)
18465 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2],
18466 {LDVTy, PtrTy});
18467
18468 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
18469 {LDVTy, PtrTy});
18470}
18471
18472static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
18473 bool Scalable, Type *STVTy,
18474 Type *PtrTy) {
18475 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
18476 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
18477 Intrinsic::aarch64_sve_st3,
18478 Intrinsic::aarch64_sve_st4};
18479 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
18480 Intrinsic::aarch64_neon_st3,
18481 Intrinsic::aarch64_neon_st4};
18482 if (Scalable)
18483 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2],
18484 {STVTy, PtrTy});
18485
18486 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
18487 {STVTy, PtrTy});
18488}
18489
18490/// Lower an interleaved load into a ldN intrinsic.
18491///
18492/// E.g. Lower an interleaved load (Factor = 2):
18493/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
18494/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
18495/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
18496///
18497/// Into:
18498/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
18499/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
18500/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
18502 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
18503 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
18504 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18505 "Invalid interleave factor");
18506 assert(!Shuffles.empty() && "Empty shufflevector input");
18507 assert(Shuffles.size() == Indices.size() &&
18508 "Unmatched number of shufflevectors and indices");
18509
18510 auto *LI = dyn_cast<LoadInst>(Load);
18511 if (!LI)
18512 return false;
18513 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
18514
18515 const DataLayout &DL = LI->getDataLayout();
18516
18517 VectorType *VTy = Shuffles[0]->getType();
18518
18519 // Skip if we do not have NEON and skip illegal vector types. We can
18520 // "legalize" wide vector types into multiple interleaved accesses as long as
18521 // the vector types are divisible by 128.
18522 bool UseScalable;
18523 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18524 return false;
18525
18526 // Check if the interleave is a zext(shuffle), that can be better optimized
18527 // into shift / and masks. For the moment we do this just for uitofp (not
18528 // zext) to avoid issues with widening instructions.
18529 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
18530 using namespace llvm::PatternMatch;
18531 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
18532 SI->getType()->getScalarSizeInBits() * 4 ==
18533 SI->user_back()->getType()->getScalarSizeInBits();
18534 }))
18535 return false;
18536
18537 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18538
18539 auto *FVTy = cast<FixedVectorType>(VTy);
18540
18541 // A pointer vector can not be the return type of the ldN intrinsics. Need to
18542 // load integer vectors first and then convert to pointer vectors.
18543 Type *EltTy = FVTy->getElementType();
18544 if (EltTy->isPointerTy())
18545 FVTy =
18546 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
18547
18548 // If we're going to generate more than one load, reset the sub-vector type
18549 // to something legal.
18550 FVTy = FixedVectorType::get(FVTy->getElementType(),
18551 FVTy->getNumElements() / NumLoads);
18552
18553 auto *LDVTy =
18554 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
18555
18556 IRBuilder<> Builder(LI);
18557
18558 // The base address of the load.
18559 Value *BaseAddr = LI->getPointerOperand();
18560
18561 Type *PtrTy = LI->getPointerOperandType();
18562 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
18563 LDVTy->getElementCount());
18564
18565 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18566 UseScalable, LDVTy, PtrTy);
18567
18568 // Holds sub-vectors extracted from the load intrinsic return values. The
18569 // sub-vectors are associated with the shufflevector instructions they will
18570 // replace.
18571 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
18572
18573 Value *PTrue = nullptr;
18574 if (UseScalable) {
18575 std::optional<unsigned> PgPattern =
18576 getSVEPredPatternFromNumElements(FVTy->getNumElements());
18577 if (Subtarget->getMinSVEVectorSizeInBits() ==
18578 Subtarget->getMaxSVEVectorSizeInBits() &&
18579 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
18580 PgPattern = AArch64SVEPredPattern::all;
18581
18582 auto *PTruePat =
18583 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
18584 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18585 {PTruePat});
18586 }
18587
18588 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
18589
18590 // If we're generating more than one load, compute the base address of
18591 // subsequent loads as an offset from the previous.
18592 if (LoadCount > 0)
18593 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
18594 FVTy->getNumElements() * Factor);
18595
18596 CallInst *LdN;
18597 if (UseScalable)
18598 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
18599 else
18600 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18601
18602 // Extract and store the sub-vectors returned by the load intrinsic.
18603 for (unsigned i = 0; i < Shuffles.size(); i++) {
18604 ShuffleVectorInst *SVI = Shuffles[i];
18605 unsigned Index = Indices[i];
18606
18607 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
18608
18609 if (UseScalable)
18610 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
18611
18612 // Convert the integer vector to pointer vector if the element is pointer.
18613 if (EltTy->isPointerTy())
18614 SubVec = Builder.CreateIntToPtr(
18616 FVTy->getNumElements()));
18617
18618 SubVecs[SVI].push_back(SubVec);
18619 }
18620 }
18621
18622 // Replace uses of the shufflevector instructions with the sub-vectors
18623 // returned by the load intrinsic. If a shufflevector instruction is
18624 // associated with more than one sub-vector, those sub-vectors will be
18625 // concatenated into a single wide vector.
18626 for (ShuffleVectorInst *SVI : Shuffles) {
18627 auto &SubVec = SubVecs[SVI];
18628 auto *WideVec =
18629 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
18630 SVI->replaceAllUsesWith(WideVec);
18631 }
18632
18633 return true;
18634}
18635
18636template <typename Iter>
18637bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
18638 int MaxLookupDist = 20;
18639 unsigned IdxWidth = DL.getIndexSizeInBits(0);
18640 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
18641 const Value *PtrA1 =
18643
18644 while (++It != End) {
18645 if (It->isDebugOrPseudoInst())
18646 continue;
18647 if (MaxLookupDist-- == 0)
18648 break;
18649 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18650 const Value *PtrB1 =
18651 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18652 DL, OffsetB);
18653 if (PtrA1 == PtrB1 &&
18654 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18655 .abs() == 16)
18656 return true;
18657 }
18658 }
18659
18660 return false;
18661}
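// Rough illustration (assumed IR): when considering a 64-bit st2 whose base
// is %p, a neighbouring
//   store <2 x i32> %b, ptr %q   ; %q is %p plus 16 bytes
// within the 20-instruction window makes this helper return true, and
// lowerInterleavedStore then declines to form the st2 so the plain stores can
// be merged into a store pair instead.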
18662
18663/// Lower an interleaved store into a stN intrinsic.
18664///
18665/// E.g. Lower an interleaved store (Factor = 3):
18666/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18667/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18668/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18669///
18670/// Into:
18671/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18672/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18673/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18674/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18675///
18676/// Note that the new shufflevectors will be removed and we'll only generate one
18677/// st3 instruction in CodeGen.
18678///
18679/// Example for a more general valid mask (Factor 3). Lower:
18680/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18681/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18682/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18683///
18684/// Into:
18685/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18686/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18687/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18688/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18690 Value *LaneMask,
18691 ShuffleVectorInst *SVI,
18692 unsigned Factor,
18693 const APInt &GapMask) const {
18694
18695 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18696 "Invalid interleave factor");
18697 auto *SI = dyn_cast<StoreInst>(Store);
18698 if (!SI)
18699 return false;
18700 assert(!LaneMask && GapMask.popcount() == Factor &&
18701 "Unexpected mask on store");
18702
18703 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18704 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18705
18706 unsigned LaneLen = VecTy->getNumElements() / Factor;
18707 Type *EltTy = VecTy->getElementType();
18708 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18709
18710 const DataLayout &DL = SI->getDataLayout();
18711 bool UseScalable;
18712
18713 // Skip if we do not have NEON and skip illegal vector types. We can
18714 // "legalize" wide vector types into multiple interleaved accesses as long as
18715 // the vector types are divisible by 128.
18716 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18717 return false;
18718
18719 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18720
18721 Value *Op0 = SVI->getOperand(0);
18722 Value *Op1 = SVI->getOperand(1);
18723 IRBuilder<> Builder(SI);
18724
18725 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18726 // vectors to integer vectors.
18727 if (EltTy->isPointerTy()) {
18728 Type *IntTy = DL.getIntPtrType(EltTy);
18729 unsigned NumOpElts =
18730 cast<FixedVectorType>(Op0->getType())->getNumElements();
18731
18732 // Convert to the corresponding integer vector.
18733 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18734 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18735 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18736
18737 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18738 }
18739
18740 // If we're going to generate more than one store, reset the lane length
18741 // and sub-vector type to something legal.
18742 LaneLen /= NumStores;
18743 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18744
18745 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18746 : SubVecTy;
18747
18748 // The base address of the store.
18749 Value *BaseAddr = SI->getPointerOperand();
18750
18751 auto Mask = SVI->getShuffleMask();
18752
18753 // Sanity check whether all the indices are NOT in range:
18754 // if the mask is all `poison`, `Mask` is a vector of -1s, and using such
18755 // an index below would read out of bounds, so bail out.
18756 if (llvm::all_of(Mask, equal_to(PoisonMaskElem))) {
18757 return false;
18758 }
18759 // A 64-bit st2 which does not start at element 0 will involve adding extra
18760 // ext elements, making the st2 unprofitable, and if there is a nearby store
18761 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18762 // zip;stp pair which has higher throughput.
18763 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18764 (Mask[0] != 0 ||
18765 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18766 DL) ||
18767 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18768 BaseAddr, DL)))
18769 return false;
18770
18771 // Conditionally skip nontemporal stores to prioritize emitting non-temporal
18772 // store instructions, even though AArch64 doesn't have non-temporal
18773 // interleaved stores.
18774 //
18775 // The check is conservative:
18776 //
18777 // - Only when not optimizing for size, as STNP lowering can increase size.
18778 // - Don't skip if the interleaving factor is greater than 2, as the shuffling
18779 // overhead becomes higher.
18780 // - Don't skip for store value types that are not directly legal.
18781 Function *F = SI->getFunction();
18782 if (Factor == 2 && SI->hasMetadata(LLVMContext::MD_nontemporal) &&
18783 !F->hasOptSize() && !F->hasMinSize() &&
18784 isLegalNTStore(SI->getValueOperand()->getType(), SI->getAlign(), DL))
18785 return false;
18786
18787 Type *PtrTy = SI->getPointerOperandType();
18788 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18789 STVTy->getElementCount());
18790
18791 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18792 UseScalable, STVTy, PtrTy);
18793
18794 Value *PTrue = nullptr;
18795 if (UseScalable) {
18796 std::optional<unsigned> PgPattern =
18797 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18798 if (Subtarget->getMinSVEVectorSizeInBits() ==
18799 Subtarget->getMaxSVEVectorSizeInBits() &&
18800 Subtarget->getMinSVEVectorSizeInBits() ==
18801 DL.getTypeSizeInBits(SubVecTy))
18802 PgPattern = AArch64SVEPredPattern::all;
18803
18804 auto *PTruePat =
18805 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18806 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18807 {PTruePat});
18808 }
18809
18810 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18811
18813
18814 // Split the shufflevector operands into sub vectors for the new stN call.
18815 for (unsigned i = 0; i < Factor; i++) {
18816 Value *Shuffle;
18817 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18818 if (Mask[IdxI] >= 0) {
18819 Shuffle = Builder.CreateShuffleVector(
18820 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18821 } else {
18822 unsigned StartMask = 0;
18823 for (unsigned j = 1; j < LaneLen; j++) {
18824 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18825 if (Mask[IdxJ] >= 0) {
18826 StartMask = Mask[IdxJ] - j;
18827 break;
18828 }
18829 }
18830 // Note: Filling undef gaps with random elements is ok, since
18831 // those elements were being written anyway (with undefs).
18832 // In the case of all undefs we're defaulting to using elems from 0
18833 // Note: StartMask cannot be negative, it's checked in
18834 // isReInterleaveMask
18835 Shuffle = Builder.CreateShuffleVector(
18836 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18837 }
18838
18839 if (UseScalable)
18840 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18841 Shuffle, uint64_t(0));
18842
18843 Ops.push_back(Shuffle);
18844 }
18845
18846 if (UseScalable)
18847 Ops.push_back(PTrue);
18848
18849 // If we're generating more than one store, compute the base address of
18850 // subsequent stores as an offset from the previous one.
18851 if (StoreCount > 0)
18852 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18853 BaseAddr, LaneLen * Factor);
18854
18855 Ops.push_back(BaseAddr);
18856 Builder.CreateCall(StNFunc, Ops);
18857 }
18858 return true;
18859}
18860
18862 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18863 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18864 if (Factor != 2 && Factor != 3 && Factor != 4) {
18865 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18866 return false;
18867 }
18868 auto *LI = dyn_cast<LoadInst>(Load);
18869 if (!LI)
18870 return false;
18871 assert(!Mask && "Unexpected mask on a load\n");
18872
18874
18875 const DataLayout &DL = LI->getModule()->getDataLayout();
18876 bool UseScalable;
18877 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18878 return false;
18879
18880 // TODO: Add support for using SVE instructions with fixed types later, using
18881 // the code from lowerInterleavedLoad to obtain the correct container type.
18882 if (UseScalable && !VTy->isScalableTy())
18883 return false;
18884
18885 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18886 VectorType *LdTy =
18888 VTy->getElementCount().divideCoefficientBy(NumLoads));
18889
18890 Type *PtrTy = LI->getPointerOperandType();
18891 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18892 UseScalable, LdTy, PtrTy);
18893
18894 IRBuilder<> Builder(LI);
18895 Value *Pred = nullptr;
18896 if (UseScalable)
18897 Pred =
18898 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18899
18900 Value *BaseAddr = LI->getPointerOperand();
18901 Value *Result = nullptr;
18902 if (NumLoads > 1) {
18903 // Create multiple legal small ldN.
18904 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18905 for (unsigned I = 0; I < NumLoads; ++I) {
18906 Value *Offset = Builder.getInt64(I * Factor);
18907
18908 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18909 Value *LdN = nullptr;
18910 if (UseScalable)
18911 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18912 else
18913 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18914 Value *Idx =
18915 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18916 for (unsigned J = 0; J < Factor; ++J) {
18917 ExtractedLdValues[J] = Builder.CreateInsertVector(
18918 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18919 }
18920 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18921 }
18922
18923 // Merge the values from different factors.
18924 Result = PoisonValue::get(DI->getType());
18925 for (unsigned J = 0; J < Factor; ++J)
18926 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18927 } else {
18928 if (UseScalable)
18929 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18930 else
18931 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18932 }
18933
18934 // Replace the output of the deinterleave intrinsic with the output of the ldN call.
18935 DI->replaceAllUsesWith(Result);
18936 return true;
18937}
18938
18940 Instruction *Store, Value *Mask,
18941 ArrayRef<Value *> InterleavedValues) const {
18942 unsigned Factor = InterleavedValues.size();
18943 if (Factor != 2 && Factor != 3 && Factor != 4) {
18944 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18945 return false;
18946 }
18947 auto *SI = dyn_cast<StoreInst>(Store);
18948 if (!SI)
18949 return false;
18950 assert(!Mask && "Unexpected mask on plain store");
18951
18952 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18953 const DataLayout &DL = SI->getModule()->getDataLayout();
18954
18955 bool UseScalable;
18956 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18957 return false;
18958
18959 // TODO: Add support for using SVE instructions with fixed types later, using
18960 // the code from lowerInterleavedStore to obtain the correct container type.
18961 if (UseScalable && !VTy->isScalableTy())
18962 return false;
18963
18964 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18965
18966 VectorType *StTy =
18968 VTy->getElementCount().divideCoefficientBy(NumStores));
18969
18970 Type *PtrTy = SI->getPointerOperandType();
18971 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18972 UseScalable, StTy, PtrTy);
18973
18974 IRBuilder<> Builder(SI);
18975
18976 Value *BaseAddr = SI->getPointerOperand();
18977 Value *Pred = nullptr;
18978
18979 if (UseScalable)
18980 Pred =
18981 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18982
18983 auto ExtractedValues = InterleavedValues;
18984 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18985 if (UseScalable)
18986 StoreOperands.push_back(Pred);
18987 StoreOperands.push_back(BaseAddr);
18988 for (unsigned I = 0; I < NumStores; ++I) {
18989 Value *Address = BaseAddr;
18990 if (NumStores > 1) {
18991 Value *Offset = Builder.getInt64(I * Factor);
18992 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18993 Value *Idx =
18994 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18995 for (unsigned J = 0; J < Factor; J++) {
18996 StoreOperands[J] =
18997 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18998 }
18999 // update the address
19000 StoreOperands[StoreOperands.size() - 1] = Address;
19001 }
19002 Builder.CreateCall(StNFunc, StoreOperands);
19003 }
19004 return true;
19005}
19006
19008 LLVMContext &Context, const MemOp &Op,
19009 const AttributeList &FuncAttributes) const {
19010 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
19011 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
19012 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
19013 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
19014 // taken one instruction to materialize the v2i64 zero and one store (with
19015 // restrictive addressing mode). Just do i64 stores.
19016 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
19017 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
19018 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
19019 if (Op.isAligned(AlignCheck))
19020 return true;
19021 unsigned Fast;
19022 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
19024 Fast;
19025 };
19026
19027 // For non-zero memset, use NEON even for smaller sizes as dup + scalar store
19028 // is efficient
19029 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
19030 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
19031 return MVT::v16i8;
19032 if (CanUseFP && !IsSmallZeroMemset &&
19033 AlignmentIsAcceptable(MVT::f128, Align(16)))
19034 return MVT::f128;
19035 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
19036 return MVT::i64;
19037 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
19038 return MVT::i32;
19039 return MVT::Other;
19040}
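// Illustrative queries (sizes assumed): a 64-byte memset of a non-zero value
// returns MVT::v16i8, so the pattern is materialized once with DUP and
// written with q-register stores; a 16-byte zero memset counts as a small
// zero memset, skips the vector paths and falls through to MVT::i64 stores.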
19041
19043 LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
19044 const MemOp &Op, unsigned DstAS, unsigned SrcAS,
19045 const AttributeList &FuncAttributes, EVT *LargestVT) const {
19046 // For non-zero memset with v16i8, don't downgrade. We can extract smaller
19047 // stores (i64, i32, i16, i8) from the v16i8 splat efficiently.
19048 EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
19049 if (VT == MVT::v16i8 && Op.isMemset() && !Op.isZeroMemset() &&
19050 Op.size() < 16) {
19051 unsigned Size = Op.size();
19052 unsigned RemainingSize = Size;
19053
19054 // Break down the size into stores that we can extract from v16i8.
19055 // We support: i64 (8 bytes), i32 (4 bytes), i16 (2 bytes), i8 (1 byte)
19056 // Use the largest possible stores first to minimize the number of
19057 // operations.
19058 while (RemainingSize > 0) {
19059 EVT TargetVT;
19060
19061 // Try largest stores first
19062 if (RemainingSize >= 8) {
19063 TargetVT = MVT::i64;
19064 RemainingSize -= 8;
19065 } else if (RemainingSize >= 4) {
19066 TargetVT = MVT::i32;
19067 RemainingSize -= 4;
19068 } else if (RemainingSize >= 2) {
19069 TargetVT = MVT::i16;
19070 RemainingSize -= 2;
19071 } else if (RemainingSize >= 1) {
19072 TargetVT = MVT::i8;
19073 RemainingSize -= 1;
19074 } else {
19075 // Should not reach here, but fall back to default implementation
19076 break;
19077 }
19078
19079 MemOps.push_back(TargetVT);
19080 }
19081
19082 // If we successfully decomposed the entire size, set LargestVT to v16i8
19083 // to ensure getMemsetValue generates the efficient vector splat (DUP).
19084 // We don't add v16i8 to MemOps since we only need it for value generation.
19085 if (RemainingSize == 0 && !MemOps.empty()) {
19086 if (LargestVT)
19087 *LargestVT = VT; // v16i8 for vector splat generation
19088 return true;
19089 }
19090
19091 // Clear MemOps if we didn't successfully handle everything
19092 MemOps.clear();
19093 }
19094 // Otherwise, use the default implementation
19096 Context, MemOps, Limit, Op, DstAS, SrcAS, FuncAttributes, LargestVT);
19097}
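// Worked example (size assumed): an 11-byte non-zero memset is decomposed
// above into { i64, i16, i8 } (8 + 2 + 1 bytes), RemainingSize reaches 0, and
// *LargestVT is reported as v16i8 so the store values can be extracted from a
// single DUP of the splatted byte.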
19098
19100 const MemOp &Op, const AttributeList &FuncAttributes) const {
19101 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
19102 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
19103 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
19104 // For zero memset, only use AdvSIMD for 32-byte and above. It would have
19105 // taken one instruction to materialize the v2i64 zero and one store (with
19106 // restrictive addressing mode). Just do i64 stores.
19107 // For non-zero memset, use NEON even for smaller sizes as dup is efficient.
19108 bool IsSmallZeroMemset = Op.isMemset() && Op.size() < 32 && Op.isZeroMemset();
19109 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
19110 if (Op.isAligned(AlignCheck))
19111 return true;
19112 unsigned Fast;
19113 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
19115 Fast;
19116 };
19117
19118 // For non-zero memset, use NEON for all sizes where it's beneficial.
19119 // NEON dup + scalar store works for any alignment and is efficient.
19120 if (CanUseNEON && Op.isMemset() && !IsSmallZeroMemset &&
19121 AlignmentIsAcceptable(MVT::v16i8, Align(1)))
19122 return LLT::fixed_vector(2, LLT::integer(64));
19123 if (CanUseFP && !IsSmallZeroMemset &&
19124 AlignmentIsAcceptable(MVT::f128, Align(16)))
19125 return LLT::floatIEEE(128);
19126 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
19127 return LLT::integer(64);
19128 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
19129 return LLT::integer(32);
19130 return LLT();
19131}
19132
19133// 12-bit optionally shifted immediates are legal for adds.
19135 if (Immed == std::numeric_limits<int64_t>::min()) {
19136 return false;
19137 }
19138 // Same encoding for add/sub, just flip the sign.
19139 return isLegalArithImmed((uint64_t)std::abs(Immed));
19140}
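// Examples (values assumed): 0xfff and 0x555000 are legal (a 12-bit value,
// optionally shifted left by 12); 0x1001 is not, since it needs bits from
// both halves; negative values reuse the same encoding via the matching sub.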
19141
19143 // We will only emit addvl/inc* instructions for SVE2
19144 if (!Subtarget->hasSVE2())
19145 return false;
19146
19147 // addvl's immediates are in terms of the number of bytes in a register.
19148 // Since there are 16 in the base supported size (128bits), we need to
19149 // divide the immediate by that much to give us a useful immediate to
19150 // multiply by vscale. We can't have a remainder as a result of this.
19151 if (Imm % 16 == 0)
19152 return isInt<6>(Imm / 16);
19153
19154 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
19155 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
19156 // of addvl as a result, so only take h|w|d into account.
19157 // Dec[h|w|d] will cover subtractions.
19158 // Immediates are in the range [1,16], so we can't do a 2's complement check.
19159 // FIXME: Can we make use of other patterns to cover other immediates?
19160
19161 // inch|dech
19162 if (Imm % 8 == 0)
19163 return std::abs(Imm / 8) <= 16;
19164 // incw|decw
19165 if (Imm % 4 == 0)
19166 return std::abs(Imm / 4) <= 16;
19167 // incd|decd
19168 if (Imm % 2 == 0)
19169 return std::abs(Imm / 2) <= 16;
19170
19171 return false;
19172}
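// Illustrative values (assumed, with SVE2 available): Imm == 32 is 2 * 16
// bytes and maps to addvl #2; Imm == 8 maps to inch; Imm == -4 maps to decw;
// Imm == 7 matches none of the divisibility checks and is rejected.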
19173
19174// Return false to prevent folding
19175// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
19176// if the folding leads to worse code.
19178 SDValue AddNode, SDValue ConstNode) const {
19179 // Let the DAGCombiner decide for vector types and large types.
19180 const EVT VT = AddNode.getValueType();
19181 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
19182 return true;
19183
19184 // It is worse if c1 is a legal add immediate while c1*c2 is not,
19185 // and has to be composed of at least two instructions.
19186 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19187 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
19188 const int64_t C1 = C1Node->getSExtValue();
19189 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
19191 return true;
19193 // Adapt to the width of a register.
19194 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
19195 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
19196 if (Insn.size() > 1)
19197 return false;
19198
19199 // Default to true and let the DAGCombiner decide.
19200 return true;
19201}
19202
19203// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
19204// immediates is the same as for an add or a sub.
19206 return isLegalAddImmediate(Immed);
19207}
19208
19209/// isLegalAddressingMode - Return true if the addressing mode represented
19210/// by AM is legal for this target, for a load/store of the specified type.
19212 const AddrMode &AMode, Type *Ty,
19213 unsigned AS, Instruction *I) const {
19214 // AArch64 has five basic addressing modes:
19215 // reg
19216 // reg + 9-bit signed offset
19217 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
19218 // reg1 + reg2
19219 // reg + SIZE_IN_BYTES * reg
19220
19221 // No global is ever allowed as a base.
19222 if (AMode.BaseGV)
19223 return false;
19224
19225 // No reg+reg+imm addressing.
19226 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
19227 return false;
19228
19229 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
19230 // `2*ScaledReg` into `BaseReg + ScaledReg`
19231 AddrMode AM = AMode;
19232 if (AM.Scale && !AM.HasBaseReg) {
19233 if (AM.Scale == 1) {
19234 AM.HasBaseReg = true;
19235 AM.Scale = 0;
19236 } else if (AM.Scale == 2) {
19237 AM.HasBaseReg = true;
19238 AM.Scale = 1;
19239 } else {
19240 return false;
19241 }
19242 }
19243
19244 // A base register is required in all addressing modes.
19245 if (!AM.HasBaseReg)
19246 return false;
19247
19248 if (Ty->isScalableTy()) {
19249 if (isa<ScalableVectorType>(Ty)) {
19250 // See if we have a foldable vscale-based offset, for vector types which
19251 // are either legal or smaller than the minimum; more work will be
19252 // required if we need to consider addressing for types which need
19253 // legalization by splitting.
19254 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
19255 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
19256 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
19257 isPowerOf2_64(VecNumBytes))
19258 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
19259
19260 uint64_t VecElemNumBytes =
19261 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
19262 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
19263 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
19264 }
19265
19266 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
19267 }
19268
19269 // No scalable offsets allowed for non-scalable types.
19270 if (AM.ScalableOffset)
19271 return false;
19272
19273 // check reg + imm case:
19274 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
19275 uint64_t NumBytes = 0;
19276 if (Ty->isSized()) {
19277 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
19278 NumBytes = NumBits / 8;
19279 if (!isPowerOf2_64(NumBits))
19280 NumBytes = 0;
19281 }
19282
19283 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
19284 AM.Scale);
19285}
19286
19287 // Check whether the two offsets belong to the same imm24 range and their high
19288 // 12 bits are the same; if so, the shared high part can be folded into an ADD.
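// For example, MinOffset == 0x1234 and MaxOffset == 0x1678 share the high part
// 0x1000, which is itself a legal ADD immediate, so 0x1000 can be folded into
// the base once and the remaining offsets fit in imm12.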
19289int64_t
19291 int64_t MaxOffset) const {
19292 int64_t HighPart = MinOffset & ~0xfffULL;
19293 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
19294 // Rebase the value to an integer multiple of imm12.
19295 return HighPart;
19296 }
19297
19298 return 0;
19299}
19300
19302 // Consider splitting large offset of struct or array.
19303 return true;
19304}
19305
19307 const MachineFunction &MF, EVT VT) const {
19308 EVT ScalarVT = VT.getScalarType();
19309
19310 if (!ScalarVT.isSimple())
19311 return false;
19312
19313 switch (ScalarVT.getSimpleVT().SimpleTy) {
19314 case MVT::f16:
19315 return Subtarget->hasFullFP16();
19316 case MVT::f32:
19317 case MVT::f64:
19318 return true;
19319 case MVT::bf16:
19320 return VT.isScalableVector() && Subtarget->hasBF16() &&
19321 Subtarget->isNonStreamingSVEorSME2Available();
19322 default:
19323 break;
19324 }
19325
19326 return false;
19327}
19328
19330 Type *Ty) const {
19331 switch (Ty->getScalarType()->getTypeID()) {
19332 case Type::FloatTyID:
19333 case Type::DoubleTyID:
19334 return true;
19335 default:
19336 return false;
19337 }
19338}
19339
19341 EVT VT, CodeGenOptLevel OptLevel) const {
19342 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
19344}
19345
19346const MCPhysReg *
19348 // LR is a callee-save register, but we must treat it as clobbered by any call
19349 // site. Hence we include LR in the scratch registers, which are in turn added
19350 // as implicit-defs for stackmaps and patchpoints.
19351 static const MCPhysReg ScratchRegs[] = {
19352 AArch64::X16, AArch64::X17, AArch64::LR, 0
19353 };
19354 return ScratchRegs;
19355}
19356
19358 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
19359 return RCRegs;
19360}
19361
19362bool
19364 CombineLevel Level) const {
19365 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
19366 N->getOpcode() == ISD::SRL) &&
19367 "Expected shift op");
19368
19369 SDValue ShiftLHS = N->getOperand(0);
19370 EVT VT = N->getValueType(0);
19371
19372 if (!ShiftLHS->hasOneUse())
19373 return false;
19374
19375 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
19376 !ShiftLHS.getOperand(0)->hasOneUse())
19377 return false;
19378
19379 // If ShiftLHS is an unsigned bit extraction ((x >> C) & mask), do not
19380 // combine it with shift 'N' so that it can still be lowered to UBFX, except
19381 // for ((x >> C) & mask) << C.
19382 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
19383 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
19384 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
19385 if (isMask_64(TruncMask)) {
19386 SDValue AndLHS = ShiftLHS.getOperand(0);
19387 if (AndLHS.getOpcode() == ISD::SRL) {
19388 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
19389 if (N->getOpcode() == ISD::SHL)
19390 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
19391 return SRLC->getZExtValue() == SHLC->getZExtValue();
19392 return false;
19393 }
19394 }
19395 }
19396 }
19397 return true;
19398}
19399
19401 const SDNode *N) const {
19402 assert(N->getOpcode() == ISD::XOR &&
19403 (N->getOperand(0).getOpcode() == ISD::SHL ||
19404 N->getOperand(0).getOpcode() == ISD::SRL) &&
19405 "Expected XOR(SHIFT) pattern");
19406
19407 // Only commute if the entire NOT mask is a hidden shifted mask.
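// For example, for an i32 SHL by 8 the XOR constant must be 0xFFFFFF00
// (MaskIdx == 8, MaskLen == 24); for an SRL by 8 it must be 0x00FFFFFF.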
19408 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
19409 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19410 if (XorC && ShiftC) {
19411 unsigned MaskIdx, MaskLen;
19412 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
19413 unsigned ShiftAmt = ShiftC->getZExtValue();
19414 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
19415 if (N->getOperand(0).getOpcode() == ISD::SHL)
19416 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
19417 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
19418 }
19419 }
19420
19421 return false;
19422}
19423
19425 const SDNode *N) const {
19426 assert(((N->getOpcode() == ISD::SHL &&
19427 N->getOperand(0).getOpcode() == ISD::SRL) ||
19428 (N->getOpcode() == ISD::SRL &&
19429 N->getOperand(0).getOpcode() == ISD::SHL)) &&
19430 "Expected shift-shift mask");
19431 // Don't allow multiuse shift folding with the same shift amount.
19432 if (!N->getOperand(0)->hasOneUse())
19433 return false;
19434
19435 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
19436 EVT VT = N->getValueType(0);
19437 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
19438 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
19439 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
19440 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
19441 }
19442
19443 // We do not need to fold when this shift is used in the specific load case:
19444 // (ldr x, (add x, (shl (srl x, c1) 2)))
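// When the shl amount matches log2 of the access size, the shl can instead be
// folded into the load's scaled register addressing mode, so we return false
// and keep the shift pair.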
19445 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
19446 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
19447 unsigned ShlAmt = C2->getZExtValue();
19448 if (auto ShouldADD = *N->user_begin();
19449 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
19450 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
19451 EVT MemVT = Load->getMemoryVT();
19452
19453 if (Load->getValueType(0).isScalableVector())
19454 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
19455
19456 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
19457 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
19458 }
19459 }
19460 }
19461 }
19462
19463 return true;
19464}
19465
19467 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
19468 SDValue Y) const {
19469 return VT.isScalableVector() && isTypeLegal(VT) &&
19470 SelectOpcode == ISD::VSELECT;
19471}
19472
19474 Type *Ty) const {
19475 assert(Ty->isIntegerTy());
19476
19477 unsigned BitSize = Ty->getPrimitiveSizeInBits();
19478 if (BitSize == 0)
19479 return false;
19480
19481 int64_t Val = Imm.getSExtValue();
19482 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
19483 return true;
19484
19485 if (Val < 0)
19486 Val = ~Val;
19487 if (BitSize == 32)
19488 Val &= (1LL << 32) - 1;
19489
19490 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
19491 // MOVZ is free so return true for one or fewer MOVK.
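// For example, 0x0000123456789ABC has its highest set bit below bit 48
// (Shift == 2), so we return true; a constant using all four 16-bit chunks,
// e.g. 0x1234567812345678, yields Shift == 3 and returns false.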
19492 return Shift < 3;
19493}
19494
19496 unsigned Index) const {
19498 return false;
19499
19500 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
19501}
19502
19504 LLVMContext &Context, EVT VT) const {
19505 if (getTypeAction(Context, VT) != TypeExpandInteger)
19506 return false;
19507
19508 EVT LegalTy = EVT::getIntegerVT(Context, VT.getSizeInBits() / 2);
19509 return getTypeAction(Context, LegalTy) == TargetLowering::TypeLegal;
19510}
19511
19512/// Turn vector tests of the signbit in the form of:
19513/// xor (sra X, elt_size(X)-1), -1
19514/// into:
19515/// cmge X, X, #0
19517 const AArch64Subtarget *Subtarget) {
19518 EVT VT = N->getValueType(0);
19519 if (!Subtarget->hasNEON() || !VT.isVector())
19520 return SDValue();
19521
19522 // There must be an arithmetic shift right before the xor, and the xor must be a
19523 // 'not' operation.
19524 SDValue Shift = N->getOperand(0);
19525 SDValue Ones = N->getOperand(1);
19526 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
19528 return SDValue();
19529
19530 // The shift should be smearing the sign bit across each vector element.
19531 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
19532 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
19533 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
19534 return SDValue();
19535
19536 SDLoc DL(N);
19537 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
19538 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
19539}
19540
19541// Given a vecreduce_add node, detect the below pattern and convert it to the
19542 // node sequence with UABDL, [S|U]ABD and UADDLP.
19543//
19544// i32 vecreduce_add(
19545// v16i32 abs(
19546// v16i32 sub(
19547// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
19548//
19549// or
19550//
19551// i32 vecreduce_add(
19552// v16i32 zext(
19553// v16i16 abs(
19554// v16i16 sub(
19555// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
19556//
19557// =================>
19558// i32 vecreduce_add(
19559// v4i32 UADDLP(
19560// v8i16 add(
19561// v8i16 zext(
19562// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
19563// v8i16 zext(
19564// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
19566 SelectionDAG &DAG) {
19567 // Assumed i32 vecreduce_add
19568 if (N->getValueType(0) != MVT::i32)
19569 return SDValue();
19570
19571 SDValue VecReduceOp0 = N->getOperand(0);
19572 bool SawTrailingZext = false;
19573 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
19574 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
19575 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
19576 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
19577 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
19578 SawTrailingZext = true;
19579 VecReduceOp0 = VecReduceOp0.getOperand(0);
19580 }
19581
19582 // Select the expected type of the ABS node based on whether the ZEXT was peeled.
19583 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
19584 // Assumed v16i16 or v16i32 abs input
19585 unsigned Opcode = VecReduceOp0.getOpcode();
19586 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
19587 return SDValue();
19588
19589 SDValue ABS = VecReduceOp0;
19590 // Assumed v16i16 or v16i32 sub
19591 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
19592 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
19593 return SDValue();
19594
19595 SDValue SUB = ABS->getOperand(0);
19596 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
19597 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
19598 // Assumed v16i16 or v16i32 type
19599 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
19600 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
19601 return SDValue();
19602
19603 // Assumed zext or sext
19604 bool IsZExt = false;
19605 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
19606 IsZExt = true;
19607 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
19608 IsZExt = false;
19609 } else
19610 return SDValue();
19611
19612 SDValue EXT0 = SUB->getOperand(0);
19613 SDValue EXT1 = SUB->getOperand(1);
19614 // Assumed zext's operand has v16i8 type
19615 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
19616 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
19617 return SDValue();
19618
19619 // Pattern is detected. Let's convert it to a sequence of nodes.
19620 SDLoc DL(N);
19621
19622 // First, create the node pattern of UABD/SABD.
19623 SDValue UABDHigh8Op0 =
19624 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19625 DAG.getConstant(8, DL, MVT::i64));
19626 SDValue UABDHigh8Op1 =
19627 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19628 DAG.getConstant(8, DL, MVT::i64));
19629 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19630 UABDHigh8Op0, UABDHigh8Op1);
19631 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
19632
19633 // Second, create the node pattern of UABAL.
19634 SDValue UABDLo8Op0 =
19635 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
19636 DAG.getConstant(0, DL, MVT::i64));
19637 SDValue UABDLo8Op1 =
19638 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
19639 DAG.getConstant(0, DL, MVT::i64));
19640 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
19641 UABDLo8Op0, UABDLo8Op1);
19642 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
19643 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
19644
19645 // Third, create the node of UADDLP.
19646 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
19647
19648 // Fourth, create the node of VECREDUCE_ADD.
19649 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
19650}
19651
19652static SDValue
19654 const AArch64Subtarget *ST) {
19655 if (DCI.isBeforeLegalize())
19656 return SDValue();
19657
19658 if (SDValue Brk = optimizeBrk(N, DCI.DAG))
19659 return Brk;
19660
19661 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
19662 /*IsEqual=*/false))
19663 return While;
19664
19665 if (!N->getValueType(0).isScalableVector() ||
19666 !ST->isSVEorStreamingSVEAvailable() ||
19667 !(ST->hasSVE2p1() || ST->hasSME2()))
19668 return SDValue();
19669
19670 // Count the number of users which are extract_subvector nodes.
19671 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
19672 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
19673 });
19674
19675 auto MaskEC = N->getValueType(0).getVectorElementCount();
19676 if (!MaskEC.isKnownMultipleOf(NumExts))
19677 return SDValue();
19678
19679 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
19680 if (ExtMinEC.getKnownMinValue() < 2)
19681 return SDValue();
19682
19683 SmallVector<SDNode *> Extracts(NumExts, nullptr);
19684 for (SDNode *Use : N->users()) {
19685 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
19686 continue;
19687
19688 // Ensure the extract type is correct (e.g. if NumExts is 4 and
19689 // the mask return type is nxv8i1, each extract should be nxv2i1).
19690 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
19691 return SDValue();
19692
19693 // There should be exactly one extract for each part of the mask.
19694 unsigned Offset = Use->getConstantOperandVal(1);
19695 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
19696 if (Extracts[Part] != nullptr)
19697 return SDValue();
19698
19699 Extracts[Part] = Use;
19700 }
19701
19702 SelectionDAG &DAG = DCI.DAG;
19703 SDLoc DL(N);
19704 SDValue ID =
19705 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
19706
19707 SDValue Idx = N->getOperand(0);
19708 SDValue TC = N->getOperand(1);
19709 if (Idx.getValueType() != MVT::i64) {
19710 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
19711 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
19712 }
19713
19714 // Create the whilelo_x2 intrinsics from each pair of extracts
19715 EVT ExtVT = Extracts[0]->getValueType(0);
19716 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
19717 auto R =
19718 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19719 DCI.CombineTo(Extracts[0], R.getValue(0));
19720 DCI.CombineTo(Extracts[1], R.getValue(1));
19721 SmallVector<SDValue> Concats = {DAG.getNode(
19722 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
19723
19724 if (NumExts == 2) {
19725 assert(N->getValueType(0) == DoubleExtVT);
19726 return Concats[0];
19727 }
19728
19729 auto Elts =
19730 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
19731 for (unsigned I = 2; I < NumExts; I += 2) {
19732 // After the first whilelo_x2, we need to increment the starting value.
19733 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
19734 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
19735 DCI.CombineTo(Extracts[I], R.getValue(0));
19736 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
19737 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
19738 R.getValue(0), R.getValue(1)));
19739 }
19740
19741 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
19742}
19743
19744// Turn vecreduce.add(ZExt(predicate)) into cntp(predicate).
19746 const AArch64Subtarget *ST) {
19747 SDValue Op = N->getOperand(0);
19748 if (Op->getOpcode() != ISD::ZERO_EXTEND)
19749 return SDValue();
19750
19751 SDValue ZExtOp = Op->getOperand(0);
19752 EVT VT = ZExtOp.getValueType();
19753 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
19755 return SDValue();
19756
19757 SDLoc DL(N);
19758 SDValue Cntp = DAG.getNode(
19759 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
19760 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), ZExtOp,
19761 ZExtOp);
19762 return DAG.getZExtOrTrunc(Cntp, DL, N->getValueType(0));
19763}
19764
19766 const AArch64Subtarget *ST) {
19767 if (SDValue Result = performVecReduceAddCntpCombine(N, DAG, ST))
19768 return Result;
19769
19770 if (!ST->isNeonAvailable())
19771 return SDValue();
19772
19773 if (!ST->hasDotProd())
19775
19776 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19777 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19778 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19779 // If we have vectors larger than v16i8 we extract v16i8 vectors,
19780 // follow the same steps above to get DOT instructions, concatenate them,
19781 // and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
19782
19783 SDValue Op0 = N->getOperand(0);
19784 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19785 Op0.getValueType().getVectorElementType() != MVT::i32)
19786 return SDValue();
19787
19788 unsigned ExtOpcode = Op0.getOpcode();
19789 SDValue A = Op0;
19790 SDValue B;
19791 unsigned DotOpcode;
19792 if (ExtOpcode == ISD::MUL) {
19793 A = Op0.getOperand(0);
19794 B = Op0.getOperand(1);
19795 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19796 return SDValue();
19797 auto OpCodeA = A.getOpcode();
19798 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19799 return SDValue();
19800
19801 auto OpCodeB = B.getOpcode();
19802 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19803 return SDValue();
19804
19805 if (OpCodeA == OpCodeB) {
19806 DotOpcode =
19807 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19808 } else {
19809 // Check USDOT support
19810 if (!ST->hasMatMulInt8())
19811 return SDValue();
19812 DotOpcode = AArch64ISD::USDOT;
19813 if (OpCodeA == ISD::SIGN_EXTEND)
19814 std::swap(A, B);
19815 }
19816 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19817 DotOpcode = AArch64ISD::UDOT;
19818 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19819 DotOpcode = AArch64ISD::SDOT;
19820 } else {
19821 return SDValue();
19822 }
19823
19824 EVT Op0VT = A.getOperand(0).getValueType();
19825 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19826 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19827 if (!IsValidElementCount || !IsValidSize)
19828 return SDValue();
19829
19830 SDLoc DL(Op0);
19831 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19832 // the extend B.
19833 if (!B)
19834 B = DAG.getConstant(1, DL, Op0VT);
19835 else
19836 B = B.getOperand(0);
19837
19838 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19839 unsigned NumOfVecReduce;
19840 EVT TargetType;
19841 if (IsMultipleOf16) {
19842 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19843 TargetType = MVT::v4i32;
19844 } else {
19845 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19846 TargetType = MVT::v2i32;
19847 }
19848 // Handle the case where we need to generate only one Dot operation.
19849 if (NumOfVecReduce == 1) {
19850 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19851 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19852 A.getOperand(0), B);
19853 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19854 }
19855 // Generate Dot instructions over the parts that are a multiple of 16.
19856 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19857 SmallVector<SDValue, 4> SDotVec16;
19858 unsigned I = 0;
19859 for (; I < VecReduce16Num; I += 1) {
19860 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19861 SDValue Op0 =
19862 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19863 DAG.getConstant(I * 16, DL, MVT::i64));
19864 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19865 DAG.getConstant(I * 16, DL, MVT::i64));
19866 SDValue Dot =
19867 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19868 SDotVec16.push_back(Dot);
19869 }
19870 // Concatenate dot operations.
19871 EVT SDot16EVT =
19872 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19873 SDValue ConcatSDot16 =
19874 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19875 SDValue VecReduceAdd16 =
19876 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19877 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19878 if (VecReduce8Num == 0)
19879 return VecReduceAdd16;
19880
19881 // Generate the remainder Dot operation for the part that is a multiple of 8.
19882 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19883 SDValue Vec8Op0 =
19884 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19885 DAG.getConstant(I * 16, DL, MVT::i64));
19886 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19887 DAG.getConstant(I * 16, DL, MVT::i64));
19888 SDValue Dot =
19889 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19890 SDValue VecReduceAdd8 =
19891 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19892 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19893 VecReduceAdd8);
19894}
19895
19896// Given an (integer) vecreduce, we know the order of the inputs does not
19897// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19898// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19899// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
19901 auto DetectAddExtract = [&](SDValue A) {
19902 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19903 // UADDLP(x) if found.
19904 assert(A.getOpcode() == ISD::ADD);
19905 EVT VT = A.getValueType();
19906 SDValue Op0 = A.getOperand(0);
19907 SDValue Op1 = A.getOperand(1);
19908 if (Op0.getOpcode() != Op1.getOpcode() ||
19909 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19910 Op0.getOpcode() != ISD::SIGN_EXTEND))
19911 return SDValue();
19912 SDValue Ext0 = Op0.getOperand(0);
19913 SDValue Ext1 = Op1.getOperand(0);
19914 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19916 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19918 return SDValue();
19919 // Check that the type is twice the add type, and the extracts are from
19920 // upper/lower parts of the same source.
19922 VT.getVectorNumElements() * 2)
19923 return SDValue();
19924 if ((Ext0.getConstantOperandVal(1) != 0 ||
19926 (Ext1.getConstantOperandVal(1) != 0 ||
19928 return SDValue();
19929 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19930 : AArch64ISD::SADDLP;
19931 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19932 };
19933
19934 if (SDValue R = DetectAddExtract(A))
19935 return R;
19936
19937 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19938 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19939 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19940 A.getOperand(1));
19941 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19942 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19943 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19944 A.getOperand(0));
19945 return SDValue();
19946}
19947
19948// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19949// UADDLV(concat), where the concat represents the 64-bit zext sources.
19951 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19952 // UADDLV(concat(zext, zext)) if found.
19953 assert(A.getOpcode() == ISD::ADD);
19954 EVT VT = A.getValueType();
19955 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19956 return SDValue();
19957 SDValue Op0 = A.getOperand(0);
19958 SDValue Op1 = A.getOperand(1);
19959 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19960 return SDValue();
19961 SDValue Ext0 = Op0.getOperand(0);
19962 SDValue Ext1 = Op1.getOperand(0);
19963 EVT ExtVT0 = Ext0.getValueType();
19964 EVT ExtVT1 = Ext1.getValueType();
19965 // Check zext VTs are the same and 64-bit length.
19966 if (ExtVT0 != ExtVT1 ||
19967 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19968 return SDValue();
19969 // Get VT for concat of zext sources.
19970 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19971 SDValue Concat =
19972 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19973
19974 switch (VT.getSimpleVT().SimpleTy) {
19975 case MVT::v2i64:
19976 case MVT::v4i32:
19977 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19978 case MVT::v8i16: {
19979 SDValue Uaddlv =
19980 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19981 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19982 }
19983 default:
19984 llvm_unreachable("Unhandled vector type");
19985 }
19986}
19987
19989 SDValue A = N->getOperand(0);
19990 if (A.getOpcode() == ISD::ADD) {
19991 if (SDValue R = performUADDVAddCombine(A, DAG))
19992 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19993 else if (SDValue R = performUADDVZextCombine(A, DAG))
19994 return R;
19995 }
19996
19997 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19998 MVT OpVT = A.getSimpleValueType();
19999 assert(N->getSimpleValueType(0) == OpVT &&
20000 "The operand type should be consistent with the result type of UADDV");
20002 Mask.clearBit(0);
20003 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
20004 if (KnownLeadingLanes.isZero())
20005 return A;
20006
20007 return SDValue();
20008}
20009
20013 APInt::getAllOnes(N->getValueType(0).getScalarSizeInBits());
20014 APInt DemandedElts =
20015 APInt::getAllOnes(N->getValueType(0).getVectorNumElements());
20016
20018 SDValue(N, 0), DemandedBits, DemandedElts, DCI))
20019 return SDValue(N, 0);
20020 return SDValue();
20021}
20022
20025 const AArch64Subtarget *Subtarget) {
20026 if (DCI.isBeforeLegalizeOps())
20027 return SDValue();
20028
20029 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
20030}
20031
20032SDValue
20033AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
20034 SelectionDAG &DAG,
20035 SmallVectorImpl<SDNode *> &Created) const {
20036 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
20037 if (isIntDivCheap(N->getValueType(0), Attr))
20038 return SDValue(N, 0); // Lower SDIV as SDIV
20039
20040 EVT VT = N->getValueType(0);
20041
20042 // If SVE is available, we can generate
20043 // sdiv(x,y) -> ptrue + asrd, where 'y' is a positive pow-2 divisor.
20044 // sdiv(x,y) -> ptrue + asrd + subr, where 'y' is a negative pow-2 divisor.
20045 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
20046 return SDValue(N, 0);
20047
20048 // fold (sdiv X, pow2)
20049 if ((VT != MVT::i32 && VT != MVT::i64) ||
20050 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
20051 return SDValue();
20052
20053 // If the divisor is 2 or -2, the default expansion is better. It will add
20054 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
20055 if (Divisor == 2 ||
20056 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
20057 return SDValue();
20058
20059 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
20060}
20061
20062SDValue
20063AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
20064 SelectionDAG &DAG,
20065 SmallVectorImpl<SDNode *> &Created) const {
20066 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
20067 if (isIntDivCheap(N->getValueType(0), Attr))
20068 return SDValue(N, 0); // Lower SREM as SREM
20069
20070 EVT VT = N->getValueType(0);
20071
20072 // For scalable and fixed types, mark them as cheap so we can handle them much
20073 // later. This allows us to handle larger-than-legal types.
20074 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
20075 return SDValue(N, 0);
20076
20077 // fold (srem X, pow2)
20078 if ((VT != MVT::i32 && VT != MVT::i64) ||
20079 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
20080 return SDValue();
20081
20082 unsigned Lg2 = Divisor.countr_zero();
20083 if (Lg2 == 0)
20084 return SDValue();
20085
20086 SDLoc DL(N);
20087 SDValue N0 = N->getOperand(0);
20088 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
20089 SDValue Zero = DAG.getConstant(0, DL, VT);
20090 SDValue CCVal, CSNeg;
20091 if (Lg2 == 1) {
20092 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
20093 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
20094 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
20095
20096 Created.push_back(Cmp.getNode());
20097 Created.push_back(And.getNode());
20098 } else {
20099 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
20100 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
20101
20102 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
20103 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
20104 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
20105 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
20106 Negs.getValue(1));
20107
20108 Created.push_back(Negs.getNode());
20109 Created.push_back(AndPos.getNode());
20110 Created.push_back(AndNeg.getNode());
20111 }
20112
20113 return CSNeg;
20114}
20115
20117 switch(getIntrinsicID(S.getNode())) {
20118 default:
20119 break;
20120 case Intrinsic::aarch64_sve_cntb:
20121 case Intrinsic::aarch64_sve_cnth:
20122 case Intrinsic::aarch64_sve_cntw:
20123 case Intrinsic::aarch64_sve_cntd:
20124 return true;
20125 }
20126 return false;
20127}
20128
20129// Returns the maximum (scalable) value that can be returned by an SVE count
20130// intrinsic. Returns std::nullopt if \p Op is not aarch64_sve_cnt*.
20131static std::optional<ElementCount> getMaxValueForSVECntIntrinsic(SDValue Op) {
20132 Intrinsic::ID IID = getIntrinsicID(Op.getNode());
20133 if (IID == Intrinsic::aarch64_sve_cntp)
20134 return Op.getOperand(1).getValueType().getVectorElementCount();
20135 switch (IID) {
20136 case Intrinsic::aarch64_sve_cntd:
20137 return ElementCount::getScalable(2);
20138 case Intrinsic::aarch64_sve_cntw:
20139 return ElementCount::getScalable(4);
20140 case Intrinsic::aarch64_sve_cnth:
20141 return ElementCount::getScalable(8);
20142 case Intrinsic::aarch64_sve_cntb:
20143 return ElementCount::getScalable(16);
20144 default:
20145 return std::nullopt;
20146 }
20147}
20148
20149/// Calculates what the pre-extend type is, based on the extension
20150/// operation node provided by \p Extend.
20151///
20152/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
20153/// pre-extend type is pulled directly from the operand, while other extend
20154/// operations need a bit more inspection to get this information.
20155///
20156/// \param Extend The SDNode from the DAG that represents the extend operation
20157///
20158/// \returns The type representing the \p Extend source type, or \p MVT::Other
20159/// if no valid type can be determined
20161 switch (Extend.getOpcode()) {
20162 case ISD::SIGN_EXTEND:
20163 case ISD::ZERO_EXTEND:
20164 case ISD::ANY_EXTEND:
20165 return Extend.getOperand(0).getValueType();
20166 case ISD::AssertSext:
20167 case ISD::AssertZext:
20169 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
20170 if (!TypeNode)
20171 return MVT::Other;
20172 return TypeNode->getVT();
20173 }
20174 case ISD::AND: {
20177 if (!Constant)
20178 return MVT::Other;
20179
20180 uint32_t Mask = Constant->getZExtValue();
20181
20182 if (Mask == UCHAR_MAX)
20183 return MVT::i8;
20184 else if (Mask == USHRT_MAX)
20185 return MVT::i16;
20186 else if (Mask == UINT_MAX)
20187 return MVT::i32;
20188
20189 return MVT::Other;
20190 }
20191 default:
20192 return MVT::Other;
20193 }
20194}
20195
20196/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
20197/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
20198/// SExt/ZExt rather than the scalar SExt/ZExt
20200 EVT VT = BV.getValueType();
20201 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
20203 return SDValue();
20204
20205 // Use the first item in the buildvector/shuffle to get the size of the
20206 // extend, and make sure it looks valid.
20207 SDValue Extend = BV->getOperand(0);
20208 unsigned ExtendOpcode = Extend.getOpcode();
20209 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
20210 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
20211 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
20212 ExtendOpcode == ISD::AssertSext;
20213 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
20214 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
20215 return SDValue();
20216 // Shuffle inputs are vectors; limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
20217 // ensure calculatePreExtendType will work without issue.
20218 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
20219 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND &&
20220 ExtendOpcode != ISD::ANY_EXTEND)
20221 return SDValue();
20222
20223 // Restrict valid pre-extend data type
20224 EVT PreExtendType = calculatePreExtendType(Extend);
20225 if (PreExtendType == MVT::Other ||
20226 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
20227 return SDValue();
20228
20229 // Make sure all other operands are equally extended.
20230 bool SeenZExtOrSExt = !IsAnyExt;
20231 for (SDValue Op : drop_begin(BV->ops())) {
20232 if (Op.isUndef())
20233 continue;
20234
20235 if (calculatePreExtendType(Op) != PreExtendType)
20236 return SDValue();
20237
20238 unsigned Opc = Op.getOpcode();
20241 return SDValue();
20242
20243 if (Opc == ISD::ANY_EXTEND)
20244 continue;
20245
20246 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
20248
20249 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
20250 return SDValue();
20251
20252 IsSExt = OpcIsSExt;
20253 SeenZExtOrSExt = true;
20254 }
20255
20256 SDValue NBV;
20257 SDLoc DL(BV);
20258 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
20259 EVT PreExtendVT =
20260 VT.changeVectorElementType(*DAG.getContext(), PreExtendType);
20261 EVT PreExtendLegalType =
20262 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
20264 for (SDValue Op : BV->ops())
20265 NewOps.push_back(Op.isUndef() ? DAG.getPOISON(PreExtendLegalType)
20266 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
20267 PreExtendLegalType));
20268 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
20269 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
20270 EVT PreExtendVT = VT.changeVectorElementType(*DAG.getContext(),
20271 PreExtendType.getScalarType());
20272 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
20273 BV.getOperand(1).isUndef()
20274 ? DAG.getPOISON(PreExtendVT)
20275 : BV.getOperand(1).getOperand(0),
20276 cast<ShuffleVectorSDNode>(BV)->getMask());
20277 }
20278 unsigned ExtOpc = !SeenZExtOrSExt
20280 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
20281 return DAG.getNode(ExtOpc, DL, VT, NBV);
20282}
20283
20284/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
20285/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
20287 // If the value type isn't a vector, none of the operands are going to be dups
20288 EVT VT = Mul->getValueType(0);
20289 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
20290 return SDValue();
20291
20292 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
20293 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
20294
20295 // Neither operand has been changed; don't make any further changes.
20296 if (!Op0 && !Op1)
20297 return SDValue();
20298
20299 SDLoc DL(Mul);
20300 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
20301 Op1 ? Op1 : Mul->getOperand(1));
20302}
20303
20304 // Multiplying an RDSVL value by a constant can sometimes be done more cheaply by
20305// folding a power-of-two factor of the constant into the RDSVL immediate and
20306// compensating with an extra shift.
20307//
20308// We rewrite:
20309// (mul (srl (rdsvl 1), w), x)
20310// to one of:
20311// (shl (rdsvl y), z) if z > 0
20312// (srl (rdsvl y), abs(z)) if z < 0
20313// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
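// For example, (mul (srl (rdsvl 1), 1), 96) has w = 1 and x = 96; choosing
// z = 1 gives y = 96 / 2^(1+1) = 24, so we emit (shl (rdsvl 24), 1).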
20315 SDLoc DL(Mul);
20316 EVT VT = Mul->getValueType(0);
20317 SDValue MulOp0 = Mul->getOperand(0);
20318 int ConstMultiplier =
20319 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
20320 if ((MulOp0->getOpcode() != ISD::SRL) ||
20321 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
20322 return SDValue();
20323
20324 unsigned AbsConstValue = abs(ConstMultiplier);
20325 unsigned OperandShift =
20326 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
20327
20328 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
20329 // integral)
20330 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
20331
20332 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
20333 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
20334 unsigned B = ConstMultiplier < 0 ? 32 : 31;
20335 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
20336 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
20337
20338 // No valid solution found.
20339 if (LowerBound > UpperBound)
20340 return SDValue();
20341
20342 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
20343 // shift if possible.
20344 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
20345
20346 // y = x / 2^(w + z)
20347 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
20348 (ConstMultiplier < 0 ? -1 : 1);
20349 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
20350 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
20351
20352 if (Shift == 0)
20353 return Rdsvl;
20354 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
20355 DAG.getConstant(abs(Shift), DL, MVT::i32),
20357}
20358
20359// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
20360// Same for other types with equivalent constants.
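// After the srl/and, each i32 lane holds the sign bits of its two i16 halves in
// bits 0 and 16; multiplying by 0xffff turns each set bit into an all-ones i16
// half, which is what a compare-less-than-zero of X viewed as v8i16 produces.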
20362 EVT VT = N->getValueType(0);
20363 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
20364 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
20365 return SDValue();
20366 if (N->getOperand(0).getOpcode() != ISD::AND ||
20367 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
20368 return SDValue();
20369
20370 SDValue And = N->getOperand(0);
20371 SDValue Srl = And.getOperand(0);
20372
20373 APInt V1, V2, V3;
20374 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
20375 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
20377 return SDValue();
20378
20379 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
20380 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
20381 V3 != (HalfSize - 1))
20382 return SDValue();
20383
20384 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20385 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
20386 VT.getVectorElementCount() * 2);
20387
20388 SDLoc DL(N);
20389 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
20390 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
20391 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
20392 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
20393}
20394
20395// Transform vector add(zext i8 to i32, zext i8 to i32)
20396// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
20397 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
20398 // extends.
20400 EVT VT = N->getValueType(0);
20401 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
20402 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
20403 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
20404 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
20405 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
20406 N->getOperand(0).getOperand(0).getValueType() !=
20407 N->getOperand(1).getOperand(0).getValueType())
20408 return SDValue();
20409
20410 if (N->getOpcode() == ISD::MUL &&
20411 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
20412 return SDValue();
20413
20414 SDValue N0 = N->getOperand(0).getOperand(0);
20415 SDValue N1 = N->getOperand(1).getOperand(0);
20416 EVT InVT = N0.getValueType();
20417
20418 EVT S1 = InVT.getScalarType();
20419 EVT S2 = VT.getScalarType();
20420 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
20421 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
20422 SDLoc DL(N);
20423 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
20426 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
20427 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
20428 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
20429 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
20430 : (unsigned)ISD::SIGN_EXTEND,
20431 DL, VT, NewOp);
20432 }
20433 return SDValue();
20434}
20435
20438 const AArch64Subtarget *Subtarget) {
20439
20440 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
20441 return Ext;
20443 return Ext;
20444 if (SDValue Ext = performVectorExtCombine(N, DAG))
20445 return Ext;
20446 if (DCI.isBeforeLegalizeOps())
20447 return SDValue();
20448
20449 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
20450 // and in MachineCombiner pass, add+mul will be combined into madd.
20451 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
20452 SDLoc DL(N);
20453 EVT VT = N->getValueType(0);
20454 SDValue N0 = N->getOperand(0);
20455 SDValue N1 = N->getOperand(1);
20456 SDValue MulOper;
20457 unsigned AddSubOpc;
20458
20459 auto IsAddSubWith1 = [&](SDValue V) -> bool {
20460 AddSubOpc = V->getOpcode();
20461 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
20462 SDValue Opnd = V->getOperand(1);
20463 MulOper = V->getOperand(0);
20464 if (AddSubOpc == ISD::SUB)
20465 std::swap(Opnd, MulOper);
20466 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
20467 return C->isOne();
20468 }
20469 return false;
20470 };
20471
20472 if (IsAddSubWith1(N0)) {
20473 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
20474 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
20475 }
20476
20477 if (IsAddSubWith1(N1)) {
20478 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
20479 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
20480 }
20481
20482 // The below optimizations require a constant RHS.
20483 if (!isa<ConstantSDNode>(N1))
20484 return SDValue();
20485
20486 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
20487 return Ext;
20488
20490 const APInt &ConstValue = C->getAPIntValue();
20491
20492 // Allow the scaling to be folded into the `cnt` instruction by preventing
20493 // the scaling from being obscured here. This makes it easier to pattern match.
20494 if (IsSVECntIntrinsic(N0) ||
20495 (N0->getOpcode() == ISD::TRUNCATE &&
20496 (IsSVECntIntrinsic(N0->getOperand(0)))))
20497 if (ConstValue.sge(1) && ConstValue.sle(16))
20498 return SDValue();
20499
20500 // Multiplication of a power of two plus/minus one can be done more
20501 // cheaply as shift+add/sub. For now, this is true unilaterally. If
20502 // future CPUs have a cheaper MADD instruction, this may need to be
20503 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
20504 // 64-bit is 5 cycles, so this is always a win.
20505 // More aggressively, some multiplications N0 * C can be lowered to
20506 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
20507 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
20508 // TODO: lower more cases.
20509
20510 // TrailingZeroes is used to test if the mul can be lowered to
20511 // shift+add+shift.
20512 unsigned TrailingZeroes = ConstValue.countr_zero();
20513 if (TrailingZeroes) {
20514 // Conservatively do not lower to shift+add+shift if the mul might be
20515 // folded into smul or umul.
20516 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
20517 isZeroExtended(N0, DAG)))
20518 return SDValue();
20519 // Conservatively do not lower to shift+add+shift if the mul might be
20520 // folded into madd or msub.
20521 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
20522 N->user_begin()->getOpcode() == ISD::SUB))
20523 return SDValue();
20524 }
20525 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
20526 // and shift+add+shift.
20527 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
20528 unsigned ShiftAmt;
20529
20530 auto Shl = [&](SDValue N0, unsigned N1) {
20531 if (!N0.getNode())
20532 return SDValue();
20533 // If shift causes overflow, ignore this combine.
20534 if (N1 >= N0.getValueSizeInBits())
20535 return SDValue();
20536 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
20537 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
20538 };
20539 auto Add = [&](SDValue N0, SDValue N1) {
20540 if (!N0.getNode() || !N1.getNode())
20541 return SDValue();
20542 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
20543 };
20544 auto Sub = [&](SDValue N0, SDValue N1) {
20545 if (!N0.getNode() || !N1.getNode())
20546 return SDValue();
20547 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
20548 };
20549 auto Negate = [&](SDValue N) {
20550 if (!N0.getNode())
20551 return SDValue();
20552 SDValue Zero = DAG.getConstant(0, DL, VT);
20553 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
20554 };
20555
20556 // Can the const C be decomposed into (1+2^M)*(1+2^N), e.g.:
20557 // C = 45 is equal to (1+4)*(1+8); we don't decompose it into (1+2)*(16-1)
20558 // because the (2^N - 1) can't be executed with a single instruction.
20559 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
20560 unsigned BitWidth = C.getBitWidth();
20561 for (unsigned i = 1; i < BitWidth / 2; i++) {
20562 APInt Rem;
20563 APInt X(BitWidth, (1 << i) + 1);
20564 APInt::sdivrem(C, X, N, Rem);
20565 APInt NVMinus1 = N - 1;
20566 if (Rem == 0 && NVMinus1.isPowerOf2()) {
20567 M = X;
20568 return true;
20569 }
20570 }
20571 return false;
20572 };
20573
20574 // Can the const C be decomposed into ((2^M + 1) * 2^N + 1), e.g.:
20575 // C = 11 is equal to (1+4)*2+1; we don't decompose it into (1+2)*4-1
20576 // because the (2^N - 1) can't be executed with a single instruction.
20577 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
20578 APInt CVMinus1 = C - 1;
20579 if (CVMinus1.isNegative())
20580 return false;
20581 unsigned TrailingZeroes = CVMinus1.countr_zero();
20582 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
20583 if (SCVMinus1.isPowerOf2()) {
20584 unsigned BitWidth = SCVMinus1.getBitWidth();
20585 M = APInt(BitWidth, SCVMinus1.logBase2());
20586 N = APInt(BitWidth, TrailingZeroes);
20587 return true;
20588 }
20589 return false;
20590 };
20591
20592 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), e.g.:
20593 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
20594 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
20595 APInt CVMinus1 = C - 1;
20596 if (CVMinus1.isNegative())
20597 return false;
20598 unsigned TrailingZeroes = CVMinus1.countr_zero();
20599 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
20600 if (CVPlus1.isPowerOf2()) {
20601 unsigned BitWidth = CVPlus1.getBitWidth();
20602 M = APInt(BitWidth, CVPlus1.logBase2());
20603 N = APInt(BitWidth, TrailingZeroes);
20604 return true;
20605 }
20606 return false;
20607 };
20608
20609 if (ConstValue.isNonNegative()) {
20610 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
20611 // (mul x, 2^N - 1) => (sub (shl x, N), x)
20612 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
20613 // (mul x, (2^M + 1) * (2^N + 1))
20614 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
20615 // (mul x, (2^M + 1) * 2^N + 1)
20616 // => MV = (add (shl x, M), x); (add (shl MV, N), x)
20617 // (mul x, 1 - (1 - 2^M) * 2^N)
20618 // => MV = (sub x, (shl x, M)); (sub x, (shl MV, N))
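// For example, (mul x, 40) has TrailingZeroes == 3 and ShiftedConstValue == 5;
// SCVMinus1 == 4 == 2^2, so we emit (shl (add (shl x, 2), x), 3).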
20619 APInt SCVMinus1 = ShiftedConstValue - 1;
20620 APInt SCVPlus1 = ShiftedConstValue + 1;
20621 APInt CVPlus1 = ConstValue + 1;
20622 APInt CVM, CVN;
20623 if (SCVMinus1.isPowerOf2()) {
20624 ShiftAmt = SCVMinus1.logBase2();
20625 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
20626 } else if (CVPlus1.isPowerOf2()) {
20627 ShiftAmt = CVPlus1.logBase2();
20628 return Sub(Shl(N0, ShiftAmt), N0);
20629 } else if (SCVPlus1.isPowerOf2()) {
20630 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20631 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
20632 }
20633 if (Subtarget->hasALULSLFast() &&
20634 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
20635 APInt CVMMinus1 = CVM - 1;
20636 APInt CVNMinus1 = CVN - 1;
20637 unsigned ShiftM1 = CVMMinus1.logBase2();
20638 unsigned ShiftN1 = CVNMinus1.logBase2();
20639 // ALULSLFast implies that shifts of <= 4 places are fast
20640 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
20641 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
20642 return Add(Shl(MVal, ShiftN1), MVal);
20643 }
20644 }
20645 if (Subtarget->hasALULSLFast() &&
20646 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
20647 unsigned ShiftM = CVM.getZExtValue();
20648 unsigned ShiftN = CVN.getZExtValue();
20649 // ALULSLFast implies that shifts of <= 4 places are fast
20650 if (ShiftM <= 4 && ShiftN <= 4) {
20651 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
20652 return Add(Shl(MVal, CVN.getZExtValue()), N0);
20653 }
20654 }
20655
20656 if (Subtarget->hasALULSLFast() &&
20657 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
20658 unsigned ShiftM = CVM.getZExtValue();
20659 unsigned ShiftN = CVN.getZExtValue();
20660 // ALULSLFast implies that shifts of <= 4 places are fast
20661 if (ShiftM <= 4 && ShiftN <= 4) {
20662 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
20663 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
20664 }
20665 }
20666 } else {
20667 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
20668 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
20669 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
20670 APInt SCVPlus1 = -ShiftedConstValue + 1;
20671 APInt CVNegPlus1 = -ConstValue + 1;
20672 APInt CVNegMinus1 = -ConstValue - 1;
20673 if (CVNegPlus1.isPowerOf2()) {
20674 ShiftAmt = CVNegPlus1.logBase2();
20675 return Sub(N0, Shl(N0, ShiftAmt));
20676 } else if (CVNegMinus1.isPowerOf2()) {
20677 ShiftAmt = CVNegMinus1.logBase2();
20678 return Negate(Add(Shl(N0, ShiftAmt), N0));
20679 } else if (SCVPlus1.isPowerOf2()) {
20680 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
20681 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
20682 }
20683 }
20684
20685 return SDValue();
20686}
20687
20689 SelectionDAG &DAG) {
20690 // Take advantage of vector comparisons producing 0 or -1 in each lane to
20691 // optimize away operation when it's from a constant.
20692 //
20693 // The general transformation is:
20694 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
20695 // AND(VECTOR_CMP(x,y), constant2)
20696 // constant2 = UNARYOP(constant)
20697
20698 // Early exit if this isn't a vector operation, the operand of the
20699 // unary operation isn't a bitwise AND, or if the sizes of the operations
20700 // aren't the same.
20701 EVT VT = N->getValueType(0);
20702 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
20703 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
20704 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
20705 return SDValue();
20706
20707 // Now check that the other operand of the AND is a constant. We could
20708 // make the transformation for non-constant splats as well, but it's unclear
20709 // that would be a benefit as it would not eliminate any operations, just
20710 // perform one more step in scalar code before moving to the vector unit.
20711 if (BuildVectorSDNode *BV =
20712 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
20713 // Bail out if the vector isn't a constant.
20714 if (!BV->isConstant())
20715 return SDValue();
20716
20717 // Everything checks out. Build up the new and improved node.
20718 SDLoc DL(N);
20719 EVT IntVT = BV->getValueType(0);
20720 // Create a new constant of the appropriate type for the transformed
20721 // DAG.
20722 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
20723 // The AND node needs bitcasts to/from an integer vector type around it.
20724 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
20725 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
20726 N->getOperand(0)->getOperand(0), MaskConst);
20727 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
20728 return Res;
20729 }
20730
20731 return SDValue();
20732}
20733
20734/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
20735 /// functions; this can help to reduce the number of fmovs to/from GPRs.
20736static SDValue
20739 const AArch64Subtarget *Subtarget) {
20740 if (N->isStrictFPOpcode())
20741 return SDValue();
20742
20743 if (DCI.isBeforeLegalizeOps())
20744 return SDValue();
20745
20746 if (Subtarget->hasFPRCVT())
20747 return SDValue();
20748
20749 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
20750 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
20751 return SDValue();
20752
20753 auto isSupportedType = [](EVT VT) {
20754 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
20755 };
20756
20757 SDValue SrcVal = N->getOperand(0);
20758 EVT SrcTy = SrcVal.getValueType();
20759 EVT DestTy = N->getValueType(0);
20760
20761 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
20762 return SDValue();
20763
20764 EVT SrcVecTy;
20765 EVT DestVecTy;
20766 if (DestTy.bitsGT(SrcTy)) {
20767 DestVecTy = getPackedSVEVectorVT(DestTy);
20768 SrcVecTy = DestVecTy.changeVectorElementType(*DAG.getContext(), SrcTy);
20769 } else {
20770 SrcVecTy = getPackedSVEVectorVT(SrcTy);
20771 DestVecTy = SrcVecTy.changeVectorElementType(*DAG.getContext(), DestTy);
20772 }
20773
20774 // Ensure the resulting src/dest vector type is legal.
20775 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
20776 return SDValue();
20777
20778 SDLoc DL(N);
20779 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20780 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
20781 DAG.getPOISON(SrcVecTy), SrcVal, ZeroIdx);
20782
20783 // FP_TO_*_SAT carries the saturating scalar VT as operand 1, so preserve it.
20784 SmallVector<SDValue, 2> ConvertOps = {Vec};
20785 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20786 N->getOpcode() == ISD::FP_TO_UINT_SAT)
20787 ConvertOps.push_back(DAG.getValueType(DestVecTy.getVectorElementType()));
20788 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, ConvertOps);
20789 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
20790}
20791
20794 const AArch64Subtarget *Subtarget) {
20795 // First try to optimize away the conversion when it's conditionally from
20796 // a constant. Vectors only.
20798 return Res;
20799
20800 if (SDValue Res =
20801 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20802 return Res;
20803
20804 EVT VT = N->getValueType(0);
20805 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64)
20806 return SDValue();
20807 if (VT == MVT::f16 && !Subtarget->hasFullFP16())
20808 return SDValue();
20809
20810 // Only optimize when the source and destination types have the same width.
20811 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
20812 return SDValue();
20813
20814 // If the result of an integer load is only used by an integer-to-float
20815 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
20816 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20817 SDValue N0 = N->getOperand(0);
20818 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20819 N0.hasOneUse() &&
20820 // Do not change the width of a volatile load.
20821 !cast<LoadSDNode>(N0)->isVolatile()) {
20822 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20823 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20824 LN0->getPointerInfo(), LN0->getAlign(),
20825 LN0->getMemOperand()->getFlags());
20826
20827 // Make sure successors of the original load stay after it by updating them
20828 // to use the new Chain.
20829 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20830
20831 unsigned Opcode =
20832 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20833 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20834 }
20835
20836 return SDValue();
20837}
20838
20839/// Fold a floating-point multiply by power of two into floating-point to
20840/// fixed-point conversion.
20843 const AArch64Subtarget *Subtarget) {
20844 if (SDValue Res =
20845 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20846 return Res;
20847
20848 return SDValue();
20849}
20850
20851// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20852// convert to csel(ccmp(.., cc0)), depending on cc1:
20853
20854// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20855// =>
20856// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20857//
20858// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20859// =>
20860// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
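//
// For example, "(a == 0) && (b == c)" lowered as two CSETs feeding an AND can
// be re-expressed as a CMP, a CCMP and a single CSET, removing the AND and one
// of the CSETs.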
20862 EVT VT = N->getValueType(0);
20863 SDValue CSel0 = N->getOperand(0);
20864 SDValue CSel1 = N->getOperand(1);
20865
20866 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20867 CSel1.getOpcode() != AArch64ISD::CSEL)
20868 return SDValue();
20869
20870 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20871 return SDValue();
20872
20873 if (!isNullConstant(CSel0.getOperand(0)) ||
20874 !isOneConstant(CSel0.getOperand(1)) ||
20875 !isNullConstant(CSel1.getOperand(0)) ||
20876 !isOneConstant(CSel1.getOperand(1)))
20877 return SDValue();
20878
20879 SDValue Cmp0 = CSel0.getOperand(3);
20880 SDValue Cmp1 = CSel1.getOperand(3);
20883 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20884 return SDValue();
20885 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20886 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20887 std::swap(Cmp0, Cmp1);
20888 std::swap(CC0, CC1);
20889 }
20890
20891 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20892 return SDValue();
20893
20894 SDLoc DL(N);
20895 SDValue CCmp, Condition;
20896 unsigned NZCV;
20897
20898 if (N->getOpcode() == ISD::AND || N->getOpcode() == AArch64ISD::ANDS) {
20900 Condition = getCondCode(DAG, InvCC0);
20902 } else {
20904 Condition = getCondCode(DAG, CC0);
20906 }
20907
20908 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20909
20910 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20911 if (Op1 && Op1->getAPIntValue().isNegative() &&
20912 Op1->getAPIntValue().sgt(-32)) {
20913 // CCMP accepts a constant in the range [0, 31], so
20914 // if Op1 is a constant in the range [-31, -1] we can
20915 // select CCMN instead to avoid the extra mov.
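    // For example, when Cmp1 is (SUBS x, -5) we emit (CCMN x, 5, ...) rather
    // than materialising -5 in a register for a CCMP.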
20916 SDValue AbsOp1 =
20917 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20918 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20919 AbsOp1, NZCVOp, Condition, Cmp0);
20920 } else {
20921 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20922 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20923 }
20924 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20925 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20926}
20927
20928// Attempt to use REVs for half-rotations of vectors of i16, i32 and i64.
20929// Patterns for i32:
20930//
20931// (OR (SHL_PRED Pg, X, (splat 16)),
20932// (SRL_PRED Pg, X, (splat 16)))
20933// =>
20934// REVH Pg, X, poison
20935//
20936// (OR (VSHL X, 16), (VLSHR X, 16))
20937// =>
20938// NVCAST (REV32 X)
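//
// For example, with X : v4i32 and a 16-bit half-rotation:
//   (OR (VSHL X, 16), (VLSHR X, 16)) => (v4i32 (NVCAST (v8i16 (REV32 X))))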
20941 assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
20942
20943 if (DCI.isBeforeLegalizeOps())
20944 return SDValue();
20945
20946 EVT VT = N->getValueType(0);
20947 if (!VT.isVector())
20948 return SDValue();
20949
20950 unsigned EltSize = VT.getScalarSizeInBits();
20951 if (EltSize != 16 && EltSize != 32 && EltSize != 64)
20952 return SDValue();
20953
20954 SDLoc DL(N);
20955 SDValue N0 = N->getOperand(0);
20956 SDValue N1 = N->getOperand(1);
20957
20958 if (VT.isScalableVector()) {
20959 if (N0.getOpcode() == AArch64ISD::SRL_PRED)
20960 std::swap(N0, N1);
20961 if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
20962 N1.getOpcode() != AArch64ISD::SRL_PRED)
20963 return SDValue();
20964
20965 // Ensure we have common inputs.
20966 if (N0.getOperand(0) != N1.getOperand(0) ||
20967 N0.getOperand(1) != N1.getOperand(1) ||
20968 N0.getOperand(2) != N1.getOperand(2))
20969 return SDValue();
20970
20971 APInt ShAmt;
20972 if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShAmt) ||
20973 EltSize / 2 != ShAmt)
20974 return SDValue();
20975
20976 unsigned RevOp;
20977 if (EltSize == 16)
20978 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
20979 else if (EltSize == 32)
20980 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
20981 else /* EltSize == 64 */
20982 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
20983
20984 return DAG.getNode(RevOp, DL, VT, N0.getOperand(0), N0.getOperand(1),
20985 DAG.getPOISON(VT));
20986 }
20987
20988 assert(VT.isFixedLengthVector() && "Expected fixed length vector type");
20989
20990 // Half rotations of i16 vectors should be combined to bswap, so we shouldn't
20991 // need custom code for them here.
20992 // Note: This doesn't apply to scalable vectors as we allow arbitrary (but
20993 // matching) predicates in the shifts. Predicated rotations aren't matched to
20994 // rotl / rotr, and subsequently aren't combined to bswap.
20995 if (EltSize == 16)
20996 return SDValue();
20997
20998 if (N0.getOpcode() == AArch64ISD::VLSHR)
20999 std::swap(N0, N1);
21000 if (N0.getOpcode() != AArch64ISD::VSHL || N1.getOpcode() != AArch64ISD::VLSHR)
21001 return SDValue();
21002
21003 // Ensure common inputs.
21004 if (N0.getOperand(0) != N1.getOperand(0) ||
21005 N0.getOperand(1) != N1.getOperand(1))
21006 return SDValue();
21007
21008 if (EltSize / 2 != N0.getConstantOperandVal(1))
21009 return SDValue();
21010
21011 EVT HalfVT;
21012 unsigned RevOp;
21013 if (EltSize == 32) {
21014 RevOp = AArch64ISD::REV32;
21015 HalfVT = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
21016 } else /* EltSize == 64 */ {
21017 RevOp = AArch64ISD::REV64;
21018 HalfVT = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32;
21019 }
21020
21021 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
21022 DAG.getNode(RevOp, DL, HalfVT, N0->getOperand(0)));
21023}
21024
21025// (and/or X, (splat (not Y))) -> (and/or X, (not (splat Y)))
21026// so that it gets selected as (bic/orn X, (dup Y))
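// The vector BIC/ORN instructions compute X & ~Y and X | ~Y directly, so
// hoisting the NOT above the splat lets the negation fold into the selected
// instruction instead of being materialised separately.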
21028 unsigned Opc = N->getOpcode();
21029 assert(Opc == ISD::AND || Opc == ISD::OR);
21030 using namespace llvm::SDPatternMatch;
21031 SDValue X, Y;
21032 if (!sd_match(N, m_c_BinOp(Opc, m_Value(X),
21034 m_Not(m_Value(Y)), m_Zero()),
21035 m_Poison()))))
21036 return SDValue();
21037
21038 EVT VT = N->getValueType(0);
21039 SDLoc DL(N);
21040 SDValue Not = DAG.getNOT(DL, DAG.getSplat(VT, DL, Y), VT);
21041 return DAG.getNode(Opc, DL, VT, X, Not);
21042}
21043
21046 SelectionDAG &DAG = DCI.DAG;
21047
21048 if (SDValue R = performANDORCSELCombine(N, DAG))
21049 return R;
21050
21051 if (SDValue R = tryCombineToREV(N, DAG, DCI))
21052 return R;
21053
21054 if (SDValue R = performANDORDUPNOTCombine(N, DAG))
21055 return R;
21056
21057 return SDValue();
21058}
21059
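// Returns true if \p N is a DUP or SPLAT_VECTOR whose constant operand is the
// all-ones mask for \p MemVT's element type (0xff for i8, 0xffff for i16, or
// 0xffffffff for i32).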
21061 if (!MemVT.getVectorElementType().isSimple())
21062 return false;
21063
21064 uint64_t MaskForTy = 0ull;
21065 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
21066 case MVT::i8:
21067 MaskForTy = 0xffull;
21068 break;
21069 case MVT::i16:
21070 MaskForTy = 0xffffull;
21071 break;
21072 case MVT::i32:
21073 MaskForTy = 0xffffffffull;
21074 break;
21075 default:
21076 return false;
21077 break;
21078 }
21079
21080 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
21081 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
21082 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
21083
21084 return false;
21085}
21086
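// Looks through a chain of AArch64ISD::REINTERPRET_CASTs below \p N and
// returns the first operand whose type matches N's result type, folding away
// redundant cast round-trips; otherwise returns an empty SDValue.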
21088 SDValue LeafOp = SDValue(N, 0);
21089 SDValue Op = N->getOperand(0);
21090 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
21091 LeafOp.getValueType() != Op.getValueType())
21092 Op = Op->getOperand(0);
21093 if (LeafOp.getValueType() == Op.getValueType())
21094 return Op;
21095 return SDValue();
21096}
21097
21100 SelectionDAG &DAG = DCI.DAG;
21101 SDValue Src = N->getOperand(0);
21102 unsigned Opc = Src->getOpcode();
21103
21104 // Zero/any extend of an unsigned unpack
21105 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
21106 SDValue UnpkOp = Src->getOperand(0);
21107 SDValue Dup = N->getOperand(1);
21108
21109 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
21110 return SDValue();
21111
21112 SDLoc DL(N);
21114 if (!C)
21115 return SDValue();
21116
21117 uint64_t ExtVal = C->getZExtValue();
21118
21119 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
21120 return ((ExtVal == 0xFF && VT == MVT::i8) ||
21121 (ExtVal == 0xFFFF && VT == MVT::i16) ||
21122 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
21123 };
21124
21125 // If the mask is fully covered by the unpack, we don't need to push
21126 // a new AND onto the operand
21127 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
21128 if (MaskAndTypeMatch(EltTy))
21129 return Src;
21130
21131 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
21132 // to see if the mask is all-ones of size MemTy.
21133 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
21134 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
21135 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
21136 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
21137 if (MaskAndTypeMatch(EltTy))
21138 return Src;
21139 }
21140
21141 // Truncate to prevent a DUP with an overly wide constant.
21142 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
21143
21144 // Otherwise, make sure we propagate the AND to the operand
21145 // of the unpack
21146 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
21147 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
21148
21149 SDValue And = DAG.getNode(ISD::AND, DL,
21150 UnpkOp->getValueType(0), UnpkOp, Dup);
21151
21152 return DAG.getNode(Opc, DL, N->getValueType(0), And);
21153 }
21154
21155 if (DCI.isBeforeLegalizeOps())
21156 return SDValue();
21157
21158 // If one side of the AND is an all-active predicate then the result is
21159 // simply the other operand.
21160 if (isAllActivePredicate(DAG, N->getOperand(0)))
21161 return N->getOperand(1);
21162 if (isAllActivePredicate(DAG, N->getOperand(1)))
21163 return N->getOperand(0);
21164
21166 return SDValue();
21167
21168 SDValue Mask = N->getOperand(1);
21169
21170 if (!Src.hasOneUse())
21171 return SDValue();
21172
21173 EVT MemVT;
21174
21175 // SVE load instructions perform an implicit zero-extend, which makes them
21176 // perfect candidates for combining.
21177 switch (Opc) {
21178 case AArch64ISD::LD1_MERGE_ZERO:
21179 case AArch64ISD::LDNF1_MERGE_ZERO:
21180 case AArch64ISD::LDFF1_MERGE_ZERO:
21181 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
21182 break;
21183 case AArch64ISD::GLD1_MERGE_ZERO:
21184 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
21185 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
21186 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
21187 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
21188 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
21189 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
21190 case AArch64ISD::GLDFF1_MERGE_ZERO:
21191 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
21192 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
21193 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
21194 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
21195 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
21196 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
21197 case AArch64ISD::GLDNT1_MERGE_ZERO:
21198 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
21199 break;
21200 default:
21201 return SDValue();
21202 }
21203
21204 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
21205 return Src;
21206
21207 return SDValue();
21208}
21209
21210// Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
21213
21214 // This combines an AND whose first operand is an f32 SETCC into a
21215 // conjunction of condition compares feeding a single CSINC.
21216
21217 SDValue SetCC = N->getOperand(0);
21218 EVT VT = N->getValueType(0);
21219 SelectionDAG &DAG = DCI.DAG;
21220
21221 // If the current node (N) is used by any SELECT instruction, bail out;
21222 // applying the optimization in that case can produce incorrect results.
21224 for (auto U : N->users())
21225 if (U->getOpcode() == ISD::SELECT)
21226 return SDValue();
21227
21228 // Check if the operand is a SETCC node with floating-point comparison
21229 if (SetCC.getOpcode() == ISD::SETCC &&
21230 SetCC.getOperand(0).getValueType() == MVT::f32) {
21231
21232 SDValue Cmp;
21234
21235 // Check if the DAG is after legalization and if we can emit the conjunction
21236 if (!DCI.isBeforeLegalize() &&
21237 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
21238
21240
21241 SDLoc DL(N);
21242 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
21243 DAG.getConstant(0, DL, VT),
21244 getCondCode(DAG, InvertedCC), Cmp);
21245 }
21246 }
21247 return SDValue();
21248}
21249
21252 SelectionDAG &DAG = DCI.DAG;
21253 SDValue LHS = N->getOperand(0);
21254 SDValue RHS = N->getOperand(1);
21255 EVT VT = N->getValueType(0);
21256
21257 if (SDValue R = performANDORCSELCombine(N, DAG))
21258 return R;
21259
21260 if (SDValue R = performANDSETCCCombine(N, DCI))
21261 return R;
21262
21263 if (SDValue R = performANDORDUPNOTCombine(N, DAG))
21264 return R;
21265
21266 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
21267 return SDValue();
21268
21269 if (VT.isScalableVector())
21270 return performSVEAndCombine(N, DCI);
21271
21272 // The combining code below works only for NEON vectors. In particular, it
21273 // does not work for SVE when dealing with vectors wider than 128 bits.
21274 if (!VT.is64BitVector() && !VT.is128BitVector())
21275 return SDValue();
21276
21278 if (!BVN)
21279 return SDValue();
21280
21281 // AND does not accept an immediate, so check if we can use a BIC immediate
21282 // instruction instead. We do this here instead of using a (and x, (mvni imm))
21283 // pattern in isel, because some immediates may be lowered to the preferred
21284 // (and x, (movi imm)) form, even though an mvni representation also exists.
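// For example, a lane-wise (and x, 0xffffff00) on v4i32 can be selected as
// "bic v.4s, #0xff", clearing the low byte of each lane.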
21285 APInt DefBits(VT.getSizeInBits(), 0);
21286 APInt UndefBits(VT.getSizeInBits(), 0);
21287 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
21288 SDValue NewOp;
21289
21290 // Any bits known to already be 0 need not be cleared again, which can help
21291 // reduce the size of the immediate to one supported by the instruction.
21292 KnownBits Known = DAG.computeKnownBits(LHS);
21293 APInt ZeroSplat(VT.getSizeInBits(), 0);
21294 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
21295 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
21296 << (Known.Zero.getBitWidth() * I);
21297
21298 DefBits = ~(DefBits | ZeroSplat);
21299 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
21300 DefBits, &LHS)) ||
21301 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
21302 DefBits, &LHS)))
21303 return NewOp;
21304
21305 UndefBits = ~(UndefBits | ZeroSplat);
21306 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
21307 UndefBits, &LHS)) ||
21308 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
21309 UndefBits, &LHS)))
21310 return NewOp;
21311 }
21312
21313 return SDValue();
21314}
21315
21318 SelectionDAG &DAG = DCI.DAG;
21319 SDValue LHS = N->getOperand(0);
21320 SDValue RHS = N->getOperand(1);
21321 EVT VT = N->getValueType(0);
21322 SDLoc DL(N);
21323
21324 if (!N->getFlags().hasAllowReassociation())
21325 return SDValue();
21326
21327 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
21328 auto ReassocComplex = [&](SDValue A, SDValue B) {
21329 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
21330 return SDValue();
21331 unsigned Opc = A.getConstantOperandVal(0);
21332 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
21333 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
21334 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
21335 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
21336 return SDValue();
21337 SDValue VCMLA = DAG.getNode(
21338 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
21339 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
21340 A.getOperand(2), A.getOperand(3));
21341 VCMLA->setFlags(A->getFlags());
21342 return VCMLA;
21343 };
21344 if (SDValue R = ReassocComplex(LHS, RHS))
21345 return R;
21346 if (SDValue R = ReassocComplex(RHS, LHS))
21347 return R;
21348
21349 return SDValue();
21350}
21351
21352static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
21353 switch (Opcode) {
21354 case ISD::STRICT_FADD:
21355 case ISD::FADD:
21356 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
21357 case ISD::ADD:
21358 return VT == MVT::i64;
21359 default:
21360 return false;
21361 }
21362}
21363
21364static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
21366
21368 if ((N.getOpcode() == ISD::SETCC) ||
21369 // get_active_lane_mask is lowered to a whilelo instruction.
21370 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
21371 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
21372 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
21373 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
21374 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
21375 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
21376 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
21377 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
21378 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
21379 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
21380 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
21381 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
21382 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
21383 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
21384 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
21385 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
21386 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
21387 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
21388 return true;
21389
21390 return false;
21391}
21392
21393// Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
21394// ... into: "ptrue p, all" + PTEST
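// This lets lane 0 of a flag-setting predicate operation be read with a
// PTEST + CSET sequence instead of expanding the predicate just to extract a
// single bit.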
21395static SDValue
21398 const AArch64Subtarget *Subtarget) {
21399 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21400 // Make sure PTEST can be legalised with illegal types.
21401 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
21402 return SDValue();
21403
21404 SDValue N0 = N->getOperand(0);
21405 EVT VT = N0.getValueType();
21406
21407 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
21408 !isNullConstant(N->getOperand(1)))
21409 return SDValue();
21410
21411 // Restrict the DAG combine to only cases where we're extracting from a
21412 // flag-setting operation.
21413 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
21414 return SDValue();
21415
21416 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
21417 SelectionDAG &DAG = DCI.DAG;
21418 SDValue Pg = DAG.getConstant(1, SDLoc(N), VT);
21419 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
21420}
21421
21422// Materialize : Idx = (add (mul vscale, NumEls), -1)
21423// i1 = extract_vector_elt t37, Constant:i64<Idx>
21424// ... into: "ptrue p, all" + PTEST
21425static SDValue
21428 const AArch64Subtarget *Subtarget) {
21429 using namespace llvm::SDPatternMatch;
21430 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21431 // Make sure PTEST can be legalised with illegal types.
21432 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
21433 return SDValue();
21434
21435 SDValue N0 = N->getOperand(0);
21436 EVT OpVT = N0.getValueType();
21437
21438 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
21439 return SDValue();
21440
21441 SDValue Idx = N->getOperand(1);
21442 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
21443 if (!sd_match(Idx, m_ZExtOrSelf(
21444 m_Add(m_VScale(m_SpecificInt(NumEls)), m_AllOnes()))))
21445 return SDValue();
21446
21447 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
21448 SelectionDAG &DAG = DCI.DAG;
21449 SDValue Pg = DAG.getConstant(1, SDLoc(N), OpVT);
21450 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
21451}
21452
21453static SDValue
21455 const AArch64Subtarget *Subtarget) {
21456 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21457 SelectionDAG &DAG = DCI.DAG;
21458 SDValue Vec = N->getOperand(0);
21459 SDValue Idx = N->getOperand(1);
21460
21462 return SDValue();
21463
21464 // Only legal for 8, 16, 32, and 64 bit element types.
21465 EVT EltVT = Vec.getValueType().getVectorElementType();
21466 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
21467 MVT::bf16, MVT::f32, MVT::f64}),
21468 EltVT.getSimpleVT().SimpleTy))
21469 return SDValue();
21470
21471 SDValue Mask = Idx.getOperand(0);
21472 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21473 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
21474 return SDValue();
21475
21476 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
21477 Vec);
21478}
21479
21480static SDValue
21482 const AArch64Subtarget *Subtarget) {
21483 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
21484 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
21485 return Res;
21486 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
21487 return Res;
21488 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
21489 return Res;
21490
21491 SelectionDAG &DAG = DCI.DAG;
21492 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21493
21494 EVT VT = N->getValueType(0);
21495 const bool FullFP16 = Subtarget->hasFullFP16();
21496 bool IsStrict = N0->isStrictFPOpcode();
21497
21498 // extract(dup x) -> x
21499 if (N0.getOpcode() == AArch64ISD::DUP)
21500 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
21501 : N0.getOperand(0);
21502
21503 // Rewrite for pairwise fadd pattern
21504 // (f32 (extract_vector_elt
21505 // (fadd (vXf32 Other)
21506 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
21507 // ->
21508 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
21509 // (extract_vector_elt (vXf32 Other) 1))
21510 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
21511 // we can only do this when it's used only by the extract_vector_elt.
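// The resulting (fadd (extract_elt Other, 0), (extract_elt Other, 1)) pattern
// matches the scalar FADDP/ADDP (pairwise add) instructions.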
21512 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
21513 (!IsStrict || N0.hasOneUse())) {
21514 SDLoc DL(N0);
21515 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
21516 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
21517
21519 SDValue Other = N00;
21520
21521 // And handle the commutative case.
21522 if (!Shuffle) {
21523 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
21524 Other = N01;
21525 }
21526
21527 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
21528 Other == Shuffle->getOperand(0)) {
21529 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21530 DAG.getConstant(0, DL, MVT::i64));
21531 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
21532 DAG.getConstant(1, DL, MVT::i64));
21533 if (!IsStrict)
21534 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
21535
21536 // For strict_fadd we need uses of the final extract_vector to be replaced
21537 // with the strict_fadd, but we also need uses of the chain output of the
21538 // original strict_fadd to use the chain output of the new strict_fadd as
21539 // otherwise it may not be deleted.
21540 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
21541 {VT, MVT::Other},
21542 {N0->getOperand(0), Extract1, Extract2});
21543 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
21544 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
21545 return SDValue(N, 0);
21546 }
21547 }
21548
21549 // Given an extract(load) or extract(extend(load)), produce a scalar load
21550 // instead to avoid the cross-register-bank copies.
21551 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
21552 VT.isInteger() && isa<ConstantSDNode>(N1)) {
21553 SDValue LoadN0 = N0;
21554 // Look through sext/zext and extract_subvector / insert_subvector if
21555 // required.
21556 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
21557 N0.getOpcode() == ISD::SIGN_EXTEND ||
21558 N0.getOpcode() == ISD::ANY_EXTEND) &&
21559 N0.getOperand(0).hasOneUse())
21560 LoadN0 = N0.getOperand(0);
21561 unsigned OffsetElts = 0;
21562 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
21563 OffsetElts = LoadN0.getConstantOperandVal(1);
21564 LoadN0 = LoadN0.getOperand(0);
21565 }
21566 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
21567 LoadN0.getOperand(0).isUndef() &&
21568 isNullConstant(LoadN0.getOperand(2)) &&
21569 LoadN0.getOperand(1).hasOneUse())
21570 LoadN0 = LoadN0.getOperand(1);
21571
21572 // Check all the uses are valid and can be scalarized. We check that all the
21573 // uses are extracts and those extracts are not re-inserted into an
21574 // operation best treated as a vector register.
21575 auto Load = dyn_cast<LoadSDNode>(LoadN0);
21576 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
21577 Load->getMemoryVT().isByteSized() &&
21578 all_of(N0->uses(), [&](const SDUse &U) {
21579 return U.getResNo() != N0.getResNo() ||
21580 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
21581 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
21582 return U2.getUser()->getOpcode() ==
21583 ISD::INSERT_VECTOR_ELT ||
21584 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
21585 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
21586 }));
21587 })) {
21588
21589 SDLoc DL(Load);
21590
21591 // Generate a new scalar load.
21592 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
21593 Load->getValueType(0).getScalarSizeInBits() / 8;
21594 SDValue BasePtr = DAG.getObjectPtrOffset(
21595 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
21596 ISD::LoadExtType ExtType =
21600 : ISD::EXTLOAD);
21601 SDValue ScalarLoad =
21602 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
21603 Load->getPointerInfo().getWithOffset(Offset),
21604 Load->getValueType(0).getScalarType(),
21605 commonAlignment(Load->getAlign(), Offset),
21606 Load->getMemOperand()->getFlags(), Load->getAAInfo());
21607 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
21608 return ScalarLoad;
21609 }
21610 }
21611
21612 return SDValue();
21613}
21614
21617 SelectionDAG &DAG) {
21618 SDLoc DL(N);
21619 EVT VT = N->getValueType(0);
21620 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
21621 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
21622
21623 // For unpacked types:
21624 // concat(zip1(a, b), zip2(a, b)) => trn1(a, b)
21625 if (DCI.isAfterLegalizeDAG() && isUnpackedType(N0.getValueType(), DAG) &&
21626 N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
21627 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
21628 N0.getOperand(1) == N1.getOperand(1)) {
21629 // If the type is unpacked, then each element is separated by a gap at least
21630 // as big as the element size. It is therefore safe to re-interpret the
21631 // inputs with double the elements and ignore odd elements (hence TRN1).
21632 SDValue Op0MoreElems =
21633 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, N0.getOperand(0));
21634 SDValue Op1MoreElems =
21635 DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, N0.getOperand(1));
21636 return DAG.getNode(AArch64ISD::TRN1, DL, VT, Op0MoreElems, Op1MoreElems);
21637 }
21638
21639 if (VT.isScalableVector())
21640 return SDValue();
21641
21642 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21643 N1Opc == ISD::TRUNCATE) {
21644 SDValue N00 = N0->getOperand(0);
21645 SDValue N10 = N1->getOperand(0);
21646 EVT N00VT = N00.getValueType();
21647 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
21648
21649 // Optimize concat_vectors of truncated vectors, where the intermediate
21650 // type is illegal, to avoid said illegality, e.g.,
21651 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
21652 // (v2i16 (truncate (v2i64)))))
21653 // ->
21654 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
21655 // (v4i32 (bitcast (v2i64))),
21656 // <0, 2, 4, 6>)))
21657 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
21658 // on both input and result type, so we might generate worse code.
21659 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
21660 if (N00VT == N10.getValueType() &&
21661 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
21662 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
21663 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
21665 for (size_t i = 0; i < Mask.size(); ++i)
21666 Mask[i] = i * 2;
21667 return DAG.getNode(ISD::TRUNCATE, DL, VT,
21668 DAG.getVectorShuffle(
21669 MidVT, DL,
21670 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
21671 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
21672 }
21673
21674 // Optimize two large shifts and a combine into a single combine and shift
21675 // For AArch64 architectures, sequences like the following:
21676 //
21677 // ushr v0.4s, v0.4s, #20
21678 // ushr v1.4s, v1.4s, #20
21679 // uzp1 v0.8h, v0.8h, v1.8h
21680 //
21681 // Can be optimized to:
21682 //
21683 // uzp2 v0.8h, v0.8h, v1.8h
21684 // ushr v0.8h, v0.8h, #4
21685 //
21686 // This optimization reduces instruction count.
21687 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
21688 N00->getOperand(1) == N10->getOperand(1)) {
21689 SDValue N000 = N00->getOperand(0);
21690 SDValue N100 = N10->getOperand(0);
21691 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
21692 N101ConstVal = N10->getConstantOperandVal(1),
21693 NScalarSize = N->getValueType(0).getScalarSizeInBits();
21694
21695 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
21696 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
21697 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
21698 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
21699 SDValue NewShiftConstant =
21700 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
21701
21702 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
21703 }
21704 }
21705 }
21706
21707 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
21708 N->getOperand(0).getValueType() == MVT::v2i16 ||
21709 N->getOperand(0).getValueType() == MVT::v2i8) {
21710 EVT SrcVT = N->getOperand(0).getValueType();
21711 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
21712 // loads to prevent having to go through the v4i8 load legalization that
21713 // needs to extend each element into a larger type.
21714 if (N->getNumOperands() % 2 == 0 &&
21715 all_of(N->op_values(), [SrcVT](SDValue V) {
21716 if (V.getValueType() != SrcVT)
21717 return false;
21718 if (V.isUndef())
21719 return true;
21720 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
21721 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
21722 LD->getExtensionType() == ISD::NON_EXTLOAD;
21723 })) {
21724 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
21725 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
21727
21728 for (unsigned i = 0; i < N->getNumOperands(); i++) {
21729 SDValue V = N->getOperand(i);
21730 if (V.isUndef())
21731 Ops.push_back(DAG.getUNDEF(FVT));
21732 else {
21734 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
21735 LD->getBasePtr(), LD->getMemOperand());
21736 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
21737 Ops.push_back(NewLoad);
21738 }
21739 }
21740 return DAG.getBitcast(N->getValueType(0),
21741 DAG.getBuildVector(NVT, DL, Ops));
21742 }
21743 }
21744
21745 // Canonicalise concat_vectors to replace concatenations of truncated nots
21746 // with nots of concatenated truncates. This in some cases allows for multiple
21747 // redundant negations to be eliminated.
21748 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
21749 // (v4i16 (truncate (not (v4i32)))))
21750 // ->
21751 // (not (concat_vectors (v4i16 (truncate (v4i32))),
21752 // (v4i16 (truncate (v4i32)))))
21753 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
21754 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
21755 N->isOnlyUserOf(N1.getNode())) {
21756 auto isBitwiseVectorNegate = [](SDValue V) {
21757 return V->getOpcode() == ISD::XOR &&
21758 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
21759 };
21760 SDValue N00 = N0->getOperand(0);
21761 SDValue N10 = N1->getOperand(0);
21762 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
21763 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
21764 return DAG.getNOT(
21765 DL,
21768 N00->getOperand(0)),
21770 N10->getOperand(0))),
21771 VT);
21772 }
21773 }
21774
21775 // Wait till after everything is legalized to try this. That way we have
21776 // legal vector types and such.
21777 if (DCI.isBeforeLegalizeOps())
21778 return SDValue();
21779
21780 // Optimise concat_vectors of two identical binops with a 128-bit destination
21781 // size, combining into a binop of two concats of the source vectors, e.g.:
21782 // concat(uhadd(a,b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
21783 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
21784 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
21785 isVectorizedBinOp(N0Opc)) &&
21786 N0->hasOneUse() && N1->hasOneUse()) {
21787 SDValue N00 = N0->getOperand(0);
21788 SDValue N01 = N0->getOperand(1);
21789 SDValue N10 = N1->getOperand(0);
21790 SDValue N11 = N1->getOperand(1);
21791
21792 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
21793 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
21794 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
21795 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
21796 }
21797 }
21798
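  // Matches a rounding shift-right pattern of the form
  //   (VLSHR (ADD X, (1 << (ShtAmt - 1))), ShtAmt)
  // where the rounding constant may come from a MOVIshift or a constant DUP.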
21799 auto IsRSHRN = [](SDValue Shr) {
21800 if (Shr.getOpcode() != AArch64ISD::VLSHR)
21801 return false;
21802 SDValue Op = Shr.getOperand(0);
21803 EVT VT = Op.getValueType();
21804 unsigned ShtAmt = Shr.getConstantOperandVal(1);
21805 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
21806 return false;
21807
21808 APInt Imm;
21809 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
21810 Imm = APInt(VT.getScalarSizeInBits(),
21811 Op.getOperand(1).getConstantOperandVal(0)
21812 << Op.getOperand(1).getConstantOperandVal(1));
21813 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
21814 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
21815 Imm = APInt(VT.getScalarSizeInBits(),
21816 Op.getOperand(1).getConstantOperandVal(0));
21817 else
21818 return false;
21819
21820 if (Imm != 1ULL << (ShtAmt - 1))
21821 return false;
21822 return true;
21823 };
21824
21825 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
21826 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
21827 ((IsRSHRN(N1) &&
21829 N1.isUndef())) {
21830 SDValue X = N0.getOperand(0).getOperand(0);
21831 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
21832 : N1.getOperand(0).getOperand(0);
21833 EVT BVT =
21834 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
21835 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
21836 SDValue Add = DAG.getNode(
21837 ISD::ADD, DL, BVT, CC,
21838 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
21839 SDValue Shr =
21840 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
21841 return Shr;
21842 }
21843
21844 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
21845 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
21846 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
21847 N0.getOperand(1) == N1.getOperand(1)) {
21848 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
21849 DAG.getPOISON(N0.getValueType()));
21850 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
21851 DAG.getPOISON(N0.getValueType()));
21852 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
21853 }
21854
21855 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
21856 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
21857 // canonicalise to that.
21858 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
21859 assert(VT.getScalarSizeInBits() == 64);
21860 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
21861 DAG.getConstant(0, DL, MVT::i64));
21862 }
21863
21864 // Canonicalise concat_vectors so that the right-hand vector has as few
21865 // bit-casts as possible before its real operation. The primary matching
21866 // destination for these operations will be the narrowing "2" instructions,
21867 // which depend on the operation being performed on this right-hand vector.
21868 // For example,
21869 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
21870 // becomes
21871 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
21872
21873 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
21874 return SDValue();
21875 SDValue RHS = N1->getOperand(0);
21876 MVT RHSTy = RHS.getValueType().getSimpleVT();
21877 // If the RHS is not a vector, this is not the pattern we're looking for.
21878 if (!RHSTy.isVector())
21879 return SDValue();
21880
21881 LLVM_DEBUG(
21882 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
21883
21884 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
21885 RHSTy.getVectorNumElements() * 2);
21886 return DAG.getNode(ISD::BITCAST, DL, VT,
21887 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatTy,
21888 DAG.getNode(ISD::BITCAST, DL, RHSTy, N0),
21889 RHS));
21890}
21891
21892static SDValue
21894 SelectionDAG &DAG) {
21895 if (DCI.isBeforeLegalizeOps())
21896 return SDValue();
21897
21898 EVT VT = N->getValueType(0);
21899 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
21900 return SDValue();
21901
21902 SDValue V = N->getOperand(0);
21903
21904 // NOTE: This combine exists in DAGCombiner, but that version's legality check
21905 // blocks this combine because the non-const case requires custom lowering.
21906 //
21907 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
21908 if (V.getOpcode() == ISD::SPLAT_VECTOR)
21909 if (isa<ConstantSDNode>(V.getOperand(0)))
21910 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
21911
21912 return SDValue();
21913}
21914
21915static SDValue
21917 SelectionDAG &DAG) {
21918 SDLoc DL(N);
21919 SDValue Vec = N->getOperand(0);
21920 SDValue SubVec = N->getOperand(1);
21921 uint64_t IdxVal = N->getConstantOperandVal(2);
21922 EVT VecVT = Vec.getValueType();
21923 EVT SubVT = SubVec.getValueType();
21924
21925 // Promote fixed length vector constants.
21926 if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
21927 Vec.isUndef()) {
21928 SDValue SplatVal = DAG.getSplatValue(SubVec);
21929 if (auto C = dyn_cast_or_null<ConstantSDNode>(SplatVal))
21930 return DAG.getConstant(C->getAPIntValue(), DL, VecVT);
21931
21932 if (auto C = dyn_cast_or_null<ConstantFPSDNode>(SplatVal))
21933 return DAG.getConstantFP(C->getValueAPF(), DL, VecVT);
21934 }
21935
21936 // Only do this for legal fixed vector types.
21937 if (!VecVT.isFixedLengthVector() ||
21938 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
21939 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
21940 return SDValue();
21941
21942 // Ignore widening patterns.
21943 if (IdxVal == 0 && Vec.isUndef())
21944 return SDValue();
21945
21946 // Subvector must be half the width and an "aligned" insertion.
21947 unsigned NumSubElts = SubVT.getVectorNumElements();
21948 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
21949 (IdxVal != 0 && IdxVal != NumSubElts))
21950 return SDValue();
21951
21952 // Fold insert_subvector -> concat_vectors
21953 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
21954 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
21955 SDValue Lo, Hi;
21956 if (IdxVal == 0) {
21957 Lo = SubVec;
21958 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21959 DAG.getVectorIdxConstant(NumSubElts, DL));
21960 } else {
21961 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
21962 DAG.getVectorIdxConstant(0, DL));
21963 Hi = SubVec;
21964 }
21965 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
21966}
21967
21970 SelectionDAG &DAG) {
21971 // Wait until after everything is legalized to try this. That way we have
21972 // legal vector types and such.
21973 if (DCI.isBeforeLegalizeOps())
21974 return SDValue();
21975 // Transform a scalar conversion of a value from a lane extract into a
21976 // lane extract of a vector conversion. E.g., from foo1 to foo2:
21977 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
21978 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
21979 //
21980 // The second form interacts better with instruction selection and the
21981 // register allocator to avoid cross-class register copies that aren't
21982 // coalescable due to a lane reference.
21983
21984 // Check the operand and see if it originates from a lane extract.
21985 SDValue Op1 = N->getOperand(1);
21987 return SDValue();
21988
21989 // Yep, no additional predication needed. Perform the transform.
21990 SDValue IID = N->getOperand(0);
21991 SDValue Shift = N->getOperand(2);
21992 SDValue Vec = Op1.getOperand(0);
21993 SDValue Lane = Op1.getOperand(1);
21994 EVT ResTy = N->getValueType(0);
21995 EVT VecResTy;
21996 SDLoc DL(N);
21997
21998 // The vector width should be 128 bits by the time we get here, even
21999 // if it started as 64 bits (the extract_vector handling will have
22000 // done so). Bail if it is not.
22001 if (Vec.getValueSizeInBits() != 128)
22002 return SDValue();
22003
22004 if (Vec.getValueType() == MVT::v4i32)
22005 VecResTy = MVT::v4f32;
22006 else if (Vec.getValueType() == MVT::v2i64)
22007 VecResTy = MVT::v2f64;
22008 else
22009 return SDValue();
22010
22011 SDValue Convert =
22012 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
22013 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
22014}
22015
22016// AArch64 high-vector "long" operations are formed by performing the non-high
22017// version on an extract_subvector of each operand which gets the high half:
22018//
22019// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
22020//
22021// However, there are cases which don't have an extract_high explicitly, but
22022// have another operation that can be made compatible with one for free. For
22023// example:
22024//
22025// (dupv64 scalar) --> (extract_high (dup128 scalar))
22026//
22027// This routine does the actual conversion of such DUPs, once outer routines
22028// have determined that everything else is in order.
22029// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
22030// similarly here.
22032 MVT VT = N.getSimpleValueType();
22033 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22034 N.getConstantOperandVal(1) == 0)
22035 N = N.getOperand(0);
22036
22037 switch (N.getOpcode()) {
22038 case AArch64ISD::DUP:
22039 case AArch64ISD::DUPLANE8:
22040 case AArch64ISD::DUPLANE16:
22041 case AArch64ISD::DUPLANE32:
22042 case AArch64ISD::DUPLANE64:
22043 case AArch64ISD::MOVI:
22044 case AArch64ISD::MOVIshift:
22045 case AArch64ISD::MOVIedit:
22046 case AArch64ISD::MOVImsl:
22047 case AArch64ISD::MVNIshift:
22048 case AArch64ISD::MVNImsl:
22049 break;
22050 default:
22051 // FMOV could be supported, but isn't very useful, as it would only occur
22052 // if you passed a bitcast floating-point immediate to an eligible long
22053 // integer op (addl, smull, ...).
22054 return SDValue();
22055 }
22056
22057 if (!VT.is64BitVector())
22058 return SDValue();
22059
22060 SDLoc DL(N);
22061 unsigned NumElems = VT.getVectorNumElements();
22062 if (N.getValueType().is64BitVector()) {
22063 MVT ElementTy = VT.getVectorElementType();
22064 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
22065 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
22066 }
22067
22068 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
22069 DAG.getConstant(NumElems, DL, MVT::i64));
22070}
22071
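// Returns true if \p N (looking through a bitcast) is an EXTRACT_SUBVECTOR of
// a fixed-length vector whose index selects the high half of the source.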
22073 if (N.getOpcode() == ISD::BITCAST)
22074 N = N.getOperand(0);
22075 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
22076 return false;
22077 if (N.getOperand(0).getValueType().isScalableVector())
22078 return false;
22079 return N.getConstantOperandAPInt(1) ==
22080 N.getOperand(0).getValueType().getVectorNumElements() / 2;
22081}
22082
22083/// Helper structure to keep track of ISD::SET_CC operands.
22089
22090/// Helper structure to keep track of a SET_CC lowered into AArch64 code.
22095
22096/// Helper structure to keep track of SetCC information.
22101
22102/// Helper structure for reading SetCC information. If the IsAArch64 field is
22103/// set to true, Info is an AArch64SetCCInfo; otherwise Info is a
22104/// GenericSetCCInfo.
22109
22110/// Check whether or not \p Op is a SET_CC operation, either a generic or
22111/// an AArch64 lowered one.
22112///
22113/// \p SetCCInfo is filled accordingly.
22114/// \post SetCCInfo is meaningful only when this function returns true.
22115/// \return True when Op is a kind of SET_CC operation.
22117 // If this is a setcc, this is straightforward.
22118 if (Op.getOpcode() == ISD::SETCC) {
22119 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
22120 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
22121 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
22122 SetCCInfo.IsAArch64 = false;
22123 return true;
22124 }
22125 // Otherwise, check if this is a matching csel instruction.
22126 // In other words:
22127 // - csel 1, 0, cc
22128 // - csel 0, 1, !cc
22129 if (Op.getOpcode() != AArch64ISD::CSEL)
22130 return false;
22131 // Set the information about the operands.
22132 // TODO: we want the operands of the Cmp not the csel
22133 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
22134 SetCCInfo.IsAArch64 = true;
22135 SetCCInfo.Info.AArch64.CC =
22136 static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
22137
22138 // Check that the operands match the constraints:
22139 // (1) Both operands must be constants.
22140 // (2) One must be 1 and the other must be 0.
22141 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
22142 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
22143
22144 // Check (1).
22145 if (!TValue || !FValue)
22146 return false;
22147
22148 // Check (2).
22149 if (!TValue->isOne()) {
22150 // Update the comparison when we are interested in !cc.
22151 std::swap(TValue, FValue);
22152 SetCCInfo.Info.AArch64.CC =
22154 }
22155 return TValue->isOne() && FValue->isZero();
22156}
22157
22158// Returns true if Op is setcc or zext of setcc.
22159static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
22160 if (isSetCC(Op, Info))
22161 return true;
22162 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
22163 isSetCC(Op->getOperand(0), Info));
22164}
22165
22166// The folding we want to perform is:
22167// (add x, [zext] (setcc cc ...) )
22168// -->
22169// (csel x, (add x, 1), !cc ...)
22170//
22171// The latter will get matched to a CSINC instruction.
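// For example, "x + (a == b)" becomes a compare followed by a single CSINC
// rather than a CSET feeding an ADD.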
22173 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
22174 SDValue LHS = Op->getOperand(0);
22175 SDValue RHS = Op->getOperand(1);
22176 SetCCInfoAndKind InfoAndKind;
22177
22178 // If both operands are a SET_CC, then we don't want to perform this
22179 // folding and create another csel as this results in more instructions
22180 // (and higher register usage).
22181 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
22182 isSetCCOrZExtSetCC(RHS, InfoAndKind))
22183 return SDValue();
22184
22185 // If neither operand is a SET_CC, give up.
22186 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
22187 std::swap(LHS, RHS);
22188 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
22189 return SDValue();
22190 }
22191
22192 // FIXME: This could be generalized to work for FP comparisons.
22193 EVT CmpVT = InfoAndKind.IsAArch64
22194 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
22195 : InfoAndKind.Info.Generic.Opnd0->getValueType();
22196 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
22197 return SDValue();
22198
22199 SDValue CCVal;
22200 SDValue Cmp;
22201 SDLoc DL(Op);
22202 if (InfoAndKind.IsAArch64) {
22203 CCVal = DAG.getConstant(
22205 MVT::i32);
22206 Cmp = *InfoAndKind.Info.AArch64.Cmp;
22207 } else
22208 Cmp = getAArch64Cmp(
22209 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
22210 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
22211 DL);
22212
22213 EVT VT = Op->getValueType(0);
22214 LHS = DAG.getNode(ISD::ADD, DL, VT, RHS, DAG.getConstant(1, DL, VT));
22215 return DAG.getNode(AArch64ISD::CSEL, DL, VT, RHS, LHS, CCVal, Cmp);
22216}
22217
22218// ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
22220 EVT VT = N->getValueType(0);
22221 // Only scalar integer and vector types.
22222 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
22223 return SDValue();
22224
22225 SDValue LHS = N->getOperand(0);
22226 SDValue RHS = N->getOperand(1);
22227 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
22228 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
22229 return SDValue();
22230
22231 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
22232 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
22233 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
22234 return SDValue();
22235
22236 SDValue Op1 = LHS->getOperand(0);
22237 SDValue Op2 = RHS->getOperand(0);
22238 EVT OpVT1 = Op1.getValueType();
22239 EVT OpVT2 = Op2.getValueType();
22240 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
22241 Op2.getOpcode() != AArch64ISD::UADDV ||
22242 OpVT1.getVectorElementType() != VT)
22243 return SDValue();
22244
22245 SDValue Val1 = Op1.getOperand(0);
22246 SDValue Val2 = Op2.getOperand(0);
22247 EVT ValVT = Val1->getValueType(0);
22248 SDLoc DL(N);
22249 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
22250 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
22251 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
22252 DAG.getConstant(0, DL, MVT::i64));
22253}
22254
22255/// Perform the scalar expression combine in the form of:
22256/// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
22257/// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
22259 EVT VT = N->getValueType(0);
22260 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
22261 return SDValue();
22262
22263 SDValue LHS = N->getOperand(0);
22264 SDValue RHS = N->getOperand(1);
22265
22266 // Handle commutativity.
22267 if (LHS.getOpcode() != AArch64ISD::CSEL &&
22268 LHS.getOpcode() != AArch64ISD::CSNEG) {
22269 std::swap(LHS, RHS);
22270 if (LHS.getOpcode() != AArch64ISD::CSEL &&
22271 LHS.getOpcode() != AArch64ISD::CSNEG) {
22272 return SDValue();
22273 }
22274 }
22275
22276 if (!LHS.hasOneUse())
22277 return SDValue();
22278
22280 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
22281
22282 // The CSEL should have a constant one operand, and the CSNEG should have a
22283 // one or negative-one operand.
22284 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
22285 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
22286 if (!CTVal || !CFVal)
22287 return SDValue();
22288
22289 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
22290 (CTVal->isOne() || CFVal->isOne())) &&
22291 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
22292 (CTVal->isOne() || CFVal->isAllOnes())))
22293 return SDValue();
22294
22295 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
22296 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
22297 !CFVal->isOne()) {
22298 std::swap(CTVal, CFVal);
22300 }
22301
22302 SDLoc DL(N);
22303 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
22304 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
22305 !CFVal->isAllOnes()) {
22306 APInt C = -1 * CFVal->getAPIntValue();
22307 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
22308 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
22310 }
22311
22312 // It might be neutral for larger constants, as the immediate needs to be
22313 // materialized in a register.
22314 APInt ADDC = CTVal->getAPIntValue();
22315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22316 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
22317 return SDValue();
22318
22319 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
22320 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
22321 "Unexpected constant value");
22322
22323 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
22324 SDValue CCVal = getCondCode(DAG, AArch64CC);
22325 SDValue Cmp = LHS.getOperand(3);
22326
22327 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
22328}
22329
22330// ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
22332 EVT VT = N->getValueType(0);
22333 if (N->getOpcode() != ISD::ADD)
22334 return SDValue();
22335
22336 SDValue Dot = N->getOperand(0);
22337 SDValue A = N->getOperand(1);
22338 // Handle commutativity
22339 auto isZeroDot = [](SDValue Dot) {
22340 return (Dot.getOpcode() == AArch64ISD::UDOT ||
22341 Dot.getOpcode() == AArch64ISD::SDOT ||
22342 Dot.getOpcode() == AArch64ISD::USDOT) &&
22344 };
22345 if (!isZeroDot(Dot))
22346 std::swap(Dot, A);
22347 if (!isZeroDot(Dot))
22348 return SDValue();
22349
22350 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
22351 Dot.getOperand(2));
22352}
22353
22355 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
22356}
22357
22358// Try to fold
22359//
22360// (neg (csel X, Y)) -> (csel (neg X), (neg Y))
22361//
22362 // The folding helps csel be matched with csneg without generating a
22363 // redundant neg instruction, including the negation of the csel expansion
22364 // of an abs node lowered by lowerABS.
22366 if (!isNegatedInteger(SDValue(N, 0)))
22367 return SDValue();
22368
22369 SDValue CSel = N->getOperand(1);
22370 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
22371 return SDValue();
22372
22373 SDValue N0 = CSel.getOperand(0);
22374 SDValue N1 = CSel.getOperand(1);
22375
22376 // If neither of them is a negation, the fold isn't worthwhile as it would
22377 // introduce two additional negations while removing only one.
22378 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
22379 return SDValue();
22380
22381 SDLoc DL(N);
22382 EVT VT = CSel.getValueType();
22383
22384 SDValue N0N = DAG.getNegative(N0, DL, VT);
22385 SDValue N1N = DAG.getNegative(N1, DL, VT);
22386
22387 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
22388 CSel.getOperand(3));
22389}
22390
22391// The basic add/sub long vector instructions have variants with "2" on the end
22392// which act on the high-half of their inputs. They are normally matched by
22393// patterns like:
22394//
22395// (add (zeroext (extract_high LHS)),
22396// (zeroext (extract_high RHS)))
22397// -> uaddl2 vD, vN, vM
22398//
22399// However, if one of the extracts is something like a duplicate, this
22400// instruction can still be used profitably. This function puts the DAG into a
22401// more appropriate form for those patterns to trigger.
22402static SDValue performAddSubLongCombine(SDNode *N,
22403 TargetLowering::DAGCombinerInfo &DCI) {
22404 SelectionDAG &DAG = DCI.DAG;
22405 if (DCI.isBeforeLegalizeOps())
22406 return SDValue();
22407
22408 MVT VT = N->getSimpleValueType(0);
22409 if (!VT.is128BitVector()) {
22410 if (N->getOpcode() == ISD::ADD)
22411 return performSetccAddFolding(N, DAG);
22412 return SDValue();
22413 }
22414
22415 // Make sure both branches are extended in the same way.
22416 SDValue LHS = N->getOperand(0);
22417 SDValue RHS = N->getOperand(1);
22418 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
22419 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
22420 LHS.getOpcode() != RHS.getOpcode())
22421 return SDValue();
22422
22423 unsigned ExtType = LHS.getOpcode();
22424
22425 // It's only worth doing if at least one of the inputs is already an
22426 // extract, but we don't know which it'll be so we have to try both.
22427 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
22428 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
22429 if (!RHS.getNode())
22430 return SDValue();
22431
22432 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
22433 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
22434 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
22435 if (!LHS.getNode())
22436 return SDValue();
22437
22438 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
22439 }
22440
22441 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
22442}
22443
22444static bool isCMP(SDValue Op) {
22445 return Op.getOpcode() == AArch64ISD::SUBS &&
22446 !Op.getNode()->hasAnyUseOfValue(0);
22447}
22448
22449// (CSEL 1 0 CC Cond) => CC
22450// (CSEL 0 1 CC Cond) => !CC
22451static std::optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
22452 if (Op.getOpcode() != AArch64ISD::CSEL)
22453 return std::nullopt;
22454 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
22455 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
22456 return std::nullopt;
22457 SDValue OpLHS = Op.getOperand(0);
22458 SDValue OpRHS = Op.getOperand(1);
22459 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
22460 return CC;
22461 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
22462 return getInvertedCondCode(CC);
22463
22464 return std::nullopt;
22465}
22466
22467// (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
22468// (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
22469static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
22470 SDValue CmpOp = Op->getOperand(2);
22471 if (!isCMP(CmpOp))
22472 return SDValue();
22473
22474 if (IsAdd) {
22475 if (!isOneConstant(CmpOp.getOperand(1)))
22476 return SDValue();
22477 } else {
22478 if (!isNullConstant(CmpOp.getOperand(0)))
22479 return SDValue();
22480 }
22481
22482 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
22483 auto CC = getCSETCondCode(CsetOp);
22484 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
22485 return SDValue();
22486
22487 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
22488 Op->getOperand(0), Op->getOperand(1),
22489 CsetOp.getOperand(3));
22490}
22491
22492// (ADC x 0 cond) => (CINC x HS cond)
22493static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
22494 SDValue LHS = N->getOperand(0);
22495 SDValue RHS = N->getOperand(1);
22496 SDValue Cond = N->getOperand(2);
22497
22498 if (!isNullConstant(RHS))
22499 return SDValue();
22500
22501 EVT VT = N->getValueType(0);
22502 SDLoc DL(N);
22503
22504 // (CINC x cc cond) <=> (CSINC x x !cc cond)
22505 SDValue CC = getCondCode(DAG, AArch64CC::LO);
22506 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
22507}
22508
22509static SDValue performBuildVectorCombine(SDNode *N,
22510 TargetLowering::DAGCombinerInfo &DCI,
22511 SelectionDAG &DAG) {
22512 SDLoc DL(N);
22513 EVT VT = N->getValueType(0);
22514
22516 (VT == MVT::v4f16 || VT == MVT::v4bf16)) {
22517 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1),
22518 Elt2 = N->getOperand(2), Elt3 = N->getOperand(3);
22519 if (Elt0->getOpcode() == ISD::FP_ROUND &&
22520 Elt1->getOpcode() == ISD::FP_ROUND &&
22521 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22522 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22523 Elt0->getConstantOperandVal(1) == Elt1->getConstantOperandVal(1) &&
22524 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22525 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22526 // Constant index.
22527 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
22528 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22529 Elt0->getOperand(0)->getOperand(0) ==
22530 Elt1->getOperand(0)->getOperand(0) &&
22531 Elt0->getOperand(0)->getConstantOperandVal(1) == 0 &&
22532 Elt1->getOperand(0)->getConstantOperandVal(1) == 1) {
22533 SDValue LowLanesSrcVec = Elt0->getOperand(0)->getOperand(0);
22534 if (LowLanesSrcVec.getValueType() == MVT::v2f64) {
22535 SDValue HighLanes;
22536 if (Elt2->isUndef() && Elt3->isUndef()) {
22537 HighLanes = DAG.getPOISON(MVT::v2f32);
22538 } else if (Elt2->getOpcode() == ISD::FP_ROUND &&
22539 Elt3->getOpcode() == ISD::FP_ROUND &&
22540 isa<ConstantSDNode>(Elt2->getOperand(1)) &&
22541 isa<ConstantSDNode>(Elt3->getOperand(1)) &&
22542 Elt2->getConstantOperandVal(1) ==
22543 Elt3->getConstantOperandVal(1) &&
22544 Elt2->getOperand(0)->getOpcode() ==
22545 ISD::EXTRACT_VECTOR_ELT &&
22546 Elt3->getOperand(0)->getOpcode() ==
22547 ISD::EXTRACT_VECTOR_ELT &&
22548 // Constant index.
22549 isa<ConstantSDNode>(Elt2->getOperand(0)->getOperand(1)) &&
22550 isa<ConstantSDNode>(Elt3->getOperand(0)->getOperand(1)) &&
22551 Elt2->getOperand(0)->getOperand(0) ==
22552 Elt3->getOperand(0)->getOperand(0) &&
22553 Elt2->getOperand(0)->getConstantOperandVal(1) == 0 &&
22554 Elt3->getOperand(0)->getConstantOperandVal(1) == 1) {
22555 SDValue HighLanesSrcVec = Elt2->getOperand(0)->getOperand(0);
22556 HighLanes =
22557 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, HighLanesSrcVec);
22558 }
22559 if (HighLanes) {
22560 SDValue DoubleToSingleSticky =
22561 DAG.getNode(AArch64ISD::FCVTXN, DL, MVT::v2f32, LowLanesSrcVec);
22562 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
22563 DoubleToSingleSticky, HighLanes);
22564 return DAG.getNode(ISD::FP_ROUND, DL, VT, Concat,
22565 Elt0->getOperand(1));
22566 }
22567 }
22568 }
22569 }
22570
22571 if (VT == MVT::v2f64) {
22572 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22573 if (Elt0->getOpcode() == ISD::FP_EXTEND &&
22574 Elt1->getOpcode() == ISD::FP_EXTEND &&
22575 Elt0->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22576 Elt1->getOperand(0)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22577 Elt0->getOperand(0)->getOperand(0) ==
22578 Elt1->getOperand(0)->getOperand(0) &&
22579 // Constant index.
22580 isa<ConstantSDNode>(Elt0->getOperand(0)->getOperand(1)) &&
22581 isa<ConstantSDNode>(Elt1->getOperand(0)->getOperand(1)) &&
22582 Elt0->getOperand(0)->getConstantOperandVal(1) + 1 ==
22583 Elt1->getOperand(0)->getConstantOperandVal(1) &&
22584 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22585 // ResultType's known minimum vector length.
22586 Elt0->getOperand(0)->getConstantOperandVal(1) %
22587 VT.getVectorMinNumElements() ==
22588 0) {
22589 SDValue SrcVec = Elt0->getOperand(0)->getOperand(0);
22590 if (SrcVec.getValueType() == MVT::v4f16 ||
22591 SrcVec.getValueType() == MVT::v4bf16) {
22592 SDValue HalfToSingle =
22593 DAG.getNode(ISD::FP_EXTEND, DL, MVT::v4f32, SrcVec);
22594 SDValue SubvectorIdx = Elt0->getOperand(0)->getOperand(1);
22595 SDValue Extract =
22596 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
22597 VT.changeVectorElementType(*DAG.getContext(), MVT::f32),
22598 HalfToSingle, SubvectorIdx);
22599 return DAG.getNode(ISD::FP_EXTEND, DL, VT, Extract);
22600 }
22601 }
22602 }
22603
22604 // A build vector of two extracted elements is equivalent to an
22605 // extract subvector where the inner vector is any-extended to the
22606 // extract_vector_elt VT.
22607 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
22608 // (extract_elt_iXX_to_i32 vec Idx+1))
22609 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
22610
22611 // For now, only consider the v2i32 case, which arises as a result of
22612 // legalization.
22613 if (VT != MVT::v2i32)
22614 return SDValue();
22615
22616 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
22617 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
22618 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22619 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22620 // Constant index.
22621 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
22622 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
22623 // Both EXTRACT_VECTOR_ELT from same vector...
22624 Elt0->getOperand(0) == Elt1->getOperand(0) &&
22625 // ... and contiguous. First element's index +1 == second element's index.
22626 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1) &&
22627 // EXTRACT_SUBVECTOR requires that Idx be a constant multiple of
22628 // ResultType's known minimum vector length.
22629 Elt0->getConstantOperandVal(1) % VT.getVectorMinNumElements() == 0) {
22630 SDValue VecToExtend = Elt0->getOperand(0);
22631 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(
22632 *DAG.getContext(), MVT::i32);
22633 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
22634 return SDValue();
22635
22636 SDValue SubvectorIdx =
22637 DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
22638 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
22639 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
22640 SubvectorIdx);
22641 }
22642
22643 return SDValue();
22644}
22645
22646// A special combine for the sqdmulh family of instructions.
22647// smin( sra ( mul( sext v0, sext v1 ) ), SHIFT_AMOUNT ),
22648// SATURATING_VAL ) can be reduced to sqdmulh(...)
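// For example (illustrative), with i16 elements the matched pattern and its
// replacement are:
//   smin(sra(mul(sext v0, sext v1), splat(15)), splat(32767))
//     --> sext(sqdmulh(v0, v1))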
22650
22651 if (N->getOpcode() != ISD::SMIN)
22652 return SDValue();
22653
22654 EVT DestVT = N->getValueType(0);
22655
22656 if (!DestVT.isVector() || DestVT.getScalarSizeInBits() > 64 ||
22657 DestVT.isScalableVector())
22658 return SDValue();
22659
22660 ConstantSDNode *Clamp = isConstOrConstSplat(N->getOperand(1));
22661
22662 if (!Clamp)
22663 return SDValue();
22664
22665 MVT ScalarType;
22666 unsigned ShiftAmt = 0;
22667 switch (Clamp->getSExtValue()) {
22668 case (1ULL << 15) - 1:
22669 ScalarType = MVT::i16;
22670 ShiftAmt = 16;
22671 break;
22672 case (1ULL << 31) - 1:
22673 ScalarType = MVT::i32;
22674 ShiftAmt = 32;
22675 break;
22676 default:
22677 return SDValue();
22678 }
22679
22680 SDValue Sra = N->getOperand(0);
22681 if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
22682 return SDValue();
22683
22684 ConstantSDNode *RightShiftVec = isConstOrConstSplat(Sra.getOperand(1));
22685 if (!RightShiftVec)
22686 return SDValue();
22687 unsigned SExtValue = RightShiftVec->getSExtValue();
22688
22689 if (SExtValue != (ShiftAmt - 1))
22690 return SDValue();
22691
22692 SDValue Mul = Sra.getOperand(0);
22693 if (Mul.getOpcode() != ISD::MUL)
22694 return SDValue();
22695
22696 SDValue SExt0 = Mul.getOperand(0);
22697 SDValue SExt1 = Mul.getOperand(1);
22698
22699 if (SExt0.getOpcode() != ISD::SIGN_EXTEND ||
22700 SExt1.getOpcode() != ISD::SIGN_EXTEND)
22701 return SDValue();
22702
22703 EVT SExt0Type = SExt0.getOperand(0).getValueType();
22704 EVT SExt1Type = SExt1.getOperand(0).getValueType();
22705
22706 if (SExt0Type != SExt1Type || SExt0Type.getScalarType() != ScalarType ||
22707 SExt0Type.getFixedSizeInBits() > 128 || !SExt0Type.isPow2VectorType() ||
22708 SExt0Type.getVectorNumElements() == 1)
22709 return SDValue();
22710
22711 SDLoc DL(N);
22712 SDValue V0 = SExt0.getOperand(0);
22713 SDValue V1 = SExt1.getOperand(0);
22714
22715 // Ensure input vectors are extended to legal types
22716 if (SExt0Type.getFixedSizeInBits() < 64) {
22717 unsigned VecNumElements = SExt0Type.getVectorNumElements();
22718 EVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(64 / VecNumElements),
22719 VecNumElements);
22720 V0 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V0);
22721 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVecVT, V1);
22722 }
22723
22724 SDValue SQDMULH =
22725 DAG.getNode(AArch64ISD::SQDMULH, DL, V0.getValueType(), V0, V1);
22726
22727 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, SQDMULH);
22728}
22729
22732 SDLoc DL(N);
22733 EVT VT = N->getValueType(0);
22734 SDValue N0 = N->getOperand(0);
22735 if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
22736 N0.getOpcode() == AArch64ISD::DUP) {
22737 SDValue Op = N0.getOperand(0);
22738 if (VT.getScalarType() == MVT::i32 &&
22739 N0.getOperand(0).getValueType().getScalarType() == MVT::i64)
22740 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op);
22741 return DAG.getNode(N0.getOpcode(), DL, VT, Op);
22742 }
22743
22744 // Performing the following combine produces a preferable form for ISEL.
22745 // i32 (trunc (extract Vi64, idx)) -> i32 (extract (nvcast Vi32), idx*2))
22746 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22747 N0.hasOneUse()) {
22748 SDValue Op = N0.getOperand(0);
22749 SDValue ExtractIndexNode = N0.getOperand(1);
22750 if (!isa<ConstantSDNode>(ExtractIndexNode))
22751 return SDValue();
22752
22753 // For a legal DAG, EXTRACT_VECTOR_ELT can only have produced an i32 or i64.
22754 // So we can only expect: i32 (trunc (i64 (extract Vi64, idx))).
22755 assert((VT == MVT::i32 && N0.getValueType() == MVT::i64) &&
22756 "Unexpected legalisation result!");
22757
22758 EVT SrcVectorType = Op.getValueType();
22759 // We also assume that SrcVectorType cannot be a V64 (see
22760 // LowerEXTRACT_VECTOR_ELT).
22761 assert((SrcVectorType == MVT::v2i64 || SrcVectorType == MVT::nxv2i64) &&
22762 "Unexpected legalisation result!");
22763
22764 unsigned ExtractIndex =
22765 cast<ConstantSDNode>(ExtractIndexNode)->getZExtValue();
22766 MVT CastVT = SrcVectorType.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
22767
22768 Op = DAG.getNode(AArch64ISD::NVCAST, DL, CastVT, Op);
22769 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
22770 DAG.getVectorIdxConstant(ExtractIndex * 2, DL));
22771 }
22772
22773 return SDValue();
22774}
22775
22776// Check whether a node is an extend or shift operand
22777static bool isExtendOrShiftOperand(SDValue N) {
22778 unsigned Opcode = N.getOpcode();
22779 if (ISD::isExtOpcode(Opcode) || Opcode == ISD::SIGN_EXTEND_INREG) {
22780 EVT SrcVT;
22781 if (Opcode == ISD::SIGN_EXTEND_INREG)
22782 SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
22783 else
22784 SrcVT = N.getOperand(0).getValueType();
22785
22786 return SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8;
22787 } else if (Opcode == ISD::AND) {
22788 ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
22789 if (!CSD)
22790 return false;
22791 uint64_t AndMask = CSD->getZExtValue();
22792 return AndMask == 0xff || AndMask == 0xffff || AndMask == 0xffffffff;
22793 } else if (Opcode == ISD::SHL || Opcode == ISD::SRL || Opcode == ISD::SRA) {
22794 return isa<ConstantSDNode>(N.getOperand(1));
22795 }
22796
22797 return false;
22798}
22799
22800// (N - Y) + Z --> (Z - Y) + N
22801// when N is an extend or shift operand
22802static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z,
22803 SelectionDAG &DAG) {
22804 auto IsOneUseExtend = [](SDValue N) {
22805 return N.hasOneUse() && isExtendOrShiftOperand(N);
22806 };
22807
22808 // DAGCombiner will revert the combination when Z is a constant, causing an
22809 // infinite loop, so don't enable the combination when Z is a constant.
22810 // If Z is a one-use shift C, we also can't do the optimization; it would
22811 // fall into the same infinite loop.
22812 if (isa<ConstantSDNode>(Z) || IsOneUseExtend(Z))
22813 return SDValue();
22814
22815 if (SUB.getOpcode() != ISD::SUB || !SUB.hasOneUse())
22816 return SDValue();
22817
22818 SDValue Shift = SUB.getOperand(0);
22819 if (!IsOneUseExtend(Shift))
22820 return SDValue();
22821
22822 SDLoc DL(N);
22823 EVT VT = N->getValueType(0);
22824
22825 SDValue Y = SUB.getOperand(1);
22826 SDValue NewSub = DAG.getNode(ISD::SUB, DL, VT, Z, Y);
22827 return DAG.getNode(ISD::ADD, DL, VT, NewSub, Shift);
22828}
22829
22830static SDValue performAddCombineForShiftedOperands(SDNode *N,
22831 SelectionDAG &DAG) {
22832 // NOTE: Swapping LHS and RHS is not done for SUB, since SUB is not
22833 // commutative.
22834 if (N->getOpcode() != ISD::ADD)
22835 return SDValue();
22836
22837 // Bail out when value type is not one of {i32, i64}, since AArch64 ADD with
22838 // shifted register is only available for i32 and i64.
22839 EVT VT = N->getValueType(0);
22840 if (VT != MVT::i32 && VT != MVT::i64)
22841 return SDValue();
22842
22843 SDLoc DL(N);
22844 SDValue LHS = N->getOperand(0);
22845 SDValue RHS = N->getOperand(1);
22846
22847 if (SDValue Val = performAddCombineSubShift(N, LHS, RHS, DAG))
22848 return Val;
22849 if (SDValue Val = performAddCombineSubShift(N, RHS, LHS, DAG))
22850 return Val;
22851
22852 uint64_t LHSImm = 0, RHSImm = 0;
22853 // If both operands are shifted by imm and shift amount is not greater than 4
22854 // for one operand, swap LHS and RHS to put operand with smaller shift amount
22855 // on RHS.
22856 //
22857 // On many AArch64 processors (Cortex A78, Neoverse N1/N2/V1, etc), ADD with
22858 // LSL shift (shift <= 4) has lower latency and higher throughput than ADD
22859 // with LSL (shift > 4). For other processors, this is a no-op for
22860 // performance or correctness.
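// For example (illustrative): add(shl(x, 2), shl(y, 6)) is rewritten as
// add(shl(y, 6), shl(x, 2)), so the cheap (shift <= 4) operand ends up on the
// RHS where it can be folded into the ADD.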
22861 if (isOpcWithIntImmediate(LHS.getNode(), ISD::SHL, LHSImm) &&
22862 isOpcWithIntImmediate(RHS.getNode(), ISD::SHL, RHSImm) && LHSImm <= 4 &&
22863 RHSImm > 4 && LHS.hasOneUse())
22864 return DAG.getNode(ISD::ADD, DL, VT, RHS, LHS);
22865
22866 return SDValue();
22867}
22868
22869// The mid end will reassociate sub(sub(x, m1), m2) to sub(x, add(m1, m2))
22870// This reassociates it back to allow the creation of more mls instructions.
22871static SDValue performSubAddMULCombine(SDNode *N, SelectionDAG &DAG) {
22872 if (N->getOpcode() != ISD::SUB)
22873 return SDValue();
22874
22875 SDValue Add = N->getOperand(1);
22876 SDValue X = N->getOperand(0);
22877 if (Add.getOpcode() != ISD::ADD)
22878 return SDValue();
22879
22880 if (!Add.hasOneUse())
22881 return SDValue();
22883 return SDValue();
22884
22885 SDValue M1 = Add.getOperand(0);
22886 SDValue M2 = Add.getOperand(1);
22887 if (M1.getOpcode() != ISD::MUL && M1.getOpcode() != AArch64ISD::SMULL &&
22888 M1.getOpcode() != AArch64ISD::UMULL)
22889 return SDValue();
22890 if (M2.getOpcode() != ISD::MUL && M2.getOpcode() != AArch64ISD::SMULL &&
22891 M2.getOpcode() != AArch64ISD::UMULL)
22892 return SDValue();
22893
22894 EVT VT = N->getValueType(0);
22895 SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, X, M1);
22896 return DAG.getNode(ISD::SUB, SDLoc(N), VT, Sub, M2);
22897}
22898
22899// Combine into mla/mls.
22900// This works on the patterns of:
22901// add v1, (mul v2, v3)
22902// sub v1, (mul v2, v3)
22903// for vectors of type <1 x i64> and <2 x i64> when SVE is available.
22904// It will transform the add/sub to a scalable version, so that we can
22905// make use of SVE's MLA/MLS that will be generated for that pattern
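// For example (illustrative): a fixed-length add(x, extract_subvector(MUL_PRED(pg, a, b), 0))
// becomes a scalable add feeding the same MUL_PRED, which ISel can then match as MLA.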
22906static SDValue
22907performSVEMulAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
22908 SelectionDAG &DAG = DCI.DAG;
22909 // Make sure that the types are legal
22910 if (!DCI.isAfterLegalizeDAG())
22911 return SDValue();
22912 // Before using SVE's features, check first if it's available.
22913 if (!DAG.getSubtarget<AArch64Subtarget>().hasSVE())
22914 return SDValue();
22915
22916 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::SUB)
22917 return SDValue();
22918
22919 if (!N->getValueType(0).isFixedLengthVector())
22920 return SDValue();
22921
22922 auto performOpt = [&DAG, &N](SDValue Op0, SDValue Op1) -> SDValue {
22923 if (Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR)
22924 return SDValue();
22925
22926 if (!cast<ConstantSDNode>(Op1->getOperand(1))->isZero())
22927 return SDValue();
22928
22929 SDValue MulValue = Op1->getOperand(0);
22930 if (MulValue.getOpcode() != AArch64ISD::MUL_PRED)
22931 return SDValue();
22932
22933 if (!Op1.hasOneUse() || !MulValue.hasOneUse())
22934 return SDValue();
22935
22936 EVT ScalableVT = MulValue.getValueType();
22937 if (!ScalableVT.isScalableVector())
22938 return SDValue();
22939
22940 SDValue ScaledOp = convertToScalableVector(DAG, ScalableVT, Op0);
22941 SDValue NewValue =
22942 DAG.getNode(N->getOpcode(), SDLoc(N), ScalableVT, {ScaledOp, MulValue});
22943 return convertFromScalableVector(DAG, N->getValueType(0), NewValue);
22944 };
22945
22946 if (SDValue res = performOpt(N->getOperand(0), N->getOperand(1)))
22947 return res;
22948 else if (N->getOpcode() == ISD::ADD)
22949 return performOpt(N->getOperand(1), N->getOperand(0));
22950
22951 return SDValue();
22952}
22953
22954// Given an i64 add from a v1i64 extract, convert to a neon v1i64 add. This can
22955// help, for example, to produce ssra from sshr+add.
22956static SDValue performAddSubIntoVectorOp(SDNode *N, SelectionDAG &DAG) {
22957 EVT VT = N->getValueType(0);
22958 if (VT != MVT::i64 ||
22959 DAG.getTargetLoweringInfo().isOperationExpand(N->getOpcode(), MVT::v1i64))
22960 return SDValue();
22961 SDValue Op0 = N->getOperand(0);
22962 SDValue Op1 = N->getOperand(1);
22963
22964 // At least one of the operands should be an extract, and the other should be
22965 // something that is easy to convert to v1i64 type (in this case a load).
22966 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22967 Op0.getOpcode() != ISD::LOAD)
22968 return SDValue();
22969 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
22970 Op1.getOpcode() != ISD::LOAD)
22971 return SDValue();
22972
22973 SDLoc DL(N);
22974 if (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22975 Op0.getOperand(0).getValueType() == MVT::v1i64) {
22976 Op0 = Op0.getOperand(0);
22977 Op1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op1);
22978 } else if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
22979 Op1.getOperand(0).getValueType() == MVT::v1i64) {
22980 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op0);
22981 Op1 = Op1.getOperand(0);
22982 } else
22983 return SDValue();
22984
22985 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
22986 DAG.getNode(N->getOpcode(), DL, MVT::v1i64, Op0, Op1),
22987 DAG.getConstant(0, DL, MVT::i64));
22988}
22989
22990static bool isLoadOrMultipleLoads(SDValue B, SmallVector<LoadSDNode *> &Loads) {
22991 SDValue BV = peekThroughOneUseBitcasts(B);
22992 if (!BV->hasOneUse())
22993 return false;
22994 if (auto *Ld = dyn_cast<LoadSDNode>(BV)) {
22995 if (!Ld || !Ld->isSimple())
22996 return false;
22997 Loads.push_back(Ld);
22998 return true;
22999 } else if (BV.getOpcode() == ISD::BUILD_VECTOR ||
23000 BV.getOpcode() == ISD::CONCAT_VECTORS) {
23001 for (unsigned Op = 0; Op < BV.getNumOperands(); Op++) {
23002 auto *Ld = dyn_cast<LoadSDNode>(BV.getOperand(Op));
23003 if (!Ld || !Ld->isSimple() || !BV.getOperand(Op).hasOneUse())
23004 return false;
23005 Loads.push_back(Ld);
23006 }
23007 return true;
23008 } else if (B.getOpcode() == ISD::VECTOR_SHUFFLE) {
23009 // Try to find a tree of shuffles and concats from how IR shuffles of loads
23010 // are lowered. Note that this only comes up because we do not always visit
23011 // operands before uses. After that is fixed this can be removed and in the
23012 // meantime this is fairly specific to the lowering we expect from IR.
23013 // t46: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19> t44, t45
23014 // t44: v16i8 = vector_shuffle<0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u> t42, t43
23015 // t42: v16i8 = concat_vectors t40, t36, undef:v4i8, undef:v4i8
23016 // t40: v4i8,ch = load<(load (s32) from %ir.17)> t0, t22, undef:i64
23017 // t36: v4i8,ch = load<(load (s32) from %ir.13)> t0, t18, undef:i64
23018 // t43: v16i8 = concat_vectors t32, undef:v4i8, undef:v4i8, undef:v4i8
23019 // t32: v4i8,ch = load<(load (s32) from %ir.9)> t0, t14, undef:i64
23020 // t45: v16i8 = concat_vectors t28, undef:v4i8, undef:v4i8, undef:v4i8
23021 // t28: v4i8,ch = load<(load (s32) from %ir.0)> t0, t2, undef:i64
23022 if (B.getOperand(0).getOpcode() != ISD::VECTOR_SHUFFLE ||
23023 B.getOperand(0).getOperand(0).getOpcode() != ISD::CONCAT_VECTORS ||
23024 B.getOperand(0).getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
23025 B.getOperand(1).getOpcode() != ISD::CONCAT_VECTORS ||
23026 B.getOperand(1).getNumOperands() != 4)
23027 return false;
23028 auto SV1 = cast<ShuffleVectorSDNode>(B);
23029 auto SV2 = cast<ShuffleVectorSDNode>(B.getOperand(0));
23030 int NumElts = B.getValueType().getVectorNumElements();
23031 int NumSubElts = NumElts / 4;
23032 for (int I = 0; I < NumSubElts; I++) {
23033 // <0,1,2,3,4,5,6,7,8,9,10,11,16,17,18,19>
23034 if (SV1->getMaskElt(I) != I ||
23035 SV1->getMaskElt(I + NumSubElts) != I + NumSubElts ||
23036 SV1->getMaskElt(I + NumSubElts * 2) != I + NumSubElts * 2 ||
23037 SV1->getMaskElt(I + NumSubElts * 3) != I + NumElts)
23038 return false;
23039 // <0,1,2,3,4,5,6,7,16,17,18,19,u,u,u,u>
23040 if (SV2->getMaskElt(I) != I ||
23041 SV2->getMaskElt(I + NumSubElts) != I + NumSubElts ||
23042 SV2->getMaskElt(I + NumSubElts * 2) != I + NumElts)
23043 return false;
23044 }
23045 auto *Ld0 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(0));
23046 auto *Ld1 = dyn_cast<LoadSDNode>(SV2->getOperand(0).getOperand(1));
23047 auto *Ld2 = dyn_cast<LoadSDNode>(SV2->getOperand(1).getOperand(0));
23048 auto *Ld3 = dyn_cast<LoadSDNode>(B.getOperand(1).getOperand(0));
23049 if (!Ld0 || !Ld1 || !Ld2 || !Ld3 || !Ld0->isSimple() || !Ld1->isSimple() ||
23050 !Ld2->isSimple() || !Ld3->isSimple())
23051 return false;
23052 Loads.push_back(Ld0);
23053 Loads.push_back(Ld1);
23054 Loads.push_back(Ld2);
23055 Loads.push_back(Ld3);
23056 return true;
23057 }
23058 return false;
23059}
23060
23061static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1,
23062 SelectionDAG &DAG,
23063 unsigned &NumSubLoads) {
23064 if (!Op0.hasOneUse() || !Op1.hasOneUse())
23065 return false;
23066
23067 SmallVector<LoadSDNode *> Loads0, Loads1;
23068 if (isLoadOrMultipleLoads(Op0, Loads0) &&
23069 isLoadOrMultipleLoads(Op1, Loads1)) {
23070 if (NumSubLoads && Loads0.size() != NumSubLoads)
23071 return false;
23072 NumSubLoads = Loads0.size();
23073 return Loads0.size() == Loads1.size() &&
23074 all_of(zip(Loads0, Loads1), [&DAG](auto L) {
23075 unsigned Size = get<0>(L)->getValueType(0).getSizeInBits();
23076 return Size == get<1>(L)->getValueType(0).getSizeInBits() &&
23077 DAG.areNonVolatileConsecutiveLoads(get<1>(L), get<0>(L),
23078 Size / 8, 1);
23079 });
23080 }
23081
23082 if (Op0.getOpcode() != Op1.getOpcode())
23083 return false;
23084
23085 switch (Op0.getOpcode()) {
23086 case ISD::ADD:
23087 case ISD::SUB:
23088 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
23089 DAG, NumSubLoads) &&
23090 areLoadedOffsetButOtherwiseSame(Op0.getOperand(1), Op1.getOperand(1),
23091 DAG, NumSubLoads);
23092 case ISD::SIGN_EXTEND:
23093 case ISD::ANY_EXTEND:
23094 case ISD::ZERO_EXTEND:
23095 EVT XVT = Op0.getOperand(0).getValueType();
23096 if (XVT.getScalarSizeInBits() != 8 && XVT.getScalarSizeInBits() != 16 &&
23097 XVT.getScalarSizeInBits() != 32)
23098 return false;
23099 return areLoadedOffsetButOtherwiseSame(Op0.getOperand(0), Op1.getOperand(0),
23100 DAG, NumSubLoads);
23101 }
23102 return false;
23103}
23104
23105// This method attempts to fold trees of add(ext(load p), shl(ext(load p+4)))
23106// into a single load of twice the size, from which we extract the bottom part
23107// and top part so that the shl can use a shll2 instruction. The two loads in that
23108// example can also be larger trees of instructions, which are identical except
23109// for the leaves which are all loads offset from the LHS, including
23110// buildvectors of multiple loads. For example the RHS tree could be
23111// sub(zext(buildvec(load p+4, load q+4)), zext(buildvec(load r+4, load s+4)))
23112// Whilst it can be common for the larger loads to replace LDP instructions
23113// (which doesn't gain anything on its own), the larger loads can help create
23114// more efficient code, and in buildvectors prevent the need for ld1 lane
23115// inserts which can be slower than normal loads.
23116static SDValue performExtBinopLoadFold(SDNode *N, SelectionDAG &DAG) {
23117 EVT VT = N->getValueType(0);
23118 if (!VT.isFixedLengthVector() ||
23119 (VT.getScalarSizeInBits() != 16 && VT.getScalarSizeInBits() != 32 &&
23120 VT.getScalarSizeInBits() != 64))
23121 return SDValue();
23122
23123 SDValue Other = N->getOperand(0);
23124 SDValue Shift = N->getOperand(1);
23125 if (Shift.getOpcode() != ISD::SHL && N->getOpcode() != ISD::SUB)
23126 std::swap(Shift, Other);
23127 APInt ShiftAmt;
23128 if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse() ||
23129 !ISD::isConstantSplatVector(Shift.getOperand(1).getNode(), ShiftAmt))
23130 return SDValue();
23131
23132 if (!ISD::isExtOpcode(Shift.getOperand(0).getOpcode()) ||
23133 !ISD::isExtOpcode(Other.getOpcode()) ||
23134 Shift.getOperand(0).getOperand(0).getValueType() !=
23135 Other.getOperand(0).getValueType() ||
23136 !Other.hasOneUse() || !Shift.getOperand(0).hasOneUse())
23137 return SDValue();
23138
23139 SDValue Op0 = Other.getOperand(0);
23140 SDValue Op1 = Shift.getOperand(0).getOperand(0);
23141
23142 unsigned NumSubLoads = 0;
23143 if (!areLoadedOffsetButOtherwiseSame(Op0, Op1, DAG, NumSubLoads))
23144 return SDValue();
23145
23146 // Attempt to rule out some unprofitable cases using heuristics (some working
23147 // around suboptimal code generation), notably if the extend would not be able
23148 // to use ushll2 instructions as the types are not large enough. Otherwise zips
23149 // will need to be created, which can increase the instruction count.
23150 unsigned NumElts = Op0.getValueType().getVectorNumElements();
23151 unsigned NumSubElts = NumElts / NumSubLoads;
23152 if (NumSubElts * VT.getScalarSizeInBits() < 128 ||
23153 (Other.getOpcode() != Shift.getOperand(0).getOpcode() &&
23154 Op0.getValueType().getSizeInBits() < 128 &&
23156 return SDValue();
23157
23158 // Recreate the tree with the new combined loads.
23159 std::function<SDValue(SDValue, SDValue, SelectionDAG &)> GenCombinedTree =
23160 [&GenCombinedTree](SDValue Op0, SDValue Op1, SelectionDAG &DAG) {
23161 EVT DVT =
23162 Op0.getValueType().getDoubleNumVectorElementsVT(*DAG.getContext());
23163
23164 SmallVector<LoadSDNode *> Loads0, Loads1;
23165 if (isLoadOrMultipleLoads(Op0, Loads0) &&
23166 isLoadOrMultipleLoads(Op1, Loads1)) {
23167 EVT LoadVT = EVT::getVectorVT(
23168 *DAG.getContext(), Op0.getValueType().getScalarType(),
23169 Op0.getValueType().getVectorNumElements() / Loads0.size());
23170 EVT DLoadVT = LoadVT.getDoubleNumVectorElementsVT(*DAG.getContext());
23171
23172 SmallVector<SDValue> NewLoads;
23173 for (const auto &[L0, L1] : zip(Loads0, Loads1)) {
23174 SDValue Load = DAG.getLoad(DLoadVT, SDLoc(L0), L0->getChain(),
23175 L0->getBasePtr(), L0->getPointerInfo(),
23176 L0->getBaseAlign());
23177 DAG.makeEquivalentMemoryOrdering(L0, Load.getValue(1));
23178 DAG.makeEquivalentMemoryOrdering(L1, Load.getValue(1));
23179 NewLoads.push_back(Load);
23180 }
23181 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op0), DVT, NewLoads);
23182 }
23183
23184 SmallVector<SDValue> Ops;
23185 for (const auto &[O0, O1] : zip(Op0->op_values(), Op1->op_values()))
23186 Ops.push_back(GenCombinedTree(O0, O1, DAG));
23187 return DAG.getNode(Op0.getOpcode(), SDLoc(Op0), DVT, Ops);
23188 };
23189 SDValue NewOp = GenCombinedTree(Op0, Op1, DAG);
23190
23191 SmallVector<int> LowMask(NumElts, 0), HighMask(NumElts, 0);
23192 int Hi = NumSubElts, Lo = 0;
23193 for (unsigned i = 0; i < NumSubLoads; i++) {
23194 for (unsigned j = 0; j < NumSubElts; j++) {
23195 LowMask[i * NumSubElts + j] = Lo++;
23196 HighMask[i * NumSubElts + j] = Hi++;
23197 }
23198 Lo += NumSubElts;
23199 Hi += NumSubElts;
23200 }
23201 SDLoc DL(N);
23202 SDValue Ext0, Ext1;
23203 // Extract the top and bottom lanes, then extend the result. Possibly extend
23204 // the result then extract the lanes if the two operands match as it produces
23205 // slightly smaller code.
23206 if (Other.getOpcode() != Shift.getOperand(0).getOpcode()) {
23207 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(),
23208 NewOp, DAG.getConstant(0, DL, MVT::i64));
23209 SDValue SubH =
23210 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Op0.getValueType(), NewOp,
23211 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
23212 SDValue Extr0 =
23213 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, LowMask);
23214 SDValue Extr1 =
23215 DAG.getVectorShuffle(Op0.getValueType(), DL, SubL, SubH, HighMask);
23216 Ext0 = DAG.getNode(Other.getOpcode(), DL, VT, Extr0);
23217 Ext1 = DAG.getNode(Shift.getOperand(0).getOpcode(), DL, VT, Extr1);
23218 } else {
23219 EVT DVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
23220 SDValue Ext = DAG.getNode(Other.getOpcode(), DL, DVT, NewOp);
23221 SDValue SubL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
23222 DAG.getConstant(0, DL, MVT::i64));
23223 SDValue SubH =
23224 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Ext,
23225 DAG.getConstant(NumSubElts * NumSubLoads, DL, MVT::i64));
23226 Ext0 = DAG.getVectorShuffle(VT, DL, SubL, SubH, LowMask);
23227 Ext1 = DAG.getVectorShuffle(VT, DL, SubL, SubH, HighMask);
23228 }
23229 SDValue NShift =
23230 DAG.getNode(Shift.getOpcode(), DL, VT, Ext1, Shift.getOperand(1));
23231 return DAG.getNode(N->getOpcode(), DL, VT, Ext0, NShift);
23232}
23233
23234// Attempt to combine the following patterns:
23235// SUB x, (CSET LO, (CMP a, b)) -> SBC x, 0, (CMP a, b)
23236// SUB (SUB x, y), (CSET LO, (CMP a, b)) -> SBC x, y, (CMP a, b)
23237// Also handles CSET HI by swapping the CMP operands (a > b ≡ b < a).
23238// The CSET may be preceded by a ZEXT.
23239static SDValue performSubWithBorrowCombine(SDNode *N, SelectionDAG &DAG) {
23240 if (N->getOpcode() != ISD::SUB)
23241 return SDValue();
23242
23243 EVT VT = N->getValueType(0);
23244 if (VT != MVT::i32 && VT != MVT::i64)
23245 return SDValue();
23246
23247 SDValue N1 = N->getOperand(1);
23248 if (N1.getOpcode() == ISD::ZERO_EXTEND && N1.hasOneUse())
23249 N1 = N1.getOperand(0);
23250 auto CC = getCSETCondCode(N1);
23251 if (!N1.hasOneUse() || (CC != AArch64CC::LO && CC != AArch64CC::HI))
23252 return SDValue();
23253
23254 SDValue Flags = N1.getOperand(3);
23255 if (Flags.getOpcode() != AArch64ISD::SUBS)
23256 return SDValue();
23257
23258 SDValue N0 = N->getOperand(0);
23259 bool CanFoldSub = N0.getOpcode() == ISD::SUB;
23260
23261 // For HI (unsigned >), swap the SUBS operands to obtain LO (unsigned <).
23262 if (CC == AArch64CC::HI) {
23263 if (!Flags.hasOneUse())
23264 return SDValue();
23265 // Skip when the inner SUB can't be folded and the swap would cost a mov.
23266 auto *RHSC = dyn_cast<ConstantSDNode>(Flags.getOperand(1));
23267 if ((!CanFoldSub || !N0.hasOneUse()) && RHSC &&
23269 return SDValue();
23270 Flags = DAG.getNode(AArch64ISD::SUBS, SDLoc(Flags), Flags->getVTList(),
23271 Flags.getOperand(1), Flags.getOperand(0))
23272 .getValue(1);
23273 }
23274
23275 SDLoc DL(N);
23276 if (CanFoldSub)
23277 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0.getOperand(0),
23278 N0.getOperand(1), Flags);
23279 return DAG.getNode(AArch64ISD::SBC, DL, VT, N0, DAG.getConstant(0, DL, VT),
23280 Flags);
23281}
23282
23283// add(trunc(ashr(A, C)), trunc(lshr(A, BW-1))), with C >= BW
23284// ->
23285// X = trunc(ashr(A, C)); add(X, lshr(X, BW-1))
23286// The original converts into ashr+lshr+xtn+xtn+add. The second becomes
23287// ashr+xtn+usra. The first form has lower total latency due to more parallelism,
23288// but uses more micro-ops and seems to be slower in practice.
23289static SDValue performAddTruncShiftCombine(SDNode *N, SelectionDAG &DAG) {
23290 using namespace llvm::SDPatternMatch;
23291 EVT VT = N->getValueType(0);
23292 if (VT != MVT::v2i32 && VT != MVT::v4i16 && VT != MVT::v8i8)
23293 return SDValue();
23294
23295 SDValue AShr, LShr;
23296 if (!sd_match(N, m_Add(m_Trunc(m_Value(AShr)), m_Trunc(m_Value(LShr)))))
23297 return SDValue();
23298 if (AShr.getOpcode() != AArch64ISD::VASHR)
23299 std::swap(AShr, LShr);
23300 if (AShr.getOpcode() != AArch64ISD::VASHR ||
23301 LShr.getOpcode() != AArch64ISD::VLSHR ||
23302 AShr.getOperand(0) != LShr.getOperand(0) ||
23304 LShr.getConstantOperandVal(1) != VT.getScalarSizeInBits() * 2 - 1)
23305 return SDValue();
23306
23307 SDLoc DL(N);
23308 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, AShr);
23309 SDValue Shift = DAG.getNode(
23310 AArch64ISD::VLSHR, DL, VT, Trunc,
23311 DAG.getTargetConstant(VT.getScalarSizeInBits() - 1, DL, MVT::i32));
23312 return DAG.getNode(ISD::ADD, DL, VT, Trunc, Shift);
23313}
23314
23315static SDValue performSubNegAndOneCombine(SDNode *N, SelectionDAG &DAG) {
23316 if (N->getOpcode() != ISD::SUB)
23317 return SDValue();
23318
23319 EVT VT = N->getValueType(0);
23320 if (!VT.isFixedLengthVector())
23321 return SDValue();
23322
23323 SDValue Zero = N->getOperand(0);
23324 SDValue And = N->getOperand(1);
23325 if (!isZerosVector(Zero.getNode()) || And.getOpcode() != ISD::AND ||
23326 !isOneVector(And.getOperand(1)))
23327 return SDValue();
23328
23329 SDLoc DL(N);
23330 return DAG.getSetCC(DL, VT, And, DAG.getConstant(0, DL, VT), ISD::SETNE);
23331}
23332
23333// Fold ADD(SBC(Y, 0, W), C) -> SBC(Y, -C, W)
23334// SBC(Y, 0, W) = Y - 0 - ~carry = Y + carry - 1
23335// Adding C: Y + carry - 1 + C = Y - (-C) - ~carry = SBC(Y, -C, W)
23336static SDValue performAddWithSBCCombine(SDNode *N, SelectionDAG &DAG) {
23337 if (N->getOpcode() != ISD::ADD)
23338 return SDValue();
23339 EVT VT = N->getValueType(0);
23340 if (VT != MVT::i32 && VT != MVT::i64)
23341 return SDValue();
23342
23343 SDValue SBC = N->getOperand(0);
23344 SDValue C = N->getOperand(1);
23345 // ADD is commutative; operands may be on either side.
23346 if (SBC.getOpcode() != AArch64ISD::SBC)
23347 std::swap(SBC, C);
23348 if (SBC.getOpcode() != AArch64ISD::SBC || !SBC.hasOneUse())
23349 return SDValue();
23350 if (!isNullConstant(SBC.getOperand(1)))
23351 return SDValue();
23352 // AArch64 SBC (non-flag-setting) has only one output; no flags guard needed.
23353 SDLoc DL(N);
23354 return DAG.getNode(AArch64ISD::SBC, DL, VT, SBC.getOperand(0),
23355 DAG.getNegative(C, DL, VT), SBC.getOperand(2));
23356}
23357
23358static SDValue performAddSubCombine(SDNode *N,
23359 TargetLowering::DAGCombinerInfo &DCI) {
23360 // Try to change sum of two reductions.
23361 if (SDValue Val = performAddUADDVCombine(N, DCI.DAG))
23362 return Val;
23363 if (SDValue Val = performAddDotCombine(N, DCI.DAG))
23364 return Val;
23365 if (SDValue Val = performAddCSelIntoCSinc(N, DCI.DAG))
23366 return Val;
23367 if (SDValue Val = performNegCSelCombine(N, DCI.DAG))
23368 return Val;
23369 if (SDValue Val = performVectorExtCombine(N, DCI.DAG))
23370 return Val;
23371 if (SDValue Val = performAddCombineForShiftedOperands(N, DCI.DAG))
23372 return Val;
23373 if (SDValue Val = performSubAddMULCombine(N, DCI.DAG))
23374 return Val;
23375 if (SDValue Val = performSVEMulAddSubCombine(N, DCI))
23376 return Val;
23377 if (SDValue Val = performAddSubIntoVectorOp(N, DCI.DAG))
23378 return Val;
23379 if (SDValue Val = performSubWithBorrowCombine(N, DCI.DAG))
23380 return Val;
23381 if (SDValue Val = performAddTruncShiftCombine(N, DCI.DAG))
23382 return Val;
23383 if (SDValue Val = performSubNegAndOneCombine(N, DCI.DAG))
23384 return Val;
23385 if (SDValue Val = performAddWithSBCCombine(N, DCI.DAG))
23386 return Val;
23387 if (SDValue Val = performExtBinopLoadFold(N, DCI.DAG))
23388 return Val;
23389
23390 return performAddSubLongCombine(N, DCI);
23391}
23392
23393// Massage DAGs which we can use the high-half "long" operations on into
23394// something isel will recognize better. E.g.
23395//
23396// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
23397// (aarch64_neon_umull (extract_high (v2i64 vec)))
23398// (extract_high (v2i64 (dup128 scalar)))))
23399//
23400static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
23401 TargetLowering::DAGCombinerInfo &DCI,
23402 SelectionDAG &DAG) {
23403 if (DCI.isBeforeLegalizeOps())
23404 return SDValue();
23405
23406 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
23407 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
23408 assert(LHS.getValueType().is64BitVector() &&
23409 RHS.getValueType().is64BitVector() &&
23410 "unexpected shape for long operation");
23411
23412 // Either node could be a DUP, but it's not worth doing both of them (you'd
23413 // just as well use the non-high version) so look for a corresponding extract
23414 // operation on the other "wing".
23415 if (isEssentiallyExtractHighSubvector(LHS)) {
23416 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
23417 if (!RHS.getNode())
23418 return SDValue();
23419 } else if (isEssentiallyExtractHighSubvector(RHS)) {
23420 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
23421 if (!LHS.getNode())
23422 return SDValue();
23423 } else
23424 return SDValue();
23425
23426 if (IID == Intrinsic::not_intrinsic)
23427 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
23428
23429 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
23430 N->getOperand(0), LHS, RHS);
23431}
23432
23433static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
23434 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
23435 unsigned ElemBits = ElemTy.getSizeInBits();
23436
23437 int64_t ShiftAmount;
23438 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
23439 APInt SplatValue, SplatUndef;
23440 unsigned SplatBitSize;
23441 bool HasAnyUndefs;
23442 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
23443 HasAnyUndefs, ElemBits) ||
23444 SplatBitSize != ElemBits)
23445 return SDValue();
23446
23447 ShiftAmount = SplatValue.getSExtValue();
23448 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
23449 ShiftAmount = CVN->getSExtValue();
23450 } else
23451 return SDValue();
23452
23453 // If the shift amount is zero, remove the shift intrinsic.
23454 if (ShiftAmount == 0 && IID != Intrinsic::aarch64_neon_sqshlu)
23455 return N->getOperand(1);
23456
23457 unsigned Opcode;
23458 bool IsRightShift;
23459 switch (IID) {
23460 default:
23461 llvm_unreachable("Unknown shift intrinsic");
23462 case Intrinsic::aarch64_neon_sqshl:
23463 Opcode = AArch64ISD::SQSHL_I;
23464 IsRightShift = false;
23465 break;
23466 case Intrinsic::aarch64_neon_uqshl:
23467 Opcode = AArch64ISD::UQSHL_I;
23468 IsRightShift = false;
23469 break;
23470 case Intrinsic::aarch64_neon_srshl:
23471 Opcode = AArch64ISD::SRSHR_I;
23472 IsRightShift = true;
23473 break;
23474 case Intrinsic::aarch64_neon_urshl:
23475 Opcode = AArch64ISD::URSHR_I;
23476 IsRightShift = true;
23477 break;
23478 case Intrinsic::aarch64_neon_sqshlu:
23479 Opcode = AArch64ISD::SQSHLU_I;
23480 IsRightShift = false;
23481 break;
23482 case Intrinsic::aarch64_neon_sshl:
23483 case Intrinsic::aarch64_neon_ushl:
23484 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
23485 // left shift for positive shift amounts. For negative shifts we can use a
23486 // VASHR/VLSHR as appropriate.
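// For example (illustrative): ushl(v, splat(-3)) becomes VLSHR(v, 3), while
// sshl(v, splat(2)) becomes VSHL(v, 2).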
23487 if (ShiftAmount < 0) {
23488 Opcode = IID == Intrinsic::aarch64_neon_sshl ? AArch64ISD::VASHR
23489 : AArch64ISD::VLSHR;
23490 ShiftAmount = -ShiftAmount;
23491 } else
23492 Opcode = AArch64ISD::VSHL;
23493 IsRightShift = false;
23494 break;
23495 }
23496
23497 EVT VT = N->getValueType(0);
23498 SDValue Op = N->getOperand(1);
23499 SDLoc DL(N);
23500 if (VT == MVT::i64) {
23501 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, Op);
23502 VT = MVT::v1i64;
23503 }
23504
23505 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
23506 Op = DAG.getNode(Opcode, DL, VT, Op,
23507 DAG.getSignedConstant(-ShiftAmount, DL, MVT::i32, true));
23508 if (N->getValueType(0) == MVT::i64)
23509 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
23510 DAG.getConstant(0, DL, MVT::i64));
23511 return Op;
23512 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
23513 Op = DAG.getNode(Opcode, DL, VT, Op,
23514 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
23515 if (N->getValueType(0) == MVT::i64)
23516 Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Op,
23517 DAG.getConstant(0, DL, MVT::i64));
23518 return Op;
23519 }
23520
23521 return SDValue();
23522}
23523
23524// The CRC32[BH] instructions ignore the high bits of their data operand. Since
23525// the intrinsics must be legal and take an i32, this means there's almost
23526// certainly going to be a zext in the DAG which we can eliminate.
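// For example (illustrative): crc32b(crc, and(x, 0xff)) --> crc32b(crc, x),
// since the byte variant only reads the low 8 bits of its data operand.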
23527static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
23528 SDValue AndN = N->getOperand(2);
23529 if (AndN.getOpcode() != ISD::AND)
23530 return SDValue();
23531
23532 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
23533 if (!CMask || CMask->getZExtValue() != Mask)
23534 return SDValue();
23535
23536 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
23537 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
23538}
23539
23540static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
23541 SelectionDAG &DAG) {
23542 SDLoc DL(N);
23543 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
23544 DAG.getNode(Opc, DL, N->getOperand(1).getSimpleValueType(),
23545 N->getOperand(1)),
23546 DAG.getConstant(0, DL, MVT::i64));
23547}
23548
23549static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
23550 SDLoc DL(N);
23551 SDValue Op1 = N->getOperand(1);
23552 SDValue Op2 = N->getOperand(2);
23553 EVT ScalarTy = Op2.getValueType();
23554 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
23555 ScalarTy = MVT::i32;
23556
23557 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
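// For example (illustrative): index_vector(3, 2) yields <3, 5, 7, ...> and is
// built here as add(mul(step_vector(1), splat(2)), splat(3)).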
23558 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
23559 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
23560 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
23561 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
23562 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
23563}
23564
23565static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
23566 SDLoc DL(N);
23567 SDValue Scalar = N->getOperand(3);
23568 EVT ScalarTy = Scalar.getValueType();
23569
23570 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
23571 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
23572
23573 SDValue Passthru = N->getOperand(1);
23574 SDValue Pred = N->getOperand(2);
23575 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, DL, N->getValueType(0),
23576 Pred, Scalar, Passthru);
23577}
23578
23579static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
23580 SDLoc DL(N);
23581 LLVMContext &Ctx = *DAG.getContext();
23582 EVT VT = N->getValueType(0);
23583
23584 assert(VT.isScalableVector() && "Expected a scalable vector.");
23585
23586 // Current lowering only supports the SVE-ACLE types.
23588 return SDValue();
23589
23590 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
23591 unsigned ByteSize = VT.getSizeInBits().getKnownMinValue() / 8;
23592 EVT ByteVT =
23593 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
23594
23595 // Convert everything to the domain of EXT (i.e. bytes).
23596 SDValue Op0 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(1));
23597 SDValue Op1 = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(2));
23598 SDValue Op2 = DAG.getNode(ISD::MUL, DL, MVT::i32, N->getOperand(3),
23599 DAG.getConstant(ElemSize, DL, MVT::i32));
23600
23601 SDValue EXT = DAG.getNode(AArch64ISD::EXT, DL, ByteVT, Op0, Op1, Op2);
23602 return DAG.getNode(ISD::BITCAST, DL, VT, EXT);
23603}
23604
23605static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
23606 TargetLowering::DAGCombinerInfo &DCI,
23607 SelectionDAG &DAG) {
23608 if (DCI.isBeforeLegalize())
23609 return SDValue();
23610
23611 SDValue Comparator = N->getOperand(3);
23612 if (Comparator.getOpcode() == AArch64ISD::DUP ||
23613 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
23614 unsigned IID = getIntrinsicID(N);
23615 EVT VT = N->getValueType(0);
23616 EVT CmpVT = N->getOperand(2).getValueType();
23617 SDValue Pred = N->getOperand(1);
23618 SDValue Imm;
23619 SDLoc DL(N);
23620
23621 switch (IID) {
23622 default:
23623 llvm_unreachable("Called with wrong intrinsic!");
23624 break;
23625
23626 // Signed comparisons
23627 case Intrinsic::aarch64_sve_cmpeq_wide:
23628 case Intrinsic::aarch64_sve_cmpne_wide:
23629 case Intrinsic::aarch64_sve_cmpge_wide:
23630 case Intrinsic::aarch64_sve_cmpgt_wide:
23631 case Intrinsic::aarch64_sve_cmplt_wide:
23632 case Intrinsic::aarch64_sve_cmple_wide: {
23633 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23634 int64_t ImmVal = CN->getSExtValue();
23635 if (ImmVal >= -16 && ImmVal <= 15)
23636 Imm = DAG.getSignedConstant(ImmVal, DL, MVT::i32);
23637 else
23638 return SDValue();
23639 }
23640 break;
23641 }
23642 // Unsigned comparisons
23643 case Intrinsic::aarch64_sve_cmphs_wide:
23644 case Intrinsic::aarch64_sve_cmphi_wide:
23645 case Intrinsic::aarch64_sve_cmplo_wide:
23646 case Intrinsic::aarch64_sve_cmpls_wide: {
23647 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
23648 uint64_t ImmVal = CN->getZExtValue();
23649 if (ImmVal <= 127)
23650 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
23651 else
23652 return SDValue();
23653 }
23654 break;
23655 }
23656 }
23657
23658 if (!Imm)
23659 return SDValue();
23660
23661 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
23662 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
23663 N->getOperand(2), Splat, DAG.getCondCode(CC));
23664 }
23665
23666 return SDValue();
23667}
23668
23671 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23672
23673 SDLoc DL(Op);
23674 assert(Op.getValueType().isScalableVector() &&
23675 TLI.isTypeLegal(Op.getValueType()) &&
23676 "Expected legal scalable vector type!");
23677 assert(Op.getValueType() == Pg.getValueType() &&
23678 "Expected same type for PTEST operands");
23679
23680 // Ensure target specific opcodes are using legal type.
23681 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
23682 SDValue TVal = DAG.getConstant(1, DL, OutVT);
23683 SDValue FVal = DAG.getConstant(0, DL, OutVT);
23684
23685 // Ensure operands have type nxv16i1.
23686 if (Op.getValueType() != MVT::nxv16i1) {
23689 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
23690 else
23691 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
23692 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
23693 }
23694
23695 unsigned PTest = AArch64ISD::PTEST;
23696 if (Cond == AArch64CC::ANY_ACTIVE)
23697 PTest = AArch64ISD::PTEST_ANY;
23698 else if (Cond == AArch64CC::FIRST_ACTIVE)
23699 PTest = AArch64ISD::PTEST_FIRST;
23700
23701 // Set condition code (CC) flags.
23702 SDValue Test = DAG.getNode(PTest, DL, MVT::i32, Pg, Op);
23703
23704 // Convert CC to integer based on requested condition.
23705 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
23706 SDValue CC = getCondCode(DAG, getInvertedCondCode(Cond));
23707 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
23708 return DAG.getZExtOrTrunc(Res, DL, VT);
23709}
23710
23712 SelectionDAG &DAG) {
23713 SDLoc DL(N);
23714
23715 SDValue Pred = N->getOperand(1);
23716 SDValue VecToReduce = N->getOperand(2);
23717
23718 // NOTE: The integer reduction's result type is not always linked to the
23719 // operand's element type so we construct it from the intrinsic's result type.
23720 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
23721 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23722
23723 // SVE reductions set the whole vector register with the first element
23724 // containing the reduction result, which we'll now extract.
23725 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23726 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23727 Zero);
23728}
23729
23731 SelectionDAG &DAG) {
23732 SDLoc DL(N);
23733
23734 SDValue Pred = N->getOperand(1);
23735 SDValue VecToReduce = N->getOperand(2);
23736
23737 EVT ReduceVT = VecToReduce.getValueType();
23738 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
23739
23740 // SVE reductions set the whole vector register with the first element
23741 // containing the reduction result, which we'll now extract.
23742 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23743 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23744 Zero);
23745}
23746
23748 SelectionDAG &DAG) {
23749 SDLoc DL(N);
23750
23751 SDValue Pred = N->getOperand(1);
23752 SDValue InitVal = N->getOperand(2);
23753 SDValue VecToReduce = N->getOperand(3);
23754 EVT ReduceVT = VecToReduce.getValueType();
23755
23756 // Ordered reductions use the first lane of the result vector as the
23757 // reduction's initial value.
23758 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
23759 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
23760 DAG.getPOISON(ReduceVT), InitVal, Zero);
23761
23762 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
23763
23764 // SVE reductions set the whole vector register with the first element
23765 // containing the reduction result, which we'll now extract.
23766 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
23767 Zero);
23768}
23769
23770static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
23771 SelectionDAG &DAG) {
23772 if (N->getValueType(0) != MVT::i16)
23773 return SDValue();
23774
23775 SDLoc DL(N);
23776 SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
23777 SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
23778 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
23779}
23780
23781// If a merged operation has no inactive lanes we can relax it to a predicated
23782// or unpredicated operation, which potentially allows better isel (perhaps
23783// using immediate forms) or relaxing register reuse requirements.
23784static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
23785 SelectionDAG &DAG, bool UnpredOp = false,
23786 bool SwapOperands = false) {
23787 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
23788 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
23789 SDValue Pg = N->getOperand(1);
23790 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
23791 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
23792
23793 // ISD way to specify an all active predicate.
23794 if (isAllActivePredicate(DAG, Pg)) {
23795 if (UnpredOp)
23796 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
23797
23798 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
23799 }
23800
23801 // FUTURE: SplatVector(true)
23802 return SDValue();
23803}
23804
23805static SDValue combineSVEBitSel(unsigned IID, SDNode *N, SelectionDAG &DAG) {
23806 SDLoc DL(N);
23807 EVT VT = N->getValueType(0);
23808 SDValue Op1 = N->getOperand(1);
23809 SDValue Op2 = N->getOperand(2);
23810 SDValue Op3 = N->getOperand(3);
23811
23812 switch (IID) {
23813 default:
23814 llvm_unreachable("Called with wrong intrinsic!");
23815 case Intrinsic::aarch64_sve_bsl:
23816 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2);
23817 case Intrinsic::aarch64_sve_bsl1n:
23818 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, DAG.getNOT(DL, Op1, VT),
23819 Op2);
23820 case Intrinsic::aarch64_sve_bsl2n:
23821 return DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1,
23822 DAG.getNOT(DL, Op2, VT));
23823 case Intrinsic::aarch64_sve_nbsl:
23824 return DAG.getNOT(DL, DAG.getNode(AArch64ISD::BSP, DL, VT, Op3, Op1, Op2),
23825 VT);
23826 }
23827}
23828
23829/// Optimize patterns where we insert zeros into vector lanes before a
23830/// floating-point add reduction. This handles both the aarch64.neon.faddv
23831/// intrinsic and ISD::VECREDUCE_FADD/VECREDUCE_SEQ_FADD.
23832/// When Start is set, the reduction is sequential (left-to-right), otherwise
23833/// it is pairwise (tree).
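/// For example (illustrative): with no signed zeros to worry about, a pairwise
/// reduction of <a, 0.0, b, 0.0> can be decomposed into the single fadd(a, b).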
23834static SDValue
23835tryCombineFADDReductionWithZero(SDNode *N, SelectionDAG &DAG,
23836 const AArch64Subtarget *Subtarget, SDValue Vec,
23837 SDValue Start = SDValue()) {
23838 EVT VT = Vec.getValueType();
23839 if (!VT.isFixedLengthVector())
23840 return SDValue();
23841 EVT EltVT = VT.getVectorElementType();
23842 if (EltVT != MVT::f32 && EltVT != MVT::f64 &&
23843 !(EltVT == MVT::f16 && Subtarget->hasFullFP16()))
23844 return SDValue();
23845
23846 SDLoc DL(N);
23847 unsigned NumElts = VT.getVectorNumElements();
23848 // Pairwise reduction requires power-of-2 number of elements.
23849 if (!isPowerOf2_32(NumElts))
23850 return SDValue();
23851
23852 // Check if we can output a signed zero.
23853 // This avoids the scenario where all the added values are -0.0 except the
23854 // +0.0 element we chose to ignore.
23855 SDNodeFlags Flags = N->getFlags();
23856 bool IsSignedZeroSafe =
23857 Flags.hasNoSignedZeros() || DAG.canIgnoreSignBitOfZero(SDValue(N, 0));
23858
23859 // Determine which elements are known to be zero. When signed zeros can be
23860 // ignored, an element is "zero" if all bits except the sign bit are known
23861 // zero (i.e., the element is +0.0 or -0.0).
23862 unsigned EltBitWidth = EltVT.getSizeInBits();
23863 APInt ZeroMask = IsSignedZeroSafe ? APInt::getSignedMaxValue(EltBitWidth)
23864 : APInt::getAllOnes(EltBitWidth);
23865 APInt KnownZeroElts = APInt::getZero(NumElts);
23866 for (unsigned I = 0; I < NumElts; ++I) {
23867 APInt DemandedElt = APInt::getOneBitSet(NumElts, I);
23868 KnownBits KB = DAG.computeKnownBits(Vec, DemandedElt);
23869 if (ZeroMask.isSubsetOf(KB.Zero))
23870 KnownZeroElts.setBit(I);
23871 }
23872 unsigned NumZeroElts = KnownZeroElts.popcount();
23873 // All elements are zero.
23874 if (NumZeroElts == NumElts) {
23875 if (!Start)
23876 return DAG.getConstantFP(0.0, DL, EltVT);
23877 if (IsSignedZeroSafe)
23878 return Start;
23879 return DAG.getNode(ISD::FADD, DL, EltVT, Start,
23880 DAG.getConstantFP(0.0, DL, EltVT));
23881 }
23882
23883 // Decomposing the reduction into scalar FADDs is only profitable when
23884 // enough elements are zero.
23885 unsigned MinZeroElts;
23886 switch (NumElts) {
23887 case 2:
23888 case 4:
23889 MinZeroElts = 1;
23890 break;
23891 case 8:
23892 MinZeroElts = 3;
23893 break;
23894 default:
23895 MinZeroElts = NumElts / 2;
23896 break;
23897 }
23898 if (NumZeroElts < MinZeroElts)
23899 return SDValue();
23900
23901 if (!IsSignedZeroSafe)
23902 return SDValue();
23903
23904 // Sequential reduction: add every element in order, skipping zeros.
23905 if (Start) {
23906 SDValue Acc = Start;
23907 for (unsigned I = 0; I < NumElts; I++) {
23908 if (KnownZeroElts[I])
23909 continue;
23910 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
23911 DAG.getConstant(I, DL, MVT::i64));
23912
23913 Acc = DAG.getNode(ISD::FADD, DL, EltVT, Acc, Elt, Flags);
23914 }
23915 return Acc;
23916 }
23917
23918 // Pairwise reduction: extract all elements, then reduce pairwise.
23919 SmallVector<SDValue> Elts;
23920 for (unsigned I = 0; I < NumElts; I++) {
23921 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
23922 DAG.getConstant(I, DL, MVT::i64)));
23923 }
23924 while (Elts.size() > 1) {
23925 SmallVector<SDValue> NewElts;
23926 APInt NewKnownZeroElts = APInt::getZero(Elts.size() / 2);
23927 for (unsigned I = 0; I < Elts.size(); I += 2) {
23928 bool ZeroI = KnownZeroElts[I];
23929 bool ZeroI1 = KnownZeroElts[I + 1];
23930 if (ZeroI && ZeroI1) {
23931 // Both elements are zero, result is zero.
23932 NewElts.push_back(Elts[I]);
23933 NewKnownZeroElts.setBit(I / 2);
23934 } else if (ZeroI) {
23935 NewElts.push_back(Elts[I + 1]);
23936 } else if (ZeroI1) {
23937 NewElts.push_back(Elts[I]);
23938 } else {
23939 NewElts.push_back(
23940 DAG.getNode(ISD::FADD, DL, EltVT, Elts[I], Elts[I + 1], Flags));
23941 }
23942 }
23943 Elts = std::move(NewElts);
23944 KnownZeroElts = NewKnownZeroElts;
23945 }
23946 return Elts[0];
23947}
23948
23949static SDValue performIntrinsicCombine(SDNode *N,
23950 TargetLowering::DAGCombinerInfo &DCI,
23951 const AArch64Subtarget *Subtarget) {
23952 SelectionDAG &DAG = DCI.DAG;
23953 unsigned IID = getIntrinsicID(N);
23954 switch (IID) {
23955 default:
23956 break;
23957 case Intrinsic::aarch64_neon_vcvtfxs2fp:
23958 case Intrinsic::aarch64_neon_vcvtfxu2fp:
23959 return tryCombineFixedPointConvert(N, DCI, DAG);
23960 case Intrinsic::aarch64_neon_saddv:
23961 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
23962 case Intrinsic::aarch64_neon_uaddv:
23963 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
23964 case Intrinsic::aarch64_neon_sminv:
23965 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
23966 case Intrinsic::aarch64_neon_uminv:
23967 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
23968 case Intrinsic::aarch64_neon_smaxv:
23969 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
23970 case Intrinsic::aarch64_neon_umaxv:
23971 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
23972 case Intrinsic::aarch64_neon_faddv:
23973 return tryCombineFADDReductionWithZero(N, DAG, Subtarget, N->getOperand(1));
23974 case Intrinsic::aarch64_neon_fmax:
23975 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
23976 N->getOperand(1), N->getOperand(2));
23977 case Intrinsic::aarch64_neon_fmin:
23978 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
23979 N->getOperand(1), N->getOperand(2));
23980 case Intrinsic::aarch64_neon_fmaxnm:
23981 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
23982 N->getOperand(1), N->getOperand(2));
23983 case Intrinsic::aarch64_neon_fminnm:
23984 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
23985 N->getOperand(1), N->getOperand(2));
23986 case Intrinsic::aarch64_neon_smull:
23987 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
23988 N->getOperand(1), N->getOperand(2));
23989 case Intrinsic::aarch64_neon_umull:
23990 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
23991 N->getOperand(1), N->getOperand(2));
23992 case Intrinsic::aarch64_neon_pmull:
23993 return DAG.getNode(AArch64ISD::PMULL, SDLoc(N), N->getValueType(0),
23994 N->getOperand(1), N->getOperand(2));
23995 case Intrinsic::aarch64_neon_sqdmull:
23996 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
23997 case Intrinsic::aarch64_neon_sqshl:
23998 case Intrinsic::aarch64_neon_uqshl:
23999 case Intrinsic::aarch64_neon_sqshlu:
24000 case Intrinsic::aarch64_neon_srshl:
24001 case Intrinsic::aarch64_neon_urshl:
24002 case Intrinsic::aarch64_neon_sshl:
24003 case Intrinsic::aarch64_neon_ushl:
24004 return tryCombineShiftImm(IID, N, DAG);
24005 case Intrinsic::aarch64_neon_sabd:
24006 return DAG.getNode(ISD::ABDS, SDLoc(N), N->getValueType(0),
24007 N->getOperand(1), N->getOperand(2));
24008 case Intrinsic::aarch64_neon_uabd:
24009 return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
24010 N->getOperand(1), N->getOperand(2));
24011 case Intrinsic::aarch64_neon_fcvtzs:
24012 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
24013 case Intrinsic::aarch64_neon_fcvtzu:
24014 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
24015 case Intrinsic::aarch64_neon_fcvtas:
24016 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
24017 case Intrinsic::aarch64_neon_fcvtau:
24018 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
24019 case Intrinsic::aarch64_neon_fcvtms:
24020 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
24021 case Intrinsic::aarch64_neon_fcvtmu:
24022 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
24023 case Intrinsic::aarch64_neon_fcvtns:
24024 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
24025 case Intrinsic::aarch64_neon_fcvtnu:
24026 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
24027 case Intrinsic::aarch64_neon_fcvtps:
24028 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
24029 case Intrinsic::aarch64_neon_fcvtpu:
24030 return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
24031 case Intrinsic::aarch64_crc32b:
24032 case Intrinsic::aarch64_crc32cb:
24033 return tryCombineCRC32(0xff, N, DAG);
24034 case Intrinsic::aarch64_crc32h:
24035 case Intrinsic::aarch64_crc32ch:
24036 return tryCombineCRC32(0xffff, N, DAG);
24037 case Intrinsic::aarch64_sve_saddv:
24038 // There is no i64 version of SADDV because the sign is irrelevant.
24039 if (N->getOperand(2).getValueType().getVectorElementType() == MVT::i64)
24040 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
24041 else
24042 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
24043 case Intrinsic::aarch64_sve_uaddv:
24044 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
24045 case Intrinsic::aarch64_sve_smaxv:
24046 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
24047 case Intrinsic::aarch64_sve_umaxv:
24048 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
24049 case Intrinsic::aarch64_sve_sminv:
24050 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
24051 case Intrinsic::aarch64_sve_uminv:
24052 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
24053 case Intrinsic::aarch64_sve_orv:
24054 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
24055 case Intrinsic::aarch64_sve_eorv:
24056 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
24057 case Intrinsic::aarch64_sve_andv:
24058 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
24059 case Intrinsic::aarch64_sve_index:
24060 return LowerSVEIntrinsicIndex(N, DAG);
24061 case Intrinsic::aarch64_sve_dup:
24062 return LowerSVEIntrinsicDUP(N, DAG);
24063 case Intrinsic::aarch64_sve_dup_x:
24064 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
24065 N->getOperand(1));
24066 case Intrinsic::aarch64_sve_ext:
24067 return LowerSVEIntrinsicEXT(N, DAG);
24068 case Intrinsic::aarch64_sve_mul_u:
24069 return DAG.getNode(AArch64ISD::MUL_PRED, SDLoc(N), N->getValueType(0),
24070 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24071 case Intrinsic::aarch64_sve_smulh_u:
24072 return DAG.getNode(AArch64ISD::MULHS_PRED, SDLoc(N), N->getValueType(0),
24073 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24074 case Intrinsic::aarch64_sve_umulh_u:
24075 return DAG.getNode(AArch64ISD::MULHU_PRED, SDLoc(N), N->getValueType(0),
24076 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24077 case Intrinsic::aarch64_sve_smin_u:
24078 return DAG.getNode(AArch64ISD::SMIN_PRED, SDLoc(N), N->getValueType(0),
24079 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24080 case Intrinsic::aarch64_sve_umin_u:
24081 return DAG.getNode(AArch64ISD::UMIN_PRED, SDLoc(N), N->getValueType(0),
24082 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24083 case Intrinsic::aarch64_sve_smax_u:
24084 return DAG.getNode(AArch64ISD::SMAX_PRED, SDLoc(N), N->getValueType(0),
24085 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24086 case Intrinsic::aarch64_sve_umax_u:
24087 return DAG.getNode(AArch64ISD::UMAX_PRED, SDLoc(N), N->getValueType(0),
24088 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24089 case Intrinsic::aarch64_sve_lsl_u:
24090 return DAG.getNode(AArch64ISD::SHL_PRED, SDLoc(N), N->getValueType(0),
24091 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24092 case Intrinsic::aarch64_sve_lsr_u:
24093 return DAG.getNode(AArch64ISD::SRL_PRED, SDLoc(N), N->getValueType(0),
24094 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24095 case Intrinsic::aarch64_sve_asr_u:
24096 return DAG.getNode(AArch64ISD::SRA_PRED, SDLoc(N), N->getValueType(0),
24097 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24098 case Intrinsic::aarch64_sve_fadd_u:
24099 return DAG.getNode(AArch64ISD::FADD_PRED, SDLoc(N), N->getValueType(0),
24100 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24101 case Intrinsic::aarch64_sve_fdiv_u:
24102 return DAG.getNode(AArch64ISD::FDIV_PRED, SDLoc(N), N->getValueType(0),
24103 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24104 case Intrinsic::aarch64_sve_fmax_u:
24105 return DAG.getNode(AArch64ISD::FMAX_PRED, SDLoc(N), N->getValueType(0),
24106 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24107 case Intrinsic::aarch64_sve_fmaxnm_u:
24108 return DAG.getNode(AArch64ISD::FMAXNM_PRED, SDLoc(N), N->getValueType(0),
24109 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24110 case Intrinsic::aarch64_sve_fmla_u:
24111 return DAG.getNode(AArch64ISD::FMA_PRED, SDLoc(N), N->getValueType(0),
24112 N->getOperand(1), N->getOperand(3), N->getOperand(4),
24113 N->getOperand(2));
24114 case Intrinsic::aarch64_sve_fmin_u:
24115 return DAG.getNode(AArch64ISD::FMIN_PRED, SDLoc(N), N->getValueType(0),
24116 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24117 case Intrinsic::aarch64_sve_fminnm_u:
24118 return DAG.getNode(AArch64ISD::FMINNM_PRED, SDLoc(N), N->getValueType(0),
24119 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24120 case Intrinsic::aarch64_sve_fmul_u:
24121 return DAG.getNode(AArch64ISD::FMUL_PRED, SDLoc(N), N->getValueType(0),
24122 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24123 case Intrinsic::aarch64_sve_fsub_u:
24124 return DAG.getNode(AArch64ISD::FSUB_PRED, SDLoc(N), N->getValueType(0),
24125 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24126 case Intrinsic::aarch64_sve_add_u:
24127 return DAG.getNode(ISD::ADD, SDLoc(N), N->getValueType(0), N->getOperand(2),
24128 N->getOperand(3));
24129 case Intrinsic::aarch64_sve_sub_u:
24130 return DAG.getNode(ISD::SUB, SDLoc(N), N->getValueType(0), N->getOperand(2),
24131 N->getOperand(3));
24132 case Intrinsic::aarch64_sve_subr:
24133 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
24134 case Intrinsic::aarch64_sve_and_u:
24135 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(2),
24136 N->getOperand(3));
24137 case Intrinsic::aarch64_sve_bic_u:
24138 return DAG.getNode(AArch64ISD::BIC, SDLoc(N), N->getValueType(0),
24139 N->getOperand(2), N->getOperand(3));
24140 case Intrinsic::aarch64_sve_saddwb:
24141 return DAG.getNode(AArch64ISD::SADDWB, SDLoc(N), N->getValueType(0),
24142 N->getOperand(1), N->getOperand(2));
24143 case Intrinsic::aarch64_sve_saddwt:
24144 return DAG.getNode(AArch64ISD::SADDWT, SDLoc(N), N->getValueType(0),
24145 N->getOperand(1), N->getOperand(2));
24146 case Intrinsic::aarch64_sve_uaddwb:
24147 return DAG.getNode(AArch64ISD::UADDWB, SDLoc(N), N->getValueType(0),
24148 N->getOperand(1), N->getOperand(2));
24149 case Intrinsic::aarch64_sve_uaddwt:
24150 return DAG.getNode(AArch64ISD::UADDWT, SDLoc(N), N->getValueType(0),
24151 N->getOperand(1), N->getOperand(2));
24152 case Intrinsic::aarch64_sve_eor_u:
24153 return DAG.getNode(ISD::XOR, SDLoc(N), N->getValueType(0), N->getOperand(2),
24154 N->getOperand(3));
24155 case Intrinsic::aarch64_sve_orr_u:
24156 return DAG.getNode(ISD::OR, SDLoc(N), N->getValueType(0), N->getOperand(2),
24157 N->getOperand(3));
24158 case Intrinsic::aarch64_sve_sabd_u:
24159 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDS, DAG, true))
24160 return V;
24161 return DAG.getNode(AArch64ISD::ABDS_PRED, SDLoc(N), N->getValueType(0),
24162 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24163 case Intrinsic::aarch64_sve_uabd_u:
24164 if (SDValue V = convertMergedOpToPredOp(N, ISD::ABDU, DAG, true))
24165 return V;
24166 return DAG.getNode(AArch64ISD::ABDU_PRED, SDLoc(N), N->getValueType(0),
24167 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24168 case Intrinsic::aarch64_sve_sqadd:
24169 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
24170 case Intrinsic::aarch64_sve_sqsub_u:
24171 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
24172 N->getOperand(2), N->getOperand(3));
24173 case Intrinsic::aarch64_sve_uqadd:
24174 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
24175 case Intrinsic::aarch64_sve_uqsub_u:
24176 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
24177 N->getOperand(2), N->getOperand(3));
24178 case Intrinsic::aarch64_sve_sqadd_x:
24179 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
24180 N->getOperand(1), N->getOperand(2));
24181 case Intrinsic::aarch64_sve_sqsub_x:
24182 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
24183 N->getOperand(1), N->getOperand(2));
24184 case Intrinsic::aarch64_sve_uqadd_x:
24185 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
24186 N->getOperand(1), N->getOperand(2));
24187 case Intrinsic::aarch64_sve_uqsub_x:
24188 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
24189 N->getOperand(1), N->getOperand(2));
24190 case Intrinsic::aarch64_sve_asrd:
24191 return DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, SDLoc(N), N->getValueType(0),
24192 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24193 case Intrinsic::aarch64_sve_cmphs:
24194 if (!N->getOperand(2).getValueType().isFloatingPoint())
24195 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24196 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24197 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
24198 break;
24199 case Intrinsic::aarch64_sve_cmphi:
24200 if (!N->getOperand(2).getValueType().isFloatingPoint())
24201 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24202 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24203 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
24204 break;
24205 case Intrinsic::aarch64_sve_fcmpge:
24206 case Intrinsic::aarch64_sve_cmpge:
24207 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24208 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24209 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
24210 break;
24211 case Intrinsic::aarch64_sve_fcmpgt:
24212 case Intrinsic::aarch64_sve_cmpgt:
24213 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24214 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24215 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
24216 break;
24217 case Intrinsic::aarch64_sve_fcmpeq:
24218 case Intrinsic::aarch64_sve_cmpeq:
24219 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24220 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24221 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
24222 break;
24223 case Intrinsic::aarch64_sve_fcmpne:
24224 case Intrinsic::aarch64_sve_cmpne:
24225 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24226 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24227 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
24228 break;
24229 case Intrinsic::aarch64_sve_fcmpuo:
24230 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
24231 N->getValueType(0), N->getOperand(1), N->getOperand(2),
24232 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
24233 break;
24234 case Intrinsic::aarch64_sve_fadda:
24235 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
24236 case Intrinsic::aarch64_sve_faddv:
24237 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
24238 case Intrinsic::aarch64_sve_fmaxnmv:
24239 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
24240 case Intrinsic::aarch64_sve_fmaxv:
24241 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
24242 case Intrinsic::aarch64_sve_fminnmv:
24243 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
24244 case Intrinsic::aarch64_sve_fminv:
24245 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
24246 case Intrinsic::aarch64_sve_sel:
24247 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
24248 N->getOperand(1), N->getOperand(2), N->getOperand(3));
24249 case Intrinsic::aarch64_sve_cmpeq_wide:
24250 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
24251 case Intrinsic::aarch64_sve_cmpne_wide:
24252 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
24253 case Intrinsic::aarch64_sve_cmpge_wide:
24254 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
24255 case Intrinsic::aarch64_sve_cmpgt_wide:
24256 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
24257 case Intrinsic::aarch64_sve_cmplt_wide:
24258 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
24259 case Intrinsic::aarch64_sve_cmple_wide:
24260 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
24261 case Intrinsic::aarch64_sve_cmphs_wide:
24262 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
24263 case Intrinsic::aarch64_sve_cmphi_wide:
24264 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
24265 case Intrinsic::aarch64_sve_cmplo_wide:
24266 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
24267 case Intrinsic::aarch64_sve_cmpls_wide:
24268 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
24269 case Intrinsic::aarch64_sve_ptest_any:
24270 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
24271 AArch64CC::ANY_ACTIVE);
24272 case Intrinsic::aarch64_sve_ptest_first:
24273 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
24274 AArch64CC::FIRST_ACTIVE);
24275 case Intrinsic::aarch64_sve_ptest_last:
24276 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
24277 AArch64CC::LAST_ACTIVE);
24278 case Intrinsic::aarch64_sve_whilelo:
24279 return DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, SDLoc(N), N->getValueType(0),
24280 N->getOperand(1), N->getOperand(2));
24281 case Intrinsic::aarch64_sve_bsl:
24282 case Intrinsic::aarch64_sve_bsl1n:
24283 case Intrinsic::aarch64_sve_bsl2n:
24284 case Intrinsic::aarch64_sve_nbsl:
24285 return combineSVEBitSel(IID, N, DAG);
24286 }
24287 return SDValue();
24288}
24289
24290static bool isCheapToExtend(const SDValue &N) {
24291 unsigned OC = N->getOpcode();
24292 return OC == ISD::LOAD || OC == ISD::MLOAD ||
24293 ISD::isConstantSplatVectorAllZeros(N.getNode());
24294}
24295
24296static SDValue
24297performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
24298 SelectionDAG &DAG) {
24299 // If we have (sext (setcc A B)) and A and B are cheap to extend,
24300 // we can move the sext into the arguments and have the same result. For
24301 // example, if A and B are both loads, we can make those extending loads and
24302 // avoid an extra instruction. This pattern appears often in VLS code
24303 // generation where the inputs to the setcc have a different size to the
24304 // instruction that wants to use the result of the setcc.
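  // A rough sketch of the intent (types invented for exposition):
  //   v8i16 (sext (setcc slt, (v8i8 load %a), (v8i8 load %b)))
  // becomes a v8i16 setcc of two sign-extended operands, so the loads can be
  // folded into extending loads and no separate extend of the compare result
  // is needed.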
24305 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
24306 N->getOperand(0)->getOpcode() == ISD::SETCC);
24307 const SDValue SetCC = N->getOperand(0);
24308
24309 const SDValue CCOp0 = SetCC.getOperand(0);
24310 const SDValue CCOp1 = SetCC.getOperand(1);
24311 if (!CCOp0->getValueType(0).isInteger() ||
24312 !CCOp1->getValueType(0).isInteger())
24313 return SDValue();
24314
24315 ISD::CondCode Code =
24316 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
24317
24318 ISD::NodeType ExtType =
24319 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24320
24321 if (isCheapToExtend(SetCC.getOperand(0)) &&
24322 isCheapToExtend(SetCC.getOperand(1))) {
24323 const SDValue Ext1 =
24324 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
24325 const SDValue Ext2 =
24326 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
24327
24328 return DAG.getSetCC(
24329 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
24330 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
24331 }
24332
24333 return SDValue();
24334}
24335
24336// Convert zext(extract(shuffle a, b, [0,4,8,12])) -> and(uzp1(a, b), 255)
24337// This comes from interleaved vectorization. It is performed late to capture
24338// uitofp converts too.
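// As an illustration (types invented for exposition): with v8i16 shuffle
// inputs and a v4i32 result, an odd starting index such as [1,5,9,13] selects
// the second element of each pair, so the combine produces
// and(srl(uzp1(a, b), 16), 0xffff) rather than the plain and(uzp1(a, b), mask).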
24339static SDValue performZExtDeinterleaveShuffleCombine(SDNode *N,
24340 SelectionDAG &DAG) {
24341 EVT VT = N->getValueType(0);
24342 if ((VT != MVT::v4i32 && VT != MVT::v8i16) ||
24343 N->getOpcode() != ISD::ZERO_EXTEND ||
24344 N->getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
24345 return SDValue();
24346
24347 unsigned ExtOffset = N->getOperand(0).getConstantOperandVal(1);
24348 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
24349 return SDValue();
24350
24351 auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0).getOperand(0));
24352 if (!Shuffle)
24353 return SDValue();
24354
24355 // From here it is safe to assume InVT is a fixed-length vector. The only
24356 // legal scalable vector shuffle is splat, and it should have been lowered to
24357 // vector_splat.
24358 EVT InVT = N->getOperand(0).getOperand(0).getValueType();
24359 assert(InVT.isFixedLengthVector() && "Unexpected scalable shufflevector.");
24360 if (InVT.getVectorNumElements() != VT.getVectorNumElements() * 2 ||
24361 InVT.getScalarSizeInBits() * 2 != VT.getScalarSizeInBits())
24362 return SDValue();
24363
24364 unsigned Idx;
24365 bool IsDeInterleave = ShuffleVectorInst::isDeInterleaveMaskOfFactor(
24366 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements()), 4, Idx);
24367 // An undef interleave shuffle can come up after other canonicalizations,
24368 // where the shuffle has been converted to
24369 // zext(extract(shuffle b, undef, [u,u,0,4]))
24370 bool IsUndefDeInterleave = false;
24371 if (!IsDeInterleave)
24372 IsUndefDeInterleave =
24373 Shuffle->getOperand(1).isUndef() &&
24374 all_of(
24375 Shuffle->getMask().slice(ExtOffset, VT.getVectorNumElements() / 2),
24376 [](int M) { return M < 0; }) &&
24377 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
24378 Shuffle->getMask().slice(ExtOffset + VT.getVectorNumElements() / 2,
24379 VT.getVectorNumElements() / 2),
24380 4, Idx);
24381 if ((!IsDeInterleave && !IsUndefDeInterleave) || Idx >= 4)
24382 return SDValue();
24383 SDLoc DL(N);
24384 SDValue BC1 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
24385 Shuffle->getOperand(IsUndefDeInterleave ? 1 : 0));
24386 SDValue BC2 = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
24387 Shuffle->getOperand(IsUndefDeInterleave ? 0 : 1));
24388 SDValue UZP = DAG.getNode(Idx < 2 ? AArch64ISD::UZP1 : AArch64ISD::UZP2, DL,
24389 VT, BC1, BC2);
24390 if ((Idx & 1) == 1)
24391 UZP = DAG.getNode(ISD::SRL, DL, VT, UZP,
24392 DAG.getConstant(InVT.getScalarSizeInBits(), DL, VT));
24393 return DAG.getNode(
24394 ISD::AND, DL, VT, UZP,
24395 DAG.getConstant((1 << InVT.getScalarSizeInBits()) - 1, DL, VT));
24396}
24397
24398// This comes up similarly to the above when lowering deinterleaving shuffles
24399// from zexts. In the general case the operations have been legalized to
24400// zext(extract_subvector(uzp(a, b))), which can be converted to and(a, mask) if
24401// the extract is to the low half and the uzp is uzp1. An extra shift is needed
24402// if the uzp was uzp2, to grab the upper half. Due to the combine above there
24403// could also be an existing and / shift that can be combined in, either before
24404// or after the extract.
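// A minimal illustration (types invented for exposition): for
//   v4i32 zext(v4i16 extract_subvector(uzp1(v8i16 a, v8i16 b), 0))
// the uzp1 keeps the even i16 lanes of the concatenation, so the whole
// expression collapses to and(nvcast(a), 0xffff) with no shuffle at all.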
24405static SDValue performZExtUZPCombine(SDNode *N, SelectionDAG &DAG) {
24406 EVT VT = N->getValueType(0);
24407 if (N->getOpcode() != ISD::ZERO_EXTEND ||
24408 (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16))
24409 return SDValue();
24410
24411 SDValue Op = N->getOperand(0);
24412 unsigned ExtOffset = (unsigned)-1;
24413 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
24414 ExtOffset = Op.getConstantOperandVal(1);
24415 Op = Op.getOperand(0);
24416 // Avoid NVCAST from a scalable vector to a fixed-size one.
24417 if (Op.getValueType().isScalableVector())
24418 return SDValue();
24419 }
24420
24421 unsigned Shift = 0;
24422 APInt Mask = APInt::getLowBitsSet(VT.getScalarSizeInBits(),
24423 Op.getValueType().getScalarSizeInBits());
24424
24425 if (Op.getOpcode() == AArch64ISD::VLSHR) {
24426 Shift = Op.getConstantOperandVal(1);
24427 Op = Op.getOperand(0);
24428 Mask = Mask.lshr(Shift);
24429 }
24430 if (Op.getOpcode() == ISD::AND &&
24431 ISD::isConstantSplatVector(Op.getOperand(1).getNode(), Mask)) {
24432 Op = Op.getOperand(0);
24433 Mask = Mask.zext(VT.getScalarSizeInBits());
24434 } else if (Op.getOpcode() == AArch64ISD::BICi) {
24435 Mask = ~APInt(Op.getValueType().getScalarSizeInBits(),
24436 Op.getConstantOperandVal(1) << Op.getConstantOperandVal(2));
24437 Mask = Mask.zext(VT.getScalarSizeInBits());
24438 Op = Op.getOperand(0);
24439 }
24440
24441 if (ExtOffset == (unsigned)-1) {
24442 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
24443 ExtOffset = Op.getConstantOperandVal(1);
24444 Op = Op.getOperand(0);
24445 // Avoid NVCAST from a scalable vector to a fixed-size one.
24446 if (Op.getValueType().isScalableVector())
24447 return SDValue();
24448 } else
24449 return SDValue();
24450 }
24451 if (ExtOffset != 0 && ExtOffset != VT.getVectorNumElements())
24452 return SDValue();
24453
24454 if (Op.getOpcode() != AArch64ISD::UZP1 && Op.getOpcode() != AArch64ISD::UZP2)
24455 return SDValue();
24456 if (Op.getOpcode() == AArch64ISD::UZP2)
24457 Shift += VT.getScalarSizeInBits() / 2;
24458
24459 SDLoc DL(N);
24460 SDValue BC = DAG.getNode(AArch64ISD::NVCAST, DL, VT,
24461 Op.getOperand(ExtOffset == 0 ? 0 : 1));
24462 if (Shift != 0)
24463 BC = DAG.getNode(AArch64ISD::VLSHR, DL, VT, BC,
24464 DAG.getTargetConstant(Shift, DL, MVT::i32));
24465 return DAG.getNode(ISD::AND, DL, VT, BC, DAG.getConstant(Mask, DL, VT));
24466}
24467
24468// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
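// Sketch of the idea (illustrative only): for (v8i8 sext(v8i1 bitcast(i8 X))),
// X is broadcast to all eight lanes, each lane is ANDed with its bit mask
// (1, 2, 4, ..., 128), compared for equality against that mask, and the
// resulting all-ones/all-zeros lanes form the sign-extended value.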
24469static SDValue combineToExtendBoolVectorInReg(
24470 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
24471 TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget &Subtarget) {
24472 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
24473 Opcode != ISD::ANY_EXTEND)
24474 return SDValue();
24475 if (!DCI.isBeforeLegalizeOps())
24476 return SDValue();
24477 if (!Subtarget.hasNEON())
24478 return SDValue();
24479
24480 EVT SVT = VT.getScalarType();
24481 EVT InSVT = N0.getValueType().getScalarType();
24482 unsigned EltSizeInBits = SVT.getSizeInBits();
24483
24484 // Input type must be extending a bool vector (bit-casted from a scalar
24485 // integer) to legal integer types.
24486 if (!VT.isVector())
24487 return SDValue();
24488 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
24489 return SDValue();
24490 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
24491 return SDValue();
24492
24493 SDValue N00 = N0.getOperand(0);
24494 EVT SclVT = N00.getValueType();
24495 if (!SclVT.isScalarInteger())
24496 return SDValue();
24497
24498 SDValue Vec;
24499 SmallVector<int> ShuffleMask;
24500 unsigned NumElts = VT.getVectorNumElements();
24501 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
24502
24503 // Broadcast the scalar integer to the vector elements.
24504 bool IsBE = DAG.getDataLayout().isBigEndian();
24505 if (NumElts > EltSizeInBits) {
24506 // If the scalar integer is greater than the vector element size, then we
24507 // must split it down into sub-sections for broadcasting. For example:
24508 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
24509 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
24510 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
24511 unsigned Scale = NumElts / EltSizeInBits;
24512 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
24513 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
24514 Vec = DAG.getBitcast(VT, Vec);
24515
24516 for (unsigned I = 0; I != Scale; ++I)
24517 ShuffleMask.append(EltSizeInBits, (int)I);
24518
24519 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
24520 } else {
24521 // For smaller scalar integers, we can simply any-extend it to the vector
24522 // element size (we don't care about the upper bits) and broadcast it to all
24523 // elements.
24524 Vec = DAG.getSplat(VT, DL, DAG.getAnyExtOrTrunc(N00, DL, SVT));
24525 }
24526
24527 // Now, mask the relevant bit in each element.
24528 SmallVector<SDValue> Bits;
24529 for (unsigned I = 0; I != NumElts; ++I) {
24530 unsigned ScalarBit = IsBE ? (NumElts - 1 - I) : I;
24531 int BitIdx = ScalarBit % EltSizeInBits;
24532 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
24533 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
24534 }
24535 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
24536 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
24537
24538 // Compare against the bitmask and extend the result.
24539 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
24540 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
24541 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
24542
24543 // For SEXT, this is now done; otherwise shift the result down for
24544 // zero-extension.
24545 if (Opcode == ISD::SIGN_EXTEND)
24546 return Vec;
24547 return DAG.getNode(ISD::SRL, DL, VT, Vec,
24548 DAG.getConstant(EltSizeInBits - 1, DL, VT));
24549}
24550
24551// Combine:
24552// ext(duplane(insert_subvector(undef, trunc(X), 0), idx))
24553// Into:
24554// duplane(X, idx)
24555// This eliminates XTN/SSHLL sequences when splatting from boolean vectors.
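// An illustrative case: splatting lane 2 of a v4i32 X that was truncated to
// v4i16 and then sign-extended back to v4i32 needs no XTN/SSHLL pair; as long
// as X already carries the required sign bits, DUPLANE32 can broadcast lane 2
// of X directly.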
24557 SDValue Dup = N->getOperand(0);
24558 unsigned DupOpc = Dup.getOpcode();
24559 if (!Dup->hasOneUse() ||
24560 (DupOpc != AArch64ISD::DUPLANE8 && DupOpc != AArch64ISD::DUPLANE16 &&
24561 DupOpc != AArch64ISD::DUPLANE32))
24562 return SDValue();
24563
24564 SDValue Insert = Dup.getOperand(0);
24565 if (!Insert.hasOneUse() || Insert.getOpcode() != ISD::INSERT_SUBVECTOR ||
24566 !Insert.getOperand(0).isUndef() || !isNullConstant(Insert.getOperand(2)))
24567 return SDValue();
24568
24569 SDValue Trunc = Insert.getOperand(1);
24570 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
24571 return SDValue();
24572
24573 SDValue Src = Trunc.getOperand(0);
24574 EVT SrcVT = Src.getValueType();
24575 EVT DstVT = N->getValueType(0);
24576 // Without VLS 256+, DUPLANE requires 128-bit inputs.
24577 if (SrcVT != DstVT || !SrcVT.is128BitVector())
24578 return SDValue();
24579
24580 // Verify that Src is already sign/zero-extended from the truncated bit width.
24581 EVT TruncVT = Trunc.getValueType();
24582 unsigned SrcBits = SrcVT.getScalarSizeInBits();
24583 unsigned TruncBits = TruncVT.getScalarSizeInBits();
24584 if (N->getOpcode() == ISD::SIGN_EXTEND) {
24585 if (DAG.ComputeNumSignBits(Src) <= SrcBits - TruncBits)
24586 return SDValue();
24587 } else if (N->getOpcode() == ISD::ZERO_EXTEND) {
24588 APInt Mask = APInt::getHighBitsSet(SrcBits, SrcBits - TruncBits);
24589 if (!DAG.MaskedValueIsZero(Src, Mask))
24590 return SDValue();
24591 } else {
24592 assert(N->getOpcode() == ISD::ANY_EXTEND);
24593 }
24594
24595 unsigned NewDupOpc;
24596 switch (SrcVT.getScalarSizeInBits()) {
24597 case 16:
24598 NewDupOpc = AArch64ISD::DUPLANE16;
24599 break;
24600 case 32:
24601 NewDupOpc = AArch64ISD::DUPLANE32;
24602 break;
24603 case 64:
24604 NewDupOpc = AArch64ISD::DUPLANE64;
24605 break;
24606 default:
24607 return SDValue();
24608 }
24609
24610 return DAG.getNode(NewDupOpc, SDLoc(N), DstVT, Src, Dup.getOperand(1));
24611}
24612
24613static SDValue performExtendCombine(SDNode *N,
24614 TargetLowering::DAGCombinerInfo &DCI,
24615 SelectionDAG &DAG,
24616 const AArch64Subtarget *Subtarget) {
24617 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
24618 // we can convert that DUP into another extract_high (of a bigger DUP), which
24619 // helps the backend to decide that an sabdl2 would be useful, saving a real
24620 // extract_high operation.
24621 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
24622 N->getOperand(0).getValueType().is64BitVector() &&
24623 (N->getOperand(0).getOpcode() == ISD::ABDU ||
24624 N->getOperand(0).getOpcode() == ISD::ABDS)) {
24625 SDNode *ABDNode = N->getOperand(0).getNode();
24626 SDValue NewABD =
24627 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
24628 if (!NewABD.getNode())
24629 return SDValue();
24630
24631 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
24632 }
24633
24634 if (SDValue R = performZExtDeinterleaveShuffleCombine(N, DAG))
24635 return R;
24636 if (SDValue R = performZExtUZPCombine(N, DAG))
24637 return R;
24638
24639 SDLoc dl(N);
24640 SDValue N0 = N->getOperand(0);
24641 EVT VT = N->getValueType(0);
24642 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
24643 DAG, DCI, *Subtarget))
24644 return V;
24645
24646 if (N->getValueType(0).isFixedLengthVector() &&
24647 N->getOpcode() == ISD::SIGN_EXTEND &&
24648 N->getOperand(0)->getOpcode() == ISD::SETCC)
24649 return performSignExtendSetCCCombine(N, DCI, DAG);
24650
24651 // If we see ({any,zero}_extend (bswap ...)) with bswap returning an i16, we
24652 // can replace this pattern with (rev16 ({any,zero}_extend ...)). This saves
24653 // a machine instruction compared to (lsr (rev ...)) or (and (rev16 ..)),
24654 // which is what this pattern would otherwise be lowered to.
24655 // For any_extend: the top half of the result is unused, so rev16 is correct.
24656 // For zero_extend: rev16 preserves the zero upper half when the input is
24657 // zero-extended (e.g. from LDRHHui), because it swaps bytes within each
24658 // 16-bit half independently.
24659 // Only apply this optimisation if extending to i32 or i64, because this type
24660 // will become the input type to REV16 in the new pattern, so must be a
24661 // legitimate REV16 input type.
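  // An illustrative instance: i32 (zext (bswap (i16 value))) becomes
  // i32 (rev16 (zext value)); the upper half stays zero and the two low bytes
  // are swapped, matching the bswap+extend result with one instruction.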
24662 SDValue Bswap = N->getOperand(0);
24663 if ((N->getOpcode() == ISD::ANY_EXTEND ||
24664 N->getOpcode() == ISD::ZERO_EXTEND) &&
24665 Bswap.getOpcode() == ISD::BSWAP && Bswap.getValueType() == MVT::i16 &&
24666 (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
24667 SDLoc DL(N);
24668 SDValue NewExtend = DAG.getNode(N->getOpcode(), DL, N->getValueType(0),
24669 Bswap->getOperand(0));
24670 return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
24671 NewExtend);
24672 }
24673
24675 return R;
24676
24677 return SDValue();
24678}
24679
24680static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
24681 SDValue SplatVal, unsigned NumVecElts) {
24682 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
24683 Align OrigAlignment = St.getAlign();
24684 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
24685
24686 // Create scalar stores. This is at least as good as the code sequence for a
24687 // split unaligned store which is a dup.s, ext.b, and two stores.
24688 // Most of the time the three stores should be replaced by store pair
24689 // instructions (stp).
24690 SDLoc DL(&St);
24691 SDValue BasePtr = St.getBasePtr();
24692 uint64_t BaseOffset = 0;
24693
24694 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
24695 SDValue NewST1 =
24696 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
24697 OrigAlignment, St.getMemOperand()->getFlags());
24698
24699 // As this is in ISel, we will not merge this add, which may degrade results.
24700 if (BasePtr->getOpcode() == ISD::ADD &&
24701 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
24702 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
24703 BasePtr = BasePtr->getOperand(0);
24704 }
24705
24706 unsigned Offset = EltOffset;
24707 while (--NumVecElts) {
24708 Align Alignment = commonAlignment(OrigAlignment, Offset);
24709 SDValue OffsetPtr =
24710 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
24711 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
24712 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
24713 PtrInfo.getWithOffset(Offset), Alignment,
24714 St.getMemOperand()->getFlags());
24715 Offset += EltOffset;
24716 }
24717 return NewST1;
24718}
24719
24720// Returns an SVE type that ContentTy can be trivially sign or zero extended
24721// into.
24722static MVT getSVEContainerType(EVT ContentTy) {
24723 assert(ContentTy.isSimple() && "No SVE containers for extended types");
24724
24725 switch (ContentTy.getSimpleVT().SimpleTy) {
24726 default:
24727 llvm_unreachable("No known SVE container for this MVT type");
24728 case MVT::nxv2i8:
24729 case MVT::nxv2i16:
24730 case MVT::nxv2i32:
24731 case MVT::nxv2i64:
24732 case MVT::nxv2f32:
24733 case MVT::nxv2f64:
24734 return MVT::nxv2i64;
24735 case MVT::nxv4i8:
24736 case MVT::nxv4i16:
24737 case MVT::nxv4i32:
24738 case MVT::nxv4f32:
24739 return MVT::nxv4i32;
24740 case MVT::nxv8i8:
24741 case MVT::nxv8i16:
24742 case MVT::nxv8f16:
24743 case MVT::nxv8bf16:
24744 return MVT::nxv8i16;
24745 case MVT::nxv16i8:
24746 return MVT::nxv16i8;
24747 }
24748}
24749
24750static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
24751 SDLoc DL(N);
24752 EVT VT = N->getValueType(0);
24753
24755 return SDValue();
24756
24757 EVT ContainerVT = VT;
24758 if (ContainerVT.isInteger())
24759 ContainerVT = getSVEContainerType(ContainerVT);
24760
24761 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
24762 SDValue Ops[] = { N->getOperand(0), // Chain
24763 N->getOperand(2), // Pg
24764 N->getOperand(3), // Base
24765 DAG.getValueType(VT) };
24766
24767 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
24768 SDValue LoadChain = SDValue(Load.getNode(), 1);
24769
24770 if (ContainerVT.isInteger() && (VT != ContainerVT))
24771 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
24772
24773 return DAG.getMergeValues({ Load, LoadChain }, DL);
24774}
24775
24776static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
24777 SDLoc DL(N);
24778 EVT VT = N->getValueType(0);
24779 EVT PtrTy = N->getOperand(3).getValueType();
24780
24781 EVT LoadVT = VT;
24782 if (VT.isFloatingPoint())
24783 LoadVT = VT.changeTypeToInteger();
24784
24785 auto *MINode = cast<MemIntrinsicSDNode>(N);
24786 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
24787 SDValue L =
24788 DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), MINode->getOperand(3),
24789 DAG.getPOISON(PtrTy), MINode->getOperand(2), PassThru,
24790 MINode->getMemoryVT(), MINode->getMemOperand(),
24791 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
24792
24793 if (VT.isFloatingPoint()) {
24794 SDValue Ops[] = {DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1)};
24795 return DAG.getMergeValues(Ops, DL);
24796 }
24797
24798 return L;
24799}
24800
24801template <unsigned Opcode>
24802static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
24803 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
24804 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
24805 "Unsupported opcode.");
24806 SDLoc DL(N);
24807 EVT VT = N->getValueType(0);
24808
24809 EVT LoadVT = VT;
24810 if (VT.isFloatingPoint())
24811 LoadVT = VT.changeTypeToInteger();
24812
24813 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
24814 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
24815 SDValue LoadChain = SDValue(Load.getNode(), 1);
24816
24817 if (VT.isFloatingPoint())
24818 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
24819
24820 return DAG.getMergeValues({Load, LoadChain}, DL);
24821}
24822
24823static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
24824 SDLoc DL(N);
24825 SDValue Data = N->getOperand(2);
24826 EVT DataVT = Data.getValueType();
24827 EVT HwSrcVt = getSVEContainerType(DataVT);
24828 SDValue InputVT = DAG.getValueType(DataVT);
24829
24830 if (DataVT.isFloatingPoint())
24831 InputVT = DAG.getValueType(HwSrcVt);
24832
24833 SDValue SrcNew;
24834 if (Data.getValueType().isFloatingPoint())
24835 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
24836 else
24837 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
24838
24839 SDValue Ops[] = { N->getOperand(0), // Chain
24840 SrcNew,
24841 N->getOperand(4), // Base
24842 N->getOperand(3), // Pg
24843 InputVT
24844 };
24845
24846 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
24847}
24848
24849static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
24850 SDLoc DL(N);
24851
24852 SDValue Data = N->getOperand(2);
24853 EVT DataVT = Data.getValueType();
24854 EVT PtrTy = N->getOperand(4).getValueType();
24855
24856 if (DataVT.isFloatingPoint())
24857 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
24858
24859 auto *MINode = cast<MemIntrinsicSDNode>(N);
24860 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
24861 DAG.getPOISON(PtrTy), MINode->getOperand(3),
24862 MINode->getMemoryVT(), MINode->getMemOperand(),
24863 ISD::UNINDEXED, false, false);
24864}
24865
24866/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
24867/// load store optimizer pass will merge them to store pair stores. This should
24868/// be better than a movi to create the vector zero followed by a vector store
24869 /// if the zero constant is not re-used, since one instruction and one register
24870/// live range will be removed.
24871///
24872/// For example, the final generated code should be:
24873///
24874/// stp xzr, xzr, [x0]
24875///
24876/// instead of:
24877///
24878/// movi v0.2d, #0
24879/// str q0, [x0]
24880///
24881static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24882 SDValue StVal = St.getValue();
24883 EVT VT = StVal.getValueType();
24884
24885 // Avoid scalarizing zero splat stores for scalable vectors.
24886 if (VT.isScalableVector())
24887 return SDValue();
24888
24889 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
24890 // 2, 3 or 4 i32 elements.
24891 int NumVecElts = VT.getVectorNumElements();
24892 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
24893 VT.getVectorElementType().getSizeInBits() == 64) ||
24894 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
24895 VT.getVectorElementType().getSizeInBits() == 32)))
24896 return SDValue();
24897
24898 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
24899 return SDValue();
24900
24901 // If the zero constant has more than one use then the vector store could be
24902 // better since the constant mov will be amortized and stp q instructions
24903 // should be able to be formed.
24904 if (!StVal.hasOneUse())
24905 return SDValue();
24906
24907 // If the store is truncating then it's going down to i16 or smaller, which
24908 // means it can be implemented in a single store anyway.
24909 if (St.isTruncatingStore())
24910 return SDValue();
24911
24912 // If the immediate offset of the address operand is too large for the stp
24913 // instruction, then bail out.
24914 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
24915 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
24916 if (Offset < -512 || Offset > 504)
24917 return SDValue();
24918 }
24919
24920 for (int I = 0; I < NumVecElts; ++I) {
24921 SDValue EltVal = StVal.getOperand(I);
24922 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
24923 return SDValue();
24924 }
24925
24926 // Use a CopyFromReg WZR/XZR here to prevent
24927 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
24928 SDLoc DL(&St);
24929 unsigned ZeroReg;
24930 EVT ZeroVT;
24931 if (VT.getVectorElementType().getSizeInBits() == 32) {
24932 ZeroReg = AArch64::WZR;
24933 ZeroVT = MVT::i32;
24934 } else {
24935 ZeroReg = AArch64::XZR;
24936 ZeroVT = MVT::i64;
24937 }
24938 SDValue SplatVal =
24939 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
24940 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24941}
24942
24943/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
24944/// value. The load store optimizer pass will merge them to store pair stores.
24945/// This has better performance than a splat of the scalar followed by a split
24946/// vector store. Even if the stores are not merged it is four stores vs a dup,
24947/// followed by an ext.b and two stores.
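/// As a rough illustration (register names are hypothetical), splatting w1
/// into a v4i32 store at x0 becomes four scalar stores of w1, which the
/// load/store optimizer can then pair into:
///
///   stp w1, w1, [x0]
///   stp w1, w1, [x0, #8]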
24948static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
24949 SDValue StVal = St.getValue();
24950 EVT VT = StVal.getValueType();
24951
24952 // Don't replace floating point stores; they possibly won't be transformed to
24953 // stp because of the store pair suppress pass.
24954 if (VT.isFloatingPoint())
24955 return SDValue();
24956
24957 // We can express a splat as store pair(s) for 2 or 4 elements.
24958 unsigned NumVecElts = VT.getVectorNumElements();
24959 if (NumVecElts != 4 && NumVecElts != 2)
24960 return SDValue();
24961
24962 // If the store is truncating then it's going down to i16 or smaller, which
24963 // means it can be implemented in a single store anyway.
24964 if (St.isTruncatingStore())
24965 return SDValue();
24966
24967 // Check that this is a splat.
24968 // Make sure that each of the relevant vector element locations are inserted
24969 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
24970 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
24971 SDValue SplatVal;
24972 for (unsigned I = 0; I < NumVecElts; ++I) {
24973 // Check for insert vector elements.
24974 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
24975 return SDValue();
24976
24977 // Check that same value is inserted at each vector element.
24978 if (I == 0)
24979 SplatVal = StVal.getOperand(1);
24980 else if (StVal.getOperand(1) != SplatVal)
24981 return SDValue();
24982
24983 // Check insert element index.
24984 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
24985 if (!CIndex)
24986 return SDValue();
24987 uint64_t IndexVal = CIndex->getZExtValue();
24988 if (IndexVal >= NumVecElts)
24989 return SDValue();
24990 IndexNotInserted.reset(IndexVal);
24991
24992 StVal = StVal.getOperand(0);
24993 }
24994 // Check that all vector element locations were inserted to.
24995 if (IndexNotInserted.any())
24996 return SDValue();
24997
24998 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
24999}
25000
25001static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
25002 SelectionDAG &DAG,
25003 const AArch64Subtarget *Subtarget) {
25004
25005 StoreSDNode *S = cast<StoreSDNode>(N);
25006 if (S->isVolatile() || S->isIndexed())
25007 return SDValue();
25008
25009 SDValue StVal = S->getValue();
25010 EVT VT = StVal.getValueType();
25011
25012 if (!VT.isFixedLengthVector())
25013 return SDValue();
25014
25015 // If we get a splat of zeros, convert this vector store to a store of
25016 // scalars. They will be merged into store pairs of xzr thereby removing one
25017 // instruction and one register.
25018 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
25019 return ReplacedZeroSplat;
25020
25021 // FIXME: The logic for deciding if an unaligned store should be split should
25022 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
25023 // a call to that function here.
25024
25025 if (!Subtarget->isMisaligned128StoreSlow())
25026 return SDValue();
25027
25028 // Don't split at -Oz.
25030 return SDValue();
25031
25032 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
25033 // those up regresses performance on micro-benchmarks and olden/bh.
25034 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
25035 return SDValue();
25036
25037 // Split unaligned 16B stores. They are terrible for performance.
25038 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
25039 // extensions can use this to mark that it does not want splitting to happen
25040 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
25041 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
25042 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
25043 S->getAlign() <= Align(2))
25044 return SDValue();
25045
25046 // If we get a splat of a scalar, convert this vector store to a store of
25047 // scalars. They will be merged into store pairs thereby removing two
25048 // instructions.
25049 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
25050 return ReplacedSplat;
25051
25052 SDLoc DL(S);
25053
25054 // Split VT into two.
25055 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
25056 unsigned NumElts = HalfVT.getVectorNumElements();
25057 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
25058 DAG.getConstant(0, DL, MVT::i64));
25059 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
25060 DAG.getConstant(NumElts, DL, MVT::i64));
25061 SDValue BasePtr = S->getBasePtr();
25062 SDValue NewST1 =
25063 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
25064 S->getAlign(), S->getMemOperand()->getFlags());
25065 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
25066 DAG.getConstant(8, DL, MVT::i64));
25067 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
25068 S->getPointerInfo(), S->getAlign(),
25069 S->getMemOperand()->getFlags());
25070}
25071
25072static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
25073 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
25074
25075 // splice(pg, op1, undef) -> op1
25076 if (N->getOperand(2).isUndef())
25077 return N->getOperand(1);
25078
25079 return SDValue();
25080}
25081
25082static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
25083 const AArch64Subtarget *Subtarget) {
25084 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
25085 N->getOpcode() == AArch64ISD::UUNPKLO) &&
25086 "Unexpected Opcode!");
25087
25088 // uunpklo/hi undef -> undef
25089 if (N->getOperand(0).isUndef())
25090 return DAG.getUNDEF(N->getValueType(0));
25091
25092 // If this is a masked load followed by an UUNPKLO, fold this into a masked
25093 // extending load. We can do this even if this is already a masked
25094 // {z,}extload.
25095 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
25096 N->getOpcode() == AArch64ISD::UUNPKLO) {
25097 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
25098 SDValue Mask = MLD->getMask();
25099 SDLoc DL(N);
25100
25101 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
25102 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
25103 (MLD->getPassThru()->isUndef() ||
25104 isZerosVector(MLD->getPassThru().getNode()))) {
25105 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
25106 unsigned PgPattern = Mask->getConstantOperandVal(0);
25107 EVT VT = N->getValueType(0);
25108
25109 // Ensure we can double the size of the predicate pattern
25110 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
25111 if (NumElts &&
25112 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
25113 Mask = getPTrue(DAG, DL,
25114 VT.changeVectorElementType(*DAG.getContext(), MVT::i1),
25115 PgPattern);
25116 SDValue PassThru = DAG.getConstant(0, DL, VT);
25117 SDValue NewLoad = DAG.getMaskedLoad(
25118 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
25119 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
25120 MLD->getAddressingMode(), ISD::ZEXTLOAD);
25121
25122 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
25123
25124 return NewLoad;
25125 }
25126 }
25127 }
25128
25129 return SDValue();
25130}
25131
25133 if (N->getOpcode() != AArch64ISD::UZP1)
25134 return false;
25135 SDValue Op0 = N->getOperand(0);
25136 EVT SrcVT = Op0->getValueType(0);
25137 EVT DstVT = N->getValueType(0);
25138 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv16i8) ||
25139 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv8i16) ||
25140 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv4i32);
25141}
25142
25143// Try to combine rounding shifts where the operands come from an extend, and
25144// the result is truncated and combined into one vector.
25145// uzp1(rshrnb(uunpklo(X),C), rshrnb(uunpkhi(X), C)) -> urshr(X, C)
25146static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG) {
25147 assert(N->getOpcode() == AArch64ISD::UZP1 && "Only UZP1 expected.");
25148 SDValue Op0 = N->getOperand(0);
25149 SDValue Op1 = N->getOperand(1);
25150 EVT ResVT = N->getValueType(0);
25151
25152 unsigned RshOpc = Op0.getOpcode();
25153 if (RshOpc != AArch64ISD::RSHRNB_I)
25154 return SDValue();
25155
25156 // Same op code and imm value?
25157 SDValue ShiftValue = Op0.getOperand(1);
25158 if (RshOpc != Op1.getOpcode() || ShiftValue != Op1.getOperand(1))
25159 return SDValue();
25160
25161 // Same unextended operand value?
25162 SDValue Lo = Op0.getOperand(0);
25163 SDValue Hi = Op1.getOperand(0);
25164 if (Lo.getOpcode() != AArch64ISD::UUNPKLO &&
25165 Hi.getOpcode() != AArch64ISD::UUNPKHI)
25166 return SDValue();
25167 SDValue OrigArg = Lo.getOperand(0);
25168 if (OrigArg != Hi.getOperand(0))
25169 return SDValue();
25170
25171 SDLoc DL(N);
25172 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, ResVT,
25173 getPredicateForVector(DAG, DL, ResVT), OrigArg,
25174 ShiftValue);
25175}
25176
25177// Try to simplify:
25178// t1 = nxv8i16 add(X, 1 << (ShiftValue - 1))
25179// t2 = nxv8i16 srl(t1, ShiftValue)
25180// to
25181// t1 = nxv8i16 rshrnb(X, shiftvalue).
25182// rshrnb will zero the top half bits of each element. Therefore, this combine
25183// should only be performed when a following instruction with the rshrnb
25184// as an operand does not care about the top half of each element. For example,
25185// a uzp1 or a truncating store.
25186static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG,
25187 const AArch64Subtarget *Subtarget) {
25188 EVT VT = Srl->getValueType(0);
25189 if (!VT.isScalableVector() || !Subtarget->hasSVE2())
25190 return SDValue();
25191
25192 EVT ResVT;
25193 if (VT == MVT::nxv8i16)
25194 ResVT = MVT::nxv16i8;
25195 else if (VT == MVT::nxv4i32)
25196 ResVT = MVT::nxv8i16;
25197 else if (VT == MVT::nxv2i64)
25198 ResVT = MVT::nxv4i32;
25199 else
25200 return SDValue();
25201
25202 SDLoc DL(Srl);
25203 unsigned ShiftValue;
25204 SDValue RShOperand;
25205 if (!canLowerSRLToRoundingShiftForVT(Srl, ResVT, DAG, ShiftValue, RShOperand))
25206 return SDValue();
25207 SDValue Rshrnb = DAG.getNode(
25208 AArch64ISD::RSHRNB_I, DL, ResVT,
25209 {RShOperand, DAG.getTargetConstant(ShiftValue, DL, MVT::i32)});
25210 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Rshrnb);
25211}
25212
25213static SDValue isNVCastToHalfWidthElements(SDValue V) {
25214 if (V.getOpcode() != AArch64ISD::NVCAST)
25215 return SDValue();
25216
25217 SDValue Op = V.getOperand(0);
25218 if (!Op.getValueType().isVector() ||
25219 V.getValueType().getVectorElementCount() !=
25220 Op.getValueType().getVectorElementCount() * 2)
25221 return SDValue();
25222
25223 return Op;
25224}
25225
25226static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
25227 const AArch64Subtarget *Subtarget) {
25228 SDLoc DL(N);
25229 SDValue Op0 = N->getOperand(0);
25230 SDValue Op1 = N->getOperand(1);
25231 EVT ResVT = N->getValueType(0);
25232
25233 // uzp(extract_lo(x), extract_hi(x)) -> extract_lo(uzp x, x)
25234 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
25235 Op1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
25236 Op0.getOperand(0) == Op1.getOperand(0)) {
25237
25238 SDValue SourceVec = Op0.getOperand(0);
25239 uint64_t ExtIdx0 = Op0.getConstantOperandVal(1);
25240 uint64_t ExtIdx1 = Op1.getConstantOperandVal(1);
25241 uint64_t NumElements = SourceVec.getValueType().getVectorMinNumElements();
25242 if (ExtIdx0 == 0 && ExtIdx1 == NumElements / 2) {
25243 EVT OpVT = Op0.getOperand(1).getValueType();
25244 EVT WidenedResVT = ResVT.getDoubleNumVectorElementsVT(*DAG.getContext());
25245 SDValue Uzp = DAG.getNode(N->getOpcode(), DL, WidenedResVT, SourceVec,
25246 DAG.getPOISON(WidenedResVT));
25247 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Uzp,
25248 DAG.getConstant(0, DL, OpVT));
25249 }
25250 }
25251
25252 // Following optimizations only work with uzp1.
25253 if (N->getOpcode() == AArch64ISD::UZP2)
25254 return SDValue();
25255
25256 if (SDValue Urshr = tryCombineExtendRShTrunc(N, DAG))
25257 return Urshr;
25258
25259 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
25260 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
25261 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
25262 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Rshrnb, Op1);
25263 }
25264 }
25265
25266 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
25267 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(PreCast, DAG, Subtarget)) {
25268 Rshrnb = DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Rshrnb);
25269 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Rshrnb);
25270 }
25271 }
25272
25273 // uzp1<ty>(nvcast(unpklo(uzp1<ty>(x, y))), z) => uzp1<ty>(x, z)
25274 if (SDValue PreCast = isNVCastToHalfWidthElements(Op0)) {
25275 if (PreCast.getOpcode() == AArch64ISD::UUNPKLO) {
25276 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
25277 SDValue X = PreCast.getOperand(0).getOperand(0);
25278 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
25279 }
25280 }
25281 }
25282
25283 // uzp1<ty>(x, nvcast(unpkhi(uzp1<ty>(y, z)))) => uzp1<ty>(x, z)
25284 if (SDValue PreCast = isNVCastToHalfWidthElements(Op1)) {
25285 if (PreCast.getOpcode() == AArch64ISD::UUNPKHI) {
25286 if (PreCast.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
25287 SDValue Z = PreCast.getOperand(0).getOperand(1);
25288 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
25289 }
25290 }
25291 }
25292
25293 // These optimizations only work on little endian.
25294 if (!DAG.getDataLayout().isLittleEndian())
25295 return SDValue();
25296
25297 // uzp1(x, undef) -> concat(truncate(x), undef)
25298 if (Op1.isUndef()) {
25299 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
25300 switch (ResVT.getSimpleVT().SimpleTy) {
25301 default:
25302 break;
25303 case MVT::v16i8:
25304 BCVT = MVT::v8i16;
25305 HalfVT = MVT::v8i8;
25306 break;
25307 case MVT::v8i16:
25308 BCVT = MVT::v4i32;
25309 HalfVT = MVT::v4i16;
25310 break;
25311 case MVT::v4i32:
25312 BCVT = MVT::v2i64;
25313 HalfVT = MVT::v2i32;
25314 break;
25315 }
25316 if (BCVT != MVT::Other) {
25317 SDValue BC = DAG.getBitcast(BCVT, Op0);
25318 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
25319 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
25320 DAG.getPOISON(HalfVT));
25321 }
25322 }
25323
25324 // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
25325 // Example:
25326 // nxv4i32 = uzp1 bitcast(nxv4i32 x to nxv2i64), bitcast(nxv4i32 y to nxv2i64)
25327 // to
25328 // nxv4i32 = uzp1 nxv4i32 x, nxv4i32 y
25330 Op0.getOpcode() == ISD::BITCAST && Op1.getOpcode() == ISD::BITCAST) {
25331 if (Op0.getOperand(0).getValueType() == Op1.getOperand(0).getValueType()) {
25332 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0.getOperand(0),
25333 Op1.getOperand(0));
25334 }
25335 }
25336
25337 if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
25338 return SDValue();
25339
25340 SDValue SourceOp0 = peekThroughBitcasts(Op0);
25341 SDValue SourceOp1 = peekThroughBitcasts(Op1);
25342
25343 // truncating uzp1(x, y) -> xtn(concat (x, y))
25344 if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
25345 EVT Op0Ty = SourceOp0.getValueType();
25346 if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
25347 (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
25348 SDValue Concat =
25351 SourceOp0, SourceOp1);
25352 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
25353 }
25354 }
25355
25356 // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
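  // For example, with x,y : v4i32 and a v4i16 result, uzp1(xtn x, xtn y)
  // selects the truncated lanes [x0, x2, y0, y2]. The rewrite below bitcasts
  // x and y to v8i16, forms a single UZP1 that picks their low halves, then
  // bitcasts and truncates to recover the same four lanes without the two
  // intermediate XTNs.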
25357 if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
25358 SourceOp1.getOpcode() != ISD::TRUNCATE)
25359 return SDValue();
25360 SourceOp0 = SourceOp0.getOperand(0);
25361 SourceOp1 = SourceOp1.getOperand(0);
25362
25363 if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
25364 !SourceOp0.getValueType().isSimple())
25365 return SDValue();
25366
25367 EVT ResultTy;
25368
25369 switch (SourceOp0.getSimpleValueType().SimpleTy) {
25370 case MVT::v2i64:
25371 ResultTy = MVT::v4i32;
25372 break;
25373 case MVT::v4i32:
25374 ResultTy = MVT::v8i16;
25375 break;
25376 case MVT::v8i16:
25377 ResultTy = MVT::v16i8;
25378 break;
25379 default:
25380 return SDValue();
25381 }
25382
25383 SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
25384 SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
25385 SDValue UzpResult =
25386 DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
25387
25388 EVT BitcastResultTy;
25389
25390 switch (ResVT.getSimpleVT().SimpleTy) {
25391 case MVT::v2i32:
25392 BitcastResultTy = MVT::v2i64;
25393 break;
25394 case MVT::v4i16:
25395 BitcastResultTy = MVT::v4i32;
25396 break;
25397 case MVT::v8i8:
25398 BitcastResultTy = MVT::v8i16;
25399 break;
25400 default:
25401 llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
25402 }
25403
25404 return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
25405 DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
25406}
25407
25409 unsigned Opc = N->getOpcode();
25410
25411 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
25412 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
25413 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
25414 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
25415 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
25416 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
25417 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
25418 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
25419
25420 SDLoc DL(N);
25421 SDValue Chain = N->getOperand(0);
25422 SDValue Pg = N->getOperand(1);
25423 SDValue Base = N->getOperand(2);
25424 SDValue Offset = N->getOperand(3);
25425 SDValue Ty = N->getOperand(4);
25426
25427 EVT ResVT = N->getValueType(0);
25428
25429 const auto OffsetOpc = Offset.getOpcode();
25430 const bool OffsetIsZExt =
25431 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
25432 const bool OffsetIsSExt =
25433 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
25434
25435 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
25436 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
25437 SDValue ExtPg = Offset.getOperand(0);
25438 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
25439 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
25440
25441 // If the predicate for the sign- or zero-extended offset is the
25442 // same as the predicate used for this load and the sign-/zero-extension
25443 // was from 32 bits, the extension can be folded into the gather addressing mode.
25444 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
25445 SDValue UnextendedOffset = Offset.getOperand(1);
25446
25447 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
25448 if (Signed)
25449 NewOpc = getSignExtendedGatherOpcode(NewOpc);
25450
25451 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
25452 {Chain, Pg, Base, UnextendedOffset, Ty});
25453 }
25454 }
25455
25456 return SDValue();
25457}
25458
25459/// Optimize a vector shift instruction and its operand if shifted out
25460/// bits are not used.
25462 const AArch64TargetLowering &TLI,
25464 assert(N->getOpcode() == AArch64ISD::VASHR ||
25465 N->getOpcode() == AArch64ISD::VLSHR);
25466
25467 SDValue Op = N->getOperand(0);
25468 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
25469
25470 unsigned ShiftImm = N->getConstantOperandVal(1);
25471 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
25472
25473 // Remove sign_extend_inreg (ashr(shl(x))) based on the number of sign bits.
25474 if (N->getOpcode() == AArch64ISD::VASHR &&
25475 Op.getOpcode() == AArch64ISD::VSHL &&
25476 N->getOperand(1) == Op.getOperand(1))
25477 if (DCI.DAG.ComputeNumSignBits(Op.getOperand(0)) > ShiftImm)
25478 return Op.getOperand(0);
25479
25480 // If the shift is exact, the shifted out bits matter.
25481 if (N->getFlags().hasExact())
25482 return SDValue();
25483
25484 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
25485 APInt DemandedMask = ~ShiftedOutBits;
25486
25487 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
25488 return SDValue(N, 0);
25489
25490 return SDValue();
25491}
25492
25494 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
25495 // This transform works in partnership with performSetCCPunpkCombine to
25496 // remove unnecessary transfer of predicates into standard registers and back
25497 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
25498 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
25499 MVT::i1) {
25500 SDValue CC = N->getOperand(0)->getOperand(0);
25501 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
25502 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
25503 DAG.getVectorIdxConstant(0, SDLoc(N)));
25504 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
25505 }
25506
25507 return SDValue();
25508}
25509
25510/// Target-specific DAG combine function for post-increment LD1 (lane) and
25511/// post-increment LD1R.
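/// For example (illustrative), "ld1r { v0.4s }, [x0]" whose address is also
/// advanced by "add x0, x0, #4" can be merged into the post-incremented form
/// "ld1r { v0.4s }, [x0], #4", provided the increment equals the element size.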
25514 bool IsLaneOp) {
25515 if (DCI.isBeforeLegalizeOps())
25516 return SDValue();
25517
25518 SelectionDAG &DAG = DCI.DAG;
25519 EVT VT = N->getValueType(0);
25520
25521 if (!VT.is128BitVector() && !VT.is64BitVector())
25522 return SDValue();
25523
25524 // If it is not a LOAD, we cannot do this combine.
25525 unsigned LoadIdx = IsLaneOp ? 1 : 0;
25526 LoadSDNode *LD = dyn_cast<LoadSDNode>(N->getOperand(LoadIdx).getNode());
25527 if (!LD)
25528 return SDValue();
25529
25530 // If the Generic combiner already helped form a pre- or post-indexed load,
25531 // skip forming one here.
25532 if (LD->isIndexed())
25533 return SDValue();
25534
25535 // The vector lane must be a constant in the LD1LANE opcode.
25536 SDValue Lane;
25537 if (IsLaneOp) {
25538 Lane = N->getOperand(2);
25539 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
25540 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
25541 return SDValue();
25542 if (LaneC->getZExtValue() == 0 && isNullOrNullSplat(N->getOperand(0)))
25543 return SDValue();
25544 }
25545
25546 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
25547 EVT MemVT = LoadSDN->getMemoryVT();
25548 // Check if memory operand is the same type as the vector element.
25549 if (MemVT != VT.getVectorElementType())
25550 return SDValue();
25551
25552 // Check if there are other uses. If so, do not combine as it will introduce
25553 // an extra load.
25554 for (SDUse &U : LD->uses()) {
25555 if (U.getResNo() == 1) // Ignore uses of the chain result.
25556 continue;
25557 if (U.getUser() != N)
25558 return SDValue();
25559 }
25560
25561 // If there is one use and it can splat the value, prefer that operation.
25562 // TODO: This could be expanded to more operations if they reliably use the
25563 // index variants.
25564 if (N->hasOneUse()) {
25565 unsigned UseOpc = N->user_begin()->getOpcode();
25566 if (UseOpc == ISD::FMUL || UseOpc == ISD::FMA)
25567 return SDValue();
25568 }
25569
25570 SDValue Addr = LD->getOperand(1);
25571 SDValue Vector = N->getOperand(0);
25572 // Search for a use of the address operand that is an increment.
25573 for (SDUse &Use : Addr->uses()) {
25574 SDNode *User = Use.getUser();
25575 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
25576 continue;
25577
25578 // If the increment is a constant, it must match the memory ref size.
25579 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
25580 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
25581 uint32_t IncVal = CInc->getZExtValue();
25582 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
25583 if (IncVal != NumBytes)
25584 continue;
25585 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
25586 }
25587
25588 // To avoid constructing a cycle, make sure that neither the load nor the add
25589 // is a predecessor of the other or of the Vector.
25592 Visited.insert(Addr.getNode());
25593 Worklist.push_back(User);
25594 Worklist.push_back(LD);
25595 Worklist.push_back(Vector.getNode());
25596 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
25597 SDNode::hasPredecessorHelper(User, Visited, Worklist))
25598 continue;
25599
25601 Ops.push_back(LD->getOperand(0)); // Chain
25602 if (IsLaneOp) {
25603 Ops.push_back(Vector); // The vector to be inserted
25604 Ops.push_back(Lane); // The lane to be inserted in the vector
25605 }
25606 Ops.push_back(Addr);
25607 Ops.push_back(Inc);
25608
25609 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
25610 SDVTList SDTys = DAG.getVTList(Tys);
25611 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
25612 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
25613 MemVT,
25614 LoadSDN->getMemOperand());
25615
25616 // Update the uses.
25617 SDValue NewResults[] = {
25618 SDValue(LD, 0), // The result of load
25619 SDValue(UpdN.getNode(), 2) // Chain
25620 };
25621 DCI.CombineTo(LD, NewResults);
25622 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
25623 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
25624
25625 break;
25626 }
25627 return SDValue();
25628}
25629
25630/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
25631/// address translation.
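/// For example, masking an address with (and x, #0x00ffffffffffffff) before a
/// memory access is redundant under TBI, so SimplifyDemandedBits can strip the
/// mask. When MTE is enabled only the top four bits are ignored, which is why
/// fewer bits are excluded from the demanded mask below.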
25632static bool performTBISimplification(SDValue Addr,
25634 SelectionDAG &DAG) {
25635 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
25636 // If MTE is enabled, TBI only applies to the top 4 bits.
25637 // Both arm64 and arm64e processes on Darwin may run with MTE enabled.
25638 unsigned NumIgnoreBits =
25639 Subtarget.hasMTE() || Subtarget.isTargetDarwin() ? 4 : 8;
25640 APInt DemandedMask = APInt::getLowBitsSet(64, 64 - NumIgnoreBits);
25641 KnownBits Known;
25643 !DCI.isBeforeLegalizeOps());
25644 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25645 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
25646 DCI.CommitTargetLoweringOpt(TLO);
25647 return true;
25648 }
25649 return false;
25650}
25651
25652static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
25653 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
25654 "Expected STORE dag node in input!");
25655
25656 if (auto Store = dyn_cast<StoreSDNode>(N)) {
25657 if (!Store->isTruncatingStore() || Store->isIndexed())
25658 return SDValue();
25659 SDValue Ext = Store->getValue();
25660 auto ExtOpCode = Ext.getOpcode();
25661 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
25662 ExtOpCode != ISD::ANY_EXTEND)
25663 return SDValue();
25664 SDValue Orig = Ext->getOperand(0);
25665 if (Store->getMemoryVT() != Orig.getValueType())
25666 return SDValue();
25667 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
25668 Store->getBasePtr(), Store->getMemOperand());
25669 }
25670
25671 return SDValue();
25672}
25673
25674// A custom combine to lower load <3 x i8> as the more efficient sequence
25675// below:
25676// ldrb wX, [x0, #2]
25677// ldrh wY, [x0]
25678// orr wX, wY, wX, lsl #16
25679// fmov s0, wX
25680//
25681// Note that an alternative sequence with even fewer (although usually more
25682// complex/expensive) instructions would be:
25683// ld1r.4h { v0 }, [x0], #2
25684// ld1.b { v0 }[2], [x0]
25685//
25686// Generating this sequence unfortunately results in noticeably worse codegen
25687// for code that extends the loaded v3i8, due to legalization breaking vector
25688// shuffle detection in a way that is very difficult to work around.
25689// TODO: Revisit once v3i8 legalization has been improved in general.
25690static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
25691 EVT MemVT = LD->getMemoryVT();
25692 if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
25693 LD->getBaseAlign() >= 4)
25694 return SDValue();
25695
25696 SDLoc DL(LD);
25698 SDValue Chain = LD->getChain();
25699 SDValue BasePtr = LD->getBasePtr();
25700 MachineMemOperand *MMO = LD->getMemOperand();
25701 assert(LD->getOffset().isUndef() && "undef offset expected");
25702
25703 // Load 2 x i8, then 1 x i8.
25704 SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
25705 TypeSize Offset2 = TypeSize::getFixed(2);
25706 SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
25707 DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
25708 MF.getMachineMemOperand(MMO, 2, 1));
25709
25710 // Extend to i32.
25711 SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
25712 SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
25713
25714 // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
25715 SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
25716 DAG.getConstant(16, DL, MVT::i32));
25717 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
25718 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
25719
25720 // Extract v3i8 again.
25721 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
25722 DAG.getConstant(0, DL, MVT::i64));
25724 ISD::TokenFactor, DL, MVT::Other,
25725 {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
25726 return DAG.getMergeValues({Extract, TokenFactor}, DL);
25727}
25728
25729 // Perform TBI simplification if supported by the target, and try to break up
25730 // nontemporal loads larger than 256 bits for odd types so that 256-bit LDNP
25731 // (Q-register pair) load instructions can be selected.
25732static SDValue performLOADCombine(SDNode *N,
25734 SelectionDAG &DAG,
25735 const AArch64Subtarget *Subtarget) {
25736 if (Subtarget->supportsAddressTopByteIgnored())
25737 performTBISimplification(N->getOperand(1), DCI, DAG);
25738
25740 EVT RegVT = LD->getValueType(0);
25741 EVT MemVT = LD->getMemoryVT();
25742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25743 SDLoc DL(LD);
25744
25745 // Cast ptr32 and ptr64 pointers to the default address space before a load.
25746 unsigned AddrSpace = LD->getAddressSpace();
25747 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
25748 AddrSpace == ARM64AS::PTR32_UPTR) {
25749 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
25750 if (PtrVT != LD->getBasePtr().getSimpleValueType()) {
25751 SDValue Cast =
25752 DAG.getAddrSpaceCast(DL, PtrVT, LD->getBasePtr(), AddrSpace, 0);
25753 return DAG.getExtLoad(LD->getExtensionType(), DL, RegVT, LD->getChain(),
25754 Cast, LD->getPointerInfo(), MemVT,
25755 LD->getBaseAlign(),
25756 LD->getMemOperand()->getFlags());
25757 }
25758 }
25759
25760 if (LD->isVolatile() || !Subtarget->isLittleEndian())
25761 return SDValue(N, 0);
25762
25763 if (SDValue Res = combineV3I8LoadExt(LD, DAG))
25764 return Res;
25765
25766 if (!LD->isNonTemporal())
25767 return SDValue(N, 0);
25768
25769 if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
25770 MemVT.getSizeInBits() % 256 == 0 ||
25771 256 % MemVT.getScalarSizeInBits() != 0)
25772 return SDValue(N, 0);
25773
25774 SDValue Chain = LD->getChain();
25775 SDValue BasePtr = LD->getBasePtr();
25776 SDNodeFlags Flags = LD->getFlags();
25778 SmallVector<SDValue, 4> LoadOpsChain;
25779 // Replace any non-temporal load over 256 bits with a series of 256-bit loads
25780 // and a final load of fewer than 256 bits. This way we can utilize the
25781 // 256-bit loads and reduce the number of load instructions generated.
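  // For example (illustrative), a non-temporal load of v9i32 (288 bits) would
  // be split into one 256-bit v8i32 load plus a v1i32 load of the remaining
  // element; the remainder is inserted into a poison v8i32, the pieces are
  // concatenated, and the original v9i32 value is extracted at index 0.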
25782 MVT NewVT =
25784 256 / MemVT.getVectorElementType().getSizeInBits());
25785 unsigned Num256Loads = MemVT.getSizeInBits() / 256;
25786 // Create all 256-bit loads, starting at offset 0 and going up to (Num256Loads - 1) * 32 bytes.
25787 for (unsigned I = 0; I < Num256Loads; I++) {
25788 unsigned PtrOffset = I * 32;
25789 SDValue NewPtr = DAG.getMemBasePlusOffset(
25790 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
25791 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
25792 SDValue NewLoad = DAG.getLoad(
25793 NewVT, DL, Chain, NewPtr, LD->getPointerInfo().getWithOffset(PtrOffset),
25794 NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo());
25795 LoadOps.push_back(NewLoad);
25796 LoadOpsChain.push_back(SDValue(cast<SDNode>(NewLoad), 1));
25797 }
25798
25799 // Process the remaining bits of the load operation.
25800 // This is done by creating a poison vector matching the size of the
25801 // 256-bit loads and inserting the remaining load into it. We extract the
25802 // original load type at the end using an EXTRACT_SUBVECTOR.
25803 unsigned BitsRemaining = MemVT.getSizeInBits() % 256;
25804 unsigned PtrOffset = (MemVT.getSizeInBits() - BitsRemaining) / 8;
25805 MVT RemainingVT = MVT::getVectorVT(
25807 BitsRemaining / MemVT.getVectorElementType().getSizeInBits());
25808 SDValue NewPtr = DAG.getMemBasePlusOffset(
25809 BasePtr, TypeSize::getFixed(PtrOffset), DL, Flags);
25810 Align NewAlign = commonAlignment(LD->getAlign(), PtrOffset);
25811 SDValue RemainingLoad =
25812 DAG.getLoad(RemainingVT, DL, Chain, NewPtr,
25813 LD->getPointerInfo().getWithOffset(PtrOffset), NewAlign,
25814 LD->getMemOperand()->getFlags(), LD->getAAInfo());
25815 SDValue PoisonVector = DAG.getPOISON(NewVT);
25816 SDValue InsertIdx = DAG.getVectorIdxConstant(0, DL);
25817 SDValue ExtendedRemainingLoad =
25818 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT,
25819 {PoisonVector, RemainingLoad, InsertIdx});
25820 LoadOps.push_back(ExtendedRemainingLoad);
25821 LoadOpsChain.push_back(SDValue(cast<SDNode>(RemainingLoad), 1));
25822 EVT ConcatVT =
25824 LoadOps.size() * NewVT.getVectorNumElements());
25825 SDValue ConcatVectors =
25826 DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, LoadOps);
25827 // Extract the original vector type size.
25828 SDValue ExtractSubVector =
25829 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT,
25830 {ConcatVectors, DAG.getVectorIdxConstant(0, DL)});
25832 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, LoadOpsChain);
25833 return DAG.getMergeValues({ExtractSubVector, TokenFactor}, DL);
25834}
25835
25836static EVT tryGetOriginalBoolVectorType(SDValue Op, int Depth = 0) {
25837 EVT VecVT = Op.getValueType();
25838 assert(VecVT.isVector() && VecVT.getVectorElementType() == MVT::i1 &&
25839 "Need boolean vector type.");
25840
25841 if (Depth > 3)
25843
25844 // We can get the base type from a vector compare or truncate.
25845 if (Op.getOpcode() == ISD::SETCC || Op.getOpcode() == ISD::TRUNCATE)
25846 return Op.getOperand(0).getValueType();
25847
25848 // If an operand is a bool vector, continue looking.
25850 for (SDValue Operand : Op->op_values()) {
25851 if (Operand.getValueType() != VecVT)
25852 continue;
25853
25854 EVT OperandVT = tryGetOriginalBoolVectorType(Operand, Depth + 1);
25855 if (!BaseVT.isSimple())
25856 BaseVT = OperandVT;
25857 else if (OperandVT != BaseVT)
25859 }
25860
25861 return BaseVT;
25862}
25863
25864static bool getBoolVectorBitcastCompare(SDValue Vec, SDValue RHS,
25865 const SDLoc &DL, SelectionDAG &DAG,
25866 SDValue &CompareLHS,
25867 SDValue &CompareRHS) {
25868 if (DAG.getDataLayout().isBigEndian())
25869 return false;
25870
25871 EVT VecVT = Vec.getValueType();
25872 assert(VecVT.isFixedLengthVector() &&
25873 VecVT.getVectorElementType() == MVT::i1 &&
25874 "Expected a fixed-length bool vector");
25875
25876 unsigned NumElts = VecVT.getVectorNumElements();
25877 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
25878 return false;
25879
25880 auto getCanonicalCompareVecVT = [&]() {
25881 unsigned BitsPerElement = std::max(64 / NumElts, 8u);
25882 return MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
25883 };
25884
25885 EVT CompareVecVT = tryGetOriginalBoolVectorType(Vec);
25886 if (!CompareVecVT.isSimple() || CompareVecVT.getSizeInBits() > 128) {
25887 CompareVecVT = getCanonicalCompareVecVT();
25888 }
25889 CompareVecVT = CompareVecVT.changeVectorElementTypeToInteger();
25890
25891 if (CompareVecVT.getSizeInBits() > 128)
25892 return false;
25893
25894 SDValue CompareBits = DAG.getSExtOrTrunc(Vec, DL, CompareVecVT);
25895 unsigned CompareBitsSize = CompareBits.getValueSizeInBits();
25896
25897 // Use a canonical 64/128-bit vector representation before bitcasting to a
25898 // scalar view. Some legal original vector types are smaller than 64-bit,
25899 // which would make the direct scalar bitcasts below invalid.
25900 if (CompareBitsSize != 64 && CompareBitsSize != 128) {
25901 CompareVecVT = getCanonicalCompareVecVT();
25902 CompareBits = DAG.getSExtOrTrunc(Vec, DL, CompareVecVT);
25903 CompareBitsSize = CompareBits.getValueSizeInBits();
25904 }
25905
25906 if (CompareBitsSize != 64 && CompareBitsSize != 128)
25907 return false;
25908
25909 bool IsNull = isNullConstant(RHS);
25910 if (CompareBitsSize == 64) {
25911 CompareLHS = DAG.getBitcast(MVT::i64, CompareBits);
25912 CompareRHS = IsNull ? DAG.getConstant(0, DL, MVT::i64)
25913 : DAG.getAllOnesConstant(DL, MVT::i64);
25914 } else {
25915 SDValue PairwiseBits = DAG.getBitcast(MVT::v2i64, CompareBits);
25916 SDValue Lo = DAG.getExtractVectorElt(DL, MVT::i64, PairwiseBits, 0);
25917 SDValue Hi = DAG.getExtractVectorElt(DL, MVT::i64, PairwiseBits, 1);
25918 CompareLHS = DAG.getNode(ISD::ADD, DL, MVT::i64, Lo, Hi);
25919 CompareRHS = IsNull ? DAG.getConstant(0, DL, MVT::i64)
25920 : DAG.getSignedConstant(-2, DL, MVT::i64);
25921 }
25922
25923 return true;
25924}
25925
25926// When converting a <N x iX> vector to <N x i1> to store or use as a scalar
25927// iN, we can use a trick that extracts the i^th bit from the i^th element and
25928// then performs a vector add to get a scalar bitmask. This requires that each
25929// element's bits are either all 1 or all 0.
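// For example, a v4i32 comparison result whose lanes are all-ones/all-zeros is
// ANDed with the mask {1, 2, 4, 8}; a VECREDUCE_ADD of that value then yields
// the 4-bit scalar mask directly, since lane i contributes exactly bit i.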
25930static SDValue vectorToScalarBitmask(SDNode *N, SelectionDAG &DAG) {
25931 SDLoc DL(N);
25932 SDValue ComparisonResult(N, 0);
25933 EVT VecVT = ComparisonResult.getValueType();
25934 assert(VecVT.isVector() && "Must be a vector type");
25935
25936 unsigned NumElts = VecVT.getVectorNumElements();
25937 if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
25938 return SDValue();
25939
25940 if (VecVT.getVectorElementType() != MVT::i1 &&
25941 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
25942 return SDValue();
25943
25944 // If we can find the original types to work on instead of a vector of i1,
25945 // we can avoid extend/extract conversion instructions.
25946 if (VecVT.getVectorElementType() == MVT::i1) {
25947 VecVT = tryGetOriginalBoolVectorType(ComparisonResult);
25948 if (!VecVT.isSimple()) {
25949 unsigned BitsPerElement = std::max(64 / NumElts, 8u); // >= 64-bit vector
25950 VecVT = MVT::getVectorVT(MVT::getIntegerVT(BitsPerElement), NumElts);
25951 }
25952 }
25953 VecVT = VecVT.changeVectorElementTypeToInteger();
25954
25955 // Large vectors don't map directly to this conversion, so to avoid too many
25956 // edge cases, we don't apply it here. The conversion will likely still be
25957 // applied later via multiple smaller vectors, whose results are concatenated.
25958 if (VecVT.getSizeInBits() > 128)
25959 return SDValue();
25960
25961 // Ensure that all elements' bits are either 0s or 1s.
25962 ComparisonResult = DAG.getSExtOrTrunc(ComparisonResult, DL, VecVT);
25963
25964 bool IsLE = DAG.getDataLayout().isLittleEndian();
25965 SmallVector<SDValue, 16> MaskConstants;
25967 VecVT == MVT::v16i8) {
25968 // v16i8 is a special case, as we have 16 entries but only 8 positional bits
25969 // per entry. We split it into two halves, apply the mask, zip the halves to
25970 // create 8x 16-bit values, and then perform the vector reduce.
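    // Concretely: after masking, lane i of the low half holds bit i and lane i
    // of the high half holds bit i of the upper byte. EXT rotates the high
    // half down, ZIP1 interleaves the halves into v8i16 lanes of the form
    // (low | high << 8), and a single VECREDUCE_ADD of that v8i16 produces the
    // full 16-bit mask.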
25971 for (unsigned Half = 0; Half < 2; ++Half) {
25972 for (unsigned I = 0; I < 8; ++I) {
25973 // On big-endian targets, the lane order in sub-byte vector elements
25974 // gets reversed, so we need to flip the bit index.
25975 unsigned MaskBit = IsLE ? (1u << I) : (1u << (7 - I));
25976 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i32));
25977 }
25978 }
25979 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
25980 SDValue RepresentativeBits =
25981 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
25982
25983 SDValue UpperRepresentativeBits =
25984 DAG.getNode(AArch64ISD::EXT, DL, VecVT, RepresentativeBits,
25985 RepresentativeBits, DAG.getConstant(8, DL, MVT::i32));
25986 SDValue Zipped = DAG.getNode(AArch64ISD::ZIP1, DL, VecVT,
25987 RepresentativeBits, UpperRepresentativeBits);
25988 Zipped = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Zipped);
25989 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i16, Zipped);
25990 }
25991
25992 // All other vector sizes.
25993 unsigned NumEl = VecVT.getVectorNumElements();
25994 for (unsigned I = 0; I < NumEl; ++I) {
25995 unsigned MaskBit = IsLE ? (1u << I) : (1u << (NumEl - 1 - I));
25996 MaskConstants.push_back(DAG.getConstant(MaskBit, DL, MVT::i64));
25997 }
25998
25999 SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, MaskConstants);
26000 SDValue RepresentativeBits =
26001 DAG.getNode(ISD::AND, DL, VecVT, ComparisonResult, Mask);
26002 EVT ResultVT = MVT::getIntegerVT(std::max<unsigned>(
26003 NumElts, VecVT.getVectorElementType().getSizeInBits()));
26004 return DAG.getNode(ISD::VECREDUCE_ADD, DL, ResultVT, RepresentativeBits);
26005}
26006
26007static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
26008 StoreSDNode *Store) {
26009 if (!Store->isTruncatingStore())
26010 return SDValue();
26011
26012 SDLoc DL(Store);
26013 SDValue VecOp = Store->getValue();
26014 EVT VT = VecOp.getValueType();
26015 EVT MemVT = Store->getMemoryVT();
26016
26017 if (!MemVT.isVector() || !VT.isVector() ||
26018 MemVT.getVectorElementType() != MVT::i1)
26019 return SDValue();
26020
26021 // If we are storing a vector that we are currently building, let
26022 // `scalarizeVectorStore()` handle this more efficiently.
26023 if (VecOp.getOpcode() == ISD::BUILD_VECTOR)
26024 return SDValue();
26025
26026 VecOp = DAG.getNode(ISD::TRUNCATE, DL, MemVT, VecOp);
26027 SDValue VectorBits = vectorToScalarBitmask(VecOp.getNode(), DAG);
26028 if (!VectorBits)
26029 return SDValue();
26030
26031 EVT StoreVT =
26033 SDValue ExtendedBits = DAG.getZExtOrTrunc(VectorBits, DL, StoreVT);
26034 return DAG.getStore(Store->getChain(), DL, ExtendedBits, Store->getBasePtr(),
26035 Store->getMemOperand());
26036}
26037
26038// Combine store (fp_to_int X) to use vector semantics around the conversion
26039// when NEON is available. This allows us to store the in-vector result directly
26040// without transferring the result into a GPR in the process.
26041static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
26043 SelectionDAG &DAG,
26044 const AArch64Subtarget *Subtarget) {
26045 if (!Subtarget->isNeonAvailable())
26046 return SDValue();
26047 // Bail out if the source operand is already a vector.
26048 SDValue Value = ST->getValue();
26049 if (Value.getValueType().isVector())
26050 return SDValue();
26051
26052 // Look through potential assertions.
26053 while (Value->isAssert())
26054 Value = Value.getOperand(0);
26055
26056 if (Value.getOpcode() != ISD::FP_TO_SINT &&
26057 Value.getOpcode() != ISD::FP_TO_UINT)
26058 return SDValue();
26059 if (!Value->hasOneUse())
26060 return SDValue();
26061
26062 SDValue FPSrc = Value.getOperand(0);
26063 EVT SrcVT = FPSrc.getValueType();
26064 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
26065 return SDValue();
26066
26067 // No support for mismatched widths such as i64 = fp_to_sint f32
26068 EVT VT = Value.getSimpleValueType();
26069 if (VT != SrcVT.changeTypeToInteger())
26070 return SDValue();
26071
26072 // Create a 128-bit element vector to avoid widening. The floating point
26073 // conversion is transformed into a single element conversion via a pattern.
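  // For example (illustrative), store (i32 (fp_to_sint f32 %x)) becomes a
  // v4f32 SCALAR_TO_VECTOR of %x, a v4i32 FP_TO_SINT, an extract of lane 0,
  // and the store; this keeps the converted value in a SIMD register instead
  // of moving it to a GPR just to store it.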
26074 unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
26075 EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
26076 EVT VecDstVT = VecSrcVT.changeTypeToInteger();
26077 SDLoc DL(ST);
26078 SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
26079 SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
26080
26082 SDValue Extracted =
26083 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
26084
26085 DCI.CombineTo(ST->getValue().getNode(), Extracted);
26086 return SDValue(ST, 0);
26087}
26088
26089bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
26090 return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
26091 (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
26092 (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32);
26093}
26094
26095// Combine store (trunc X to <3 x i8>) to sequence of ST1.b.
26096static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
26097 const AArch64Subtarget *Subtarget) {
26098 SDValue Value = ST->getValue();
26099 EVT ValueVT = Value.getValueType();
26100
26101 if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
26102 Value.getOpcode() != ISD::TRUNCATE ||
26103 ValueVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3))
26104 return SDValue();
26105
26106 assert(ST->getOffset().isUndef() && "undef offset expected");
26107 SDLoc DL(ST);
26108 auto WideVT = EVT::getVectorVT(
26109 *DAG.getContext(),
26110 Value->getOperand(0).getValueType().getVectorElementType(), 4);
26111 SDValue PoisonVector = DAG.getPOISON(WideVT);
26112 SDValue WideTrunc = DAG.getNode(
26113 ISD::INSERT_SUBVECTOR, DL, WideVT,
26114 {PoisonVector, Value->getOperand(0), DAG.getVectorIdxConstant(0, DL)});
26115 SDValue Cast = DAG.getNode(
26116 ISD::BITCAST, DL, WideVT.getSizeInBits() == 64 ? MVT::v8i8 : MVT::v16i8,
26117 WideTrunc);
26118
26120 SDValue Chain = ST->getChain();
26121 MachineMemOperand *MMO = ST->getMemOperand();
26122 unsigned IdxScale = WideVT.getScalarSizeInBits() / 8;
26123 SDValue E2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
26124 DAG.getConstant(2 * IdxScale, DL, MVT::i64));
26125 TypeSize Offset2 = TypeSize::getFixed(2);
26126 SDValue Ptr2 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset2, DL);
26127 Chain = DAG.getStore(Chain, DL, E2, Ptr2, MF.getMachineMemOperand(MMO, 2, 1));
26128
26129 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
26130 DAG.getConstant(1 * IdxScale, DL, MVT::i64));
26131 TypeSize Offset1 = TypeSize::getFixed(1);
26132 SDValue Ptr1 = DAG.getMemBasePlusOffset(ST->getBasePtr(), Offset1, DL);
26133 Chain = DAG.getStore(Chain, DL, E1, Ptr1, MF.getMachineMemOperand(MMO, 1, 1));
26134
26135 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, Cast,
26136 DAG.getConstant(0, DL, MVT::i64));
26137 Chain = DAG.getStore(Chain, DL, E0, ST->getBasePtr(),
26138 MF.getMachineMemOperand(MMO, 0, 1));
26139 return Chain;
26140}
26141
26142static unsigned getFPSubregForVT(EVT VT) {
26143 assert(VT.isSimple() && "Expected simple VT");
26144 switch (VT.getSimpleVT().SimpleTy) {
26145 case MVT::aarch64mfp8:
26146 return AArch64::bsub;
26147 case MVT::f16:
26148 return AArch64::hsub;
26149 case MVT::f32:
26150 return AArch64::ssub;
26151 case MVT::f64:
26152 return AArch64::dsub;
26153 default:
26154 llvm_unreachable("Unexpected VT!");
26155 }
26156}
26157
26158static SDValue performSTORECombine(SDNode *N,
26160 SelectionDAG &DAG,
26161 const AArch64Subtarget *Subtarget) {
26163 SDValue Chain = ST->getChain();
26164 SDValue Value = ST->getValue();
26165 SDValue Ptr = ST->getBasePtr();
26166 EVT ValueVT = Value.getValueType();
26167 EVT MemVT = ST->getMemoryVT();
26168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26169 SDLoc DL(ST);
26170
26171 if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
26172 return Res;
26173
26174 auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
26175 EVT EltVT = VT.getVectorElementType();
26176 return EltVT == MVT::f32 || EltVT == MVT::f64;
26177 };
26178
26179 // Cast ptr32 and ptr64 pointers to the default address space before a store.
26180 unsigned AddrSpace = ST->getAddressSpace();
26181 if (AddrSpace == ARM64AS::PTR64 || AddrSpace == ARM64AS::PTR32_SPTR ||
26182 AddrSpace == ARM64AS::PTR32_UPTR) {
26183 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26184 if (PtrVT != Ptr.getSimpleValueType()) {
26185 SDValue Cast = DAG.getAddrSpaceCast(DL, PtrVT, Ptr, AddrSpace, 0);
26186 return DAG.getStore(Chain, DL, Value, Cast, ST->getPointerInfo(),
26187 ST->getBaseAlign(), ST->getMemOperand()->getFlags(),
26188 ST->getAAInfo());
26189 }
26190 }
26191
26192 if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
26193 return Res;
26194
26195 // If this is an FP_ROUND followed by a store, fold this into a truncating
26196 // store. We can do this even if this is already a truncstore.
26197 // We purposefully don't care about legality of the nodes here as we know
26198 // they can be split down into something legal.
26199 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
26200 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
26201 Subtarget->useSVEForFixedLengthVectors() &&
26202 ValueVT.isFixedLengthVector() &&
26203 ValueVT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits() &&
26204 hasValidElementTypeForFPTruncStore(Value.getOperand(0).getValueType()))
26205 return DAG.getTruncStore(Chain, DL, Value.getOperand(0), Ptr, MemVT,
26206 ST->getMemOperand());
26207
26208 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
26209 return Split;
26210
26211 if (Subtarget->supportsAddressTopByteIgnored() &&
26212 performTBISimplification(N->getOperand(2), DCI, DAG))
26213 return SDValue(N, 0);
26214
26215 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
26216 return Store;
26217
26218 if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
26219 return Store;
26220
26221 if (ST->isTruncatingStore() &&
26222 isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
26223 if (SDValue Rshrnb =
26224 trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
26225 return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
26226 MemVT, ST->getMemOperand());
26227 }
26228 }
26229
26230 // This is an integer vector_extract_elt followed by a (possibly truncating)
26231 // store. We may be able to replace this with a store of an FP subregister.
26232 if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
26233 Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
26234
26235 SDValue Vector = Value.getOperand(0);
26236 SDValue ExtIdx = Value.getOperand(1);
26237 EVT VectorVT = Vector.getValueType();
26238 EVT ElemVT = VectorVT.getVectorElementType();
26239
26240 if (!ValueVT.isInteger())
26241 return SDValue();
26242
26243 // Propagate zero constants (applying this fold may miss optimizations).
26245 SDValue ZeroElt = DAG.getConstant(0, DL, ValueVT);
26246 DAG.ReplaceAllUsesWith(Value, ZeroElt);
26247 return SDValue();
26248 }
26249
26250 if (ValueVT != MemVT && !ST->isTruncatingStore())
26251 return SDValue();
26252
26253 // This could generate an additional extract if the index is non-zero and
26254 // the extracted value has multiple uses.
26255 auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
26256 if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
26257 return SDValue();
26258
26259 // These can lower to st1, which is preferable if we're unlikely to fold the
26260 // addressing into the store.
26261 if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
26262 (VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
26263 !ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD)
26264 return SDValue();
26265
26266 if (MemVT == MVT::i64 || MemVT == MVT::i32) {
26267 // Heuristic: If there are other users of w/x integer scalars extracted
26268 // from this vector that won't fold into the store -- abandon folding.
26269 // Applying this fold may disrupt paired stores.
26270 for (const auto &Use : Vector->uses()) {
26271 if (Use.getResNo() != Vector.getResNo())
26272 continue;
26273 const SDNode *User = Use.getUser();
26274 if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
26275 (!User->hasOneUse() ||
26276 (*User->user_begin())->getOpcode() != ISD::STORE))
26277 return SDValue();
26278 }
26279 }
26280
26281 SDValue ExtVector = Vector;
26282 if (!ExtCst || !ExtCst->isZero()) {
26283 // Handle extracting from lanes != 0.
26285 Value.getValueType(), Vector, ExtIdx);
26287 ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT,
26288 DAG.getPOISON(VectorVT), Ext, Zero);
26289 }
26290
26291 EVT FPMemVT = MemVT == MVT::i8
26292 ? MVT::aarch64mfp8
26294 SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
26295 FPMemVT, ExtVector);
26296
26297 return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
26298 ST->getMemOperand());
26299 }
26300
26301 return SDValue();
26302}
26303
26304static bool
26305isSequentialConcatOfVectorInterleave(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
26306 if (N->getOpcode() != ISD::CONCAT_VECTORS)
26307 return false;
26308
26309 unsigned NumParts = N->getNumOperands();
26310
26311 // We should be concatenating each sequential result from a
26312 // VECTOR_INTERLEAVE.
26313 SDNode *InterleaveOp = N->getOperand(0).getNode();
26314 if (InterleaveOp->getOpcode() != ISD::VECTOR_INTERLEAVE ||
26315 InterleaveOp->getNumOperands() != NumParts)
26316 return false;
26317
26318 for (unsigned I = 0; I < NumParts; I++)
26319 if (N->getOperand(I) != SDValue(InterleaveOp, I))
26320 return false;
26321
26322 Ops.append(InterleaveOp->op_begin(), InterleaveOp->op_end());
26323 return true;
26324}
26325
26326static SDValue getNarrowMaskForInterleavedOps(SelectionDAG &DAG, SDLoc &DL,
26327 SDValue WideMask,
26328 unsigned RequiredNumParts) {
26329 if (WideMask->getOpcode() == ISD::CONCAT_VECTORS) {
26330 SmallVector<SDValue, 4> MaskInterleaveOps;
26331 if (!isSequentialConcatOfVectorInterleave(WideMask.getNode(),
26332 MaskInterleaveOps))
26333 return SDValue();
26334
26335 if (MaskInterleaveOps.size() != RequiredNumParts)
26336 return SDValue();
26337
26338 // Make sure the inputs to the vector interleave are identical.
26339 if (!llvm::all_equal(MaskInterleaveOps))
26340 return SDValue();
26341
26342 return MaskInterleaveOps[0];
26343 }
26344
26345 if (WideMask->getOpcode() != ISD::SPLAT_VECTOR)
26346 return SDValue();
26347
26349 assert(EC.isKnownMultipleOf(RequiredNumParts) &&
26350 "Expected element count divisible by number of parts");
26351 EC = EC.divideCoefficientBy(RequiredNumParts);
26352 return DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::getVectorVT(MVT::i1, EC),
26353 WideMask->getOperand(0));
26354}
26355
26356static SDValue performInterleavedMaskedStoreCombine(
26358 if (!DCI.isBeforeLegalize())
26359 return SDValue();
26360
26362 SDValue WideValue = MST->getValue();
26363
26364 // Bail out if the stored value has an unexpected number of uses, since we'll
26365 // have to perform manual interleaving and may as well just use normal masked
26366 // stores. Also, discard masked stores that are truncating or indexed.
26367 if (!WideValue.hasOneUse() || !ISD::isNormalMaskedStore(MST) ||
26368 !MST->isSimple() || !MST->getOffset().isUndef())
26369 return SDValue();
26370
26371 SmallVector<SDValue, 4> ValueInterleaveOps;
26372 if (!isSequentialConcatOfVectorInterleave(WideValue.getNode(),
26373 ValueInterleaveOps))
26374 return SDValue();
26375
26376 unsigned NumParts = ValueInterleaveOps.size();
26377 if (NumParts != 2 && NumParts != 4)
26378 return SDValue();
26379
26380 // At the moment we're unlikely to see a fixed-width vector interleave as
26381 // we usually generate shuffles instead.
26382 EVT SubVecTy = ValueInterleaveOps[0].getValueType();
26383 if (!SubVecTy.isScalableVT() ||
26384 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
26385 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
26386 return SDValue();
26387
26388 SDLoc DL(N);
26389 SDValue NarrowMask =
26390 getNarrowMaskForInterleavedOps(DAG, DL, MST->getMask(), NumParts);
26391 if (!NarrowMask)
26392 return SDValue();
26393
26394 const Intrinsic::ID IID =
26395 NumParts == 2 ? Intrinsic::aarch64_sve_st2 : Intrinsic::aarch64_sve_st4;
26396 SmallVector<SDValue, 8> NewStOps;
26397 NewStOps.append({MST->getChain(), DAG.getConstant(IID, DL, MVT::i32)});
26398 NewStOps.append(ValueInterleaveOps);
26399 NewStOps.append({NarrowMask, MST->getBasePtr()});
26400 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, NewStOps);
26401}
26402
26403static SDValue performMSTORECombine(SDNode *N,
26405 SelectionDAG &DAG,
26406 const AArch64Subtarget *Subtarget) {
26408 SDValue Value = MST->getValue();
26409 SDValue Mask = MST->getMask();
26410 SDLoc DL(N);
26411
26412 if (SDValue Res = performInterleavedMaskedStoreCombine(N, DCI, DAG))
26413 return Res;
26414
26415 // If this is a UZP1 followed by a masked store, fold this into a masked
26416 // truncating store. We can do this even if this is already a masked
26417 // truncstore.
26418 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
26419 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
26420 Value.getValueType().isInteger()) {
26421 Value = Value.getOperand(0);
26422 if (Value.getOpcode() == ISD::BITCAST) {
26423 EVT HalfVT =
26424 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
26425 EVT InVT = Value.getOperand(0).getValueType();
26426
26427 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
26428 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
26429 unsigned PgPattern = Mask->getConstantOperandVal(0);
26430
26431 // Ensure we can double the size of the predicate pattern
26432 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
26433 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
26434 MinSVESize) {
26435 Mask = getPTrue(
26436 DAG, DL, InVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
26437 PgPattern);
26438 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
26439 MST->getBasePtr(), MST->getOffset(), Mask,
26440 MST->getMemoryVT(), MST->getMemOperand(),
26441 MST->getAddressingMode(),
26442 /*IsTruncating=*/true);
26443 }
26444 }
26445 }
26446 }
26447
26448 if (MST->isTruncatingStore()) {
26449 EVT ValueVT = Value->getValueType(0);
26450 EVT MemVT = MST->getMemoryVT();
26451 if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT))
26452 return SDValue();
26453 if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) {
26454 return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(),
26455 MST->getOffset(), MST->getMask(),
26456 MST->getMemoryVT(), MST->getMemOperand(),
26457 MST->getAddressingMode(), true);
26458 }
26459 }
26460
26461 return SDValue();
26462}
26463
26464/// \return true if part of the index was folded into the Base.
26465static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
26466 SDLoc DL, SelectionDAG &DAG) {
26467 // This function assumes a vector of i64 indices.
26468 EVT IndexVT = Index.getValueType();
26469 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
26470 return false;
26471
26472 // Simplify:
26473 // BasePtr = Ptr
26474 // Index = X + splat(Offset)
26475 // ->
26476 // BasePtr = Ptr + Offset * scale.
26477 // Index = X
26478 if (Index.getOpcode() == ISD::ADD) {
26479 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
26480 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
26481 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
26482 Index = Index.getOperand(0);
26483 return true;
26484 }
26485 }
26486
26487 // Simplify:
26488 // BasePtr = Ptr
26489 // Index = (X + splat(Offset)) << splat(Shift)
26490 // ->
26491 // BasePtr = Ptr + (Offset << Shift) * scale
26492 // Index = X << splat(shift)
26493 if (Index.getOpcode() == ISD::SHL &&
26494 Index.getOperand(0).getOpcode() == ISD::ADD) {
26495 SDValue Add = Index.getOperand(0);
26496 SDValue ShiftOp = Index.getOperand(1);
26497 SDValue OffsetOp = Add.getOperand(1);
26498 if (auto Shift = DAG.getSplatValue(ShiftOp))
26499 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
26500 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
26501 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
26502 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
26503 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
26504 Add.getOperand(0), ShiftOp);
26505 return true;
26506 }
26507 }
26508
26509 return false;
26510}
26511
26512// Analyse the specified address returning true if a more optimal addressing
26513// mode is available. When returning true all parameters are updated to reflect
26514// their recommended values.
26516 SDValue &BasePtr, SDValue &Index,
26517 SelectionDAG &DAG) {
26518 // Try to iteratively fold parts of the index into the base pointer to
26519 // simplify the index as much as possible.
26520 bool Changed = false;
26521 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
26522 Changed = true;
26523
26524 EVT IndexVT = Index.getValueType();
26525 EVT DataVT = N->getOperand(1).getValueType();
26526
26527 // Only consider element types that are pointer sized as smaller types can
26528 // be easily promoted.
26529 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
26530 return Changed;
26531
26532 // Don't attempt to shrink the index for fixed vectors of 64-bit data since it
26533 // will later be re-extended to 64 bits during legalization.
26534 if (DataVT.isFixedLengthVector() && DataVT.getScalarSizeInBits() == 64)
26535 return Changed;
26536 if (ISD::isVectorShrinkable(Index.getNode(), 32, N->isIndexSigned())) {
26537 EVT NewIndexVT =
26538 IndexVT.changeVectorElementType(*DAG.getContext(), MVT::i32);
26539 Index = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NewIndexVT, Index);
26540 return true;
26541 }
26542
26543 // Match:
26544 // Index = step(const)
26545 int64_t Stride = 0;
26546 if (Index.getOpcode() == ISD::STEP_VECTOR) {
26547 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
26548 }
26549
26550 // Return early because no supported pattern is found.
26551 if (Stride == 0)
26552 return Changed;
26553
26554 if (Stride < std::numeric_limits<int32_t>::min() ||
26555 Stride > std::numeric_limits<int32_t>::max())
26556 return Changed;
26557
26558 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
26559 unsigned MaxVScale =
26561 int64_t LastElementOffset =
26562 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
26563
26564 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
26565 LastElementOffset > std::numeric_limits<int32_t>::max())
26566 return Changed;
26567
26568 EVT NewIndexVT = IndexVT.changeVectorElementType(*DAG.getContext(), MVT::i32);
26569 // Stride does not scale explicitly by 'Scale', because it happens in
26570 // the gather/scatter addressing mode.
26571 Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true));
26572 return true;
26573}
26574
26577 if (!DCI.isBeforeLegalize())
26578 return SDValue();
26580
26581 SDLoc DL(MGS);
26582 SDValue Chain = MGS->getChain();
26583 SDValue Scale = MGS->getScale();
26584 SDValue Index = MGS->getIndex();
26585 SDValue Mask = MGS->getMask();
26586 SDValue BasePtr = MGS->getBasePtr();
26587 ISD::MemIndexType IndexType = MGS->getIndexType();
26588
26589 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
26590 return SDValue();
26591
26592 // Here we catch such cases early and change MGATHER's IndexType to allow
26593 // the use of an Index that's more legalisation friendly.
26594 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
26595 SDValue PassThru = MGT->getPassThru();
26596 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
26597 return DAG.getMaskedGather(
26598 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
26599 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
26600 }
26601 if (auto *MSC = dyn_cast<MaskedScatterSDNode>(MGS)) {
26602 SDValue Data = MSC->getValue();
26603 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
26604 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
26605 DL, Ops, MSC->getMemOperand(), IndexType,
26606 MSC->isTruncatingStore());
26607 }
26608 auto *HG = cast<MaskedHistogramSDNode>(MGS);
26609
26610 // Histograms don't do any legalisation on the loaded data type,
26611 // so if the 'add' would need to be performed on a vector of i64's, then
26612 // we can't use the more optimal addressing with i32 offsets as that
26613 // would return a vector of nxv4i32, which wouldn't get widened.
26614 if (HG->getInc().getValueType().getScalarSizeInBits() >
26615 Index.getValueType().getScalarSizeInBits())
26616 // FIXME: If the increment value is a constant or extended value,
26617 // we can truncate the increment value.
26618 return SDValue();
26619
26620 SDValue Ops[] = {Chain, HG->getInc(), Mask, BasePtr,
26621 Index, Scale, HG->getIntID()};
26622 return DAG.getMaskedHistogram(DAG.getVTList(MVT::Other), HG->getMemoryVT(),
26623 DL, Ops, HG->getMemOperand(), IndexType);
26624}
26625
26626/// Target-specific DAG combine function for NEON load/store intrinsics
26627/// to merge base address updates.
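/// For example (illustrative), "ld2 { v0.4s, v1.4s }, [x0]" followed by
/// "add x0, x0, #32" can be merged into the post-indexed form
/// "ld2 { v0.4s, v1.4s }, [x0], #32" when the increment matches the number of
/// bytes accessed.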
26630 SelectionDAG &DAG) {
26631 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
26632 return SDValue();
26633
26634 unsigned AddrOpIdx = N->getNumOperands() - 1;
26635 SDValue Addr = N->getOperand(AddrOpIdx);
26636
26637 // Search for a use of the address operand that is an increment.
26638 for (SDUse &Use : Addr->uses()) {
26639 SDNode *User = Use.getUser();
26640 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
26641 continue;
26642
26643 // Check that the add is independent of the load/store. Otherwise, folding
26644 // it would create a cycle.
26647 Visited.insert(Addr.getNode());
26648 Worklist.push_back(N);
26649 Worklist.push_back(User);
26650 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
26651 SDNode::hasPredecessorHelper(User, Visited, Worklist))
26652 continue;
26653
26654 // Find the new opcode for the updating load/store.
26655 bool IsStore = false;
26656 bool IsLaneOp = false;
26657 bool IsDupOp = false;
26658 unsigned NewOpc = 0;
26659 unsigned NumVecs = 0;
26660 unsigned IntNo = N->getConstantOperandVal(1);
26661 switch (IntNo) {
26662 default: llvm_unreachable("unexpected intrinsic for Neon base update");
26663 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
26664 NumVecs = 2; break;
26665 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
26666 NumVecs = 3; break;
26667 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
26668 NumVecs = 4; break;
26669 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
26670 NumVecs = 2; IsStore = true; break;
26671 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
26672 NumVecs = 3; IsStore = true; break;
26673 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
26674 NumVecs = 4; IsStore = true; break;
26675 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
26676 NumVecs = 2; break;
26677 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
26678 NumVecs = 3; break;
26679 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
26680 NumVecs = 4; break;
26681 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
26682 NumVecs = 2; IsStore = true; break;
26683 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
26684 NumVecs = 3; IsStore = true; break;
26685 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
26686 NumVecs = 4; IsStore = true; break;
26687 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
26688 NumVecs = 2; IsDupOp = true; break;
26689 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
26690 NumVecs = 3; IsDupOp = true; break;
26691 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
26692 NumVecs = 4; IsDupOp = true; break;
26693 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
26694 NumVecs = 2; IsLaneOp = true; break;
26695 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
26696 NumVecs = 3; IsLaneOp = true; break;
26697 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
26698 NumVecs = 4; IsLaneOp = true; break;
26699 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
26700 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
26701 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
26702 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
26703 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
26704 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
26705 }
26706
26707 EVT VecTy;
26708 if (IsStore)
26709 VecTy = N->getOperand(2).getValueType();
26710 else
26711 VecTy = N->getValueType(0);
26712
26713 // If the increment is a constant, it must match the memory ref size.
26714 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
26715 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
26716 uint32_t IncVal = CInc->getZExtValue();
26717 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
26718 if (IsLaneOp || IsDupOp)
26719 NumBytes /= VecTy.getVectorNumElements();
26720 if (IncVal != NumBytes)
26721 continue;
26722 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
26723 }
26725 Ops.push_back(N->getOperand(0)); // Incoming chain
26726 // Load lane and store have vector list as input.
26727 if (IsLaneOp || IsStore)
26728 for (unsigned i = 2; i < AddrOpIdx; ++i)
26729 Ops.push_back(N->getOperand(i));
26730 Ops.push_back(Addr); // Base register
26731 Ops.push_back(Inc);
26732
26733 // Return Types.
26734 EVT Tys[6];
26735 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
26736 unsigned n;
26737 for (n = 0; n < NumResultVecs; ++n)
26738 Tys[n] = VecTy;
26739 Tys[n++] = MVT::i64; // Type of write back register
26740 Tys[n] = MVT::Other; // Type of the chain
26741 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
26742
26744 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
26745 MemInt->getMemoryVT(),
26746 MemInt->getMemOperand());
26747
26748 // Update the uses.
26749 std::vector<SDValue> NewResults;
26750 for (unsigned i = 0; i < NumResultVecs; ++i) {
26751 NewResults.push_back(SDValue(UpdN.getNode(), i));
26752 }
26753 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
26754 DCI.CombineTo(N, NewResults);
26755 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
26756
26757 break;
26758 }
26759 return SDValue();
26760}
26761
26762// Checks to see if the value is the prescribed width and returns information
26763// about its extension mode.
26764static
26765bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
26766 ExtType = ISD::NON_EXTLOAD;
26767 switch(V.getNode()->getOpcode()) {
26768 default:
26769 return false;
26770 case ISD::LOAD: {
26771 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
26772 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
26773 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
26774 ExtType = LoadNode->getExtensionType();
26775 return true;
26776 }
26777 return false;
26778 }
26779 case ISD::AssertSext: {
26780 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
26781 if ((TypeNode->getVT() == MVT::i8 && width == 8)
26782 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
26783 ExtType = ISD::SEXTLOAD;
26784 return true;
26785 }
26786 return false;
26787 }
26788 case ISD::AssertZext: {
26789 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
26790 if ((TypeNode->getVT() == MVT::i8 && width == 8)
26791 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
26792 ExtType = ISD::ZEXTLOAD;
26793 return true;
26794 }
26795 return false;
26796 }
26797 case ISD::Constant:
26798 case ISD::TargetConstant: {
26799 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
26800 1LL << (width - 1);
26801 }
26802 }
26803
26804 return true;
26805}
26806
26807// This function does a whole lot of voodoo to determine if the tests are
26808// equivalent without and with a mask. Essentially what happens is that given a
26809// DAG resembling:
26810//
26811// +-------------+ +-------------+ +-------------+ +-------------+
26812// | Input | | AddConstant | | CompConstant| | CC |
26813// +-------------+ +-------------+ +-------------+ +-------------+
26814// | | | |
26815// V V | +----------+
26816// +-------------+ +----+ | |
26817// | ADD | |0xff| | |
26818// +-------------+ +----+ | |
26819// | | | |
26820// V V | |
26821// +-------------+ | |
26822// | AND | | |
26823// +-------------+ | |
26824// | | |
26825// +-----+ | |
26826// | | |
26827// V V V
26828// +-------------+
26829// | CMP |
26830// +-------------+
26831//
26832// The AND node may be safely removed for some combinations of inputs. In
26833// particular we need to take into account the extension type of the Input,
26834// the exact values of AddConstant, CompConstant, and CC, along with the nominal
26835// width of the input (this can work for inputs of any width; the above graph
26836// is specific to 8 bits).
26837//
26838// The specific equations were worked out by generating output tables for each
26839// AArch64CC value in terms of the Input (w0), AddConstant (w1), and
26840// CompConstant (w2). The problem was simplified by working with 4-bit inputs,
26841// which means we only needed to reason about 24 distinct bit patterns: 8
26842// patterns unique to zero extension (8..15), 8 patterns unique to sign
26843// extension (-8..-1), and 8 patterns present in both extensions (0..7). For
26844// every distinct pair of AddConstant and CompConstant bit patterns we can
26845// consider the masked and unmasked versions to be equivalent if this function
26846// returns true for all 16 distinct bit patterns of the current extension type of the Input (w0).
26847//
26848// sub w8, w0, w1
26849// and w10, w8, #0x0f
26850// cmp w8, w2
26851// cset w9, AArch64CC
26852// cmp w10, w2
26853// cset w11, AArch64CC
26854// cmp w9, w11
26855// cset w0, eq
26856// ret
26857//
26858// Since the above sequence shows when the outputs are equivalent, it defines
26859// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
26860// would be expensive to run during compiles. The equations below were written
26861// in a test harness that confirmed they give outputs equivalent to the above
26862// sequence for all inputs, so they can be used instead to determine whether
26863// the removal is legal.
26864//
26865// isEquivalentMaskless() is the test for whether the AND can be removed,
26866// factored out of the DAG recognition because the DAG can take several forms.
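//
// As an illustration of the equations below (not an exhaustive case): with an
// 8-bit zero-extended Input, AddConstant == 1 and CompConstant == 5, the
// masked test ((Input + 1) & 0xff) == 5 and the unmasked test (Input + 1) == 5
// both hold exactly when Input == 4, so for EQ/NE the AND is removable; this
// matches the (AddConstant >= 0 && CompConstant >= 0 && CompConstant >=
// AddConstant) case below.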
26867
26868static bool isEquivalentMaskless(unsigned CC, unsigned width,
26869 ISD::LoadExtType ExtType, int AddConstant,
26870 int CompConstant) {
26871 // By being careful about our equations and writing them only in terms of
26872 // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
26873 // make them generally applicable to all bit widths.
26874 int MaxUInt = (1 << width);
26875
26876 // For the purposes of these comparisons sign extending the type is
26877 // equivalent to zero extending the add and displacing it by half the integer
26878 // width. Provided we are careful and make sure our equations are valid over
26879 // the whole range we can just adjust the input and avoid writing equations
26880 // for sign extended inputs.
26881 if (ExtType == ISD::SEXTLOAD)
26882 AddConstant -= (1 << (width-1));
26883
26884 switch(CC) {
26885 case AArch64CC::LE:
26886 case AArch64CC::GT:
26887 if ((AddConstant == 0) ||
26888 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
26889 (AddConstant >= 0 && CompConstant < 0) ||
26890 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
26891 return true;
26892 break;
26893 case AArch64CC::LT:
26894 case AArch64CC::GE:
26895 if ((AddConstant == 0) ||
26896 (AddConstant >= 0 && CompConstant <= 0) ||
26897 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
26898 return true;
26899 break;
26900 case AArch64CC::HI:
26901 case AArch64CC::LS:
26902 if ((AddConstant >= 0 && CompConstant < 0) ||
26903 (AddConstant <= 0 && CompConstant >= -1 &&
26904 CompConstant < AddConstant + MaxUInt))
26905 return true;
26906 break;
26907 case AArch64CC::PL:
26908 case AArch64CC::MI:
26909 if ((AddConstant == 0) ||
26910 (AddConstant > 0 && CompConstant <= 0) ||
26911 (AddConstant < 0 && CompConstant <= AddConstant))
26912 return true;
26913 break;
26914 case AArch64CC::LO:
26915 case AArch64CC::HS:
26916 if ((AddConstant >= 0 && CompConstant <= 0) ||
26917 (AddConstant <= 0 && CompConstant >= 0 &&
26918 CompConstant <= AddConstant + MaxUInt))
26919 return true;
26920 break;
26921 case AArch64CC::EQ:
26922 case AArch64CC::NE:
26923 if ((AddConstant > 0 && CompConstant < 0) ||
26924 (AddConstant < 0 && CompConstant >= 0 &&
26925 CompConstant < AddConstant + MaxUInt) ||
26926 (AddConstant >= 0 && CompConstant >= 0 &&
26927 CompConstant >= AddConstant) ||
26928 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
26929 return true;
26930 break;
26931 case AArch64CC::VS:
26932 case AArch64CC::VC:
26933 case AArch64CC::AL:
26934 case AArch64CC::NV:
26935 return true;
26936 case AArch64CC::Invalid:
26937 break;
26938 }
26939
26940 return false;
26941}
26942
26943// (X & C) >u Mask --> ((X & (C & ~Mask)) != 0)
26944// (X & C) <u Pow2 --> ((X & (C & ~(Pow2-1))) == 0)
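//
// For example (illustrative instances of the rewrites above): (x & 0xff) >u
// 0x0f tests whether any of bits 4..7 of x are set, so it becomes
// ANDS(x, 0xff & ~0x0f), i.e. (x & 0xf0) != 0; likewise (x & 0xff) <u 0x10
// becomes (x & 0xf0) == 0.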
26945static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode,
26946 SDNode *AndNode, SelectionDAG &DAG,
26947 unsigned CCIndex, unsigned CmpIndex,
26948 unsigned CC) {
26949 ConstantSDNode *SubsC = dyn_cast<ConstantSDNode>(SubsNode->getOperand(1));
26950 if (!SubsC)
26951 return SDValue();
26952
26953 APInt SubsAP = SubsC->getAPIntValue();
26954 if (CC == AArch64CC::HI) {
26955 if (!SubsAP.isMask())
26956 return SDValue();
26957 } else if (CC == AArch64CC::LO) {
26958 if (!SubsAP.isPowerOf2())
26959 return SDValue();
26960 } else
26961 return SDValue();
26962
26963 ConstantSDNode *AndC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1));
26964 if (!AndC)
26965 return SDValue();
26966
26967 APInt MaskAP = CC == AArch64CC::HI ? SubsAP : (SubsAP - 1);
26968
26969 SDLoc DL(N);
26970 APInt AndSMask = (~MaskAP) & AndC->getAPIntValue();
26971 SDValue ANDS = DAG.getNode(
26972 AArch64ISD::ANDS, DL, SubsNode->getVTList(), AndNode->getOperand(0),
26973 DAG.getConstant(AndSMask, DL, SubsC->getValueType(0)));
26974 SDValue AArch64_CC =
26976 N->getOperand(CCIndex)->getValueType(0));
26977
26978 // For now, only performCSELCombine and performBRCONDCombine call this
26979 // function, and both of them pass 2 for CCIndex and 3 for CmpIndex on nodes
26980 // with 4 operands. So just initialize the operands directly to simplify the
26981 // code. If some other caller ever uses different CCIndex/CmpIndex values,
26982 // this will need to be rewritten with a loop.
26983 // TODO: Do we need to assert that the number of operands is 4 here?
26984 assert((CCIndex == 2 && CmpIndex == 3) &&
26985 "Expected CCIndex to be 2 and CmpIndex to be 3.");
26986 SDValue Ops[] = {N->getOperand(0), N->getOperand(1), AArch64_CC,
26987 ANDS.getValue(1)};
26988 return DAG.getNode(N->getOpcode(), N, N->getVTList(), Ops);
26989}
26990
26991static
26992SDValue performCONDCombine(SDNode *N,
26993 TargetLowering::DAGCombinerInfo &DCI,
26994 SelectionDAG &DAG, unsigned CCIndex,
26995 unsigned CmpIndex) {
26996 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
26997 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
26998 unsigned CondOpcode = SubsNode->getOpcode();
26999
27000 if (CondOpcode != AArch64ISD::SUBS || SubsNode->hasAnyUseOfValue(0) ||
27001 !SubsNode->hasOneUse())
27002 return SDValue();
27003
27004 // There is a SUBS feeding this condition. Is it fed by a mask we can
27005 // use?
27006
27007 SDNode *AndNode = SubsNode->getOperand(0).getNode();
27008 unsigned MaskBits = 0;
27009
27010 if (AndNode->getOpcode() != ISD::AND)
27011 return SDValue();
27012
27013 if (SDValue Val = performSubsToAndsCombine(N, SubsNode, AndNode, DAG, CCIndex,
27014 CmpIndex, CC))
27015 return Val;
27016
27017 // X & M ?= C --> (C << clz(M)) ?= (X << clz(M)) where M is a non-empty
27018 // sequence of ones starting at the least significant bit with the remainder
27019 // zero and C is a constant s.t. (C & ~M) == 0 that cannot be materialised
27020 // into a SUBS (immediate). The transformed form can be matched into a SUBS
27021 // (shifted register).
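//
// For example (an illustrative instance of this rewrite): for i32 with
// M == 0x00ffffff and C == 0xabcdef (not a legal arithmetic immediate),
// clz(M) == 8, so (x & 0x00ffffff) ==/!= C becomes (C << 8) ==/!= (x << 8),
// which fits a SUBS with a shifted-register operand.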
27022 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && AndNode->hasOneUse() &&
27023 isa<ConstantSDNode>(AndNode->getOperand(1)) &&
27024 isa<ConstantSDNode>(SubsNode->getOperand(1))) {
27025 SDValue X = AndNode->getOperand(0);
27026 APInt M = AndNode->getConstantOperandAPInt(1);
27027 APInt C = SubsNode->getConstantOperandAPInt(1);
27028
27029 if (M.isMask() && C.isSubsetOf(M) && !isLegalArithImmed(C.getZExtValue())) {
27030 SDLoc DL(SubsNode);
27031 EVT VT = SubsNode->getValueType(0);
27032 unsigned ShiftAmt = M.countl_zero();
27033 SDValue ShiftedX = DAG.getNode(
27034 ISD::SHL, DL, VT, X, DAG.getShiftAmountConstant(ShiftAmt, VT, DL));
27035 SDValue ShiftedC = DAG.getConstant(C << ShiftAmt, DL, VT);
27036 SDValue NewSubs = DAG.getNode(AArch64ISD::SUBS, DL, SubsNode->getVTList(),
27037 ShiftedC, ShiftedX);
27038 DCI.CombineTo(SubsNode, NewSubs, NewSubs.getValue(1));
27039 return SDValue(N, 0);
27040 }
27041 }
27042
27043 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
27044 uint32_t CNV = CN->getZExtValue();
27045 if (CNV == 255)
27046 MaskBits = 8;
27047 else if (CNV == 65535)
27048 MaskBits = 16;
27049 }
27050
27051 if (!MaskBits)
27052 return SDValue();
27053
27054 SDValue AddValue = AndNode->getOperand(0);
27055
27056 if (AddValue.getOpcode() != ISD::ADD)
27057 return SDValue();
27058
27059 // The basic dag structure is correct, grab the inputs and validate them.
27060
27061 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
27062 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
27063 SDValue SubsInputValue = SubsNode->getOperand(1);
27064
27065 // The mask is present and the provenance of all the values is a smaller type,
27066 // so let's see if the mask is superfluous.
27067
27068 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
27069 !isa<ConstantSDNode>(SubsInputValue.getNode()))
27070 return SDValue();
27071
27072 ISD::LoadExtType ExtType;
27073
27074 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
27075 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
27076 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
27077 return SDValue();
27078
27079 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
27080 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
27081 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
27082 return SDValue();
27083
27084 // The AND is not necessary, remove it.
27085
27086 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
27087 SubsNode->getValueType(1));
27088 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
27089
27090 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
27091 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
27092
27093 return SDValue(N, 0);
27094}
27095
27096// Optimize compare with zero and branch.
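// For example (an illustrative sketch of the fold below): a flag-setting
// compare against zero feeding an EQ branch, (brcond Dest, EQ, (SUBS x, 0)),
// where the SUBS value is unused and its flags have a single use, can be
// emitted as a single "cbz x, Dest" instead of a cmp followed by b.eq; NE
// similarly maps to cbnz.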
27099 SelectionDAG &DAG) {
27101 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
27102 // will not be produced, as they are conditional branch instructions that do
27103 // not set flags.
27104 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
27105 return SDValue();
27106
27107 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
27108 N = NV.getNode();
27109 SDValue Chain = N->getOperand(0);
27110 SDValue Dest = N->getOperand(1);
27111 SDValue CCVal = N->getOperand(2);
27112 SDValue Cmp = N->getOperand(3);
27113
27114 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
27115 unsigned CC = CCVal->getAsZExtVal();
27116 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
27117 return SDValue();
27118
27119 // Fold away brcond(NE, cmp(csel(1, 0, CC, Cmp), 1)) -> brcond(~CC, Cmp)
27120 if (isCMP(Cmp) && CC == AArch64CC::NE && isOneConstant(Cmp.getOperand(1))) {
27121 SDValue CSel = Cmp.getOperand(0);
27122 auto CSelCC = getCSETCondCode(CSel);
27123 if (CSelCC) {
27124 SDLoc DL(N);
27125 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), Chain, Dest,
27126 getCondCode(DAG, getInvertedCondCode(*CSelCC)),
27127 CSel.getOperand(3));
27128 }
27129 }
27130
27131 unsigned CmpOpc = Cmp.getOpcode();
27132 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
27133 return SDValue();
27134
27135 // Only attempt folding if there is only one use of the flag and no use of the
27136 // value.
27137 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
27138 return SDValue();
27139
27140 SDValue LHS = Cmp.getOperand(0);
27141 SDValue RHS = Cmp.getOperand(1);
27142
27143 assert(LHS.getValueType() == RHS.getValueType() &&
27144 "Expected the value type to be the same for both operands!");
27145 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
27146 return SDValue();
27147
27148 if (isNullConstant(LHS))
27149 std::swap(LHS, RHS);
27150
27151 if (!isNullConstant(RHS))
27152 return SDValue();
27153
27154 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
27155 LHS.getOpcode() == ISD::SRL)
27156 return SDValue();
27157
27158 // Fold the compare into the branch instruction.
27159 SDValue BR;
27160 if (CC == AArch64CC::EQ)
27161 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
27162 else
27163 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
27164
27165 // Do not add new nodes to DAG combiner worklist.
27166 DCI.CombineTo(N, BR, false);
27167
27168 return SDValue();
27169}
27170
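// foldCSELofCTTZ (below) folds a CSEL that guards a count-trailing-zeros
// result with an explicit zero test into a simple mask. Illustrative
// reasoning, relying on ISD::CTTZ returning the bit width for a zero input:
// for i32, the select (x == 0 ? 0 : cttz(x)) equals cttz(x) & 31, since
// cttz(0) == 32 and 32 & 31 == 0, while for nonzero x the mask is a no-op.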
27172 unsigned CC = N->getConstantOperandVal(2);
27173 SDValue SUBS = N->getOperand(3);
27174 SDValue Zero, CTTZ;
27175
27176 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
27177 Zero = N->getOperand(0);
27178 CTTZ = N->getOperand(1);
27179 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
27180 Zero = N->getOperand(1);
27181 CTTZ = N->getOperand(0);
27182 } else
27183 return SDValue();
27184
27185 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
27186 (CTTZ.getOpcode() == ISD::TRUNCATE &&
27187 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
27188 return SDValue();
27189
27190 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
27191 "Illegal type in CTTZ folding");
27192
27193 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
27194 return SDValue();
27195
27196 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
27197 ? CTTZ.getOperand(0).getOperand(0)
27198 : CTTZ.getOperand(0);
27199
27200 if (X != SUBS.getOperand(0))
27201 return SDValue();
27202
27203 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
27204 ? CTTZ.getOperand(0).getValueSizeInBits()
27205 : CTTZ.getValueSizeInBits();
27206 SDValue BitWidthMinusOne =
27207 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
27208 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
27209 BitWidthMinusOne);
27210}
27211
27212// (CSEL l r EQ (CMP (CSEL x y cc2 cond) x)) => (CSEL l r cc2 cond)
27213// (CSEL l r EQ (CMP (CSEL x y cc2 cond) y)) => (CSEL l r !cc2 cond)
27214// Where x and y are constants and x != y
27215
27216// (CSEL l r NE (CMP (CSEL x y cc2 cond) x)) => (CSEL l r !cc2 cond)
27217// (CSEL l r NE (CMP (CSEL x y cc2 cond) y)) => (CSEL l r cc2 cond)
27218// Where x and y are constants and x != y
27220 SDValue L = Op->getOperand(0);
27221 SDValue R = Op->getOperand(1);
27222 AArch64CC::CondCode OpCC =
27223 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
27224
27225 SDValue OpCmp = Op->getOperand(3);
27226 if (!isCMP(OpCmp))
27227 return SDValue();
27228
27229 SDValue CmpLHS = OpCmp.getOperand(0);
27230 SDValue CmpRHS = OpCmp.getOperand(1);
27231
27232 if (CmpRHS.getOpcode() == AArch64ISD::CSEL)
27233 std::swap(CmpLHS, CmpRHS);
27234 else if (CmpLHS.getOpcode() != AArch64ISD::CSEL)
27235 return SDValue();
27236
27237 SDValue X = CmpLHS->getOperand(0);
27238 SDValue Y = CmpLHS->getOperand(1);
27239 if (!isa<ConstantSDNode>(X) || !isa<ConstantSDNode>(Y) || X == Y) {
27240 return SDValue();
27241 }
27242
27243 // If one of the constants is an opaque constant, the x and y SDNodes can
27244 // still be different even though the real values are the same. So compare
27245 // the APInt values here to make sure the code is correct.
27246 ConstantSDNode *CX = cast<ConstantSDNode>(X);
27247 ConstantSDNode *CY = cast<ConstantSDNode>(Y);
27248 if (CX->getAPIntValue() == CY->getAPIntValue())
27249 return SDValue();
27250
27251 AArch64CC::CondCode CC =
27252 static_cast<AArch64CC::CondCode>(CmpLHS->getConstantOperandVal(2));
27253 SDValue Cond = CmpLHS->getOperand(3);
27254
27255 if (CmpRHS == Y)
27257 else if (CmpRHS != X)
27258 return SDValue();
27259
27260 if (OpCC == AArch64CC::NE)
27262 else if (OpCC != AArch64CC::EQ)
27263 return SDValue();
27264
27265 SDLoc DL(Op);
27266 EVT VT = Op->getValueType(0);
27267
27268 SDValue CCValue = getCondCode(DAG, CC);
27269 return DAG.getNode(AArch64ISD::CSEL, DL, VT, L, R, CCValue, Cond);
27270}
27271
27272// Reassociate the true/false expressions of a CSEL instruction to obtain a
27273// common subexpression with the comparison instruction. For example, change
27274// (CSEL (ADD (ADD x y) -c) f LO (SUBS x c)) to
27275// (CSEL (ADD (SUBS x c) y) f LO (SUBS x c)) such that (SUBS x c) is a common
27276// subexpression.
27278 SDValue SubsNode = N->getOperand(3);
27279 if (SubsNode.getOpcode() != AArch64ISD::SUBS || !SubsNode.hasOneUse())
27280 return SDValue();
27281
27282 SDValue CmpOpToMatch = SubsNode.getOperand(1);
27283 SDValue CmpOpOther = SubsNode.getOperand(0);
27284 EVT VT = N->getValueType(0);
27285
27286 unsigned ExpectedOpcode;
27287 SDValue ExpectedOp;
27288 SDValue SubsOp;
27289 auto *CmpOpConst = dyn_cast<ConstantSDNode>(CmpOpToMatch);
27290 if (CmpOpConst) {
27291 ExpectedOpcode = ISD::ADD;
27292 ExpectedOp =
27293 DAG.getConstant(-CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
27294 CmpOpConst->getValueType(0));
27295 SubsOp = DAG.getConstant(CmpOpConst->getAPIntValue(), SDLoc(CmpOpConst),
27296 CmpOpConst->getValueType(0));
27297 } else {
27298 ExpectedOpcode = ISD::SUB;
27299 ExpectedOp = CmpOpToMatch;
27300 SubsOp = CmpOpToMatch;
27301 }
27302
27303 // Get the operand that can be reassociated with the SUBS instruction.
27304 auto GetReassociationOp = [&](SDValue Op, SDValue ExpectedOp) {
27305 if (Op.getOpcode() != ExpectedOpcode)
27306 return SDValue();
27307 if (Op.getOperand(0).getOpcode() != ISD::ADD ||
27308 !Op.getOperand(0).hasOneUse())
27309 return SDValue();
27310 SDValue X = Op.getOperand(0).getOperand(0);
27311 SDValue Y = Op.getOperand(0).getOperand(1);
27312 if (X != CmpOpOther)
27313 std::swap(X, Y);
27314 if (X != CmpOpOther)
27315 return SDValue();
27316 if (ExpectedOp != Op.getOperand(1))
27317 return SDValue();
27318 return Y;
27319 };
27320
27321 // Try the reassociation using the given constant and condition code.
27322 auto Fold = [&](AArch64CC::CondCode NewCC, SDValue ExpectedOp,
27323 SDValue SubsOp) {
27324 SDValue TReassocOp = GetReassociationOp(N->getOperand(0), ExpectedOp);
27325 SDValue FReassocOp = GetReassociationOp(N->getOperand(1), ExpectedOp);
27326 if (!TReassocOp && !FReassocOp)
27327 return SDValue();
27328
27329 SDValue NewCmp =
27330 DAG.getNode(AArch64ISD::SUBS, SDLoc(SubsNode),
27331 DAG.getVTList(VT, FlagsVT), CmpOpOther, SubsOp);
27332
27333 auto Reassociate = [&](SDValue ReassocOp, unsigned OpNum) {
27334 if (!ReassocOp)
27335 return N->getOperand(OpNum);
27336 SDValue Res = DAG.getNode(ISD::ADD, SDLoc(N->getOperand(OpNum)), VT,
27337 NewCmp.getValue(0), ReassocOp);
27338 DAG.ReplaceAllUsesWith(N->getOperand(OpNum), Res);
27339 return Res;
27340 };
27341
27342 SDValue TValReassoc = Reassociate(TReassocOp, 0);
27343 SDValue FValReassoc = Reassociate(FReassocOp, 1);
27344 return DAG.getNode(AArch64ISD::CSEL, SDLoc(N), VT, TValReassoc, FValReassoc,
27345 getCondCode(DAG, NewCC), NewCmp.getValue(1));
27346 };
27347
27348 auto CC = static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
27349
27350 // First, try to eliminate the compare instruction by searching for a
27351 // subtraction with the same constant.
27352 if (SDValue R = Fold(CC, ExpectedOp, SubsOp))
27353 return R;
27354
27355 if (!CmpOpConst) {
27356 // Try again with the operands of the SUBS instruction and the condition
27357 // swapped. Due to canonicalization, this only helps for non-constant
27358 // operands of the SUBS instruction.
27359 std::swap(CmpOpToMatch, CmpOpOther);
27360 if (SDValue R = Fold(getSwappedCondition(CC), CmpOpToMatch, CmpOpToMatch))
27361 return R;
27362 return SDValue();
27363 }
27364
27365 if ((CC == AArch64CC::EQ || CC == AArch64CC::NE) && !CmpOpConst->isZero())
27366 return SDValue();
27367
27368 // Next, search for a subtraction with a slightly different constant. By
27369 // adjusting the condition code, we can still eliminate the compare
27370 // instruction. Adjusting the constant is only valid if it does not result
27371 // in signed/unsigned wrap for signed/unsigned comparisons, respectively.
27372 // Since such comparisons are trivially true/false, we should not encounter
27373 // them here but check for them nevertheless to be on the safe side.
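  // For example, a CSEL conditioned on LO against a constant C tests x <u C,
  // which for C != 0 is equivalent to x <=u C - 1 (condition LS against
  // C - 1), so the comparison can instead be done against C - 1, possibly
  // exposing a common subexpression with the select's ADD operands (an
  // illustrative instance of the adjustments below).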
27374 auto CheckedFold = [&](bool Check, APInt NewCmpConst,
27375 AArch64CC::CondCode NewCC) {
27376 auto ExpectedOp = DAG.getConstant(-NewCmpConst, SDLoc(CmpOpConst),
27377 CmpOpConst->getValueType(0));
27378 auto SubsOp = DAG.getConstant(NewCmpConst, SDLoc(CmpOpConst),
27379 CmpOpConst->getValueType(0));
27380 return Check ? Fold(NewCC, ExpectedOp, SubsOp) : SDValue();
27381 };
27382 switch (CC) {
27383 case AArch64CC::EQ:
27384 case AArch64CC::LS:
27385 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
27386 CmpOpConst->getAPIntValue() + 1, AArch64CC::LO);
27387 case AArch64CC::NE:
27388 case AArch64CC::HI:
27389 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxValue(),
27390 CmpOpConst->getAPIntValue() + 1, AArch64CC::HS);
27391 case AArch64CC::LO:
27392 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
27393 CmpOpConst->getAPIntValue() - 1, AArch64CC::LS);
27394 case AArch64CC::HS:
27395 return CheckedFold(!CmpOpConst->getAPIntValue().isZero(),
27396 CmpOpConst->getAPIntValue() - 1, AArch64CC::HI);
27397 case AArch64CC::LT:
27398 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
27399 CmpOpConst->getAPIntValue() - 1, AArch64CC::LE);
27400 case AArch64CC::LE:
27401 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
27402 CmpOpConst->getAPIntValue() + 1, AArch64CC::LT);
27403 case AArch64CC::GT:
27404 return CheckedFold(!CmpOpConst->getAPIntValue().isMaxSignedValue(),
27405 CmpOpConst->getAPIntValue() + 1, AArch64CC::GE);
27406 case AArch64CC::GE:
27407 return CheckedFold(!CmpOpConst->getAPIntValue().isMinSignedValue(),
27408 CmpOpConst->getAPIntValue() - 1, AArch64CC::GT);
27409 default:
27410 return SDValue();
27411 }
27412}
27413
27415 AArch64CC::CondCode OpCC =
27416 static_cast<AArch64CC::CondCode>(Op->getConstantOperandVal(2));
27417
27418 if (OpCC != AArch64CC::NE)
27419 return SDValue();
27420
27421 SDValue PTest = Op->getOperand(3);
27422 if (PTest.getOpcode() != AArch64ISD::PTEST_ANY)
27423 return SDValue();
27424
27425 SDValue TruePred = PTest.getOperand(0);
27426 SDValue AnyPred = PTest.getOperand(1);
27427
27428 if (TruePred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
27429 TruePred = TruePred.getOperand(0);
27430
27431 if (AnyPred.getOpcode() == AArch64ISD::REINTERPRET_CAST)
27432 AnyPred = AnyPred.getOperand(0);
27433
27434 if (TruePred != AnyPred && !isAllActivePredicate(DAG, TruePred))
27435 return SDValue();
27436
27437 SDValue LastB = Op->getOperand(0);
27438 SDValue Default = Op->getOperand(1);
27439
27440 if (LastB.getOpcode() != AArch64ISD::LASTB || LastB.getOperand(0) != AnyPred)
27441 return SDValue();
27442
27443 return DAG.getNode(AArch64ISD::CLASTB_N, SDLoc(Op), Op->getValueType(0),
27444 AnyPred, Default, LastB.getOperand(1));
27445}
27446
27447// Optimize CSEL instructions
27450 SelectionDAG &DAG) {
27451 // CSEL x, x, cc -> x
27452 if (N->getOperand(0) == N->getOperand(1))
27453 return N->getOperand(0);
27454
27455 if (SDValue R = foldCSELOfCSEL(N, DAG))
27456 return R;
27457
27458 // Try to reassociate the true/false expressions so that we can do CSE with
27459 // a SUBS instruction used to perform the comparison.
27460 if (SDValue R = reassociateCSELOperandsForCSE(N, DAG))
27461 return R;
27462
27463 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
27464 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
27465 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
27466 return Folded;
27467
27468 // CSEL a, b, cc, SUBS(x, y) -> CSEL a, b, swapped(cc), SUBS(y, x)
27469 // if SUB(y, x) already exists and we can produce a swapped predicate for cc.
27470 SDValue Cond = N->getOperand(3);
27471 if (DCI.isAfterLegalizeDAG() && Cond.getOpcode() == AArch64ISD::SUBS &&
27472 Cond.hasOneUse() && Cond->hasNUsesOfValue(0, 0) &&
27473 DAG.doesNodeExist(ISD::SUB, N->getVTList(),
27474 {Cond.getOperand(1), Cond.getOperand(0)}) &&
27475 !DAG.doesNodeExist(ISD::SUB, N->getVTList(),
27476 {Cond.getOperand(0), Cond.getOperand(1)}) &&
27477 !isNullConstant(Cond.getOperand(1))) {
27478 AArch64CC::CondCode OldCond =
27479 static_cast<AArch64CC::CondCode>(N->getConstantOperandVal(2));
27480 AArch64CC::CondCode NewCond = getSwappedCondition(OldCond);
27481 if (NewCond != AArch64CC::AL) {
27482 SDLoc DL(N);
27483 SDValue Sub = DAG.getNode(AArch64ISD::SUBS, DL, Cond->getVTList(),
27484 Cond.getOperand(1), Cond.getOperand(0));
27485 return DAG.getNode(AArch64ISD::CSEL, DL, N->getVTList(), N->getOperand(0),
27486 N->getOperand(1), getCondCode(DAG, NewCond),
27487 Sub.getValue(1));
27488 }
27489 }
27490
27491 // CSEL (LASTB P, Z), X, NE(ANY P) -> CLASTB P, X, Z
27492 if (SDValue CondLast = foldCSELofLASTB(N, DAG))
27493 return CondLast;
27494
27495 return performCONDCombine(N, DCI, DAG, 2, 3);
27496}
27497
27498// Try to re-use an already extended operand of a vector SetCC feeding an
27499// extended select. Doing so avoids requiring another full extension of the
27500// SET_CC result when lowering the select.
27502 EVT Op0MVT = Op->getOperand(0).getValueType();
27503 if (!Op0MVT.isVector() || Op->use_empty())
27504 return SDValue();
27505
27506 // Make sure that all uses of Op are VSELECTs with matching result types where
27507 // the result type has a larger element type than the SetCC operand.
27508 SDNode *FirstUse = *Op->user_begin();
27509 if (FirstUse->getOpcode() != ISD::VSELECT)
27510 return SDValue();
27511 EVT UseMVT = FirstUse->getValueType(0);
27512 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
27513 return SDValue();
27514 if (any_of(Op->users(), [&UseMVT](const SDNode *N) {
27515 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
27516 }))
27517 return SDValue();
27518
27519 APInt V;
27520 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
27521 return SDValue();
27522
27523 SDLoc DL(Op);
27524 SDValue Op0ExtV;
27525 SDValue Op1ExtV;
27526 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
27527 // Check if the first operand of the SET_CC is already extended. If it is,
27528 // split the SET_CC and re-use the extended version of the operand.
27529 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
27530 Op->getOperand(0));
27531 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
27532 Op->getOperand(0));
27533 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
27534 Op0ExtV = SDValue(Op0SExt, 0);
27535 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
27536 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
27537 Op0ExtV = SDValue(Op0ZExt, 0);
27538 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
27539 } else
27540 return SDValue();
27541
27542 return DAG.getNode(ISD::SETCC, DL,
27543 UseMVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
27544 Op0ExtV, Op1ExtV, Op->getOperand(2));
27545}
27546
27547static SDValue
27549 SelectionDAG &DAG) {
27550 SDValue Vec = N->getOperand(0);
27551 if (DCI.isBeforeLegalize() &&
27552 Vec.getValueType().getVectorElementType() == MVT::i1 &&
27555 SDLoc DL(N);
27556 return getVectorBitwiseReduce(N->getOpcode(), Vec, N->getValueType(0), DL,
27557 DAG);
27558 }
27559
27560 return SDValue();
27561}
27562
27565 SelectionDAG &DAG) {
27566 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
27567 SDValue LHS = N->getOperand(0);
27568 SDValue RHS = N->getOperand(1);
27569 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
27570 SDLoc DL(N);
27571 EVT VT = N->getValueType(0);
27572
27573 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
27574 return V;
27575
27576 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
27577 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
27578 LHS->getOpcode() == AArch64ISD::CSEL &&
27579 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
27580 LHS->hasOneUse()) {
27581 // Invert CSEL's condition.
27582 auto OldCond =
27583 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
27584 auto NewCond = getInvertedCondCode(OldCond);
27585
27586 // csel 0, 1, !cond, X
27587 SDValue CSEL = DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(),
27588 LHS.getOperand(0), LHS.getOperand(1),
27589 getCondCode(DAG, NewCond), LHS.getOperand(3));
27590 return DAG.getZExtOrTrunc(CSEL, DL, VT);
27591 }
27592
27593 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
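  // For example (an illustrative instance of this rewrite): (srl x, 16) != 0
  // becomes (x & 0xffff0000) != 0, which can be selected as a single TST with
  // a logical immediate.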
27594 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
27595 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
27596 LHS->hasOneUse()) {
27597 EVT TstVT = LHS->getValueType(0);
27598 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64 &&
27599 LHS->getConstantOperandVal(1) < TstVT.getFixedSizeInBits()) {
27600 // This pattern will be optimized better in emitComparison.
27601 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
27602 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
27603 DAG.getSignedConstant(TstImm, DL, TstVT));
27604 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
27605 }
27606 }
27607
27608 // When a bool vector bitcast is only compared against zero or all ones, it
27609 // is enough to test a widened scalar view of the comparison bits. This
27610 // avoids materializing the packed bitmask via vectorToScalarBitmask().
27611 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
27612 (Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
27614 LHS->getOpcode() == ISD::BITCAST) {
27615 EVT ToVT = LHS->getValueType(0);
27616 EVT FromVT = LHS->getOperand(0).getValueType();
27617 if (FromVT.isFixedLengthVector() &&
27618 FromVT.getVectorElementType() == MVT::i1) {
27619 SDValue CompareLHS, CompareRHS;
27620 if (getBoolVectorBitcastCompare(LHS.getOperand(0), RHS, DL, DAG,
27621 CompareLHS, CompareRHS))
27622 return DAG.getSetCC(DL, VT, CompareLHS, CompareRHS, Cond);
27623
27624 bool IsNull = isNullConstant(RHS);
27626 DL, MVT::i1, LHS->getOperand(0));
27627 LHS = DAG.getNode(IsNull ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, ToVT,
27628 LHS);
27629 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
27630 }
27631 }
27632
27633 // Try to perform the memcmp when the result is tested for [in]equality with 0
27634 if (SDValue V = performOrXorChainCombine(N, DAG))
27635 return V;
27636
27637 EVT CmpVT = LHS.getValueType();
27638
27639 // NOTE: This exists as a combine only because it proved too awkward to match
27640 // splat(1) across all the NEON types during isel.
27641 APInt SplatLHSVal;
27642 if (CmpVT.isInteger() && Cond == ISD::SETGT &&
27643 ISD::isConstantSplatVector(LHS.getNode(), SplatLHSVal) &&
27644 SplatLHSVal.isOne())
27645 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, CmpVT), RHS, ISD::SETGE);
27646
27647 return SDValue();
27648}
27649
27652 SelectionDAG &DAG) {
27653 assert(N->getOpcode() == ISD::SELECT_CC && "Unexpected opcode!");
27654
27655 if (!DCI.isBeforeLegalize())
27656 return SDValue();
27657
27658 SDValue LHS = N->getOperand(0);
27659 SDValue RHS = N->getOperand(1);
27660 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(4))->get();
27661 if (Cond != ISD::SETEQ && Cond != ISD::SETNE)
27662 return SDValue();
27663
27665 RHS.getOpcode() == ISD::BITCAST)
27666 std::swap(LHS, RHS);
27667
27669 LHS.getOpcode() != ISD::BITCAST)
27670 return SDValue();
27671
27672 EVT FromVT = LHS.getOperand(0).getValueType();
27673 if (!FromVT.isFixedLengthVector() || FromVT.getVectorElementType() != MVT::i1)
27674 return SDValue();
27675
27676 SDLoc DL(N);
27677 SDValue CompareLHS, CompareRHS;
27678 if (!getBoolVectorBitcastCompare(LHS.getOperand(0), RHS, DL, DAG, CompareLHS,
27679 CompareRHS))
27680 return SDValue();
27681
27682 return DAG.getNode(ISD::SELECT_CC, DL, N->getValueType(0), CompareLHS,
27683 CompareRHS, N->getOperand(2), N->getOperand(3),
27684 N->getOperand(4));
27685}
27686
27687// Replace a flag-setting operator (eg ANDS) with the generic version
27688// (eg AND) if the flag is unused.
27691 unsigned GenericOpcode) {
27692 SDLoc DL(N);
27693 SDValue LHS = N->getOperand(0);
27694 SDValue RHS = N->getOperand(1);
27695 EVT VT = N->getValueType(0);
27696
27697 // If the flag result isn't used, convert back to a generic opcode.
27698 if (!N->hasAnyUseOfValue(1)) {
27699 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
27700 return DCI.CombineTo(N, Res, SDValue(N, 1));
27701 }
27702
27703 // Combine equivalent generic nodes into this node, re-using the result.
27704 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
27705 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS},
27706 /*AllowCommute=*/true))
27707 DCI.CombineTo(Generic, SDValue(N, 0));
27708
27709 return SDValue();
27710}
27711
27714 SelectionDAG &DAG = DCI.DAG;
27716 return R;
27717
27718 // If we have no uses of the AND value, use performANDORCSELCombine to try to
27719 // convert ANDS(CSET(CMP), CSET(CMP)) into CMP(CSET(CCMP(CMP))). The outer
27720 // CMP(CSET(...)) should be removed by other combines, folded into the use
27721 // of the CMP.
27722 if (!N->hasAnyUseOfValue(0))
27723 if (SDValue R = performANDORCSELCombine(N, DAG))
27724 return DAG.getNode(AArch64ISD::SUBS, SDLoc(N), N->getVTList(), R,
27725 DAG.getConstant(0, SDLoc(N), N->getValueType(0)));
27726
27727 return SDValue();
27728}
27729
27731 // setcc_merge_zero pred
27732 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
27733 // => extract_subvector (inner setcc_merge_zero)
27734 SDValue Pred = N->getOperand(0);
27735 SDValue LHS = N->getOperand(1);
27736 SDValue RHS = N->getOperand(2);
27737 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
27738
27739 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
27740 LHS->getOpcode() != ISD::SIGN_EXTEND)
27741 return SDValue();
27742
27743 SDValue Extract = LHS->getOperand(0);
27744 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
27745 Extract->getValueType(0) != N->getValueType(0) ||
27746 Extract->getConstantOperandVal(1) != 0)
27747 return SDValue();
27748
27749 SDValue InnerSetCC = Extract->getOperand(0);
27750 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
27751 return SDValue();
27752
27753 // By this point we've effectively got
27754 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
27755 // lanes are already zero then the trunc(sext()) sequence is redundant and we
27756 // can operate on A directly.
27757 SDValue InnerPred = InnerSetCC.getOperand(0);
27758 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
27759 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
27760 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
27761 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
27762 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
27763 return Extract;
27764
27765 return SDValue();
27766}
27767
27768static bool isSignExtInReg(const SDValue &V) {
27769 if (V.getOpcode() != AArch64ISD::VASHR ||
27770 V.getOperand(0).getOpcode() != AArch64ISD::VSHL)
27771 return false;
27772
27773 unsigned BitWidth = V->getValueType(0).getScalarSizeInBits();
27774 unsigned ShiftAmtR = V.getConstantOperandVal(1);
27775 unsigned ShiftAmtL = V.getOperand(0).getConstantOperandVal(1);
27776 return (ShiftAmtR == ShiftAmtL && ShiftAmtR == (BitWidth - 1));
27777}
27778
27779static SDValue
27781 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
27782 "Unexpected opcode!");
27783
27784 SelectionDAG &DAG = DCI.DAG;
27785 SDValue Pred = N->getOperand(0);
27786 SDValue LHS = N->getOperand(1);
27787 SDValue RHS = N->getOperand(2);
27788 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
27789
27790 if (SDValue V = performSetCCPunpkCombine(N, DAG))
27791 return V;
27792
27793 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
27794 LHS->getOpcode() == ISD::SIGN_EXTEND &&
27795 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
27796 // setcc_merge_zero(
27797 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
27798 // => setcc_merge_zero(pred, ...)
27799 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
27800 LHS->getOperand(0)->getOperand(0) == Pred)
27801 return LHS->getOperand(0);
27802
27803 // setcc_merge_zero(
27804 // all_active, extend(nxvNi1 ...), != splat(0))
27805 // -> nxvNi1 ...
27806 if (isAllActivePredicate(DAG, Pred))
27807 return LHS->getOperand(0);
27808
27809 // setcc_merge_zero(
27810 // pred, extend(nxvNi1 ...), != splat(0))
27811 // -> nxvNi1 and(pred, ...)
27812 if (DCI.isAfterLegalizeDAG())
27813 // Do this after legalization to allow more folds on setcc_merge_zero
27814 // to be recognized.
27815 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
27816 LHS->getOperand(0), Pred);
27817 }
27818
27819 // setcc_merge_zero(
27820 // pred, insert_subvector(undef, signext_inreg(x), 0), != splat(0))
27821 // => setcc_merge_zero(
27822 // pred, insert_subvector(undef, shl(x), 0), != splat(0))
27823 // or:
27824 // => setcc_merge_zero(
27825 // pred, insert_subvector(undef, x, 0), != splat(0))
27826 // iff it can be proven that x is already sign-extended.
27827 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
27828 LHS->getOpcode() == ISD::INSERT_SUBVECTOR && LHS.hasOneUse()) {
27829 SDValue L0 = LHS->getOperand(0);
27830 SDValue L1 = LHS->getOperand(1);
27831 SDValue L2 = LHS->getOperand(2);
27832
27833 if (L0.isUndef() && isNullConstant(L2) && isSignExtInReg(L1)) {
27834 SDLoc DL(N);
27835 SDValue ExtVal = L1.getOperand(0);
27836 unsigned NumShiftBits = ExtVal.getConstantOperandVal(1);
27837 SDValue ShlSrc = ExtVal.getOperand(0);
27838 if (DCI.DAG.ComputeNumSignBits(ShlSrc) > NumShiftBits)
27839 ExtVal = ShlSrc;
27841 LHS.getValueType(), L0, ExtVal, L2);
27842 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, N->getValueType(0),
27843 Pred, NewLHS, RHS, N->getOperand(3));
27844 }
27845 }
27846
27847 return SDValue();
27848}
27849
27850// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
27851// as well as whether the test should be inverted. This code is required to
27852// catch these cases (as opposed to standard dag combines) because
27853// AArch64ISD::TBZ is matched during legalization.
27854static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
27855 SelectionDAG &DAG) {
27856
27857 if (!Op->hasOneUse())
27858 return Op;
27859
27860 // We don't handle undef/constant-fold cases below, as they should have
27861 // already been taken care of (e.g. and of 0, test of undefined shifted bits,
27862 // etc.)
27863
27864 // (tbz (trunc x), b) -> (tbz x, b)
27865 // This case is just here to enable more of the below cases to be caught.
27866 if (Op->getOpcode() == ISD::TRUNCATE &&
27867 Bit < Op->getValueType(0).getSizeInBits()) {
27868 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27869 }
27870
27871 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
27872 if (Op->getOpcode() == ISD::ANY_EXTEND &&
27873 Bit < Op->getOperand(0).getValueSizeInBits()) {
27874 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27875 }
27876
27877 if (Op->getNumOperands() != 2)
27878 return Op;
27879
27880 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
27881 if (!C)
27882 return Op;
27883
27884 switch (Op->getOpcode()) {
27885 default:
27886 return Op;
27887
27888 // (tbz (and x, m), b) -> (tbz x, b)
27889 case ISD::AND:
27890 if ((C->getZExtValue() >> Bit) & 1)
27891 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27892 return Op;
27893
27894 // (tbz (shl x, c), b) -> (tbz x, b-c)
27895 case ISD::SHL:
27896 if (C->getZExtValue() <= Bit &&
27897 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
27898 Bit = Bit - C->getZExtValue();
27899 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27900 }
27901 return Op;
27902
27903 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
27904 case ISD::SRA:
27905 Bit = Bit + C->getZExtValue();
27906 if (Bit >= Op->getValueType(0).getSizeInBits())
27907 Bit = Op->getValueType(0).getSizeInBits() - 1;
27908 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27909
27910 // (tbz (srl x, c), b) -> (tbz x, b+c)
27911 case ISD::SRL:
27912 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
27913 Bit = Bit + C->getZExtValue();
27914 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27915 }
27916 return Op;
27917
27918 // (tbz (xor x, -1), b) -> (tbnz x, b)
27919 case ISD::XOR:
27920 if ((C->getZExtValue() >> Bit) & 1)
27921 Invert = !Invert;
27922 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
27923 }
27924}
27925
27926// Optimize test single bit zero/non-zero and branch.
27929 SelectionDAG &DAG) {
27930 unsigned Bit = N->getConstantOperandVal(2);
27931 bool Invert = false;
27932 SDValue TestSrc = N->getOperand(1);
27933 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
27934
27935 if (TestSrc == NewTestSrc)
27936 return SDValue();
27937
27938 unsigned NewOpc = N->getOpcode();
27939 if (Invert) {
27940 if (NewOpc == AArch64ISD::TBZ)
27941 NewOpc = AArch64ISD::TBNZ;
27942 else {
27943 assert(NewOpc == AArch64ISD::TBNZ);
27944 NewOpc = AArch64ISD::TBZ;
27945 }
27946 }
27947
27948 SDLoc DL(N);
27949 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
27950 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
27951}
27952
27953// Swap vselect operands where doing so may allow a predicated operation to
27954// perform the `sel` itself.
27955//
27956// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
27957// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
27959 auto SelectA = N->getOperand(1);
27960 auto SelectB = N->getOperand(2);
27961 auto NTy = N->getValueType(0);
27962
27963 if (!NTy.isScalableVector())
27964 return SDValue();
27965 SDValue SetCC = N->getOperand(0);
27966 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
27967 return SDValue();
27968
27969 switch (SelectB.getOpcode()) {
27970 default:
27971 return SDValue();
27972 case ISD::FMUL:
27973 case ISD::FSUB:
27974 case ISD::FADD:
27975 break;
27976 }
27977 if (SelectA != SelectB.getOperand(0))
27978 return SDValue();
27979
27980 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
27981 ISD::CondCode InverseCC =
27983 auto InverseSetCC =
27984 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
27985 SetCC.getOperand(1), InverseCC);
27986
27987 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
27988 {InverseSetCC, SelectB, SelectA});
27989}
27990
27993 assert(N->getOpcode() == ISD::VSELECT && "Expected VSELECT opcode");
27994 SDValue Cond = N->getOperand(0);
27995 SDValue TrueVal = N->getOperand(1);
27996 SDValue FalseVal = N->getOperand(2);
27997 bool TrueValIsPow = TrueVal.getOpcode() == ISD::FPOW;
27998 bool FalseValIsPow = FalseVal.getOpcode() == ISD::FPOW;
27999
28000 // If both inputs are pow we could equally remove the select and simply
28001 // select between pow inputs instead.
28002 if (TrueValIsPow == FalseValIsPow)
28003 return SDValue();
28004
28005 if ((TrueValIsPow && !TrueVal.hasOneUse()) ||
28006 (FalseValIsPow && !FalseVal.hasOneUse()))
28007 return SDValue();
28008
28009 EVT VT = N->getValueType(0);
28010 RTLIB::Libcall LC = RTLIB::getPOW(VT);
28011 SelectionDAG &DAG = DCI.DAG;
28012 auto &TLI = DAG.getTargetLoweringInfo();
28013 bool HasLibCall =
28014 TLI.getLibcallLoweringInfo().getLibcallImpl(LC) != RTLIB::Unsupported;
28015 if (!HasLibCall)
28016 return SDValue();
28017
28018 SDValue OldPow = TrueValIsPow ? TrueVal : FalseVal;
28019 SDValue OldPowArg0 = OldPow->getOperand(0);
28020
28021 // Bail out if argument 0 is already a select, in order to avoid an infinite
28022 // combine loop.
28023 if (OldPowArg0.getOpcode() == ISD::VSELECT)
28024 return SDValue();
28025
28026 // A call pow(x, y) with x == 1.0 is guaranteed to return 1.0 for any
28027 // value of y.
28028 SDLoc DL(N);
28029 SDValue SplatOne = DAG.getConstantFP(1.0, DL, VT);
28030 SDValue NewPowArg0;
28031 if (TrueValIsPow)
28032 NewPowArg0 = DAG.getNode(ISD::VSELECT, DL, VT, Cond, OldPowArg0, SplatOne);
28033 else
28034 NewPowArg0 = DAG.getNode(ISD::VSELECT, DL, VT, Cond, SplatOne, OldPowArg0);
28035 SDValue NewPow = DAG.getNode(ISD::FPOW, DL, VT, NewPowArg0,
28036 OldPow->getOperand(1), OldPow->getFlags());
28037
28038 if (TrueValIsPow)
28039 return DAG.getNode(ISD::VSELECT, DL, VT, Cond, NewPow, FalseVal);
28040 return DAG.getNode(ISD::VSELECT, DL, VT, Cond, TrueVal, NewPow);
28041}
28042
28043// vselect (v1i1 setcc) ->
28044// vselect (v1iXX setcc) (XX is the size of the compared operand type)
28045// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
28046// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
28047// such VSELECT.
28050 const AArch64Subtarget *Subtarget) {
28051 SelectionDAG &DAG = DCI.DAG;
28052
28053 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
28054 return SwapResult;
28055
28056 SDValue N0 = N->getOperand(0);
28057 SDValue IfTrue = N->getOperand(1);
28058 SDValue IfFalse = N->getOperand(2);
28059 EVT ResVT = N->getValueType(0);
28060 EVT CCVT = N0.getValueType();
28061
28062 if (isAllActivePredicate(DAG, N0))
28063 return N->getOperand(1);
28064
28065 if (isAllInactivePredicate(N0))
28066 return N->getOperand(2);
28067
28068 if (isMergePassthruOpcode(IfTrue.getOpcode()) && IfTrue.hasOneUse()) {
28069 // vselect A, (merge_passthru_op all_active, B,{Bn,} -), C
28070 // vselect A, (merge_passthru_op -, B,{Bn,} undef), C
28071 // vselect A, (merge_passthru_op A, B,{Bn,} -), C
28072 // -> merge_passthru_op A, B,{Bn,} C
28073 if (isAllActivePredicate(DAG, IfTrue->getOperand(0)) ||
28074 IfTrue->getOperand(IfTrue.getNumOperands() - 1).isUndef() ||
28075 IfTrue->getOperand(0) == N0) {
28077 Ops[0] = N0;
28078 Ops[IfTrue.getNumOperands() - 1] = IfFalse;
28079
28080 return DAG.getNode(IfTrue.getOpcode(), SDLoc(N), ResVT, Ops);
28081 }
28082 }
28083
28084 SDValue SetCC = N->getOperand(0);
28085
28086 // Attempt to convert a (vXi1 bitcast(iX N0)) selection mask before it might
28087 // get split by legalization.
28088 if (N0.getOpcode() == ISD::BITCAST && CCVT.isVector() &&
28089 CCVT.getVectorElementType() == MVT::i1) {
28090 SDLoc DL(N);
28091 EVT ExtCondVT = ResVT.changeVectorElementTypeToInteger();
28092
28094 ISD::SIGN_EXTEND, DL, ExtCondVT, N0, DAG, DCI, *Subtarget)) {
28095 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CCVT, ExtCond);
28096 return DAG.getSelect(DL, ResVT, ExtCond, IfTrue, IfFalse);
28097 }
28098 }
28099
28100 if (SDValue R = performVselectPowCombine(N, DCI))
28101 return R;
28102
28103 EVT CmpVT = N0.getOperand(0).getValueType();
28104 if (N0.getOpcode() != ISD::SETCC ||
28106 CCVT.getVectorElementType() != MVT::i1 ||
28108 return SDValue();
28109
28110 // Only combine when the result type is of the same size as the compared
28111 // operands.
28112 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
28113 return SDValue();
28114
28115 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
28116 N0.getOperand(0), N0.getOperand(1),
28117 cast<CondCodeSDNode>(N0.getOperand(2))->get());
28118 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
28119 IfTrue, IfFalse);
28120}
28121
28122/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
28123/// the compare-mask instructions rather than going via NZCV, even if LHS and
28124/// RHS are really scalar. This replaces any scalar setcc in the above pattern
28125/// with a vector one followed by a DUP shuffle on the result.
28128 SelectionDAG &DAG = DCI.DAG;
28129 SDValue N0 = N->getOperand(0);
28130 EVT ResVT = N->getValueType(0);
28131
28132 if (N0.getOpcode() != ISD::SETCC)
28133 return SDValue();
28134
28135 if (ResVT.isScalableVT())
28136 return SDValue();
28137
28138 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
28139 // scalar SetCCResultType. We also don't expect vectors, because we assume
28140 // that selects fed by vector SETCCs are canonicalized to VSELECT.
28141 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
28142 "Scalar-SETCC feeding SELECT has unexpected result type!");
28143
28144 // Don't try to do this optimization when the setcc itself has i1 operands.
28145 // There are no legal vectors of i1, so this would be pointless. v1f16 is
28146 // ruled out to prevent the creation of setcc that need to be scalarized.
28147 EVT SrcVT = N0.getOperand(0).getValueType();
28148 if (SrcVT == MVT::i1 ||
28149 (SrcVT.isFloatingPoint() && SrcVT != MVT::f32 && SrcVT != MVT::f64))
28150 return SDValue();
28151
28152 // If NumMaskElts == 0, the comparison is larger than the select result. The
28153 // largest real NEON comparison is 64 bits per lane, which means the result is
28154 // at most 32 bits and an illegal vector. Just bail out for now.
28155 unsigned NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
28156 if (!ResVT.isVector() || NumMaskElts == 0)
28157 return SDValue();
28158
28159 // Avoid creating vectors with excessive VFs before legalization.
28160 if (DCI.isBeforeLegalize() && NumMaskElts != ResVT.getVectorNumElements())
28161 return SDValue();
28162
28163 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
28165
28166 // Also bail out if the vector CCVT isn't the same size as ResVT.
28167 // This can happen if the SETCC operand size doesn't divide the ResVT size
28168 // (e.g., f64 vs v3f32).
28169 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
28170 return SDValue();
28171
28172 // Make sure we didn't create illegal types, if we're not supposed to.
28173 assert(DCI.isBeforeLegalize() ||
28174 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
28175
28176 // First perform a vector comparison, where lane 0 is the one we're interested
28177 // in.
28178 SDLoc DL(N0);
28179 SDValue LHS =
28180 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
28181 SDValue RHS =
28182 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
28183 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
28184
28185 // Now duplicate the comparison mask we want across all other lanes.
28186 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
28187 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
28188 Mask = DAG.getNode(ISD::BITCAST, DL,
28189 ResVT.changeVectorElementTypeToInteger(), Mask);
28190
28191 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
28192}
28193
28196 EVT VT = N->getValueType(0);
28197 SDLoc DL(N);
28198 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
28199 // 128-bit vector version.
28200 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
28202 SmallVector<SDValue> Ops(N->ops());
28203 if (SDNode *LN = DCI.DAG.getNodeIfExists(N->getOpcode(),
28204 DCI.DAG.getVTList(LVT), Ops)) {
28205 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
28206 DCI.DAG.getConstant(0, DL, MVT::i64));
28207 }
28208 }
28209
28210 if (N->getOpcode() == AArch64ISD::DUP) {
28211 SDValue Op = N->getOperand(0);
28212
28213 // Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
28214 // For example:
28215 // v4i32 = DUP (i32 (zextloadi8 addr))
28216 // =>
28217 // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
28218 // v4i32 = DUPLANE32 (v4i32), 0
28219 if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
28220 ISD::LoadExtType ExtType = LD->getExtensionType();
28221 EVT MemVT = LD->getMemoryVT();
28222 EVT ElemVT = VT.getVectorElementType();
28223 if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
28224 (MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
28225 ElemVT != MemVT && LD->hasOneUse()) {
28226 EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
28227 128 / ElemVT.getSizeInBits());
28228 SDValue ScalarToVec =
28229 DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
28230 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
28231 DCI.DAG.getConstant(0, DL, MVT::i64));
28232 }
28233 }
28234
28235 // If the instruction is known to produce a scalar in SIMD registers, we can
28236 // duplicate it across the vector lanes using DUPLANE instead of moving it
28237 // to a GPR first. For example, this allows us to handle:
28238 // v4i32 = DUP (i32 (FCMGT (f32, f32)))
28239 // FIXME: Ideally, we should be able to handle all instructions that
28240 // produce a scalar value in FPRs.
28241 if (Op.getOpcode() == AArch64ISD::FCMEQ ||
28242 Op.getOpcode() == AArch64ISD::FCMGE ||
28243 Op.getOpcode() == AArch64ISD::FCMGT) {
28244 EVT ElemVT = VT.getVectorElementType();
28245 EVT ExpandedVT = VT;
28246 // Insert into a 128-bit vector to match DUPLANE's pattern.
28247 if (VT.getSizeInBits() != 128)
28248 ExpandedVT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
28249 128 / ElemVT.getSizeInBits());
28250 SDValue Zero = DCI.DAG.getConstant(0, DL, MVT::i64);
28251 SDValue Vec = DCI.DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpandedVT,
28252 DCI.DAG.getPOISON(ExpandedVT), Op, Zero);
28253 return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, Vec, Zero);
28254 }
28255
28256 if (DCI.isAfterLegalizeDAG()) {
28257 // If scalar dup's operand is extract_vector_elt, try to combine them into
28258 // duplane. For example,
28259 //
28260 // t21: i32 = extract_vector_elt t19, Constant:i64<0>
28261 // t18: v4i32 = AArch64ISD::DUP t21
28262 // ==>
28263 // t22: v4i32 = AArch64ISD::DUPLANE32 t19, Constant:i64<0>
28264 SDValue EXTRACT_VEC_ELT = N->getOperand(0);
28265 if (EXTRACT_VEC_ELT.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
28266 if (VT == EXTRACT_VEC_ELT.getOperand(0).getValueType()) {
28267 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
28268 return DCI.DAG.getNode(Opcode, DL, VT, EXTRACT_VEC_ELT.getOperand(0),
28269 EXTRACT_VEC_ELT.getOperand(1));
28270 }
28271 }
28272 }
28273
28274 return performPostLD1Combine(N, DCI, false);
28275 }
28276
28277 return SDValue();
28278}
28279
28280/// Get rid of unnecessary NVCASTs (that don't change the type).
28282 if (N->getValueType(0) == N->getOperand(0).getValueType())
28283 return N->getOperand(0);
28284 if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
28285 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
28286 N->getOperand(0).getOperand(0));
28287
28288 return SDValue();
28289}
28290
28291// If all users of the globaladdr are of the form (globaladdr + constant), find
28292// the smallest constant, fold it into the globaladdr's offset and rewrite the
28293// globaladdr as (globaladdr + constant) - constant.
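// For example (an illustrative sketch of the rewrite described above): if the
// only uses are (globaladdr + 8) and (globaladdr + 12), MinOffset is 8, so the
// node is rewritten as (globaladdr + 8) - 8, after which the uses fold to
// (globaladdr + 8) + 0 and (globaladdr + 8) + 4.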
28295 const AArch64Subtarget *Subtarget,
28296 const TargetMachine &TM) {
28297 auto *GN = cast<GlobalAddressSDNode>(N);
28298 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
28300 return SDValue();
28301
28302 uint64_t MinOffset = -1ull;
28303 for (SDNode *N : GN->users()) {
28304 if (N->getOpcode() != ISD::ADD)
28305 return SDValue();
28306 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
28307 if (!C)
28308 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
28309 if (!C)
28310 return SDValue();
28311 MinOffset = std::min(MinOffset, C->getZExtValue());
28312 }
28313 uint64_t Offset = MinOffset + GN->getOffset();
28314
28315 // Require that the new offset is larger than the existing one. Otherwise, we
28316 // can end up oscillating between two possible DAGs, for example,
28317 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
28318 if (Offset <= uint64_t(GN->getOffset()))
28319 return SDValue();
28320
28321 // Check whether folding this offset is legal. It must not go out of bounds of
28322 // the referenced object to avoid violating the code model, and must be
28323 // smaller than 2^20 because this is the largest offset expressible in all
28324 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
28325 // stores an immediate signed 21 bit offset.)
28326 //
28327 // This check also prevents us from folding negative offsets, which will end
28328 // up being treated in the same way as large positive ones. They could also
28329 // cause code model violations, and aren't really common enough to matter.
28330 if (Offset >= (1 << 20))
28331 return SDValue();
28332
28333 const GlobalValue *GV = GN->getGlobal();
28334 Type *T = GV->getValueType();
28335 if (!T->isSized() ||
28336 Offset > GV->getDataLayout().getTypeAllocSize(T))
28337 return SDValue();
28338
28339 SDLoc DL(GN);
28340 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
28341 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
28342 DAG.getConstant(MinOffset, DL, MVT::i64));
28343}
28344
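// Fold ctlz(bitreverse(x)) -> cttz(x): with the CSSC extension CTTZ is a
// single instruction, so counting leading zeros of a bit-reversed value can
// be done by counting trailing zeros of the original value.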
28345static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG,
28346 const AArch64Subtarget *Subtarget) {
28347 SDValue BR = N->getOperand(0);
28348 if (!Subtarget->hasCSSC() || BR.getOpcode() != ISD::BITREVERSE ||
28349 !BR.getValueType().isScalarInteger())
28350 return SDValue();
28351
28352 SDLoc DL(N);
28353 return DAG.getNode(ISD::CTTZ, DL, BR.getValueType(), BR.getOperand(0));
28354}
28355
28356 // Turns the vector of indices into a vector of byte offsets by scaling Offset
28357// by (BitWidth / 8).
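// For example, with 32-bit elements (BitWidth == 32) every index is shifted
// left by log2(32 / 8) == 2, i.e. multiplied by 4 bytes.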
28358static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
28359 SDLoc DL, unsigned BitWidth) {
28360 assert(Offset.getValueType().isScalableVector() &&
28361 "This method is only for scalable vectors of offsets");
28362
28363 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
28364 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
28365
28366 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
28367}
28368
28369/// Check if the value of \p OffsetInBytes can be used as an immediate for
28370/// the gather load/prefetch and scatter store instructions with vector base and
28371/// immediate offset addressing mode:
28372///
28373/// [<Zn>.[S|D]{, #<imm>}]
28374///
28375/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
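/// For example, for 32-bit elements (sizeof(<T>) == 4) the valid byte offsets
/// are 0, 4, 8, ..., 124.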
28376inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
28377 unsigned ScalarSizeInBytes) {
28378 // The immediate is not a multiple of the scalar size.
28379 if (OffsetInBytes % ScalarSizeInBytes)
28380 return false;
28381
28382 // The immediate is out of range.
28383 if (OffsetInBytes / ScalarSizeInBytes > 31)
28384 return false;
28385
28386 return true;
28387}
28388
28389/// Check if the value of \p Offset represents a valid immediate for the SVE
28390 /// gather load/prefetch and scatter store instructions with vector base and
28391/// immediate offset addressing mode:
28392///
28393/// [<Zn>.[S|D]{, #<imm>}]
28394///
28395/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
28396static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
28397 unsigned ScalarSizeInBytes) {
28398 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
28399 return OffsetConst && isValidImmForSVEVecImmAddrMode(
28400 OffsetConst->getZExtValue(), ScalarSizeInBytes);
28401}
28402
28403static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
28404 unsigned Opcode,
28405 bool OnlyPackedOffsets = true) {
28406 const SDValue Src = N->getOperand(2);
28407 const EVT SrcVT = Src->getValueType(0);
28408 assert(SrcVT.isScalableVector() &&
28409 "Scatter stores are only possible for SVE vectors");
28410
28411 SDLoc DL(N);
28412 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
28413
28414 // Make sure that source data will fit into an SVE register
28415 if (SrcVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
28416 return SDValue();
28417
28418 // For FPs, ACLE only supports _packed_ single and double precision types.
28419 // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
28420 if (SrcElVT.isFloatingPoint())
28421 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
28422 ((Opcode != AArch64ISD::SST1Q_PRED &&
28423 Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
28424 ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
28425 return SDValue();
28426
28427 // Depending on the addressing mode, this is either a pointer or a vector of
28428 // pointers (that fits into one register)
28429 SDValue Base = N->getOperand(4);
28430 // Depending on the addressing mode, this is either a single offset or a
28431 // vector of offsets (that fits into one register)
28432 SDValue Offset = N->getOperand(5);
28433
28434 // For "scalar + vector of indices", just scale the indices. This only
28435 // applies to non-temporal and quadword scatters because there's no
28436 // instruction that takes indices.
28437 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
28438 Offset =
28439 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
28440 Opcode = AArch64ISD::SSTNT1_PRED;
28441 } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
28442 Offset =
28443 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
28444 Opcode = AArch64ISD::SST1Q_PRED;
28445 }
28446
28447 // In the case of non-temporal scatter stores there's only one SVE instruction
28448 // per data-size: "vector + scalar", i.e.
28449 // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
28450 // Since we do have intrinsics that allow the arguments to be in a different
28451 // order, we may need to swap them to match the spec.
28452 if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
28453 Offset.getValueType().isVector())
28454 std::swap(Base, Offset);
28455
28456 // SST1_IMM requires that the offset is an immediate that is:
28457 // * a multiple of #SizeInBytes,
28458 // * in the range [0, 31 x #SizeInBytes],
28459 // where #SizeInBytes is the size in bytes of the stored items. For
28460 // immediates outside that range and non-immediate scalar offsets use SST1 or
28461 // SST1_UXTW instead.
28462 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
28463 if (!isValidImmForSVEVecImmAddrMode(Offset,
28464 SrcVT.getScalarSizeInBits() / 8)) {
28465 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
28466 Opcode = AArch64ISD::SST1_UXTW_PRED;
28467 else
28468 Opcode = AArch64ISD::SST1_PRED;
28469
28470 std::swap(Base, Offset);
28471 }
28472 }
28473
28474 auto &TLI = DAG.getTargetLoweringInfo();
28475 if (!TLI.isTypeLegal(Base.getValueType()))
28476 return SDValue();
28477
28478 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
28479 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
28480 // nxv2i64. Legalize accordingly.
28481 if (!OnlyPackedOffsets &&
28482 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
28483 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
28484
28485 if (!TLI.isTypeLegal(Offset.getValueType()))
28486 return SDValue();
28487
28488 // Source value type that is representable in hardware
28489 EVT HwSrcVt = getSVEContainerType(SrcVT);
28490
28491 // Keep the original type of the input data to store - this is needed to be
28492 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
28493 // FP values we want the integer equivalent, so just use HwSrcVt.
28494 SDValue InputVT = DAG.getValueType(SrcVT);
28495 if (SrcVT.isFloatingPoint())
28496 InputVT = DAG.getValueType(HwSrcVt);
28497
28498 SDVTList VTs = DAG.getVTList(MVT::Other);
28499 SDValue SrcNew;
28500
28501 if (Src.getValueType().isFloatingPoint())
28502 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
28503 else
28504 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
28505
28506 SDValue Ops[] = {N->getOperand(0), // Chain
28507 SrcNew,
28508 N->getOperand(3), // Pg
28509 Base,
28510 Offset,
28511 InputVT};
28512
28513 return DAG.getNode(Opcode, DL, VTs, Ops);
28514}
28515
28516static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
28517 unsigned Opcode,
28518 bool OnlyPackedOffsets = true) {
28519 const EVT RetVT = N->getValueType(0);
28520 assert(RetVT.isScalableVector() &&
28521 "Gather loads are only possible for SVE vectors");
28522
28523 SDLoc DL(N);
28524
28525 // Make sure that the loaded data will fit into an SVE register
28526 if (RetVT.getSizeInBits().getKnownMinValue() > AArch64::SVEBitsPerBlock)
28527 return SDValue();
28528
28529 // Depending on the addressing mode, this is either a pointer or a vector of
28530 // pointers (that fits into one register)
28531 SDValue Base = N->getOperand(3);
28532 // Depending on the addressing mode, this is either a single offset or a
28533 // vector of offsets (that fits into one register)
28534 SDValue Offset = N->getOperand(4);
28535
28536 // For "scalar + vector of indices", scale the indices to obtain unscaled
28537 // offsets. This applies to non-temporal and quadword gathers, which do not
28538 // have an addressing mode with scaled offset.
28539 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
28540 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
28541 RetVT.getScalarSizeInBits());
28542 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
28543 } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
28544 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
28545 RetVT.getScalarSizeInBits());
28546 Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
28547 }
28548
28549 // In the case of non-temporal gather loads and quadword gather loads there's
28550 // only one addressing mode: "vector + scalar", e.g.
28551 // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
28552 // Since we do have intrinsics that allow the arguments to be in a different
28553 // order, we may need to swap them to match the spec.
28554 if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
28555 Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
28556 Offset.getValueType().isVector())
28558
28559 // GLD{FF}1_IMM requires that the offset is an immediate that is:
28560 // * a multiple of #SizeInBytes,
28561 // * in the range [0, 31 x #SizeInBytes],
28562 // where #SizeInBytes is the size in bytes of the loaded items. For
28563 // immediates outside that range and non-immediate scalar offsets use
28564 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
28565 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
28566 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
28567 if (!isValidImmForSVEVecImmAddrMode(Offset,
28568 RetVT.getScalarSizeInBits() / 8)) {
28569 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
28570 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
28571 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
28572 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
28573 else
28574 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
28575 ? AArch64ISD::GLD1_MERGE_ZERO
28576 : AArch64ISD::GLDFF1_MERGE_ZERO;
28577
28578 std::swap(Base, Offset);
28579 }
28580 }
28581
28582 auto &TLI = DAG.getTargetLoweringInfo();
28583 if (!TLI.isTypeLegal(Base.getValueType()))
28584 return SDValue();
28585
28586 // Some gather load variants allow unpacked offsets, but only as nxv2i32
28587 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
28588 // nxv2i64. Legalize accordingly.
28589 if (!OnlyPackedOffsets &&
28590 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
28591 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
28592
28593 // Return value type that is representable in hardware
28594 EVT HwRetVt = getSVEContainerType(RetVT);
28595
28596 // Keep the original output value type around - this is needed to be able to
28597 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
28598 // values we want the integer equivalent, so just use HwRetVT.
28599 SDValue OutVT = DAG.getValueType(RetVT);
28600 if (RetVT.isFloatingPoint())
28601 OutVT = DAG.getValueType(HwRetVt);
28602
28603 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
28604 SDValue Ops[] = {N->getOperand(0), // Chain
28605 N->getOperand(2), // Pg
28606 Base, Offset, OutVT};
28607
28608 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
28609 SDValue LoadChain = SDValue(Load.getNode(), 1);
28610
28611 if (RetVT.isInteger() && (RetVT != HwRetVt))
28612 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
28613
28614 // If the original return value was FP, bitcast accordingly. Doing it here
28615 // means that we can avoid adding TableGen patterns for FPs.
28616 if (RetVT.isFloatingPoint())
28617 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
28618
28619 return DAG.getMergeValues({Load, LoadChain}, DL);
28620}
28621
28622static SDValue
28623performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
28624 SelectionDAG &DAG) {
28625 SDLoc DL(N);
28626 SDValue Src = N->getOperand(0);
28627 unsigned Opc = Src->getOpcode();
28628
28629 // Sign extend of an unsigned unpack -> signed unpack
28630 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
28631
28632 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
28633 : AArch64ISD::SUNPKLO;
28634
28635 // Push the sign extend to the operand of the unpack
28636 // This is necessary where, for example, the operand of the unpack
28637 // is another unpack:
28638 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
28639 // ->
28640 // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
28641 // ->
28642 // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
28643 SDValue ExtOp = Src->getOperand(0);
28644 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
28645 EVT EltTy = VT.getVectorElementType();
28646
28647 if (EltTy.getSizeInBits() >
28649 return SDValue();
28650
28651 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
28652
28653 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
28654 ExtOp, DAG.getValueType(ExtVT));
28655
28656 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
28657 }
28658
28659 // Sign extend of CSET -> CSETM.
28660 if (Opc == AArch64ISD::CSEL &&
28661 cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1) {
28662 EVT VT = N->getValueType(0);
28663 SDValue TVal = Src.getOperand(0);
28664 SDValue FVal = Src.getOperand(1);
28665
28666 // SIGN_EXTEND_INREG (CSEL 0, 1, cc, NZCV), i1 --> CSEL 0, -1, cc, NZCV
28667 if (isNullConstant(TVal) && isOneConstant(FVal))
28668 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal,
28669 DAG.getAllOnesConstant(DL, VT), Src.getOperand(2),
28670 Src.getOperand(3));
28671
28672 // SIGN_EXTEND_INREG (CSEL 1, 0, cc, NZCV), i1 --> CSEL -1, 0, cc, NZCV
28673 if (isOneConstant(TVal) && isNullConstant(FVal))
28674 return DAG.getNode(AArch64ISD::CSEL, DL, VT,
28675 DAG.getAllOnesConstant(DL, VT), FVal,
28676 Src.getOperand(2), Src.getOperand(3));
28677 }
28678
28679 if (DCI.isBeforeLegalizeOps())
28680 return SDValue();
28681
28682 if (!EnableCombineMGatherIntrinsics)
28683 return SDValue();
28684
28685 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
28686 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
28687 unsigned NewOpc;
28688 unsigned MemVTOpNum = 4;
28689 switch (Opc) {
28690 case AArch64ISD::LD1_MERGE_ZERO:
28691 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
28692 MemVTOpNum = 3;
28693 break;
28694 case AArch64ISD::LDNF1_MERGE_ZERO:
28695 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
28696 MemVTOpNum = 3;
28697 break;
28698 case AArch64ISD::LDFF1_MERGE_ZERO:
28699 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
28700 MemVTOpNum = 3;
28701 break;
28702 case AArch64ISD::GLD1_MERGE_ZERO:
28703 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
28704 break;
28705 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
28706 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
28707 break;
28708 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
28709 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
28710 break;
28711 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
28712 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
28713 break;
28714 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
28715 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
28716 break;
28717 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
28718 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
28719 break;
28720 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
28721 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
28722 break;
28723 case AArch64ISD::GLDFF1_MERGE_ZERO:
28724 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
28725 break;
28726 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
28727 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
28728 break;
28729 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
28730 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
28731 break;
28732 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
28733 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
28734 break;
28735 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
28736 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
28737 break;
28738 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
28739 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
28740 break;
28741 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
28742 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
28743 break;
28744 case AArch64ISD::GLDNT1_MERGE_ZERO:
28745 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
28746 break;
28747 default:
28748 return SDValue();
28749 }
28750
28751 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
28752 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
28753
28754 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
28755 return SDValue();
28756
28757 EVT DstVT = N->getValueType(0);
28758 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
28759
28760 SmallVector<SDValue, 5> Ops;
28761 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
28762 Ops.push_back(Src->getOperand(I));
28763
28764 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
28765 DCI.CombineTo(N, ExtLoad);
28766 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
28767
28768 // Return N so it doesn't get rechecked
28769 return SDValue(N, 0);
28770}
28771
28772/// Legalize the gather prefetch (scalar + vector addressing mode) when the
28773/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
28774/// != nxv2i32) do not need legalization.
28775static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
28776 const unsigned OffsetPos = 4;
28777 SDValue Offset = N->getOperand(OffsetPos);
28778
28779 // Not an unpacked vector, bail out.
28780 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
28781 return SDValue();
28782
28783 // Extend the unpacked offset vector to 64-bit lanes.
28784 SDLoc DL(N);
28785 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
28786 SmallVector<SDValue, 5> Ops(N->ops());
28787 // Replace the offset operand with the 64-bit one.
28788 Ops[OffsetPos] = Offset;
28789
28790 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
28791}
28792
28793/// Combines a node carrying the intrinsic
28794/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
28795/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
28796/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
28797/// sve gather prefetch instruction with vector plus immediate addressing mode.
28798static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
28799 unsigned ScalarSizeInBytes) {
28800 const unsigned ImmPos = 4, OffsetPos = 3;
28801 // No need to combine the node if the immediate is valid...
28802 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
28803 return SDValue();
28804
28805 // ...otherwise swap the offset base with the offset...
28806 SmallVector<SDValue, 5> Ops(N->ops());
28807 std::swap(Ops[ImmPos], Ops[OffsetPos]);
28808 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
28809 // `aarch64_sve_prfb_gather_uxtw_index`.
28810 SDLoc DL(N);
28811 Ops[1] = DAG.getTargetConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index,
28812 DL, MVT::i64);
28813
28814 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
28815}
28816
28817// Return true if the vector operation can guarantee only the first lane of its
28818// result contains data, with all bits in other lanes set to zero.
28819static bool isLanes1toNKnownZero(SDValue Op) {
28820 switch (Op.getOpcode()) {
28821 default:
28822 return false;
28823 case AArch64ISD::ANDV_PRED:
28824 case AArch64ISD::EORV_PRED:
28825 case AArch64ISD::FADDA_PRED:
28826 case AArch64ISD::FADDV_PRED:
28827 case AArch64ISD::FMAXNMV_PRED:
28828 case AArch64ISD::FMAXV_PRED:
28829 case AArch64ISD::FMINNMV_PRED:
28830 case AArch64ISD::FMINV_PRED:
28831 case AArch64ISD::ORV_PRED:
28832 case AArch64ISD::SADDV_PRED:
28833 case AArch64ISD::SMAXV_PRED:
28834 case AArch64ISD::SMINV_PRED:
28835 case AArch64ISD::UADDV_PRED:
28836 case AArch64ISD::UMAXV_PRED:
28837 case AArch64ISD::UMINV_PRED:
28838 return true;
28839 }
28840}
28841
28842// Return true if the vector operation can guarantee that the first lane of its
28843// result is active.
28844static bool isLane0KnownActive(SDValue Op) {
28845 switch (Op.getOpcode()) {
28846 default:
28847 return false;
28848 case AArch64ISD::REINTERPRET_CAST:
28849 return isLane0KnownActive(Op->getOperand(0));
28850 case ISD::SPLAT_VECTOR:
28851 return isOneConstant(Op.getOperand(0));
28852 case AArch64ISD::PTRUE:
28853 return Op.getConstantOperandVal(0) == AArch64SVEPredPattern::all;
28854 };
28855}
28856
28857static SDValue removeRedundantInsertVectorElt(SDNode *N) {
28858 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
28859 SDValue InsertVec = N->getOperand(0);
28860 SDValue InsertElt = N->getOperand(1);
28861 SDValue InsertIdx = N->getOperand(2);
28862
28863 // We only care about inserts into the first element...
28864 if (!isNullConstant(InsertIdx))
28865 return SDValue();
28866 // ...of a zero'd vector...
28867 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
28868 return SDValue();
28869 // ...where the inserted data was previously extracted...
28870 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28871 return SDValue();
28872
28873 SDValue ExtractVec = InsertElt.getOperand(0);
28874 SDValue ExtractIdx = InsertElt.getOperand(1);
28875
28876 // ...from the first element of a vector.
28877 if (!isNullConstant(ExtractIdx))
28878 return SDValue();
28879
28880 // If we get here we are effectively trying to zero lanes 1-N of a vector.
28881
28882 // Ensure there's no type conversion going on.
28883 if (N->getValueType(0) != ExtractVec.getValueType())
28884 return SDValue();
28885
28886 if (!isLanes1toNKnownZero(ExtractVec))
28887 return SDValue();
28888
28889 // The explicit zeroing is redundant.
28890 return ExtractVec;
28891}
28892
28893static SDValue
28894performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
28895 if (SDValue Res = removeRedundantInsertVectorElt(N))
28896 return Res;
28897
28898 // Turn INSERT_VECTOR_ELT(undef, Elt, Idx) into SPLAT_VECTOR(Elt)
28899 // Do not bother with inserts into lane 0 because there are patterns to select
28900 // them using INSERT_SUBREG hsub/ssub/dsub.
28901 SDLoc DL(N);
28902 SDValue Vec = N->getOperand(0);
28903 SDValue Elt = N->getOperand(1);
28904 SDValue Idx = N->getOperand(2);
28905 EVT VecVT = Vec.getValueType();
28906 if (VecVT.isScalableVector() && Vec->isUndef() && !isNullConstant(Idx))
28907 return DCI.DAG.getNode(ISD::SPLAT_VECTOR, DL, VecVT, Elt);
28908
28909 return performPostLD1Combine(N, DCI, true);
28910}
28911
28912static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
28913 TargetLowering::DAGCombinerInfo &DCI,
28914 const AArch64Subtarget *Subtarget) {
28915 SDValue N0 = N->getOperand(0);
28916 EVT VT = N->getValueType(0);
28917
28918 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
28919 if (N->hasOneUse() && N->user_begin()->getOpcode() == ISD::FP_ROUND)
28920 return SDValue();
28921
28922 auto hasValidElementTypeForFPExtLoad = [](EVT VT) {
28923 EVT EltVT = VT.getVectorElementType();
28924 return EltVT == MVT::f32 || EltVT == MVT::f64;
28925 };
28926
28927 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
28928 // We purposefully don't care about legality of the nodes here as we know
28929 // they can be split down into something legal.
28930 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
28931 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
28932 VT.isFixedLengthVector() && hasValidElementTypeForFPExtLoad(VT) &&
28933 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
28934 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
28935 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
28936 LN0->getChain(), LN0->getBasePtr(),
28937 N0.getValueType(), LN0->getMemOperand());
28938 DCI.CombineTo(N, ExtLoad);
28939 DCI.CombineTo(
28940 N0.getNode(),
28941 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
28942 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
28943 ExtLoad.getValue(1));
28944 return SDValue(N, 0); // Return N so it doesn't get rechecked!
28945 }
28946
28947 return SDValue();
28948}
28949
28950static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
28951 const AArch64Subtarget *Subtarget) {
28952 EVT VT = N->getValueType(0);
28953
28954 // Don't expand for NEON, SVE2 or SME
28955 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
28956 return SDValue();
28957
28958 SDLoc DL(N);
28959
28960 SDValue Mask = N->getOperand(0);
28961 SDValue In1 = N->getOperand(1);
28962 SDValue In2 = N->getOperand(2);
28963
28964 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
28965 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
28966 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
28967 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
28968}
28969
28970static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
28971 EVT VT = N->getValueType(0);
28972
28973 SDValue Insert = N->getOperand(0);
28974 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
28975 return SDValue();
28976
28977 if (!Insert.getOperand(0).isUndef())
28978 return SDValue();
28979
28980 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
28981 uint64_t IdxDupLane = N->getConstantOperandVal(1);
28982 if (IdxInsert != 0 || IdxDupLane != 0)
28983 return SDValue();
28984
28985 SDValue Bitcast = Insert.getOperand(1);
28986 if (Bitcast.getOpcode() != ISD::BITCAST)
28987 return SDValue();
28988
28989 SDValue Subvec = Bitcast.getOperand(0);
28990 EVT SubvecVT = Subvec.getValueType();
28991 if (!SubvecVT.is128BitVector())
28992 return SDValue();
28993 EVT NewSubvecVT =
28994 getPackedSVEVectorVT(SubvecVT.getVectorElementType());
28995
28996 SDLoc DL(N);
28997 SDValue NewInsert =
28998 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
28999 DAG.getPOISON(NewSubvecVT), Subvec, Insert->getOperand(2));
29000 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
29001 NewInsert, N->getOperand(1));
29002 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
29003}
29004
29005// Try to combine mull with uzp1.
29006static SDValue tryCombineMULLWithUZP1(SDNode *N,
29007 TargetLowering::DAGCombinerInfo &DCI,
29008 SelectionDAG &DAG) {
29009 if (DCI.isBeforeLegalizeOps())
29010 return SDValue();
29011
29012 SDValue LHS = N->getOperand(0);
29013 SDValue RHS = N->getOperand(1);
29014
29015 SDValue ExtractHigh;
29016 SDValue ExtractLow;
29017 SDValue TruncHigh;
29018 SDValue TruncLow;
29019 SDLoc DL(N);
29020
29021 // Check the operands are trunc and extract_high.
29022 if (isEssentiallyExtractHighSubvector(LHS) &&
29023 RHS.getOpcode() == ISD::TRUNCATE) {
29024 TruncHigh = RHS;
29025 if (LHS.getOpcode() == ISD::BITCAST)
29026 ExtractHigh = LHS.getOperand(0);
29027 else
29028 ExtractHigh = LHS;
29029 } else if (isEssentiallyExtractHighSubvector(RHS) &&
29030 LHS.getOpcode() == ISD::TRUNCATE) {
29031 TruncHigh = LHS;
29032 if (RHS.getOpcode() == ISD::BITCAST)
29033 ExtractHigh = RHS.getOperand(0);
29034 else
29035 ExtractHigh = RHS;
29036 } else
29037 return SDValue();
29038
29039 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
29040 // with uzp1.
29041 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
29042 SDValue TruncHighOp = TruncHigh.getOperand(0);
29043 EVT TruncHighOpVT = TruncHighOp.getValueType();
29044 if (TruncHighOp.getOpcode() == AArch64ISD::DUP ||
29045 DAG.isSplatValue(TruncHighOp, false))
29046 return SDValue();
29047
29048 // Check there is other extract_high with same source vector.
29049 // For example,
29050 //
29051 // t18: v4i16 = extract_subvector t2, Constant:i64<0>
29052 // t12: v4i16 = truncate t11
29053 // t31: v4i32 = AArch64ISD::SMULL t18, t12
29054 // t23: v4i16 = extract_subvector t2, Constant:i64<4>
29055 // t16: v4i16 = truncate t15
29056 // t30: v4i32 = AArch64ISD::SMULL t23, t1
29057 //
29058 // This dagcombine assumes the two extract_high nodes use the same source
29059 // vector in order to detect the pair of the mull. If they have different
29060 // source vectors, this code will not work.
29061 // TODO: Should also try to look through a bitcast.
29062 bool HasFoundMULLow = true;
29063 SDValue ExtractHighSrcVec = ExtractHigh.getOperand(0);
29064 if (ExtractHighSrcVec->use_size() != 2)
29065 HasFoundMULLow = false;
29066
29067 // Find ExtractLow.
29068 for (SDNode *User : ExtractHighSrcVec.getNode()->users()) {
29069 if (User == ExtractHigh.getNode())
29070 continue;
29071
29072 if (User->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
29073 !isNullConstant(User->getOperand(1))) {
29074 HasFoundMULLow = false;
29075 break;
29076 }
29077
29078 ExtractLow.setNode(User);
29079 }
29080
29081 if (!ExtractLow || !ExtractLow->hasOneUse())
29082 HasFoundMULLow = false;
29083
29084 // Check ExtractLow's user.
29085 if (HasFoundMULLow) {
29086 SDNode *ExtractLowUser = *ExtractLow.getNode()->user_begin();
29087 if (ExtractLowUser->getOpcode() != N->getOpcode()) {
29088 HasFoundMULLow = false;
29089 } else {
29090 if (ExtractLowUser->getOperand(0) == ExtractLow) {
29091 if (ExtractLowUser->getOperand(1).getOpcode() == ISD::TRUNCATE)
29092 TruncLow = ExtractLowUser->getOperand(1);
29093 else
29094 HasFoundMULLow = false;
29095 } else {
29096 if (ExtractLowUser->getOperand(0).getOpcode() == ISD::TRUNCATE)
29097 TruncLow = ExtractLowUser->getOperand(0);
29098 else
29099 HasFoundMULLow = false;
29100 }
29101 }
29102 }
29103
29104 // If the truncate's operand is BUILD_VECTOR with DUP, do not combine the op
29105 // with uzp1.
29106 // You can see the regressions on test/CodeGen/AArch64/aarch64-smull.ll
29107 EVT TruncHighVT = TruncHigh.getValueType();
29108 EVT UZP1VT = TruncHighVT.getDoubleNumVectorElementsVT(*DAG.getContext());
29109 SDValue TruncLowOp =
29110 HasFoundMULLow ? TruncLow.getOperand(0) : DAG.getPOISON(UZP1VT);
29111 EVT TruncLowOpVT = TruncLowOp.getValueType();
29112 if (HasFoundMULLow && (TruncLowOp.getOpcode() == AArch64ISD::DUP ||
29113 DAG.isSplatValue(TruncLowOp, false)))
29114 return SDValue();
29115
29116 // Create uzp1, extract_high and extract_low.
29117 if (TruncHighOpVT != UZP1VT)
29118 TruncHighOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncHighOp);
29119 if (TruncLowOpVT != UZP1VT)
29120 TruncLowOp = DAG.getNode(ISD::BITCAST, DL, UZP1VT, TruncLowOp);
29121
29122 SDValue UZP1 =
29123 DAG.getNode(AArch64ISD::UZP1, DL, UZP1VT, TruncLowOp, TruncHighOp);
29124 SDValue HighIdxCst =
29125 DAG.getConstant(TruncHighVT.getVectorNumElements(), DL, MVT::i64);
29126 SDValue NewTruncHigh =
29127 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncHighVT, UZP1, HighIdxCst);
29128 DAG.ReplaceAllUsesWith(TruncHigh, NewTruncHigh);
29129
29130 if (HasFoundMULLow) {
29131 EVT TruncLowVT = TruncLow.getValueType();
29132 SDValue NewTruncLow = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, TruncLowVT,
29133 UZP1, ExtractLow.getOperand(1));
29134 DAG.ReplaceAllUsesWith(TruncLow, NewTruncLow);
29135 }
29136
29137 return SDValue(N, 0);
29138}
29139
29140static SDValue performMULLCombine(SDNode *N,
29141 TargetLowering::DAGCombinerInfo &DCI,
29142 SelectionDAG &DAG) {
29143 if (SDValue Val =
29144 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG))
29145 return Val;
29146
29147 if (SDValue Val = tryCombineMULLWithUZP1(N, DCI, DAG))
29148 return Val;
29149
29150 return SDValue();
29151}
29152
29153static SDValue performPTestFirstCombine(SDNode *N,
29154 TargetLowering::DAGCombinerInfo &DCI,
29155 SelectionDAG &DAG) {
29156 if (DCI.isBeforeLegalize())
29157 return SDValue();
29158
29159 SDLoc DL(N);
29160 auto Mask = N->getOperand(0);
29161 auto Pred = N->getOperand(1);
29162
29163 if (!isLane0KnownActive(Mask))
29164 return SDValue();
29165
29166 if (Pred->getOpcode() == AArch64ISD::REINTERPRET_CAST)
29167 Pred = Pred->getOperand(0);
29168
29169 if (Pred->getOpcode() == ISD::CONCAT_VECTORS) {
29170 Pred = Pred->getOperand(0);
29171 Pred = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pred);
29172 return DAG.getNode(AArch64ISD::PTEST_FIRST, DL, N->getValueType(0), Mask,
29173 Pred);
29174 }
29175
29176 return SDValue();
29177}
29178
29179static SDValue
29180performScalarToVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
29181 SelectionDAG &DAG) {
29182 SDLoc DL(N);
29183
29184 // If a DUP(Op0) already exists, reuse it for the scalar_to_vector.
29185 if (DCI.isAfterLegalizeDAG()) {
29186 if (SDNode *LN = DCI.DAG.getNodeIfExists(AArch64ISD::DUP, N->getVTList(),
29187 N->getOperand(0)))
29188 return SDValue(LN, 0);
29189 }
29190
29191 // Let's do below transform.
29192 //
29193 // t34: v4i32 = AArch64ISD::UADDLV t2
29194 // t35: i32 = extract_vector_elt t34, Constant:i64<0>
29195 // t7: i64 = zero_extend t35
29196 // t20: v1i64 = scalar_to_vector t7
29197 // ==>
29198 // t34: v4i32 = AArch64ISD::UADDLV t2
29199 // t39: v2i32 = extract_subvector t34, Constant:i64<0>
29200 // t40: v1i64 = AArch64ISD::NVCAST t39
29201 if (DCI.isBeforeLegalizeOps())
29202 return SDValue();
29203
29204 EVT VT = N->getValueType(0);
29205 if (VT != MVT::v1i64)
29206 return SDValue();
29207
29208 SDValue ZEXT = N->getOperand(0);
29209 if (ZEXT.getOpcode() != ISD::ZERO_EXTEND || ZEXT.getValueType() != MVT::i64)
29210 return SDValue();
29211
29212 SDValue EXTRACT_VEC_ELT = ZEXT.getOperand(0);
29213 if (EXTRACT_VEC_ELT.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
29214 EXTRACT_VEC_ELT.getValueType() != MVT::i32)
29215 return SDValue();
29216
29217 if (!isNullConstant(EXTRACT_VEC_ELT.getOperand(1)))
29218 return SDValue();
29219
29220 SDValue UADDLV = EXTRACT_VEC_ELT.getOperand(0);
29221 if (UADDLV.getOpcode() != AArch64ISD::UADDLV ||
29222 UADDLV.getValueType() != MVT::v4i32 ||
29223 UADDLV.getOperand(0).getValueType() != MVT::v8i8)
29224 return SDValue();
29225
29226 // Let's generate new sequence with AArch64ISD::NVCAST.
29227 SDValue EXTRACT_SUBVEC =
29228 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, UADDLV,
29229 DAG.getConstant(0, DL, MVT::i64));
29230 SDValue NVCAST =
29231 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, EXTRACT_SUBVEC);
29232
29233 return NVCAST;
29234}
29235
29236static SDValue performVectorDeinterleaveCombine(
29237 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
29238 if (!DCI.isBeforeLegalize())
29239 return SDValue();
29240
29241 unsigned NumParts = N->getNumOperands();
29242 if (NumParts != 2 && NumParts != 4)
29243 return SDValue();
29244
29245 EVT SubVecTy = N->getValueType(0);
29246
29247 // At the moment we're unlikely to see a fixed-width vector deinterleave as
29248 // we usually generate shuffles instead.
29249 unsigned MinNumElements = SubVecTy.getVectorMinNumElements();
29250 if (!SubVecTy.isScalableVector() ||
29251 SubVecTy.getSizeInBits().getKnownMinValue() != 128 ||
29252 !DAG.getTargetLoweringInfo().isTypeLegal(SubVecTy))
29253 return SDValue();
29254
29255 // Make sure each input operand is the correct extract_subvector of the same
29256 // wider vector.
29257 SDValue Op0 = N->getOperand(0);
29258 for (unsigned I = 0; I < NumParts; I++) {
29259 SDValue OpI = N->getOperand(I);
29260 if (OpI->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
29261 OpI->getOperand(0) != Op0->getOperand(0))
29262 return SDValue();
29263 if (OpI->getConstantOperandVal(1) != (I * MinNumElements))
29264 return SDValue();
29265 }
29266
29267 // Normal loads are currently already handled by the InterleavedAccessPass so
29268 // we don't expect to see them here. Bail out if the masked load has an
29269 // unexpected number of uses, since we want to avoid a situation where we have
29270 // both deinterleaving loads and normal loads in the same block. Also, discard
29271 // masked loads that are extending, indexed, have an unexpected offset or have
29272 // an unsupported passthru value until we find a valid use case.
29273 auto MaskedLoad = dyn_cast<MaskedLoadSDNode>(Op0->getOperand(0));
29274 if (!MaskedLoad || !MaskedLoad->hasNUsesOfValue(NumParts, 0) ||
29275 !MaskedLoad->isSimple() || !ISD::isNormalMaskedLoad(MaskedLoad) ||
29276 !MaskedLoad->getOffset().isUndef() ||
29277 (!MaskedLoad->getPassThru()->isUndef() &&
29278 !isZerosVector(MaskedLoad->getPassThru().getNode())))
29279 return SDValue();
29280
29281 // Now prove that the mask is an interleave of identical masks.
29282 SDLoc DL(N);
29283 SDValue NarrowMask =
29284 getNarrowMaskForInterleavedOps(DAG, DL, MaskedLoad->getMask(), NumParts);
29285 if (!NarrowMask)
29286 return SDValue();
29287
29288 const Intrinsic::ID IID = NumParts == 2 ? Intrinsic::aarch64_sve_ld2_sret
29289 : Intrinsic::aarch64_sve_ld4_sret;
29290 SDValue NewLdOps[] = {MaskedLoad->getChain(),
29291 DAG.getConstant(IID, DL, MVT::i32), NarrowMask,
29292 MaskedLoad->getBasePtr()};
29293 SDValue Res;
29294 if (NumParts == 2)
29295 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
29296 {SubVecTy, SubVecTy, MVT::Other}, NewLdOps);
29297 else
29298 Res = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
29299 {SubVecTy, SubVecTy, SubVecTy, SubVecTy, MVT::Other},
29300 NewLdOps);
29301
29302 // We can now generate a structured load!
29303 SmallVector<SDValue, 4> ResOps(NumParts);
29304 for (unsigned Idx = 0; Idx < NumParts; Idx++)
29305 ResOps[Idx] = SDValue(Res.getNode(), Idx);
29306
29307 // Replace uses of the original chain result with the new chain result.
29308 DAG.ReplaceAllUsesOfValueWith(SDValue(MaskedLoad, 1),
29309 SDValue(Res.getNode(), NumParts));
29310 return DCI.CombineTo(N, ResOps, false);
29311}
29312
29313/// If the operand is a bitwise AND with a constant RHS, and the shift has a
29314/// constant RHS and is the only use, we can pull it out of the shift, i.e.
29315///
29316/// (shl (and X, C1), C2) -> (and (shl X, C2), (shl C1, C2))
29317///
29318/// We prefer this canonical form to match existing isel patterns.
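/// For example, (shl (and x, 0xff), 8) becomes (and (shl x, 8), 0xff00).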
29319static SDValue performSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
29320 SelectionDAG &DAG) {
29322 if (DCI.isBeforeLegalizeOps())
29323 return SDValue();
29324
29325 SDValue Op0 = N->getOperand(0);
29326 if (Op0.getOpcode() != ISD::AND || !Op0.hasOneUse())
29327 return SDValue();
29328
29329 SDValue C1 = Op0->getOperand(1);
29330 SDValue C2 = N->getOperand(1);
29331 if (!isa<ConstantSDNode>(C1) || !isa<ConstantSDNode>(C2))
29332 return SDValue();
29333
29334 // Might be folded into shifted op, do not lower.
29335 if (N->hasOneUse()) {
29336 unsigned UseOpc = N->user_begin()->getOpcode();
29337 if (UseOpc == ISD::ADD || UseOpc == ISD::SUB || UseOpc == ISD::SETCC ||
29338 UseOpc == AArch64ISD::ADDS || UseOpc == AArch64ISD::SUBS)
29339 return SDValue();
29340 }
29341
29342 SDLoc DL(N);
29343 EVT VT = N->getValueType(0);
29344
29345 // Don't combine unless (shl C1, C2) can be constant folded. Otherwise,
29346 // DAGCombiner will simplify (and (op x...), (op y...)) -> (op (and x, y))
29347 // causing infinite loop. Result may also be worse.
29348 SDValue NewRHS = DAG.getNode(ISD::SHL, DL, VT, C1, C2);
29349 if (!isa<ConstantSDNode>(NewRHS))
29350 return SDValue();
29351
29352 SDValue X = Op0->getOperand(0);
29353 SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, X, C2);
29354 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewRHS);
29355}
29356
29357static SDValue performRNDRCombine(SDNode *N, SelectionDAG &DAG) {
29358 unsigned IntrinsicID = N->getConstantOperandVal(1);
29359 auto Register =
29360 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
29361 : AArch64SysReg::RNDRRS);
29362 SDLoc DL(N);
29363 SDValue A = DAG.getNode(
29364 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, FlagsVT, MVT::Other),
29365 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i32));
29366 SDValue B = DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
29367 DAG.getConstant(0, DL, MVT::i32),
29368 DAG.getConstant(0, DL, MVT::i32),
29369 getCondCode(DAG, AArch64CC::NE), A.getValue(1));
29370 return DAG.getMergeValues(
29371 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
29372}
29373
29374static SDValue performCTPOPCombine(SDNode *N,
29375 TargetLowering::DAGCombinerInfo &DCI,
29376 SelectionDAG &DAG) {
29377 using namespace llvm::SDPatternMatch;
29378 if (!DCI.isBeforeLegalize())
29379 return SDValue();
29380
29381 // ctpop(zext(bitcast(vector_mask))) -> neg(signed_reduce_add(vector_mask))
29382 EVT SrcVT;
29383 SDValue Mask;
29384 if (!sd_match(N->getOperand(0),
29385 m_ZExtOrSelf(m_VT(SrcVT, m_BitCast(m_Value(Mask))))))
29386 return SDValue();
29387
29388 EVT VT = N->getValueType(0);
29389 EVT MaskVT = Mask.getValueType();
29390
29391 if (VT.isVector() || !MaskVT.isFixedLengthVector() ||
29392 MaskVT.getVectorElementType() != MVT::i1)
29393 return SDValue();
29394
29395 EVT ReduceInVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
29396 MaskVT.getVectorElementCount());
29397
29398 EVT CmpVT;
29399 // Use the same VT as the SETcc if -CTPOP would not overflow.
29400 if (sd_match(Mask, m_SetCC(m_VT(CmpVT), m_Value(), m_Value()))) {
29401 CmpVT = CmpVT.changeVectorElementTypeToInteger();
29402 if (Log2_64_Ceil(MaskVT.getSizeInBits()) <= CmpVT.getScalarSizeInBits() - 1)
29403 ReduceInVT = CmpVT;
29404 }
29405
29406 SDLoc DL(N);
29407 EVT PopVT = ReduceInVT.getScalarType();
29408 // Sign extend to best fit ZeroOrNegativeOneBooleanContent.
29409 SDValue ExtMask = DAG.getNode(ISD::SIGN_EXTEND, DL, ReduceInVT, Mask);
29410 SDValue NegPopCount = DAG.getNode(ISD::VECREDUCE_ADD, DL, PopVT, ExtMask);
29411 SDValue ExtPopCount = DAG.getSExtOrTrunc(NegPopCount, DL, VT);
29412 return DAG.getNegative(ExtPopCount, DL, VT);
29413}
29414
29415static unsigned getReductionForOpcode(unsigned Op) {
29416 switch (Op) {
29417 case ISD::SMIN:
29418 return ISD::VECREDUCE_SMIN;
29419 case ISD::SMAX:
29420 return ISD::VECREDUCE_SMAX;
29421 case ISD::UMIN:
29422 return ISD::VECREDUCE_UMIN;
29423 case ISD::UMAX:
29424 return ISD::VECREDUCE_UMAX;
29425 default:
29426 llvm_unreachable("unimplemented mapping");
29427 }
29428}
29429
29430static SDValue performMINMAXCombine(SDNode *N, SelectionDAG &DAG,
29431 const AArch64TargetLowering &TLI) {
29432 using namespace llvm::SDPatternMatch;
29433 if (SDValue V = trySQDMULHCombine(N, DAG))
29434 return V;
29435
29436 unsigned ReductionOpcode = getReductionForOpcode(N->getOpcode());
29437 if (!TLI.isOperationLegalOrCustom(ReductionOpcode, MVT::v2i64))
29438 return SDValue();
29439
29440 // Fold `min/max(vec[0], vec[1])` to `vecreduce_min/max(vec)` for v2i64.
29441
29442 APInt Idx;
29443 SDValue Vec;
29444 if (!sd_match(N->getOperand(0),
29445 m_OneUse(m_ExtractElt(m_SpecificVT(MVT::v2i64, m_Value(Vec)),
29446 m_ConstInt(Idx)))))
29447 return SDValue();
29448
29449 if (!sd_match(
29450 N->getOperand(1),
29452 return SDValue();
29453
29454 return DAG.getNode(ReductionOpcode, SDLoc(N), N->getValueType(0), Vec);
29455}
29456
29457SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
29458 DAGCombinerInfo &DCI) const {
29459 SelectionDAG &DAG = DCI.DAG;
29460 switch (N->getOpcode()) {
29461 default:
29462 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
29463 break;
29464 case ISD::VECTOR_DEINTERLEAVE:
29465 return performVectorDeinterleaveCombine(N, DCI, DAG);
29466 case ISD::VECREDUCE_AND:
29467 case ISD::VECREDUCE_OR:
29468 case ISD::VECREDUCE_XOR:
29469 return performVecReduceBitwiseCombine(N, DCI, DAG);
29470 case ISD::ADD:
29471 case ISD::SUB:
29472 return performAddSubCombine(N, DCI);
29473 case ISD::BUILD_VECTOR:
29474 return performBuildVectorCombine(N, DCI, DAG);
29475 case ISD::UMAX:
29476 case ISD::UMIN:
29477 case ISD::SMAX:
29478 case ISD::SMIN:
29479 return performMINMAXCombine(N, DAG, *this);
29480 case ISD::TRUNCATE:
29481 return performTruncateCombine(N, DAG, DCI);
29482 case AArch64ISD::ANDS:
29483 return performANDSCombine(N, DCI);
29484 case AArch64ISD::ADC:
29485 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
29486 return R;
29487 return foldADCToCINC(N, DAG);
29488 case AArch64ISD::SBC:
29489 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
29490 case AArch64ISD::ADCS:
29491 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
29492 return R;
29493 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
29494 case AArch64ISD::SBCS:
29495 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
29496 return R;
29497 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
29498 case AArch64ISD::ADDS:
29499 return performFlagSettingCombine(N, DCI, ISD::ADD);
29500 case AArch64ISD::SUBS:
29501 return performFlagSettingCombine(N, DCI, ISD::SUB);
29502 case AArch64ISD::BICi:
29503 return performBICiCombine(N, DAG, DCI);
29504 case ISD::XOR:
29505 return performXorCombine(N, DAG, DCI, Subtarget);
29506 case ISD::MUL:
29507 return performMulCombine(N, DAG, DCI, Subtarget);
29508 case ISD::SINT_TO_FP:
29509 case ISD::UINT_TO_FP:
29510 return performIntToFpCombine(N, DAG, DCI, Subtarget);
29511 case ISD::FP_TO_SINT:
29512 case ISD::FP_TO_UINT:
29513 case ISD::FP_TO_SINT_SAT:
29514 case ISD::FP_TO_UINT_SAT:
29515 return performFpToIntCombine(N, DAG, DCI, Subtarget);
29516 case ISD::OR:
29517 return performORCombine(N, DCI);
29518 case ISD::AND:
29519 return performANDCombine(N, DCI);
29520 case ISD::FADD:
29521 return performFADDCombine(N, DCI);
29522 case ISD::INTRINSIC_WO_CHAIN:
29523 return performIntrinsicCombine(N, DCI, Subtarget);
29524 case ISD::ANY_EXTEND:
29525 case ISD::ZERO_EXTEND:
29526 case ISD::SIGN_EXTEND:
29527 return performExtendCombine(N, DCI, DAG, Subtarget);
29528 case ISD::SIGN_EXTEND_INREG:
29529 return performSignExtendInRegCombine(N, DCI, DAG);
29530 case ISD::CONCAT_VECTORS:
29531 return performConcatVectorsCombine(N, DCI, DAG);
29532 case ISD::EXTRACT_SUBVECTOR:
29533 return performExtractSubvectorCombine(N, DCI, DAG);
29534 case ISD::INSERT_SUBVECTOR:
29535 return performInsertSubvectorCombine(N, DCI, DAG);
29536 case ISD::SELECT:
29537 return performSelectCombine(N, DCI);
29538 case ISD::SELECT_CC:
29539 return performSELECT_CCCombine(N, DCI, DAG);
29540 case ISD::VSELECT:
29541 return performVSelectCombine(N, DCI, Subtarget);
29542 case ISD::SETCC:
29543 return performSETCCCombine(N, DCI, DAG);
29544 case ISD::LOAD:
29545 return performLOADCombine(N, DCI, DAG, Subtarget);
29546 case ISD::STORE:
29547 return performSTORECombine(N, DCI, DAG, Subtarget);
29548 case ISD::MSTORE:
29549 return performMSTORECombine(N, DCI, DAG, Subtarget);
29550 case ISD::MGATHER:
29551 case ISD::MSCATTER:
29552 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
29553 return performMaskedGatherScatterCombine(N, DCI, DAG);
29554 case ISD::FP_EXTEND:
29555 return performFPExtendCombine(N, DAG, DCI, Subtarget);
29556 case AArch64ISD::BRCOND:
29557 return performBRCONDCombine(N, DCI, DAG);
29558 case AArch64ISD::TBNZ:
29559 case AArch64ISD::TBZ:
29560 return performTBZCombine(N, DCI, DAG);
29561 case AArch64ISD::CSEL:
29562 return performCSELCombine(N, DCI, DAG);
29563 case AArch64ISD::DUP:
29564 case AArch64ISD::DUPLANE8:
29565 case AArch64ISD::DUPLANE16:
29566 case AArch64ISD::DUPLANE32:
29567 case AArch64ISD::DUPLANE64:
29568 return performDUPCombine(N, DCI);
29569 case AArch64ISD::DUPLANE128:
29570 return performDupLane128Combine(N, DAG);
29571 case AArch64ISD::NVCAST:
29572 return performNVCASTCombine(N, DAG);
29573 case AArch64ISD::SPLICE:
29574 return performSpliceCombine(N, DAG);
29575 case AArch64ISD::UUNPKLO:
29576 case AArch64ISD::UUNPKHI:
29577 return performUnpackCombine(N, DAG, Subtarget);
29578 case AArch64ISD::UZP1:
29579 case AArch64ISD::UZP2:
29580 return performUzpCombine(N, DAG, Subtarget);
29581 case AArch64ISD::SETCC_MERGE_ZERO:
29582 return performSetccMergeZeroCombine(N, DCI);
29583 case AArch64ISD::REINTERPRET_CAST:
29584 return performReinterpretCastCombine(N);
29585 case AArch64ISD::GLD1_MERGE_ZERO:
29586 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
29587 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
29588 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
29589 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
29590 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
29591 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
29592 case AArch64ISD::GLD1S_MERGE_ZERO:
29593 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
29594 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
29595 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
29596 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
29597 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
29598 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
29599 return performGLD1Combine(N, DAG);
29600 case AArch64ISD::VASHR:
29601 case AArch64ISD::VLSHR:
29602 return performVectorShiftCombine(N, *this, DCI);
29603 case AArch64ISD::SUNPKLO:
29604 return performSunpkloCombine(N, DAG);
29605 case AArch64ISD::BSP:
29606 return performBSPExpandForSVE(N, DAG, Subtarget);
29607 case ISD::INSERT_VECTOR_ELT:
29608 return performInsertVectorEltCombine(N, DCI);
29609 case ISD::EXTRACT_VECTOR_ELT:
29610 return performExtractVectorEltCombine(N, DCI, Subtarget);
29611 case ISD::VECREDUCE_ADD:
29612 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
29613 case ISD::VECREDUCE_FADD:
29614 return tryCombineFADDReductionWithZero(N, DCI.DAG, Subtarget,
29615 N->getOperand(0));
29616 case ISD::VECREDUCE_SEQ_FADD:
29617 return tryCombineFADDReductionWithZero(N, DCI.DAG, Subtarget,
29618 N->getOperand(1), N->getOperand(0));
29619 case ISD::GET_ACTIVE_LANE_MASK:
29620 return performActiveLaneMaskCombine(N, DCI, Subtarget);
29621 case AArch64ISD::UADDV:
29622 return performUADDVCombine(N, DAG);
29623 case AArch64ISD::SMULL:
29624 case AArch64ISD::UMULL:
29625 case AArch64ISD::PMULL:
29626 return performMULLCombine(N, DCI, DAG);
29627 case AArch64ISD::PTEST_FIRST:
29628 return performPTestFirstCombine(N, DCI, DAG);
29629 case ISD::INTRINSIC_VOID:
29630 case ISD::INTRINSIC_W_CHAIN:
29631 switch (N->getConstantOperandVal(1)) {
29632 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
29633 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
29634 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
29635 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
29636 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
29637 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
29638 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
29639 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
29640 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
29641 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
29642 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
29643 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
29644 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
29645 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
29646 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
29647 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
29648 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
29649 case Intrinsic::aarch64_neon_ld2:
29650 case Intrinsic::aarch64_neon_ld3:
29651 case Intrinsic::aarch64_neon_ld4:
29652 case Intrinsic::aarch64_neon_ld1x2:
29653 case Intrinsic::aarch64_neon_ld1x3:
29654 case Intrinsic::aarch64_neon_ld1x4:
29655 case Intrinsic::aarch64_neon_ld2lane:
29656 case Intrinsic::aarch64_neon_ld3lane:
29657 case Intrinsic::aarch64_neon_ld4lane:
29658 case Intrinsic::aarch64_neon_ld2r:
29659 case Intrinsic::aarch64_neon_ld3r:
29660 case Intrinsic::aarch64_neon_ld4r:
29661 case Intrinsic::aarch64_neon_st2:
29662 case Intrinsic::aarch64_neon_st3:
29663 case Intrinsic::aarch64_neon_st4:
29664 case Intrinsic::aarch64_neon_st1x2:
29665 case Intrinsic::aarch64_neon_st1x3:
29666 case Intrinsic::aarch64_neon_st1x4:
29667 case Intrinsic::aarch64_neon_st2lane:
29668 case Intrinsic::aarch64_neon_st3lane:
29669 case Intrinsic::aarch64_neon_st4lane:
29670 return performNEONPostLDSTCombine(N, DCI, DAG);
29671 case Intrinsic::aarch64_sve_ldnt1:
29672 return performLDNT1Combine(N, DAG);
29673 case Intrinsic::aarch64_sve_ld1rq:
29674 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
29675 case Intrinsic::aarch64_sve_ld1ro:
29676 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
29677 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
29678 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
29679 case Intrinsic::aarch64_sve_ldnt1_gather:
29680 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
29681 case Intrinsic::aarch64_sve_ldnt1_gather_index:
29682 return performGatherLoadCombine(N, DAG,
29683 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
29684 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
29685 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
29686 case Intrinsic::aarch64_sve_ld1:
29687 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
29688 case Intrinsic::aarch64_sve_ldnf1:
29689 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
29690 case Intrinsic::aarch64_sve_ldff1:
29691 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
29692 case Intrinsic::aarch64_sve_st1:
29693 return performST1Combine(N, DAG);
29694 case Intrinsic::aarch64_sve_stnt1:
29695 return performSTNT1Combine(N, DAG);
29696 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
29697 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
29698 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
29699 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
29700 case Intrinsic::aarch64_sve_stnt1_scatter:
29701 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
29702 case Intrinsic::aarch64_sve_stnt1_scatter_index:
29703 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
29704 case Intrinsic::aarch64_sve_ld1_gather:
29705 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
29706 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
29707 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
29708 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
29709 case Intrinsic::aarch64_sve_ld1q_gather_index:
29710 return performGatherLoadCombine(N, DAG,
29711 AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
29712 case Intrinsic::aarch64_sve_ld1_gather_index:
29713 return performGatherLoadCombine(N, DAG,
29714 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
29715 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
29716 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
29717 /*OnlyPackedOffsets=*/false);
29718 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
29719 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
29720 /*OnlyPackedOffsets=*/false);
29721 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
29722 return performGatherLoadCombine(N, DAG,
29723 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
29724 /*OnlyPackedOffsets=*/false);
29725 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
29726 return performGatherLoadCombine(N, DAG,
29727 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
29728 /*OnlyPackedOffsets=*/false);
29729 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
29730 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
29731 case Intrinsic::aarch64_sve_ldff1_gather:
29732 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
29733 case Intrinsic::aarch64_sve_ldff1_gather_index:
29734 return performGatherLoadCombine(N, DAG,
29735 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
29736 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
29737 return performGatherLoadCombine(N, DAG,
29738 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
29739 /*OnlyPackedOffsets=*/false);
29740 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
29741 return performGatherLoadCombine(N, DAG,
29742 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
29743 /*OnlyPackedOffsets=*/false);
29744 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
29745 return performGatherLoadCombine(N, DAG,
29746 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
29747 /*OnlyPackedOffsets=*/false);
29748 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
29749 return performGatherLoadCombine(N, DAG,
29750 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
29751 /*OnlyPackedOffsets=*/false);
29752 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
29753 return performGatherLoadCombine(N, DAG,
29754 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
29755 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
29756 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
29757 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
29758 case Intrinsic::aarch64_sve_st1q_scatter_index:
29759 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
29760 case Intrinsic::aarch64_sve_st1_scatter:
29761 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
29762 case Intrinsic::aarch64_sve_st1_scatter_index:
29763 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
29764 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
29765 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
29766 /*OnlyPackedOffsets=*/false);
29767 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
29768 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
29769 /*OnlyPackedOffsets=*/false);
29770 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
29771 return performScatterStoreCombine(N, DAG,
29772 AArch64ISD::SST1_SXTW_SCALED_PRED,
29773 /*OnlyPackedOffsets=*/false);
29774 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
29775 return performScatterStoreCombine(N, DAG,
29776 AArch64ISD::SST1_UXTW_SCALED_PRED,
29777 /*OnlyPackedOffsets=*/false);
29778 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
29779 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
29780 case Intrinsic::aarch64_rndr:
29781 case Intrinsic::aarch64_rndrrs:
29782 return performRNDRCombine(N, DAG);
29783 case Intrinsic::aarch64_sme_ldr_zt:
29784 return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N),
29785 DAG.getVTList(MVT::Other), N->getOperand(0),
29786 N->getOperand(2), N->getOperand(3));
29787 case Intrinsic::aarch64_sme_str_zt:
29788 return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N),
29789 DAG.getVTList(MVT::Other), N->getOperand(0),
29790 N->getOperand(2), N->getOperand(3));
29791 default:
29792 break;
29793 }
29794 break;
29795 case ISD::GlobalAddress:
29796 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
29797 case ISD::CTLZ:
29798 return performCTLZCombine(N, DAG, Subtarget);
29799 case ISD::SCALAR_TO_VECTOR:
29800 return performScalarToVectorCombine(N, DCI, DAG);
29801 case ISD::SHL:
29802 return performSHLCombine(N, DCI, DAG);
29803 case ISD::CTPOP:
29804 return performCTPOPCombine(N, DCI, DAG);
29805 }
29806 return SDValue();
29807}
29808
29809// Check if the return value is used as only a return value, as otherwise
29810// we can't perform a tail-call. In particular, we need to check for
29811// target ISD nodes that are returns and any other "odd" constructs
29812// that the generic analysis code won't necessarily catch.
29813bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
29814 SDValue &Chain) const {
29815 if (N->getNumValues() != 1)
29816 return false;
29817 if (!N->hasNUsesOfValue(1, 0))
29818 return false;
29819
29820 SDValue TCChain = Chain;
29821 SDNode *Copy = *N->user_begin();
29822 if (Copy->getOpcode() == ISD::CopyToReg) {
29823 // If the copy has a glue operand, we conservatively assume it isn't safe to
29824 // perform a tail call.
29825 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
29826 MVT::Glue)
29827 return false;
29828 TCChain = Copy->getOperand(0);
29829 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
29830 return false;
29831
29832 bool HasRet = false;
29833 for (SDNode *Node : Copy->users()) {
29834 if (Node->getOpcode() != AArch64ISD::RET_GLUE)
29835 return false;
29836 HasRet = true;
29837 }
29838
29839 if (!HasRet)
29840 return false;
29841
29842 Chain = TCChain;
29843 return true;
29844}
29845
29846 // Return whether an instruction can potentially be optimized to a tail
29847// call. This will cause the optimizers to attempt to move, or duplicate,
29848// return instructions to help enable tail call optimizations for this
29849// instruction.
29850bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
29851 return CI->isTailCall();
29852}
29853
29854bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base,
29855 Register Offset, bool IsPre,
29856 MachineRegisterInfo &MRI) const {
29857 auto CstOffset = getIConstantVRegVal(Offset, MRI);
29858 if (!CstOffset || CstOffset->isZero())
29859 return false;
29860
29861 // All of the indexed addressing mode instructions take a signed 9 bit
29862 // immediate offset. Our CstOffset is a G_PTR_ADD offset so it already
29863 // encodes the sign/indexing direction.
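// In other words the offset must fit in [-256, 255]; e.g. #255 or #-256 is fine, #256 is not.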
29864 return isInt<9>(CstOffset->getSExtValue());
29865}
29866
29867bool AArch64TargetLowering::getIndexedAddressParts(SDNode *N, SDNode *Op,
29868 SDValue &Base,
29869 SDValue &Offset,
29870 SelectionDAG &DAG) const {
29871 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
29872 return false;
29873
29874 // Non-null if there is exactly one user of the loaded value (ignoring chain).
29875 SDNode *ValOnlyUser = nullptr;
29876 for (SDUse &U : N->uses()) {
29877 if (U.getResNo() == 1)
29878 continue; // Ignore chain.
29879 if (ValOnlyUser == nullptr)
29880 ValOnlyUser = U.getUser();
29881 else {
29882 ValOnlyUser = nullptr; // Multiple non-chain uses, bail out.
29883 break;
29884 }
29885 }
29886
29887 auto IsUndefOrZero = [](SDValue V) {
29888 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
29889 };
29890
29891 // If the only user of the value is a scalable vector splat, it is
29892 // preferable to do a replicating load (ld1r*).
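// Returning false keeps the plain (non-indexed) load so instruction selection can
// match the load and the splat together as a single LD1R-style instruction.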
29893 if (ValOnlyUser && ValOnlyUser->getValueType(0).isScalableVector() &&
29894 (ValOnlyUser->getOpcode() == ISD::SPLAT_VECTOR ||
29895 (ValOnlyUser->getOpcode() == AArch64ISD::DUP_MERGE_PASSTHRU &&
29896 IsUndefOrZero(ValOnlyUser->getOperand(2)))))
29897 return false;
29898
29899 Base = Op->getOperand(0);
29900 // All of the indexed addressing mode instructions take a signed
29901 // 9 bit immediate offset.
29902 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
29903 int64_t RHSC = RHS->getSExtValue();
29904 if (Op->getOpcode() == ISD::SUB)
29905 RHSC = -(uint64_t)RHSC;
29906 if (!isInt<9>(RHSC))
29907 return false;
29908 // When big-endian VLD1/VST1 are used for vector load and store, they
29909 // only allow an offset that's equal to the store size.
29910 EVT MemType = cast<MemSDNode>(N)->getMemoryVT();
29911 if (!Subtarget->isLittleEndian() && MemType.isVector() &&
29912 (uint64_t)RHSC != MemType.getStoreSize())
29913 return false;
29914 // Always emit pre-inc/post-inc addressing mode. Use negated constant offset
29915 // when dealing with subtraction.
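// e.g. (sub ptr, 4) is recorded as offset -4, selecting the pre/post-decrement form.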
29916 Offset = DAG.getConstant(RHSC, SDLoc(N), RHS->getValueType(0));
29917 return true;
29918 }
29919 return false;
29920}
29921
29922bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
29923 SDValue &Offset,
29925 SelectionDAG &DAG) const {
29926 EVT VT;
29927 SDValue Ptr;
29928 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
29929 VT = LD->getMemoryVT();
29930 Ptr = LD->getBasePtr();
29931 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
29932 VT = ST->getMemoryVT();
29933 Ptr = ST->getBasePtr();
29934 } else
29935 return false;
29936
29937 if (!getIndexedAddressParts(N, Ptr.getNode(), Base, Offset, DAG))
29938 return false;
29939 AM = ISD::PRE_INC;
29940 return true;
29941}
29942
29943bool AArch64TargetLowering::getPostIndexedAddressParts(
29945 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
29946 EVT VT;
29947 SDValue Ptr;
29948 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
29949 VT = LD->getMemoryVT();
29950 Ptr = LD->getBasePtr();
29951 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
29952 VT = ST->getMemoryVT();
29953 Ptr = ST->getBasePtr();
29954 } else
29955 return false;
29956
29957 if (!getIndexedAddressParts(N, Op, Base, Offset, DAG))
29958 return false;
29959 // Post-indexing updates the base, so it's not a valid transform
29960 // if that's not the same as the load's pointer.
29961 if (Ptr != Base)
29962 return false;
29963 AM = ISD::POST_INC;
29964 return true;
29965}
29966
29969 SelectionDAG &DAG) {
29970 SDLoc DL(N);
29971 SDValue Op = N->getOperand(0);
29972 EVT VT = N->getValueType(0);
29973 [[maybe_unused]] EVT SrcVT = Op.getValueType();
29974 assert(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
29975 "Must be bool vector.");
29976
29977 // Special handling for Clang's __builtin_convertvector. For vectors with <8
29978 // elements, it adds a vector concatenation with undef(s). If we encounter
29979 // this here, we can skip the concat.
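// i.e. (concat_vectors X, undef, ..., undef) is treated as just X below.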
29980 if (Op.getOpcode() == ISD::CONCAT_VECTORS && !Op.getOperand(0).isUndef()) {
29981 bool AllUndef = true;
29982 for (unsigned I = 1; I < Op.getNumOperands(); ++I)
29983 AllUndef &= Op.getOperand(I).isUndef();
29984
29985 if (AllUndef)
29986 Op = Op.getOperand(0);
29987 }
29988
29989 SDValue VectorBits = vectorToScalarBitmask(Op.getNode(), DAG);
29990 if (VectorBits)
29991 Results.push_back(DAG.getZExtOrTrunc(VectorBits, DL, VT));
29992}
29993
29996 SelectionDAG &DAG, EVT ExtendVT,
29997 EVT CastVT) {
29998 SDLoc DL(N);
29999 SDValue Op = N->getOperand(0);
30000 EVT VT = N->getValueType(0);
30001
30002 // Use SCALAR_TO_VECTOR for lane zero
30003 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtendVT, Op);
30004 SDValue CastVal = DAG.getNode(ISD::BITCAST, DL, CastVT, Vec);
30005 SDValue IdxZero = DAG.getVectorIdxConstant(0, DL);
30006 Results.push_back(
30007 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, CastVal, IdxZero));
30008}
30009
30010void AArch64TargetLowering::ReplaceBITCASTResults(
30012 SDLoc DL(N);
30013 SDValue Op = N->getOperand(0);
30014 EVT VT = N->getValueType(0);
30015 EVT SrcVT = Op.getValueType();
30016
30017 if (VT == MVT::v2i16 && SrcVT == MVT::i32) {
30018 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v4i16);
30019 return;
30020 }
30021
30022 if (VT == MVT::v4i8 && SrcVT == MVT::i32) {
30023 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v2i32, MVT::v8i8);
30024 return;
30025 }
30026
30027 if (VT == MVT::v2i8 && SrcVT == MVT::i16) {
30028 CustomNonLegalBITCASTResults(N, Results, DAG, MVT::v4i16, MVT::v8i8);
30029 return;
30030 }
30031
30032 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
30033 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
30034 "Expected fp->int bitcast!");
30035
30036 // Bitcasting between unpacked vector types of different element counts is
30037 // not a NOP because the live elements are laid out differently.
30038 // 01234567
30039 // e.g. nxv2i32 = XX??XX??
30040 // nxv4f16 = X?X?X?X?
30041 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
30042 return;
30043
30044 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
30045 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
30046 return;
30047 }
30048
30049 if (SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
30050 !VT.isVector())
30051 return replaceBoolVectorBitcast(N, Results, DAG);
30052
30053 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
30054 return;
30055
30056 Op = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
30057 DAG.getPOISON(MVT::i32), Op);
30058 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
30059 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
30060}
30061
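// Try to replace a 256-bit add/fadd of X and a <1,0,3,2,...> shuffle of X with an
// ADDP of the two 128-bit halves of X, then duplicate each pairwise sum into both
// lanes of its pair so the result matches the original add+shuffle. FP types
// additionally require the reassoc flag.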
30063 SelectionDAG &DAG,
30064 const AArch64Subtarget *Subtarget) {
30065 EVT VT = N->getValueType(0);
30066 if (!VT.is256BitVector() ||
30067 (VT.getScalarType().isFloatingPoint() &&
30068 !N->getFlags().hasAllowReassociation()) ||
30069 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
30070 VT.getScalarType() == MVT::bf16)
30071 return;
30072
30073 SDValue X = N->getOperand(0);
30074 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
30075 if (!Shuf) {
30076 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
30077 X = N->getOperand(1);
30078 if (!Shuf)
30079 return;
30080 }
30081
30082 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
30083 return;
30084
30085 // Check the mask is 1,0,3,2,5,4,...
30086 ArrayRef<int> Mask = Shuf->getMask();
30087 for (int I = 0, E = Mask.size(); I < E; I++)
30088 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
30089 return;
30090
30091 SDLoc DL(N);
30092 auto LoHi = DAG.SplitVector(X, DL);
30093 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
30094 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
30095 LoHi.first, LoHi.second);
30096
30097 // Shuffle the elements back into order.
30098 SmallVector<int> NMask;
30099 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
30100 NMask.push_back(I);
30101 NMask.push_back(I);
30102 }
30103 Results.push_back(DAG.getVectorShuffle(
30104 VT, DL,
30105 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
30106 DAG.getPOISON(LoHi.first.getValueType())),
30107 DAG.getPOISON(VT), NMask));
30108}
30109
30112 SelectionDAG &DAG, unsigned InterOp,
30113 unsigned AcrossOp) {
30114 EVT LoVT, HiVT;
30115 SDValue Lo, Hi;
30116 SDLoc DL(N);
30117 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
30118 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
30119 SDValue InterVal = DAG.getNode(InterOp, DL, LoVT, Lo, Hi);
30120 SDValue SplitVal = DAG.getNode(AcrossOp, DL, LoVT, InterVal);
30121 Results.push_back(SplitVal);
30122}
30123
30124void AArch64TargetLowering::ReplaceExtractSubVectorResults(
30126 SDValue In = N->getOperand(0);
30127 EVT InVT = In.getValueType();
30128
30129 // Common code will handle these just fine.
30130 if (!InVT.isScalableVector() || !InVT.isInteger())
30131 return;
30132
30133 SDLoc DL(N);
30134 EVT VT = N->getValueType(0);
30135
30136 // The following checks bail if this is not a halving operation.
30137
30138 ElementCount ResEC = VT.getVectorElementCount();
30139
30140 if (InVT.getVectorElementCount() != (ResEC * 2))
30141 return;
30142
30143 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
30144 if (!CIndex)
30145 return;
30146
30147 unsigned Index = CIndex->getZExtValue();
30148 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
30149 return;
30150
30151 unsigned Opcode = (Index == 0) ? (unsigned)ISD::ANY_EXTEND_VECTOR_INREG
30152 : (unsigned)AArch64ISD::UUNPKHI;
30153 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
30154
30155 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
30156 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
30157}
30158
30159void AArch64TargetLowering::ReplaceGetActiveLaneMaskResults(
30161 assert((Subtarget->isSVEorStreamingSVEAvailable() &&
30162 (Subtarget->hasSVE2p1() || Subtarget->hasSME2())) &&
30163 "Custom lower of get.active.lane.mask missing required feature.");
30164
30165 assert(N->getValueType(0) == MVT::nxv32i1 &&
30166 "Unexpected result type for get.active.lane.mask");
30167
30168 SDLoc DL(N);
30169 SDValue Idx = N->getOperand(0);
30170 SDValue TC = N->getOperand(1);
30171
30172 assert(Idx.getValueType().getFixedSizeInBits() <= 64 &&
30173 "Unexpected operand type for get.active.lane.mask");
30174
30175 if (Idx.getValueType() != MVT::i64) {
30176 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
30177 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
30178 }
30179
30180 SDValue ID =
30181 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
30182 EVT HalfVT = N->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
30183 auto WideMask =
30184 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {HalfVT, HalfVT}, {ID, Idx, TC});
30185
30186 Results.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0),
30187 {WideMask.getValue(0), WideMask.getValue(1)}));
30188}
30189
30190// Create an even/odd pair of X registers holding integer value V.
30192 SDLoc DL(V.getNode());
30193 auto [VLo, VHi] = DAG.SplitScalar(V, DL, MVT::i64, MVT::i64);
30194 if (DAG.getDataLayout().isBigEndian())
30195 std::swap (VLo, VHi);
30196 SDValue RegClass =
30197 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32);
30198 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32);
30199 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32);
30200 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
30201 return SDValue(
30202 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops), 0);
30203}
30204
30207 SelectionDAG &DAG,
30208 const AArch64Subtarget *Subtarget) {
30209 assert(N->getValueType(0) == MVT::i128 &&
30210 "AtomicCmpSwap on types less than 128 should be legal");
30211
30212 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
30213 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
30214 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
30215 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
30216 SDValue Ops[] = {
30217 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
30218 createGPRPairNode(DAG, N->getOperand(3)), // Store value
30219 N->getOperand(1), // Ptr
30220 N->getOperand(0), // Chain in
30221 };
30222
30223 unsigned Opcode;
30224 switch (MemOp->getMergedOrdering()) {
30225 case AtomicOrdering::Monotonic:
30226 Opcode = AArch64::CASPX;
30227 break;
30228 case AtomicOrdering::Acquire:
30229 Opcode = AArch64::CASPAX;
30230 break;
30231 case AtomicOrdering::Release:
30232 Opcode = AArch64::CASPLX;
30233 break;
30234 case AtomicOrdering::AcquireRelease:
30235 case AtomicOrdering::SequentiallyConsistent:
30236 Opcode = AArch64::CASPALX;
30237 break;
30238 default:
30239 llvm_unreachable("Unexpected ordering!");
30240 }
30241
30242 MachineSDNode *CmpSwap = DAG.getMachineNode(
30243 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
30244 DAG.setNodeMemRefs(CmpSwap, {MemOp});
30245
30246 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
30247 if (DAG.getDataLayout().isBigEndian())
30248 std::swap(SubReg1, SubReg2);
30249 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
30250 SDValue(CmpSwap, 0));
30251 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
30252 SDValue(CmpSwap, 0));
30253 Results.push_back(
30254 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
30255 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
30256 return;
30257 }
30258
30259 unsigned Opcode;
30260 switch (MemOp->getMergedOrdering()) {
30261 case AtomicOrdering::Monotonic:
30262 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
30263 break;
30264 case AtomicOrdering::Acquire:
30265 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
30266 break;
30267 case AtomicOrdering::Release:
30268 Opcode = AArch64::CMP_SWAP_128_RELEASE;
30269 break;
30270 case AtomicOrdering::AcquireRelease:
30271 case AtomicOrdering::SequentiallyConsistent:
30272 Opcode = AArch64::CMP_SWAP_128;
30273 break;
30274 default:
30275 llvm_unreachable("Unexpected ordering!");
30276 }
30277
30278 SDLoc DL(N);
30279 auto Desired = DAG.SplitScalar(N->getOperand(2), DL, MVT::i64, MVT::i64);
30280 auto New = DAG.SplitScalar(N->getOperand(3), DL, MVT::i64, MVT::i64);
30281 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
30282 New.first, New.second, N->getOperand(0)};
30283 SDNode *CmpSwap = DAG.getMachineNode(
30284 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
30285 Ops);
30286 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
30287
30288 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
30289 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
30290 Results.push_back(SDValue(CmpSwap, 3));
30291}
30292
30293static unsigned getAtomicLoad128Opcode(unsigned ISDOpcode,
30294 AtomicOrdering Ordering) {
30295 // ATOMIC_LOAD_CLR only appears when lowering ATOMIC_LOAD_AND (see
30296 // LowerATOMIC_LOAD_AND). We can't take that approach with 128-bit, because
30297 // the type is not legal. Therefore we shouldn't expect to see a 128-bit
30298 // ATOMIC_LOAD_CLR at any point.
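// (LDCLRP atomically computes Mem & ~Rs, hence the operand inversion noted below.)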
30299 assert(ISDOpcode != ISD::ATOMIC_LOAD_CLR &&
30300 "ATOMIC_LOAD_AND should be lowered to LDCLRP directly");
30301 assert(ISDOpcode != ISD::ATOMIC_LOAD_ADD && "There is no 128 bit LDADD");
30302 assert(ISDOpcode != ISD::ATOMIC_LOAD_SUB && "There is no 128 bit LDSUB");
30303
30304 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
30305 // The operand will need to be XORed in a separate step.
30306 switch (Ordering) {
30307 case AtomicOrdering::Monotonic:
30308 return AArch64::LDCLRP;
30309 break;
30310 case AtomicOrdering::Acquire:
30311 return AArch64::LDCLRPA;
30312 break;
30313 case AtomicOrdering::Release:
30314 return AArch64::LDCLRPL;
30315 break;
30316 case AtomicOrdering::AcquireRelease:
30317 case AtomicOrdering::SequentiallyConsistent:
30318 return AArch64::LDCLRPAL;
30319 break;
30320 default:
30321 llvm_unreachable("Unexpected ordering!");
30322 }
30323 }
30324
30325 if (ISDOpcode == ISD::ATOMIC_LOAD_OR) {
30326 switch (Ordering) {
30327 case AtomicOrdering::Monotonic:
30328 return AArch64::LDSETP;
30329 break;
30330 case AtomicOrdering::Acquire:
30331 return AArch64::LDSETPA;
30332 break;
30333 case AtomicOrdering::Release:
30334 return AArch64::LDSETPL;
30335 break;
30336 case AtomicOrdering::AcquireRelease:
30337 case AtomicOrdering::SequentiallyConsistent:
30338 return AArch64::LDSETPAL;
30339 break;
30340 default:
30341 llvm_unreachable("Unexpected ordering!");
30342 }
30343 }
30344
30345 if (ISDOpcode == ISD::ATOMIC_SWAP) {
30346 switch (Ordering) {
30347 case AtomicOrdering::Monotonic:
30348 return AArch64::SWPP;
30349 break;
30350 case AtomicOrdering::Acquire:
30351 return AArch64::SWPPA;
30352 break;
30353 case AtomicOrdering::Release:
30354 return AArch64::SWPPL;
30355 break;
30356 case AtomicOrdering::AcquireRelease:
30357 case AtomicOrdering::SequentiallyConsistent:
30358 return AArch64::SWPPAL;
30359 break;
30360 default:
30361 llvm_unreachable("Unexpected ordering!");
30362 }
30363 }
30364
30365 llvm_unreachable("Unexpected ISDOpcode!");
30366}
30367
30370 SelectionDAG &DAG,
30371 const AArch64Subtarget *Subtarget) {
30372 // LSE128 has 128-bit RMW ops, but i128 is not a legal type, so lower it
30373 // here. This follows the approach of the CMP_SWAP_XXX pseudo instructions
30374 // rather than the CASP instructions, because CASP has register classes for
30375 // the pairs of registers and therefore uses REG_SEQUENCE and EXTRACT_SUBREG
30376 // to present them as single operands. LSE128 instructions use the GPR64
30377 // register class (because the pair does not have to be sequential), like
30378 // CMP_SWAP_XXX, and therefore we use TRUNCATE and BUILD_PAIR.
30379
30380 assert(N->getValueType(0) == MVT::i128 &&
30381 "AtomicLoadXXX on types less than 128 should be legal");
30382
30383 if (!Subtarget->hasLSE128())
30384 return;
30385
30386 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
30387 const SDValue &Chain = N->getOperand(0);
30388 const SDValue &Ptr = N->getOperand(1);
30389 const SDValue &Val128 = N->getOperand(2);
30390 std::pair<SDValue, SDValue> Val2x64 =
30391 DAG.SplitScalar(Val128, SDLoc(Val128), MVT::i64, MVT::i64);
30392
30393 const unsigned ISDOpcode = N->getOpcode();
30394 const unsigned MachineOpcode =
30395 getAtomicLoad128Opcode(ISDOpcode, MemOp->getMergedOrdering());
30396
30397 if (ISDOpcode == ISD::ATOMIC_LOAD_AND) {
30398 SDLoc DL(Val128);
30399 Val2x64.first =
30400 DAG.getNode(ISD::XOR, DL, MVT::i64,
30401 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.first);
30402 Val2x64.second =
30403 DAG.getNode(ISD::XOR, DL, MVT::i64,
30404 DAG.getAllOnesConstant(DL, MVT::i64), Val2x64.second);
30405 }
30406
30407 SDValue Ops[] = {Val2x64.first, Val2x64.second, Ptr, Chain};
30408 if (DAG.getDataLayout().isBigEndian())
30409 std::swap(Ops[0], Ops[1]);
30410
30411 MachineSDNode *AtomicInst =
30412 DAG.getMachineNode(MachineOpcode, SDLoc(N),
30413 DAG.getVTList(MVT::i64, MVT::i64, MVT::Other), Ops);
30414
30415 DAG.setNodeMemRefs(AtomicInst, {MemOp});
30416
30417 SDValue Lo = SDValue(AtomicInst, 0), Hi = SDValue(AtomicInst, 1);
30418 if (DAG.getDataLayout().isBigEndian())
30419 std::swap(Lo, Hi);
30420
30421 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
30422 Results.push_back(SDValue(AtomicInst, 2)); // Chain out
30423}
30424
30425void AArch64TargetLowering::ReplaceNodeResults(
30427 switch (N->getOpcode()) {
30428 default:
30429 llvm_unreachable("Don't know how to custom expand this");
30430 case ISD::BITCAST:
30431 ReplaceBITCASTResults(N, Results, DAG);
30432 return;
30433 case ISD::VECREDUCE_ADD:
30434 case ISD::VECREDUCE_SMAX:
30435 case ISD::VECREDUCE_SMIN:
30436 case ISD::VECREDUCE_UMAX:
30437 case ISD::VECREDUCE_UMIN:
30438 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
30439 return;
30440 case ISD::ADD:
30441 case ISD::FADD:
30442 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
30443 return;
30444 case ISD::CLMUL:
30445 if (SDValue Result = LowerCLMUL(SDValue(N, 0), DAG))
30446 Results.push_back(Result);
30447 return;
30448 case ISD::CTPOP:
30449 case ISD::PARITY:
30450 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
30451 Results.push_back(Result);
30452 return;
30453 case AArch64ISD::SADDV:
30454 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
30455 return;
30456 case AArch64ISD::UADDV:
30457 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
30458 return;
30459 case AArch64ISD::SMINV:
30460 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
30461 return;
30462 case AArch64ISD::UMINV:
30463 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
30464 return;
30465 case AArch64ISD::SMAXV:
30466 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
30467 return;
30468 case AArch64ISD::UMAXV:
30469 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
30470 return;
30471 case ISD::MULHS:
30473 Results.push_back(
30474 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
30475 return;
30476 case ISD::MULHU:
30478 Results.push_back(
30479 LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
30480 return;
30481 case ISD::FP_TO_UINT:
30482 case ISD::FP_TO_SINT:
30483 case ISD::STRICT_FP_TO_SINT:
30484 case ISD::STRICT_FP_TO_UINT:
30485 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
30486 // Let normal code take care of it by not adding anything to Results.
30487 return;
30488 case ISD::ATOMIC_CMP_SWAP:
30489 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
30490 return;
30491 case ISD::ATOMIC_LOAD_CLR:
30492 assert(N->getValueType(0) != MVT::i128 &&
30493 "128-bit ATOMIC_LOAD_AND should be lowered directly to LDCLRP");
30494 break;
30495 case ISD::ATOMIC_LOAD_AND:
30496 case ISD::ATOMIC_LOAD_OR:
30497 case ISD::ATOMIC_SWAP: {
30498 assert(cast<AtomicSDNode>(N)->getVal().getValueType() == MVT::i128 &&
30499 "Expected 128-bit atomicrmw.");
30500 // These need custom type legalisation so we go directly to instruction.
30501 ReplaceATOMIC_LOAD_128Results(N, Results, DAG, Subtarget);
30502 return;
30503 }
30504 case ISD::ADDRSPACECAST: {
30505 SDValue V = LowerADDRSPACECAST(SDValue(N, 0), DAG);
30506 Results.push_back(V);
30507 return;
30508 }
30509 case ISD::ATOMIC_LOAD:
30510 case ISD::LOAD: {
30511 MemSDNode *LoadNode = cast<MemSDNode>(N);
30512 EVT MemVT = LoadNode->getMemoryVT();
30513 // Handle lowering 256 bit non temporal loads into LDNP for little-endian
30514 // targets.
30515 //
30516 // Currently we only support lowering NT loads for little-endian targets.
30517 //
30518 // Coordinated with LDNP constraints in
30519 // `llvm/lib/Target/AArch64/AArch64InstrInfo.td`
30520 // and `AArch64TTIImpl::isLegalNTLoad`.
30521 if (LoadNode->isNonTemporal() && Subtarget->isLittleEndian() &&
30522 MemVT.getSizeInBits() == 256u &&
30523 (MemVT.getScalarSizeInBits() == 8u ||
30524 MemVT.getScalarSizeInBits() == 16u ||
30525 MemVT.getScalarSizeInBits() == 32u ||
30526 MemVT.getScalarSizeInBits() == 64u)) {
30527
30528 EVT HalfVT = MemVT.getHalfNumVectorElementsVT(*DAG.getContext());
30529 SDValue Result = DAG.getMemIntrinsicNode(
30530 AArch64ISD::LDNP, SDLoc(N),
30531 DAG.getVTList({MVT::v2i64, MVT::v2i64, MVT::Other}),
30532 {LoadNode->getChain(), LoadNode->getBasePtr()},
30533 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
30534
30535 SDValue Pair = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), MemVT,
30536 DAG.getBitcast(HalfVT, Result.getValue(0)),
30537 DAG.getBitcast(HalfVT, Result.getValue(1)));
30538 Results.append({Pair, Result.getValue(2) /* Chain */});
30539 return;
30540 }
30541
30542 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
30543 LoadNode->getMemoryVT() != MVT::i128) {
30544 // Loads that are neither volatile nor atomic are optimized later in
30545 // AArch64's load/store optimizer.
30546 return;
30547 }
30548
30549 if (SDValue(N, 0).getValueType() == MVT::i128) {
30550 auto *AN = dyn_cast<AtomicSDNode>(LoadNode);
30551 bool isLoadAcquire =
30553 unsigned Opcode = isLoadAcquire ? AArch64ISD::LDIAPP : AArch64ISD::LDP;
30554
30555 if (isLoadAcquire)
30556 assert(Subtarget->hasFeature(AArch64::FeatureRCPC3));
30557
30559 Opcode, SDLoc(N), DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
30560 {LoadNode->getChain(), LoadNode->getBasePtr()},
30561 LoadNode->getMemoryVT(), LoadNode->getMemOperand());
30562
30563 unsigned FirstRes = DAG.getDataLayout().isBigEndian() ? 1 : 0;
30564
30565 SDValue Pair =
30566 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
30567 Result.getValue(FirstRes), Result.getValue(1 - FirstRes));
30568 Results.append({Pair, Result.getValue(2) /* Chain */});
30569 }
30570 return;
30571 }
30572 case ISD::EXTRACT_SUBVECTOR:
30573 ReplaceExtractSubVectorResults(N, Results, DAG);
30574 return;
30575 case ISD::INSERT_SUBVECTOR:
30576 case ISD::CONCAT_VECTORS:
30577 // Custom lowering has been requested for INSERT_SUBVECTOR and
30578 // CONCAT_VECTORS -- but delegate to common code for result type
30579 // legalisation
30580 return;
30581 case ISD::GET_ACTIVE_LANE_MASK:
30582 ReplaceGetActiveLaneMaskResults(N, Results, DAG);
30583 return;
30584 case ISD::INTRINSIC_WO_CHAIN: {
30585 EVT VT = N->getValueType(0);
30586
30587 Intrinsic::ID IntID =
30588 static_cast<Intrinsic::ID>(N->getConstantOperandVal(0));
30589 switch (IntID) {
30590 default:
30591 return;
30592 case Intrinsic::aarch64_sve_clasta_n: {
30593 assert((VT == MVT::i8 || VT == MVT::i16) &&
30594 "custom lowering for unexpected type");
30595 SDLoc DL(N);
30596 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
30597 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
30598 N->getOperand(1), Op2, N->getOperand(3));
30599 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
30600 return;
30601 }
30602 case Intrinsic::aarch64_sve_clastb_n: {
30603 assert((VT == MVT::i8 || VT == MVT::i16) &&
30604 "custom lowering for unexpected type");
30605 SDLoc DL(N);
30606 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
30607 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
30608 N->getOperand(1), Op2, N->getOperand(3));
30609 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
30610 return;
30611 }
30612 case Intrinsic::aarch64_sve_lasta: {
30613 assert((VT == MVT::i8 || VT == MVT::i16) &&
30614 "custom lowering for unexpected type");
30615 SDLoc DL(N);
30616 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
30617 N->getOperand(1), N->getOperand(2));
30618 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
30619 return;
30620 }
30621 case Intrinsic::aarch64_sve_lastb: {
30622 assert((VT == MVT::i8 || VT == MVT::i16) &&
30623 "custom lowering for unexpected type");
30624 SDLoc DL(N);
30625 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
30626 N->getOperand(1), N->getOperand(2));
30627 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
30628 return;
30629 }
30630 case Intrinsic::aarch64_sme_in_streaming_mode: {
30631 SDLoc DL(N);
30632 SDValue Chain = DAG.getEntryNode();
30633
30634 SDValue RuntimePStateSM =
30635 getRuntimePStateSM(DAG, Chain, DL, N->getValueType(0));
30636 Results.push_back(
30637 DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, RuntimePStateSM));
30638 return;
30639 }
30640 case Intrinsic::experimental_vector_match: {
30641 if (!VT.isFixedLengthVector() || VT.getVectorElementType() != MVT::i1)
30642 return;
30643
30644 // NOTE: Only trivial type promotion is supported.
30645 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
30646 if (NewVT.getVectorNumElements() != VT.getVectorNumElements())
30647 return;
30648
30649 SDLoc DL(N);
30650 auto V = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, NewVT, N->ops());
30651 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
30652 return;
30653 }
30654 }
30655 }
30656 case ISD::READ_REGISTER: {
30657 SDLoc DL(N);
30658 assert(N->getValueType(0) == MVT::i128 &&
30659 "READ_REGISTER custom lowering is only for 128-bit sysregs");
30660 SDValue Chain = N->getOperand(0);
30661 SDValue SysRegName = N->getOperand(1);
30662
30663 SDValue Result = DAG.getNode(
30664 AArch64ISD::MRRS, DL, DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
30665 Chain, SysRegName);
30666
30667 // Sysregs are not endian. Result.getValue(0) always contains the lower half
30668 // of the 128-bit System Register value.
30669 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
30670 Result.getValue(0), Result.getValue(1));
30671 Results.push_back(Pair);
30672 Results.push_back(Result.getValue(2)); // Chain
30673 return;
30674 }
30675 }
30676}
30677
30679 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
30680 return false;
30681 return true;
30682}
30683
30685 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
30686 // reciprocal if there are three or more FDIVs.
30687 return 3;
30688}
30689
30692 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
30693 // v4i16, v2i32 rather than promote them.
30694 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
30695 VT == MVT::v1f32)
30696 return TypeWidenVector;
30697
30699}
30700
30701// In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
30702// provided the address is 16-byte aligned.
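// (This is the FEAT_LSE2 guarantee, hence the hasLSE2() check below.)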
30704 if (!Subtarget->hasLSE2())
30705 return false;
30706
30707 if (auto LI = dyn_cast<LoadInst>(I))
30708 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
30709 LI->getAlign() >= Align(16);
30710
30711 if (auto SI = dyn_cast<StoreInst>(I))
30712 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
30713 SI->getAlign() >= Align(16);
30714
30715 return false;
30716}
30717
30719 if (!Subtarget->hasLSE128())
30720 return false;
30721
30722 // Only use SWPP for stores where LSE2 would require a fence. Unlike STP, SWPP
30723 // will clobber the two registers.
30724 if (const auto *SI = dyn_cast<StoreInst>(I))
30725 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
30726 SI->getAlign() >= Align(16) &&
30727 (SI->getOrdering() == AtomicOrdering::Release ||
30728 SI->getOrdering() == AtomicOrdering::SequentiallyConsistent);
30729
30730 if (const auto *RMW = dyn_cast<AtomicRMWInst>(I))
30731 return RMW->getValOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
30732 RMW->getAlign() >= Align(16) &&
30733 (RMW->getOperation() == AtomicRMWInst::Xchg ||
30734 RMW->getOperation() == AtomicRMWInst::And ||
30735 RMW->getOperation() == AtomicRMWInst::Or);
30736
30737 return false;
30738}
30739
30741 if (!Subtarget->hasLSE2() || !Subtarget->hasRCPC3())
30742 return false;
30743
30744 if (auto LI = dyn_cast<LoadInst>(I))
30745 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
30746 LI->getAlign() >= Align(16) &&
30747 LI->getOrdering() == AtomicOrdering::Acquire;
30748
30749 if (auto SI = dyn_cast<StoreInst>(I))
30750 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
30751 SI->getAlign() >= Align(16) &&
30752 SI->getOrdering() == AtomicOrdering::Release;
30753
30754 return false;
30755}
30756
30758 const Instruction *I) const {
30759 if (isOpSuitableForRCPC3(I))
30760 return false;
30761 if (isOpSuitableForLSE128(I))
30762 return false;
30763 if (isOpSuitableForLDPSTP(I))
30764 return true;
30765 return false;
30766}
30767
30769 const Instruction *I) const {
30770 // Store-Release instructions only provide seq_cst guarantees when paired with
30771 // Load-Acquire instructions. MSVC CRT does not use these instructions to
30772 // implement seq_cst loads and stores, so we need additional explicit fences
30773 // after memory writes.
30774 if (!Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
30775 return false;
30776
30777 if (auto *SI = dyn_cast<StoreInst>(I))
30778 return SI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
30779
30780 auto *CAS = dyn_cast<AtomicCmpXchgInst>(I);
30781 auto *RMW = dyn_cast<AtomicRMWInst>(I);
30782 // Not a store.
30783 if (!CAS && !RMW)
30784 return false;
30785
30786 // Fence only needed for seq_cst.
30787 if (CAS &&
30788 CAS->getSuccessOrdering() != AtomicOrdering::SequentiallyConsistent)
30789 return false;
30790 if (RMW && RMW->getOrdering() != AtomicOrdering::SequentiallyConsistent)
30791 return false;
30792
30793 // We do not need a fence if we have LSE atomics.
30794 return !Subtarget->hasLSE();
30795}
30796
30797// Loads and stores less than 128-bits are already atomic; ones above that
30798// are doomed anyway, so defer to the default libcall and blame the OS when
30799// things go wrong.
30802 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
30803 if (Size != 128)
30812}
30813
30814// Loads and stores less than 128-bits are already atomic; ones above that
30815// are doomed anyway, so defer to the default libcall and blame the OS when
30816// things go wrong.
30819 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
30820
30821 if (Size != 128)
30822 return AtomicExpansionKind::None;
30823 if (isOpSuitableForRCPC3(LI))
30824 return AtomicExpansionKind::None;
30825 // No LSE128 loads
30826 if (isOpSuitableForLDPSTP(LI))
30827 return AtomicExpansionKind::None;
30828
30829 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
30830 // implement atomicrmw without spilling. If the target address is also on the
30831 // stack and close enough to the spill slot, this can lead to a situation
30832 // where the monitor always gets cleared and the atomic operation can never
30833 // succeed. So at -O0 lower this operation to a CAS loop.
30834 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
30835 return AtomicExpansionKind::CmpXChg;
30836
30837 // Using CAS for an atomic load has a better chance of succeeding under high
30838 // contention situations. So use it if available.
30839 return Subtarget->hasLSE() ? AtomicExpansionKind::CmpXChg
30840 : AtomicExpansionKind::LLSC;
30841}
30842
30843// Return true if the atomic operation expansion will lower to use a library
30844// call, and is thus ineligible to use an LLSC expansion.
30845static bool rmwOpMayLowerToLibcall(const AArch64Subtarget &Subtarget,
30846 const AtomicRMWInst *RMW) {
30847 if (!RMW->isFloatingPointOperation())
30848 return false;
30849 switch (RMW->getType()->getScalarType()->getTypeID()) {
30850 case Type::FloatTyID:
30851 case Type::DoubleTyID:
30852 case Type::HalfTyID:
30853 case Type::BFloatTyID:
30854 // Will use soft float
30855 return !Subtarget.hasFPARMv8();
30856 default:
30857 // fp128 will emit library calls.
30858 return true;
30859 }
30860
30861 llvm_unreachable("covered type switch");
30862}
30863
30864// The "default" for integer RMW operations is to expand to an LL/SC loop.
30865// However, with the LSE instructions (or outline-atomics mode, which provides
30866 // library routines in place of the LSE instructions), we can directly emit many
30867// operations instead.
30870 const AtomicRMWInst *AI) const {
30871 Type *Ty = AI->getType();
30872 unsigned Size = Ty->getPrimitiveSizeInBits();
30873 assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
30874
30875 bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
30879 if (CanUseLSE128)
30881
30882 // If LSFE is available, use atomic FP instructions in preference to expansion
30883 if (Subtarget->hasLSFE() && (AI->getOperation() == AtomicRMWInst::FAdd ||
30889
30890 // Leave 128 bits to LLSC or CmpXChg.
30891 if (Size < 128 && !AI->isFloatingPointOperation()) {
30892 if (Subtarget->hasLSE()) {
30893 // Nand is not supported in LSE.
30894 switch (AI->getOperation()) {
30896 case AtomicRMWInst::Add:
30897 case AtomicRMWInst::Sub:
30898 case AtomicRMWInst::And:
30899 case AtomicRMWInst::Or:
30900 case AtomicRMWInst::Xor:
30901 case AtomicRMWInst::Max:
30902 case AtomicRMWInst::Min:
30906 default:
30907 break;
30908 }
30909 }
30910 if (Subtarget->outlineAtomics()) {
30911 // [U]Min/[U]Max RMW atomics are used in __sync_fetch_ libcalls so far.
30912 // Don't outline them unless
30913 // (1) high level <atomic> support approved:
30914 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
30915 // (2) low level libgcc and compiler-rt support implemented by:
30916 // min/max outline atomics helpers
30917 switch (AI->getOperation()) {
30919 case AtomicRMWInst::Add:
30920 case AtomicRMWInst::Sub:
30921 case AtomicRMWInst::And:
30922 case AtomicRMWInst::Or:
30923 case AtomicRMWInst::Xor:
30925 default:
30926 break;
30927 }
30928 }
30929 }
30930
30931 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
30932 // implement atomicrmw without spilling. If the target address is also on the
30933 // stack and close enough to the spill slot, this can lead to a situation
30934 // where the monitor always gets cleared and the atomic operation can never
30935 // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
30936 // we have a single CAS instruction that can replace the loop.
30937 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
30938 Subtarget->hasLSE() || rmwOpMayLowerToLibcall(*Subtarget, AI))
30940
30942}
30943
30946 const AtomicCmpXchgInst *AI) const {
30947 // If subtarget has LSE, leave cmpxchg intact for codegen.
30948 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
30950 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
30951 // implement cmpxchg without spilling. If the address being exchanged is also
30952 // on the stack and close enough to the spill slot, this can lead to a
30953 // situation where the monitor always gets cleared and the atomic operation
30954 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
30955 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
30957
30958 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand
30959 // it.
30961 if (Size > 64)
30963
30965}
30966
30968 Type *ValueTy, Value *Addr,
30969 AtomicOrdering Ord) const {
30970 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
30971 bool IsAcquire = isAcquireOrStronger(Ord);
30972
30973 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
30974 // intrinsics must return {i64, i64} and we have to recombine the two halves
30975 // into a single i128 here.
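// e.g. for an acquire ordering this emits ldaxp and rebuilds the value as
// zext(lo) | (zext(hi) << 64).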
30976 if (ValueTy->getPrimitiveSizeInBits() == 128) {
30978 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
30979
30980 Value *LoHi =
30981 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
30982
30983 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
30984 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
30985
30986 auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
30987 Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
30988 Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
30989
30990 Value *Or = Builder.CreateOr(
30991 Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
30992 return Builder.CreateBitCast(Or, ValueTy);
30993 }
30994
30995 Type *Tys[] = { Addr->getType() };
30997 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
30998
30999 const DataLayout &DL = M->getDataLayout();
31000 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
31001 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
31002 CI->addParamAttr(0, Attribute::get(Builder.getContext(),
31003 Attribute::ElementType, IntEltTy));
31004 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
31005
31006 return Builder.CreateBitCast(Trunc, ValueTy);
31007}
31008
31010 IRBuilderBase &Builder) const {
31011 Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {});
31012}
31013
31015 Value *Val, Value *Addr,
31016 AtomicOrdering Ord) const {
31017 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31018 bool IsRelease = isReleaseOrStronger(Ord);
31019
31020 // Since the intrinsics must have legal type, the i128 intrinsics take two
31021 // parameters: "i64, i64". We must marshal Val into the appropriate form
31022 // before the call.
31023 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
31025 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
31027 Type *Int64Ty = Type::getInt64Ty(M->getContext());
31028 Type *Int128Ty = Type::getInt128Ty(M->getContext());
31029
31030 Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
31031
31032 Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
31033 Value *Hi =
31034 Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
31035 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
31036 }
31037
31039 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
31040 Type *Tys[] = { Addr->getType() };
31042
31043 const DataLayout &DL = M->getDataLayout();
31044 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
31045 Val = Builder.CreateBitCast(Val, IntValTy);
31046
31047 CallInst *CI = Builder.CreateCall(
31048 Stxr, {Builder.CreateZExtOrBitCast(
31049 Val, Stxr->getFunctionType()->getParamType(0)),
31050 Addr});
31051 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
31052 Attribute::ElementType, Val->getType()));
31053 return CI;
31054}
31055
31057 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
31058 const DataLayout &DL) const {
31059 if (!Ty->isArrayTy()) {
31060 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
31061 return TySize.isScalable() && TySize.getKnownMinValue() > 128;
31062 }
31063
31064 // All non-aggregate members of the type must have the same type
31065 SmallVector<EVT> ValueVTs;
31066 ComputeValueVTs(*this, DL, Ty, ValueVTs);
31067 return all_equal(ValueVTs);
31068}
31069
31070bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
31071 EVT) const {
31072 return false;
31073}
31074
31075static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
31076 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
31077 Function *ThreadPointerFunc = Intrinsic::getOrInsertDeclaration(
31078 M, Intrinsic::thread_pointer, IRB.getPtrTy());
31079 return IRB.CreatePointerCast(
31080 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
31081 Offset),
31082 IRB.getPtrTy(0));
31083}
31084
31086 IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
31087 // Android provides a fixed TLS slot for the stack cookie. See the definition
31088 // of TLS_SLOT_STACK_GUARD in
31089 // https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/tls_defines.h
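// (0x28 corresponds to TLS slot 5 at 8 bytes per slot, per the header above.)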
31090 if (Subtarget->isTargetAndroid())
31091 return UseTlsOffset(IRB, 0x28);
31092
31093 // Fuchsia is similar.
31094 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
31095 if (Subtarget->isTargetFuchsia())
31096 return UseTlsOffset(IRB, -0x10);
31097
31098 return TargetLowering::getIRStackGuard(IRB, Libcalls);
31099}
31100
31102 Module &M, const LibcallLoweringInfo &Libcalls) const {
31103 // MSVC CRT provides functionality for stack protection.
31104 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
31105 Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
31106
31107 RTLIB::LibcallImpl SecurityCookieVar =
31108 Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
31109 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
31110 SecurityCookieVar != RTLIB::Unsupported) {
31111 // MSVC CRT has a global variable holding security cookie.
31112 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
31113 PointerType::getUnqual(M.getContext()));
31114
31115 // MSVC CRT has a function to validate security cookie.
31116 FunctionCallee SecurityCheckCookie =
31117 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
31118 Type::getVoidTy(M.getContext()),
31119 PointerType::getUnqual(M.getContext()));
31120 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
31121 F->setCallingConv(CallingConv::Win64);
31122 F->addParamAttr(0, Attribute::AttrKind::InReg);
31123 }
31124 return;
31125 }
31127}
31128
31130 IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const {
31131 // Android provides a fixed TLS slot for the SafeStack pointer. See the
31132 // definition of TLS_SLOT_SAFESTACK in
31133 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
31134 if (Subtarget->isTargetAndroid())
31135 return UseTlsOffset(IRB, 0x48);
31136
31137 return TargetLowering::getSafeStackPointerLocation(IRB, Libcalls);
31138}
31139
31140/// If a physical register, this returns the register that receives the
31141/// exception address on entry to an EH pad.
31143 const Constant *PersonalityFn) const {
31144 // FIXME: This is a guess. Has this been defined yet?
31145 return AArch64::X0;
31146}
31147
31148/// If a physical register, this returns the register that receives the
31149/// exception typeid on entry to a landing pad.
31151 const Constant *PersonalityFn) const {
31152 // FIXME: This is a guess. Has this been defined yet?
31153 return AArch64::X1;
31154}
31155
31157 const Instruction &AndI) const {
31158 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
31159 // this likely allows the and/cmp/br to fold into a single tbz instruction. It
31160 // may be beneficial to sink in other cases, but we would have to check that
31161 // the cmp would not get folded into the br to form a cbz for these to be
31162 // beneficial.
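// e.g. `(x & 8) == 0` feeding a conditional branch can become a single
// `tbz x, #3, <label>`.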
31164 if (!Mask)
31165 return false;
31166 return Mask->getValue().isPowerOf2();
31167}
31168
31172 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
31173 SelectionDAG &DAG) const {
31174 // Does baseline recommend not to perform the fold by default?
31176 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
31177 return false;
31178 // Else, if this is a vector shift, prefer 'shl'.
31179 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
31180}
31181
31184 SelectionDAG &DAG, SDNode *N, unsigned int ExpansionFactor) const {
31186 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
31189 ExpansionFactor);
31190}
31191
31193 // Update IsSplitCSR in AArch64FunctionInfo.
31194 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
31195 AFI->setIsSplitCSR(true);
31196}
31197
31199 MachineBasicBlock *Entry,
31200 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
31201 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
31202 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
31203 if (!IStart)
31204 return;
31205
31206 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
31207 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
31208 MachineBasicBlock::iterator MBBI = Entry->begin();
31209 for (const MCPhysReg *I = IStart; *I; ++I) {
31210 const TargetRegisterClass *RC = nullptr;
31211 if (AArch64::GPR64RegClass.contains(*I))
31212 RC = &AArch64::GPR64RegClass;
31213 else if (AArch64::FPR64RegClass.contains(*I))
31214 RC = &AArch64::FPR64RegClass;
31215 else
31216 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
31217
31218 Register NewVR = MRI->createVirtualRegister(RC);
31219 // Create copy from CSR to a virtual register.
31220 // FIXME: this currently does not emit CFI pseudo-instructions, it works
31221 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
31222 // nounwind. If we want to generalize this later, we may need to emit
31223 // CFI pseudo-instructions.
31224 assert(Entry->getParent()->getFunction().hasFnAttribute(
31225 Attribute::NoUnwind) &&
31226 "Function should be nounwind in insertCopiesSplitCSR!");
31227 Entry->addLiveIn(*I);
31228 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
31229 .addReg(*I);
31230
31231 // Insert the copy-back instructions right before the terminator.
31232 for (auto *Exit : Exits)
31233 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
31234 TII->get(TargetOpcode::COPY), *I)
31235 .addReg(NewVR);
31236 }
31237}
31238
31239bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
31240 // Integer division on AArch64 is expensive. However, when aggressively
31241 // optimizing for code size, we prefer to use a div instruction, as it is
31242 // usually smaller than the alternative sequence.
31243 // The exception to this is vector division. Since AArch64 doesn't have vector
31244 // integer division, leaving the division as-is is a loss even in terms of
31245 // size, because it will have to be scalarized, while the alternative code
31246 // sequence can be performed in vector form.
31247 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
31248 return OptSize && !VT.isVector();
31249}
31250
31252 const MachineFunction &MF) const {
31253 // Avoid merging stores into fixed-length vectors when Neon is unavailable.
31254 // In future, we could allow this when SVE is available, but currently,
31255 // the SVE lowerings for BUILD_VECTOR are limited to a few specific cases (and
31256 // the general lowering may introduce stack spills/reloads).
31257 if (MemVT.isFixedLengthVector() && !Subtarget->isNeonAvailable())
31258 return false;
31259
31260 // Do not merge to float value size (128 bits) if no implicit float attribute
31261 // is set.
31262 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
31263 return !NoFloat || MemVT.getSizeInBits() <= 64;
31264}
31265
31267 // We want inc-of-add for scalars and sub-of-not for vectors.
31268 return VT.isScalarInteger();
31269}
31270
31272 EVT VT) const {
31273 // v8f16 without fp16 need to be extended to v8f32, which is more difficult to
31274 // legalize.
31275 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
31276 return false;
31277 if (FPVT == MVT::v8bf16)
31278 return false;
31279 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
31280}
31281
31283 // Expand scalar and SVE operations using selects. Neon vectors prefer sub to
31284 // avoid vselect becoming bsl / unrolling.
31285 return !VT.isFixedLengthVector();
31286}
31287
31291 const TargetInstrInfo *TII) const {
31292 assert(MBBI->isCall() && MBBI->getCFIType() &&
31293 "Invalid call instruction for a KCFI check");
31294
31295 switch (MBBI->getOpcode()) {
31296 case AArch64::BLR:
31297 case AArch64::BLRNoIP:
31298 case AArch64::TCRETURNri:
31299 case AArch64::TCRETURNrix16x17:
31300 case AArch64::TCRETURNrix17:
31301 case AArch64::TCRETURNrinotx16:
31302 break;
31303 default:
31304 llvm_unreachable("Unexpected CFI call opcode");
31305 }
31306
31307 MachineOperand &Target = MBBI->getOperand(0);
31308 assert(Target.isReg() && "Invalid target operand for an indirect call");
31309 Target.setIsRenamable(false);
31310
31311 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(AArch64::KCFI_CHECK))
31312 .addReg(Target.getReg())
31313 .addImm(MBBI->getCFIType())
31314 .getInstr();
31315}
31316
31318 Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const {
31319 // On AArch64, we can efficiently extract a scalar from a splat vector using
31320 // str b/h/s/d/q0 which extracts 8/16/32/64/128 bits from the vector register.
31321 // This is useful for memset where we generate a v16i8 splat and need to store
31322 // a smaller scalar (e.g., i32 for a 4-byte memset, i16 for 2 bytes, i8 for 1
31323 // byte).
31324 if (FixedVectorType *VTy = dyn_cast<FixedVectorType>(VectorTy)) {
31325 // Handle v16i8 splat (128 bits total, 16 elements of 8 bits each) and
31326 // v8i8 splat (64 bits total, 8 elements of 8 bits each)
31327 if ((VTy->getNumElements() == 16 || VTy->getNumElements() == 8) &&
31328 VTy->getElementType()->isIntegerTy(8)) {
31329 // Check if we're extracting 8, 16, 32, or 64-bit element
31330 // All extract from element 0 since it's a splat
31331 if (ElemSizeInBits == 8 || ElemSizeInBits == 16 || ElemSizeInBits == 32 ||
31332 ElemSizeInBits == 64) {
31333 Index = 0;
31334 return true;
31335 }
31336 }
31337 }
31338 return false;
31339}
31340
31342 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
31343}
31344
31345unsigned
31347 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
31348 return getPointerTy(DL).getSizeInBits();
31349
31350 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
31351}
31352
31353void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
31354 MachineFrameInfo &MFI = MF.getFrameInfo();
31355 // If we have any vulnerable SVE stack objects then the stack protector
31356 // needs to be placed at the top of the SVE stack area, as the SVE locals
31357 // are placed above the other locals, so we allocate it as if it were a
31358 // scalable vector.
31359 // FIXME: It may be worthwhile having a specific interface for this rather
31360 // than doing it here in finalizeLowering.
31361 if (MFI.hasStackProtectorIndex()) {
31362 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
31363 if (MFI.hasScalableStackID(i) &&
31368 break;
31369 }
31370 }
31371 }
31374}
31375
31376// Unlike X86, we let frame lowering assign offsets to all catch objects.
31378
31379bool AArch64TargetLowering::shouldLocalize(
31380 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
31381 auto &MF = *MI.getMF();
31382 auto &MRI = MF.getRegInfo();
31383 auto maxUses = [](unsigned RematCost) {
31384 // A cost of 1 means remats are basically free.
31385 if (RematCost == 1)
31386 return std::numeric_limits<unsigned>::max();
31387 if (RematCost == 2)
31388 return 2U;
31389
31390 // Remat is too expensive, only sink if there's one user.
31391 if (RematCost > 2)
31392 return 1U;
31393 llvm_unreachable("Unexpected remat cost");
31394 };
31395
31396 unsigned Opc = MI.getOpcode();
31397 switch (Opc) {
31398 case TargetOpcode::G_GLOBAL_VALUE: {
31399 // On Darwin, TLS global vars get selected into function calls, which
31400 // we don't want localized, as they can get moved into the middle of
31401 // another call sequence.
31402 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
31403 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
31404 return false;
31405 return true; // Always localize G_GLOBAL_VALUE to avoid high reg pressure.
31406 }
31407 case TargetOpcode::G_FCONSTANT:
31408 case TargetOpcode::G_CONSTANT: {
31409 const ConstantInt *CI;
31410 unsigned AdditionalCost = 0;
31411
31412 if (Opc == TargetOpcode::G_CONSTANT)
31413 CI = MI.getOperand(1).getCImm();
31414 else {
31415 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
31416 // We try to estimate cost of 32/64b fpimms, as they'll likely be
31417 // materialized as integers.
31418 if (Ty.getScalarSizeInBits() != 32 && Ty.getScalarSizeInBits() != 64)
31419 break;
31420 auto APF = MI.getOperand(1).getFPImm()->getValueAPF();
31421 bool OptForSize = MF.getFunction().hasOptSize();
31423 OptForSize))
31424 return true; // Constant should be cheap.
31425 CI =
31426 ConstantInt::get(MF.getFunction().getContext(), APF.bitcastToAPInt());
31427 // FP materialization also costs an extra move, from gpr to fpr.
31428 AdditionalCost = 1;
31429 }
31430 APInt Imm = CI->getValue();
31433 assert(Cost.isValid() && "Expected a valid imm cost");
31434
31435 unsigned RematCost = Cost.getValue();
31436 RematCost += AdditionalCost;
31437 Register Reg = MI.getOperand(0).getReg();
31438 unsigned MaxUses = maxUses(RematCost);
31439 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
31440 if (MaxUses == std::numeric_limits<unsigned>::max())
31441 --MaxUses;
31442 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
31443 }
31444 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
31445 // localizable.
31446 case AArch64::ADRP:
31447 case AArch64::G_ADD_LOW:
31448 // Need to localize G_PTR_ADD so that G_GLOBAL_VALUE can be localized too.
31449 case TargetOpcode::G_PTR_ADD:
31450 return true;
31451 default:
31452 break;
31453 }
31455}
31456
31458 // Fallback for scalable vectors.
31459 // Note that if EnableSVEGISel is true, we allow scalable vector types for
31460 // all instructions, regardless of whether they are actually supported.
31461 if (!EnableSVEGISel) {
31462 if (Inst.getType()->isScalableTy()) {
31463 return true;
31464 }
31465
31466 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
31467 if (Inst.getOperand(i)->getType()->isScalableTy())
31468 return true;
31469
31470 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
31471 if (AI->getAllocatedType()->isScalableTy())
31472 return true;
31473 }
31474 }
31475
31476 // Checks to allow the use of SME instructions
31477 if (auto *Base = dyn_cast<CallBase>(&Inst)) {
31478 const Function *Caller = Base->getCaller();
31479 // Per-call SME attribute checks via SMECallAttrs are compile-time
31480 // expensive, so avoid them for non-SME targets. We do the checks even if the
31481 // target only has SVE, since streaming-mode changes might still be
31482 // required.
31483 if (Subtarget->hasSME() ||
31484 Caller->hasFnAttribute("aarch64_pstate_sm_compatible") ||
31485 Caller->hasFnAttribute("aarch64_za_state_agnostic")) {
31486 auto CallAttrs = SMECallAttrs(*Base, &getRuntimeLibcallsInfo());
31487 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
31488 CallAttrs.requiresPreservingZT0() ||
31489 CallAttrs.requiresPreservingAllZAState())
31490 return true;
31491 }
31492 }
31493 return false;
31494}
31495
31496// Return the largest legal scalable vector type that matches VT's element type.
31500 "Expected legal fixed length vector!");
31501 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
31502 default:
31503 llvm_unreachable("unexpected element type for SVE container");
31504 case MVT::i8:
31505 return EVT(MVT::nxv16i8);
31506 case MVT::i16:
31507 return EVT(MVT::nxv8i16);
31508 case MVT::i32:
31509 return EVT(MVT::nxv4i32);
31510 case MVT::i64:
31511 return EVT(MVT::nxv2i64);
31512 case MVT::bf16:
31513 return EVT(MVT::nxv8bf16);
31514 case MVT::f16:
31515 return EVT(MVT::nxv8f16);
31516 case MVT::f32:
31517 return EVT(MVT::nxv4f32);
31518 case MVT::f64:
31519 return EVT(MVT::nxv2f64);
31520 }
31521}
31522
31523// Return a predicate with active lanes corresponding to the extent of VT.
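// For example, for v8i32 this builds a PTRUE nxv4i1 with pattern VL8, so only
// the first eight .s lanes are active regardless of the implementation's
// actual SVE register length.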
31524static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
31525 EVT VT) {
31526 assert(VT.isFixedLengthVector() &&
31527 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
31528 "Expected legal fixed length vector!");
31529
31530 std::optional<unsigned> PgPattern =
31531 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
31532 assert(PgPattern && "Unexpected element count for SVE predicate");
31533
31534 MVT MaskVT;
31535 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
31536 default:
31537 llvm_unreachable("unexpected element type for SVE predicate");
31538 case MVT::i8:
31539 MaskVT = MVT::nxv16i1;
31540 break;
31541 case MVT::i16:
31542 case MVT::f16:
31543 case MVT::bf16:
31544 MaskVT = MVT::nxv8i1;
31545 break;
31546 case MVT::i32:
31547 case MVT::f32:
31548 MaskVT = MVT::nxv4i1;
31549 break;
31550 case MVT::i64:
31551 case MVT::f64:
31552 MaskVT = MVT::nxv2i1;
31553 break;
31554 }
31555
31556 return getPTrue(DAG, DL, MaskVT, *PgPattern);
31557}
31558
31559static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
31560 EVT VT) {
31562 "Expected legal scalable vector!");
31563 auto PredTy = VT.changeVectorElementType(*DAG.getContext(), MVT::i1);
31564 return DAG.getConstant(1, DL, PredTy);
31565}
31566
31567static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
31568 if (VT.isFixedLengthVector())
31569 return getPredicateForFixedLengthVector(DAG, DL, VT);
31570
31571 return getPredicateForScalableVector(DAG, DL, VT);
31572}
31573
31574// Grow V to consume an entire SVE register.
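// Roughly: a v4i32 value V becomes (insert_subvector poison:nxv4i32, V, 0).
// Only the low fixed-width lanes hold defined data, so users of the result
// must be predicated with the VL-style predicates above.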
31575static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
31576 assert(VT.isScalableVector() &&
31577 "Expected to convert into a scalable vector!");
31578 assert(V.getValueType().isFixedLengthVector() &&
31579 "Expected a fixed length vector operand!");
31580 SDLoc DL(V);
31581 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31582 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getPOISON(VT), V, Zero);
31583}
31584
31585// Shrink V so it's just big enough to maintain a VT's worth of data.
31588 "Expected to convert into a fixed length vector!");
31589 assert(V.getValueType().isScalableVector() &&
31590 "Expected a scalable vector operand!");
31591 SDLoc DL(V);
31592 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31593 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
31594}
31595
31596// Convert all fixed length vector loads larger than NEON to masked_loads.
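// Sketch of the expected lowering: a load of v8i32 becomes a predicated SVE
// load (e.g. ld1w) of the nxv4i32 container under a VL8 predicate, and the
// result is then extracted back to v8i32. Floating-point types take a detour
// through the equivalent integer container, as below.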
31597SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
31598 SDValue Op, SelectionDAG &DAG) const {
31599 auto Load = cast<LoadSDNode>(Op);
31600
31601 SDLoc DL(Op);
31602 EVT VT = Op.getValueType();
31603 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31604 EVT LoadVT = ContainerVT;
31605 EVT MemVT = Load->getMemoryVT();
31606
31607 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
31608
31609 if (VT.isFloatingPoint()) {
31610 LoadVT = ContainerVT.changeTypeToInteger();
31611 MemVT = MemVT.changeTypeToInteger();
31612 }
31613
31614 SDValue NewLoad = DAG.getMaskedLoad(
31615 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
31616 DAG.getPOISON(LoadVT), MemVT, Load->getMemOperand(),
31617 Load->getAddressingMode(), Load->getExtensionType());
31618
31619 SDValue Result = NewLoad;
31620 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
31621 EVT ExtendVT = ContainerVT.changeVectorElementType(
31622 *DAG.getContext(), Load->getMemoryVT().getVectorElementType());
31623
31624 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
31625 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
31626 Pg, Result, DAG.getPOISON(ContainerVT));
31627 } else if (VT.isFloatingPoint()) {
31628 Result = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Result);
31629 }
31630
31631 Result = convertFromScalableVector(DAG, VT, Result);
31632 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
31633 return DAG.getMergeValues(MergedValues, DL);
31634}
31635
31636static SDValue convertFixedMaskToScalableVector(SDValue Mask,
31637 SelectionDAG &DAG) {
31638 SDLoc DL(Mask);
31639 EVT InVT = Mask.getValueType();
31640 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
31641 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
31642
31643 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
31644 return Pg;
31645
31646 bool InvertCond = false;
31647 if (isBitwiseNot(Mask)) {
31648 InvertCond = true;
31649 Mask = Mask.getOperand(0);
31650 }
31651
31652 SDValue Op1, Op2;
31653 ISD::CondCode CC;
31654
31655 // When Mask is the result of a SETCC, it's better to regenerate the compare.
31656 if (Mask.getOpcode() == ISD::SETCC) {
31657 Op1 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(0));
31658 Op2 = convertToScalableVector(DAG, ContainerVT, Mask.getOperand(1));
31659 CC = cast<CondCodeSDNode>(Mask.getOperand(2))->get();
31660 } else {
31661 Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
31662 Op2 = DAG.getConstant(0, DL, ContainerVT);
31663 CC = ISD::SETNE;
31664 }
31665
31666 if (InvertCond)
31667 CC = getSetCCInverse(CC, Op1.getValueType());
31668
31669 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
31670 {Pg, Op1, Op2, DAG.getCondCode(CC)});
31671}
31672
31673// Convert all fixed length vector loads larger than NEON to masked_loads.
31674SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
31675 SDValue Op, SelectionDAG &DAG) const {
31676 auto Load = cast<MaskedLoadSDNode>(Op);
31677
31678 SDLoc DL(Op);
31679 EVT VT = Op.getValueType();
31680 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31681
31682 SDValue Mask = Load->getMask();
31683 // If this is an extending load and the mask type is not the same as the
31684 // load's type, then we have to extend the mask type.
31685 if (VT.getScalarSizeInBits() > Mask.getValueType().getScalarSizeInBits()) {
31686 assert(Load->getExtensionType() != ISD::NON_EXTLOAD &&
31687 "Incorrect mask type");
31688 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Mask);
31689 }
31690 Mask = convertFixedMaskToScalableVector(Mask, DAG);
31691
31692 SDValue PassThru, NewLoad, Result;
31693 bool IsPassThruZeroOrUndef = false;
31694
31695 if (Load->getPassThru()->isUndef()) {
31696 PassThru = DAG.getUNDEF(ContainerVT);
31697 IsPassThruZeroOrUndef = true;
31698 } else {
31699 if (ContainerVT.isInteger())
31700 PassThru = DAG.getConstant(0, DL, ContainerVT);
31701 else
31702 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
31703 if (isZerosVector(Load->getPassThru().getNode()))
31704 IsPassThruZeroOrUndef = true;
31705 }
31706
31707 if (!Load->isExpandingLoad()) {
31708 NewLoad =
31709 DAG.getMaskedLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
31710 Load->getOffset(), Mask, PassThru,
31711 Load->getMemoryVT(), Load->getMemOperand(),
31712 Load->getAddressingMode(), Load->getExtensionType());
31713 Result = NewLoad;
31714 } else {
31715 // Fixed-length masked.expandload intrinsics should have been scalarised
31716 // when the features required to use EXPAND are not available.
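// Rough shape of the expanding-load path below: CNTP counts the active mask
// lanes, GET_ACTIVE_LANE_MASK builds a mask covering that many contiguous
// lanes, a normal masked load fetches those contiguous elements, and EXPAND
// then scatters them into the lanes selected by the original mask.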
31717 assert(((Subtarget->isSVEAvailable() && Subtarget->hasSVE2p2()) ||
31718 (Subtarget->isSVEorStreamingSVEAvailable() &&
31719 Subtarget->hasSME2p2())) &&
31720 "Expected SVE2p2 or SME2p2");
31721
31722 SDValue CntActive = DAG.getNode(
31723 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
31724 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
31725 Mask);
31726
31727 SDValue ActiveMask =
31728 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, Mask->getValueType(0),
31729 DAG.getConstant(0, DL, MVT::i64), CntActive);
31730
31731 NewLoad = DAG.getMaskedLoad(
31732 ContainerVT, DL, Load->getChain(), Load->getBasePtr(),
31733 Load->getOffset(), ActiveMask, DAG.getUNDEF(ContainerVT),
31734 Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
31735 Load->getExtensionType());
31736
31737 Result = DAG.getNode(
31738 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
31739 DAG.getTargetConstant(Intrinsic::aarch64_sve_expand, DL, MVT::i64),
31740 Mask, NewLoad);
31741 }
31742
31743 if (!IsPassThruZeroOrUndef) {
31744 SDValue OldPassThru =
31745 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
31746 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
31747 }
31748
31749 Result = convertFromScalableVector(DAG, VT, Result);
31750 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
31751 return DAG.getMergeValues(MergedValues, DL);
31752}
31753
31754// Convert all fixed length vector stores larger than NEON to masked_stores.
31755SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
31756 SDValue Op, SelectionDAG &DAG) const {
31757 auto Store = cast<StoreSDNode>(Op);
31758
31759 SDLoc DL(Op);
31760 EVT VT = Store->getValue().getValueType();
31761 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31762 EVT MemVT = Store->getMemoryVT();
31763
31764 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
31765 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
31766
31767 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
31768 EVT TruncVT = ContainerVT.changeVectorElementType(
31769 *DAG.getContext(), Store->getMemoryVT().getVectorElementType());
31770 MemVT = MemVT.changeTypeToInteger();
31771 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
31772 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
31773 DAG.getPOISON(TruncVT));
31774 NewValue =
31775 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
31776 } else if (VT.isFloatingPoint()) {
31777 MemVT = MemVT.changeTypeToInteger();
31778 NewValue =
31779 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
31780 }
31781
31782 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
31783 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
31784 Store->getMemOperand(), Store->getAddressingMode(),
31785 Store->isTruncatingStore());
31786}
31787
31788SDValue AArch64TargetLowering::LowerMSTORE(SDValue Op,
31789 SelectionDAG &DAG) const {
31790 SDLoc DL(Op);
31791 auto Store = cast<MaskedStoreSDNode>(Op);
31792 EVT VT = Store->getValue().getValueType();
31793 if (VT.isFixedLengthVector())
31794 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
31795
31796 if (!Store->isCompressingStore())
31797 return SDValue();
31798
31799 EVT MaskVT = Store->getMask().getValueType();
31800 EVT MaskExtVT = getPromotedVTForPredicate(MaskVT);
31801 EVT MaskReduceVT = MaskExtVT.getScalarType();
31802 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
31803
31804 SDValue MaskExt =
31805 DAG.getNode(ISD::ZERO_EXTEND, DL, MaskExtVT, Store->getMask());
31806 SDValue CntActive =
31807 DAG.getNode(ISD::VECREDUCE_ADD, DL, MaskReduceVT, MaskExt);
31808 if (MaskReduceVT != MVT::i64)
31809 CntActive = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CntActive);
31810
31811 SDValue CompressedValue =
31812 DAG.getNode(ISD::VECTOR_COMPRESS, DL, VT, Store->getValue(),
31813 Store->getMask(), DAG.getPOISON(VT));
31814 SDValue CompressedMask =
31815 DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT, Zero, CntActive);
31816
31817 return DAG.getMaskedStore(Store->getChain(), DL, CompressedValue,
31818 Store->getBasePtr(), Store->getOffset(),
31819 CompressedMask, Store->getMemoryVT(),
31820 Store->getMemOperand(), Store->getAddressingMode(),
31821 Store->isTruncatingStore(),
31822 /*isCompressing=*/false);
31823}
31824
31825SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
31826 SDValue Op, SelectionDAG &DAG) const {
31827 auto Store = cast<MaskedStoreSDNode>(Op);
31828
31829 SDLoc DL(Op);
31830 EVT VT = Store->getValue().getValueType();
31831 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31832
31833 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
31834 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
31835
31836 return DAG.getMaskedStore(
31837 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
31838 Mask, Store->getMemoryVT(), Store->getMemOperand(),
31839 Store->getAddressingMode(), Store->isTruncatingStore(),
31840 Store->isCompressingStore());
31841}
31842
31843SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
31844 SDValue Op, SelectionDAG &DAG) const {
31845 SDLoc DL(Op);
31846 EVT VT = Op.getValueType();
31847 EVT EltVT = VT.getVectorElementType();
31848 unsigned Opc = Op.getOpcode();
31849 bool Signed = Opc == ISD::SDIV || Opc == ISD::MASKED_SDIV;
31850 bool Masked = Opc == ISD::MASKED_SDIV || Opc == ISD::MASKED_UDIV;
31851
31852 bool Negated;
31853 uint64_t SplatVal;
31854 // NOTE: ASRD cannot be used to represent sdiv-by-one.
31855 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
31856 SplatVal > 1) {
31857 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31858 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
31859 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32);
31860 SDValue Pg = Masked
31861 ? convertFixedMaskToScalableVector(Op.getOperand(2), DAG)
31862 : getPredicateForFixedLengthVector(DAG, DL, VT);
31863
31864 SDValue Res =
31865 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, ContainerVT, Pg, Op1, Op2);
31866 if (Negated)
31867 Res = DAG.getNode(ISD::SUB, DL, ContainerVT,
31868 DAG.getConstant(0, DL, ContainerVT), Res);
31869
31870 return convertFromScalableVector(DAG, VT, Res);
31871 }
31872
31873 // Scalable vector i32/i64 DIV is supported.
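// For instance, a fixed v4i32 sdiv is widened to nxv4i32 and emitted as a
// predicated divide (roughly: sdiv z0.s, p0/m, z0.s, z1.s) under a VL4
// predicate, then narrowed back to v4i32.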
31874 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
31875 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
31876 SDValue LHS = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
31877 SDValue RHS = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
31878 SDValue Mask = Masked
31879 ? convertFixedMaskToScalableVector(Op.getOperand(2), DAG)
31880 : getPredicateForFixedLengthVector(DAG, DL, VT);
31881
31882 unsigned MaskedOpcode = Signed ? ISD::MASKED_SDIV : ISD::MASKED_UDIV;
31883 SDValue Div = DAG.getNode(MaskedOpcode, DL, ContainerVT, LHS, RHS, Mask);
31884 return convertFromScalableVector(DAG, VT, Div);
31885 }
31886
31887 // Custom lowering requires splitting the mask, which increases complexity
31888 // with no real improvement to generated code compared to default expansion.
31889 if (Masked)
31890 return SDValue();
31891
31892 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
31893 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
31894 EVT PromVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
31895 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31896
31897 // If the wider type is legal: extend, op, and truncate.
31898 EVT WideVT = VT.widenIntegerVectorElementType(*DAG.getContext());
31899 if (DAG.getTargetLoweringInfo().isTypeLegal(WideVT)) {
31900 SDValue Op0 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(0));
31901 SDValue Op1 = DAG.getNode(ExtendOpcode, DL, WideVT, Op.getOperand(1));
31902 SDValue Div = DAG.getNode(Opc, DL, WideVT, Op0, Op1);
31903 return DAG.getNode(ISD::TRUNCATE, DL, VT, Div);
31904 }
31905
31906 auto HalveAndExtendVector = [&DAG, &DL, &HalfVT, &PromVT,
31907 &ExtendOpcode](SDValue Op) {
31908 SDValue IdxZero = DAG.getConstant(0, DL, MVT::i64);
31909 SDValue IdxHalf =
31910 DAG.getConstant(HalfVT.getVectorNumElements(), DL, MVT::i64);
31911 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxZero);
31912 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Op, IdxHalf);
31913 return std::pair<SDValue, SDValue>(
31914 {DAG.getNode(ExtendOpcode, DL, PromVT, Lo),
31915 DAG.getNode(ExtendOpcode, DL, PromVT, Hi)});
31916 };
31917
31918 // If the wider type is not legal: split, extend, op, truncate and concat.
31919 auto [Op0LoExt, Op0HiExt] = HalveAndExtendVector(Op.getOperand(0));
31920 auto [Op1LoExt, Op1HiExt] = HalveAndExtendVector(Op.getOperand(1));
31921 SDValue Lo = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0LoExt, Op1LoExt);
31922 SDValue Hi = DAG.getNode(Op.getOpcode(), DL, PromVT, Op0HiExt, Op1HiExt);
31923 SDValue LoTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Lo);
31924 SDValue HiTrunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Hi);
31925 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoTrunc, HiTrunc});
31926}
31927
31928SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
31929 SDValue Op, SelectionDAG &DAG) const {
31930 EVT VT = Op.getValueType();
31931 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31932
31933 SDLoc DL(Op);
31934 SDValue Val = Op.getOperand(0);
31935 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
31936 Val = convertToScalableVector(DAG, ContainerVT, Val);
31937
31938 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
31939 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
31940
31941 // Repeatedly unpack Val until the result is of the desired element type.
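// e.g. extending from an nxv16i8 container to i64 elements takes three
// unpack-low steps: nxv16i8 -> nxv8i16 -> nxv4i32 -> nxv2i64, using SUNPKLO
// for sign extension and UUNPKLO for zero extension.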
31942 switch (ContainerVT.getSimpleVT().SimpleTy) {
31943 default:
31944 llvm_unreachable("unimplemented container type");
31945 case MVT::nxv16i8:
31946 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
31947 if (VT.getVectorElementType() == MVT::i16)
31948 break;
31949 [[fallthrough]];
31950 case MVT::nxv8i16:
31951 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
31952 if (VT.getVectorElementType() == MVT::i32)
31953 break;
31954 [[fallthrough]];
31955 case MVT::nxv4i32:
31956 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
31957 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
31958 break;
31959 }
31960
31961 return convertFromScalableVector(DAG, VT, Val);
31962}
31963
31964SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
31965 SDValue Op, SelectionDAG &DAG) const {
31966 EVT VT = Op.getValueType();
31967 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
31968
31969 SDLoc DL(Op);
31970 SDValue Val = Op.getOperand(0);
31971 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
31972 Val = convertToScalableVector(DAG, ContainerVT, Val);
31973
31974 // Repeatedly truncate Val until the result is of the desired element type.
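// Each step bitcasts to the next narrower container and uses UZP1 with the
// value as both operands, keeping the even (low) half of every element:
// nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8.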
31975 switch (ContainerVT.getSimpleVT().SimpleTy) {
31976 default:
31977 llvm_unreachable("unimplemented container type");
31978 case MVT::nxv2i64:
31979 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
31980 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
31981 if (VT.getVectorElementType() == MVT::i32)
31982 break;
31983 [[fallthrough]];
31984 case MVT::nxv4i32:
31985 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
31986 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
31987 if (VT.getVectorElementType() == MVT::i16)
31988 break;
31989 [[fallthrough]];
31990 case MVT::nxv8i16:
31991 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
31992 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
31993 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
31994 break;
31995 }
31996
31997 return convertFromScalableVector(DAG, VT, Val);
31998}
31999
32000SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
32001 SDValue Op, SelectionDAG &DAG) const {
32002 EVT VT = Op.getValueType();
32003 EVT InVT = Op.getOperand(0).getValueType();
32004 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
32005
32006 SDLoc DL(Op);
32007 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
32008 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
32009
32010 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
32011}
32012
32013SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
32014 SDValue Op, SelectionDAG &DAG) const {
32015 EVT VT = Op.getValueType();
32016 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32017
32018 SDLoc DL(Op);
32019 EVT InVT = Op.getOperand(0).getValueType();
32020 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
32021 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
32022
32023 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
32024 Op.getOperand(1), Op.getOperand(2));
32025
32026 return convertFromScalableVector(DAG, VT, ScalableRes);
32027}
32028
32029// Convert vector operation 'Op' to an equivalent predicated operation whereby
32030// the original operation's type is used to construct a suitable predicate.
32031// NOTE: The results for inactive lanes are undefined.
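// For example (assuming NewOp is AArch64ISD::FADD_PRED), an ISD::FADD on
// nxv4f32 becomes FADD_PRED with an all-active predicate, while a fixed v4f32
// FADD is first widened into an nxv4f32 container and predicated with a VL4
// ptrue before being narrowed back.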
32032SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
32033 SelectionDAG &DAG,
32034 unsigned NewOp) const {
32035 EVT VT = Op.getValueType();
32036 SDLoc DL(Op);
32037 auto Pg = getPredicateForVector(DAG, DL, VT);
32038
32039 if (VT.isFixedLengthVector()) {
32040 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
32041 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32042
32043 // Create list of operands by converting existing ones to scalable types.
32044 SmallVector<SDValue, 4> Operands = {Pg};
32045 for (const SDValue &V : Op->op_values()) {
32046 if (isa<CondCodeSDNode>(V)) {
32047 Operands.push_back(V);
32048 continue;
32049 }
32050
32051 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
32052 EVT VTArg = VTNode->getVT().getVectorElementType();
32053 EVT NewVTArg =
32054 ContainerVT.changeVectorElementType(*DAG.getContext(), VTArg);
32055 Operands.push_back(DAG.getValueType(NewVTArg));
32056 continue;
32057 }
32058
32059 assert(isTypeLegal(V.getValueType()) &&
32060 "Expected only legal fixed-width types");
32061 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
32062 }
32063
32064 if (isMergePassthruOpcode(NewOp))
32065 Operands.push_back(DAG.getPOISON(ContainerVT));
32066
32067 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
32068 return convertFromScalableVector(DAG, VT, ScalableRes);
32069 }
32070
32071 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
32072
32073 SmallVector<SDValue, 4> Operands = {Pg};
32074 for (const SDValue &V : Op->op_values()) {
32075 assert((!V.getValueType().isVector() ||
32076 V.getValueType().isScalableVector()) &&
32077 "Only scalable vectors are supported!");
32078 Operands.push_back(V);
32079 }
32080
32081 if (isMergePassthruOpcode(NewOp))
32082 Operands.push_back(DAG.getPOISON(VT));
32083
32084 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
32085}
32086
32087// If a fixed length vector operation has no side effects when applied to
32088// undefined elements, we can safely use scalable vectors to perform the same
32089// operation without needing to worry about predication.
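// For example, a bitwise AND of two v16i8 values can simply be performed as an
// unpredicated AND of the nxv16i8 containers; whatever the undefined upper
// lanes produce is discarded when the fixed-length result is extracted again.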
32090SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
32091 SelectionDAG &DAG) const {
32092 EVT VT = Op.getValueType();
32094 "Only expected to lower fixed length vector operation!");
32095 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32096
32097 // Create list of operands by converting existing ones to scalable types.
32099 for (const SDValue &V : Op->op_values()) {
32100 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
32101
32102 // Pass through non-vector operands.
32103 if (!V.getValueType().isVector()) {
32104 Ops.push_back(V);
32105 continue;
32106 }
32107
32108 // "cast" fixed length vector to a scalable vector.
32109 assert(V.getValueType().isFixedLengthVector() &&
32110 isTypeLegal(V.getValueType()) &&
32111 "Only fixed length vectors are supported!");
32112 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
32113 }
32114
32115 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
32116 return convertFromScalableVector(DAG, VT, ScalableRes);
32117}
32118
32119SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
32120 SelectionDAG &DAG) const {
32121 SDLoc DL(ScalarOp);
32122 SDValue AccOp = ScalarOp.getOperand(0);
32123 SDValue VecOp = ScalarOp.getOperand(1);
32124 EVT SrcVT = VecOp.getValueType();
32125 EVT ResVT = SrcVT.getVectorElementType();
32126
32127 EVT ContainerVT = SrcVT;
32128 if (SrcVT.isFixedLengthVector()) {
32129 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
32130 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
32131 }
32132
32133 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
32134 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
32135
32136 // Convert operands to Scalable.
32137 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
32138 DAG.getPOISON(ContainerVT), AccOp, Zero);
32139
32140 // Perform reduction.
32141 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
32142 Pg, AccOp, VecOp);
32143
32144 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
32145}
32146
32147SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
32148 SelectionDAG &DAG) const {
32149 SDLoc DL(ReduceOp);
32150 SDValue Op = ReduceOp.getOperand(0);
32151 EVT OpVT = Op.getValueType();
32152 EVT VT = ReduceOp.getValueType();
32153
32154 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
32155 return SDValue();
32156
32157 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
32158
32159 switch (ReduceOp.getOpcode()) {
32160 default:
32161 return SDValue();
32162 case ISD::VECREDUCE_OR:
32163 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
32164 // The predicate can be 'Op' because
32165 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
32166 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
32167 else
32168 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
32169 case ISD::VECREDUCE_AND: {
32170 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
32171 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
32172 }
32173 case ISD::VECREDUCE_XOR: {
32174 SDValue ID =
32175 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
32176 if (OpVT == MVT::nxv1i1) {
32177 // Emulate a CNTP on .Q using .D and a different governing predicate.
32178 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
32179 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
32180 }
32181 SDValue Cntp =
32182 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
32183 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
32184 }
32185 }
32186
32187 return SDValue();
32188}
32189
32190/// Returns the pairwise SVE2 op that could be used for a v2<ty> reduction.
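// For example, vecreduce_smin of a v2i64 maps to aarch64_sve_sminp: a single
// SMINP of the vector with itself leaves min(v[0], v[1]) in lane 0, which the
// caller then extracts (see LowerReductionToSVE below).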
32191static std::optional<Intrinsic::ID> getPairwiseOpForReduction(unsigned Op) {
32192 switch (Op) {
32193 case ISD::VECREDUCE_SMIN:
32194 return Intrinsic::aarch64_sve_sminp;
32195 case ISD::VECREDUCE_SMAX:
32196 return Intrinsic::aarch64_sve_smaxp;
32197 case ISD::VECREDUCE_UMIN:
32198 return Intrinsic::aarch64_sve_uminp;
32199 case ISD::VECREDUCE_UMAX:
32200 return Intrinsic::aarch64_sve_umaxp;
32201 default:
32202 return std::nullopt;
32203 }
32204}
32205
32206/// Returns the corresponding predicated SVE reduction opcode for a VECREDUCE_*.
32207static unsigned getPredicatedReductionOpcode(unsigned Op) {
32208 switch (Op) {
32209 case ISD::VECREDUCE_ADD:
32210 return AArch64ISD::UADDV_PRED;
32211 case ISD::VECREDUCE_AND:
32212 return AArch64ISD::ANDV_PRED;
32213 case ISD::VECREDUCE_OR:
32214 return AArch64ISD::ORV_PRED;
32215 case ISD::VECREDUCE_SMAX:
32216 return AArch64ISD::SMAXV_PRED;
32217 case ISD::VECREDUCE_SMIN:
32218 return AArch64ISD::SMINV_PRED;
32219 case ISD::VECREDUCE_UMAX:
32220 return AArch64ISD::UMAXV_PRED;
32221 case ISD::VECREDUCE_UMIN:
32222 return AArch64ISD::UMINV_PRED;
32223 case ISD::VECREDUCE_XOR:
32224 return AArch64ISD::EORV_PRED;
32225 case ISD::VECREDUCE_FADD:
32226 return AArch64ISD::FADDV_PRED;
32227 case ISD::VECREDUCE_FMAX:
32228 return AArch64ISD::FMAXNMV_PRED;
32229 case ISD::VECREDUCE_FMIN:
32230 return AArch64ISD::FMINNMV_PRED;
32231 case ISD::VECREDUCE_FMAXIMUM:
32232 return AArch64ISD::FMAXV_PRED;
32233 case ISD::VECREDUCE_FMINIMUM:
32234 return AArch64ISD::FMINV_PRED;
32235 default:
32236 llvm_unreachable("unexpected opcode");
32237 }
32238}
32239
32240bool AArch64TargetLowering::shouldLowerReductionToSVE(
32241 SDValue RdxOp, std::optional<Intrinsic::ID> &PairwiseOpIID) const {
32242 EVT SrcVT = RdxOp.getOperand(0).getValueType();
32243 if (SrcVT.isScalableVector())
32244 return true;
32245
32246 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
32247 RdxOp.getOpcode() == ISD::VECREDUCE_AND ||
32248 RdxOp.getOpcode() == ISD::VECREDUCE_OR ||
32249 RdxOp.getOpcode() == ISD::VECREDUCE_XOR ||
32250 RdxOp.getOpcode() == ISD::VECREDUCE_FADD ||
32251 (RdxOp.getOpcode() != ISD::VECREDUCE_ADD &&
32252 SrcVT.getVectorElementType() == MVT::i64);
32253
32254 bool UseSVE = useSVEForFixedLengthVectorVT(
32255 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors());
32256
32257 // Always lower v2i64 vectors to pairwise SVE2 operations when possible as
32258 // NEON does not natively support reductions on v2i64. Lower v2i32 to pairwise
32259 // SVE2 operations when UseSVE is true, as the pairwise ops are likely to be
32260 // cheaper than a full reduction.
32261 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())
32262 if (SrcVT == MVT::v2i64 || (UseSVE && SrcVT == MVT::v2i32))
32263 if ((PairwiseOpIID = getPairwiseOpForReduction(RdxOp.getOpcode())))
32264 UseSVE = true;
32265
32266 return UseSVE;
32267}
32268
32269SDValue AArch64TargetLowering::LowerReductionToSVE(SDValue Op,
32270 SelectionDAG &DAG) const {
32271 SDLoc DL(Op);
32272 SDValue VecOp = Op.getOperand(0);
32273 EVT SrcVT = VecOp.getValueType();
32274
32275 // Lower VECREDUCE_ADD of nxv2i1-nxv16i1 to CNTP rather than UADDV.
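// e.g. vecreduce_add(zext <vscale x 16 x i1> P to <vscale x 16 x i8>) is just
// the number of active lanes in P, so it can be computed with a single
// "cntp x0, p0, p0.b" instead of a widening add reduction.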
32276 if (SrcVT.isScalableVector() && Op.getOpcode() == ISD::VECREDUCE_ADD &&
32277 VecOp.getOpcode() == ISD::ZERO_EXTEND) {
32278 SDValue BoolVec = VecOp.getOperand(0);
32279 if (BoolVec.getValueType().getVectorElementType() == MVT::i1) {
32280 // CNTP(BoolVec & BoolVec) <=> CNTP(BoolVec & PTRUE)
32281 SDValue CntpOp = DAG.getNode(
32282 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
32283 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64),
32284 BoolVec, BoolVec);
32285 return DAG.getAnyExtOrTrunc(CntpOp, DL, Op.getValueType());
32286 }
32287 }
32288
32289 if (SrcVT.isScalableVector() && SrcVT.getVectorElementType() == MVT::i1)
32290 return LowerPredReductionToSVE(Op, DAG);
32291
32292 std::optional<Intrinsic::ID> PairwiseOpIID;
32293 if (!shouldLowerReductionToSVE(Op, PairwiseOpIID))
32294 return SDValue();
32295
32296 if (!SrcVT.isScalableVector()) {
32297 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
32298 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
32299 }
32300
32301 EVT ResVT = SrcVT.getVectorElementType();
32302 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
32303
32304 SDValue Rdx;
32305 if (PairwiseOpIID) {
32306 Rdx = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecOp.getValueType(),
32307 DAG.getConstant(*PairwiseOpIID, DL, MVT::i32), Pg, VecOp,
32308 VecOp);
32309 } else {
32310 unsigned RdxOpcode = getPredicatedReductionOpcode(Op.getOpcode());
32311 // UADDV always returns an i64 result.
32312 if (RdxOpcode == AArch64ISD::UADDV_PRED)
32313 ResVT = MVT::i64;
32314 EVT RdxVT = SrcVT;
32315 if (SrcVT.isFixedLengthVector() || RdxOpcode == AArch64ISD::UADDV_PRED)
32316 RdxVT = getPackedSVEVectorVT(ResVT);
32317 Rdx = DAG.getNode(RdxOpcode, DL, RdxVT, Pg, VecOp);
32318 }
32319
32320 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
32321 Rdx, DAG.getConstant(0, DL, MVT::i64));
32322
32323 // The VEC_REDUCE nodes expect an element-sized result.
32324 if (ResVT != Op.getValueType())
32325 Res = DAG.getAnyExtOrTrunc(Res, DL, Op.getValueType());
32326
32327 return Res;
32328}
32329
32330SDValue
32331AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
32332 SelectionDAG &DAG) const {
32333 EVT VT = Op.getValueType();
32334 SDLoc DL(Op);
32335
32336 EVT InVT = Op.getOperand(1).getValueType();
32337 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
32338 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
32339 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
32340
32341 // Convert the mask to a predicate (NOTE: We don't need to worry about
32342 // inactive lanes since VSELECT is safe when given undefined elements).
32343 EVT MaskVT = Op.getOperand(0).getValueType();
32344 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
32345 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
32346 Mask = DAG.getNode(
32347 ISD::TRUNCATE, DL,
32348 MaskContainerVT.changeVectorElementType(*DAG.getContext(), MVT::i1),
32349 Mask);
32350
32351 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
32352 Mask, Op1, Op2);
32353
32354 return convertFromScalableVector(DAG, VT, ScalableRes);
32355}
32356
32357SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
32358 SDValue Op, SelectionDAG &DAG) const {
32359 SDLoc DL(Op);
32360 EVT InVT = Op.getOperand(0).getValueType();
32361 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
32362
32363 assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
32364 "Only expected to lower fixed length vector operation!");
32365 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
32366 "Expected integer result of the same bit length as the inputs!");
32367
32368 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
32369 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
32370 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
32371
32372 EVT CmpVT = Pg.getValueType();
32373 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
32374 {Pg, Op1, Op2, Op.getOperand(2)});
32375
32376 EVT PromoteVT = ContainerVT.changeTypeToInteger();
32377 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
32378 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
32379}
32380
32381SDValue
32382AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
32383 SelectionDAG &DAG) const {
32384 SDLoc DL(Op);
32385 auto SrcOp = Op.getOperand(0);
32386 EVT VT = Op.getValueType();
32387 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
32388 EVT ContainerSrcVT =
32389 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
32390
32391 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
32392 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
32393 return convertFromScalableVector(DAG, VT, Op);
32394}
32395
32396SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
32397 SDValue Op, SelectionDAG &DAG) const {
32398 SDLoc DL(Op);
32399 unsigned NumOperands = Op->getNumOperands();
32400
32401 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
32402 "Unexpected number of operands in CONCAT_VECTORS");
32403
32404 auto SrcOp1 = Op.getOperand(0);
32405 auto SrcOp2 = Op.getOperand(1);
32406 EVT VT = Op.getValueType();
32407 EVT SrcVT = SrcOp1.getValueType();
32408
32409 // Match a splat of 128b segments that fit in a single register.
32410 if (SrcVT.is128BitVector() && all_equal(Op.getNode()->op_values())) {
32411 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32412 SDValue Splat =
32413 DAG.getNode(AArch64ISD::DUPLANE128, DL, ContainerVT,
32414 convertToScalableVector(DAG, ContainerVT, SrcOp1),
32415 DAG.getConstant(0, DL, MVT::i64, /*isTarget=*/true));
32416 return convertFromScalableVector(DAG, VT, Splat);
32417 }
32418
32419 if (NumOperands > 2) {
32421 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
32422 for (unsigned I = 0; I < NumOperands; I += 2)
32423 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
32424 Op->getOperand(I), Op->getOperand(I + 1)));
32425
32426 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
32427 }
32428
32429 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32430
32431 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
32432 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
32433 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
32434
32435 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
32436
32437 return convertFromScalableVector(DAG, VT, Op);
32438}
32439
32440SDValue
32441AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
32442 SelectionDAG &DAG) const {
32443 EVT VT = Op.getValueType();
32444 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32445
32446 SDLoc DL(Op);
32447 SDValue Val = Op.getOperand(0);
32448 SDValue Pg = getPredicateForVector(DAG, DL, VT);
32449 EVT SrcVT = Val.getValueType();
32450 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32451 EVT ExtendVT = ContainerVT.changeVectorElementType(
32452 *DAG.getContext(), SrcVT.getVectorElementType());
32453
32454 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
32455 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
32456
32457 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
32458 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
32459 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT, Pg,
32460 Val, DAG.getPOISON(ContainerVT));
32461
32462 return convertFromScalableVector(DAG, VT, Val);
32463}
32464
32465SDValue
32466AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
32467 SelectionDAG &DAG) const {
32468 EVT VT = Op.getValueType();
32469 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32470
32471 SDLoc DL(Op);
32472 SDValue Val = Op.getOperand(0);
32473 EVT SrcVT = Val.getValueType();
32474 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
32475 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
32476 *DAG.getContext(), VT.getVectorElementType());
32477 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
32478
32479 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
32480 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
32481 Op.getOperand(1), DAG.getPOISON(RoundVT));
32482 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
32483 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
32484
32485 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
32486 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
32487}
32488
32489SDValue
32490AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
32491 SelectionDAG &DAG) const {
32492 EVT VT = Op.getValueType();
32493 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32494
32495 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
32496 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
32497 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
32498
32499 SDLoc DL(Op);
32500 SDValue Val = Op.getOperand(0);
32501 EVT SrcVT = Val.getValueType();
32502 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
32503 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
32504
32505 if (VT.bitsGE(SrcVT)) {
32506 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
32507
32508 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
32509 VT.changeTypeToInteger(), Val);
32510
32511 // Safe to use a larger-than-specified operand because by promoting the
32512 // value nothing has changed from an arithmetic point of view.
32513 Val =
32514 convertToScalableVector(DAG, ContainerDstVT.changeTypeToInteger(), Val);
32515 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
32516 DAG.getPOISON(ContainerDstVT));
32517 return convertFromScalableVector(DAG, VT, Val);
32518 } else {
32519 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
32520 *DAG.getContext(), ContainerDstVT.getVectorElementType());
32521 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
32522
32523 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
32524 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getPOISON(CvtVT));
32525 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
32526 Val = convertFromScalableVector(DAG, SrcVT, Val);
32527
32528 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
32529 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
32530 }
32531}
32532
32533SDValue
32534AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
32535 SelectionDAG &DAG) const {
32536 SDLoc DL(Op);
32537 EVT OpVT = Op.getValueType();
32538
32539 if (OpVT.isFixedLengthVector() && Op->getNumOperands() == 3) {
32540 Align Alignment = DAG.getReducedAlign(OpVT, /*UseABI=*/false);
32541 SDValue StackPtr =
32542 DAG.CreateStackTemporary(OpVT.getStoreSize() * 3, Alignment);
32543
32545 for (unsigned I = 0; I < 3; ++I) {
32546 SDValue Ptr =
32547 DAG.getMemBasePlusOffset(StackPtr, OpVT.getStoreSize() * I, DL);
32548 Chains.push_back(DAG.getStore(DAG.getEntryNode(), DL, Op.getOperand(I),
32549 Ptr, MachinePointerInfo()));
32550 }
32551
32553 Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
32554 Ops.push_back(
32555 DAG.getTargetConstant(Intrinsic::aarch64_neon_ld3, DL, MVT::i64));
32556 Ops.push_back(StackPtr);
32557
32558 EVT TripleOpVT =
32559 EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(),
32560 OpVT.getVectorNumElements() * 3);
32561 SDVTList VTs = DAG.getVTList(OpVT, OpVT, OpVT, MVT::Other);
32562 SDValue LD3 = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
32563 TripleOpVT, MachinePointerInfo(),
32564 Alignment, MachineMemOperand::MOLoad);
32565
32566 return DAG.getMergeValues(
32567 {LD3.getValue(0), LD3.getValue(1), LD3.getValue(2)}, DL);
32568 }
32569
32570 if (OpVT.isScalableVector() && Op->getNumOperands() == 3) {
32571 // aarch64_sve_ld3 only supports packed datatypes.
32572 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
32573 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
32574 SDValue StackPtr =
32575 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
32576
32577 // Write out unmodified operands.
32579 for (unsigned I = 0; I < 3; ++I) {
32580 SDValue Ptr =
32581 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
32582 SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
32583 Chains.push_back(
32584 DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
32585 }
32586
32587 Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
32588 EVT PredVT = PackedVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
32589
32591 Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
32592 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
32593 Ops.push_back(DAG.getConstant(1, DL, PredVT));
32594 Ops.push_back(StackPtr);
32595
32596 // Read back and deinterleave data.
32597 SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
32598 SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
32599
32601 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
32602 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
32603 Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
32604 return DAG.getMergeValues(Results, DL);
32605 }
32606
32607 // Are multi-register uzp instructions available?
32608 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
32609 OpVT.isScalableVector() && OpVT.getVectorElementType() != MVT::i1) {
32610 Intrinsic::ID IntID;
32611 switch (Op->getNumOperands()) {
32612 default:
32613 return SDValue();
32614 case 2:
32615 IntID = Intrinsic::aarch64_sve_uzp_x2;
32616 break;
32617 case 4:
32618 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
32619 OpVT.getScalarSizeInBits() == 64)
32620 return SDValue();
32621 IntID = Intrinsic::aarch64_sve_uzp_x4;
32622 break;
32623 }
32624
32626 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
32627 Ops.append(Op->op_values().begin(), Op->op_values().end());
32628 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
32629 }
32630
32631 if (Op->getNumOperands() != 2)
32632 return SDValue();
32633
32634 if (OpVT == MVT::v1i64 || OpVT == MVT::v1f64)
32635 return DAG.getMergeValues({Op.getOperand(0), Op.getOperand(1)}, DL);
32636
32637 SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
32638 Op.getOperand(1));
32639 SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
32640 Op.getOperand(1));
32641 return DAG.getMergeValues({Even, Odd}, DL);
32642}
32643
32644SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
32645 SelectionDAG &DAG) const {
32646 SDLoc DL(Op);
32647 EVT OpVT = Op.getValueType();
32648
32649 if (OpVT.isFixedLengthVector() && Op->getNumOperands() == 3) {
32650 Align Alignment = DAG.getReducedAlign(OpVT, /*UseABI=*/false);
32651 SDValue StackPtr =
32652 DAG.CreateStackTemporary(OpVT.getStoreSize() * 3, Alignment);
32653
32655 Ops.push_back(DAG.getEntryNode());
32656 Ops.push_back(
32657 DAG.getTargetConstant(Intrinsic::aarch64_neon_st3, DL, MVT::i64));
32658 for (SDValue V : Op->ops())
32659 Ops.push_back(V);
32660 Ops.push_back(StackPtr);
32661
32662 EVT TripleOpVT =
32663 EVT::getVectorVT(*DAG.getContext(), OpVT.getVectorElementType(),
32664 OpVT.getVectorNumElements() * 3);
32665 SDValue Chain = DAG.getMemIntrinsicNode(
32666 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops, TripleOpVT,
32667 MachinePointerInfo(), Alignment, MachineMemOperand::MOStore);
32668
32670 for (unsigned I = 0; I < 3; ++I) {
32671 SDValue Ptr =
32672 DAG.getMemBasePlusOffset(StackPtr, OpVT.getStoreSize() * I, DL);
32673 Results.push_back(
32674 DAG.getLoad(OpVT, DL, Chain, Ptr, MachinePointerInfo()));
32675 }
32676 return DAG.getMergeValues(Results, DL);
32677 }
32678
32679 if (OpVT.isScalableVector() && Op->getNumOperands() == 3) {
32680 // aarch64_sve_st3 only supports packed datatypes.
32681 EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
32683 for (SDValue V : Op->ops())
32684 InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
32685
32686 Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
32687 SDValue StackPtr =
32688 DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
32689
32690 Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
32691 EVT PredVT = PackedVT.changeVectorElementType(*DAG.getContext(), MVT::i1);
32692
32694 Ops.push_back(DAG.getEntryNode());
32695 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
32696 Ops.append(InVecs);
32697 Ops.push_back(DAG.getConstant(1, DL, PredVT));
32698 Ops.push_back(StackPtr);
32699
32700 // Interleave operands and store.
32701 SDValue Chain = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
32702
32703 // Read back the interleaved data.
32705 for (unsigned I = 0; I < 3; ++I) {
32706 SDValue Ptr =
32707 DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
32708 SDValue L = DAG.getLoad(PackedVT, DL, Chain, Ptr, MachinePointerInfo());
32709 Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
32710 }
32711
32712 return DAG.getMergeValues(Results, DL);
32713 }
32714
32715 // Are multi-register zip instructions available?
32716 // If so, use them for packed types. Interleaves of unpacked types can be
32717 // selected using trn1.
32718 if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
32719 OpVT.isScalableVector() && isPackedVectorType(OpVT, DAG)) {
32720 Intrinsic::ID IntID;
32721 switch (Op->getNumOperands()) {
32722 default:
32723 return SDValue();
32724 case 2:
32725 IntID = Intrinsic::aarch64_sve_zip_x2;
32726 break;
32727 case 4:
32728 if (Subtarget->getMinSVEVectorSizeInBits() < 256 &&
32729 OpVT.getScalarSizeInBits() == 64)
32730 return SDValue();
32731 IntID = Intrinsic::aarch64_sve_zip_x4;
32732 break;
32733 }
32734
32736 Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
32737 Ops.append(Op->op_values().begin(), Op->op_values().end());
32738 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op->getVTList(), Ops);
32739 }
32740
32741 if (Op->getNumOperands() != 2)
32742 return SDValue();
32743
32744 if (OpVT == MVT::v1i64 || OpVT == MVT::v1f64)
32745 return DAG.getMergeValues({Op.getOperand(0), Op.getOperand(1)}, DL);
32746
32747 SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
32748 Op.getOperand(1));
32749 SDValue Hi = DAG.getNode(AArch64ISD::ZIP2, DL, OpVT, Op.getOperand(0),
32750 Op.getOperand(1));
32751 return DAG.getMergeValues({Lo, Hi}, DL);
32752}
32753
32754SDValue AArch64TargetLowering::LowerVECTOR_HISTOGRAM(SDValue Op,
32755 SelectionDAG &DAG) const {
32756 // FIXME: Maybe share some code with LowerMGather/Scatter?
32757 MaskedHistogramSDNode *HG = cast<MaskedHistogramSDNode>(Op);
32758 SDLoc DL(HG);
32759 SDValue Chain = HG->getChain();
32760 SDValue Inc = HG->getInc();
32761 SDValue Mask = HG->getMask();
32762 SDValue Ptr = HG->getBasePtr();
32763 SDValue Index = HG->getIndex();
32764 SDValue Scale = HG->getScale();
32765 SDValue IntID = HG->getIntID();
32766
32767 // The Intrinsic ID determines the type of update operation.
32768 [[maybe_unused]] ConstantSDNode *CID = cast<ConstantSDNode>(IntID.getNode());
32769 // Right now, we only support 'add' as an update.
32770 assert(CID->getZExtValue() == Intrinsic::experimental_vector_histogram_add &&
32771 "Unexpected histogram update operation");
32772
32773 EVT IndexVT = Index.getValueType();
32774 LLVMContext &Ctx = *DAG.getContext();
32775 ElementCount EC = IndexVT.getVectorElementCount();
32776 EVT MemVT = EVT::getVectorVT(Ctx, HG->getMemoryVT(), EC);
32777 EVT IncExtVT =
32778 EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
32779 EVT IncSplatVT = EVT::getVectorVT(Ctx, IncExtVT, EC);
32780 bool ExtTrunc = IncSplatVT != MemVT;
32781
32782 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
32783 SDValue PassThru = DAG.getSplatVector(IncSplatVT, DL, Zero);
32784 SDValue IncSplat = DAG.getSplatVector(
32785 IncSplatVT, DL, DAG.getAnyExtOrTrunc(Inc, DL, IncExtVT));
32786 SDValue Ops[] = {Chain, PassThru, Mask, Ptr, Index, Scale};
32787
32788 MachineMemOperand *MMO = HG->getMemOperand();
32789 // Create an MMO for the gather, without load|store flags.
32790 MachineMemOperand *GMMO = DAG.getMachineFunction().getMachineMemOperand(
32792 MMO->getAlign(), MMO->getAAInfo());
32793 ISD::MemIndexType IndexType = HG->getIndexType();
32794 SDValue Gather = DAG.getMaskedGather(
32795 DAG.getVTList(IncSplatVT, MVT::Other), MemVT, DL, Ops, GMMO, IndexType,
32796 ExtTrunc ? ISD::EXTLOAD : ISD::NON_EXTLOAD);
32797
32798 SDValue GChain = Gather.getValue(1);
32799
32800 // Perform the histcnt, multiply by inc, add to bucket data.
32801 SDValue ID =
32802 DAG.getTargetConstant(Intrinsic::aarch64_sve_histcnt, DL, IncExtVT);
32803 SDValue HistCnt =
32804 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, ID, Mask, Index, Index);
32805 SDValue Mul = DAG.getNode(ISD::MUL, DL, IncSplatVT, HistCnt, IncSplat);
32806 SDValue Add = DAG.getNode(ISD::ADD, DL, IncSplatVT, Gather, Mul);
32807
32808 // Create an MMO for the scatter, without load|store flags.
32809 MachineMemOperand *SMMO = DAG.getMachineFunction().getMachineMemOperand(
32811 MMO->getAlign(), MMO->getAAInfo());
32812
32813 SDValue ScatterOps[] = {GChain, Add, Mask, Ptr, Index, Scale};
32814 SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MemVT, DL,
32815 ScatterOps, SMMO, IndexType, ExtTrunc);
32816 return Scatter;
32817}
32818
32819/// If a PARTIAL_REDUCE_MLA node comes in with an accumulator-input type pairing
32820/// of (nx)v2i64/(nx)v16i8, we cannot directly lower it to a (u|s)dot. We can
32821/// however still make use of the dot product instruction by instead
32822/// accumulating over two steps: (nx)v16i8 -> (nx)v4i32 -> (nx)v2i64.
32823/// If available, make use of the (U|S)ADDW(B|T) instructions, otherwise
32824/// the following pattern is emitted:
32825/// add(add(Acc, ext(EXTRACT_SUBVECTOR(N, 0))),
32826/// ext(EXTRACT_SUBVECTOR(N, NTy/2)))
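/// For instance (assuming SVE2 is available), accumulating nxv16i8 products
/// into an nxv2i64 accumulator is emitted as a (u|s)dot into an nxv4i32 of
/// zeros followed by (U|S)ADDWB and (U|S)ADDWT to widen and add the two
/// halves into the i64 accumulator.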
32827SDValue
32828AArch64TargetLowering::LowerPARTIAL_REDUCE_MLA(SDValue Op,
32829 SelectionDAG &DAG) const {
32830 SDLoc DL(Op);
32831
32832 SDValue Acc = Op.getOperand(0);
32833 SDValue LHS = Op.getOperand(1);
32834 SDValue RHS = Op.getOperand(2);
32835 EVT ResultVT = Op.getValueType();
32836 EVT OrigResultVT = ResultVT;
32837 EVT OpVT = LHS.getValueType();
32838
32839 // We can handle this case natively by accumulating into a wider
32840 // zero-padded vector.
32841 if (ResultVT == MVT::v2i32 && OpVT == MVT::v16i8) {
32842 SDValue ZeroVec = DAG.getConstant(0, DL, MVT::v4i32);
32843 SDValue WideAcc = DAG.getInsertSubvector(DL, ZeroVec, Acc, 0);
32844 SDValue Wide =
32845 DAG.getNode(Op.getOpcode(), DL, MVT::v4i32, WideAcc, LHS, RHS);
32846 SDValue Reduced = DAG.getNode(AArch64ISD::ADDP, DL, MVT::v4i32, Wide, Wide);
32847 return DAG.getExtractSubvector(DL, MVT::v2i32, Reduced, 0);
32848 }
32849
32850 bool ConvertToScalable =
32851 ResultVT.isFixedLengthVector() &&
32852 useSVEForFixedLengthVectorVT(ResultVT, /*OverrideNEON=*/true);
32853
32854 if (ConvertToScalable) {
32855 ResultVT = getContainerForFixedLengthVector(DAG, ResultVT);
32856 OpVT = getContainerForFixedLengthVector(DAG, LHS.getValueType());
32857 Acc = convertToScalableVector(DAG, ResultVT, Acc);
32858 LHS = convertToScalableVector(DAG, OpVT, LHS);
32859 RHS = convertToScalableVector(DAG, OpVT, RHS);
32860 Op = DAG.getNode(Op.getOpcode(), DL, ResultVT, {Acc, LHS, RHS});
32861 }
32862
32863 // Two-way and four-way partial reductions are supported by patterns.
32864 // We only need to handle the 8-way partial reduction.
32865 if (ResultVT.getScalarType() != MVT::i64 || OpVT.getScalarType() != MVT::i8)
32866 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Op)
32867 : Op;
32868
32869 EVT DotVT = ResultVT.isScalableVector() ? MVT::nxv4i32 : MVT::v4i32;
32870 SDValue DotNode = DAG.getNode(Op.getOpcode(), DL, DotVT,
32871 DAG.getConstant(0, DL, DotVT), LHS, RHS);
32872
32873 SDValue Res;
32874 bool IsUnsigned = Op.getOpcode() == ISD::PARTIAL_REDUCE_UMLA;
32875 if (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable()) {
32876 unsigned LoOpcode = IsUnsigned ? AArch64ISD::UADDWB : AArch64ISD::SADDWB;
32877 unsigned HiOpcode = IsUnsigned ? AArch64ISD::UADDWT : AArch64ISD::SADDWT;
32878 SDValue Lo = DAG.getNode(LoOpcode, DL, ResultVT, Acc, DotNode);
32879 Res = DAG.getNode(HiOpcode, DL, ResultVT, Lo, DotNode);
32880 } else {
32881 // Fold (nx)v4i32 into (nx)v2i64
32882 auto [DotNodeLo, DotNodeHi] = DAG.SplitVector(DotNode, DL);
32883 if (IsUnsigned) {
32884 DotNodeLo = DAG.getZExtOrTrunc(DotNodeLo, DL, ResultVT);
32885 DotNodeHi = DAG.getZExtOrTrunc(DotNodeHi, DL, ResultVT);
32886 } else {
32887 DotNodeLo = DAG.getSExtOrTrunc(DotNodeLo, DL, ResultVT);
32888 DotNodeHi = DAG.getSExtOrTrunc(DotNodeHi, DL, ResultVT);
32889 }
32890 auto Lo = DAG.getNode(ISD::ADD, DL, ResultVT, Acc, DotNodeLo);
32891 Res = DAG.getNode(ISD::ADD, DL, ResultVT, Lo, DotNodeHi);
32892 }
32893
32894 return ConvertToScalable ? convertFromScalableVector(DAG, OrigResultVT, Res)
32895 : Res;
32896}
32897
32898SDValue
32899AArch64TargetLowering::LowerGET_ACTIVE_LANE_MASK(SDValue Op,
32900 SelectionDAG &DAG) const {
32901 EVT VT = Op.getValueType();
32902 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32903
32904 assert(Subtarget->isSVEorStreamingSVEAvailable() &&
32905 "Lowering fixed length get_active_lane_mask requires SVE!");
32906
32907 // There are no dedicated fixed-length instructions for GET_ACTIVE_LANE_MASK,
32908 // but we can use SVE when available.
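// e.g. a v8i16 get_active_lane_mask is computed as an SVE whilelo-style
// nxv8i1 mask, sign-extended to all-ones/zero i16 lanes in the nxv8i16
// container, and the low fixed-length part is then extracted.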
32909
32910 SDLoc DL(Op);
32911 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
32912 EVT WhileVT = ContainerVT.changeElementType(*DAG.getContext(), MVT::i1);
32913
32914 SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WhileVT,
32915 Op.getOperand(0), Op.getOperand(1));
32916 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
32917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
32918 DAG.getVectorIdxConstant(0, DL));
32919}
32920
32921SDValue
32922AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
32923 SelectionDAG &DAG) const {
32924 EVT VT = Op.getValueType();
32925 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
32926
32927 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
32928 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
32929 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
32930
32931 SDLoc DL(Op);
32932 SDValue Val = Op.getOperand(0);
32933 EVT SrcVT = Val.getValueType();
32934 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
32935 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
32936
32937 if (VT.bitsGT(SrcVT)) {
32938 EVT CvtVT = ContainerDstVT.changeVectorElementType(
32939 *DAG.getContext(), ContainerSrcVT.getVectorElementType());
32940 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
32941
32942 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
32943 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
32944
32945 Val = convertToScalableVector(DAG, ContainerDstVT, Val);
32946 Val = getSVESafeBitCast(CvtVT, Val, DAG);
32947 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
32948 DAG.getPOISON(ContainerDstVT));
32949 return convertFromScalableVector(DAG, VT, Val);
32950 } else {
32951 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
32952 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
32953
32954 // Safe to use a larger-than-specified result since an fp_to_int where the
32955 // result doesn't fit into the destination is undefined.
32956 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
32957 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getPOISON(CvtVT));
32958 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
32959
32960 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
32961 }
32962}
32963
32964static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
32965 ArrayRef<int> ShuffleMask, EVT VT,
32966 EVT ContainerVT, SelectionDAG &DAG) {
32967 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
32968 SDLoc DL(Op);
32969 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
32970 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
32971 bool IsSingleOp =
32972 ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
32973
32974 if (!Subtarget.isNeonAvailable() && !MinSVESize)
32975 MinSVESize = 128;
32976
32977 // Bail out on two-operand shuffles if SVE2 is unavailable or not all
32978 // index values can be represented.
32979 if (!IsSingleOp && !Subtarget.hasSVE2())
32980 return SDValue();
32981
32982 EVT VTOp1 = Op.getOperand(0).getValueType();
32983 unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits();
32984 unsigned IndexLen = MinSVESize / BitsPerElt;
32985 unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
32986 uint64_t MaxOffset = maxUIntN(BitsPerElt);
32987 EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger();
32988 EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen);
32989 bool MinMaxEqual = (MinSVESize == MaxSVESize);
32990 assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
32991 "Incorrectly legalised shuffle operation");
32992
32993 SmallVector<SDValue, 8> TBLMask;
32994 // If MinSVESize is not equal to MaxSVESize then we need to know which
32995 // TBL mask element needs adjustment.
32996 SmallVector<SDValue, 8> AddRuntimeVLMask;
32997
32998 // Bail out for 8-bit element types, because with a 2048-bit SVE register
32999 // size 8 bits are only sufficient to index into the first source vector.
33000 if (!IsSingleOp && !MinMaxEqual && BitsPerElt == 8)
33001 return SDValue();
33002
33003 for (int Index : ShuffleMask) {
33004 // Handle poison index values.
33005 if (Index < 0)
33006 Index = 0;
33007 // If the mask refers to elements in the second operand, then we have to
33008 // offset the index by the number of elements in a vector. If this number
33009 // is not known at compile-time, we need to maintain a mask with 'VL' values
33010 // to add at runtime.
33011 if ((unsigned)Index >= ElementsPerVectorReg) {
33012 if (MinMaxEqual) {
33013 Index += IndexLen - ElementsPerVectorReg;
33014 } else {
33015 Index = Index - ElementsPerVectorReg;
33016 AddRuntimeVLMask.push_back(DAG.getConstant(1, DL, MVT::i64));
33017 }
33018 } else if (!MinMaxEqual)
33019 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
33020 // For 8-bit elements and 1024-bit SVE registers, MaxOffset equals 255 and
33021 // such an index might point to the last element of the second shufflevector
33022 // operand, so we reject this transform.
33023 if ((unsigned)Index >= MaxOffset)
33024 return SDValue();
33025 TBLMask.push_back(DAG.getConstant(Index, DL, MVT::i64));
33026 }
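// Worked example for the compile-time case (MinSVESize == MaxSVESize): with
// 256-bit registers and v8i16 operands, IndexLen = 256 / 16 = 16 and
// ElementsPerVectorReg = 8. A shuffle index of 11 (element 3 of the second
// operand) becomes 11 + (16 - 8) = 19, i.e. element 3 of the second TBL
// source once it is laid out after the full first register. The numbers are
// illustrative only.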
33027
33028 // TBL zeroes any lane whose index is out of range, so pad the remaining
33029 // mask elements with MaxOffset rather than a value that would duplicate an
33030 // existing lane. Note that for i8 elements an out-of-range index can still
33031 // be a valid index when the vector register is 2048 bits wide.
33032 for (unsigned i = 0; i < IndexLen - ElementsPerVectorReg; ++i) {
33033 TBLMask.push_back(DAG.getConstant((int)MaxOffset, DL, MVT::i64));
33034 if (!MinMaxEqual)
33035 AddRuntimeVLMask.push_back(DAG.getConstant(0, DL, MVT::i64));
33036 }
33037
33038 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskType);
33039 SDValue VecMask =
33040 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
33041 SDValue SVEMask = convertToScalableVector(DAG, MaskContainerVT, VecMask);
33042
33043 SDValue Shuffle;
33044 if (IsSingleOp)
33045 Shuffle = DAG.getNode(
33046 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
33047 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl, DL, MVT::i32), Op1,
33048 SVEMask);
33049 else if (Subtarget.hasSVE2()) {
33050 if (!MinMaxEqual) {
33051 unsigned MinNumElts = AArch64::SVEBitsPerBlock / BitsPerElt;
33052 SDValue VScale = (BitsPerElt == 64)
33053 ? DAG.getVScale(DL, MVT::i64, APInt(64, MinNumElts))
33054 : DAG.getVScale(DL, MVT::i32, APInt(32, MinNumElts));
33055 SDValue VecMask =
33056 DAG.getBuildVector(MaskType, DL, ArrayRef(TBLMask.data(), IndexLen));
33057 SDValue MulByMask = DAG.getNode(
33058 ISD::MUL, DL, MaskType,
33059 DAG.getNode(ISD::SPLAT_VECTOR, DL, MaskType, VScale),
33060 DAG.getBuildVector(MaskType, DL,
33061 ArrayRef(AddRuntimeVLMask.data(), IndexLen)));
33062 SDValue UpdatedVecMask =
33063 DAG.getNode(ISD::ADD, DL, MaskType, VecMask, MulByMask);
33064 SVEMask = convertToScalableVector(
33065 DAG, getContainerForFixedLengthVector(DAG, MaskType), UpdatedVecMask);
33066 }
33067 Shuffle = DAG.getNode(
33068 ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
33069 DAG.getTargetConstant(Intrinsic::aarch64_sve_tbl2, DL, MVT::i32), Op1,
33070 Op2, SVEMask);
33071 }
33072 Shuffle = convertFromScalableVector(DAG, VT, Shuffle);
33073 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
33074}
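// Roughly, in the runtime-VL case (MinSVESize != MaxSVESize) the register
// length cannot be baked into the mask, so indices that refer to the second
// operand keep their local value and AddRuntimeVLMask records a 1 for them.
// The SVE2 branch then fixes the mask up at runtime as
//   index += AddRuntimeVLMask[i] * (vscale * (128 / BitsPerElt))
// via the splatted VScale multiply/add before emitting sve_tbl2.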
33075
33076SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
33077 SDValue Op, SelectionDAG &DAG) const {
33078 EVT VT = Op.getValueType();
33079 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
33080
33081 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
33082 auto ShuffleMask = SVN->getMask();
33083
33084 SDLoc DL(Op);
33085 SDValue Op1 = Op.getOperand(0);
33086 SDValue Op2 = Op.getOperand(1);
33087
33088 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
33089 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
33090 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
33091
33092 auto MinLegalExtractEltScalarTy = [](EVT ScalarTy) -> EVT {
33093 if (ScalarTy == MVT::i8 || ScalarTy == MVT::i16)
33094 return MVT::i32;
33095 return ScalarTy;
33096 };
33097
33098 if (SVN->isSplat()) {
33099 unsigned Lane = std::max(0, SVN->getSplatIndex());
33100 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
33101 SDValue SplatEl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
33102 DAG.getConstant(Lane, DL, MVT::i64));
33103 Op = DAG.getNode(ISD::SPLAT_VECTOR, DL, ContainerVT, SplatEl);
33104 return convertFromScalableVector(DAG, VT, Op);
33105 }
33106
33107 bool ReverseEXT = false;
33108 unsigned Imm;
33109 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
33110 Imm == VT.getVectorNumElements() - 1) {
33111 if (ReverseEXT)
33112 std::swap(Op1, Op2);
33113 EVT ScalarTy = MinLegalExtractEltScalarTy(VT.getVectorElementType());
33114 SDValue Scalar = DAG.getNode(
33115 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
33116 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
33117 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
33118 return convertFromScalableVector(DAG, VT, Op);
33119 }
33120
33121 unsigned EltSize = VT.getScalarSizeInBits();
33122 for (unsigned BlockSize : {64U, 32U, 16U}) {
33123 if (isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), BlockSize)) {
33124 unsigned RevOp;
33125 if (EltSize == 8)
33126 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
33127 else if (EltSize == 16)
33128 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
33129 else
33130 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
33131 EVT BlockedVT =
33132 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), BlockSize));
33133 SDValue Pg = getPredicateForVector(DAG, DL, BlockedVT);
33134 SDValue BlockedOp1 = DAG.getNode(ISD::BITCAST, DL, BlockedVT, Op1);
33135 SDValue BlockedRev = DAG.getNode(RevOp, DL, BlockedVT, Pg, BlockedOp1,
33136 DAG.getPOISON(BlockedVT));
33137 SDValue Container =
33138 DAG.getNode(ISD::BITCAST, DL, ContainerVT, BlockedRev);
33139 return convertFromScalableVector(DAG, VT, Container);
33140 }
33141 }
33142
33143 if (Subtarget->hasSVE2p1() && EltSize == 64 &&
33144 isREVMask(ShuffleMask, EltSize, VT.getVectorNumElements(), 128)) {
33145 SDValue Pg = getPredicateForVector(DAG, DL, VT);
33146 SDValue Revd = DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, ContainerVT,
33147 Pg, Op1, DAG.getPOISON(ContainerVT));
33148 return convertFromScalableVector(DAG, VT, Revd);
33149 }
33150
33151 unsigned WhichResult;
33152 unsigned OperandOrder;
33153 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
33154 OperandOrder) &&
33155 WhichResult == 0) {
33156 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT,
33157 OperandOrder == 0 ? Op1 : Op2,
33158 OperandOrder == 0 ? Op2 : Op1);
33159 return convertFromScalableVector(DAG, VT, ZIP);
33160 }
33161
33162 if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
33163 OperandOrder)) {
33164 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
33165 SDValue TRN =
33166 DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
33167 OperandOrder == 0 ? Op2 : Op1);
33168 return convertFromScalableVector(DAG, VT, TRN);
33169 }
33170
33171 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
33172 return convertFromScalableVector(
33173 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
33174
33175 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
33176 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
33177 return convertFromScalableVector(
33178 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
33179 }
33180
33181 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
33182 // represents the same logical operation as performed by a ZIP instruction. In
33183 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
33184 // equivalent to an AArch64 instruction. There's the extra component of
33185 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
33186 // only operated on 64/128bit vector types that have a direct mapping to a
33187 // target register and so an exact mapping is implied.
33188 // However, when using SVE for fixed length vectors, most legal vector types
33189 // are actually sub-vectors of a larger SVE register. When mapping
33190 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
33191 // how the mask's indices translate. Specifically, when the mapping requires
33192 // an exact meaning for a specific vector index (e.g. Index X is the last
33193 // vector element in the register) then such mappings are often only safe when
33194 // the exact SVE register size is known. The main exception to this is when
33195 // indices are logically relative to the first element of either
33196 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
33197 // when converting from fixed-length to scalable vector types (i.e. the start
33198 // of a fixed length vector is always the start of a scalable vector).
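// For example, a reverse mask <3,2,1,0> on v4i32 names "the last element of
// the register" explicitly; in a 256-bit SVE register the fixed v4i32 only
// occupies the low 128 bits, so mapping it to a whole-register operation is
// only valid when the register is known to be exactly 128 bits. That is why
// the exact-size check below guards VECTOR_REVERSE, ZIP2 and the UZP patterns.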
33199 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
33200 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
33201 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
33202 if (ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size()) &&
33203 Op2.isUndef()) {
33204 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
33205 return convertFromScalableVector(DAG, VT, Op);
33206 }
33207
33208 if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
33209 OperandOrder) &&
33210 WhichResult != 0) {
33211 SDValue ZIP = DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT,
33212 OperandOrder == 0 ? Op1 : Op2,
33213 OperandOrder == 0 ? Op2 : Op1);
33214 return convertFromScalableVector(DAG, VT, ZIP);
33215 }
33216
33217 if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
33218 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
33219 return convertFromScalableVector(
33220 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
33221 }
33222
33223 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
33224 return convertFromScalableVector(
33225 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
33226
33227 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
33228 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
33229 return convertFromScalableVector(
33230 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
33231 }
33232
33233 if ((Subtarget->hasSVE2p1() || Subtarget->hasSME2p1()) &&
33234 Subtarget->isSVEorStreamingSVEAvailable()) {
33236 "Unsupported SVE vector size");
33237
33238 unsigned Segments = VT.getFixedSizeInBits() / AArch64::SVEBitsPerBlock;
33239 unsigned SegmentElts = VT.getVectorNumElements() / Segments;
33240 if (std::optional<unsigned> Lane =
33241 isDUPQMask(ShuffleMask, Segments, SegmentElts)) {
33242 SDValue IID = DAG.getTargetConstant(Intrinsic::aarch64_sve_dup_laneq,
33243 DL, MVT::i64);
33244 return convertFromScalableVector(
33245 DAG, VT,
33246 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
33247 {IID, Op1,
33248 DAG.getConstant(*Lane, DL, MVT::i64,
33249 /*isTarget=*/true)}));
33250 }
33251 }
33252 }
33253
33254 // Try to widen the shuffle before generating a possibly expensive SVE TBL.
33255 // This may allow the shuffle to be matched as something cheaper like ZIP1.
33256 if (SDValue WideOp = tryWidenMaskForShuffle(Op, DAG))
33257 return WideOp;
33258
33259 // Avoid producing a TBL instruction if we don't know the minimal SVE register
33260 // size, unless NEON is unavailable and we can assume the minimal SVE register
33261 // size is 128 bits.
33262 if (MinSVESize || !Subtarget->isNeonAvailable())
33263 return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
33264 DAG);
33265
33266 return SDValue();
33267}
33268
33269SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
33270 SelectionDAG &DAG) const {
33271 SDLoc DL(Op);
33272 EVT InVT = Op.getValueType();
33273
33274 assert(VT.isScalableVector() && isTypeLegal(VT) &&
33275 InVT.isScalableVector() && isTypeLegal(InVT) &&
33276 "Only expect to cast between legal scalable vector types!");
33277 assert(VT.getVectorElementType() != MVT::i1 &&
33278 InVT.getVectorElementType() != MVT::i1 &&
33279 "For predicate bitcasts, use getSVEPredicateBitCast");
33280
33281 if (InVT == VT)
33282 return Op;
33283
33284 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
33285 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
33286
33287 // Safe bitcasting between unpacked vector types of different element counts
33288 // is currently unsupported because the following is missing the necessary
33289 // work to ensure the result's elements live where they're supposed to within
33290 // an SVE register.
33291 // 01234567
33292 // e.g. nxv2i32 = XX??XX??
33293 // nxv4f16 = X?X?X?X?
33294 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
33295 VT == PackedVT || InVT == PackedInVT) &&
33296 "Unexpected bitcast!");
33297
33298 // Pack input if required.
33299 if (InVT != PackedInVT)
33300 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
33301
33302 if (Subtarget->isLittleEndian() ||
33303 PackedVT.getScalarSizeInBits() == PackedInVT.getScalarSizeInBits())
33304 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
33305 else {
33306 EVT PackedVTAsInt = PackedVT.changeTypeToInteger();
33307 EVT PackedInVTAsInt = PackedInVT.changeTypeToInteger();
33308
33309 // Simulate the effect of casting through memory.
33310 Op = DAG.getNode(ISD::BITCAST, DL, PackedInVTAsInt, Op);
33311 if (PackedInVTAsInt.getScalarSizeInBits() != 8)
33312 Op = DAG.getNode(ISD::BSWAP, DL, PackedInVTAsInt, Op);
33313 Op = DAG.getNode(AArch64ISD::NVCAST, DL, PackedVTAsInt, Op);
33314 if (PackedVTAsInt.getScalarSizeInBits() != 8)
33315 Op = DAG.getNode(ISD::BSWAP, DL, PackedVTAsInt, Op);
33316 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
33317 }
33318
33319 // Unpack result if required.
33320 if (VT != PackedVT)
33321 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
33322
33323 return Op;
33324}
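// Illustrative big-endian example for the cast above (types are assumptions):
// casting nxv2f64 to nxv4f32 first bitcasts to nxv2i64, byte-swaps the 64-bit
// lanes, NVCASTs to nxv4i32, byte-swaps the 32-bit lanes and bitcasts to
// nxv4f32. The two BSWAPs account for the lane-size change so the result
// matches what a store/reload through memory would have produced.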
33325
33326 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
33327 SDValue N) const {
33328 return ::isAllActivePredicate(DAG, N);
33329}
33330
33331 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
33332 return ::getPromotedVTForPredicate(VT);
33333}
33334
33335bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
33336 SDValue Op, const APInt &OriginalDemandedBits,
33337 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
33338 unsigned Depth) const {
33339
33340 unsigned Opc = Op.getOpcode();
33341 switch (Opc) {
33342 case AArch64ISD::VSHL: {
33343 // Match (VSHL (VLSHR Val X) X)
33344 SDValue ShiftL = Op;
33345 SDValue ShiftR = Op->getOperand(0);
33346 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
33347 return false;
33348
33349 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
33350 return false;
33351
33352 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
33353 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
33354
33355 // Other cases can be handled as well, but this is not
33356 // implemented.
33357 if (ShiftRBits != ShiftLBits)
33358 return false;
33359
33360 unsigned ScalarSize = Op.getScalarValueSizeInBits();
33361 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
33362
33363 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
33364 APInt UnusedBits = ~OriginalDemandedBits;
33365
33366 if ((ZeroBits & UnusedBits) != ZeroBits)
33367 return false;
33368
33369 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
33370 // used - simplify to just Val.
33371 return TLO.CombineTo(Op, ShiftR->getOperand(0));
33372 }
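// Worked example for the VSHL/VLSHR fold above: for i32 lanes with a shift
// amount of 8, (VSHL (VLSHR Val 8) 8) clears bits [7:0] of each lane. If the
// caller only demands bits [31:8] (so OriginalDemandedBits has the low 8 bits
// clear), ZeroBits is a subset of the unused bits and the pair of shifts can
// be replaced by Val directly. The numbers are illustrative only.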
33373 case AArch64ISD::BICi: {
33374 // Fold BICi if all destination bits already known to be zeroed
33375 SDValue Op0 = Op.getOperand(0);
33376 KnownBits KnownOp0 =
33377 TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1);
33378 // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2))
33379 APInt BitsToClear =
33380 (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
33381 .trunc(KnownOp0.getBitWidth());
33382 APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero;
33383 if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear))
33384 return TLO.CombineTo(Op, Op0);
33385
33386 Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear);
33387 return false;
33388 }
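// Worked example for the BICi fold above: a BICi with immediate 0xff and
// shift 8 clears bits [15:8] of every lane. If computeKnownBits already shows
// those bits as zero in Op0, the BICi is redundant and is replaced by Op0;
// otherwise the cleared bits are simply added to the known-zero mask.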
33389 case ISD::INTRINSIC_WO_CHAIN: {
33390 std::optional<ElementCount> MaxCount = getMaxValueForSVECntIntrinsic(Op);
33391 if (!MaxCount)
33392 return false;
33393 unsigned MaxSVEVectorSizeInBits = Subtarget->getMaxSVEVectorSizeInBits();
33394 if (!MaxSVEVectorSizeInBits)
33395 MaxSVEVectorSizeInBits = AArch64::SVEMaxBitsPerVector;
33396 unsigned VscaleMax = MaxSVEVectorSizeInBits / 128;
33397 unsigned MaxValue = MaxCount->getKnownMinValue() * VscaleMax;
33398 // The SVE count intrinsics don't support the multiplier immediate so we
33399 // don't have to account for that here. The value returned may be slightly
33400 // over the true required bits, as this is based on the "ALL" pattern. The
33401 // other patterns are also exposed by these intrinsics, but they all
33402 // return a value that's strictly less than "ALL".
33403 unsigned RequiredBits = llvm::bit_width(MaxValue);
33404 unsigned BitWidth = Known.Zero.getBitWidth();
33405 if (RequiredBits < BitWidth)
33406 Known.Zero.setHighBits(BitWidth - RequiredBits);
33407 return false;
33408 }
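// Worked example for the count-intrinsic bound above: for a CNTB-style
// intrinsic the per-128-bit count is 16, and with the architectural maximum
// of 2048-bit vectors VscaleMax is 2048 / 128 = 16, so MaxValue is 256.
// bit_width(256) is 9, hence for an i64 result the top 64 - 9 = 55 bits can
// be marked known-zero. The figures assume no tighter maximum vector size was
// specified on the subtarget.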
33409 }
33410
33411 return TargetLowering::SimplifyDemandedBitsForTargetNode(
33412 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
33413}
33414
33415bool AArch64TargetLowering::canCreateUndefOrPoisonForTargetNode(
33416 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
33417 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
33418
33419 // TODO: Add more target nodes.
33420 switch (Op.getOpcode()) {
33421 case AArch64ISD::MOVI:
33422 case AArch64ISD::MOVIedit:
33423 case AArch64ISD::MOVImsl:
33424 case AArch64ISD::MOVIshift:
33425 case AArch64ISD::MVNImsl:
33426 case AArch64ISD::MVNIshift:
33427 case AArch64ISD::VASHR:
33428 case AArch64ISD::VLSHR:
33429 case AArch64ISD::VSHL:
33430 return false;
33431 }
33432 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
33433 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
33434}
33435
33436bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
33437 return Op.getOpcode() == AArch64ISD::DUP ||
33438 Op.getOpcode() == AArch64ISD::MOVI ||
33439 Op.getOpcode() == AArch64ISD::MOVIshift ||
33440 Op.getOpcode() == AArch64ISD::MOVImsl ||
33441 Op.getOpcode() == AArch64ISD::MOVIedit ||
33442 Op.getOpcode() == AArch64ISD::MVNIshift ||
33443 Op.getOpcode() == AArch64ISD::MVNImsl ||
33444 // Ignoring fneg(movi(0)), because if it is folded to FPConstant(-0.0),
33445 // ISel will select fmov(mov i64 0x8000000000000000), resulting in a
33446 // fmov from fpr to gpr, which is more expensive than fneg(movi(0))
33447 (Op.getOpcode() == ISD::FNEG &&
33448 Op.getOperand(0).getOpcode() == AArch64ISD::MOVIedit &&
33449 Op.getOperand(0).getConstantOperandVal(0) == 0) ||
33450 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
33451 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
33452 TargetLowering::isTargetCanonicalConstantNode(Op);
33453}
33454
33455 bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
33456 return Subtarget->hasSVE() || Subtarget->hasSVE2() ||
33457 Subtarget->hasComplxNum();
33458}
33459
33460 bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
33461 ComplexDeinterleavingOperation Operation, Type *Ty) const {
33462 auto *VTy = dyn_cast<VectorType>(Ty);
33463 if (!VTy)
33464 return false;
33465
33466 // If the vector is scalable, SVE is enabled, implying support for complex
33467 // numbers. Otherwise, we need to ensure complex number support is available.
33468 if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
33469 return false;
33470
33471 auto *ScalarTy = VTy->getScalarType();
33472 unsigned NumElements = VTy->getElementCount().getKnownMinValue();
33473
33474 // We can only process vectors that have a bit size of 128 or higher (or 64
33475 // bits when NEON is available). Additionally, these vectors must have a
33476 // power-of-2 size, as we later split them into the smallest supported size
33477 // and merge them back together after applying the complex operation.
33478 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
33479 if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
33480 !llvm::isPowerOf2_32(VTyWidth))
33481 return false;
33482
33483 if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
33484 unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
33485
33486 if (Operation == ComplexDeinterleavingOperation::CDot)
33487 return ScalarWidth == 32 || ScalarWidth == 64;
33488 return 8 <= ScalarWidth && ScalarWidth <= 64;
33489 }
33490
33491 // CDot is not supported outside of scalable/SVE scopes.
33492 if (Operation == ComplexDeinterleavingOperation::CDot)
33493 return false;
33494
33495 return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
33496 ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
33497}
33498
33499 Value *AArch64TargetLowering::createComplexDeinterleavingIR(
33500 IRBuilderBase &B, ComplexDeinterleavingOperation OperationType,
33501 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
33502 Value *Accumulator) const {
33503 VectorType *Ty = cast<VectorType>(InputA->getType());
33504 if (Accumulator == nullptr)
33505 Accumulator = Constant::getNullValue(Ty);
33506 bool IsScalable = Ty->isScalableTy();
33507 bool IsInt = Ty->getElementType()->isIntegerTy();
33508
33509 unsigned TyWidth =
33510 Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
33511
33512 assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
33513 "Vector type must be either 64 or a power of 2 that is at least 128");
33514
33515 if (TyWidth > 128) {
33516 int Stride = Ty->getElementCount().getKnownMinValue() / 2;
33517 int AccStride = cast<VectorType>(Accumulator->getType())
33518 ->getElementCount()
33519 .getKnownMinValue() /
33520 2;
33521 auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
33522 auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, uint64_t(0));
33523 auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, uint64_t(0));
33524 auto *UpperSplitA = B.CreateExtractVector(HalfTy, InputA, Stride);
33525 auto *UpperSplitB = B.CreateExtractVector(HalfTy, InputB, Stride);
33526 Value *LowerSplitAcc = nullptr;
33527 Value *UpperSplitAcc = nullptr;
33528 Type *FullTy = Ty;
33529 FullTy = Accumulator->getType();
33530 auto *HalfAccTy = VectorType::getHalfElementsVectorType(
33531 cast<VectorType>(Accumulator->getType()));
33532 LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, uint64_t(0));
33533 UpperSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, AccStride);
33534 auto *LowerSplitInt = createComplexDeinterleavingIR(
33535 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
33536 auto *UpperSplitInt = createComplexDeinterleavingIR(
33537 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
33538
33539 auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
33540 LowerSplitInt, uint64_t(0));
33541 return B.CreateInsertVector(FullTy, Result, UpperSplitInt, AccStride);
33542 }
33543
33544 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
33545 if (IsScalable) {
33546 if (IsInt)
33547 return B.CreateIntrinsic(
33548 Intrinsic::aarch64_sve_cmla_x, Ty,
33549 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
33550
33551 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
33552 return B.CreateIntrinsic(
33553 Intrinsic::aarch64_sve_fcmla, Ty,
33554 {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
33555 }
33556
33557 Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
33558 Intrinsic::aarch64_neon_vcmla_rot90,
33559 Intrinsic::aarch64_neon_vcmla_rot180,
33560 Intrinsic::aarch64_neon_vcmla_rot270};
33561
33562
33563 return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
33564 {Accumulator, InputA, InputB});
33565 }
33566
33567 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
33568 if (IsScalable) {
33569 if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
33570 Rotation == ComplexDeinterleavingRotation::Rotation_270) {
33571 if (IsInt)
33572 return B.CreateIntrinsic(
33573 Intrinsic::aarch64_sve_cadd_x, Ty,
33574 {InputA, InputB, B.getInt32((int)Rotation * 90)});
33575
33576 auto *Mask = B.getAllOnesMask(Ty->getElementCount());
33577 return B.CreateIntrinsic(
33578 Intrinsic::aarch64_sve_fcadd, Ty,
33579 {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
33580 }
33581 return nullptr;
33582 }
33583
33584 Intrinsic::ID IntId = Intrinsic::not_intrinsic;
33585 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
33586 IntId = Intrinsic::aarch64_neon_vcadd_rot90;
33587 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
33588 IntId = Intrinsic::aarch64_neon_vcadd_rot270;
33589
33590 if (IntId == Intrinsic::not_intrinsic)
33591 return nullptr;
33592
33593 return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
33594 }
33595
33596 if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
33597 IsScalable) {
33598 return B.CreateIntrinsic(
33599 Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
33600 {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
33601 }
33602
33603 return nullptr;
33604}
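// Sketch of the rotation handling above: ComplexDeinterleavingRotation values
// 0..3 are multiplied by 90 to form the immediate expected by the SVE
// cmla_x/fcmla and cadd_x/fcadd intrinsics (e.g. Rotation_270 -> 270), while
// the NEON path selects a pre-rotated vcmla/vcadd intrinsic instead. Inputs
// wider than 128 bits are split in half, lowered recursively and reassembled
// with CreateInsertVector.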
33605
33606bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const {
33607 unsigned Opc = N->getOpcode();
33608 if (ISD::isExtOpcode(Opc)) {
33609 if (any_of(N->users(),
33610 [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; }))
33611 return false;
33612 }
33613 return true;
33614}
33615
33616 unsigned AArch64TargetLowering::getMinimumJumpTableEntries() const {
33617 return Subtarget->getMinimumJumpTableEntries();
33618}
33619
33620 MVT AArch64TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
33621 CallingConv::ID CC,
33622 EVT VT) const {
33623 bool NonUnitFixedLengthVector =
33625 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
33626 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
33627
33628 EVT VT1;
33629 MVT RegisterVT;
33630 unsigned NumIntermediates;
33631 getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1, NumIntermediates,
33632 RegisterVT);
33633 return RegisterVT;
33634}
33635
33636 unsigned AArch64TargetLowering::getNumRegistersForCallingConv(
33637 LLVMContext &Context, CallingConv::ID CC, EVT VT) const {
33638 bool NonUnitFixedLengthVector =
33640 if (!NonUnitFixedLengthVector || !Subtarget->useSVEForFixedLengthVectors())
33641 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
33642
33643 EVT VT1;
33644 MVT VT2;
33645 unsigned NumIntermediates;
33646 return getVectorTypeBreakdownForCallingConv(Context, CC, VT, VT1,
33647 NumIntermediates, VT2);
33648}
33649
33650 unsigned AArch64TargetLowering::getVectorTypeBreakdownForCallingConv(
33651 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
33652 unsigned &NumIntermediates, MVT &RegisterVT) const {
33653 unsigned NumRegs = TargetLowering::getVectorTypeBreakdownForCallingConv(
33654 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
33655 if (!RegisterVT.isFixedLengthVector() ||
33656 RegisterVT.getFixedSizeInBits() <= 128)
33657 return NumRegs;
33658
33659 assert(Subtarget->useSVEForFixedLengthVectors() && "Unexpected mode!");
33660 assert(IntermediateVT == RegisterVT && "Unexpected VT mismatch!");
33661 assert(RegisterVT.getFixedSizeInBits() % 128 == 0 && "Unexpected size!");
33662
33663 // A size mismatch here implies either type promotion or widening and would
33664 // have resulted in scalarisation if larger vectors had not been available.
33665 if (RegisterVT.getSizeInBits() * NumRegs != VT.getSizeInBits()) {
33666 EVT EltTy = VT.getVectorElementType();
33667 EVT NewVT = EVT::getVectorVT(Context, EltTy, ElementCount::getFixed(1));
33668 if (!isTypeLegal(NewVT))
33669 NewVT = EltTy;
33670
33671 IntermediateVT = NewVT;
33672 NumIntermediates = VT.getVectorNumElements();
33673 RegisterVT = getRegisterType(Context, NewVT);
33674 return NumIntermediates;
33675 }
33676
33677 // SVE VLS support does not introduce a new ABI so we should use NEON sized
33678 // types for vector arguments and returns.
33679
33680 unsigned NumSubRegs = RegisterVT.getFixedSizeInBits() / 128;
33681 NumIntermediates *= NumSubRegs;
33682 NumRegs *= NumSubRegs;
33683
33684 switch (RegisterVT.getVectorElementType().SimpleTy) {
33685 default:
33686 llvm_unreachable("unexpected element type for vector");
33687 case MVT::i8:
33688 IntermediateVT = RegisterVT = MVT::v16i8;
33689 break;
33690 case MVT::i16:
33691 IntermediateVT = RegisterVT = MVT::v8i16;
33692 break;
33693 case MVT::i32:
33694 IntermediateVT = RegisterVT = MVT::v4i32;
33695 break;
33696 case MVT::i64:
33697 IntermediateVT = RegisterVT = MVT::v2i64;
33698 break;
33699 case MVT::f16:
33700 IntermediateVT = RegisterVT = MVT::v8f16;
33701 break;
33702 case MVT::f32:
33703 IntermediateVT = RegisterVT = MVT::v4f32;
33704 break;
33705 case MVT::f64:
33706 IntermediateVT = RegisterVT = MVT::v2f64;
33707 break;
33708 case MVT::bf16:
33709 IntermediateVT = RegisterVT = MVT::v8bf16;
33710 break;
33711 }
33712
33713 return NumRegs;
33714}
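// Illustrative example of the breakdown above (the 512-bit width is an
// assumption): with 512-bit fixed-length SVE lowering, a v16i32 argument is
// first given the 512-bit register type v16i32, which the code above then
// splits into NumSubRegs = 512 / 128 = 4 copies of v4i32, so the value is
// passed in four NEON-sized registers and no new ABI is introduced.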
33715
33716 bool AArch64TargetLowering::hasInlineStackProbe(
33717 const MachineFunction &MF) const {
33718 return !Subtarget->isTargetWindows() &&
33719 MF.getInfo<AArch64FunctionInfo>()->hasStackProbing();
33720}
33721
33723 switch (Opc) {
33727 if (VT == MVT::v8i8 || VT == MVT::v4i16 || VT == MVT::v2i32)
33728 return true;
33729 }
33730
33732}
33733
33735 EVT VT) const {
33736 return Subtarget->hasCPA() && UseFEATCPACodegen;
33737}
33738
33739SDValue AArch64TargetLowering::LowerFCANONICALIZE(SDValue Op,
33740 SelectionDAG &DAG) const {
33741 SDLoc DL(Op);
33742 EVT VT = Op.getValueType();
33743 assert(VT.isVector() && "Expected vector type!");
33744
33745 SDValue In = Op.getOperand(0);
33746 SDValue Pg = getPredicateForVector(DAG, DL, VT);
33747
33748 // FMINNM follows IEEE754-2008 and will canonicalize a floating-point number.
33749
33750 if (VT.isScalableVector())
33751 return DAG.getNode(AArch64ISD::FMINNM_PRED, DL, VT, Pg, In, In);
33752
33754 "Expected to lower to SVE!");
33755 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
33756 In = convertToScalableVector(DAG, ContainerVT, In);
33757 In = DAG.getNode(AArch64ISD::FMINNM_PRED, DL, ContainerVT, Pg, In, In);
33758 return convertFromScalableVector(DAG, VT, In);
33759}
static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performBuildVectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets=true)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG)
static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc, SelectionDAG &DAG)
static SDValue isNVCastToHalfWidthElements(SDValue V)
static bool isHalvingTruncateAndConcatOfLegalIntScalableType(SDNode *N)
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps)
static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG)
static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG)
Perform the scalar expression combine in the form of: CSEL(c, 1, cc) + b => CSINC(b+c,...
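The combine relies on the scalar identity (cc ? c : 1) + b == (cc ? b + c : b + 1), and CSINC(t, f, cc) computes cc ? t : f + 1, so the add can be absorbed into the conditional increment. A hedged scalar model; csinc here is just an illustrative helper:

#include <cassert>
#include <cstdint>

// Scalar model of CSINC: select t when the condition holds, otherwise f + 1.
int64_t csinc(int64_t T, int64_t F, bool CC) { return CC ? T : F + 1; }

int main() {
  for (bool CC : {false, true})
    for (int64_t C : {-3, 0, 7})
      for (int64_t B : {-1, 5}) {
        int64_t Before = (CC ? C : 1) + B;   // CSEL(c, 1, cc) + b
        int64_t After = csinc(B + C, B, CC); // CSINC(b+c, b, cc)
        assert(Before == After);
      }
  return 0;
}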
static SDValue performCTLZCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD, const AArch64Subtarget &Subtarget)
Helper function to check if a small vector load can be optimized.
static std::optional< uint64_t > getConstantLaneNumOfExtractHalfOperand(SDValue &Op)
static void ReplaceATOMIC_LOAD_128Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &DL, SelectionDAG &DAG, SDValue Chain, bool IsSignaling)
static bool areLoadedOffsetButOtherwiseSame(SDValue Op0, SDValue Op1, SelectionDAG &DAG, unsigned &NumSubLoads)
static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG)
static bool isEssentiallyExtractHighSubvector(SDValue N)
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
static Value * createTblShuffleForSExt(IRBuilderBase &Builder, Value *Op, FixedVectorType *DstTy, bool IsLittleEndian)
static unsigned getExtFactor(SDValue &V)
getExtFactor - Determine the adjustment factor for the position when generating an "extract from vect...
static cl::opt< unsigned > MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden, cl::desc("Maximum of xors"))
static SDValue performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS=nullptr)
static SDValue performMULLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue performSubNegAndOneCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG)
static SDValue trySimplifySrlAddToRshrnb(SDValue Srl, SelectionDAG &DAG, const AArch64Subtarget *Subtarget)
static bool isLegalNTStore(Type *DataType, Align Alignment, const DataLayout &DL)
static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG)
static SDValue performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static SDValue performReinterpretCastCombine(SDNode *N)
SDValue ReconstructShuffleWithRuntimeMask(SDValue Op, SelectionDAG &DAG)
static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC)
Emit expression as a conjunction (a series of CCMP/CFCMP ops).
static void simplifySetCCIntoEq(ISD::CondCode &CC, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG, const SDLoc DL)
static SDValue tryCombineExtendRShTrunc(SDNode *N, SelectionDAG &DAG)
static bool isAllInactivePredicate(SDValue N)
static SDValue LowerNTStore(StoreSDNode *StoreNode, EVT VT, EVT MemVT, const SDLoc &DL, SelectionDAG &DAG)
static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT, SDLoc DL, SelectionDAG &DAG)
static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget)
static cl::opt< bool > EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden, cl::desc("Combine extends of AArch64 masked " "gather intrinsics"), cl::init(true))
static bool isZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of "vector_shuffle v,...
static SDValue performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG)
static cl::opt< bool > UseFEATCPACodegen("aarch64-use-featcpa-codegen", cl::Hidden, cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in " "SelectionDAG for FEAT_CPA"), cl::init(false))
static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth, unsigned NumElts, bool IsLittleEndian, SmallVectorImpl< int > &Mask)
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V)
static SDValue performAddCombineSubShift(SDNode *N, SDValue SUB, SDValue Z, SelectionDAG &DAG)
static SDValue performANDSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static const TargetRegisterClass * getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT)
static SDValue performAddSubCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue optimizeBrk(SDNode *N, SelectionDAG &DAG)
static SDValue performSubsToAndsCombine(SDNode *N, SDNode *SubsNode, SDNode *AndNode, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex, unsigned CC)
static std::pair< SDValue, SDValue > getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG)
#define FALKOR_STRIDED_ACCESS_MD
@ Generic
assert(UImm && (UImm != ~static_cast< T >(0)) && "Invalid immediate!")
static msgpack::DocNode getNode(msgpack::DocNode DN, msgpack::Type Type, MCValue Val)
static bool isConstant(const MachineInstr &MI)
constexpr LLT S1
constexpr LLT F32
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
@ Scaled
static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T)
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy)
@ Default
static bool isSigned(unsigned Opcode)
#define Check(C,...)
#define im(i)
const HexagonInstrInfo * TII
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition IVUsers.cpp:48
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
This file provides utility analysis objects describing memory locations.
#define T
This file defines ARC utility functions which are used by various parts of the compiler.
#define P(N)
static CodeModel::Model getCodeModel(const PPCSubtarget &S, const TargetMachine &TM, const MachineOperand &MO)
PowerPC Reduce CR logical Operation
static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static Type * getValueType(Value *V, bool LookThroughCmp=false)
Returns the "element type" of the given value/instruction V.
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
static LLVM_ATTRIBUTE_ALWAYS_INLINE MVT::SimpleValueType getSimpleVT(const uint8_t *MatcherTable, size_t &MatcherIndex)
getSimpleVT - Decode a value in MatcherTable; if it's a VBR encoded value, use GetVBR to decode it.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static const int BlockSize
Definition TarWriter.cpp:33
This pass exposes codegen information to IR-level passes.
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
The Input class is used to parse a yaml document into in-memory structs and vectors.
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
void setVarArgsStackOffset(unsigned Offset)
void setTailCallReservedStack(unsigned bytes)
SmallVectorImpl< ForwardedRegister > & getForwardedMustTailRegParms()
void setBytesInStackArgArea(unsigned bytes)
void setArgumentStackToRestore(unsigned bytes)
void setHasStreamingModeChanges(bool HasChanges)
bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const
const AArch64RegisterInfo * getRegisterInfo() const override
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getMaximumJumpTableSize() const
Align getPrefLoopAlignment() const
Align getPrefFunctionAlignment() const
unsigned getMaxBytesForLoopAlignment() const
bool supportsAddressTopByteIgnored() const
CPU has TBI (top byte of addresses is ignored during HW address translation) and OS enables it.
bool isStreamingCompatible() const
Returns true if the function has a streaming-compatible body.
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
bool useSVEForFixedLengthVectors() const
unsigned ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const
ClassifyGlobalReference - Find the target operand flags that describe how a global value should be re...
bool isStreaming() const
Returns true if the function has a streaming body.
unsigned getSVEVectorSizeInBits() const
bool isXRegisterReserved(size_t i) const
unsigned getMaxSVEVectorSizeInBits() const
bool isCallingConvWin64(CallingConv::ID CC, bool IsVarArg) const
unsigned getMinSVEVectorSizeInBits() const
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
bool isTruncateFree(Type *Ty1, Type *Ty2) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
SDValue changeStreamingMode(SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue, unsigned Condition, bool InsertVectorLengthCheck=false) const
If a change in streaming mode is required on entry to/return from a function call it emits and return...
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
Value * getIRStackGuard(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const override
If the target has a standard location for the stack protector cookie, returns the address of that loc...
Value * getSafeStackPointerLocation(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const override
If the target has a standard location for the unsafe stack pointer, returns the address of that locat...
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
Return true if the given shuffle mask can be codegen'd directly, or if it should be stack expanded.
unsigned getVaListSizeInBits(const DataLayout &DL) const override
Returns the size of the platform's va_list object.
MachineBasicBlock * EmitZAInstr(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const override
Return the preferred common base offset.
bool shouldLowerReductionToSVE(SDValue RdxOp, std::optional< Intrinsic::ID > &PairwiseOpIID) const
Returns true if RdxOp should be lowered to an SVE reduction.
bool shouldExpandCttzElements(EVT VT) const override
Return true if the @llvm.experimental.cttz.elts intrinsic should be expanded using generic code in Se...
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a stN intrinsic.
MachineBasicBlock * EmitTileLoad(unsigned Opc, unsigned BaseReg, MachineInstr &MI, MachineBasicBlock *BB) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
Provide custom lowering hooks for some operations.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
bool shouldOptimizeMulOverflowWithZeroHighBits(LLVMContext &Context, EVT VT) const override
bool shouldInsertTrailingSeqCstFenceForAtomicStore(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert a seq_cst trailing fence without reducing the or...
bool isIntDivCheap(EVT VT, AttributeList Attr) const override
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool shouldRemoveRedundantExtend(SDValue Op) const override
Return true (the default) if it is profitable to remove a sext_inreg(x) where the sext is redundant,...
bool shallExtractConstSplatVectorElementToStore(Type *VectorTy, unsigned ElemSizeInBits, unsigned &Index) const override
Return true if the target shall perform extract vector element and store given that the vector is kno...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC) const
Selects the correct CCAssignFn for a given CallingConvention value.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ISD::SETCC ValueType.
bool optimizeExtendOrTruncateConversion(Instruction *I, Loop *L, const TargetTransformInfo &TTI) const override
Try to optimize extending or truncating conversion instructions (like zext, trunc,...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const
Selects the correct CCAssignFn for a given CallingConvention value.
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override
Lower a deinterleave intrinsic to a target specific load intrinsic.
bool findOptimalMemOpLowering(LLVMContext &Context, std::vector< EVT > &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, unsigned SrcAS, const AttributeList &FuncAttributes, EVT *LargestVT=nullptr) const override
Determines the optimal series of memory ops to replace the memset / memcpy.
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool hasInlineStackProbe(const MachineFunction &MF) const override
True if stack clash protection is enabled for this function.
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isLegalICmpImmediate(int64_t) const override
Return true if the specified immediate is a legal icmp immediate, that is the target has icmp instructi...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
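The equivalence is the two's-complement identity y - (x ^ -1) == y - ~x == y + x + 1 == (x + 1) + y. A small sketch using unsigned arithmetic so the wraparound stays well-defined:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0xFFFFFFFFu, 12345u})
    for (uint32_t Y : {0u, 7u, 0x80000000u}) {
      uint32_t SubForm = Y - (X ^ 0xFFFFFFFFu); // sub y, (xor x, -1)
      uint32_t AddForm = (X + 1u) + Y;          // add (add x, 1), y
      assert(SubForm == AddForm);
    }
  return 0;
}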
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool isOpSuitableForLSE128(const Instruction *I) const
void fixupPtrauthDiscriminator(MachineInstr &MI, MachineBasicBlock *BB, MachineOperand &IntDiscOp, MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const
Replace (0, vreg) discriminator components with the operands of blend or with (immediate,...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a ldN intrinsic.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool fallBackToDAGISel(const Instruction &Inst) const override
bool isTypeDesirableForOp(unsigned Opc, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
bool isLegalAddScalableImmediate(int64_t) const override
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const override
Returns true if the target allows unaligned memory accesses of the specified type.
MachineBasicBlock * EmitCheckMatchingVL(MachineInstr &MI, MachineBasicBlock *MBB) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
For some targets, an LLVM struct type must be broken down into multiple simple types,...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
MachineBasicBlock * EmitLoweredCatchRet(MachineInstr &MI, MachineBasicBlock *BB) const
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
bool isZExtFree(Type *Ty1, Type *Ty2) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const override
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const
MachineBasicBlock * EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
bool isProfitableToHoist(Instruction *I) const override
Check if it is profitable to hoist instruction in then/else to if.
bool isOpSuitableForRCPC3(const Instruction *I) const
MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const override
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
MachineBasicBlock * EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const
MachineBasicBlock * EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldExpandVectorMatch(EVT VT, unsigned SearchSize) const override
Return true if the @llvm.experimental.vector.match intrinsic should be expanded for vector type ‘VT’ ...
MachineBasicBlock * EmitEntryPStateSM(MachineInstr &MI, MachineBasicBlock *BB) const
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
Control the following reassociation of operands: (op (op x, c1), y) -> (op (op x, y),...
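The reassociation being controlled moves the constant to the outer operation, e.g. (x + c1) + y becoming (x + y) + c1 for integer add; the hook only decides whether doing so is profitable. A trivial sketch of the underlying identity with wrapping unsigned add:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0x7F;
  for (uint32_t X : {0u, 3u, 0xFFFFFFF0u})
    for (uint32_t Y : {1u, 0x80000000u})
      assert((X + C1) + Y == (X + Y) + C1); // associativity/commutativity
  return 0;
}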
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
In AArch64, true if FEAT_CPA is present.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
MachineBasicBlock * EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const
LLT getOptimalMemOpLLT(const MemOp &Op, const AttributeList &FuncAttributes) const override
LLT returning variant.
bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const override
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
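The hook weighs hoisting the constant out of the shift; the underlying bit-level fact is that, for logical shifts within one width, testing X against a shifted constant is the same as shifting X the opposite way and testing against the original constant. A brute-force check of that identity over 8-bit values, illustrative only:

#include <cassert>

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 256; ++C)
      for (unsigned Y = 0; Y < 8; ++Y) {
        bool ShiftConst = (X & ((C >> Y) & 0xFFu)) != 0; // X & (C lshr Y)
        bool ShiftX = (((X << Y) & 0xFFu) & C) != 0;     // (X shl Y) & C
        assert(ShiftConst == ShiftX);
      }
  return 0;
}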
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool needsFixedCatchObjects() const override
Used for exception handling on Win64.
const AArch64TargetMachine & getTM() const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool generateFMAsInMachineCombiner(EVT VT, CodeGenOptLevel OptLevel) const override
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool isOpSuitableForLDPSTP(const Instruction *I) const
AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI)
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override
Return true if the target supplies and combines to a paired load two loaded values of type LoadedType...
bool isLegalAddImmediate(int64_t) const override
Return true if the specified immediate is a legal add immediate, that is the target has add instruction...
bool shouldConsiderGEPOffsetSplit() const override
bool isVectorClearMaskLegal(ArrayRef< int > M, EVT VT) const override
Similar to isShuffleMaskLegal.
const MCPhysReg * getScratchRegisters(CallingConv::ID CC) const override
Returns a 0 terminated array of registers that can be safely used as scratch registers.
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
This method returns a target specific FastISel object, or null if the target does not support "fast" ...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
bool isFPImmLegalAsFMov(const APFloat &Imm, EVT VT) const
bool lowerInterleaveIntrinsicToStore(Instruction *Store, Value *Mask, ArrayRef< Value * > InterleaveValues) const override
Lower an interleave intrinsic to a target specific store intrinsic.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) const
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Returns false if N is a bit extraction pattern of (X >> C) & Mask.
bool enableAggressiveFMAFusion(EVT VT) const override
Enable aggressive FMA fusion on targets that want it.
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
MachineBasicBlock * EmitDynamicProbedAlloc(MachineInstr &MI, MachineBasicBlock *MBB) const
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldExpandGetActiveLaneMask(EVT VT, EVT OpVT) const override
Return true if the @llvm.get.active.lane.mask intrinsic should be expanded using generic code in Sele...
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
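The transform is plain distributivity, (x + c1) * c2 == x * c2 + c1 * c2, with c1 * c2 folding to a fresh constant; the hook only decides whether materializing that constant pays off. A minimal check with wrapping unsigned arithmetic:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t C1 = 0x1234, C2 = 37;
  for (uint64_t X : {0ull, 1ull, 0xDEADBEEFull, ~0ull})
    assert((X + C1) * C2 == X * C2 + C1 * C2);
  return 0;
}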
bool useSVEForFixedLengthVectorVT(EVT VT, bool OverrideNEON=false) const
bool mergeStoresAfterLegalization(EVT VT) const override
SVE code generation for fixed length vectors does not custom lower BUILD_VECTOR.
static LLVM_ABI unsigned int semanticsPrecision(const fltSemantics &)
Definition APFloat.cpp:214
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:645
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
static APInt getSignMask(unsigned BitWidth)
Get the SignMask for a specific bit width.
Definition APInt.h:230
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
Definition APInt.cpp:1942
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:640
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1076
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
void setBit(unsigned BitPosition)
Set the bit at the position given by BitPosition to 1.
Definition APInt.h:1353
APInt abs() const
Get the absolute value.
Definition APInt.h:1818
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
LLVM_ABI APInt sadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1980
bool sle(const APInt &RHS) const
Signed less or equal comparison.
Definition APInt.h:1173
LLVM_ABI APInt uadd_ov(const APInt &RHS, bool &Overflow) const
Definition APInt.cpp:1987
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:472
bool isMask(unsigned numBits) const
Definition APInt.h:489
bool isNonNegative() const
Determine if this APInt Value is non-negative (>= 0)
Definition APInt.h:335
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:1028
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
an instruction to allocate memory on the stack
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
bool isFloatingPointOperation() const
BinOp getOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
const BlockAddress * getBlockAddress() const
Function * getFunction() const
Definition Constants.h:1107
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI ConstantFPSDNode * getConstantFPSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant FP or null if this is not a constant FP splat.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI ConstantSDNode * getConstantSplatNode(const APInt &DemandedElts, BitVector *UndefElements=nullptr) const
Returns the demanded splatted constant or null if this is not a constant splat.
LLVM_ABI std::optional< std::pair< APInt, APInt > > isArithmeticSequence() const
If this BuildVector is constant and represents an arithmetic sequence "<a, a+n, a+2n,...
LLVM_ABI bool isConstant() const
CCState - This class holds information needed while lowering arguments and return values.
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
bool isZero() const
Return true if the value is positive or negative zero.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:217
bool isBigEndian() const
Definition DataLayout.h:218
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:205
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
Tagged union holding either a T or an Error.
Definition Error.h:485
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
static FixedVectorType * getInteger(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
Definition Function.h:714
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:711
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
Constant * getPersonalityFn() const
Get the personality function associated with this function.
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
arg_iterator arg_end()
Definition Function.h:877
arg_iterator arg_begin()
Definition Function.h:868
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
const Argument * const_arg_iterator
Definition Function.h:74
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:728
const GlobalValue * getGlobal() const
bool isThreadLocal() const
If the value is "Thread Local", its value isn't shared by the threads.
bool hasExternalWeakLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
Type * getValueType() const
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
Value * CreateConstGEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name="")
Definition IRBuilder.h:2023
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2289
BasicBlock * GetInsertBlock() const
Definition IRBuilder.h:201
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2553
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:629
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:576
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
static LLT integer(unsigned SizeInBits)
static LLT floatIEEE(unsigned SizeInBits)
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
LLVM_ABI CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
LLVM_ABI RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
bool is128BitVector() const
Return true if this is a 128-bit vector type.
@ INVALID_SIMPLE_VALUE_TYPE
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
MVT changeVectorElementType(MVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool isScalableVT() const
Return true if the type is a scalable type.
static auto all_valuetypes()
SimpleValueType Iteration.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto scalable_vector_valuetypes()
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isFixedLengthVector() const
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getHalfNumVectorElementsVT() const
Return a VT for a vector type with the same element type but half the number of elements.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
bool is64BitVector() const
Return true if this is a 64-bit vector type.
static auto fp_fixedlen_vector_valuetypes()
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
SSPLayoutKind getObjectSSPLayout(int ObjectIdx) const
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
@ SSPLK_None
Did not trigger a stack protector.
void setFrameAddressIsTaken(bool T)
bool hasScalableStackID(int ObjectIdx) const
bool isImmutableObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to an immutable object.
int getStackProtectorIndex() const
Return the index for the stack protector object.
void setStackID(int ObjectIdx, uint8_t ID)
void setHasTailCall(bool V=true)
bool hasMustTailInVarArgFunc() const
Returns true if the function is variadic and contains a musttail call.
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
bool hasStackProtectorIndex() const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
void setObjectAlignment(int ObjectIdx, Align Alignment)
setObjectAlignment - Change the alignment of the specified stack object.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineInstr - Allocate a new MachineInstr.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
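A minimal sketch, assuming a hypothetical custom-inserter context, of the block-manipulation calls listed above (CreateMachineBasicBlock, insert, splice).

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <iterator>
using namespace llvm;

// Hypothetical fragment: create a fresh block after MBB and move everything
// from SplitPt to the end of MBB into it.
static MachineBasicBlock *splitBlockAfter(MachineFunction &MF,
                                          MachineBasicBlock *MBB,
                                          MachineBasicBlock::iterator SplitPt) {
  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
  MF.insert(std::next(MBB->getIterator()), NewMBB);
  NewMBB->splice(NewMBB->begin(), MBB, SplitPt, MBB->end());
  return NewMBB;
}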
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
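A minimal sketch of the MachineInstrBuilder chaining style shown above; TargetOpc, DestReg and SrcReg are placeholders, and the zero immediate is only illustrative.

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Hypothetical fragment: emit "DestReg = TargetOpc SrcReg, #0" immediately
// before MI in its parent block.
static void emitMoveLikeInstr(MachineInstr &MI, unsigned TargetOpc,
                              Register DestReg, Register SrcReg,
                              const TargetInstrInfo *TII) {
  MachineBasicBlock &MBB = *MI.getParent();
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(TargetOpc), DestReg)
      .addReg(SrcReg) // register use
      .addImm(0);     // immediate operand
}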
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
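A minimal sketch of allocating a MachineMemOperand through the getMachineMemOperand overload listed above; the 8-byte scalar type and alignment are assumptions chosen for illustration.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

// Hypothetical fragment: describe a plain, dereferenceable 8-byte load so a
// target node carries accurate aliasing and ordering information.
static MachineMemOperand *makeLoadMMO(MachineFunction &MF,
                                      MachinePointerInfo PtrInfo) {
  return MF.getMachineMemOperand(PtrInfo,
                                 MachineMemOperand::MOLoad |
                                     MachineMemOperand::MODereferenceable,
                                 LLT::scalar(64), Align(8));
}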
MachineOperand class - Representation of each machine instruction operand.
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool hasAtMostUserInstrs(Register Reg, unsigned MaxUsers) const
hasAtMostUserInstrs - Return true if the given register has at most MaxUsers non-debug user instructions.
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
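A minimal sketch using the MachineRegisterInfo queries above; the reuse policy is purely illustrative.

#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Hypothetical helper: reuse MaybeDead if nothing reads it any more,
// otherwise allocate a fresh virtual register of class RC.
static Register getScratchVReg(MachineRegisterInfo &MRI,
                               const TargetRegisterClass *RC,
                               Register MaybeDead) {
  if (MaybeDead.isValid() && MRI.use_empty(MaybeDead))
    return MaybeDead;
  return MRI.createVirtualRegister(RC);
}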
An SDNode that represents everything that will be needed to construct a MachineInstr.
size_type size() const
Definition MapVector.h:58
const SDValue & getPassThru() const
ISD::LoadExtType getExtensionType() const
This is a base class used to represent MGATHER and MSCATTER nodes.
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
How is Index applied to BasePtr when computing addresses.
const SDValue & getInc() const
const SDValue & getScale() const
const SDValue & getMask() const
const SDValue & getIntID() const
const SDValue & getIndex() const
const SDValue & getBasePtr() const
ISD::MemIndexType getIndexType() const
This class is used to represent an MLOAD node.
const SDValue & getBasePtr() const
ISD::LoadExtType getExtensionType() const
const SDValue & getMask() const
const SDValue & getPassThru() const
const SDValue & getOffset() const
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
ISD::MemIndexedMode getAddressingMode() const
Return the addressing mode for this load or store: unindexed, pre-inc, pre-dec, post-inc,...
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
This class is used to represent an MSTORE node.
const SDValue & getOffset() const
const SDValue & getBasePtr() const
const SDValue & getMask() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
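A minimal sketch of the kind of predicate a DAG combine might build from the MLOAD accessors listed above; the exact profitability conditions are assumptions.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical predicate: only consider unindexed, non-extending masked loads
// whose pass-through value is unused (undef).
static bool isSimpleMaskedLoad(SDNode *N) {
  auto *MLD = dyn_cast<MaskedLoadSDNode>(N);
  return MLD && MLD->isUnindexed() &&
         MLD->getExtensionType() == ISD::NON_EXTLOAD &&
         MLD->getPassThru().isUndef();
}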
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
AtomicOrdering getSuccessOrdering() const
Return the atomic ordering requirements for this memory operation.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getBasePtr() const
const MachinePointerInfo & getPointerInfo() const
AtomicOrdering getMergedOrdering() const
Return a single atomic ordering that is at least as strong as both the success and failure orderings ...
const SDValue & getChain() const
bool isNonTemporal() const
bool isAtomic() const
Return true if the memory operation ordering is Unordered or higher.
EVT getMemoryVT() const
Return the type of the in-memory value.
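A minimal sketch of a guard built from the MemSDNode queries above; the i64 type and the 8-byte alignment threshold are arbitrary example values.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical guard: only rewrite simple (non-atomic, non-volatile) accesses
// that are not non-temporal, whose in-memory type is i64, and which are at
// least 8-byte aligned.
static bool isPlainAligned64BitAccess(const MemSDNode *MemN) {
  return MemN->isSimple() && !MemN->isNonTemporal() &&
         MemN->getMemoryVT() == MVT::i64 && MemN->getAlign().value() >= 8;
}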
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
bool getRtLibUseGOT() const
Returns true if PLT should be avoided for RTLib calls.
Definition Module.cpp:722
Metadata * getModuleFlag(StringRef Key) const
Return the corresponding value if Key appears in module flags, otherwise return null.
Definition Module.cpp:358
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
bool isStrictFPOpcode()
Test if this node is a strict floating point pseudo-op.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< value_op_iterator > op_values() const
void dropFlags(unsigned Mask)
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
size_t use_size() const
Return the number of uses of this node.
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAssert() const
Test if this node is an assert operation.
op_iterator op_begin() const
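A minimal sketch using the SDNode user-iteration helpers above; restricting users to plain stores is just an example condition.

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Hypothetical query: true when N has at least one user and every user is a
// plain ISD::STORE node.
static bool onlyUsedByStores(SDNode *N) {
  if (N->use_size() == 0)
    return false;
  for (SDNode *User : N->users())
    if (User->getOpcode() != ISD::STORE)
      return false;
  return true;
}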
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
void setNode(SDNode *N)
set the SDNode
unsigned getOpcode() const
unsigned getNumOperands() const
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasStreamingInterface() const
bool hasStreamingCompatibleInterface() const
bool hasNonStreamingInterface() const
bool hasStreamingBody() const
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
Class to represent scalable SIMD vectors.
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:895
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC)
LLVM_ABI Align getReducedAlign(EVT VT, bool UseABI)
In most cases this function returns the ABI alignment for a given type, except for illegal vector typ...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
LLVM_ABI SDValue getMaskedGather(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, ISD::LoadExtType ExtTy)
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue makeEquivalentMemoryOrdering(SDValue OldChain, SDValue NewMemOpChain)
If an existing load has uses of its chain, create a token factor node with that chain and the new mem...
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue getJumpTableDebugInfo(int JTI, SDValue Chain, const SDLoc &DL)
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm)
Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getExtractSubvector(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Return the VT typed sub-vector of Vec at Idx.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool canIgnoreSignBitOfZero(const SDUse &Use) const
Check if a use of a float value is insensitive to signed zeros.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getInsertSubvector(const SDLoc &DL, SDValue Vec, SDValue SubVec, unsigned Idx)
Insert SubVec at the Idx element of Vec.
LLVM_ABI SDValue getStepVector(const SDLoc &DL, EVT ResVT, const APInt &StepVal)
Returns a vector of type ResVT whose elements contain the linear sequence <0, Step,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
LLVM_ABI bool shouldOptForSize() const
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI SDValue getIdentityElement(unsigned Opcode, const SDLoc &DL, EVT VT, SDNodeFlags Flags)
Get the (commutative) identity element for the given opcode, if it exists.
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI bool isSplatValue(SDValue V, const APInt &DemandedElts, APInt &UndefElts, unsigned Depth=0) const
Test whether V has a splatted value for all the demanded elements.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI bool doesNodeExist(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops)
Check if a node exists without modifying its flags.
const SelectionDAGTargetInfo & getSelectionDAGInfo() const
LLVM_ABI bool areNonVolatileConsecutiveLoads(LoadSDNode *LD, LoadSDNode *Base, unsigned Bytes, int Dist) const
Return true if loads are next to each other and can be merged.
LLVM_ABI SDValue getMaskedHistogram(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType)
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
LLVM_ABI SDValue getGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, bool isTargetGA=false, unsigned TargetFlags=0)
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getTypeSize(const SDLoc &DL, EVT VT, TypeSize TS)
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT, EVT OpVT)
Convert Op, which must be of integer type, to the integer type VT, by using an extension appropriate ...
LLVM_ABI SDValue getMaskedStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Base, SDValue Offset, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, bool IsTruncating=false, bool IsCompressing=false)
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
void addCalledGlobal(const SDNode *Node, const GlobalValue *GV, unsigned OpFlags)
Set CalledGlobal to be associated with Node.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getDeactivationSymbol(const GlobalValue *GV)
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand, SDValue Subreg)
A convenience function for creating TargetInstrInfo::INSERT_SUBREG nodes.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
SDValue getSplat(EVT VT, const SDLoc &DL, SDValue Op)
Returns a node representing a splat of one value into all lanes of the provided vector type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getMaskedScatter(SDVTList VTs, EVT MemVT, const SDLoc &dl, ArrayRef< SDValue > Ops, MachineMemOperand *MMO, ISD::MemIndexType IndexType, bool IsTruncating=false)
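A minimal sketch of typical node construction with the SelectionDAG builders listed above; the operation and types are illustrative, and the helper is hypothetical rather than an existing lowering routine.

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Hypothetical lowering fragment: compute (A + B) & 0xFF as explicit nodes
// and splat the result across a v2i64 BUILD_VECTOR.
static SDValue buildMaskedSumSplat(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue A, SDValue B) {
  EVT VT = MVT::i64;
  SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, A, B);
  SDValue Masked = DAG.getNode(ISD::AND, DL, VT, Sum,
                               DAG.getConstant(0xFF, DL, VT));
  return DAG.getSplatBuildVector(MVT::v2i64, DL, Masked);
}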
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI bool isSingleSourceMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
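A minimal sketch combining the static shuffle-mask classifiers listed above; treating single-source and reverse masks as "trivial" is an assumption made only for this example.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical classifier: a mask is "trivial" when it reads a single source
// vector or is a full reverse of one source.
static bool isTrivialShuffleMask(ArrayRef<int> Mask, int NumSrcElts) {
  return ShuffleVectorInst::isSingleSourceMask(Mask, NumSrcElts) ||
         ShuffleVectorInst::isReverseMask(Mask, NumSrcElts);
}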
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
pointer data()
Return a pointer to the vector's buffer, even if empty().
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
bool getAsInteger(unsigned Radix, T &Result) const
Parse the current string as an integer of the specified radix.
Definition StringRef.h:490
constexpr StringRef substr(size_t Start, size_t N=npos) const
Return a reference to the substring from [Start, Start + N).
Definition StringRef.h:591
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition StringRef.h:258
StringRef drop_front(size_t N=1) const
Return a StringRef equal to 'this' but with the first N elements dropped.
Definition StringRef.h:629
StringRef slice(size_t Start, size_t End) const
Return a reference to the substring from [Start, End).
Definition StringRef.h:714
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
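A minimal sketch of the StringRef parsing helpers above; the "Xyz" prefix and the helper itself are made up for this example.

#include "llvm/ADT/StringRef.h"
using namespace llvm;

// Hypothetical parser: accept strings shaped like "Xyz123" and return the
// trailing number in Index.
static bool parseIndexedConstraint(StringRef S, unsigned &Index) {
  if (!S.starts_with("Xyz"))
    return false;
  // getAsInteger returns true on failure, so invert it here.
  return !S.drop_front(3).getAsInteger(/*Radix=*/10, Index);
}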
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
EVT getMemValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool shouldExpandBuildVectorWithShuffles(EVT, unsigned DefinedValues) const
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
MachineBasicBlock * emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const
Replace/modify any TargetFrameIndex operands with a target-dependent sequence of memory operands that...
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
virtual Value * getIRStackGuard(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const
If the target has a standard location for the stack protector guard, returns the address of that loca...
virtual bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const
Check whether or not MI needs to be moved close to its uses.
void setMaximumJumpTableSize(unsigned)
Indicate the maximum number of entries in jump tables.
virtual unsigned getMinimumJumpTableEntries() const
Return lower limit for number of blocks in a jump table.
const TargetMachine & getTargetMachine() const
unsigned MaxLoadsPerMemcmp
Specify maximum number of load instructions per memcmp call.
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setMaxBytesForAlignment(unsigned MaxBytes)
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
const LibcallLoweringInfo & getLibcallLoweringInfo() const
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
unsigned getMaximumJumpTableSize() const
Return upper limit for number of entries in a jump table.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
virtual bool shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, unsigned OldShiftOpcode, unsigned NewShiftOpcode, SelectionDAG &DAG) const
Given the pattern (X & (C l>>/<< Y)) ==/!= 0 return true if it should be transformed into: ((X <</l>>...
void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action)
Indicate how a PARTIAL_REDUCE_U/SMLA node with Acc type AccVT and Input type InputVT should be treate...
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
unsigned MaxLoadsPerMemcmpOptSize
Likewise for functions with the OptSize attribute.
virtual bool isBinOp(unsigned Opcode) const
Return true if the node is a math/logic binary operator.
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
virtual Value * getSafeStackPointerLocation(IRBuilderBase &IRB, const LibcallLoweringInfo &Libcalls) const
Returns the target-specific address of the unsafe stack pointer.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
virtual EVT getAsmOperandValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
MVT getFrameIndexTy(const DataLayout &DL) const
Return the type for frame index, which is determined by the alloca address space specified through th...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
bool isLoadLegalOrCustom(EVT ValVT, EVT MemVT, Align Alignment, unsigned AddrSpace, unsigned ExtType, bool Atomic) const
Return true if the specified load with extension is legal or custom on this target.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
virtual bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
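A minimal sketch, using a hypothetical TargetLowering subclass, of the constructor-time configuration calls listed above; the chosen types and legalize actions are illustrative only.

#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

namespace {
// Hypothetical subclass shown only to illustrate the usual setup sequence.
class ExampleTargetLowering : public TargetLowering {
public:
  ExampleTargetLowering(const TargetMachine &TM,
                        const TargetRegisterClass *VecRC,
                        const TargetRegisterInfo *TRI)
      : TargetLowering(TM) {
    addRegisterClass(MVT::v4i32, VecRC);                // v4i32 is legal
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);   // custom lowering
    setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand); // no truncating store
    computeRegisterProperties(TRI);                     // derive the rest
  }
};
} // namespace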
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl< SDNode * > &Created) const
Build sdiv by power-of-2 with conditional move instructions Ref: "Hacker's Delight" by Henry Warren 1...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual bool isTargetCanonicalConstantNode(SDValue Op) const
Returns true if the given Opc is considered a canonical constant for the target, which should not be ...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual bool findOptimalMemOpLowering(LLVMContext &Context, std::vector< EVT > &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, unsigned SrcAS, const AttributeList &FuncAttributes, EVT *LargestVT=nullptr) const
Determines the optimal series of memory ops to replace the memset / memcpy.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
virtual bool useLoadStackGuardNode(const Module &M) const
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
virtual unsigned combineRepeatedFPDivisors() const
Indicate whether this target prefers to combine FDIVs with the same divisor.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
void expandShiftParts(SDNode *N, SDValue &Lo, SDValue &Hi, SelectionDAG &DAG) const
Expand shift-by-parts.
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
unsigned getPointerSize(unsigned AS) const
Get the pointer size for this target.
TargetOptions Options
CodeModel::Model getCodeModel() const
Returns the code model.
unsigned TLSSize
Bit size of immediate TLS offsets (0 == use the default).
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
LLVM_ABI InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TargetCostKind CostKind) const
Return the expected cost of materializing for the given integer immediate of the specified type.
@ TCC_Free
Expected to fold away in lowering.
Target - Wrapper for Target specific information.
This class represents a truncation of integer types.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:290
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:65
static LLVM_ABI IntegerType * getInt128Ty(LLVMContext &C)
Definition Type.cpp:315
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
@ HalfTyID
16-bit floating point type
Definition Type.h:57
@ FloatTyID
32-bit floating point type
Definition Type.h:59
@ BFloatTyID
16-bit floating point type (7-bit significand)
Definition Type.h:58
@ DoubleTyID
64-bit floating point type
Definition Type.h:60
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
TypeID getTypeID() const
Return the type id for the type.
Definition Type.h:138
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
static LLVM_ABI Type * getDoubleTy(LLVMContext &C)
Definition Type.cpp:291
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:290
static LLVM_ABI Type * getBFloatTy(LLVMContext &C)
Definition Type.cpp:289
static LLVM_ABI Type * getHalfTy(LLVMContext &C)
Definition Type.cpp:288
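A minimal sketch using the Type factory methods and queries above; the widening rule is an assumption made only for this example.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Hypothetical helper: double the width of integer scalars and promote half
// to float, leaving every other type untouched.
static Type *widenScalarType(Type *Ty, LLVMContext &Ctx) {
  if (auto *ITy = dyn_cast<IntegerType>(Ty))
    return Type::getIntNTy(Ctx, 2 * ITy->getBitWidth());
  if (Ty->getTypeID() == Type::HalfTyID)
    return Type::getFloatTy(Ctx);
  return Ty;
}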
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
This class is used to represent EVT's, which are used to parameterize some operations.
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:737
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void dump() const
Support for debugging, callable in GDB: V->dump()
Base class of all SIMD vector types.
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static VectorType * getTruncatedElementVectorType(VectorType *VTy)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
A range adaptor for a pair of iterators.
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isValidCBCond(AArch64CC::CondCode Code)
True if a given condition code can be used in a fused compare-and-branch instruction,...
static CondCode getInvertedCondCode(CondCode Code)
static unsigned getNZCVToSatisfyCondCode(CondCode Code)
Given a condition code, return NZCV flags that would satisfy that condition.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_NC
MO_NC - Indicates whether the linker is expected to check the symbol reference for overflow.
@ MO_G1
MO_G1 - A symbol operand with this flag (granule 1) represents the bits 16-31 of a 64-bit address,...
@ MO_PAGEOFF
MO_PAGEOFF - A symbol operand with this flag represents the offset of that symbol within a 4K page.
@ MO_GOT
MO_GOT - This flag indicates that a symbol operand represents the address of the GOT entry for the sy...
@ MO_G0
MO_G0 - A symbol operand with this flag (granule 0) represents the bits 0-15 of a 64-bit address,...
@ MO_PAGE
MO_PAGE - A symbol operand with this flag represents the pc-relative offset of the 4K page containing...
@ MO_HI12
MO_HI12 - This flag indicates that a symbol operand represents the bits 13-24 of a 64-bit address,...
@ MO_TLS
MO_TLS - Indicates that the operand being accessed is some kind of thread-local symbol.
@ MO_G2
MO_G2 - A symbol operand with this flag (granule 2) represents the bits 32-47 of a 64-bit address,...
@ MO_G3
MO_G3 - A symbol operand with this flag (granule 3) represents the high 16-bits of a 64-bit address,...
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static unsigned getShiftValue(unsigned Imm)
getShiftValue - Extract the shift value.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
static uint8_t encodeAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType9(uint64_t Imm)
static bool isAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType5(uint64_t Imm)
static int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
static uint8_t encodeAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType12(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType10(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType9(uint64_t Imm)
static uint64_t encodeLogicalImmediate(uint64_t imm, unsigned regSize)
encodeLogicalImmediate - Return the encoded immediate value for a logical immediate instruction of th...
static bool isAdvSIMDModImmType7(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType5(uint64_t Imm)
static int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
static bool isAdvSIMDModImmType10(uint64_t Imm)
static int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
static uint64_t decodeAdvSIMDModImmType10(uint8_t Imm)
static uint8_t encodeAdvSIMDModImmType8(uint64_t Imm)
static bool isAdvSIMDModImmType12(uint64_t Imm)
static bool isSVELogicalImm(unsigned SizeInBits, uint64_t ImmVal, uint64_t &Encoding)
static uint8_t encodeAdvSIMDModImmType11(uint64_t Imm)
static bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm, int32_t &Shift)
static bool isAdvSIMDModImmType11(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType6(uint64_t Imm)
static bool isAdvSIMDModImmType8(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType4(uint64_t Imm)
static bool isAdvSIMDModImmType6(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType1(uint64_t Imm)
static uint8_t encodeAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType2(uint64_t Imm)
static bool isAdvSIMDModImmType3(uint64_t Imm)
static bool isAdvSIMDModImmType1(uint64_t Imm)
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
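A minimal sketch around the logical-immediate helpers listed above; the include path follows the in-tree AArch64 backend convention, and the wrapper itself is hypothetical.

#include "MCTargetDesc/AArch64AddressingModes.h"
#include <cstdint>
#include <optional>
using namespace llvm;

// Hypothetical wrapper: return the AArch64 logical-immediate encoding of Imm
// for a 64-bit register, or std::nullopt if it cannot be encoded.
static std::optional<uint64_t> tryEncodeLogicalImm64(uint64_t Imm) {
  if (!AArch64_AM::isLogicalImmediate(Imm, /*regSize=*/64))
    return std::nullopt;
  return AArch64_AM::encodeLogicalImmediate(Imm, /*regSize=*/64);
}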
ArrayRef< MCPhysReg > getFPRArgRegs()
static constexpr unsigned SVEMaxBitsPerVector
const unsigned RoundingBitsPos
const uint64_t ReservedFPControlBits
static constexpr unsigned SVEBitsPerBlock
int32_t getSMEPseudoMap(uint32_t Opcode)
ArrayRef< MCPhysReg > getGPRArgRegs()
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ ARM64EC_Thunk_Native
Calling convention used in the ARM64EC ABI to implement calls between ARM64 code and thunks.
@ AArch64_VectorCall
Used between AArch64 Advanced SIMD functions.
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ AArch64_SVE_VectorCall
Used between AArch64 SVE functions.
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2
Preserve X2-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0
Preserve X0-X13, X19-X29, SP, Z0-Z31, P0-P15.
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ PreserveNone
Used for runtime calls that preserve no general registers.
Definition CallingConv.h:90
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ Win64
The C convention as implemented on Windows/x86-64 and AArch64.
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ GRAAL
Used by GraalVM. Two additional registers are reserved.
@ ARM64EC_Thunk_X64
Calling convention used in the ARM64EC ABI to implement calls between x64 code and thunks.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isConstantSplatVectorAllOnes(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are ~0 ...
bool isNormalMaskedLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked load.
bool isNormalMaskedStore(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed masked store.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236
@ PARTIAL_REDUCE_SMLA
PARTIAL_REDUCE_[U|S]MLA(Accumulator, Input1, Input2) The partial reduction nodes sign or zero extend ...
@ LOOP_DEPENDENCE_RAW_MASK
@ VECREDUCE_SEQ_FADD
Generic reduction nodes.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ CTTZ_ELTS
Returns the number of trailing (least significant) zero elements in a vector.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ VECTOR_FIND_LAST_ACTIVE
Finds the index of the last active mask element Operands: Mask.
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ STRICT_FMINIMUM
Definition ISDOpcodes.h:471
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SIGN_EXTEND_VECTOR_INREG
SIGN_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register sign-extension of the low ...
Definition ISDOpcodes.h:910
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ CLMUL
Carry-less multiplication operations.
Definition ISDOpcodes.h:774
@ INIT_TRAMPOLINE
INIT_TRAMPOLINE - This corresponds to the init_trampoline intrinsic.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:975
@ PARTIAL_REDUCE_UMLA
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ STRICT_UINT_TO_FP
Definition ISDOpcodes.h:485
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ ADDROFRETURNADDR
ADDROFRETURNADDR - Represents the llvm.addressofreturnaddress intrinsic.
Definition ISDOpcodes.h:117
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PARTIAL_REDUCE_FMLA
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ TRUNCATE_SSAT_U
Definition ISDOpcodes.h:873
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:827
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ VECTOR_INTERLEAVE
VECTOR_INTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor to...
Definition ISDOpcodes.h:635
@ STEP_VECTOR
STEP_VECTOR(IMM) - Returns a scalable vector whose lanes are comprised of a linear sequence of unsign...
Definition ISDOpcodes.h:691
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ SPLAT_VECTOR
SPLAT_VECTOR(VAL) - Returns a vector with the scalar value VAL duplicated in all lanes.
Definition ISDOpcodes.h:672
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ GET_ACTIVE_LANE_MASK
GET_ACTIVE_LANE_MASK - this corresponds to the llvm.get.active.lane.mask intrinsic.
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:970
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ ATOMIC_LOAD_FMAXIMUM
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ PtrAuthGlobalAddress
A ptrauth constant.
Definition ISDOpcodes.h:100
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ STRICT_FMAXIMUM
Definition ISDOpcodes.h:470
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ VSCALE
VSCALE(IMM) - Returns the runtime scaling factor used to calculate the number of elements within a sc...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ UBSANTRAP
UBSANTRAP - Trap with an immediate describing the kind of sanitizer failure.
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ ATOMIC_LOAD_FMINIMUM
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ VECTOR_SPLICE_LEFT
VECTOR_SPLICE_LEFT(VEC1, VEC2, OFFSET) - Shifts CONCAT_VECTORS(VEC1, VEC2) left by OFFSET elements an...
Definition ISDOpcodes.h:653
@ ANY_EXTEND_VECTOR_INREG
ANY_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register any-extension of the low la...
Definition ISDOpcodes.h:899
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ MASKED_UDIV
Masked vector arithmetic that returns poison on disabled lanes.
@ VECTOR_REVERSE
VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR, whose elements are shuffled us...
Definition ISDOpcodes.h:640
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_SINT_TO_FP
STRICT_[US]INT_TO_FP - Convert a signed or unsigned integer to a floating point value.
Definition ISDOpcodes.h:484
@ MGATHER
Masked gather and scatter - load and store operations for a vector of random addresses with additiona...
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ TargetConstant
TargetConstant* - Like Constant*, but the DAG does not do any folding, simplification,...
Definition ISDOpcodes.h:179
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ VECTOR_SPLICE_RIGHT
VECTOR_SPLICE_RIGHT(VEC1, VEC2, OFFSET) - Shifts CONCAT_VECTORS(VEC1,VEC2) right by OFFSET elements a...
Definition ISDOpcodes.h:657
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ VECTOR_COMPRESS
VECTOR_COMPRESS(Vec, Mask, Passthru) consecutively place vector elements based on mask e....
Definition ISDOpcodes.h:699
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ CLEAR_CACHE
llvm.clear_cache intrinsic Operands: Input Chain, Start Address, End Address Outputs: Output Chain
@ ZERO_EXTEND_VECTOR_INREG
ZERO_EXTEND_VECTOR_INREG(Vector) - This operator represents an in-register zero-extension of the low ...
Definition ISDOpcodes.h:921
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ EXPERIMENTAL_VECTOR_HISTOGRAM
Experimental vector histogram intrinsic Operands: Input Chain, Inc, Mask, Base, Index,...
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:833
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ PARTIAL_REDUCE_SUMLA
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ VECTOR_DEINTERLEAVE
VECTOR_DEINTERLEAVE(VEC1, VEC2, ...) - Returns N vectors from N input vectors, where N is the factor ...
Definition ISDOpcodes.h:624
@ CTTZ_ELTS_ZERO_POISON
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ TRUNCATE_SSAT_S
TRUNCATE_[SU]SAT_[SU] - Truncate for saturated operand [SU] located in middle, prefix for SAT means i...
Definition ISDOpcodes.h:871
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ ADJUST_TRAMPOLINE
ADJUST_TRAMPOLINE - This corresponds to the adjust_trampoline intrinsic.
@ TRUNCATE_USAT_U
Definition ISDOpcodes.h:875
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
@ LOOP_DEPENDENCE_WAR_MASK
The llvm.loop.dependence.
bool isOverflowIntrOpRes(SDValue Op)
Returns true if the specified value is the overflow result from one of the overflow intrinsic nodes.
bool isExtOpcode(unsigned Opcode)
LLVM_ABI bool isConstantSplatVectorAllZeros(const SDNode *N, bool BuildVectorOnly=false)
Return true if the specified node is a BUILD_VECTOR or SPLAT_VECTOR where all of the elements are 0 o...
LLVM_ABI bool isVectorShrinkable(const SDNode *N, unsigned NewEltSize, bool Signed)
Returns true if the specified node is a vector where all elements can be truncated to the specified e...
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
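For example, on an integer comparison the two helpers above behave as follows (the results follow directly from their definitions):

  ISD::CondCode CC  = ISD::SETLT;
  ISD::CondCode Inv = ISD::getSetCCInverse(CC, MVT::i32); // SETGE: !(X < Y)
  ISD::CondCode Swp = ISD::getSetCCSwappedOperands(CC);   // SETGT: X < Y is Y > X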
MemIndexType
MemIndexType enum - This enum defines how to interpret MGATHER/SCATTER's index parameter when calcula...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LLVM_ABI bool isBuildVectorAllOnes(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are ~0 or undef.
LLVM_ABI NodeType getVecReduceBaseOpcode(unsigned VecReduceOpcode)
Get underlying scalar opcode for VECREDUCE opcode.
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ Bitcast
Perform the operation on a different, but equivalently sized type.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_Value()
Match an arbitrary value and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
CastInst_match< OpTy, UIToFPInst > m_UIToFP(const OpTy &Op)
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
auto m_SpecificVT(EVT RefVT, const Pattern &P)
Match a specific ValueType.
TernaryOpc_match< T0_P, T1_P, T2_P > m_SetCC(const T0_P &LHS, const T1_P &RHS, const T2_P &CC)
auto m_VT(EVT &VT)
Retrieve the ValueType of the current SDValue.
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
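A hedged sketch of these SelectionDAG matchers in use, assuming using namespace llvm::SDPatternMatch, a two-argument sd_match overload, and a value-binding m_Value(SDValue &) matcher:

  SDValue LHS, RHS, CC;
  if (sd_match(N, m_SetCC(m_Value(LHS), m_Value(RHS), m_Value(CC)))) {
    // N is an ISD::SETCC; LHS/RHS are the compared operands, CC the condition.
  }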
const unsigned VectorBits
Definition SystemZ.h:155
initializer< Ty > init(const Ty &Val)
std::optional< Function * > getAttachedARCFunction(const CallBase *CB)
This function returns operand bundle clang_arc_attachedcall's argument, which is the address of the A...
Definition ObjCARCUtil.h:43
bool attachedCallOpBundleNeedsMarker(const CallBase *CB)
This function determines whether the clang_arc_attachedcall should be emitted with or without the mar...
Definition ObjCARCUtil.h:58
bool hasAttachedCallOpBundle(const CallBase *CB)
Definition ObjCARCUtil.h:29
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
bool isPackedVectorType(EVT SomeVT)
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
constexpr uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition MathExtras.h:207
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
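A small sketch of the mask classifier above, using the 8-lane example mask from its description:

  int Mask[] = {0, 8, 1, 9, 2, 10, 3, 11};
  unsigned WhichResult, OperandOrder;
  if (isZIPMask(Mask, /*NumElts=*/8, WhichResult, OperandOrder)) {
    // WhichResult selects ZIP1 (0) or ZIP2 (1); OperandOrder reports the operand ordering.
  }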
LLVM_ABI void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl< ISD::OutputArg > &Outs, const TargetLowering &TLI, const DataLayout &DL)
Given an LLVM IR type and return type attributes, compute the return value EVTs and flags,...
bool CC_AArch64_Win64PCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition Utils.cpp:294
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
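A minimal sketch of the usual ComputeValueVTs call pattern, assuming TLI, DL and an IR type Ty are already in scope:

  SmallVector<EVT, 4> ValueVTs;
  ComputeValueVTs(TLI, DL, Ty, ValueVTs);
  // ValueVTs now holds one EVT per scalar leaf of Ty.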
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
unsigned Log2_64_Ceil(uint64_t Value)
Return the ceil log base 2 of the specified value, 64 if the value is zero.
Definition MathExtras.h:350
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
APFloat abs(APFloat X)
Returns the absolute value of the argument.
Definition APFloat.h:1660
bool CC_AArch64_DarwinPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
bool isIntOrFPConstant(SDValue V)
Return true if V is either an integer or FP constant.
auto map_to_vector(ContainerTy &&C, FuncTy &&F)
Map a range to a SmallVector with element types deduced from the mapping.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
std::optional< unsigned > getSVEPredPatternFromNumElements(unsigned MinNumElts)
Return specific VL predicate pattern based on the number of elements.
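A brief illustrative use; the helper returns std::nullopt when no exact VLn pattern exists for the requested count:

  if (std::optional<unsigned> PredPattern = getSVEPredPatternFromNumElements(16)) {
    // *PredPattern names the fixed-length predicate pattern (VL16 for a count of 16).
  }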
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isNullOrNullSplat(const MachineInstr &MI, const MachineRegisterInfo &MRI, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
Definition Utils.cpp:1547
bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
unsigned Log2_64(uint64_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:337
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
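A quick worked example of the bit-manipulation helpers listed above:

  uint64_t V = 0x000000000000FF00ULL;
  bool IsShifted = isShiftedMask_64(V); // true: one contiguous run of ones
  unsigned Floor = Log2_64(V);          // 15
  int TrailingZeros = countr_zero(V);   // 8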
unsigned M1(unsigned Val)
Definition VE.h:377
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
static Error getOffset(const SymbolRef &Sym, SectionRef Sec, uint64_t &Result)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
generic_gep_type_iterator<> gep_type_iterator
constexpr bool isMask_64(uint64_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:261
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
LLVM_ABI SDValue peekThroughOneUseBitcasts(SDValue V)
Return the non-bitcasted and one-use source operand of V if it exists.
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
LLVM_ABI EHPersonality classifyEHPersonality(const Value *Pers)
See if the given exception handling personality function is one that we understand.
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ATTRIBUTE_VISIBILITY_DEFAULT AnalysisKey InnerAnalysisManagerProxy< AnalysisManagerT, IRUnitT, ExtraArgTs... >::Key
constexpr int PoisonMaskElem
LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs=false)
Return true if the value is a constant 1 integer or a splatted vector of a constant 1 integer (with n...
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
TargetTransformInfo TTI
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
CombineLevel
Definition DAGCombine.h:15
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAsynchronousEHPersonality(EHPersonality Pers)
Returns true if this personality function catches asynchronous exceptions.
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isZeroOrZeroSplat(SDValue N, bool AllowUndefs=false)
Return true if the value is a constant 0 integer or a splatted vector of a constant 0 integer (with n...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
gep_type_iterator gep_type_begin(const User *GEP)
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
Definition STLExtras.h:2191
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
LLVM_ABI bool isNullFPConstant(SDValue V)
Returns true if V is an FP constant with a value of positive zero.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal or the list is empty.
Definition STLExtras.h:2165
static const MachineMemOperand::Flags MOStridedAccess
@ Enabled
Convert any .debug_str_offsets tables to DWARF64 if needed.
Definition DWP.h:32
bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
bool CC_AArch64_Preserve_None(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static const unsigned PerfectShuffleTable[6561+1]
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
@ Enable
Enable colors.
Definition WithColor.h:47
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
Helper structure to keep track of a SET_CC lowered into AArch64 code.
AArch64CC::CondCode CC
Helper structure to keep track of ISD::SET_CC operands.
Helper structure to be able to read SetCC information.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Represent subnormal handling kind for floating point instruction inputs and outputs.
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:90
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
uint64_t getScalarStoreSize() const
Definition ValueTypes.h:410
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:358
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:479
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:367
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:438
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:98
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:215
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:389
EVT widenIntegerVectorElementType(LLVMContext &Context) const
Return a VT for an integer vector type with the size of the elements doubled.
Definition ValueTypes.h:460
bool isScalableVT() const
Return true if the type is a scalable type.
Definition ValueTypes.h:195
bool isFixedLengthVector() const
Definition ValueTypes.h:189
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:55
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:300
bool is256BitVector() const
Return true if this is a 256-bit vector type.
Definition ValueTypes.h:220
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:182
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:469
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:210
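An illustrative sketch of the EVT queries above; Ctx stands for an assumed LLVMContext:

  EVT VecVT = EVT::getVectorVT(Ctx, MVT::i32, 4);        // v4i32
  bool Is128 = VecVT.is128BitVector();                   // true
  EVT EltVT  = VecVT.getVectorElementType();             // i32
  EVT WideVT = VecVT.widenIntegerVectorElementType(Ctx); // v4i64
  TypeSize Bits = VecVT.getSizeInBits();                 // fixed 128 bits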
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:78
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
static LLVM_ABI KnownBits lshr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for lshr(LHS, RHS).
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:325
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:136
static LLVM_ABI KnownBits shl(const KnownBits &LHS, const KnownBits &RHS, bool NUW=false, bool NSW=false, bool ShAmtNonZero=false)
Compute known bits for shl(LHS, RHS).
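A short worked example with the KnownBits operations listed above (fully known inputs keep the results fully known):

  KnownBits A = KnownBits::makeConstant(APInt(32, 0xF0));
  KnownBits B = KnownBits::makeConstant(APInt(32, 0x0F));
  KnownBits Sum = KnownBits::add(A, B);                  // known to be 0xFF
  KnownBits Sh  = KnownBits::lshr(Sum, KnownBits::makeConstant(APInt(32, 4))); // known 0x0F
  unsigned MaxActive = Sum.countMaxActiveBits();         // 8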
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI unsigned getAddrSpace() const
Return the LLVM IR address space number that this pointer points into.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getUnknownStack(MachineFunction &MF)
Stack memory without other information.
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
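A minimal sketch of building pointer info for stack accesses; MF and FI are assumed to be an existing MachineFunction and frame index:

  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo HiPart  = PtrInfo.getWithOffset(8); // same slot, +8 bytes
  unsigned AddrSpace = PtrInfo.getAddrSpace();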
Constraint for a predicate of the form "cmp Pred Op, OtherOp", where Op is the value the constraint a...
A simple container for information about the supported runtime calls.
These are IR-level optimization flags that may be propagated to SDNodes.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::OutputArg, 32 > Outs
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...
Helper structure to keep track of SetCC information.
GenericSetCCInfo Generic
AArch64SetCCInfo AArch64