AArch64ISelLowering.cpp
1//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the AArch64TargetLowering class.
10//
11//===----------------------------------------------------------------------===//
12
13#include "AArch64ISelLowering.h"
15#include "AArch64ExpandImm.h"
18#include "AArch64RegisterInfo.h"
20#include "AArch64Subtarget.h"
24#include "llvm/ADT/APFloat.h"
25#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/ArrayRef.h"
27#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/StringRef.h"
33#include "llvm/ADT/Twine.h"
61#include "llvm/IR/Attributes.h"
62#include "llvm/IR/Constants.h"
63#include "llvm/IR/DataLayout.h"
64#include "llvm/IR/DebugLoc.h"
66#include "llvm/IR/Function.h"
68#include "llvm/IR/GlobalValue.h"
69#include "llvm/IR/IRBuilder.h"
70#include "llvm/IR/Instruction.h"
73#include "llvm/IR/Intrinsics.h"
74#include "llvm/IR/IntrinsicsAArch64.h"
75#include "llvm/IR/Module.h"
77#include "llvm/IR/Type.h"
78#include "llvm/IR/Use.h"
79#include "llvm/IR/Value.h"
84#include "llvm/Support/Debug.h"
94#include <algorithm>
95#include <bitset>
96#include <cassert>
97#include <cctype>
98#include <cstdint>
99#include <cstdlib>
100#include <iterator>
101#include <limits>
102#include <optional>
103#include <tuple>
104#include <utility>
105#include <vector>
106
107using namespace llvm;
108
109#define DEBUG_TYPE "aarch64-lower"
110
111STATISTIC(NumTailCalls, "Number of tail calls");
112STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
113
114// FIXME: The necessary dtprel relocations don't seem to be supported
115// well in the GNU bfd and gold linkers at the moment. Therefore, by
116// default, for now, fall back to GeneralDynamic code generation.
118 "aarch64-elf-ldtls-generation", cl::Hidden,
119 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
120 cl::init(false));
121
122static cl::opt<bool>
123EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
124 cl::desc("Enable AArch64 logical imm instruction "
125 "optimization"),
126 cl::init(true));
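// Note: like the other cl::opt flags in this file, these are internal
// developer options; when driving the compiler through clang they are
// typically passed with -mllvm, e.g. "-mllvm -aarch64-enable-logical-imm=false".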
127
128// Temporary option added for the purpose of testing functionality added
129// to DAGCombiner.cpp in D92230. It is expected that this can be removed
130 // in the future, once both implementations are based on MGATHER rather
131// than the GLD1 nodes added for the SVE gather load intrinsics.
132static cl::opt<bool>
133EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
134 cl::desc("Combine extends of AArch64 masked "
135 "gather intrinsics"),
136 cl::init(true));
137
138static cl::opt<bool> EnableExtToTBL("aarch64-enable-ext-to-tbl", cl::Hidden,
139 cl::desc("Combine ext and trunc to TBL"),
140 cl::init(true));
141
142 // All of the XOR, OR and CMP operations use ALU ports, and the data dependency
143 // becomes the bottleneck after this transform on high-end CPUs. This maximum
144 // leaf-node limit is a guard so that the cmp+ccmp transform remains profitable.
145static cl::opt<unsigned> MaxXors("aarch64-max-xors", cl::init(16), cl::Hidden,
146 cl::desc("Maximum of xors"));
147
148 // By turning this on, we will not fall back to DAG ISel when encountering
149 // scalable vector types for any instruction, even if SVE is not yet supported
150 // for some instructions.
151// See [AArch64TargetLowering::fallbackToDAGISel] for implementation details.
153 "aarch64-enable-gisel-sve", cl::Hidden,
154 cl::desc("Enable / disable SVE scalable vectors in Global ISel"),
155 cl::init(false));
156
157// TODO: This option should be removed once we switch to always using PTRADD in
158// the SelectionDAG.
160 "aarch64-use-featcpa-codegen", cl::Hidden,
161 cl::desc("Generate ISD::PTRADD nodes for pointer arithmetic in "
162 "SelectionDAG for FEAT_CPA"),
163 cl::init(false));
164
165/// Value type used for condition codes.
166constexpr MVT CondCodeVT = MVT::i32;
167
168/// Value type used for NZCV flags.
169constexpr MVT FlagsVT = MVT::i32;
170
171static const MCPhysReg GPRArgRegs[] = {AArch64::X0, AArch64::X1, AArch64::X2,
172 AArch64::X3, AArch64::X4, AArch64::X5,
173 AArch64::X6, AArch64::X7};
174static const MCPhysReg FPRArgRegs[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
175 AArch64::Q3, AArch64::Q4, AArch64::Q5,
176 AArch64::Q6, AArch64::Q7};
177
178 ArrayRef<MCPhysReg> llvm::AArch64::getGPRArgRegs() { return GPRArgRegs; }
179
180 ArrayRef<MCPhysReg> llvm::AArch64::getFPRArgRegs() { return FPRArgRegs; }
181
182static inline EVT getPackedSVEVectorVT(EVT VT) {
183 switch (VT.getSimpleVT().SimpleTy) {
184 default:
185 llvm_unreachable("unexpected element type for vector");
186 case MVT::i8:
187 return MVT::nxv16i8;
188 case MVT::i16:
189 return MVT::nxv8i16;
190 case MVT::i32:
191 return MVT::nxv4i32;
192 case MVT::i64:
193 return MVT::nxv2i64;
194 case MVT::f16:
195 return MVT::nxv8f16;
196 case MVT::f32:
197 return MVT::nxv4f32;
198 case MVT::f64:
199 return MVT::nxv2f64;
200 case MVT::bf16:
201 return MVT::nxv8bf16;
202 }
203}
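// For example, getPackedSVEVectorVT(MVT::f16) yields MVT::nxv8f16: with SVE's
// 128-bit register granule, eight f16 lanes fill the register, which is what
// "packed" means here.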
204
205// NOTE: Currently there's only a need to return integer vector types. If this
206// changes then just add an extra "type" parameter.
207 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
208 switch (EC.getKnownMinValue()) {
209 default:
210 llvm_unreachable("unexpected element count for vector");
211 case 16:
212 return MVT::nxv16i8;
213 case 8:
214 return MVT::nxv8i16;
215 case 4:
216 return MVT::nxv4i32;
217 case 2:
218 return MVT::nxv2i64;
219 }
220}
221
222 static inline EVT getPromotedVTForPredicate(EVT VT) {
223 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
224 "Expected scalable predicate vector type!");
225 switch (VT.getVectorMinNumElements()) {
226 default:
227 llvm_unreachable("unexpected element count for vector");
228 case 2:
229 return MVT::nxv2i64;
230 case 4:
231 return MVT::nxv4i32;
232 case 8:
233 return MVT::nxv8i16;
234 case 16:
235 return MVT::nxv16i8;
236 }
237}
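// For example, an nxv4i1 predicate is promoted to nxv4i32: the element count
// is preserved while each i1 lane is widened to the integer element size it
// governs.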
238
239/// Returns true if VT's elements occupy the lowest bit positions of its
240/// associated register class without any intervening space.
241///
242/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
243/// same register class, but only nxv8f16 can be treated as a packed vector.
244static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
246 "Expected legal vector type!");
247 return VT.isFixedLengthVector() ||
249}
250
251// Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
252// predicate and end with a passthru value matching the result type.
253static bool isMergePassthruOpcode(unsigned Opc) {
254 switch (Opc) {
255 default:
256 return false;
257 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
258 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
259 case AArch64ISD::REVH_MERGE_PASSTHRU:
260 case AArch64ISD::REVW_MERGE_PASSTHRU:
261 case AArch64ISD::REVD_MERGE_PASSTHRU:
262 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
263 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
264 case AArch64ISD::DUP_MERGE_PASSTHRU:
265 case AArch64ISD::ABS_MERGE_PASSTHRU:
266 case AArch64ISD::NEG_MERGE_PASSTHRU:
267 case AArch64ISD::FNEG_MERGE_PASSTHRU:
268 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
269 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
270 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
271 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
272 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
273 case AArch64ISD::FRINT_MERGE_PASSTHRU:
274 case AArch64ISD::FROUND_MERGE_PASSTHRU:
275 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
276 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
277 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
278 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
279 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
280 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
281 case AArch64ISD::FCVTX_MERGE_PASSTHRU:
282 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
283 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
284 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
285 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
286 case AArch64ISD::FABS_MERGE_PASSTHRU:
287 return true;
288 }
289}
290
291// Returns true if inactive lanes are known to be zeroed by construction.
292 static bool isZeroingInactiveLanes(SDValue Op) {
293 switch (Op.getOpcode()) {
294 default:
295 return false;
296 // We guarantee i1 splat_vectors to zero the other lanes
297 case ISD::SPLAT_VECTOR:
298 case ISD::GET_ACTIVE_LANE_MASK:
299 case AArch64ISD::PTRUE:
300 case AArch64ISD::SETCC_MERGE_ZERO:
301 return true;
302 case ISD::INTRINSIC_WO_CHAIN:
303 switch (Op.getConstantOperandVal(0)) {
304 default:
305 return false;
306 case Intrinsic::aarch64_sve_ptrue:
307 case Intrinsic::aarch64_sve_pnext:
308 case Intrinsic::aarch64_sve_cmpeq:
309 case Intrinsic::aarch64_sve_cmpne:
310 case Intrinsic::aarch64_sve_cmpge:
311 case Intrinsic::aarch64_sve_cmpgt:
312 case Intrinsic::aarch64_sve_cmphs:
313 case Intrinsic::aarch64_sve_cmphi:
314 case Intrinsic::aarch64_sve_cmpeq_wide:
315 case Intrinsic::aarch64_sve_cmpne_wide:
316 case Intrinsic::aarch64_sve_cmpge_wide:
317 case Intrinsic::aarch64_sve_cmpgt_wide:
318 case Intrinsic::aarch64_sve_cmplt_wide:
319 case Intrinsic::aarch64_sve_cmple_wide:
320 case Intrinsic::aarch64_sve_cmphs_wide:
321 case Intrinsic::aarch64_sve_cmphi_wide:
322 case Intrinsic::aarch64_sve_cmplo_wide:
323 case Intrinsic::aarch64_sve_cmpls_wide:
324 case Intrinsic::aarch64_sve_fcmpeq:
325 case Intrinsic::aarch64_sve_fcmpne:
326 case Intrinsic::aarch64_sve_fcmpge:
327 case Intrinsic::aarch64_sve_fcmpgt:
328 case Intrinsic::aarch64_sve_fcmpuo:
329 case Intrinsic::aarch64_sve_facgt:
330 case Intrinsic::aarch64_sve_facge:
331 case Intrinsic::aarch64_sve_whilege:
332 case Intrinsic::aarch64_sve_whilegt:
333 case Intrinsic::aarch64_sve_whilehi:
334 case Intrinsic::aarch64_sve_whilehs:
335 case Intrinsic::aarch64_sve_whilele:
336 case Intrinsic::aarch64_sve_whilelo:
337 case Intrinsic::aarch64_sve_whilels:
338 case Intrinsic::aarch64_sve_whilelt:
339 case Intrinsic::aarch64_sve_match:
340 case Intrinsic::aarch64_sve_nmatch:
341 case Intrinsic::aarch64_sve_whilege_x2:
342 case Intrinsic::aarch64_sve_whilegt_x2:
343 case Intrinsic::aarch64_sve_whilehi_x2:
344 case Intrinsic::aarch64_sve_whilehs_x2:
345 case Intrinsic::aarch64_sve_whilele_x2:
346 case Intrinsic::aarch64_sve_whilelo_x2:
347 case Intrinsic::aarch64_sve_whilels_x2:
348 case Intrinsic::aarch64_sve_whilelt_x2:
349 return true;
350 }
351 }
352}
353
354static std::tuple<SDValue, SDValue>
355 extractPtrauthBlendDiscriminators(SDValue Disc, SelectionDAG *DAG) {
356 SDLoc DL(Disc);
357 SDValue AddrDisc;
358 SDValue ConstDisc;
359
360 // If this is a blend, remember the constant and address discriminators.
361 // Otherwise, it's either a constant discriminator, or a non-blended
362 // address discriminator.
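 // For example, a discriminator computed as @llvm.ptrauth.blend(%addrdisc, 1234)
 // is split into the pair (constant 1234, address %addrdisc); a lone 16-bit
 // constant C becomes (C, XZR); anything else is returned unchanged as (0, Disc).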
363 if (Disc->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
364 Disc->getConstantOperandVal(0) == Intrinsic::ptrauth_blend) {
365 AddrDisc = Disc->getOperand(1);
366 ConstDisc = Disc->getOperand(2);
367 } else {
368 ConstDisc = Disc;
369 }
370
371 // If the constant discriminator (either the blend RHS, or the entire
372 // discriminator value) isn't a 16-bit constant, bail out, and let the
373 // discriminator be computed separately.
374 const auto *ConstDiscN = dyn_cast<ConstantSDNode>(ConstDisc);
375 if (!ConstDiscN || !isUInt<16>(ConstDiscN->getZExtValue()))
376 return std::make_tuple(DAG->getTargetConstant(0, DL, MVT::i64), Disc);
377
378 // If there's no address discriminator, use NoRegister, which we'll later
379 // replace with XZR, or directly use a Z variant of the inst. when available.
380 if (!AddrDisc)
381 AddrDisc = DAG->getRegister(AArch64::NoRegister, MVT::i64);
382
383 return std::make_tuple(
384 DAG->getTargetConstant(ConstDiscN->getZExtValue(), DL, MVT::i64),
385 AddrDisc);
386}
387
388 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
389 const AArch64Subtarget &STI)
390 : TargetLowering(TM), Subtarget(&STI) {
391 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
392 // we have to make something up. Arbitrarily, choose ZeroOrOne.
393 setBooleanContents(ZeroOrOneBooleanContent);
394 // When comparing vectors the result sets the different elements in the
395 // vector to all-one or all-zero.
396 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
397
398 // Set up the register classes.
399 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
400 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
401
402 if (Subtarget->hasLS64()) {
403 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
404 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
405 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
406 }
407
408 if (Subtarget->hasFPARMv8()) {
409 addRegisterClass(MVT::aarch64mfp8, &AArch64::FPR8RegClass);
410 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
411 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
412 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
413 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
414 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
415 }
416
417 if (Subtarget->hasNEON()) {
418 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
419 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
420
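 // addDRType/addQRType (defined later in this file) register a type in the
 // 64-bit D-register class (FPR64) or the 128-bit Q-register class (FPR128)
 // respectively, and set up the common NEON operation actions for it.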
421 addDRType(MVT::v2f32);
422 addDRType(MVT::v8i8);
423 addDRType(MVT::v4i16);
424 addDRType(MVT::v2i32);
425 addDRType(MVT::v1i64);
426 addDRType(MVT::v1f64);
427 addDRType(MVT::v4f16);
428 addDRType(MVT::v4bf16);
429
430 addQRType(MVT::v4f32);
431 addQRType(MVT::v2f64);
432 addQRType(MVT::v16i8);
433 addQRType(MVT::v8i16);
434 addQRType(MVT::v4i32);
435 addQRType(MVT::v2i64);
436 addQRType(MVT::v8f16);
437 addQRType(MVT::v8bf16);
438 }
439
440 if (Subtarget->isSVEorStreamingSVEAvailable()) {
441 // Add legal sve predicate types
442 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
443 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
444 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
445 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
446 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
447
448 // Add sve predicate as counter type
449 addRegisterClass(MVT::aarch64svcount, &AArch64::PPRRegClass);
450
451 // Add legal sve data types
452 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
453 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
454 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
455 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
456
457 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
458 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
459 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
460 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
461 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
462 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
463
464 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
465 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
466 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
467
468 if (Subtarget->useSVEForFixedLengthVectors()) {
471 addRegisterClass(VT, &AArch64::ZPRRegClass);
472
475 addRegisterClass(VT, &AArch64::ZPRRegClass);
476 }
477 }
478
479 // Compute derived properties from the register classes
480 computeRegisterProperties(Subtarget->getRegisterInfo());
481
482 // Provide all sorts of operation actions
500 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
501 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
502 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
503 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
504 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
505 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
508 if (Subtarget->hasFPARMv8()) {
511 }
520 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
522 setOperationAction(ISD::BRIND, MVT::Other, Custom);
524
526
530
534
536
537 // Custom lowering hooks are needed for XOR
538 // to fold it into CSINC/CSINV.
541
542 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
543 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
544
545 // Virtually no operation on f128 is legal, but LLVM can't expand them when
546 // there's a valid register class, so we need custom operations in most cases.
547 setOperationAction(ISD::FABS, MVT::f128, Expand);
550 setOperationAction(ISD::FCOS, MVT::f128, Expand);
554 setOperationAction(ISD::FNEG, MVT::f128, Expand);
555 setOperationAction(ISD::FPOW, MVT::f128, Expand);
557 setOperationAction(ISD::FRINT, MVT::f128, Expand);
558 setOperationAction(ISD::FSIN, MVT::f128, Expand);
559 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
560 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
562 setOperationAction(ISD::FTAN, MVT::f128, Expand);
563 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
567 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
570 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
571 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
572 // aren't handled.
573
574 // Lowering for many of the conversions is actually specified by the non-f128
575 // type. The LowerXXX function will be trivial when f128 isn't involved.
600 if (Subtarget->hasFPARMv8()) {
603 }
606 if (Subtarget->hasFPARMv8()) {
609 }
612
617
618 // Variable arguments.
619 setOperationAction(ISD::VASTART, MVT::Other, Custom);
620 setOperationAction(ISD::VAARG, MVT::Other, Custom);
621 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
622 setOperationAction(ISD::VAEND, MVT::Other, Expand);
623
624 // Variable-sized objects.
625 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
626 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
627
628 // Lowering Funnel Shifts to EXTR
633
634 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
635
636 // Constant pool entries
638
639 // BlockAddress
641
642 // AArch64 lacks both left-rotate and popcount instructions.
648 }
649
650 // AArch64 doesn't have i32 MULH{S|U}.
653
654 // AArch64 doesn't have {U|S}MUL_LOHI.
659
660 if (Subtarget->hasCSSC()) {
664
666
670
673
678
683 } else {
687
690
693 }
694
700 }
707
708 // Custom lower Add/Sub/Mul with overflow.
721
730
731 setOperationAction(ISD::FSIN, MVT::f32, Expand);
732 setOperationAction(ISD::FSIN, MVT::f64, Expand);
733 setOperationAction(ISD::FCOS, MVT::f32, Expand);
734 setOperationAction(ISD::FCOS, MVT::f64, Expand);
735 setOperationAction(ISD::FPOW, MVT::f32, Expand);
736 setOperationAction(ISD::FPOW, MVT::f64, Expand);
739 if (Subtarget->hasFullFP16()) {
742 } else {
745 }
746
747 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
748 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
749 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
750 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
751 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
752 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
753 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
761 setOperationAction(Op, MVT::f16, Promote);
762 setOperationAction(Op, MVT::v4f16, Expand);
763 setOperationAction(Op, MVT::v8f16, Expand);
764 setOperationAction(Op, MVT::bf16, Promote);
765 setOperationAction(Op, MVT::v4bf16, Expand);
766 setOperationAction(Op, MVT::v8bf16, Expand);
767 }
768
769 // Legalize fcanonicalize to circumvent default expansion
770 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
771 if (Subtarget->hasFullFP16()) {
773 }
774
775 // fpextend from f16 or bf16 to f32 is legal
776 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
777 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal);
780 // fpextend from bf16 to f64 needs to be split into two fpextends
781 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
783
784 auto LegalizeNarrowFP = [this](MVT ScalarVT) {
785 for (auto Op : {
788 ISD::BR_CC,
789 ISD::FADD,
790 ISD::FSUB,
791 ISD::FMUL,
792 ISD::FDIV,
793 ISD::FMA,
794 ISD::FCEIL,
795 ISD::FSQRT,
796 ISD::FFLOOR,
797 ISD::FNEARBYINT,
798 ISD::FRINT,
799 ISD::FROUND,
800 ISD::FROUNDEVEN,
801 ISD::FTRUNC,
802 ISD::FMINNUM,
803 ISD::FMAXNUM,
804 ISD::FMINIMUM,
805 ISD::FMAXIMUM,
806 ISD::FMINIMUMNUM,
807 ISD::FMAXIMUMNUM,
826 })
827 setOperationAction(Op, ScalarVT, Promote);
828
829 for (auto Op : {ISD::FNEG, ISD::FABS})
830 setOperationAction(Op, ScalarVT, Legal);
831
832 // Round-to-integer needs custom lowering for fp16, as Promote doesn't work
833 // because the result type is integer.
834 for (auto Op : {ISD::LROUND, ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
837 setOperationAction(Op, ScalarVT, Custom);
838
839 // promote v4f16 to v4f32 when that is known to be safe.
840 auto V4Narrow = MVT::getVectorVT(ScalarVT, 4);
841 setOperationPromotedToType(ISD::FADD, V4Narrow, MVT::v4f32);
842 setOperationPromotedToType(ISD::FSUB, V4Narrow, MVT::v4f32);
843 setOperationPromotedToType(ISD::FMUL, V4Narrow, MVT::v4f32);
844 setOperationPromotedToType(ISD::FDIV, V4Narrow, MVT::v4f32);
845 setOperationPromotedToType(ISD::FCEIL, V4Narrow, MVT::v4f32);
846 setOperationPromotedToType(ISD::FFLOOR, V4Narrow, MVT::v4f32);
847 setOperationPromotedToType(ISD::FROUND, V4Narrow, MVT::v4f32);
848 setOperationPromotedToType(ISD::FTRUNC, V4Narrow, MVT::v4f32);
849 setOperationPromotedToType(ISD::FROUNDEVEN, V4Narrow, MVT::v4f32);
850 setOperationPromotedToType(ISD::FRINT, V4Narrow, MVT::v4f32);
851 setOperationPromotedToType(ISD::FNEARBYINT, V4Narrow, MVT::v4f32);
852 setOperationPromotedToType(ISD::FCANONICALIZE, V4Narrow, MVT::v4f32);
853 setOperationPromotedToType(ISD::SETCC, V4Narrow, MVT::v4f32);
854
855 setOperationAction(ISD::FABS, V4Narrow, Legal);
856 setOperationAction(ISD::FNEG, V4Narrow, Legal);
858 setOperationAction(ISD::BR_CC, V4Narrow, Expand);
862 setOperationAction(ISD::FSQRT, V4Narrow, Expand);
863
864 auto V8Narrow = MVT::getVectorVT(ScalarVT, 8);
865 setOperationPromotedToType(ISD::FCANONICALIZE, V8Narrow, MVT::v8f32);
866 setOperationPromotedToType(ISD::SETCC, V8Narrow, MVT::v8f32);
867
868 setOperationAction(ISD::FABS, V8Narrow, Legal);
870 setOperationAction(ISD::FCEIL, V8Narrow, Legal);
873 setOperationAction(ISD::FFLOOR, V8Narrow, Legal);
876 setOperationAction(ISD::FNEARBYINT, V8Narrow, Legal);
877 setOperationAction(ISD::FNEG, V8Narrow, Legal);
878 setOperationAction(ISD::FROUND, V8Narrow, Legal);
879 setOperationAction(ISD::FROUNDEVEN, V8Narrow, Legal);
880 setOperationAction(ISD::FRINT, V8Narrow, Legal);
881 setOperationAction(ISD::FSQRT, V8Narrow, Expand);
883 setOperationAction(ISD::FTRUNC, V8Narrow, Legal);
884 setOperationAction(ISD::BR_CC, V8Narrow, Expand);
887 setOperationAction(ISD::FP_EXTEND, V8Narrow, Expand);
888 };
889
890 if (!Subtarget->hasFullFP16()) {
891 LegalizeNarrowFP(MVT::f16);
892 }
893 LegalizeNarrowFP(MVT::bf16);
896
897 // AArch64 has implementations of a lot of rounding-like FP operations.
898 // clang-format off
899 for (auto Op :
900 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
901 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
902 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
903 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
904 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
905 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE,
911 for (MVT Ty : {MVT::f32, MVT::f64})
913 if (Subtarget->hasFullFP16())
914 setOperationAction(Op, MVT::f16, Legal);
915 }
916 // clang-format on
917
918 // Basic strict FP operations are legal
921 for (MVT Ty : {MVT::f32, MVT::f64})
923 if (Subtarget->hasFullFP16())
924 setOperationAction(Op, MVT::f16, Legal);
925 }
926
927 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
928
930 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
931 setOperationAction(ISD::GET_FPMODE, MVT::i32, Custom);
932 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
933 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
934
935 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
936 if (!Subtarget->hasLSE() && !Subtarget->outlineAtomics()) {
937 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
938 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, LibCall);
939 } else {
940 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
941 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Expand);
942 }
943 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
944 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
945
946 // Generate outline atomics library calls only if LSE was not specified for
947 // subtarget
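 // (The LibCall lowering targets the __aarch64_* outline-atomics helpers
 // shipped with compiler-rt/libgcc, e.g. __aarch64_ldadd4_relax or
 // __aarch64_cas8_acq_rel.)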
948 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
949 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
950 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
951 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
952 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
953 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
954 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
955 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
956 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
957 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
958 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
959 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
960 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
961 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
962 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
963 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
964 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
965 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
966 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
967 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
968 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
969 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
970 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
971 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
972 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
973 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
974 }
975
976 if (Subtarget->outlineAtomics() && !Subtarget->hasLSFE()) {
977 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f16, LibCall);
978 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f32, LibCall);
979 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::f64, LibCall);
980 setOperationAction(ISD::ATOMIC_LOAD_FADD, MVT::bf16, LibCall);
981
982 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f16, LibCall);
983 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f32, LibCall);
984 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::f64, LibCall);
985 setOperationAction(ISD::ATOMIC_LOAD_FMAX, MVT::bf16, LibCall);
986
987 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f16, LibCall);
988 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f32, LibCall);
989 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::f64, LibCall);
990 setOperationAction(ISD::ATOMIC_LOAD_FMIN, MVT::bf16, LibCall);
991
992 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f16, LibCall);
993 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f32, LibCall);
994 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::f64, LibCall);
995 setOperationAction(ISD::ATOMIC_LOAD_FMAXIMUM, MVT::bf16, LibCall);
996
997 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f16, LibCall);
998 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f32, LibCall);
999 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::f64, LibCall);
1000 setOperationAction(ISD::ATOMIC_LOAD_FMINIMUM, MVT::bf16, LibCall);
1001 }
1002
1003 if (Subtarget->hasLSE128()) {
1004 // Custom lowering because i128 is not legal. Must be replaced by 2x64
1005 // values. ATOMIC_LOAD_AND also needs op legalisation to emit LDCLRP.
1006 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
1007 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
1008 setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
1009 }
1010
1011 // 128-bit loads and stores can be done without expanding
1012 setOperationAction(ISD::LOAD, MVT::i128, Custom);
1013 setOperationAction(ISD::STORE, MVT::i128, Custom);
1014
1015 // Aligned 128-bit loads and stores are single-copy atomic according to the
1016 // v8.4a spec. LRCPC3 introduces 128-bit STILP/LDIAPP but still requires LSE2.
1017 if (Subtarget->hasLSE2()) {
1018 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
1019 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
1020 }
1021
1022 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
1023 // custom lowering, as there are no un-paired non-temporal stores and
1024 // legalization will break up 256 bit inputs.
1025 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
1026 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
1027 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
1028 setOperationAction(ISD::STORE, MVT::v16bf16, Custom);
1029 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
1030 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
1031 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
1032 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
1033
1034 // 256 bit non-temporal loads can be lowered to LDNP. This is done using
1035 // custom lowering, as there are no un-paired non-temporal loads and legalization
1036 // will break up 256 bit inputs.
1037 setOperationAction(ISD::LOAD, MVT::v32i8, Custom);
1038 setOperationAction(ISD::LOAD, MVT::v16i16, Custom);
1039 setOperationAction(ISD::LOAD, MVT::v16f16, Custom);
1040 setOperationAction(ISD::LOAD, MVT::v16bf16, Custom);
1041 setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
1042 setOperationAction(ISD::LOAD, MVT::v8f32, Custom);
1043 setOperationAction(ISD::LOAD, MVT::v4f64, Custom);
1044 setOperationAction(ISD::LOAD, MVT::v4i64, Custom);
1045
1046 // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0.
1047 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
1048
1049 // Issue __sincos_stret if available.
1050 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1051 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1052
1053 // Make floating-point constants legal for the large code model, so they don't
1054 // become loads from the constant pool.
1055 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
1058 }
1059
1060 // AArch64 does not have floating-point extending loads, i1 sign-extending
1061 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
1062 for (MVT VT : MVT::fp_valuetypes()) {
1063 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
1064 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1065 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1066 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
1067 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
1068 }
1069 for (MVT VT : MVT::integer_valuetypes())
1070 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
1071
1072 for (MVT WideVT : MVT::fp_valuetypes()) {
1073 for (MVT NarrowVT : MVT::fp_valuetypes()) {
1074 if (WideVT.getScalarSizeInBits() > NarrowVT.getScalarSizeInBits()) {
1075 setTruncStoreAction(WideVT, NarrowVT, Expand);
1076 }
1077 }
1078 }
1079
1080 if (Subtarget->hasFPARMv8()) {
1081 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1082 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
1083 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
1084 }
1085
1086 // Indexed loads and stores are supported.
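 // That is, the pre- and post-indexed addressing forms, e.g.
 // "ldr x0, [x1, #8]!" (pre-indexed) and "ldr x0, [x1], #8" (post-indexed).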
1087 for (unsigned im = (unsigned)ISD::PRE_INC;
1088 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1089 setIndexedLoadAction(im, MVT::i8, Legal);
1090 setIndexedLoadAction(im, MVT::i16, Legal);
1091 setIndexedLoadAction(im, MVT::i32, Legal);
1092 setIndexedLoadAction(im, MVT::i64, Legal);
1093 setIndexedLoadAction(im, MVT::f64, Legal);
1094 setIndexedLoadAction(im, MVT::f32, Legal);
1095 setIndexedLoadAction(im, MVT::f16, Legal);
1096 setIndexedLoadAction(im, MVT::bf16, Legal);
1097 setIndexedStoreAction(im, MVT::i8, Legal);
1098 setIndexedStoreAction(im, MVT::i16, Legal);
1099 setIndexedStoreAction(im, MVT::i32, Legal);
1100 setIndexedStoreAction(im, MVT::i64, Legal);
1101 setIndexedStoreAction(im, MVT::f64, Legal);
1102 setIndexedStoreAction(im, MVT::f32, Legal);
1103 setIndexedStoreAction(im, MVT::f16, Legal);
1104 setIndexedStoreAction(im, MVT::bf16, Legal);
1105 }
1106
1107 // Trap.
1108 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1109 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1110 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
1111
1112 // We combine OR nodes for ccmp operations.
1114 // Try to create BICs for vector ANDs.
1116
1117 // llvm.init.trampoline and llvm.adjust.trampoline
1118 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
1119 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
1120
1121 // Vector add and sub nodes may conceal a high-half opportunity.
1122 // Also, try to fold ADD into CSINC/CSINV..
1125
1128
1129 // Try and combine setcc with csel
1131
1133
1137 ISD::STORE, ISD::BUILD_VECTOR});
1140 setTargetDAGCombine(ISD::LOAD);
1141
1142 setTargetDAGCombine(ISD::MSTORE);
1143
1145
1147
1150 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
1151
1153 {ISD::MGATHER, ISD::MSCATTER, ISD::EXPERIMENTAL_VECTOR_HISTOGRAM});
1154
1155 setTargetDAGCombine(ISD::FP_EXTEND);
1156
1158
1160
1161 setTargetDAGCombine(ISD::GET_ACTIVE_LANE_MASK);
1162
1163 setTargetDAGCombine(ISD::VECREDUCE_AND);
1164 setTargetDAGCombine(ISD::VECREDUCE_OR);
1165 setTargetDAGCombine(ISD::VECREDUCE_XOR);
1166
1168
1172
1173 // In case of strict alignment, avoid an excessive number of byte wide stores.
1176 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
1177
1181 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
1182
1185 Subtarget->requiresStrictAlign() ? MaxStoresPerMemmoveOptSize : 16;
1186
1189 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
1190
1192
1194
1195 EnableExtLdPromotion = true;
1196
1197 // Set required alignment.
1199 // Set preferred alignments.
1200
1201 // Don't align loops on Windows. The SEH unwind info generation needs to
1202 // know the exact length of functions before the alignments have been
1203 // expanded.
1204 if (!Subtarget->isTargetWindows())
1208
1209 // Only change the limit for entries in a jump table if specified by
1210 // the subtarget, but not at the command line.
1211 unsigned MaxJT = STI.getMaximumJumpTableSize();
1212 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
1214
1216
1218
1220 if (Subtarget->hasSME())
1222
1223 if (Subtarget->isNeonAvailable()) {
1224 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
1225 // silliness like this:
1226 // clang-format off
1227 for (auto Op :
1228 {ISD::SELECT, ISD::SELECT_CC, ISD::FATAN2,
1229 ISD::BR_CC, ISD::FADD, ISD::FSUB,
1231 ISD::FNEG, ISD::FABS, ISD::FCEIL,
1232 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
1233 ISD::FSIN, ISD::FCOS, ISD::FTAN,
1234 ISD::FASIN, ISD::FACOS, ISD::FATAN,
1235 ISD::FSINH, ISD::FCOSH, ISD::FTANH,
1236 ISD::FPOW, ISD::FLOG, ISD::FLOG2,
1237 ISD::FLOG10, ISD::FEXP, ISD::FEXP2,
1238 ISD::FEXP10, ISD::FRINT, ISD::FROUND,
1239 ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM,
1240 ISD::FMAXNUM, ISD::FMINIMUM, ISD::FMAXIMUM,
1241 ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1248 setOperationAction(Op, MVT::v1f64, Expand);
1249 // clang-format on
1250
1251 for (auto Op :
1256 setOperationAction(Op, MVT::v1i64, Expand);
1257
1258 // AArch64 doesn't have direct vector->f32 conversion instructions for
1259 // elements smaller than i32, so promote the input to i32 first.
1260 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
1261 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
1262
1263 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1264 // Nor is there a direct i32 -> f16 vector conversion. Set it to Custom, so the
1265 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1268 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1270
1271 if (Subtarget->hasFullFP16()) {
1274
1283 } else {
1284 // when AArch64 doesn't have fullfp16 support, promote the input
1285 // to i32 first.
1286 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1287 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1288 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1289 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1290 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1291 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1292 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1293 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1294 }
1295
1296 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1297 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1304 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1309 }
1310
1311 // Custom handling for some quad-vector types to detect MULL.
1312 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1313 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1314 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1315 setOperationAction(ISD::MUL, MVT::v4i16, Custom);
1316 setOperationAction(ISD::MUL, MVT::v2i32, Custom);
1317 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1318
1319 // Saturates
1320 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64,
1321 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1326 }
1327
1328 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1329 MVT::v4i32}) {
1336 }
1337
1338 // Vector reductions
1339 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1340 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1341 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1342 setOperationAction(ISD::VECREDUCE_FMAX, VT, Legal);
1343 setOperationAction(ISD::VECREDUCE_FMIN, VT, Legal);
1344 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Legal);
1345 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Legal);
1346
1347 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1348 }
1349 }
1350 if (Subtarget->hasFullFP16())
1351 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
1352
1353 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1354 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1355 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1356 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1357 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1358 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1359 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1360 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1361 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1362 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1363 }
1364 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1365 setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Custom);
1366 setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Custom);
1367 setOperationAction(ISD::VECREDUCE_XOR, MVT::v2i64, Custom);
1368
1370 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1371 // Likewise, narrowing and extending vector loads/stores aren't handled
1372 // directly.
1375
1376 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1379 } else {
1382 }
1385
1388
1389 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1390 setTruncStoreAction(VT, InnerVT, Expand);
1391 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1392 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1393 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1394 }
1395 }
1396
1397 for (auto Op :
1398 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1399 ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
1403 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1405 if (Subtarget->hasFullFP16())
1406 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1408 }
1409
1410 // LRINT and LLRINT.
1411 for (auto Op : {ISD::LRINT, ISD::LLRINT}) {
1412 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1414 if (Subtarget->hasFullFP16())
1415 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1417 }
1418
1419 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1420
1421 setOperationAction(ISD::BITCAST, MVT::i2, Custom);
1422 setOperationAction(ISD::BITCAST, MVT::i4, Custom);
1423 setOperationAction(ISD::BITCAST, MVT::i8, Custom);
1424 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
1425
1426 setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);
1427 setOperationAction(ISD::BITCAST, MVT::v2i16, Custom);
1428 setOperationAction(ISD::BITCAST, MVT::v4i8, Custom);
1429
1430 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1431 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1432 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1433 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1434 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1435 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1436
1437 // ADDP custom lowering
1438 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1440 // FADDP custom lowering
1441 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1443
1444 if (Subtarget->hasDotProd()) {
1445 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1446 ISD::PARTIAL_REDUCE_UMLA};
1447
1448 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Legal);
1449 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v8i8, Legal);
1450 setPartialReduceMLAAction(MLAOps, MVT::v2i32, MVT::v16i8, Custom);
1451 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1452
1453 if (Subtarget->hasMatMulInt8()) {
1454 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v4i32,
1455 MVT::v16i8, Legal);
1456 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i64,
1457 MVT::v16i8, Custom);
1458
1459 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::v2i32,
1460 MVT::v8i8, Legal);
1461 }
1462 }
1463
1464 } else /* !isNeonAvailable */ {
1466 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1468
1469 if (VT.is128BitVector() || VT.is64BitVector()) {
1470 setOperationAction(ISD::LOAD, VT, Legal);
1471 setOperationAction(ISD::STORE, VT, Legal);
1472 setOperationAction(ISD::BITCAST, VT,
1473 Subtarget->isLittleEndian() ? Legal : Expand);
1474 }
1475 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1476 setTruncStoreAction(VT, InnerVT, Expand);
1477 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1478 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1479 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1480 }
1481 }
1482 }
1483
1484 for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1488 }
1489
1490 if (Subtarget->hasSME()) {
1492 }
1493
1494 // FIXME: Move lowering for more nodes here if those are common between
1495 // SVE and SME.
1496 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1497 for (auto VT :
1498 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1503 }
1504 for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1505 setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
1506 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
1507 }
1508
1509 if (Subtarget->hasSVE2p1() ||
1510 (Subtarget->hasSME2() && Subtarget->isStreaming()))
1511 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, MVT::nxv32i1, Custom);
1512
1513 for (auto VT : {MVT::v16i8, MVT::v8i8, MVT::v4i16, MVT::v2i32})
1514 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Custom);
1515 }
1516
1517 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1518 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1529 setOperationAction(ISD::MLOAD, VT, Custom);
1530 setOperationAction(ISD::MSTORE, VT, Legal);
1550 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1551 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1552 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1553 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
1554 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1555 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1556 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1557 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1558 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1561
1567
1576
1581
1582 if (!Subtarget->isLittleEndian())
1583 setOperationAction(ISD::BITCAST, VT, Custom);
1584
1585 if (Subtarget->hasSVE2() ||
1586 (Subtarget->hasSME() && Subtarget->isStreaming()))
1587 // For SLI/SRI.
1589 }
1590
1591 // Illegal unpacked integer vector types.
1592 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1595 }
1596
1597 // Type legalize unpacked bitcasts.
1598 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32})
1599 setOperationAction(ISD::BITCAST, VT, Custom);
1600
1601 for (auto VT :
1602 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1603 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1605
1606 // Promote predicate as counter load/stores to standard predicates.
1607 setOperationPromotedToType(ISD::LOAD, MVT::aarch64svcount, MVT::nxv16i1);
1608 setOperationPromotedToType(ISD::STORE, MVT::aarch64svcount, MVT::nxv16i1);
1609
1610 // Predicate as counter legalization actions.
1611 setOperationAction(ISD::SELECT, MVT::aarch64svcount, Custom);
1612 setOperationAction(ISD::SELECT_CC, MVT::aarch64svcount, Expand);
1613
1614 for (auto VT :
1615 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1620 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1621 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1622 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1623
1627
1628 // There are no legal MVT::nxv16f## based types.
1629 if (VT != MVT::nxv16i1) {
1634 }
1635 }
1636
1637 // NEON doesn't support masked loads/stores, but SME and SVE do.
1638 for (auto VT :
1639 {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1640 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1641 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1642 setOperationAction(ISD::MLOAD, VT, Custom);
1643 setOperationAction(ISD::MSTORE, VT, Custom);
1644 }
1645
1646 // Firstly, exclude all scalable vector extending loads/truncating stores,
1647 // including both integer and floating-point scalable vectors.
1649 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1650 setTruncStoreAction(VT, InnerVT, Expand);
1651 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1652 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1653 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1654 }
1655 }
1656
1657 // Then, selectively enable those which we directly support.
1658 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1659 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1660 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1661 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1662 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1663 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1664 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1665 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1666 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1667 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1668 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1669 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1670 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
1671 }
1672
1673 // SVE supports truncating stores of 64 and 128-bit vectors
1674 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1675 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1676 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1677 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1678 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1679
1680 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1681 MVT::nxv4f32, MVT::nxv2f64}) {
1682 setOperationAction(ISD::BITCAST, VT, Custom);
1685 setOperationAction(ISD::MLOAD, VT, Custom);
1693 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1694 setOperationAction(ISD::FMAXNUM, VT, Custom);
1695 setOperationAction(ISD::FMINIMUM, VT, Custom);
1696 setOperationAction(ISD::FMINNUM, VT, Custom);
1698 setOperationAction(ISD::FNEG, VT, Custom);
1700 setOperationAction(ISD::FCEIL, VT, Custom);
1701 setOperationAction(ISD::FFLOOR, VT, Custom);
1702 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1703 setOperationAction(ISD::FRINT, VT, Custom);
1704 setOperationAction(ISD::LRINT, VT, Custom);
1705 setOperationAction(ISD::LLRINT, VT, Custom);
1706 setOperationAction(ISD::FROUND, VT, Custom);
1707 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1708 setOperationAction(ISD::FTRUNC, VT, Custom);
1709 setOperationAction(ISD::FSQRT, VT, Custom);
1710 setOperationAction(ISD::FABS, VT, Custom);
1711 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1713 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1714 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1715 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1716 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Custom);
1717 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Custom);
1718 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
1722
1725 setOperationAction(ISD::FPOW, VT, Expand);
1726 setOperationAction(ISD::FPOWI, VT, Expand);
1727 setOperationAction(ISD::FCOS, VT, Expand);
1728 setOperationAction(ISD::FSIN, VT, Expand);
1729 setOperationAction(ISD::FSINCOS, VT, Expand);
1730 setOperationAction(ISD::FTAN, VT, Expand);
1731 setOperationAction(ISD::FACOS, VT, Expand);
1732 setOperationAction(ISD::FASIN, VT, Expand);
1733 setOperationAction(ISD::FATAN, VT, Expand);
1734 setOperationAction(ISD::FATAN2, VT, Expand);
1735 setOperationAction(ISD::FCOSH, VT, Expand);
1736 setOperationAction(ISD::FSINH, VT, Expand);
1737 setOperationAction(ISD::FTANH, VT, Expand);
1738 setOperationAction(ISD::FEXP, VT, Expand);
1739 setOperationAction(ISD::FEXP2, VT, Expand);
1740 setOperationAction(ISD::FEXP10, VT, Expand);
1741 setOperationAction(ISD::FLOG, VT, Expand);
1742 setOperationAction(ISD::FLOG2, VT, Expand);
1743 setOperationAction(ISD::FLOG10, VT, Expand);
1744
1756 }
1757
1758 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1759 setOperationAction(ISD::BITCAST, VT, Custom);
1761 setOperationAction(ISD::FABS, VT, Custom);
1763 setOperationAction(ISD::FNEG, VT, Custom);
1764 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1766 setOperationAction(ISD::MLOAD, VT, Custom);
1774
1775 if (Subtarget->hasSVEB16B16() &&
1776 Subtarget->isNonStreamingSVEorSME2Available()) {
1779 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1780 setOperationAction(ISD::FMAXNUM, VT, Custom);
1781 setOperationAction(ISD::FMINIMUM, VT, Custom);
1782 setOperationAction(ISD::FMINNUM, VT, Custom);
1785 }
1786 }
1787
1788 for (auto Opcode :
1789 {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
1790 ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC, ISD::SETCC,
1791 ISD::VECREDUCE_FADD, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMAXIMUM,
1792 ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMINIMUM}) {
1793 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1794 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1795 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1796 }
1797
1798 if (!Subtarget->hasSVEB16B16() ||
1799 !Subtarget->isNonStreamingSVEorSME2Available()) {
1800 for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
1801 ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
1802 setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
1803 setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
1804 setOperationPromotedToType(Opcode, MVT::nxv8bf16, MVT::nxv8f32);
1805 }
1806 }
1807
1810
1811 // NEON doesn't support integer divides, but SVE does
1812 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1813 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1816 }
1817
1818 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1819 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1820 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1821
1822 // NOTE: Currently this has to happen after computeRegisterProperties rather
1823 // than the preferred option of combining it with the addRegisterClass call.
1824 if (Subtarget->useSVEForFixedLengthVectors()) {
1827 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1828 addTypeForFixedLengthSVE(VT);
1829 }
1832 VT, /*OverrideNEON=*/!Subtarget->isNeonAvailable()))
1833 addTypeForFixedLengthSVE(VT);
1834 }
1835
1836 // 64-bit results can come from inputs wider than NEON supports.
1837 for (auto VT : {MVT::v8i8, MVT::v4i16})
1840
1841 // 128-bit results imply an input wider than NEON supports.
1842 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1844 for (auto VT : {MVT::v8f16, MVT::v4f32})
1846
1847 // These operations are not supported on NEON but SVE can do them.
1849 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1850 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1851 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1852 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1853 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1854 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1855 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1856 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1857 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1858 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1859 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1860 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1861 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1862 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1863 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1864 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1865 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1866 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1867 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1868
1869 // Int operations with no NEON support.
1870 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1871 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1874 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1875 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1876 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1879 }
1880
1881 // Use SVE for vectors with more than 2 elements.
1882 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1883 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1884 }
1885
1886 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1887 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1888 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1889 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1890
1891 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1892
1893 for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
1895 }
1896
1897 // Handle partial reduction operations
1898 if (Subtarget->isSVEorStreamingSVEAvailable()) {
1899 // Mark known legal pairs as 'Legal' (these will expand to UDOT or SDOT).
1900 // Other pairs will default to 'Expand'.
1901 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1902 ISD::PARTIAL_REDUCE_UMLA};
1903 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv8i16, Legal);
1904 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv16i8, Legal);
1905
1906 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv16i8, Custom);
1907
1908 if (Subtarget->hasMatMulInt8()) {
1909 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv4i32,
1910 MVT::nxv16i8, Legal);
1911 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, MVT::nxv2i64,
1912 MVT::nxv16i8, Custom);
1913 }
1914
1915 // Wide add types
1916 if (Subtarget->hasSVE2() || Subtarget->hasSME()) {
1917 setPartialReduceMLAAction(MLAOps, MVT::nxv2i64, MVT::nxv4i32, Legal);
1918 setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal);
1919 setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal);
1920 }
1921
1922 // Handle floating-point partial reduction
1923 if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) {
1924 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32,
1925 MVT::nxv8f16, Legal);
1926 // We can use SVE2p1 fdot to emulate the fixed-length variant.
1927 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::v4f32,
1928 MVT::v8f16, Custom);
1929 }
1930 }
1931
1932 // Handle non-aliasing elements mask
1933 if (Subtarget->hasSVE2() ||
1934 (Subtarget->hasSME() && Subtarget->isStreaming())) {
1935 // FIXME: Support wider fixed-length types when msve-vector-bits is used.
1936 for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) {
1939 }
1940 for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) {
1943 }
1944 }
1945
1946 // Handle operations that are only available in non-streaming SVE mode.
1947 if (Subtarget->isSVEAvailable()) {
1948 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64,
1949 MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1950 MVT::nxv4f32, MVT::nxv2f64, MVT::nxv2bf16, MVT::nxv4bf16,
1951 MVT::nxv8bf16, MVT::v4f16, MVT::v8f16, MVT::v2f32,
1952 MVT::v4f32, MVT::v1f64, MVT::v2f64, MVT::v8i8,
1953 MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1954 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1955 setOperationAction(ISD::MGATHER, VT, Custom);
1956 setOperationAction(ISD::MSCATTER, VT, Custom);
1957 }
1958
1959 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1960 MVT::nxv4f32, MVT::nxv2f64, MVT::v4f16, MVT::v8f16,
1961 MVT::v2f32, MVT::v4f32, MVT::v2f64})
1962 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1963
1964 // We can lower types that have <vscale x {2|4}> elements to COMPACT.
1965 for (auto VT :
1966 {MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv2f32,
1967 MVT::nxv2f64, MVT::nxv4i8, MVT::nxv4i16, MVT::nxv4i32, MVT::nxv4f32})
1969
1970 // If we have SVE, we can use SVE logic for legal (or smaller than legal)
1971 // NEON vectors in the lowest bits of the SVE register.
1972 for (auto VT : {MVT::v2i8, MVT::v2i16, MVT::v2i32, MVT::v2i64, MVT::v2f32,
1973 MVT::v2f64, MVT::v4i8, MVT::v4i16, MVT::v4i32, MVT::v4f32})
1975
1976 // Histcnt is SVE2 only
1977 if (Subtarget->hasSVE2()) {
1978 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv4i32,
1979 Custom);
1980 setOperationAction(ISD::EXPERIMENTAL_VECTOR_HISTOGRAM, MVT::nxv2i64,
1981 Custom);
1982
1983 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
1984 ISD::PARTIAL_REDUCE_UMLA};
1985 // Must be lowered to SVE instructions.
1986 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v4i32, Custom);
1987 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v8i16, Custom);
1988 setPartialReduceMLAAction(MLAOps, MVT::v2i64, MVT::v16i8, Custom);
1989 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v8i16, Custom);
1990 setPartialReduceMLAAction(MLAOps, MVT::v4i32, MVT::v16i8, Custom);
1991 setPartialReduceMLAAction(MLAOps, MVT::v8i16, MVT::v16i8, Custom);
1992 }
1993 }
1994
1995 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1996 // Only required for llvm.aarch64.mops.memset.tag
1998 }
1999
2001
2002 if (Subtarget->hasSVE()) {
2003 setOperationAction(ISD::FLDEXP, MVT::f64, Custom);
2004 setOperationAction(ISD::FLDEXP, MVT::f32, Custom);
2005 setOperationAction(ISD::FLDEXP, MVT::f16, Custom);
2006 setOperationAction(ISD::FLDEXP, MVT::bf16, Custom);
2007 }
2008
2009 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
2010
2011 IsStrictFPEnabled = true;
2013
2014 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
2015 // it, but it's just a wrapper around ldexp.
2016 if (Subtarget->isTargetWindows()) {
2017 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2018 if (isOperationExpand(Op, MVT::f32))
2019 setOperationAction(Op, MVT::f32, Promote);
2020 }
2021
2022 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
2023 // isn't legal.
2024 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
2025 if (isOperationExpand(Op, MVT::f16))
2026 setOperationAction(Op, MVT::f16, Promote);
2027}
2028
2030 return static_cast<const AArch64TargetMachine &>(getTargetMachine());
2031}
2032
2033void AArch64TargetLowering::addTypeForNEON(MVT VT) {
2034 assert(VT.isVector() && "VT should be a vector type");
2035
2036 if (VT.isFloatingPoint()) {
2038 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
2039 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
2040 }
2041
2042 // Mark vector float intrinsics as expand.
2043 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
2044 setOperationAction(ISD::FSIN, VT, Expand);
2045 setOperationAction(ISD::FCOS, VT, Expand);
2046 setOperationAction(ISD::FTAN, VT, Expand);
2047 setOperationAction(ISD::FASIN, VT, Expand);
2048 setOperationAction(ISD::FACOS, VT, Expand);
2049 setOperationAction(ISD::FATAN, VT, Expand);
2050 setOperationAction(ISD::FATAN2, VT, Expand);
2051 setOperationAction(ISD::FSINH, VT, Expand);
2052 setOperationAction(ISD::FCOSH, VT, Expand);
2053 setOperationAction(ISD::FTANH, VT, Expand);
2054 setOperationAction(ISD::FPOW, VT, Expand);
2055 setOperationAction(ISD::FLOG, VT, Expand);
2056 setOperationAction(ISD::FLOG2, VT, Expand);
2057 setOperationAction(ISD::FLOG10, VT, Expand);
2058 setOperationAction(ISD::FEXP, VT, Expand);
2059 setOperationAction(ISD::FEXP2, VT, Expand);
2060 setOperationAction(ISD::FEXP10, VT, Expand);
2061 }
2062
2063 // But we do support custom-lowering for FCOPYSIGN.
2064 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
2065 ((VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v4f16 ||
2066 VT == MVT::v8f16) &&
2067 Subtarget->hasFullFP16()))
2069
2082
2086 for (MVT InnerVT : MVT::all_valuetypes())
2087 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
2088
2089 // CNT supports only B element sizes; wider elements use CNT on bytes followed by UADDLP to widen.
2090 if (VT != MVT::v8i8 && VT != MVT::v16i8)
2092
2098
2099 for (unsigned Opcode :
2102 setOperationAction(Opcode, VT, Custom);
2103
2104 if (!VT.isFloatingPoint())
2106
2107 // [SU][MIN|MAX] are available for all NEON types apart from i64.
2108 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
2109 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
2110 setOperationAction(Opcode, VT, Legal);
2111
2112 // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
2113 // NEON types.
2114 if (VT.isFloatingPoint() &&
2115 VT.getVectorElementType() != MVT::bf16 &&
2116 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
2117 for (unsigned Opcode :
2118 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
2119 ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE, ISD::STRICT_FMINIMUM,
2123 setOperationAction(Opcode, VT, Legal);
2124
2125 // Strict fp extend and trunc are legal
2126 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
2128 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
2130
2131 // FIXME: We could potentially make use of the vector comparison instructions
2132 // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
2133 // complications:
2134 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
2135 // so we would need to expand when the condition code doesn't match the
2136 // kind of comparison.
2137 // * Some kinds of comparison require more than one FCMXY instruction so
2138 // would need to be expanded instead.
2139 // * The lowering of the non-strict versions involves target-specific ISD
2140 // nodes so we would likely need to add strict versions of all of them and
2141 // handle them appropriately.
2144
2145 // When little-endian we can use ordinary d and q register loads/stores for
2146 // vector types, but when big-endian we need to use structure load/store which
2147 // only allow post-index addressing.
2148 if (Subtarget->isLittleEndian()) {
2149 for (unsigned im = (unsigned)ISD::PRE_INC;
2150 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
2153 }
2154 } else {
2157 }
2158
2159 if (Subtarget->hasD128()) {
2162 }
2163
2164 if (VT.isInteger()) {
2165 // Let common code emit inverted variants of compares we do support.
2171 }
2172}
2173
2175 EVT OpVT) const {
2176 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
2177 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
2178 ResVT.getVectorElementType() != MVT::i1)
2179 return true;
2180
2181 // Only support illegal types if the result is scalable and min elements > 1.
2182 if (ResVT.getVectorMinNumElements() == 1 ||
2183 (ResVT.isFixedLengthVector() && (ResVT.getVectorNumElements() > 16 ||
2184 (OpVT != MVT::i32 && OpVT != MVT::i64))))
2185 return true;
2186
2187 // 32 & 64 bit operands are supported. We can promote anything < 64 bits,
2188 // but anything larger should be expanded.
2189 if (OpVT.getFixedSizeInBits() > 64)
2190 return true;
2191
2192 return false;
2193}
2194
2196 if (!Subtarget->isSVEorStreamingSVEAvailable())
2197 return true;
2198
2199 // We can only use the BRKB + CNTP sequence with legal predicate types. We can
2200 // also support fixed-width predicates.
2201 return VT != MVT::nxv16i1 && VT != MVT::nxv8i1 && VT != MVT::nxv4i1 &&
2202 VT != MVT::nxv2i1 && VT != MVT::v16i1 && VT != MVT::v8i1 &&
2203 VT != MVT::v4i1 && VT != MVT::v2i1;
2204}
2205
2207 unsigned SearchSize) const {
2208 // MATCH is SVE2 and only available in non-streaming mode.
2209 if (!Subtarget->hasSVE2() || !Subtarget->isSVEAvailable())
2210 return true;
2211 // Furthermore, we can only use it for 8-bit or 16-bit elements.
2212 if (VT == MVT::nxv8i16 || VT == MVT::v8i16)
2213 return SearchSize != 8;
2214 if (VT == MVT::nxv16i8 || VT == MVT::v16i8 || VT == MVT::v8i8)
2215 return SearchSize != 8 && SearchSize != 16;
2216 return true;
2217}
2218
2219void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
2220 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
2221
2222 // By default everything must be expanded.
2223 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
2225
2226 if (VT.isFloatingPoint()) {
2236 }
2237
2239 VT == MVT::v1f64 ? Expand : Custom;
2240
2241 // Mark integer truncating stores/extending loads as having custom lowering
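// (e.g. for v4i32 this covers truncating stores to, and extending loads from,
// v4i8 and v4i16)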
2242 if (VT.isInteger()) {
2243 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
2244 while (InnerVT != VT) {
2245 setTruncStoreAction(VT, InnerVT, Default);
2246 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Default);
2247 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Default);
2248 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2249 InnerVT = InnerVT.changeVectorElementType(
2250 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
2251 }
2252 }
2253
2254 // Mark floating-point truncating stores/extending loads as having custom
2255 // lowering
2256 if (VT.isFloatingPoint()) {
2257 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
2258 while (InnerVT != VT) {
2259 setTruncStoreAction(VT, InnerVT, Custom);
2260 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Default);
2261 InnerVT = InnerVT.changeVectorElementType(
2263 }
2264 }
2265
2266 bool PreferNEON = VT.is64BitVector() || VT.is128BitVector();
2267 bool PreferSVE = !PreferNEON && Subtarget->isSVEAvailable();
2268
2269 static const unsigned MLAOps[] = {ISD::PARTIAL_REDUCE_SMLA,
2270 ISD::PARTIAL_REDUCE_UMLA};
2271 unsigned NumElts = VT.getVectorNumElements();
2272 if (VT.getVectorElementType() == MVT::i64) {
2273 setPartialReduceMLAAction(MLAOps, VT,
2274 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2275 setPartialReduceMLAAction(MLAOps, VT,
2276 MVT::getVectorVT(MVT::i16, NumElts * 4), Custom);
2277 setPartialReduceMLAAction(MLAOps, VT,
2278 MVT::getVectorVT(MVT::i32, NumElts * 2), Custom);
2279 } else if (VT.getVectorElementType() == MVT::i32) {
2280 setPartialReduceMLAAction(MLAOps, VT,
2281 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2282 setPartialReduceMLAAction(MLAOps, VT,
2283 MVT::getVectorVT(MVT::i16, NumElts * 2), Custom);
2284 } else if (VT.getVectorElementType() == MVT::i16) {
2285 setPartialReduceMLAAction(MLAOps, VT,
2286 MVT::getVectorVT(MVT::i8, NumElts * 2), Custom);
2287 }
2288 if (Subtarget->hasMatMulInt8()) {
2289 if (VT.getVectorElementType() == MVT::i32)
2290 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2291 MVT::getVectorVT(MVT::i8, NumElts * 4), Custom);
2292 else if (VT.getVectorElementType() == MVT::i64)
2293 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_SUMLA, VT,
2294 MVT::getVectorVT(MVT::i8, NumElts * 8), Custom);
2295 }
2296
2297 if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) {
2298 setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT,
2299 MVT::getVectorVT(MVT::f16, NumElts * 2), Custom);
2300 }
2301
2302 // Lower fixed length vector operations to scalable equivalents.
2309 setOperationAction(ISD::BITCAST, VT, PreferNEON ? Legal : Default);
2319 setOperationAction(ISD::FABS, VT, Default);
2321 setOperationAction(ISD::FCEIL, VT, Default);
2324 setOperationAction(ISD::FFLOOR, VT, Default);
2326 setOperationAction(ISD::FMAXIMUM, VT, Default);
2327 setOperationAction(ISD::FMAXNUM, VT, Default);
2328 setOperationAction(ISD::FMINIMUM, VT, Default);
2329 setOperationAction(ISD::FMINNUM, VT, Default);
2331 setOperationAction(ISD::FNEARBYINT, VT, Default);
2332 setOperationAction(ISD::FNEG, VT, Default);
2333 setOperationAction(ISD::FP_EXTEND, VT, Default);
2337 setOperationAction(ISD::FRINT, VT, Default);
2338 setOperationAction(ISD::LRINT, VT, Default);
2339 setOperationAction(ISD::LLRINT, VT, Default);
2340 setOperationAction(ISD::FROUND, VT, Default);
2341 setOperationAction(ISD::FROUNDEVEN, VT, Default);
2342 setOperationAction(ISD::FSQRT, VT, Default);
2344 setOperationAction(ISD::FTRUNC, VT, Default);
2345 setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Default);
2347 setOperationAction(ISD::LOAD, VT, PreferNEON ? Legal : Default);
2348 setOperationAction(ISD::MGATHER, VT, PreferSVE ? Default : Expand);
2349 setOperationAction(ISD::MLOAD, VT, Default);
2350 setOperationAction(ISD::MSCATTER, VT, PreferSVE ? Default : Expand);
2351 setOperationAction(ISD::MSTORE, VT, Default);
2369 setOperationAction(ISD::STORE, VT, PreferNEON ? Legal : Default);
2376 setOperationAction(ISD::VECREDUCE_ADD, VT, Default);
2377 setOperationAction(ISD::VECREDUCE_AND, VT, Default);
2378 setOperationAction(ISD::VECREDUCE_FADD, VT, Default);
2379 setOperationAction(ISD::VECREDUCE_FMAX, VT, Default);
2380 setOperationAction(ISD::VECREDUCE_FMIN, VT, Default);
2381 setOperationAction(ISD::VECREDUCE_FMAXIMUM, VT, Default);
2382 setOperationAction(ISD::VECREDUCE_FMINIMUM, VT, Default);
2383 setOperationAction(ISD::VECREDUCE_OR, VT, Default);
2384 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, PreferSVE ? Default : Expand);
2385 setOperationAction(ISD::VECREDUCE_SMAX, VT, Default);
2386 setOperationAction(ISD::VECREDUCE_SMIN, VT, Default);
2387 setOperationAction(ISD::VECREDUCE_UMAX, VT, Default);
2388 setOperationAction(ISD::VECREDUCE_UMIN, VT, Default);
2389 setOperationAction(ISD::VECREDUCE_XOR, VT, Default);
2395}
2396
2397void AArch64TargetLowering::addDRType(MVT VT) {
2398 addRegisterClass(VT, &AArch64::FPR64RegClass);
2399 if (Subtarget->isNeonAvailable())
2400 addTypeForNEON(VT);
2401}
2402
2403void AArch64TargetLowering::addQRType(MVT VT) {
2404 addRegisterClass(VT, &AArch64::FPR128RegClass);
2405 if (Subtarget->isNeonAvailable())
2406 addTypeForNEON(VT);
2407}
2408
2410 LLVMContext &C, EVT VT) const {
2411 if (!VT.isVector())
2412 return MVT::i32;
2413 if (VT.isScalableVector())
2414 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
2416}
2417
2418// isIntImmediate - This method tests to see if the node is a constant
2419// operand. If so Imm will receive the value.
2420static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
2422 Imm = C->getZExtValue();
2423 return true;
2424 }
2425 return false;
2426}
2427
2428bool isVectorizedBinOp(unsigned Opcode) {
2429 switch (Opcode) {
2430 case AArch64ISD::SQDMULH:
2431 return true;
2432 default:
2433 return false;
2434 }
2435}
2436
2437// isOpcWithIntImmediate - This method tests to see if the node is a specific
2438 // opcode and that it has an immediate integer right operand.
2439// If so Imm will receive the value.
2440static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
2441 uint64_t &Imm) {
2442 return N->getOpcode() == Opc &&
2443 isIntImmediate(N->getOperand(1).getNode(), Imm);
2444}
2445
2446static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
2447 const APInt &Demanded,
2449 unsigned NewOpc) {
2450 uint64_t OldImm = Imm, NewImm, Enc;
2451 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
2452
2453 // Return if the immediate is already all zeros, all ones, a bimm32 or a
2454 // bimm64.
2455 if (Imm == 0 || Imm == Mask ||
2457 return false;
2458
2459 unsigned EltSize = Size;
2460 uint64_t DemandedBits = Demanded.getZExtValue();
2461
2462 // Clear bits that are not demanded.
2463 Imm &= DemandedBits;
2464
2465 while (true) {
2466 // The goal here is to set the non-demanded bits in a way that minimizes
2467 // the number of switching between 0 and 1. In order to achieve this goal,
2468 // we set the non-demanded bits to the value of the preceding demanded bits.
2469 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
2470 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
2471 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
2472 // The final result is 0b11000011.
2473 uint64_t NonDemandedBits = ~DemandedBits;
2474 uint64_t InvertedImm = ~Imm & DemandedBits;
2475 uint64_t RotatedImm =
2476 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
2477 NonDemandedBits;
2478 uint64_t Sum = RotatedImm + NonDemandedBits;
2479 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
2480 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
2481 NewImm = (Imm | Ones) & Mask;
2482
2483 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
2484 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
2485 // we halve the element size and continue the search.
2486 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
2487 break;
2488
2489 // We cannot shrink the element size any further if it is 2-bits.
2490 if (EltSize == 2)
2491 return false;
2492
2493 EltSize /= 2;
2494 Mask >>= EltSize;
2495 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
2496
2497 // Return if there is mismatch in any of the demanded bits of Imm and Hi.
2498 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
2499 return false;
2500
2501 // Merge the upper and lower halves of Imm and DemandedBits.
2502 Imm |= Hi;
2503 DemandedBits |= DemandedBitsHi;
2504 }
2505
2506 ++NumOptimizedImms;
2507
2508 // Replicate the element across the register width.
2509 while (EltSize < Size) {
2510 NewImm |= NewImm << EltSize;
2511 EltSize *= 2;
2512 }
2513
2514 (void)OldImm;
2515 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
2516 "demanded bits should never be altered");
2517 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
2518
2519 // Create the new constant immediate node.
2520 EVT VT = Op.getValueType();
2521 SDLoc DL(Op);
2522 SDValue New;
2523
2524 // If the new constant immediate is all-zeros or all-ones, let the target
2525 // independent DAG combine optimize this node.
2526 if (NewImm == 0 || NewImm == OrigMask) {
2527 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
2528 TLO.DAG.getConstant(NewImm, DL, VT));
2529 // Otherwise, create a machine node so that target independent DAG combine
2530 // doesn't undo this optimization.
2531 } else {
2533 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
2534 New = SDValue(
2535 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
2536 }
2537
2538 return TLO.CombineTo(Op, New);
2539}
2540
2542 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2543 TargetLoweringOpt &TLO) const {
2544 // Delay this optimization to as late as possible.
2545 if (!TLO.LegalOps)
2546 return false;
2547
2549 return false;
2550
2551 EVT VT = Op.getValueType();
2552 if (VT.isVector())
2553 return false;
2554
2555 unsigned Size = VT.getSizeInBits();
2556
2557 if (Size != 32 && Size != 64)
2558 return false;
2559
2560 // Exit early if we demand all bits.
2561 if (DemandedBits.isAllOnes())
2562 return false;
2563
2564 unsigned NewOpc;
2565 switch (Op.getOpcode()) {
2566 default:
2567 return false;
2568 case ISD::AND:
2569 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
2570 break;
2571 case ISD::OR:
2572 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
2573 break;
2574 case ISD::XOR:
2575 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
2576 break;
2577 }
2578 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
2579 if (!C)
2580 return false;
2581 uint64_t Imm = C->getZExtValue();
2582 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
2583}
2584
2585/// computeKnownBitsForTargetNode - Determine which of the bits specified in
2586/// Mask are known to be either zero or one and return them Known.
2588 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
2589 const SelectionDAG &DAG, unsigned Depth) const {
2590 switch (Op.getOpcode()) {
2591 default:
2592 break;
2593 case AArch64ISD::DUP: {
2594 SDValue SrcOp = Op.getOperand(0);
2595 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
2596 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
2597 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
2598 "Expected DUP implicit truncation");
2599 Known = Known.trunc(Op.getScalarValueSizeInBits());
2600 }
2601 break;
2602 }
2603 case AArch64ISD::CSEL: {
2604 KnownBits Known2;
2605 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2606 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2607 Known = Known.intersectWith(Known2);
2608 break;
2609 }
2610 case AArch64ISD::CSNEG:
2611 case AArch64ISD::CSINC:
2612 case AArch64ISD::CSINV: {
2613 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2614 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2615
2616 // The result is either:
2617 // CSINC: KnownOp0 or KnownOp1 + 1
2618 // CSINV: KnownOp0 or ~KnownOp1
2619 // CSNEG: KnownOp0 or KnownOp1 * -1
2620 if (Op.getOpcode() == AArch64ISD::CSINC)
2621 KnownOp1 = KnownBits::add(
2622 KnownOp1,
2623 KnownBits::makeConstant(APInt(Op.getScalarValueSizeInBits(), 1)));
2624 else if (Op.getOpcode() == AArch64ISD::CSINV)
2625 std::swap(KnownOp1.Zero, KnownOp1.One);
2626 else if (Op.getOpcode() == AArch64ISD::CSNEG)
2627 KnownOp1 =
2629 Op.getScalarValueSizeInBits())));
2630
2631 Known = KnownOp0.intersectWith(KnownOp1);
2632 break;
2633 }
2634 case AArch64ISD::BICi: {
2635 // Compute the bit cleared value.
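// BICi computes op0 & ~(imm << shift), so the bits selected by the shifted
// immediate are known to be clear in the result.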
2636 APInt Mask =
2637 ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2))
2638 .trunc(Known.getBitWidth());
2639 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2640 Known &= KnownBits::makeConstant(Mask);
2641 break;
2642 }
2643 case AArch64ISD::VLSHR: {
2644 KnownBits Known2;
2645 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2646 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2647 Known = KnownBits::lshr(Known, Known2);
2648 break;
2649 }
2650 case AArch64ISD::VASHR: {
2651 KnownBits Known2;
2652 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2653 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2654 Known = KnownBits::ashr(Known, Known2);
2655 break;
2656 }
2657 case AArch64ISD::VSHL: {
2658 KnownBits Known2;
2659 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2660 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
2661 Known = KnownBits::shl(Known, Known2);
2662 break;
2663 }
2664 case AArch64ISD::MOVI: {
2666 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)));
2667 break;
2668 }
2669 case AArch64ISD::MOVIshift: {
2671 APInt(Known.getBitWidth(), Op->getConstantOperandVal(0)
2672 << Op->getConstantOperandVal(1)));
2673 break;
2674 }
2675 case AArch64ISD::MOVImsl: {
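// MOVI with an MSL modifier shifts ones (not zeros) into the low bits, hence
// the materialized value is ~(~imm << shift).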
2676 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2678 Known.getBitWidth(), ~(~Op->getConstantOperandVal(0) << ShiftAmt)));
2679 break;
2680 }
2681 case AArch64ISD::MOVIedit: {
2683 Known.getBitWidth(),
2684 AArch64_AM::decodeAdvSIMDModImmType10(Op->getConstantOperandVal(0))));
2685 break;
2686 }
2687 case AArch64ISD::MVNIshift: {
2689 APInt(Known.getBitWidth(),
2690 ~(Op->getConstantOperandVal(0) << Op->getConstantOperandVal(1)),
2691 /*isSigned*/ false, /*implicitTrunc*/ true));
2692 break;
2693 }
2694 case AArch64ISD::MVNImsl: {
2695 unsigned ShiftAmt = AArch64_AM::getShiftValue(Op->getConstantOperandVal(1));
2697 APInt(Known.getBitWidth(), (~Op->getConstantOperandVal(0) << ShiftAmt),
2698 /*isSigned*/ false, /*implicitTrunc*/ true));
2699 break;
2700 }
2701 case AArch64ISD::LOADgot:
2702 case AArch64ISD::ADDlow: {
2703 if (!Subtarget->isTargetILP32())
2704 break;
2705 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
2706 Known.Zero = APInt::getHighBitsSet(64, 32);
2707 break;
2708 }
2709 case AArch64ISD::ASSERT_ZEXT_BOOL: {
2710 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
2711 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
2712 break;
2713 }
2715 Intrinsic::ID IntID =
2716 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
2717 switch (IntID) {
2718 default: return;
2719 case Intrinsic::aarch64_ldaxr:
2720 case Intrinsic::aarch64_ldxr: {
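// These exclusive loads zero-extend the loaded value to the result width, so
// every bit above the memory width is known to be zero.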
2721 unsigned BitWidth = Known.getBitWidth();
2722 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
2723 unsigned MemBits = VT.getScalarSizeInBits();
2724 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
2725 return;
2726 }
2727 }
2728 break;
2729 }
2731 case ISD::INTRINSIC_VOID: {
2732 unsigned IntNo = Op.getConstantOperandVal(0);
2733 switch (IntNo) {
2734 default:
2735 break;
2736 case Intrinsic::aarch64_neon_uaddlv: {
2737 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2738 unsigned BitWidth = Known.getBitWidth();
2739 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2740 unsigned Bound = (VT == MVT::v8i8) ? 11 : 12;
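// The unsigned sum of 8 byte lanes is at most 8 * 255 < 2^11 and of 16 byte
// lanes at most 16 * 255 < 2^12, so all bits from Bound upwards are zero.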
2741 assert(BitWidth >= Bound && "Unexpected width!");
2743 Known.Zero |= Mask;
2744 }
2745 break;
2746 }
2747 case Intrinsic::aarch64_neon_umaxv:
2748 case Intrinsic::aarch64_neon_uminv: {
2749 // Figure out the datatype of the vector operand. The UMINV instruction
2750 // will zero extend the result, so we can mark as known zero all the
2751 // bits larger than the element datatype. 32-bit or larger doesn't need
2752 // this as those are legal types and will be handled by isel directly.
2753 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
2754 unsigned BitWidth = Known.getBitWidth();
2755 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
2756 assert(BitWidth >= 8 && "Unexpected width!");
2758 Known.Zero |= Mask;
2759 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
2760 assert(BitWidth >= 16 && "Unexpected width!");
2762 Known.Zero |= Mask;
2763 }
2764 break;
2765 }
2766 }
2767 }
2768 }
2769}
2770
2772 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
2773 unsigned Depth) const {
2774 EVT VT = Op.getValueType();
2775 unsigned VTBits = VT.getScalarSizeInBits();
2776 unsigned Opcode = Op.getOpcode();
2777 switch (Opcode) {
2778 case AArch64ISD::FCMEQ:
2779 case AArch64ISD::FCMGE:
2780 case AArch64ISD::FCMGT:
2781 // Compares return either 0 or all-ones
2782 return VTBits;
2783 case AArch64ISD::VASHR: {
2784 unsigned Tmp =
2785 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
2786 return std::min<uint64_t>(Tmp + Op.getConstantOperandVal(1), VTBits);
2787 }
2788 }
2789
2790 return 1;
2791}
2792
2794 EVT) const {
2795 return MVT::i64;
2796}
2797
2799 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2800 unsigned *Fast) const {
2801
2802 // Allow SVE loads/stores where the alignment >= the size of the element type,
2803 // even with +strict-align. Predicated SVE loads/stores (e.g. ld1/st1), used
2804 // for stores that come from IR, only require element-size alignment (even if
2805 // unaligned accesses are disabled). Without this, these will be forced to
2806 // have 16-byte alignment with +strict-align (and fail to lower as we don't
2807 // yet support TLI.expandUnalignedLoad() and TLI.expandUnalignedStore()).
2808 if (VT.isScalableVector()) {
2809 unsigned ElementSizeBits = VT.getScalarSizeInBits();
2810 if (ElementSizeBits % 8 == 0 && Alignment >= Align(ElementSizeBits / 8))
2811 return true;
2812 }
2813
2814 if (Subtarget->requiresStrictAlign())
2815 return false;
2816
2817 if (Fast) {
2818 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2819 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2820 // See comments in performSTORECombine() for more details about
2821 // these conditions.
2822
2823 // Code that uses clang vector extensions can mark that it
2824 // wants unaligned accesses to be treated as fast by
2825 // underspecifying alignment to be 1 or 2.
2826 Alignment <= 2 ||
2827
2828 // Disregard v2i64. Memcpy lowering produces those and splitting
2829 // them regresses performance on micro-benchmarks and olden/bh.
2830 VT == MVT::v2i64;
2831 }
2832 return true;
2833}
2834
2835// Same as above but handling LLTs instead.
2837 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2838 unsigned *Fast) const {
2839 if (Subtarget->requiresStrictAlign())
2840 return false;
2841
2842 if (Fast) {
2843 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2844 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2845 Ty.getSizeInBytes() != 16 ||
2846 // See comments in performSTORECombine() for more details about
2847 // these conditions.
2848
2849 // Code that uses clang vector extensions can mark that it
2850 // wants unaligned accesses to be treated as fast by
2851 // underspecifying alignment to be 1 or 2.
2852 Alignment <= 2 ||
2853
2854 // Disregard v2i64. Memcpy lowering produces those and splitting
2855 // them regresses performance on micro-benchmarks and olden/bh.
2856 Ty == LLT::fixed_vector(2, 64);
2857 }
2858 return true;
2859}
2860
2861FastISel *
2863 const TargetLibraryInfo *libInfo) const {
2864 return AArch64::createFastISel(funcInfo, libInfo);
2865}
2866
2869 MachineBasicBlock *MBB) const {
2870 // We materialise the F128CSEL pseudo-instruction as some control flow and a
2871 // phi node:
2872
2873 // OrigBB:
2874 // [... previous instrs leading to comparison ...]
2875 // b.ne TrueBB
2876 // b EndBB
2877 // TrueBB:
2878 // ; Fallthrough
2879 // EndBB:
2880 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2881
2882 MachineFunction *MF = MBB->getParent();
2883 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2884 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2885 DebugLoc DL = MI.getDebugLoc();
2886 MachineFunction::iterator It = ++MBB->getIterator();
2887
2888 Register DestReg = MI.getOperand(0).getReg();
2889 Register IfTrueReg = MI.getOperand(1).getReg();
2890 Register IfFalseReg = MI.getOperand(2).getReg();
2891 unsigned CondCode = MI.getOperand(3).getImm();
2892 bool NZCVKilled = MI.getOperand(4).isKill();
2893
2894 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2895 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2896 MF->insert(It, TrueBB);
2897 MF->insert(It, EndBB);
2898
2899 // Transfer rest of current basic-block to EndBB
2900 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2901 MBB->end());
2903
2904 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2905 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2906 MBB->addSuccessor(TrueBB);
2907 MBB->addSuccessor(EndBB);
2908
2909 // TrueBB falls through to the end.
2910 TrueBB->addSuccessor(EndBB);
2911
2912 if (!NZCVKilled) {
2913 TrueBB->addLiveIn(AArch64::NZCV);
2914 EndBB->addLiveIn(AArch64::NZCV);
2915 }
2916
2917 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2918 .addReg(IfTrueReg)
2919 .addMBB(TrueBB)
2920 .addReg(IfFalseReg)
2921 .addMBB(MBB);
2922
2923 MI.eraseFromParent();
2924 return EndBB;
2925}
2926
2934
2937 MachineBasicBlock *MBB) const {
2938 MachineFunction &MF = *MBB->getParent();
2939 MachineBasicBlock::iterator MBBI = MI.getIterator();
2940 const AArch64InstrInfo &TII =
2941 *MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
2942 Register TargetReg = MI.getOperand(0).getReg();
2944 TII.probedStackAlloc(MBBI, TargetReg, false);
2945
2946 MI.eraseFromParent();
2947 return NextInst->getParent();
2948}
2949
2952 MachineBasicBlock *MBB) const {
2953 MachineFunction *MF = MBB->getParent();
2955
2956 const TargetRegisterClass *RC_GPR = &AArch64::GPR64RegClass;
2957 const TargetRegisterClass *RC_GPRsp = &AArch64::GPR64spRegClass;
2958
2959 Register RegVL_GPR = MRI.createVirtualRegister(RC_GPR);
2960 Register RegVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL src
2961 Register RegSVL_GPR = MRI.createVirtualRegister(RC_GPR);
2962 Register RegSVL_GPRsp = MRI.createVirtualRegister(RC_GPRsp); // for ADDSVL dst
2963
2964 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2965 DebugLoc DL = MI.getDebugLoc();
2966
2967 // RDVL requires GPR64, ADDSVL requires GPR64sp.
2968 // We need to insert COPY instructions; these will later be removed by the
2969 // RegisterCoalescer.
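// The sequence computes VL - SVL: RDVL #1 reads the non-streaming vector
// length in bytes and ADDSVL with #-1 subtracts the streaming vector length;
// a zero difference means the two vector lengths match.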
2970 BuildMI(*MBB, MI, DL, TII->get(AArch64::RDVLI_XI), RegVL_GPR).addImm(1);
2971 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegVL_GPRsp)
2972 .addReg(RegVL_GPR);
2973
2974 BuildMI(*MBB, MI, DL, TII->get(AArch64::ADDSVL_XXI), RegSVL_GPRsp)
2975 .addReg(RegVL_GPRsp)
2976 .addImm(-1);
2977 BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY), RegSVL_GPR)
2978 .addReg(RegSVL_GPRsp);
2979
2980 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2981 MachineFunction::iterator It = ++MBB->getIterator();
2982 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(LLVM_BB);
2983 MachineBasicBlock *PassBB = MF->CreateMachineBasicBlock(LLVM_BB);
2984 MF->insert(It, TrapBB);
2985 MF->insert(It, PassBB);
2986
2987 // Continue if vector lengths match
2988 BuildMI(*MBB, MI, DL, TII->get(AArch64::CBZX))
2989 .addReg(RegSVL_GPR)
2990 .addMBB(PassBB);
2991
2992 // Transfer rest of current BB to PassBB
2993 PassBB->splice(PassBB->begin(), MBB,
2994 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
2996
2997 // Trap if vector lengths mismatch
2998 BuildMI(TrapBB, DL, TII->get(AArch64::BRK)).addImm(1);
2999
3000 MBB->addSuccessor(TrapBB);
3001 MBB->addSuccessor(PassBB);
3002
3003 MI.eraseFromParent();
3004 return PassBB;
3005}
3006
3008AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
3010 MachineBasicBlock *BB) const {
3011 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3012 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3013
3014 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
3015 MIB.add(MI.getOperand(1)); // slice index register
3016 MIB.add(MI.getOperand(2)); // slice index offset
3017 MIB.add(MI.getOperand(3)); // pg
3018 MIB.add(MI.getOperand(4)); // base
3019 MIB.add(MI.getOperand(5)); // offset
3020
3021 MI.eraseFromParent(); // The pseudo is gone now.
3022 return BB;
3023}
3024
3027 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3029 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
3030
3031 MIB.addReg(AArch64::ZA, RegState::Define);
3032 MIB.add(MI.getOperand(0)); // Vector select register
3033 MIB.add(MI.getOperand(1)); // Vector select offset
3034 MIB.add(MI.getOperand(2)); // Base
3035 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
3036
3037 MI.eraseFromParent(); // The pseudo is gone now.
3038 return BB;
3039}
3040
3043 unsigned Opcode,
3044 bool Op0IsDef) const {
3045 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3047
3048 MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opcode))
3049 .addReg(MI.getOperand(0).getReg(), Op0IsDef ? RegState::Define : 0);
3050 for (unsigned I = 1; I < MI.getNumOperands(); ++I)
3051 MIB.add(MI.getOperand(I));
3052
3053 MI.eraseFromParent(); // The pseudo is gone now.
3054 return BB;
3055}
3056
3058AArch64TargetLowering::EmitZAInstr(unsigned Opc, unsigned BaseReg,
3060 MachineBasicBlock *BB) const {
3061 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3062 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
3063 unsigned StartIdx = 0;
3064
3065 bool HasTile = BaseReg != AArch64::ZA;
3066 bool HasZPROut = HasTile && MI.getOperand(0).isReg();
3067 if (HasZPROut) {
3068 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3069 ++StartIdx;
3070 }
3071 if (HasTile) {
3072 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm(),
3073 RegState::Define); // Output ZA Tile
3074 MIB.addReg(BaseReg + MI.getOperand(StartIdx).getImm()); // Input Za Tile
3075 StartIdx++;
3076 } else {
3077 // Avoids all instructions with mnemonic za.<sz>[Reg, Imm,
3078 if (MI.getOperand(0).isReg() && !MI.getOperand(1).isImm()) {
3079 MIB.add(MI.getOperand(StartIdx)); // Output ZPR
3080 ++StartIdx;
3081 }
3082 MIB.addReg(BaseReg, RegState::Define).addReg(BaseReg);
3083 }
3084 for (unsigned I = StartIdx; I < MI.getNumOperands(); ++I)
3085 MIB.add(MI.getOperand(I));
3086
3087 MI.eraseFromParent(); // The pseudo is gone now.
3088 return BB;
3089}
3090
3093 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3095 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
3096 MIB.add(MI.getOperand(0)); // Mask
3097
3098 unsigned Mask = MI.getOperand(0).getImm();
3099 for (unsigned I = 0; I < 8; I++) {
3100 if (Mask & (1 << I))
3101 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
3102 }
3103
3104 MI.eraseFromParent(); // The pseudo is gone now.
3105 return BB;
3106}
3107
3110 MachineBasicBlock *BB) const {
3111 MachineFunction *MF = BB->getParent();
3112 MachineFrameInfo &MFI = MF->getFrameInfo();
3114 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3115 if (TPIDR2.Uses > 0) {
3116 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
3117 // generally don't support big-endian SVE/SME.
3118 if (!Subtarget->isLittleEndian())
3120 "TPIDR2 block initialization is not supported on big-endian targets");
3121
3122 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3123 // Store buffer pointer and num_za_save_slices.
3124 // Bytes 10-15 are implicitly zeroed.
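// The STP below writes 16 bytes: the save-buffer pointer at offset 0 and the
// X register holding num_za_save_slices at offset 8. Since the slice count
// fits in 16 bits, the upper bytes of that register supply the zeros.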
3125 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STPXi))
3126 .addReg(MI.getOperand(0).getReg())
3127 .addReg(MI.getOperand(1).getReg())
3128 .addFrameIndex(TPIDR2.FrameIndex)
3129 .addImm(0);
3130 } else
3131 MFI.RemoveStackObject(TPIDR2.FrameIndex);
3132
3133 BB->remove_instr(&MI);
3134 return BB;
3135}
3136
3139 MachineBasicBlock *BB) const {
3140 MachineFunction *MF = BB->getParent();
3141 MachineFrameInfo &MFI = MF->getFrameInfo();
3143 // TODO This function grows the stack with a subtraction, which doesn't work
3144 // on Windows. Some refactoring to share the functionality in
3145 // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3146 // supports SME
3148 "Lazy ZA save is not yet supported on Windows");
3149
3150 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3151
3152 if (TPIDR2.Uses > 0) {
3153 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3155
3156 // The SUBXrs below won't always be emitted in a form that accepts SP
3157 // directly
3158 Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3159 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3160 .addReg(AArch64::SP);
3161
3162 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
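// MSUB computes Dest = SP - Size * Size in a single instruction; the result
// then serves as both the buffer pointer and the new stack pointer.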
3163 auto Size = MI.getOperand(1).getReg();
3164 auto Dest = MI.getOperand(0).getReg();
3165 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3166 .addReg(Size)
3167 .addReg(Size)
3168 .addReg(SP);
3169 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3170 AArch64::SP)
3171 .addReg(Dest);
3172
3173 // We have just allocated a variable sized object, tell this to PEI.
3174 MFI.CreateVariableSizedObject(Align(16), nullptr);
3175 }
3176
3177 BB->remove_instr(&MI);
3178 return BB;
3179}
3180
3181// TODO: Find a way to merge this with EmitAllocateZABuffer.
3184 MachineBasicBlock *BB) const {
3185 MachineFunction *MF = BB->getParent();
3186 MachineFrameInfo &MFI = MF->getFrameInfo();
3189 "Lazy ZA save is not yet supported on Windows");
3190
3191 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3192 if (FuncInfo->isSMESaveBufferUsed()) {
3193 // Allocate a buffer object of the size given by MI.getOperand(1).
3194 auto Size = MI.getOperand(1).getReg();
3195 auto Dest = MI.getOperand(0).getReg();
3196 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrx64), AArch64::SP)
3197 .addReg(AArch64::SP)
3198 .addReg(Size)
3200 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Dest)
3201 .addReg(AArch64::SP);
3202
3203 // We have just allocated a variable sized object, tell this to PEI.
3204 MFI.CreateVariableSizedObject(Align(16), nullptr);
3205 } else
3206 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
3207 MI.getOperand(0).getReg());
3208
3209 BB->remove_instr(&MI);
3210 return BB;
3211}
3212
3215 MachineBasicBlock *BB) const {
3216 // If the buffer is used, emit a call to __arm_sme_state_size()
3217 MachineFunction *MF = BB->getParent();
3219 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3220 if (FuncInfo->isSMESaveBufferUsed()) {
3221 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
3222 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3223 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL))
3225 .addReg(AArch64::X0, RegState::ImplicitDefine)
3226 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3227 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3228 MI.getOperand(0).getReg())
3229 .addReg(AArch64::X0);
3230 } else
3231 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3232 MI.getOperand(0).getReg())
3233 .addReg(AArch64::XZR);
3234 BB->remove_instr(&MI);
3235 return BB;
3236}
3237
3240 MachineBasicBlock *BB) const {
3241 MachineFunction *MF = BB->getParent();
3242 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3243 const DebugLoc &DL = MI.getDebugLoc();
3244 Register ResultReg = MI.getOperand(0).getReg();
3245 if (MF->getRegInfo().use_empty(ResultReg)) {
3246 // Nothing to do. Pseudo erased below.
3247 } else if (Subtarget->hasSME()) {
3248 BuildMI(*BB, MI, DL, TII->get(AArch64::MRS), ResultReg)
3249 .addImm(AArch64SysReg::SVCR)
3250 .addReg(AArch64::VG, RegState::Implicit);
3251 } else {
3252 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
3253 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
3254 BuildMI(*BB, MI, DL, TII->get(AArch64::BL))
3256 .addReg(AArch64::X0, RegState::ImplicitDefine)
3257 .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC)));
3258 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), ResultReg)
3259 .addReg(AArch64::X0);
3260 }
3261 MI.eraseFromParent();
3262 return BB;
3263}
3264
3265// Helper function to find the instruction that defined a virtual register.
3266 // If unable to find such an instruction, returns nullptr.
3268 Register Reg) {
3269 while (Reg.isVirtual()) {
3270 MachineInstr *DefMI = MRI.getVRegDef(Reg);
3271 assert(DefMI && "Virtual register definition not found");
3272 unsigned Opcode = DefMI->getOpcode();
3273
3274 if (Opcode == AArch64::COPY) {
3275 Reg = DefMI->getOperand(1).getReg();
3276 // Vreg is defined by copying from physreg.
3277 if (Reg.isPhysical())
3278 return DefMI;
3279 continue;
3280 }
3281 if (Opcode == AArch64::SUBREG_TO_REG) {
3282 Reg = DefMI->getOperand(2).getReg();
3283 continue;
3284 }
3285
3286 return DefMI;
3287 }
3288 return nullptr;
3289}
3290
3293 MachineOperand &AddrDiscOp, const TargetRegisterClass *AddrDiscRC) const {
3294 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3295 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
3296 const DebugLoc &DL = MI.getDebugLoc();
3297
3298 Register AddrDisc = AddrDiscOp.getReg();
3299 int64_t IntDisc = IntDiscOp.getImm();
3300 assert(IntDisc == 0 && "Blend components are already expanded");
3301
3302 const MachineInstr *DiscMI = stripVRegCopies(MRI, AddrDisc);
3303 if (DiscMI) {
3304 switch (DiscMI->getOpcode()) {
3305 case AArch64::MOVKXi:
3306 // blend(addr, imm) which is lowered as "MOVK addr, #imm, #48".
3307 // #imm should be an immediate and not a global symbol, for example.
3308 if (DiscMI->getOperand(2).isImm() &&
3309 DiscMI->getOperand(3).getImm() == 48) {
3310 AddrDisc = DiscMI->getOperand(1).getReg();
3311 IntDisc = DiscMI->getOperand(2).getImm();
3312 }
3313 break;
3314 case AArch64::MOVi32imm:
3315 case AArch64::MOVi64imm:
3316 // Small immediate integer constant passed via VReg.
3317 if (DiscMI->getOperand(1).isImm() &&
3318 isUInt<16>(DiscMI->getOperand(1).getImm())) {
3319 AddrDisc = AArch64::NoRegister;
3320 IntDisc = DiscMI->getOperand(1).getImm();
3321 }
3322 break;
3323 }
3324 }
3325
3326 // For uniformity, always use NoRegister, as XZR is not necessarily contained
3327 // in the requested register class.
3328 if (AddrDisc == AArch64::XZR)
3329 AddrDisc = AArch64::NoRegister;
3330
3331 // Make sure AddrDisc operand respects the register class imposed by MI.
3332 if (AddrDisc && MRI.getRegClass(AddrDisc) != AddrDiscRC) {
3333 Register TmpReg = MRI.createVirtualRegister(AddrDiscRC);
3334 BuildMI(*BB, MI, DL, TII->get(AArch64::COPY), TmpReg).addReg(AddrDisc);
3335 AddrDisc = TmpReg;
3336 }
3337
3338 AddrDiscOp.setReg(AddrDisc);
3339 IntDiscOp.setImm(IntDisc);
3340}
3341
3343 MachineInstr &MI, MachineBasicBlock *BB) const {
3344
3345 int SMEOrigInstr = AArch64::getSMEPseudoMap(MI.getOpcode());
3346 if (SMEOrigInstr != -1) {
3347 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3348 uint64_t SMEMatrixType =
3349 TII->get(MI.getOpcode()).TSFlags & AArch64::SMEMatrixTypeMask;
3350 switch (SMEMatrixType) {
3352 return EmitZAInstr(SMEOrigInstr, AArch64::ZA, MI, BB);
3354 return EmitZAInstr(SMEOrigInstr, AArch64::ZAB0, MI, BB);
3356 return EmitZAInstr(SMEOrigInstr, AArch64::ZAH0, MI, BB);
3358 return EmitZAInstr(SMEOrigInstr, AArch64::ZAS0, MI, BB);
3360 return EmitZAInstr(SMEOrigInstr, AArch64::ZAD0, MI, BB);
3362 return EmitZAInstr(SMEOrigInstr, AArch64::ZAQ0, MI, BB);
3363 }
3364 }
3365
3366 switch (MI.getOpcode()) {
3367 default:
3368#ifndef NDEBUG
3369 MI.dump();
3370#endif
3371 llvm_unreachable("Unexpected instruction for custom inserter!");
3372 case AArch64::InitTPIDR2Obj:
3373 return EmitInitTPIDR2Object(MI, BB);
3374 case AArch64::AllocateZABuffer:
3375 return EmitAllocateZABuffer(MI, BB);
3376 case AArch64::AllocateSMESaveBuffer:
3377 return EmitAllocateSMESaveBuffer(MI, BB);
3378 case AArch64::GetSMESaveSize:
3379 return EmitGetSMESaveSize(MI, BB);
3380 case AArch64::EntryPStateSM:
3381 return EmitEntryPStateSM(MI, BB);
3382 case AArch64::F128CSEL:
3383 return EmitF128CSEL(MI, BB);
3384 case TargetOpcode::STATEPOINT:
3385 // STATEPOINT is a pseudo instruction which has no implicit defs/uses
3386 // while bl call instruction (where statepoint will be lowered at the end)
3387 // has implicit def. This def is early-clobber as it will be set at
3388 // the moment of the call and earlier than any use is read.
3389 // Add this implicit dead def here as a workaround.
3390 MI.addOperand(*MI.getMF(),
3392 AArch64::LR, /*isDef*/ true,
3393 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
3394 /*isUndef*/ false, /*isEarlyClobber*/ true));
3395 [[fallthrough]];
3396 case TargetOpcode::STACKMAP:
3397 case TargetOpcode::PATCHPOINT:
3398 return emitPatchPoint(MI, BB);
3399
3400 case TargetOpcode::PATCHABLE_EVENT_CALL:
3401 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
3402 return BB;
3403
3404 case AArch64::CATCHRET:
3405 return EmitLoweredCatchRet(MI, BB);
3406
3407 case AArch64::PROBED_STACKALLOC_DYN:
3408 return EmitDynamicProbedAlloc(MI, BB);
3409
3410 case AArch64::CHECK_MATCHING_VL_PSEUDO:
3411 return EmitCheckMatchingVL(MI, BB);
3412
3413 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
3414 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
3415 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
3416 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
3417 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
3418 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
3419 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
3420 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
3421 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
3422 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
3423 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
3424 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
3425 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
3426 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
3427 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
3428 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
3429 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
3430 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
3431 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
3432 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
3433 case AArch64::LDR_ZA_PSEUDO:
3434 return EmitFill(MI, BB);
3435 case AArch64::LDR_TX_PSEUDO:
3436 return EmitZTInstr(MI, BB, AArch64::LDR_TX, /*Op0IsDef=*/true);
3437 case AArch64::STR_TX_PSEUDO:
3438 return EmitZTInstr(MI, BB, AArch64::STR_TX, /*Op0IsDef=*/false);
3439 case AArch64::ZERO_M_PSEUDO:
3440 return EmitZero(MI, BB);
3441 case AArch64::ZERO_T_PSEUDO:
3442 return EmitZTInstr(MI, BB, AArch64::ZERO_T, /*Op0IsDef=*/true);
3443 case AArch64::MOVT_TIZ_PSEUDO:
3444 return EmitZTInstr(MI, BB, AArch64::MOVT_TIZ, /*Op0IsDef=*/true);
3445
3446 case AArch64::PAC:
3447 fixupPtrauthDiscriminator(MI, BB, MI.getOperand(3), MI.getOperand(4),
3448 &AArch64::GPR64noipRegClass);
3449 return BB;
3450 }
3451}
3452
3453//===----------------------------------------------------------------------===//
3454// AArch64 Lowering private implementation.
3455//===----------------------------------------------------------------------===//
3456
3457//===----------------------------------------------------------------------===//
3458// Lowering Code
3459//===----------------------------------------------------------------------===//
3460
3461// Forward declarations of SVE fixed length lowering helpers
3466 SelectionDAG &DAG);
3469 EVT VT);
3470
3471/// isZerosVector - Check whether SDNode N is a zero-filled vector.
3472static bool isZerosVector(const SDNode *N) {
3473 // Look through a bit convert.
3474 while (N->getOpcode() == ISD::BITCAST)
3475 N = N->getOperand(0).getNode();
3476
3478 return true;
3479
3480 if (N->getOpcode() != AArch64ISD::DUP)
3481 return false;
3482
3483 auto Opnd0 = N->getOperand(0);
3484 return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
3485}
3486
3487/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
3488/// CC
3490 SDValue RHS = {}) {
3491 switch (CC) {
3492 default:
3493 llvm_unreachable("Unknown condition code!");
3494 case ISD::SETNE:
3495 return AArch64CC::NE;
3496 case ISD::SETEQ:
3497 return AArch64CC::EQ;
3498 case ISD::SETGT:
3499 return AArch64CC::GT;
3500 case ISD::SETGE:
3502 case ISD::SETLT:
3504 case ISD::SETLE:
3505 return AArch64CC::LE;
3506 case ISD::SETUGT:
3507 return AArch64CC::HI;
3508 case ISD::SETUGE:
3509 return AArch64CC::HS;
3510 case ISD::SETULT:
3511 return AArch64CC::LO;
3512 case ISD::SETULE:
3513 return AArch64CC::LS;
3514 }
3515}
3516
3517/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
3519 AArch64CC::CondCode &CondCode,
3520 AArch64CC::CondCode &CondCode2) {
3521 CondCode2 = AArch64CC::AL;
3522 switch (CC) {
3523 default:
3524 llvm_unreachable("Unknown FP condition!");
3525 case ISD::SETEQ:
3526 case ISD::SETOEQ:
3527 CondCode = AArch64CC::EQ;
3528 break;
3529 case ISD::SETGT:
3530 case ISD::SETOGT:
3531 CondCode = AArch64CC::GT;
3532 break;
3533 case ISD::SETGE:
3534 case ISD::SETOGE:
3535 CondCode = AArch64CC::GE;
3536 break;
3537 case ISD::SETOLT:
3538 CondCode = AArch64CC::MI;
3539 break;
3540 case ISD::SETOLE:
3541 CondCode = AArch64CC::LS;
3542 break;
3543 case ISD::SETONE:
3544 CondCode = AArch64CC::MI;
3545 CondCode2 = AArch64CC::GT;
3546 break;
3547 case ISD::SETO:
3548 CondCode = AArch64CC::VC;
3549 break;
3550 case ISD::SETUO:
3551 CondCode = AArch64CC::VS;
3552 break;
3553 case ISD::SETUEQ:
3554 CondCode = AArch64CC::EQ;
3555 CondCode2 = AArch64CC::VS;
3556 break;
3557 case ISD::SETUGT:
3558 CondCode = AArch64CC::HI;
3559 break;
3560 case ISD::SETUGE:
3561 CondCode = AArch64CC::PL;
3562 break;
3563 case ISD::SETLT:
3564 case ISD::SETULT:
3565 CondCode = AArch64CC::LT;
3566 break;
3567 case ISD::SETLE:
3568 case ISD::SETULE:
3569 CondCode = AArch64CC::LE;
3570 break;
3571 case ISD::SETNE:
3572 case ISD::SETUNE:
3573 CondCode = AArch64CC::NE;
3574 break;
3575 }
3576}
3577
3578/// Convert a DAG fp condition code to an AArch64 CC.
3579/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
3580/// should be AND'ed instead of OR'ed.
3582 AArch64CC::CondCode &CondCode,
3583 AArch64CC::CondCode &CondCode2) {
3584 CondCode2 = AArch64CC::AL;
3585 switch (CC) {
3586 default:
3587 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3588 assert(CondCode2 == AArch64CC::AL);
3589 break;
3590 case ISD::SETONE:
3591 // (a one b)
3592 // == ((a olt b) || (a ogt b))
3593 // == ((a ord b) && (a une b))
3594 CondCode = AArch64CC::VC;
3595 CondCode2 = AArch64CC::NE;
3596 break;
3597 case ISD::SETUEQ:
3598 // (a ueq b)
3599 // == ((a uno b) || (a oeq b))
3600 // == ((a ule b) && (a uge b))
3601 CondCode = AArch64CC::PL;
3602 CondCode2 = AArch64CC::LE;
3603 break;
3604 }
3605}
3606
3607/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
3608/// CC usable with the vector instructions. Fewer operations are available
3609/// without a real NZCV register, so we have to use less efficient combinations
3610/// to get the same effect.
3612 AArch64CC::CondCode &CondCode,
3613 AArch64CC::CondCode &CondCode2,
3614 bool &Invert) {
3615 Invert = false;
3616 switch (CC) {
3617 default:
3618 // Mostly the scalar mappings work fine.
3619 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
3620 break;
3621 case ISD::SETUO:
3622 Invert = true;
3623 [[fallthrough]];
3624 case ISD::SETO:
3625 CondCode = AArch64CC::MI;
3626 CondCode2 = AArch64CC::GE;
3627 break;
3628 case ISD::SETUEQ:
3629 case ISD::SETULT:
3630 case ISD::SETULE:
3631 case ISD::SETUGT:
3632 case ISD::SETUGE:
3633 // All of the compare-mask comparisons are ordered, but we can switch
3634 // between the two by a double inversion. E.g. ULE == !OGT.
3635 Invert = true;
3636 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
3637 CondCode, CondCode2);
3638 break;
3639 }
3640}
3641
3642/// Like SelectionDAG::getCondCode(), but for AArch64 condition codes.
3644 // TODO: Should be TargetConstant (need to s/imm/timm in patterns).
3645 return DAG.getConstant(CC, SDLoc(), CondCodeVT);
3646}
3647
3649 // Matches AArch64DAGToDAGISel::SelectArithImmed().
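// That is, a 12-bit unsigned immediate, optionally shifted left by 12 bits:
// e.g. 0xabc and 0xabc000 are legal, while 0x1001 is not.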
3650 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
3651 LLVM_DEBUG(dbgs() << "Is imm " << C
3652 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
3653 return IsLegal;
3654}
3655
3657 // Works for negative immediates too, as it can be written as an ADDS
3658 // instruction with a negated immediate.
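// For example, -5 is accepted because the compare can be emitted as a
// CMN (ADDS) with the immediate 5.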
3659 return isLegalArithImmed(C.abs().getZExtValue());
3660}
3661
3663 uint64_t Imm = C.getZExtValue();
3665 AArch64_IMM::expandMOVImm(Imm, 32, Insn);
3666 return Insn.size();
3667}
3668
3670 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
3671 if (Op->getFlags().hasNoSignedWrap())
3672 return true;
3673
3674 // We can still figure out if the second operand is safe to use
3675 // in a CMN instruction by checking whether it is known not to be the minimum
3676 // signed value. If it is not, then we can safely use CMN.
3677 // Note: We can eventually remove this check and simply rely on
3678 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
3679 // consistently sets them appropriately when making said nodes.
3680
3681 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
3682 return !KnownSrc.getSignedMinValue().isMinSignedValue();
3683}
3684
3685 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
3686 // the grounds that "op1 - (-op2) == op1 + op2"? Not always, the C and V flags
3687// can be set differently by this operation. It comes down to whether
3688// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
3689// everything is fine. If not then the optimization is wrong. Thus general
3690// comparisons are only valid if op2 != 0 and op2 != INT_MIN.
3691//
3692// So, finally, the only LLVM-native comparisons that don't mention C or V
3693// are the ones that aren't unsigned comparisons. They're the only ones we can
3694// safely use CMN for in the absence of information about op2.
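// For example, an equality test against (sub 0, op2) only inspects the Z
// flag, so it can always be emitted as a CMN.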
3696 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
3697 (isIntEqualitySetCC(CC) ||
3698 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
3699 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
3700}
3701
3703 SelectionDAG &DAG, SDValue Chain,
3704 bool IsSignaling) {
3705 EVT VT = LHS.getValueType();
3706 assert(VT != MVT::f128);
3707
3708 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3709
3710 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3711 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3712 {Chain, LHS});
3713 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
3714 {LHS.getValue(1), RHS});
3715 Chain = RHS.getValue(1);
3716 }
3717 unsigned Opcode =
3718 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
3719 return DAG.getNode(Opcode, DL, {FlagsVT, MVT::Other}, {Chain, LHS, RHS});
3720}
3721
3722static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3723 const SDLoc &DL, SelectionDAG &DAG) {
3724 EVT VT = LHS.getValueType();
3725 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3726
3727 if (VT.isFloatingPoint()) {
3728 assert(VT != MVT::f128);
3729 if ((VT == MVT::f16 && !FullFP16) || VT == MVT::bf16) {
3730 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3731 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3732 }
3733 return DAG.getNode(AArch64ISD::FCMP, DL, FlagsVT, LHS, RHS);
3734 }
3735
3736 // The CMP instruction is just an alias for SUBS, and representing it as
3737 // SUBS means that it's possible to get CSE with subtract operations.
3738 // A later phase can perform the optimization of setting the destination
3739 // register to WZR/XZR if it ends up being unused.
3740 unsigned Opcode = AArch64ISD::SUBS;
3741
3742 if (isCMN(RHS, CC, DAG)) {
3743 // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
3744 Opcode = AArch64ISD::ADDS;
3745 RHS = RHS.getOperand(1);
3746 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3747 isIntEqualitySetCC(CC)) {
3748 // As we are looking for EQ/NE compares, the operands can be commuted; can
3749 // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
3750 Opcode = AArch64ISD::ADDS;
3751 LHS = LHS.getOperand(1);
3752 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
3753 if (LHS.getOpcode() == ISD::AND) {
3754 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
3755 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
3756 // of the signed comparisons.
3757 const SDValue ANDSNode =
3758 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(VT, FlagsVT),
3759 LHS.getOperand(0), LHS.getOperand(1));
3760 // Replace all users of (and X, Y) with newly generated (ands X, Y)
3761 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
3762 return ANDSNode.getValue(1);
3763 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
3764 // Use result of ANDS
3765 return LHS.getValue(1);
3766 }
3767 }
3768
3769 return DAG.getNode(Opcode, DL, DAG.getVTList(VT, FlagsVT), LHS, RHS)
3770 .getValue(1);
3771}
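// Roughly, the ANDS path above rewrites
//   and w8, w0, w1 ; cmp w8, #0 ; b.eq target
// into
//   ands w8, w0, w1 ; b.eq target
// and the ANDS prints as "tst w0, w1" once the result register ends up unused
// and is retargeted to wzr by a later pass.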
3772
3773/// \defgroup AArch64CCMP CMP;CCMP matching
3774///
3775/// These functions deal with the formation of CMP;CCMP;... sequences.
3776/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
3777/// a comparison. They set the NZCV flags to a predefined value if their
3778/// predicate is false. This allows to express arbitrary conjunctions, for
3779/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
3780/// expressed as:
3781/// cmp A
3782/// ccmp B, inv(CB), CA
3783/// check for CB flags
3784///
3785/// This naturally lets us implement chains of AND operations with SETCC
3786/// operands. And we can even implement some other situations by transforming
3787/// them:
3788/// - We can implement (NEG SETCC) i.e. negating a single comparison by
3789/// negating the flags used in a CCMP/FCCMP operation.
3790/// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
3791/// by negating the flags we test for afterwards. i.e.
3792/// NEG (CMP CCMP CCCMP ...) can be implemented.
3793/// - Note that we can only ever negate all previously processed results.
3794/// What we can not implement by flipping the flags to test is a negation
3795/// of two sub-trees (because the negation affects all sub-trees emitted so
3796/// far, so the 2nd sub-tree we emit would also affect the first).
3797/// With those tools we can implement some OR operations:
3798/// - (OR (SETCC A) (SETCC B)) can be implemented via:
3799/// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
3800/// - After transforming OR to NEG/AND combinations we may be able to use NEG
3801/// elimination rules from earlier to implement the whole thing as a
3802/// CCMP/FCCMP chain.
3803///
3804/// As a complete example:
3805/// or (or (setCA (cmp A)) (setCB (cmp B)))
3806/// (and (setCC (cmp C)) (setCD (cmp D)))
3807/// can be reassociated to:
3808/// or (and (setCC (cmp C)) (setCD (cmp D)))
3809/// (or (setCA (cmp A)) (setCB (cmp B)))
3810/// can be transformed to:
3811/// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
3812/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
3813/// which can be implemented as:
3814/// cmp C
3815/// ccmp D, inv(CD), CC
3816/// ccmp A, CA, inv(CD)
3817/// ccmp B, CB, inv(CA)
3818/// check for CB flags
3819///
3820/// A counterexample is "or (and A B) (and C D)" which translates to
3821/// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
3822/// can only implement 1 of the inner (not) operations, but not both!
3823/// @{
3824
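// A small concrete instance of the scheme described above: the conjunction
// (a == 0 && b == 7) can be emitted as roughly
//   cmp  w0, #0
//   ccmp w1, #7, #0, eq   ; if a == 0, compare b with 7; otherwise inject
//                         ; NZCV = #0 so the final 'eq' test fails
//   cset w2, eq
// and the disjunction (a == 0 || b == 7) differs only in the predicate and
// the injected flags:
//   ccmp w1, #7, #4, ne   ; NZCV = #4 sets Z, so 'eq' succeeds when a == 0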
3825/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3826static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3827 ISD::CondCode CC, SDValue CCOp,
3828 AArch64CC::CondCode Predicate,
3829 AArch64CC::CondCode OutCC,
3830 const SDLoc &DL, SelectionDAG &DAG) {
3831 unsigned Opcode = 0;
3832 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3833
3834 if (LHS.getValueType().isFloatingPoint()) {
3835 assert(LHS.getValueType() != MVT::f128);
3836 if ((LHS.getValueType() == MVT::f16 && !FullFP16) ||
3837 LHS.getValueType() == MVT::bf16) {
3838 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3839 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3840 }
3841 Opcode = AArch64ISD::FCCMP;
3842 } else if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(RHS)) {
3843 APInt Imm = Const->getAPIntValue();
3844 if (Imm.isNegative() && Imm.sgt(-32)) {
3845 Opcode = AArch64ISD::CCMN;
3846 RHS = DAG.getConstant(Imm.abs(), DL, Const->getValueType(0));
3847 }
3848 } else if (isCMN(RHS, CC, DAG)) {
3849 Opcode = AArch64ISD::CCMN;
3850 RHS = RHS.getOperand(1);
3851 } else if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
3852 isIntEqualitySetCC(CC)) {
3853 // As we are looking for EQ/NE compares, the operands can be commuted; can
3854 // we combine a (CCMP (sub 0, op1), op2) into a CCMN instruction?
3855 Opcode = AArch64ISD::CCMN;
3856 LHS = LHS.getOperand(1);
3857 }
3858 if (Opcode == 0)
3859 Opcode = AArch64ISD::CCMP;
3860
3861 SDValue Condition = getCondCode(DAG, Predicate);
3862 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3863 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3864 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3865 return DAG.getNode(Opcode, DL, FlagsVT, LHS, RHS, NZCVOp, Condition, CCOp);
3866}
3867
3868/// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3869/// expressed as a conjunction. See \ref AArch64CCMP.
3870/// \param CanNegate Set to true if we can negate the whole sub-tree just by
3871/// changing the conditions on the SETCC tests.
3872/// (this means we can call emitConjunctionRec() with
3873/// Negate==true on this sub-tree)
3874/// \param MustBeFirst Set to true if this subtree needs to be negated and we
3875/// cannot do the negation naturally. We are required to
3876/// emit the subtree first in this case.
3877/// \param WillNegate Is true if we are called when the result of this
3878/// subexpression must be negated. This happens when the
3879/// outer expression is an OR. We can use this fact to know
3880/// that we have a double negation (or (or ...) ...) that
3881/// can be implemented for free.
3882static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3883 bool &MustBeFirst, bool WillNegate,
3884 unsigned Depth = 0) {
3885 if (!Val.hasOneUse())
3886 return false;
3887 unsigned Opcode = Val->getOpcode();
3888 if (Opcode == ISD::SETCC) {
3889 if (Val->getOperand(0).getValueType() == MVT::f128)
3890 return false;
3891 CanNegate = true;
3892 MustBeFirst = false;
3893 return true;
3894 }
3895 // Protect against exponential runtime and stack overflow.
3896 if (Depth > 6)
3897 return false;
3898 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3899 bool IsOR = Opcode == ISD::OR;
3900 SDValue O0 = Val->getOperand(0);
3901 SDValue O1 = Val->getOperand(1);
3902 bool CanNegateL;
3903 bool MustBeFirstL;
3904 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3905 return false;
3906 bool CanNegateR;
3907 bool MustBeFirstR;
3908 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3909 return false;
3910
3911 if (MustBeFirstL && MustBeFirstR)
3912 return false;
3913
3914 if (IsOR) {
3915 // For an OR expression we need to be able to naturally negate at least
3916 // one side or we cannot do the transformation at all.
3917 if (!CanNegateL && !CanNegateR)
3918 return false;
3919 // If the result of the OR will be negated and we can naturally negate
3920 // the leaves, then this sub-tree as a whole negates naturally.
3921 CanNegate = WillNegate && CanNegateL && CanNegateR;
3922 // If we cannot naturally negate the whole sub-tree, then this must be
3923 // emitted first.
3924 MustBeFirst = !CanNegate;
3925 } else {
3926 assert(Opcode == ISD::AND && "Must be OR or AND");
3927 // We cannot naturally negate an AND operation.
3928 CanNegate = false;
3929 MustBeFirst = MustBeFirstL || MustBeFirstR;
3930 }
3931 return true;
3932 }
3933 return false;
3934}
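// For illustration, tracing the rules above on a few shapes:
//  - (and (setcc A) (setcc B)): both leaves are negatable, so the AND is
//    emittable, with CanNegate = false.
//  - (or (setcc A) (setcc B)): emittable; it is only naturally negatable when
//    the caller will negate it anyway (WillNegate).
//  - (or (and ...) (and ...)): rejected, since neither AND side can be
//    negated, matching the counterexample in the comment block above.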
3935
3936/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3937/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
3938/// Tries to transform the given i1 producing node @p Val to a series of
3939/// compare and conditional compare operations. @returns an NZCV flags
3940/// producing node and sets @p OutCC to the flags that should be tested, or
3941/// returns SDValue() if the transformation was not possible.
3942/// \p Negate is true if we want this sub-tree to be negated just by changing
3943/// SETCC conditions.
3944static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3945 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3946 AArch64CC::CondCode Predicate) {
3947 // We're at a tree leaf, produce a conditional comparison operation.
3948 unsigned Opcode = Val->getOpcode();
3949 if (Opcode == ISD::SETCC) {
3950 SDValue LHS = Val->getOperand(0);
3951 SDValue RHS = Val->getOperand(1);
3952 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3953 bool isInteger = LHS.getValueType().isInteger();
3954 if (Negate)
3955 CC = getSetCCInverse(CC, LHS.getValueType());
3956 SDLoc DL(Val);
3957 // Determine OutCC and handle FP special case.
3958 if (isInteger) {
3959 OutCC = changeIntCCToAArch64CC(CC, RHS);
3960 } else {
3961 assert(LHS.getValueType().isFloatingPoint());
3962 AArch64CC::CondCode ExtraCC;
3963 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3964 // Some floating point conditions can't be tested with a single condition
3965 // code. Construct an additional comparison in this case.
3966 if (ExtraCC != AArch64CC::AL) {
3967 SDValue ExtraCmp;
3968 if (!CCOp.getNode())
3969 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3970 else
3971 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3972 ExtraCC, DL, DAG);
3973 CCOp = ExtraCmp;
3974 Predicate = ExtraCC;
3975 }
3976 }
3977
3978 // Produce a normal comparison if we are first in the chain
3979 if (!CCOp)
3980 return emitComparison(LHS, RHS, CC, DL, DAG);
3981 // Otherwise produce a ccmp.
3982 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3983 DAG);
3984 }
3985 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3986
3987 bool IsOR = Opcode == ISD::OR;
3988
3989 SDValue LHS = Val->getOperand(0);
3990 bool CanNegateL;
3991 bool MustBeFirstL;
3992 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3993 assert(ValidL && "Valid conjunction/disjunction tree");
3994 (void)ValidL;
3995
3996 SDValue RHS = Val->getOperand(1);
3997 bool CanNegateR;
3998 bool MustBeFirstR;
3999 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
4000 assert(ValidR && "Valid conjunction/disjunction tree");
4001 (void)ValidR;
4002
4003 // Swap sub-tree that must come first to the right side.
4004 if (MustBeFirstL) {
4005 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4006 std::swap(LHS, RHS);
4007 std::swap(CanNegateL, CanNegateR);
4008 std::swap(MustBeFirstL, MustBeFirstR);
4009 }
4010
4011 bool NegateR;
4012 bool NegateAfterR;
4013 bool NegateL;
4014 bool NegateAfterAll;
4015 if (Opcode == ISD::OR) {
4016 // Swap the sub-tree that we can negate naturally to the left.
4017 if (!CanNegateL) {
4018 assert(CanNegateR && "at least one side must be negatable");
4019 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4020 assert(!Negate);
4021 std::swap(LHS, RHS);
4022 NegateR = false;
4023 NegateAfterR = true;
4024 } else {
4025 // Negate the left sub-tree if possible, otherwise negate the result.
4026 NegateR = CanNegateR;
4027 NegateAfterR = !CanNegateR;
4028 }
4029 NegateL = true;
4030 NegateAfterAll = !Negate;
4031 } else {
4032 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
4033 assert(!Negate && "Valid conjunction/disjunction tree");
4034
4035 NegateL = false;
4036 NegateR = false;
4037 NegateAfterR = false;
4038 NegateAfterAll = false;
4039 }
4040
4041 // Emit sub-trees.
4042 AArch64CC::CondCode RHSCC;
4043 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
4044 if (NegateAfterR)
4045 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4046 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
4047 if (NegateAfterAll)
4048 OutCC = AArch64CC::getInvertedCondCode(OutCC);
4049 return CmpL;
4050}
4051
4052/// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
4053/// In some cases this is even possible with OR operations in the expression.
4054/// See \ref AArch64CCMP.
4055/// \see emitConjunctionRec().
4056static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
4057 AArch64CC::CondCode &OutCC) {
4058 bool DummyCanNegate;
4059 bool DummyMustBeFirst;
4060 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
4061 return SDValue();
4062
4063 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
4064}
4065
4066/// @}
4067
4068/// Returns how profitable it is to fold a comparison's operand's shift and/or
4069/// extension operations.
4070static unsigned getCmpOperandFoldingProfit(SDValue Op) {
4071 auto isSupportedExtend = [&](SDValue V) {
4072 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
4073 return true;
4074
4075 if (V.getOpcode() == ISD::AND)
4076 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
4077 uint64_t Mask = MaskCst->getZExtValue();
4078 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
4079 }
4080
4081 return false;
4082 };
4083
4084 if (!Op.hasOneUse())
4085 return 0;
4086
4087 if (isSupportedExtend(Op))
4088 return 1;
4089
4090 unsigned Opc = Op.getOpcode();
4091 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
4092 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
4093 uint64_t Shift = ShiftCst->getZExtValue();
4094 if (isSupportedExtend(Op.getOperand(0)))
4095 return (Shift <= 4) ? 2 : 1;
4096 EVT VT = Op.getValueType();
4097 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
4098 return 1;
4099 }
4100
4101 return 0;
4102}
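// For example, an operand like (shl (and x, 0xFF), 2) scores 2 above: the
// masked value folds as an extended register and the small shift folds on top
// of it (roughly "cmp w1, w0, uxtb #2"), whereas a plain add scores 0, so
// getAArch64Cmp() below may swap operands to keep the foldable one on the RHS
// of the compare.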
4103
4104// emitComparison() converts a comparison with one or negative one into a
4105// comparison with 0. Note that this only works for signed comparisons because
4106// of how ANDS works.
4107static bool shouldBeAdjustedToZero(SDValue LHS, APInt C, ISD::CondCode &CC) {
4108 // Only works for ANDS and AND.
4109 if (LHS.getOpcode() != ISD::AND && LHS.getOpcode() != AArch64ISD::ANDS)
4110 return false;
4111
4112 if (C.isOne() && (CC == ISD::SETLT || CC == ISD::SETGE)) {
4113 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4114 return true;
4115 }
4116
4117 if (C.isAllOnes() && (CC == ISD::SETLE || CC == ISD::SETGT)) {
4118 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4119 return true;
4120 }
4121
4122 return false;
4123}
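// For example, a signed "(x & y) < 1" is rewritten above as "(x & y) <= 0",
// with the constant forced to zero by the caller, which then matches the
// ANDS/TST-against-zero path in emitComparison().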
4124
4125static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4126 SDValue &AArch64cc, SelectionDAG &DAG,
4127 const SDLoc &DL) {
4128 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4129 EVT VT = RHS.getValueType();
4130 APInt C = RHSC->getAPIntValue();
4131 // shouldBeAdjustedToZero is a special case to better fold with
4132 // emitComparison().
4133 if (shouldBeAdjustedToZero(LHS, C, CC)) {
4134 // Adjust the constant to zero.
4135 // CC has already been adjusted.
4136 RHS = DAG.getConstant(0, DL, VT);
4137 } else if (!isLegalCmpImmed(C)) {
4138 unsigned NumImmForC = numberOfInstrToLoadImm(C);
4139 // Constant does not fit, try adjusting it by one?
4140 switch (CC) {
4141 default:
4142 break;
4143 case ISD::SETLT:
4144 case ISD::SETGE:
4145 if (!C.isMinSignedValue()) {
4146 APInt CMinusOne = C - 1;
4147 if (isLegalCmpImmed(CMinusOne) ||
4148 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4149 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4150 RHS = DAG.getConstant(CMinusOne, DL, VT);
4151 }
4152 }
4153 break;
4154 case ISD::SETULT:
4155 case ISD::SETUGE: {
4156 // C cannot be 0 here: 0 is a legal immediate, and we only get here when C
4156 // is not legal.
4157 assert(!C.isZero() && "C should not be zero here");
4158 APInt CMinusOne = C - 1;
4159 if (isLegalCmpImmed(CMinusOne) ||
4160 (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) {
4161 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4162 RHS = DAG.getConstant(CMinusOne, DL, VT);
4163 }
4164 break;
4165 }
4166 case ISD::SETLE:
4167 case ISD::SETGT:
4168 if (!C.isMaxSignedValue()) {
4169 APInt CPlusOne = C + 1;
4170 if (isLegalCmpImmed(CPlusOne) ||
4171 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4172 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4173 RHS = DAG.getConstant(CPlusOne, DL, VT);
4174 }
4175 }
4176 break;
4177 case ISD::SETULE:
4178 case ISD::SETUGT: {
4179 if (!C.isAllOnes()) {
4180 APInt CPlusOne = C + 1;
4181 if (isLegalCmpImmed(CPlusOne) ||
4182 (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) {
4183 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4184 RHS = DAG.getConstant(CPlusOne, DL, VT);
4185 }
4186 }
4187 break;
4188 }
4189 }
4190 }
4191 }
4192
4193 // Comparisons are canonicalized so that the RHS operand is simpler than the
4194 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
4195 // can fold some shift+extend operations on the RHS operand, so swap the
4196 // operands if that can be done.
4197 //
4198 // For example:
4199 // lsl w13, w11, #1
4200 // cmp w13, w12
4201 // can be turned into:
4202 // cmp w12, w11, lsl #1
4203 if (!isa<ConstantSDNode>(RHS) || !isLegalCmpImmed(RHS->getAsAPIntVal())) {
4204 bool LHSIsCMN = isCMN(LHS, CC, DAG);
4205 bool RHSIsCMN = isCMN(RHS, CC, DAG);
4206 SDValue TheLHS = LHSIsCMN ? LHS.getOperand(1) : LHS;
4207 SDValue TheRHS = RHSIsCMN ? RHS.getOperand(1) : RHS;
4208
4209 if (getCmpOperandFoldingProfit(TheLHS) + (LHSIsCMN ? 1 : 0) >
4210 getCmpOperandFoldingProfit(TheRHS) + (RHSIsCMN ? 1 : 0)) {
4211 std::swap(LHS, RHS);
4212 CC = ISD::getSetCCSwappedOperands(CC);
4213 }
4214 }
4215
4216 SDValue Cmp;
4217 AArch64CC::CondCode AArch64CC;
4218 if (isIntEqualitySetCC(CC) && isa<ConstantSDNode>(RHS)) {
4219 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
4220
4221 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
4222 // For the i8 operand, the largest immediate is 255, so this can be easily
4223 // encoded in the compare instruction. For the i16 operand, however, the
4224 // largest immediate cannot be encoded in the compare.
4225 // Therefore, use a sign extending load and cmn to avoid materializing the
4226 // -1 constant. For example,
4227 // movz w1, #65535
4228 // ldrh w0, [x0, #0]
4229 // cmp w0, w1
4230 // >
4231 // ldrsh w0, [x0, #0]
4232 // cmn w0, #1
4233 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
4234 // if and only if (sext LHS) == (sext RHS). The checks are in place to
4235 // ensure both the LHS and RHS are truly zero extended and to make sure the
4236 // transformation is profitable.
4237 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
4238 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
4239 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
4240 LHS.getNode()->hasNUsesOfValue(1, 0)) {
4241 int16_t ValueofRHS = RHS->getAsZExtVal();
4242 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
4243 SDValue SExt =
4244 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, LHS.getValueType(), LHS,
4245 DAG.getValueType(MVT::i16));
4246 Cmp = emitComparison(
4247 SExt, DAG.getSignedConstant(ValueofRHS, DL, RHS.getValueType()), CC,
4248 DL, DAG);
4250 }
4251 }
4252
4253 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
4254 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
4255 if ((CC == ISD::SETNE) ^ RHSC->isZero())
4256 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
4257 }
4258 }
4259 }
4260
4261 if (!Cmp) {
4262 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
4264 }
4265 AArch64cc = getCondCode(DAG, AArch64CC);
4266 return Cmp;
4267}
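// As an example of the constant adjustment above: a signed "x < 0x1001" cannot
// be encoded directly (the immediate needs bits in both 12-bit halves), but the
// equivalent "x <= 0x1000" can, giving roughly "cmp x0, #0x1000 ; b.le" instead
// of materializing 0x1001 into a register first.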
4268
4269static std::pair<SDValue, SDValue>
4270getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
4271 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
4272 "Unsupported value type");
4273 SDValue Value, Overflow;
4274 SDLoc DL(Op);
4275 SDValue LHS = Op.getOperand(0);
4276 SDValue RHS = Op.getOperand(1);
4277 unsigned Opc = 0;
4278 switch (Op.getOpcode()) {
4279 default:
4280 llvm_unreachable("Unknown overflow instruction!");
4281 case ISD::SADDO:
4282 Opc = AArch64ISD::ADDS;
4283 CC = AArch64CC::VS;
4284 break;
4285 case ISD::UADDO:
4286 Opc = AArch64ISD::ADDS;
4287 CC = AArch64CC::HS;
4288 break;
4289 case ISD::SSUBO:
4290 Opc = AArch64ISD::SUBS;
4291 CC = AArch64CC::VS;
4292 break;
4293 case ISD::USUBO:
4294 Opc = AArch64ISD::SUBS;
4295 CC = AArch64CC::LO;
4296 break;
4297 // Multiply needs a little bit of extra work.
4298 case ISD::SMULO:
4299 case ISD::UMULO: {
4300 CC = AArch64CC::NE;
4301 bool IsSigned = Op.getOpcode() == ISD::SMULO;
4302 if (Op.getValueType() == MVT::i32) {
4303 // Extend to 64-bits, then perform a 64-bit multiply.
4304 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4305 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
4306 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
4307 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4308 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
4309
4310 // Check that the result fits into a 32-bit integer.
4311 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4312 if (IsSigned) {
4313 // cmp xreg, wreg, sxtw
4314 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
4315 Overflow =
4316 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
4317 } else {
4318 // tst xreg, #0xffffffff00000000
4319 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
4320 Overflow =
4321 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
4322 }
4323 break;
4324 }
4325 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
4326 // For the 64 bit multiply
4327 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
4328 if (IsSigned) {
4329 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
4330 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
4331 DAG.getConstant(63, DL, MVT::i64));
4332 // It is important that LowerBits is last, otherwise the arithmetic
4333 // shift will not be folded into the compare (SUBS).
4334 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4335 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
4336 .getValue(1);
4337 } else {
4338 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
4339 SDVTList VTs = DAG.getVTList(MVT::i64, FlagsVT);
4340 Overflow =
4341 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
4342 DAG.getConstant(0, DL, MVT::i64),
4343 UpperBits).getValue(1);
4344 }
4345 break;
4346 }
4347 } // switch (...)
4348
4349 if (Opc) {
4350 SDVTList VTs = DAG.getVTList(Op->getValueType(0), FlagsVT);
4351
4352 // Emit the AArch64 operation with overflow check.
4353 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
4354 Overflow = Value.getValue(1);
4355 }
4356 return std::make_pair(Value, Overflow);
4357}
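// For the i32 smul.with.overflow case above, the emitted sequence is roughly
//   smull x8, w0, w1        ; 64-bit product of the 32-bit inputs
//   cmp   x8, w8, sxtw      ; does the product survive truncation to 32 bits?
//   cset  w9, ne            ; overflow bit
// i.e. the overflow test is "full product != sign-extension of its low half".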
4358
4359SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
4360 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
4361 !Subtarget->isNeonAvailable()))
4362 return LowerToScalableOp(Op, DAG);
4363
4364 SDValue Sel = Op.getOperand(0);
4365 SDValue Other = Op.getOperand(1);
4366 SDLoc DL(Sel);
4367
4368 // If the operand is an overflow checking operation, invert the condition
4369 // code and kill the Not operation. I.e., transform:
4370 // (xor (overflow_op_bool, 1))
4371 // -->
4372 // (csel 1, 0, invert(cc), overflow_op_bool)
4373 // ... which later gets transformed to just a cset instruction with an
4374 // inverted condition code, rather than a cset + eor sequence.
4375 if (isOverflowIntrOpRes(Sel)) {
4376 // Only lower legal XALUO ops.
4377 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
4378 return SDValue();
4379
4380 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4381 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4382 AArch64CC::CondCode CC;
4383 SDValue Value, Overflow;
4384 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
4385 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4386 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
4387 CCVal, Overflow);
4388 }
4389 // If neither operand is a SELECT_CC, give up.
4390 if (Sel.getOpcode() != ISD::SELECT_CC)
4391 std::swap(Sel, Other);
4392 if (Sel.getOpcode() != ISD::SELECT_CC)
4393 return Op;
4394
4395 // The folding we want to perform is:
4396 // (xor x, (select_cc a, b, cc, 0, -1) )
4397 // -->
4398 // (csel x, (xor x, -1), cc ...)
4399 //
4400 // The latter will get matched to a CSINV instruction.
4401
4402 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
4403 SDValue LHS = Sel.getOperand(0);
4404 SDValue RHS = Sel.getOperand(1);
4405 SDValue TVal = Sel.getOperand(2);
4406 SDValue FVal = Sel.getOperand(3);
4407
4408 // FIXME: This could be generalized to non-integer comparisons.
4409 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
4410 return Op;
4411
4412 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
4413 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
4414
4415 // The values aren't constants, this isn't the pattern we're looking for.
4416 if (!CFVal || !CTVal)
4417 return Op;
4418
4419 // We can commute the SELECT_CC by inverting the condition. This
4420 // might be needed to make this fit into a CSINV pattern.
4421 if (CTVal->isAllOnes() && CFVal->isZero()) {
4422 std::swap(TVal, FVal);
4423 std::swap(CTVal, CFVal);
4424 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
4425 }
4426
4427 // If the constants line up, perform the transform!
4428 if (CTVal->isZero() && CFVal->isAllOnes()) {
4429 SDValue CCVal;
4430 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
4431
4432 FVal = Other;
4433 TVal = DAG.getNode(ISD::XOR, DL, Other.getValueType(), Other,
4434 DAG.getAllOnesConstant(DL, Other.getValueType()));
4435
4436 return DAG.getNode(AArch64ISD::CSEL, DL, Sel.getValueType(), FVal, TVal,
4437 CCVal, Cmp);
4438 }
4439
4440 return Op;
4441}
4442
4443// If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
4444// bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
4445// sets 'C' bit to 0.
4446static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
4447 SDLoc DL(Value);
4448 EVT VT = Value.getValueType();
4449 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
4450 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
4451 SDValue Cmp =
4452 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT), Op0, Op1);
4453 return Cmp.getValue(1);
4454}
4455
4456// If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
4457// If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
4458static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG,
4459 bool Invert) {
4460 assert(Glue.getResNo() == 1);
4461 SDLoc DL(Glue);
4462 SDValue Zero = DAG.getConstant(0, DL, VT);
4463 SDValue One = DAG.getConstant(1, DL, VT);
4464 AArch64CC::CondCode Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
4465 SDValue CC = getCondCode(DAG, Cond);
4466 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4467}
4468
4469// Value is 1 if 'V' bit of NZCV is 1, else 0
4470static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG) {
4471 assert(Glue.getResNo() == 1);
4472 SDLoc DL(Glue);
4473 SDValue Zero = DAG.getConstant(0, DL, VT);
4474 SDValue One = DAG.getConstant(1, DL, VT);
4475 SDValue CC = getCondCode(DAG, AArch64CC::VS);
4476 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Glue);
4477}
4478
4479// This lowering is inefficient, but it will get cleaned up by
4480// `foldOverflowCheck`
4481static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
4482 unsigned Opcode, bool IsSigned) {
4483 EVT VT0 = Op.getValue(0).getValueType();
4484 EVT VT1 = Op.getValue(1).getValueType();
4485
4486 if (VT0 != MVT::i32 && VT0 != MVT::i64)
4487 return SDValue();
4488
4489 bool InvertCarry = Opcode == AArch64ISD::SBCS;
4490 SDValue OpLHS = Op.getOperand(0);
4491 SDValue OpRHS = Op.getOperand(1);
4492 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
4493
4494 SDLoc DL(Op);
4495
4496 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, FlagsVT), OpLHS,
4497 OpRHS, OpCarryIn);
4498
4499 SDValue OutFlag =
4500 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
4501 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
4502
4503 return DAG.getMergeValues({Sum, OutFlag}, DL);
4504}
4505
4506static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
4507 // Let legalize expand this if it isn't a legal type yet.
4508 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
4509 return SDValue();
4510
4511 SDLoc DL(Op);
4512 AArch64CC::CondCode CC;
4513 // The actual operation that sets the overflow or carry flag.
4514 SDValue Value, Overflow;
4515 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
4516
4517 // We use 0 and 1 as false and true values.
4518 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
4519 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
4520
4521 // We use an inverted condition, because the conditional select is inverted
4522 // too. This will allow it to be selected to a single instruction:
4523 // CSINC Wd, WZR, WZR, invert(cond).
4524 SDValue CCVal = getCondCode(DAG, getInvertedCondCode(CC));
4525 Overflow =
4526 DAG.getNode(AArch64ISD::CSEL, DL, MVT::i32, FVal, TVal, CCVal, Overflow);
4527
4528 return DAG.getMergeValues({Value, Overflow}, DL);
4529}
4530
4531// Prefetch operands are:
4532// 1: Address to prefetch
4533// 2: bool isWrite
4534// 3: int locality (0 = no locality ... 3 = extreme locality)
4535// 4: bool isDataCache
4536static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
4537 SDLoc DL(Op);
4538 unsigned IsWrite = Op.getConstantOperandVal(2);
4539 unsigned Locality = Op.getConstantOperandVal(3);
4540 unsigned IsData = Op.getConstantOperandVal(4);
4541
4542 bool IsStream = !Locality;
4543 // When the locality number is set
4544 if (Locality) {
4545 // The front-end should have filtered out the out-of-range values
4546 assert(Locality <= 3 && "Prefetch locality out-of-range");
4547 // The locality degree is the opposite of the cache speed.
4548 // Put the number the other way around.
4549 // The encoding starts at 0 for level 1
4550 Locality = 3 - Locality;
4551 }
4552
4553 // Build the mask value encoding the expected behavior.
4554 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
4555 (!IsData << 3) | // IsDataCache bit
4556 (Locality << 1) | // Cache level bits
4557 (unsigned)IsStream; // Stream bit
4558 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
4559 DAG.getTargetConstant(PrfOp, DL, MVT::i32),
4560 Op.getOperand(1));
4561}
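// Worked example of the encoding above: a read prefetch with locality 3 of the
// data cache gives IsWrite=0, IsData=1, IsStream=0 and Locality remapped to 0,
// so PrfOp = 0b00000, i.e. PLDL1KEEP; the same hint for a write sets bit 4 and
// becomes PSTL1KEEP (0b10000), while locality 0 sets the stream bit instead.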
4562
4563// Converts SETCC (AND X Y) Z ULT -> SETCC (AND X (Y & ~(Z - 1))) 0 EQ when Z
4564// is a power of 2. This is then lowered to ANDS X (Y & ~(Z - 1)) instead of
4565// SUBS (AND X Y) Z, which produces a better result with emitComparison().
4567 SelectionDAG &DAG, const SDLoc DL) {
4568 if (CC == ISD::SETULT && LHS.getOpcode() == ISD::AND && LHS->hasOneUse()) {
4569 ConstantSDNode *LHSConstOp = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
4570 ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
4571 if (LHSConstOp && RHSConst) {
4572 uint64_t LHSConstValue = LHSConstOp->getZExtValue();
4573 uint64_t RHSConstant = RHSConst->getZExtValue();
4574 if (isPowerOf2_64(RHSConstant)) {
4575 uint64_t NewMaskValue = LHSConstValue & ~(RHSConstant - 1);
4576 LHS =
4577 DAG.getNode(ISD::AND, DL, LHS.getValueType(), LHS.getOperand(0),
4578 DAG.getConstant(NewMaskValue, DL, LHS.getValueType()));
4579 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4580 CC = ISD::SETEQ;
4581 }
4582 }
4583 }
4584}
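// For example, "(x & 0xFF) u< 16" becomes "(x & 0xF0) == 0" here (16 is a
// power of two, so the masked-off low bits cannot affect the result), which
// then lowers to roughly "tst w0, #0xf0 ; b.eq" rather than an AND followed by
// a SUBS.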
4585
4586SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
4587 SelectionDAG &DAG) const {
4588 EVT VT = Op.getValueType();
4589 if (VT.isScalableVector()) {
4590 SDValue SrcVal = Op.getOperand(0);
4591
4592 if (VT == MVT::nxv2f64 && SrcVal.getValueType() == MVT::nxv2bf16) {
4593 // Break conversion in two with the first part converting to f32 and the
4594 // second using native f32->VT instructions.
4595 SDLoc DL(Op);
4596 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
4597 DAG.getNode(ISD::FP_EXTEND, DL, MVT::nxv2f32, SrcVal));
4598 }
4599
4600 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
4601 }
4602
4603 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
4604 return LowerFixedLengthFPExtendToSVE(Op, DAG);
4605
4606 bool IsStrict = Op->isStrictFPOpcode();
4607 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
4608 EVT Op0VT = Op0.getValueType();
4609 if (VT == MVT::f64) {
4610 // f32->f64 and f16->f64 extends are legal.
4611 if (Op0VT == MVT::f32 || Op0VT == MVT::f16)
4612 return Op;
4613 // Split bf16->f64 extends into two fpextends.
4614 if (Op0VT == MVT::bf16 && IsStrict) {
4615 SDValue Ext1 =
4616 DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other},
4617 {Op0, Op.getOperand(0)});
4618 return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other},
4619 {Ext1, Ext1.getValue(1)});
4620 }
4621 if (Op0VT == MVT::bf16)
4622 return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT,
4623 DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0));
4624 return SDValue();
4625 }
4626
4627 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
4628 return SDValue();
4629}
4630
4631SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
4632 SelectionDAG &DAG) const {
4633 EVT VT = Op.getValueType();
4634 bool IsStrict = Op->isStrictFPOpcode();
4635 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4636 EVT SrcVT = SrcVal.getValueType();
4637 bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
4638
4639 if (VT.isScalableVector()) {
4640 // Let common code split the operation.
4641 if (SrcVT == MVT::nxv8f32)
4642 return Op;
4643
4644 if (VT.getScalarType() != MVT::bf16)
4645 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4646
4647 SDLoc DL(Op);
4648 constexpr EVT I32 = MVT::nxv4i32;
4649 auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); };
4650
4651 SDValue NaN;
4652 SDValue Narrow;
4653
4654 if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) {
4655 if (Subtarget->hasBF16())
4656 return LowerToPredicatedOp(Op, DAG,
4657 AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
4658
4659 Narrow = getSVESafeBitCast(I32, SrcVal, DAG);
4660
4661 // Set the quiet bit.
4662 if (!DAG.isKnownNeverSNaN(SrcVal))
4663 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
4664 } else if (SrcVT == MVT::nxv2f64 &&
4665 (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
4666 // Round to float without introducing rounding errors and try again.
4667 SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
4668 Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
4669 Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
4670
4671 SmallVector<SDValue, 3> NewOps;
4672 if (IsStrict)
4673 NewOps.push_back(Op.getOperand(0));
4674 NewOps.push_back(Narrow);
4675 NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
4676 return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
4677 } else
4678 return SDValue();
4679
4680 if (!Trunc) {
4681 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4682 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1));
4683 SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff));
4684 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4685 }
4686
4687 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4688 // 0x80000000.
4689 if (NaN) {
4690 EVT I1 = I32.changeElementType(MVT::i1);
4691 EVT CondVT = VT.changeElementType(MVT::i1);
4692 SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO);
4693 IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN);
4694 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4695 }
4696
4697 // Now that we have rounded, shift the bits into position.
4698 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16));
4699 return getSVESafeBitCast(VT, Narrow, DAG);
4700 }
4701
4702 if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable()))
4703 return LowerFixedLengthFPRoundToSVE(Op, DAG);
4704
4705 // Expand cases where the result type is BF16 but we don't have hardware
4706 // instructions to lower it.
4707 if (VT.getScalarType() == MVT::bf16 &&
4708 !((Subtarget->hasNEON() || Subtarget->hasSME()) &&
4709 Subtarget->hasBF16())) {
4710 SDLoc DL(Op);
4711 SDValue Narrow = SrcVal;
4712 SDValue NaN;
4713 EVT I32 = SrcVT.changeElementType(MVT::i32);
4714 EVT F32 = SrcVT.changeElementType(MVT::f32);
4715 if (SrcVT.getScalarType() == MVT::f32) {
4716 bool NeverSNaN = DAG.isKnownNeverSNaN(Narrow);
4717 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4718 if (!NeverSNaN) {
4719 // Set the quiet bit.
4720 NaN = DAG.getNode(ISD::OR, DL, I32, Narrow,
4721 DAG.getConstant(0x400000, DL, I32));
4722 }
4723 } else if (SrcVT.getScalarType() == MVT::f64) {
4724 Narrow = DAG.getNode(AArch64ISD::FCVTXN, DL, F32, Narrow);
4725 Narrow = DAG.getNode(ISD::BITCAST, DL, I32, Narrow);
4726 } else {
4727 return SDValue();
4728 }
4729 if (!Trunc) {
4730 SDValue One = DAG.getConstant(1, DL, I32);
4731 SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4732 DAG.getShiftAmountConstant(16, I32, DL));
4733 Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, One);
4734 SDValue RoundingBias =
4735 DAG.getNode(ISD::ADD, DL, I32, DAG.getConstant(0x7fff, DL, I32), Lsb);
4736 Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias);
4737 }
4738
4739 // Don't round if we had a NaN, we don't want to turn 0x7fffffff into
4740 // 0x80000000.
4741 if (NaN) {
4742 SDValue IsNaN = DAG.getSetCC(
4743 DL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT),
4744 SrcVal, SrcVal, ISD::SETUO);
4745 Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow);
4746 }
4747
4748 // Now that we have rounded, shift the bits into position.
4749 Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow,
4750 DAG.getShiftAmountConstant(16, I32, DL));
4751 if (VT.isVector()) {
4752 EVT I16 = I32.changeVectorElementType(MVT::i16);
4753 Narrow = DAG.getNode(ISD::TRUNCATE, DL, I16, Narrow);
4754 return DAG.getNode(ISD::BITCAST, DL, VT, Narrow);
4755 }
4756 Narrow = DAG.getNode(ISD::BITCAST, DL, F32, Narrow);
4757 SDValue Result = DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Narrow);
4758 return IsStrict ? DAG.getMergeValues({Result, Op.getOperand(0)}, DL)
4759 : Result;
4760 }
4761
4762 if (SrcVT != MVT::f128) {
4763 // Expand cases where the input is a vector bigger than NEON.
4765 return SDValue();
4766
4767 // It's legal except when f128 is involved
4768 return Op;
4769 }
4770
4771 return SDValue();
4772}
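// The Lsb + 0x7fff bias used above is the usual round-to-nearest-even trick
// for truncating f32 bits to bf16: for instance, f32 bits 0x3F808000 sit
// exactly halfway between two bf16 values and stay at 0x3F80 (the even
// neighbour), while 0x3F818000 rounds up to 0x3F82.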
4773
4774SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
4775 SelectionDAG &DAG) const {
4776 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
4777 // Any additional optimization in this function should be recorded
4778 // in the cost tables.
4779 bool IsStrict = Op->isStrictFPOpcode();
4780 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
4781 EVT VT = Op.getValueType();
4782
4783 assert(!(IsStrict && VT.isScalableVector()) &&
4784 "Unimplemented SVE support for STRICT_FP_to_INT!");
4785
4786 // f16 conversions are promoted to f32 when full fp16 is not supported.
4787 if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4788 InVT.getVectorElementType() == MVT::bf16) {
4789 EVT NewVT = VT.changeElementType(MVT::f32);
4790 SDLoc DL(Op);
4791 if (IsStrict) {
4792 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {NewVT, MVT::Other},
4793 {Op.getOperand(0), Op.getOperand(1)});
4794 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4795 {Ext.getValue(1), Ext.getValue(0)});
4796 }
4797 return DAG.getNode(
4798 Op.getOpcode(), DL, Op.getValueType(),
4799 DAG.getNode(ISD::FP_EXTEND, DL, NewVT, Op.getOperand(0)));
4800 }
4801
4802 if (VT.isScalableVector()) {
4803 if (VT.getVectorElementType() == MVT::i1) {
4804 SDLoc DL(Op);
4805 EVT CvtVT = getPromotedVTForPredicate(VT);
4806 SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
4807 SDValue Zero = DAG.getConstant(0, DL, CvtVT);
4808 return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
4809 }
4810
4811 // Let common code split the operation.
4812 if (InVT == MVT::nxv8f32)
4813 return Op;
4814
4815 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
4816 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
4817 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
4818 return LowerToPredicatedOp(Op, DAG, Opcode);
4819 }
4820
4821 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
4822 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
4823 return LowerFixedLengthFPToIntToSVE(Op, DAG);
4824
4825 uint64_t VTSize = VT.getFixedSizeInBits();
4826 uint64_t InVTSize = InVT.getFixedSizeInBits();
4827 if (VTSize < InVTSize) {
4828 SDLoc DL(Op);
4829 if (IsStrict) {
4830 InVT = InVT.changeVectorElementTypeToInteger();
4831 SDValue Cv = DAG.getNode(Op.getOpcode(), DL, {InVT, MVT::Other},
4832 {Op.getOperand(0), Op.getOperand(1)});
4833 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4834 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, DL);
4835 }
4836 SDValue Cv =
4837 DAG.getNode(Op.getOpcode(), DL, InVT.changeVectorElementTypeToInteger(),
4838 Op.getOperand(0));
4839 return DAG.getNode(ISD::TRUNCATE, DL, VT, Cv);
4840 }
4841
4842 if (VTSize > InVTSize) {
4843 SDLoc DL(Op);
4844 MVT ExtVT =
4845 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
4846 VT.getVectorNumElements());
4847 if (IsStrict) {
4848 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {ExtVT, MVT::Other},
4849 {Op.getOperand(0), Op.getOperand(1)});
4850 return DAG.getNode(Op.getOpcode(), DL, {VT, MVT::Other},
4851 {Ext.getValue(1), Ext.getValue(0)});
4852 }
4853 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, ExtVT, Op.getOperand(0));
4854 return DAG.getNode(Op.getOpcode(), DL, VT, Ext);
4855 }
4856
4857 // Use a scalar operation for conversions between single-element vectors of
4858 // the same size.
4859 if (InVT.getVectorNumElements() == 1) {
4860 SDLoc DL(Op);
4861 SDValue Extract = DAG.getNode(
4862 ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(),
4863 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, DL, MVT::i64));
4864 EVT ScalarVT = VT.getScalarType();
4865 if (IsStrict)
4866 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
4867 {Op.getOperand(0), Extract});
4868 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
4869 }
4870
4871 // Type changing conversions are illegal.
4872 return Op;
4873}
4874
4875SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
4876 SelectionDAG &DAG) const {
4877 bool IsStrict = Op->isStrictFPOpcode();
4878 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4879
4880 if (SrcVal.getValueType().isVector())
4881 return LowerVectorFP_TO_INT(Op, DAG);
4882
4883 // f16 conversions are promoted to f32 when full fp16 is not supported.
4884 if ((SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
4885 SrcVal.getValueType() == MVT::bf16) {
4886 SDLoc DL(Op);
4887 if (IsStrict) {
4888 SDValue Ext =
4889 DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
4890 {Op.getOperand(0), SrcVal});
4891 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
4892 {Ext.getValue(1), Ext.getValue(0)});
4893 }
4894 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(),
4895 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, SrcVal));
4896 }
4897
4898 if (SrcVal.getValueType() != MVT::f128) {
4899 // It's legal except when f128 is involved
4900 return Op;
4901 }
4902
4903 return SDValue();
4904}
4905
4906SDValue
4907AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
4908 SelectionDAG &DAG) const {
4909 // AArch64 FP-to-int conversions saturate to the destination element size, so
4910 // we can lower common saturating conversions to simple instructions.
4911 SDValue SrcVal = Op.getOperand(0);
4912 EVT SrcVT = SrcVal.getValueType();
4913 EVT DstVT = Op.getValueType();
4914 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4915
4916 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
4917 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
4918 uint64_t SatWidth = SatVT.getScalarSizeInBits();
4919 assert(SatWidth <= DstElementWidth &&
4920 "Saturation width cannot exceed result width");
4921
4922 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
4923 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
4924 // types, so this is hard to reach.
4925 if (DstVT.isScalableVector())
4926 return SDValue();
4927
4928 EVT SrcElementVT = SrcVT.getVectorElementType();
4929
4930 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
4931 SDLoc DL(Op);
4932 SDValue SrcVal2;
4933 if ((SrcElementVT == MVT::f16 &&
4934 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
4935 SrcElementVT == MVT::bf16) {
4936 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4937 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4938 // If we are extending to a v8f32, split into two v4f32 to produce legal
4939 // types.
4940 if (F32VT.getSizeInBits() > 128) {
4941 std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4942 F32VT = F32VT.getHalfNumVectorElementsVT();
4943 }
4944 SrcVT = F32VT;
4945 SrcElementVT = MVT::f32;
4946 SrcElementWidth = 32;
4947 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
4948 SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
4949 return SDValue();
4950
4951 // Expand to f64 if we are saturating to i64, to help keep the lanes the same
4952 // width and produce a fcvtzu.
4953 if (SatWidth == 64 && SrcElementWidth < 64) {
4954 MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
4955 SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
4956 SrcVT = F64VT;
4957 SrcElementVT = MVT::f64;
4958 SrcElementWidth = 64;
4959 }
4960 // Cases that we can emit directly.
4961 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4962 SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4963 DAG.getValueType(DstVT.getScalarType()));
4964 if (SrcVal2) {
4965 SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4966 DAG.getValueType(DstVT.getScalarType()));
4967 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4968 }
4969 return Res;
4970 }
4971
4972 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
4973 // result. This is only valid if the legal cvt is larger than the saturate
4974 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
4975 // (at least until sqxtn is selected).
4976 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
4977 return SDValue();
4978
4979 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
4980 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
4981 DAG.getValueType(IntVT.getScalarType()));
4982 SDValue NativeCvt2 =
4983 SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4984 DAG.getValueType(IntVT.getScalarType()))
4985 : SDValue();
4986 SDValue Sat, Sat2;
4987 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
4988 SDValue MinC = DAG.getConstant(
4989 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4990 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4991 SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
4992 SDValue MaxC = DAG.getConstant(
4993 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
4994 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4995 Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
4996 } else {
4997 SDValue MinC = DAG.getConstant(
4998 APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
4999 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
5000 Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
5001 }
5002
5003 if (SrcVal2)
5004 Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
5006 Sat, Sat2);
5007
5008 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5009}
5010
5011SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
5012 SelectionDAG &DAG) const {
5013 // AArch64 FP-to-int conversions saturate to the destination register size, so
5014 // we can lower common saturating conversions to simple instructions.
5015 SDValue SrcVal = Op.getOperand(0);
5016 EVT SrcVT = SrcVal.getValueType();
5017
5018 if (SrcVT.isVector())
5019 return LowerVectorFP_TO_INT_SAT(Op, DAG);
5020
5021 EVT DstVT = Op.getValueType();
5022 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5023 uint64_t SatWidth = SatVT.getScalarSizeInBits();
5024 uint64_t DstWidth = DstVT.getScalarSizeInBits();
5025 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
5026
5027 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
5028 if ((SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) || SrcVT == MVT::bf16) {
5029 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
5030 SrcVT = MVT::f32;
5031 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16 &&
5032 SrcVT != MVT::bf16)
5033 return SDValue();
5034
5035 SDLoc DL(Op);
5036 // Cases that we can emit directly.
5037 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
5038 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
5039 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
5040 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
5041 DAG.getValueType(DstVT));
5042
5043 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
5044 // result. This is only valid if the legal cvt is larger than the saturate
5045 // width.
5046 if (DstWidth < SatWidth)
5047 return SDValue();
5048
5049 if (SrcVT == MVT::f16 && SatVT == MVT::i16 && DstVT == MVT::i32) {
5050 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5051 SDValue CVTf32 =
5052 DAG.getNode(AArch64ISD::FCVTZS_HALF, DL, MVT::f32, SrcVal);
5053 SDValue Bitcast = DAG.getBitcast(DstVT, CVTf32);
5054 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, Bitcast,
5055 DAG.getValueType(SatVT));
5056 }
5057 SDValue CVTf32 = DAG.getNode(AArch64ISD::FCVTZU_HALF, DL, MVT::f32, SrcVal);
5058 return DAG.getBitcast(DstVT, CVTf32);
5059 }
5060
5061 SDValue NativeCvt =
5062 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
5063 SDValue Sat;
5064 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
5065 SDValue MinC = DAG.getConstant(
5066 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
5067 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
5068 SDValue MaxC = DAG.getConstant(
5069 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
5070 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
5071 } else {
5072 SDValue MinC = DAG.getConstant(
5073 APInt::getAllOnes(SatWidth).zext(DstWidth), DL, DstVT);
5074 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
5075 }
5076
5077 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
5078}
5079
5080SDValue AArch64TargetLowering::LowerVectorXRINT(SDValue Op,
5081 SelectionDAG &DAG) const {
5082 EVT VT = Op.getValueType();
5083 SDValue Src = Op.getOperand(0);
5084 SDLoc DL(Op);
5085
5086 assert(VT.isVector() && "Expected vector type");
5087
5088 EVT CastVT =
5089 VT.changeVectorElementType(Src.getValueType().getVectorElementType());
5090
5091 // Round the floating-point value into a floating-point register with the
5092 // current rounding mode.
5093 SDValue FOp = DAG.getNode(ISD::FRINT, DL, CastVT, Src);
5094
5095 // Truncate the rounded floating point to an integer.
5096 return DAG.getNode(ISD::FP_TO_SINT_SAT, DL, VT, FOp,
5097 DAG.getValueType(VT.getVectorElementType()));
5098}
5099
5100SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
5101 SelectionDAG &DAG) const {
5102 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
5103 // Any additional optimization in this function should be recorded
5104 // in the cost tables.
5105 bool IsStrict = Op->isStrictFPOpcode();
5106 EVT VT = Op.getValueType();
5107 SDLoc DL(Op);
5108 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
5109 EVT InVT = In.getValueType();
5110 unsigned Opc = Op.getOpcode();
5111 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
5112
5113 assert(!(IsStrict && VT.isScalableVector()) &&
5114 "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
5115
5116 // NOTE: i1->bf16 does not require promotion to f32.
5117 if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
5118 SDValue FalseVal = DAG.getConstantFP(0.0, DL, VT);
5119 SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, DL, VT)
5120 : DAG.getConstantFP(1.0, DL, VT);
5121 return DAG.getNode(ISD::VSELECT, DL, VT, In, TrueVal, FalseVal);
5122 }
5123
5124 // Promote bf16 conversions to f32.
5125 if (VT.getVectorElementType() == MVT::bf16) {
5126 EVT F32 = VT.changeElementType(MVT::f32);
5127 if (IsStrict) {
5128 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {F32, MVT::Other},
5129 {Op.getOperand(0), In});
5130 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5131 {Op.getValueType(), MVT::Other},
5132 {Val.getValue(1), Val.getValue(0),
5133 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5134 }
5135 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5136 DAG.getNode(Op.getOpcode(), DL, F32, In),
5137 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5138 }
5139
5140 if (VT.isScalableVector()) {
5141 // Let common code split the operation.
5142 if (VT == MVT::nxv8f32)
5143 return Op;
5144
5145 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
5146 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
5147 return LowerToPredicatedOp(Op, DAG, Opcode);
5148 }
5149
5150 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
5151 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
5152 return LowerFixedLengthIntToFPToSVE(Op, DAG);
5153
5154 uint64_t VTSize = VT.getFixedSizeInBits();
5155 uint64_t InVTSize = InVT.getFixedSizeInBits();
5156 if (VTSize < InVTSize) {
5157 // AArch64 doesn't have a direct vector instruction to convert
5158 // fixed point to floating point AND narrow it at the same time.
5159 // Additional rounding when the target is f32/f64 causes double
5160 // rounding issues. Conversion to f16 is fine due to narrow width.
5161 bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
5162 bool IsTargetf16 = false;
5163 if (Op.hasOneUse() &&
5164 Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
5165 // Some vector types are split during legalization into half, followed by
5166 // concatenation, followed by rounding to the original vector type. If we
5167 // end up resolving to f16 type, we shouldn't worry about rounding errors.
5168 SDNode *U = *Op->user_begin();
5169 if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
5170 EVT TmpVT = U->user_begin()->getValueType(0);
5171 if (TmpVT.getScalarType() == MVT::f16)
5172 IsTargetf16 = true;
5173 }
5174 }
5175
5176 if (IsTargetf32 && !IsTargetf16) {
5177 return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
5178 }
5179
5180 MVT CastVT =
5181 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
5182 InVT.getVectorNumElements());
5183 if (IsStrict) {
5184 In = DAG.getNode(Opc, DL, {CastVT, MVT::Other}, {Op.getOperand(0), In});
5185 return DAG.getNode(ISD::STRICT_FP_ROUND, DL, {VT, MVT::Other},
5186 {In.getValue(1), In.getValue(0),
5187 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5188 }
5189 In = DAG.getNode(Opc, DL, CastVT, In);
5190 return DAG.getNode(ISD::FP_ROUND, DL, VT, In,
5191 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5192 }
5193
5194 if (VTSize > InVTSize) {
5195 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5196 EVT CastVT = VT.changeVectorElementTypeToInteger();
5197 In = DAG.getNode(CastOpc, DL, CastVT, In);
5198 if (IsStrict)
5199 return DAG.getNode(Opc, DL, {VT, MVT::Other}, {Op.getOperand(0), In});
5200 return DAG.getNode(Opc, DL, VT, In);
5201 }
5202
5203 // Use a scalar operation for conversions between single-element vectors of
5204 // the same size.
5205 if (VT.getVectorNumElements() == 1) {
5206 SDValue Extract =
5207 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InVT.getScalarType(), In,
5208 DAG.getConstant(0, DL, MVT::i64));
5209 EVT ScalarVT = VT.getScalarType();
5210 if (IsStrict)
5211 return DAG.getNode(Op.getOpcode(), DL, {ScalarVT, MVT::Other},
5212 {Op.getOperand(0), Extract});
5213 return DAG.getNode(Op.getOpcode(), DL, ScalarVT, Extract);
5214 }
5215
5216 return Op;
5217}
5218
5219SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
5220 SelectionDAG &DAG) const {
5221 if (Op.getValueType().isVector())
5222 return LowerVectorINT_TO_FP(Op, DAG);
5223
5224 bool IsStrict = Op->isStrictFPOpcode();
5225 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5226
5227 bool IsSigned = Op->getOpcode() == ISD::STRICT_SINT_TO_FP ||
5228 Op->getOpcode() == ISD::SINT_TO_FP;
5229
5230 auto IntToFpViaPromotion = [&](EVT PromoteVT) {
5231 SDLoc DL(Op);
5232 if (IsStrict) {
5233 SDValue Val = DAG.getNode(Op.getOpcode(), DL, {PromoteVT, MVT::Other},
5234 {Op.getOperand(0), SrcVal});
5235 return DAG.getNode(ISD::STRICT_FP_ROUND, DL,
5236 {Op.getValueType(), MVT::Other},
5237 {Val.getValue(1), Val.getValue(0),
5238 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)});
5239 }
5240 return DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(),
5241 DAG.getNode(Op.getOpcode(), DL, PromoteVT, SrcVal),
5242 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5243 };
5244
5245 if (Op.getValueType() == MVT::bf16) {
5246 unsigned MaxWidth = IsSigned
5247 ? DAG.ComputeMaxSignificantBits(SrcVal)
5248 : DAG.computeKnownBits(SrcVal).countMaxActiveBits();
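    // An f32 has 24 significand bits and an f64 has 53, so source values with
    // at most that many significant bits convert exactly and only the final
    // round to bf16 can lose precision, avoiding any double rounding.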
5249 // bf16 conversions are promoted to f32 when converting from i16.
5250 if (MaxWidth <= 24) {
5251 return IntToFpViaPromotion(MVT::f32);
5252 }
5253
5254 // bf16 conversions are promoted to f64 when converting from i32.
5255 if (MaxWidth <= 53) {
5256 return IntToFpViaPromotion(MVT::f64);
5257 }
5258
5259 // We need to be careful about i64 -> bf16.
5260 // Consider an i32 22216703.
5261    // This number cannot be represented exactly as an f32, so an itofp will
5262    // turn it into 22216704.0; an fptrunc to bf16 then turns this into
5263    // 22282240.0, whereas the correct bf16 result is 22151168.0.
5264 // We need to use sticky rounding to get this correct.
5265 if (SrcVal.getValueType() == MVT::i64) {
5266 SDLoc DL(Op);
5267 // This algorithm is equivalent to the following:
5268 // uint64_t SrcHi = SrcVal & ~0xfffull;
5269 // uint64_t SrcLo = SrcVal & 0xfffull;
5270 // uint64_t Highest = SrcVal >> 53;
5271 // bool HasHighest = Highest != 0;
5272 // uint64_t ToRound = HasHighest ? SrcHi : SrcVal;
5273 // double Rounded = static_cast<double>(ToRound);
5274 // uint64_t RoundedBits = std::bit_cast<uint64_t>(Rounded);
5275 // uint64_t HasLo = SrcLo != 0;
5276 // bool NeedsAdjustment = HasHighest & HasLo;
5277 // uint64_t AdjustedBits = RoundedBits | uint64_t{NeedsAdjustment};
5278 // double Adjusted = std::bit_cast<double>(AdjustedBits);
5279 // return static_cast<__bf16>(Adjusted);
5280 //
5281 // Essentially, what happens is that SrcVal either fits perfectly in a
5282 // double-precision value or it is too big. If it is sufficiently small,
5283 // we should just go u64 -> double -> bf16 in a naive way. Otherwise, we
5284 // ensure that u64 -> double has no rounding error by only using the 52
5285 // MSB of the input. The low order bits will get merged into a sticky bit
5286 // which will avoid issues incurred by double rounding.
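      // Intuitively, OR-ing a 1 into the LSB of the rounded double acts as a
      // sticky bit: bf16 keeps far fewer mantissa bits than f64, so that LSB
      // cannot change which bf16 values the result lies between, but it does
      // stop a discarded non-zero fraction from looking like an exact halfway
      // case in the final f64 -> bf16 rounding.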
5287
5288 // Signed conversion is more or less like so:
5289 // copysign((__bf16)abs(SrcVal), SrcVal)
5290 SDValue SignBit;
5291 if (IsSigned) {
5292 SignBit = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5293 DAG.getConstant(1ull << 63, DL, MVT::i64));
5294 SrcVal = DAG.getNode(ISD::ABS, DL, MVT::i64, SrcVal);
5295 }
5296 SDValue SrcHi = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5297 DAG.getConstant(~0xfffull, DL, MVT::i64));
5298 SDValue SrcLo = DAG.getNode(ISD::AND, DL, MVT::i64, SrcVal,
5299 DAG.getConstant(0xfffull, DL, MVT::i64));
5300      SDValue Highest =
5301          DAG.getNode(ISD::SRL, DL, MVT::i64, SrcVal,
5302 DAG.getShiftAmountConstant(53, MVT::i64, DL));
5303 SDValue Zero64 = DAG.getConstant(0, DL, MVT::i64);
5304 SDValue ToRound =
5305 DAG.getSelectCC(DL, Highest, Zero64, SrcHi, SrcVal, ISD::SETNE);
5306 SDValue Rounded =
5307 IsStrict ? DAG.getNode(Op.getOpcode(), DL, {MVT::f64, MVT::Other},
5308 {Op.getOperand(0), ToRound})
5309 : DAG.getNode(Op.getOpcode(), DL, MVT::f64, ToRound);
5310
5311 SDValue RoundedBits = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Rounded);
5312 if (SignBit) {
5313 RoundedBits = DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, SignBit);
5314 }
5315
5316 SDValue HasHighest = DAG.getSetCC(
5317 DL,
5318 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5319 Highest, Zero64, ISD::SETNE);
5320
5321 SDValue HasLo = DAG.getSetCC(
5322 DL,
5323 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
5324 SrcLo, Zero64, ISD::SETNE);
5325
5326 SDValue NeedsAdjustment =
5327 DAG.getNode(ISD::AND, DL, HasLo.getValueType(), HasHighest, HasLo);
5328 NeedsAdjustment = DAG.getZExtOrTrunc(NeedsAdjustment, DL, MVT::i64);
5329
5330 SDValue AdjustedBits =
5331 DAG.getNode(ISD::OR, DL, MVT::i64, RoundedBits, NeedsAdjustment);
5332 SDValue Adjusted = DAG.getNode(ISD::BITCAST, DL, MVT::f64, AdjustedBits);
5333 return IsStrict
5334 ? DAG.getNode(
5335                        ISD::STRICT_FP_ROUND, DL,
5336                        {Op.getValueType(), MVT::Other},
5337 {Rounded.getValue(1), Adjusted,
5338 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true)})
5339 : DAG.getNode(ISD::FP_ROUND, DL, Op.getValueType(), Adjusted,
5340 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
5341 }
5342 }
5343
5344 // f16 conversions are promoted to f32 when full fp16 is not supported.
5345 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
5346 return IntToFpViaPromotion(MVT::f32);
5347 }
5348
5349 // i128 conversions are libcalls.
5350 if (SrcVal.getValueType() == MVT::i128)
5351 return SDValue();
5352
5353 // Other conversions are legal, unless it's to the completely software-based
5354 // fp128.
5355 if (Op.getValueType() != MVT::f128)
5356 return Op;
5357 return SDValue();
5358}
5359
5360static MVT getSVEContainerType(EVT ContentTy);
5361
5362SDValue
5363AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op,
5364 SelectionDAG &DAG) const {
5365 SDLoc DL(Op);
5366 uint64_t EltSize = Op.getConstantOperandVal(2);
5367 EVT VT = Op.getValueType();
5368 switch (EltSize) {
5369 case 1:
5370 if (VT != MVT::v16i8 && VT != MVT::nxv16i1)
5371 return SDValue();
5372 break;
5373 case 2:
5374 if (VT != MVT::v8i8 && VT != MVT::nxv8i1)
5375 return SDValue();
5376 break;
5377 case 4:
5378 if (VT != MVT::v4i16 && VT != MVT::nxv4i1)
5379 return SDValue();
5380 break;
5381 case 8:
5382 if (VT != MVT::v2i32 && VT != MVT::nxv2i1)
5383 return SDValue();
5384 break;
5385 default:
5386 // Other element sizes are incompatible with whilewr/rw, so expand instead
5387 return SDValue();
5388 }
5389
5390 SDValue PtrA = Op.getOperand(0);
5391 SDValue PtrB = Op.getOperand(1);
5392
5393 if (VT.isScalableVT())
5394 return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2));
5395
5396 // We can use the SVE whilewr/whilerw instruction to lower this
5397 // intrinsic by creating the appropriate sequence of scalable vector
5398 // operations and then extracting a fixed-width subvector from the scalable
5399 // vector. Scalable vector variants are already legal.
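  // For example, a v16i8 result is computed as an nxv16i1 mask, sign-extended
  // to nxv16i8, and its first 128 bits are then extracted back out as v16i8.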
5400 EVT ContainerVT =
5401      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
5402                       VT.getVectorNumElements(), true);
5403 EVT WhileVT = ContainerVT.changeElementType(MVT::i1);
5404
5405 SDValue Mask =
5406 DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2));
5407 SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask);
5408 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt,
5409 DAG.getVectorIdxConstant(0, DL));
5410}
5411
5412SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
5413 SelectionDAG &DAG) const {
5414 EVT OpVT = Op.getValueType();
5415 EVT ArgVT = Op.getOperand(0).getValueType();
5416
5417  if (useSVEForFixedLengthVectorVT(OpVT))
5418    return LowerFixedLengthBitcastToSVE(Op, DAG);
5419
5420 if (OpVT.isScalableVector()) {
5421 assert(isTypeLegal(OpVT) && "Unexpected result type!");
5422
5423 // Handle type legalisation first.
5424 if (!isTypeLegal(ArgVT)) {
5425 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
5426 "Expected int->fp bitcast!");
5427
5428 // Bitcasting between unpacked vector types of different element counts is
5429 // not a NOP because the live elements are laid out differently.
5430 // 01234567
5431 // e.g. nxv2i32 = XX??XX??
5432 // nxv4f16 = X?X?X?X?
5433 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
5434 return SDValue();
5435
5436 SDValue ExtResult =
5437 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
5438 Op.getOperand(0));
5439 return getSVESafeBitCast(OpVT, ExtResult, DAG);
5440 }
5441
5442 // Bitcasts between legal types with the same element count are legal.
5443 if (OpVT.getVectorElementCount() == ArgVT.getVectorElementCount())
5444 return Op;
5445
5446 // getSVESafeBitCast does not support casting between unpacked types.
5447 if (!isPackedVectorType(OpVT, DAG))
5448 return SDValue();
5449
5450 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
5451 }
5452
5453 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
5454 return SDValue();
5455
5456 // Bitcasts between f16 and bf16 are legal.
5457 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
5458 return Op;
5459
5460 assert(ArgVT == MVT::i16);
5461 SDLoc DL(Op);
5462
5463 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
5464 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
5465 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, OpVT, Op);
5466}
5467
5468// Returns lane if Op extracts from a two-element vector and lane is constant
5469// (i.e., extractelt(<2 x Ty> %v, ConstantLane)), and std::nullopt otherwise.
5470static std::optional<uint64_t>
5471 getConstantLaneNumOfExtractHalfOperand(SDValue &Op) {
5472   SDNode *OpNode = Op.getNode();
5473 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
5474 return std::nullopt;
5475
5476 EVT VT = OpNode->getOperand(0).getValueType();
5477   ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
5478   if (!VT.isFixedLengthVector() || VT.getVectorNumElements() != 2 || !C)
5479 return std::nullopt;
5480
5481 return C->getZExtValue();
5482}
5483
5484 static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG,
5485                                    bool isSigned) {
5486 EVT VT = N.getValueType();
5487
5488 if (N.getOpcode() != ISD::BUILD_VECTOR)
5489 return false;
5490
5491 for (const SDValue &Elt : N->op_values()) {
5492    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
5493      unsigned EltSize = VT.getScalarSizeInBits();
5494 unsigned HalfSize = EltSize / 2;
5495 if (isSigned) {
5496 if (!isIntN(HalfSize, C->getSExtValue()))
5497 return false;
5498 } else {
5499 if (!isUIntN(HalfSize, C->getZExtValue()))
5500 return false;
5501 }
5502 continue;
5503 }
5504 return false;
5505 }
5506
5507 return true;
5508}
5509
5510 static SDValue skipExtensionForVectorMULL(SDValue N, SelectionDAG &DAG) {
5511   EVT VT = N.getValueType();
5512 assert(VT.is128BitVector() && "Unexpected vector MULL size");
5513 EVT HalfVT = EVT::getVectorVT(
5514 *DAG.getContext(),
5515      VT.getVectorElementType().getHalfSizedIntegerVT(*DAG.getContext()),
5516      VT.getVectorNumElements());
5517  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), HalfVT, N);
5518}
5519
5520 static bool isSignExtended(SDValue N, SelectionDAG &DAG) {
5521   return N.getOpcode() == ISD::SIGN_EXTEND ||
5522 N.getOpcode() == ISD::ANY_EXTEND ||
5523 isExtendedBUILD_VECTOR(N, DAG, true);
5524}
5525
5526 static bool isZeroExtended(SDValue N, SelectionDAG &DAG) {
5527   return N.getOpcode() == ISD::ZERO_EXTEND ||
5528 N.getOpcode() == ISD::ANY_EXTEND ||
5529 isExtendedBUILD_VECTOR(N, DAG, false);
5530}
5531
5532 static bool isAddSubSExt(SDValue N, SelectionDAG &DAG) {
5533   unsigned Opcode = N.getOpcode();
5534 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5535 SDValue N0 = N.getOperand(0);
5536 SDValue N1 = N.getOperand(1);
5537 return N0->hasOneUse() && N1->hasOneUse() &&
5538 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
5539 }
5540 return false;
5541}
5542
5543 static bool isAddSubZExt(SDValue N, SelectionDAG &DAG) {
5544   unsigned Opcode = N.getOpcode();
5545 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
5546 SDValue N0 = N.getOperand(0);
5547 SDValue N1 = N.getOperand(1);
5548 return N0->hasOneUse() && N1->hasOneUse() &&
5549 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
5550 }
5551 return false;
5552}
5553
5554SDValue AArch64TargetLowering::LowerGET_ROUNDING(SDValue Op,
5555 SelectionDAG &DAG) const {
5556  // The rounding mode is in bits 23:22 of the FPCR.
5557  // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
5558  // The formula we use to implement this is (((FPCR + (1 << 22)) >> 22) & 3)
5559  // so that the shift and the AND get folded into a bitfield extract.
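  // For example, considering only the RMode bits, a value of 2 gives
  // (((2 << 22) + (1 << 22)) >> 22) & 3 = 3, matching the 2->3 entry above.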
5560 SDLoc DL(Op);
5561
5562 SDValue Chain = Op.getOperand(0);
5563 SDValue FPCR_64 =
5564 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other},
5565 {Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL,
5566 MVT::i64)});
5567 Chain = FPCR_64.getValue(1);
5568 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR_64);
5569 SDValue FltRounds = DAG.getNode(ISD::ADD, DL, MVT::i32, FPCR_32,
5570 DAG.getConstant(1U << 22, DL, MVT::i32));
5571 SDValue RMODE = DAG.getNode(ISD::SRL, DL, MVT::i32, FltRounds,
5572 DAG.getConstant(22, DL, MVT::i32));
5573 SDValue AND = DAG.getNode(ISD::AND, DL, MVT::i32, RMODE,
5574 DAG.getConstant(3, DL, MVT::i32));
5575 return DAG.getMergeValues({AND, Chain}, DL);
5576}
5577
5578SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
5579 SelectionDAG &DAG) const {
5580 SDLoc DL(Op);
5581 SDValue Chain = Op->getOperand(0);
5582 SDValue RMValue = Op->getOperand(1);
5583
5584 // The rounding mode is in bits 23:22 of the FPCR.
5585  // The mapping from the llvm.set.rounding argument value to the rounding
5586  // mode in FPCR is 0->3, 1->0, 2->1, 3->2. The formula we use to implement
5587  // this is (((arg - 1) & 3) << 22).
5588  //
5589  // The argument of llvm.set.rounding must be within the range [0, 3], so
5590  // NearestTiesToAway (4) is not handled here. It is the responsibility of the
5591  // code that generates llvm.set.rounding to ensure this condition.
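  // For example, an argument of 0 gives ((0 - 1) & 3) = 3, which is then
  // shifted into bits 23:22, matching the 0->3 entry above.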
5592
5593 // Calculate new value of FPCR[23:22].
5594 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
5595 DAG.getConstant(1, DL, MVT::i32));
5596 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
5597 DAG.getConstant(0x3, DL, MVT::i32));
5598 RMValue =
5599 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
5600 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
5601 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
5602
5603 // Get current value of FPCR.
5604 SDValue Ops[] = {
5605 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5606 SDValue FPCR =
5607 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5608 Chain = FPCR.getValue(1);
5609 FPCR = FPCR.getValue(0);
5610
5611  // Put the new rounding mode into FPCR[23:22].
5612 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
5613 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
5614 DAG.getConstant(RMMask, DL, MVT::i64));
5615 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
5616 SDValue Ops2[] = {
5617 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5618 FPCR};
5619 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5620}
5621
5622SDValue AArch64TargetLowering::LowerGET_FPMODE(SDValue Op,
5623 SelectionDAG &DAG) const {
5624 SDLoc DL(Op);
5625 SDValue Chain = Op->getOperand(0);
5626
5627 // Get current value of FPCR.
5628 SDValue Ops[] = {
5629 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5630 SDValue FPCR =
5631 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5632 Chain = FPCR.getValue(1);
5633 FPCR = FPCR.getValue(0);
5634
5635 // Truncate FPCR to 32 bits.
5636 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, FPCR);
5637
5638 return DAG.getMergeValues({Result, Chain}, DL);
5639}
5640
5641SDValue AArch64TargetLowering::LowerSET_FPMODE(SDValue Op,
5642 SelectionDAG &DAG) const {
5643 SDLoc DL(Op);
5644 SDValue Chain = Op->getOperand(0);
5645 SDValue Mode = Op->getOperand(1);
5646
5647 // Extend the specified value to 64 bits.
5648 SDValue FPCR = DAG.getZExtOrTrunc(Mode, DL, MVT::i64);
5649
5650 // Set new value of FPCR.
5651 SDValue Ops2[] = {
5652 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5653 FPCR};
5654 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5655}
5656
5657SDValue AArch64TargetLowering::LowerRESET_FPMODE(SDValue Op,
5658 SelectionDAG &DAG) const {
5659 SDLoc DL(Op);
5660 SDValue Chain = Op->getOperand(0);
5661
5662 // Get current value of FPCR.
5663 SDValue Ops[] = {
5664 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
5665 SDValue FPCR =
5666 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
5667 Chain = FPCR.getValue(1);
5668 FPCR = FPCR.getValue(0);
5669
5670 // Clear bits that are not reserved.
5671 SDValue FPSCRMasked = DAG.getNode(
5672 ISD::AND, DL, MVT::i64, FPCR,
5674
5675 // Set new value of FPCR.
5676 SDValue Ops2[] = {
5677 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
5678 FPSCRMasked};
5679 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
5680}
5681
5682static unsigned selectUmullSmull(SDValue &N0, SDValue &N1, SelectionDAG &DAG,
5683 SDLoc DL, bool &IsMLA) {
5684 bool IsN0SExt = isSignExtended(N0, DAG);
5685 bool IsN1SExt = isSignExtended(N1, DAG);
5686 if (IsN0SExt && IsN1SExt)
5687 return AArch64ISD::SMULL;
5688
5689 bool IsN0ZExt = isZeroExtended(N0, DAG);
5690 bool IsN1ZExt = isZeroExtended(N1, DAG);
5691
5692 if (IsN0ZExt && IsN1ZExt)
5693 return AArch64ISD::UMULL;
5694
5695 // Select UMULL if we can replace the other operand with an extend.
5696 EVT VT = N0.getValueType();
5697 unsigned EltSize = VT.getScalarSizeInBits();
5698 APInt Mask = APInt::getHighBitsSet(EltSize, EltSize / 2);
5699 if (IsN0ZExt || IsN1ZExt) {
5700 if (DAG.MaskedValueIsZero(IsN0ZExt ? N1 : N0, Mask))
5701 return AArch64ISD::UMULL;
5702 } else if (VT == MVT::v2i64 && DAG.MaskedValueIsZero(N0, Mask) &&
5703 DAG.MaskedValueIsZero(N1, Mask)) {
5704 // For v2i64 we look more aggressively at both operands being zero, to avoid
5705 // scalarization.
5706 return AArch64ISD::UMULL;
5707 }
5708
5709 if (IsN0SExt || IsN1SExt) {
5710 if (DAG.ComputeNumSignBits(IsN0SExt ? N1 : N0) > EltSize / 2)
5711 return AArch64ISD::SMULL;
5712 } else if (VT == MVT::v2i64 && DAG.ComputeNumSignBits(N0) > EltSize / 2 &&
5713 DAG.ComputeNumSignBits(N1) > EltSize / 2) {
5714 return AArch64ISD::SMULL;
5715 }
5716
5717 if (!IsN1SExt && !IsN1ZExt)
5718 return 0;
5719
5720 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
5721 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
5722 if (IsN1SExt && isAddSubSExt(N0, DAG)) {
5723 IsMLA = true;
5724 return AArch64ISD::SMULL;
5725 }
5726 if (IsN1ZExt && isAddSubZExt(N0, DAG)) {
5727 IsMLA = true;
5728 return AArch64ISD::UMULL;
5729 }
5730 if (IsN0ZExt && isAddSubZExt(N1, DAG)) {
5731 std::swap(N0, N1);
5732 IsMLA = true;
5733 return AArch64ISD::UMULL;
5734 }
5735 return 0;
5736}
5737
5738SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
5739 EVT VT = Op.getValueType();
5740
5741 bool OverrideNEON = !Subtarget->isNeonAvailable();
5742 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
5743 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5744
5745 // Multiplications are only custom-lowered for 128-bit and 64-bit vectors so
5746 // that VMULL can be detected. Otherwise v2i64 multiplications are not legal.
5747 assert((VT.is128BitVector() || VT.is64BitVector()) && VT.isInteger() &&
5748 "unexpected type for custom-lowering ISD::MUL");
5749 SDValue N0 = Op.getOperand(0);
5750 SDValue N1 = Op.getOperand(1);
5751 bool isMLA = false;
5752 EVT OVT = VT;
5753 if (VT.is64BitVector()) {
5754 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5755 isNullConstant(N0.getOperand(1)) &&
5756        N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5757        isNullConstant(N1.getOperand(1))) {
5758 N0 = N0.getOperand(0);
5759 N1 = N1.getOperand(0);
5760 VT = N0.getValueType();
5761 } else {
5762 if (VT == MVT::v1i64) {
5763 if (Subtarget->hasSVE())
5764 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5765 // Fall through to expand this. It is not legal.
5766 return SDValue();
5767 } else
5768 // Other vector multiplications are legal.
5769 return Op;
5770 }
5771 }
5772
5773 SDLoc DL(Op);
5774 unsigned NewOpc = selectUmullSmull(N0, N1, DAG, DL, isMLA);
5775
5776 if (!NewOpc) {
5777 if (VT.getVectorElementType() == MVT::i64) {
5778 // If SVE is available then i64 vector multiplications can also be made
5779 // legal.
5780 if (Subtarget->hasSVE())
5781 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
5782 // Fall through to expand this. It is not legal.
5783 return SDValue();
5784 } else
5785 // Other vector multiplications are legal.
5786 return Op;
5787 }
5788
5789 // Legalize to a S/UMULL instruction
5790 SDValue Op0;
5791 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
5792 if (!isMLA) {
5793 Op0 = skipExtensionForVectorMULL(N0, DAG);
5794    assert(Op0.getValueType().is64BitVector() &&
5795           Op1.getValueType().is64BitVector() &&
5796 "unexpected types for extended operands to VMULL");
5797 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OVT,
5798 DAG.getNode(NewOpc, DL, VT, Op0, Op1),
5799 DAG.getConstant(0, DL, MVT::i64));
5800 }
5801 // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
5802 // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
5803 // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
5804  SDValue N00 = skipExtensionForVectorMULL(N0.getOperand(0), DAG);
5805  SDValue N01 = skipExtensionForVectorMULL(N0.getOperand(1), DAG);
5806  EVT Op1VT = Op1.getValueType();
5807 return DAG.getNode(
5808      ISD::EXTRACT_SUBVECTOR, DL, OVT,
5809      DAG.getNode(N0.getOpcode(), DL, VT,
5810 DAG.getNode(NewOpc, DL, VT,
5811 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
5812 DAG.getNode(NewOpc, DL, VT,
5813 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)),
5814 DAG.getConstant(0, DL, MVT::i64));
5815}
5816
5817static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
5818 int Pattern) {
5819 if (Pattern == AArch64SVEPredPattern::all)
5820 return DAG.getConstant(1, DL, VT);
5821 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
5822 DAG.getTargetConstant(Pattern, DL, MVT::i32));
5823}
5824
5825 static SDValue optimizeIncrementingWhile(SDNode *N, SelectionDAG &DAG,
5826                                          bool IsSigned, bool IsEqual) {
5827 unsigned Op0 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 1 : 0;
5828 unsigned Op1 = N->getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 2 : 1;
5829
5830 if (!N->getValueType(0).isScalableVector() ||
5831 !isa<ConstantSDNode>(N->getOperand(Op1)))
5832 return SDValue();
5833
5834 SDLoc DL(N);
5835 APInt Y = N->getConstantOperandAPInt(Op1);
5836
5837 // When the second operand is the maximum value, comparisons that include
5838 // equality can never fail and thus we can return an all active predicate.
5839 if (IsEqual)
5840 if (IsSigned ? Y.isMaxSignedValue() : Y.isMaxValue())
5841 return DAG.getConstant(1, DL, N->getValueType(0));
5842
5843 if (!isa<ConstantSDNode>(N->getOperand(Op0)))
5844 return SDValue();
5845
5846 APInt X = N->getConstantOperandAPInt(Op0);
5847
5848 bool Overflow;
5849 APInt NumActiveElems =
5850 IsSigned ? Y.ssub_ov(X, Overflow) : Y.usub_ov(X, Overflow);
5851
5852 if (Overflow)
5853 return SDValue();
5854
5855 if (IsEqual) {
5856 APInt One(NumActiveElems.getBitWidth(), 1, IsSigned);
5857 NumActiveElems = IsSigned ? NumActiveElems.sadd_ov(One, Overflow)
5858 : NumActiveElems.uadd_ov(One, Overflow);
5859 if (Overflow)
5860 return SDValue();
5861 }
5862
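  // For example, whilele(3, 7) has 5 active lanes; if the minimum SVE vector
  // length guarantees at least 5 lanes at this element size, the result can be
  // materialized as a PTRUE with the VL5 pattern.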
5863 std::optional<unsigned> PredPattern =
5864      getSVEPredPatternFromNumElements(NumActiveElems.getZExtValue());
5865  unsigned MinSVEVectorSize = std::max(
5867 unsigned ElementSize = 128 / N->getValueType(0).getVectorMinNumElements();
5868 if (PredPattern != std::nullopt &&
5869 NumActiveElems.getZExtValue() <= (MinSVEVectorSize / ElementSize))
5870 return getPTrue(DAG, DL, N->getValueType(0), *PredPattern);
5871
5872 return SDValue();
5873}
5874
5875// Returns a safe bitcast between two scalable vector predicates, where
5876// any newly created lanes from a widening bitcast are defined as zero.
5877 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
5878   SDLoc DL(Op);
5879 EVT InVT = Op.getValueType();
5880
5881 assert(InVT.getVectorElementType() == MVT::i1 &&
5882 VT.getVectorElementType() == MVT::i1 &&
5883 "Expected a predicate-to-predicate bitcast");
5884  assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
5885         InVT.isScalableVector() &&
5886 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
5887 "Only expect to cast between legal scalable predicate types!");
5888
5889  // Return the operand if the cast isn't changing type.
5890 if (InVT == VT)
5891 return Op;
5892
5893 // Look through casts to <vscale x 16 x i1> when their input has more lanes
5894 // than VT. This will increase the chances of removing casts that introduce
5895 // new lanes, which have to be explicitly zero'd.
5896 if (Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
5897 Op.getConstantOperandVal(0) == Intrinsic::aarch64_sve_convert_to_svbool &&
5898 Op.getOperand(1).getValueType().bitsGT(VT))
5899 Op = Op.getOperand(1);
5900
5901 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
5902
5903 // We only have to zero the lanes if new lanes are being defined, e.g. when
5904 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
5905 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
5906 // we can return here.
5907 if (InVT.bitsGT(VT))
5908 return Reinterpret;
5909
5910 // Check if the other lanes are already known to be zeroed by
5911 // construction.
5912  if (isZeroingInactiveLanes(Op))
5913    return Reinterpret;
5914
5915 // Zero the newly introduced lanes.
5916 SDValue Mask = DAG.getConstant(1, DL, InVT);
5917 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
5918 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
5919}
5920
5921SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG,
5922 SDValue Chain, SDLoc DL,
5923 EVT VT) const {
5924 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE;
5927 Type *Int64Ty = Type::getInt64Ty(*DAG.getContext());
5928 Type *RetTy = StructType::get(Int64Ty, Int64Ty);
5929 TargetLowering::CallLoweringInfo CLI(DAG);
5931 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
5932 getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
5933 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
5934 SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64);
5935 return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0),
5936 Mask);
5937}
5938
5939// Lower an SME LDR/STR ZA intrinsic
5940// Case 1: If the vector number (vecnum) is an immediate in range, it gets
5941// folded into the instruction
5942// ldr(%tileslice, %ptr, 11) -> ldr [%tileslice, 11], [%ptr, 11]
5943// Case 2: If the vecnum is not an immediate, then it is used to modify the base
5944// and tile slice registers
5945// ldr(%tileslice, %ptr, %vecnum)
5946// ->
5947// %svl = rdsvl
5948// %ptr2 = %ptr + %svl * %vecnum
5949// %tileslice2 = %tileslice + %vecnum
5950// ldr [%tileslice2, 0], [%ptr2, 0]
5951// Case 3: If the vecnum is an immediate out of range, then the same is done as
5952// case 2, but the base and slice registers are modified by the greatest
5953// multiple of 15 lower than the vecnum and the remainder is folded into the
5954// instruction. This means that successive loads and stores that are offset from
5955// each other can share the same base and slice register updates.
5956// ldr(%tileslice, %ptr, 22)
5957// ldr(%tileslice, %ptr, 23)
5958// ->
5959// %svl = rdsvl
5960// %ptr2 = %ptr + %svl * 15
5961// %tileslice2 = %tileslice + 15
5962// ldr [%tileslice2, 7], [%ptr2, 7]
5963// ldr [%tileslice2, 8], [%ptr2, 8]
5964// Case 4: If the vecnum is an add of an immediate, then the non-immediate
5965// operand and the immediate can be folded into the instruction, like case 2.
5966// ldr(%tileslice, %ptr, %vecnum + 7)
5967// ldr(%tileslice, %ptr, %vecnum + 8)
5968// ->
5969// %svl = rdsvl
5970// %ptr2 = %ptr + %svl * %vecnum
5971// %tileslice2 = %tileslice + %vecnum
5972// ldr [%tileslice2, 7], [%ptr2, 7]
5973// ldr [%tileslice2, 8], [%ptr2, 8]
5974// Case 5: The vecnum being an add of an immediate out of range is also handled,
5975// in which case the same remainder logic as case 3 is used.
5976 static SDValue LowerSMELdrStr(SDValue N, SelectionDAG &DAG, bool IsLoad) {
5977   SDLoc DL(N);
5978
5979 SDValue TileSlice = N->getOperand(2);
5980 SDValue Base = N->getOperand(3);
5981 SDValue VecNum = N->getOperand(4);
5982 int32_t ConstAddend = 0;
5983 SDValue VarAddend = VecNum;
5984
5985 // If the vnum is an add of an immediate, we can fold it into the instruction
5986 if (VecNum.getOpcode() == ISD::ADD &&
5987 isa<ConstantSDNode>(VecNum.getOperand(1))) {
5988 ConstAddend = cast<ConstantSDNode>(VecNum.getOperand(1))->getSExtValue();
5989 VarAddend = VecNum.getOperand(0);
5990 } else if (auto ImmNode = dyn_cast<ConstantSDNode>(VecNum)) {
5991 ConstAddend = ImmNode->getSExtValue();
5992 VarAddend = SDValue();
5993 }
5994
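  // For example, a constant addend of 22 splits into ImmAddend = 6 (folded
  // into the instruction's immediate) and C = 16 (folded into the base and
  // tile slice updates below).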
5995 int32_t ImmAddend = ConstAddend % 16;
5996 if (int32_t C = (ConstAddend - ImmAddend)) {
5997 SDValue CVal = DAG.getTargetConstant(C, DL, MVT::i32);
5998 VarAddend = VarAddend
5999 ? DAG.getNode(ISD::ADD, DL, MVT::i32, {VarAddend, CVal})
6000 : CVal;
6001 }
6002
6003 if (VarAddend) {
6004 // Get the vector length that will be multiplied by vnum
6005 auto SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
6006 DAG.getConstant(1, DL, MVT::i32));
6007
6008 // Multiply SVL and vnum then add it to the base
6009 SDValue Mul = DAG.getNode(
6010 ISD::MUL, DL, MVT::i64,
6011 {SVL, DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, VarAddend)});
6012 Base = DAG.getNode(ISD::ADD, DL, MVT::i64, {Base, Mul});
6013 // Just add vnum to the tileslice
6014 TileSlice = DAG.getNode(ISD::ADD, DL, MVT::i32, {TileSlice, VarAddend});
6015 }
6016
6017 return DAG.getNode(IsLoad ? AArch64ISD::SME_ZA_LDR : AArch64ISD::SME_ZA_STR,
6018 DL, MVT::Other,
6019 {/*Chain=*/N.getOperand(0), TileSlice, Base,
6020 DAG.getTargetConstant(ImmAddend, DL, MVT::i32)});
6021}
6022
6023 static SDValue LowerVectorMatch(SDValue Op, SelectionDAG &DAG) {
6024   SDLoc DL(Op);
6025 SDValue ID =
6026 DAG.getTargetConstant(Intrinsic::aarch64_sve_match, DL, MVT::i64);
6027
6028 auto Op1 = Op.getOperand(1);
6029 auto Op2 = Op.getOperand(2);
6030 auto Mask = Op.getOperand(3);
6031
6032 EVT Op1VT = Op1.getValueType();
6033 EVT Op2VT = Op2.getValueType();
6034 EVT ResVT = Op.getValueType();
6035
6036 assert((Op1VT.getVectorElementType() == MVT::i8 ||
6037 Op1VT.getVectorElementType() == MVT::i16) &&
6038 "Expected 8-bit or 16-bit characters.");
6039
6040 // Scalable vector type used to wrap operands.
6041 // A single container is enough for both operands because ultimately the
6042 // operands will have to be wrapped to the same type (nxv16i8 or nxv8i16).
6043 EVT OpContainerVT = Op1VT.isScalableVector()
6044 ? Op1VT
6045                              : getContainerForFixedLengthVector(DAG, Op1VT);
6046
6047 if (Op2VT.is128BitVector()) {
6048 // If Op2 is a full 128-bit vector, wrap it trivially in a scalable vector.
6049 Op2 = convertToScalableVector(DAG, OpContainerVT, Op2);
6050 // Further, if the result is scalable, broadcast Op2 to a full SVE register.
6051 if (ResVT.isScalableVector())
6052 Op2 = DAG.getNode(AArch64ISD::DUPLANE128, DL, OpContainerVT, Op2,
6053 DAG.getTargetConstant(0, DL, MVT::i64));
6054 } else {
6055 // If Op2 is not a full 128-bit vector, we always need to broadcast it.
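    // For example, a v8i8 needle is viewed as a single i64 element, splatted
    // across an nxv2i64 vector, and then bitcast back to the container type.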
6056 unsigned Op2BitWidth = Op2VT.getFixedSizeInBits();
6057 MVT Op2IntVT = MVT::getIntegerVT(Op2BitWidth);
6058 EVT Op2PromotedVT = getPackedSVEVectorVT(Op2IntVT);
6059 Op2 = DAG.getBitcast(MVT::getVectorVT(Op2IntVT, 1), Op2);
6060 Op2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op2IntVT, Op2,
6061 DAG.getConstant(0, DL, MVT::i64));
6062 Op2 = DAG.getSplatVector(Op2PromotedVT, DL, Op2);
6063 Op2 = DAG.getBitcast(OpContainerVT, Op2);
6064 }
6065
6066 // If the result is scalable, we just need to carry out the MATCH.
6067 if (ResVT.isScalableVector())
6068 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResVT, ID, Mask, Op1, Op2);
6069
6070 // If the result is fixed, we can still use MATCH but we need to wrap the
6071 // first operand and the mask in scalable vectors before doing so.
6072
6073 // Wrap the operands.
6074 Op1 = convertToScalableVector(DAG, OpContainerVT, Op1);
6075 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, Op1VT, Mask);
6076 Mask = convertFixedMaskToScalableVector(Mask, DAG);
6077
6078 // Carry out the match.
6079 SDValue Match = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Mask.getValueType(),
6080 ID, Mask, Op1, Op2);
6081
6082 // Extract and promote the match result (nxv16i1/nxv8i1) to ResVT
6083 // (v16i8/v8i8).
6084 Match = DAG.getNode(ISD::SIGN_EXTEND, DL, OpContainerVT, Match);
6085 Match = convertFromScalableVector(DAG, Op1VT, Match);
6086 return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Match);
6087}
6088
6089SDValue AArch64TargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6090 SelectionDAG &DAG) const {
6091 unsigned IntNo = Op.getConstantOperandVal(1);
6092 SDLoc DL(Op);
6093 switch (IntNo) {
6094 default:
6095 return SDValue(); // Don't custom lower most intrinsics.
6096 case Intrinsic::aarch64_prefetch: {
6097 SDValue Chain = Op.getOperand(0);
6098 SDValue Addr = Op.getOperand(2);
6099
6100 unsigned IsWrite = Op.getConstantOperandVal(3);
6101 unsigned Locality = Op.getConstantOperandVal(4);
6102 unsigned IsStream = Op.getConstantOperandVal(5);
6103 unsigned IsData = Op.getConstantOperandVal(6);
6104 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
6105 (!IsData << 3) | // IsDataCache bit
6106 (Locality << 1) | // Cache level bits
6107 (unsigned)IsStream; // Stream bit
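    // For example, a read prefetch of data (IsWrite = 0, IsData = 1) with
    // locality 3 and no streaming hint encodes as (0<<4)|(0<<3)|(3<<1)|0 = 6.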
6108
6109 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Chain,
6110 DAG.getTargetConstant(PrfOp, DL, MVT::i32), Addr);
6111 }
6112 case Intrinsic::aarch64_sme_str:
6113 case Intrinsic::aarch64_sme_ldr: {
6114 return LowerSMELdrStr(Op, DAG, IntNo == Intrinsic::aarch64_sme_ldr);
6115 }
6116 case Intrinsic::aarch64_sme_za_enable:
6117 return DAG.getNode(
6118 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6119 Op->getOperand(0), // Chain
6120 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6121 case Intrinsic::aarch64_sme_za_disable:
6122 return DAG.getNode(
6123 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue),
6124 Op->getOperand(0), // Chain
6125 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
6126 }
6127}
6128
6129SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
6130 SelectionDAG &DAG) const {
6131 unsigned IntNo = Op.getConstantOperandVal(1);
6132 SDLoc DL(Op);
6133 switch (IntNo) {
6134 default:
6135 return SDValue(); // Don't custom lower most intrinsics.
6136 case Intrinsic::aarch64_mops_memset_tag: {
6137 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
6138 SDValue Chain = Node->getChain();
6139 SDValue Dst = Op.getOperand(2);
6140 SDValue Val = Op.getOperand(3);
6141 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
6142 SDValue Size = Op.getOperand(4);
6143 auto Alignment = Node->getMemOperand()->getAlign();
6144 bool IsVol = Node->isVolatile();
6145 auto DstPtrInfo = Node->getPointerInfo();
6146
6147 const auto &SDI =
6148 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
6149 SDValue MS = SDI.EmitMOPS(AArch64::MOPSMemorySetTaggingPseudo, DAG, DL,
6150 Chain, Dst, Val, Size, Alignment, IsVol,
6151 DstPtrInfo, MachinePointerInfo{});
6152
6153 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
6154 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
6155 // LowerOperationWrapper will complain that the number of results has
6156 // changed.
6157 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
6158 }
6159 }
6160}
6161
6162SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6163 SelectionDAG &DAG) const {
6164 unsigned IntNo = Op.getConstantOperandVal(0);
6165 SDLoc DL(Op);
6166 switch (IntNo) {
6167 default: return SDValue(); // Don't custom lower most intrinsics.
6168 case Intrinsic::thread_pointer: {
6169 EVT PtrVT = getPointerTy(DAG.getDataLayout());
6170 return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
6171 }
6172 case Intrinsic::aarch64_sve_whilewr_b:
6173 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6174 Op.getOperand(1), Op.getOperand(2),
6175 DAG.getConstant(1, DL, MVT::i64));
6176 case Intrinsic::aarch64_sve_whilewr_h:
6177 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6178 Op.getOperand(1), Op.getOperand(2),
6179 DAG.getConstant(2, DL, MVT::i64));
6180 case Intrinsic::aarch64_sve_whilewr_s:
6181 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6182 Op.getOperand(1), Op.getOperand(2),
6183 DAG.getConstant(4, DL, MVT::i64));
6184 case Intrinsic::aarch64_sve_whilewr_d:
6185 return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(),
6186 Op.getOperand(1), Op.getOperand(2),
6187 DAG.getConstant(8, DL, MVT::i64));
6188 case Intrinsic::aarch64_sve_whilerw_b:
6189 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6190 Op.getOperand(1), Op.getOperand(2),
6191 DAG.getConstant(1, DL, MVT::i64));
6192 case Intrinsic::aarch64_sve_whilerw_h:
6193 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6194 Op.getOperand(1), Op.getOperand(2),
6195 DAG.getConstant(2, DL, MVT::i64));
6196 case Intrinsic::aarch64_sve_whilerw_s:
6197 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6198 Op.getOperand(1), Op.getOperand(2),
6199 DAG.getConstant(4, DL, MVT::i64));
6200 case Intrinsic::aarch64_sve_whilerw_d:
6201 return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(),
6202 Op.getOperand(1), Op.getOperand(2),
6203 DAG.getConstant(8, DL, MVT::i64));
6204 case Intrinsic::aarch64_neon_abs: {
6205 EVT Ty = Op.getValueType();
6206 if (Ty == MVT::i64) {
6207 SDValue Result =
6208 DAG.getNode(ISD::BITCAST, DL, MVT::v1i64, Op.getOperand(1));
6209 Result = DAG.getNode(ISD::ABS, DL, MVT::v1i64, Result);
6210 return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Result);
6211 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
6212 return DAG.getNode(ISD::ABS, DL, Ty, Op.getOperand(1));
6213 } else {
6214 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
6215 }
6216 }
6217 case Intrinsic::aarch64_neon_pmull64: {
6218 SDValue LHS = Op.getOperand(1);
6219 SDValue RHS = Op.getOperand(2);
6220
6221 std::optional<uint64_t> LHSLane =
6222        getConstantLaneNumOfExtractHalfOperand(LHS);
6223    std::optional<uint64_t> RHSLane =
6224        getConstantLaneNumOfExtractHalfOperand(RHS);
6225
6226 assert((!LHSLane || *LHSLane < 2) && "Expect lane to be None or 0 or 1");
6227 assert((!RHSLane || *RHSLane < 2) && "Expect lane to be None or 0 or 1");
6228
6229    // 'aarch64_neon_pmull64' takes i64 parameters, while the pmull/pmull2
6230 // instructions execute on SIMD registers. So canonicalize i64 to v1i64,
6231 // which ISel recognizes better. For example, generate a ldr into d*
6232 // registers as opposed to a GPR load followed by a fmov.
6233 auto TryVectorizeOperand = [](SDValue N, std::optional<uint64_t> NLane,
6234 std::optional<uint64_t> OtherLane,
6235 const SDLoc &DL,
6236 SelectionDAG &DAG) -> SDValue {
6237      // If the operand is a higher half itself, rewrite it to
6238 // extract_high_v2i64; this way aarch64_neon_pmull64 could
6239 // re-use the dag-combiner function with aarch64_neon_{pmull,smull,umull}.
6240 if (NLane == 1)
6241 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6242 N.getOperand(0), DAG.getConstant(1, DL, MVT::i64));
6243
6244 // Operand N is not a higher half but the other operand is.
6245 if (OtherLane == 1) {
6246 // If this operand is a lower half, rewrite it to
6247 // extract_high_v2i64(duplane(<2 x Ty>, 0)). This saves a roundtrip to
6248 // align lanes of two operands. A roundtrip sequence (to move from lane
6249 // 1 to lane 0) is like this:
6250 // mov x8, v0.d[1]
6251 // fmov d0, x8
6252 if (NLane == 0)
6253 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v1i64,
6254 DAG.getNode(AArch64ISD::DUPLANE64, DL, MVT::v2i64,
6255 N.getOperand(0),
6256 DAG.getConstant(0, DL, MVT::i64)),
6257 DAG.getConstant(1, DL, MVT::i64));
6258
6259 // Otherwise just dup from main to all lanes.
6260 return DAG.getNode(AArch64ISD::DUP, DL, MVT::v1i64, N);
6261 }
6262
6263 // Neither operand is an extract of higher half, so codegen may just use
6264 // the non-high version of PMULL instruction. Use v1i64 to represent i64.
6265 assert(N.getValueType() == MVT::i64 &&
6266 "Intrinsic aarch64_neon_pmull64 requires i64 parameters");
6267 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i64, N);
6268 };
6269
6270 LHS = TryVectorizeOperand(LHS, LHSLane, RHSLane, DL, DAG);
6271 RHS = TryVectorizeOperand(RHS, RHSLane, LHSLane, DL, DAG);
6272
6273 return DAG.getNode(AArch64ISD::PMULL, DL, Op.getValueType(), LHS, RHS);
6274 }
6275 case Intrinsic::aarch64_neon_smax:
6276 return DAG.getNode(ISD::SMAX, DL, Op.getValueType(), Op.getOperand(1),
6277 Op.getOperand(2));
6278 case Intrinsic::aarch64_neon_umax:
6279 return DAG.getNode(ISD::UMAX, DL, Op.getValueType(), Op.getOperand(1),
6280 Op.getOperand(2));
6281 case Intrinsic::aarch64_neon_smin:
6282 return DAG.getNode(ISD::SMIN, DL, Op.getValueType(), Op.getOperand(1),
6283 Op.getOperand(2));
6284 case Intrinsic::aarch64_neon_umin:
6285 return DAG.getNode(ISD::UMIN, DL, Op.getValueType(), Op.getOperand(1),
6286 Op.getOperand(2));
6287 case Intrinsic::aarch64_neon_scalar_sqxtn:
6288 case Intrinsic::aarch64_neon_scalar_sqxtun:
6289 case Intrinsic::aarch64_neon_scalar_uqxtn: {
6290 assert(Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::f32);
6291 if (Op.getValueType() == MVT::i32)
6292 return DAG.getNode(ISD::BITCAST, DL, MVT::i32,
6293 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
6294 Op.getOperand(0),
6295 DAG.getNode(ISD::BITCAST, DL, MVT::f64,
6296 Op.getOperand(1))));
6297 return SDValue();
6298 }
6299 case Intrinsic::aarch64_neon_sqxtn:
6300 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6301 Op.getOperand(1));
6302 case Intrinsic::aarch64_neon_sqxtun:
6303 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6304 Op.getOperand(1));
6305 case Intrinsic::aarch64_neon_uqxtn:
6306 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6307 Op.getOperand(1));
6308 case Intrinsic::aarch64_neon_sqshrn:
6309 if (Op.getValueType().isVector())
6310 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6311 DAG.getNode(AArch64ISD::VASHR, DL,
6312 Op.getOperand(1).getValueType(),
6313 Op.getOperand(1), Op.getOperand(2)));
6314 return SDValue();
6315 case Intrinsic::aarch64_neon_sqshrun:
6316 if (Op.getValueType().isVector())
6317 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6318 DAG.getNode(AArch64ISD::VASHR, DL,
6319 Op.getOperand(1).getValueType(),
6320 Op.getOperand(1), Op.getOperand(2)));
6321 return SDValue();
6322 case Intrinsic::aarch64_neon_uqshrn:
6323 if (Op.getValueType().isVector())
6324 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6325 DAG.getNode(AArch64ISD::VLSHR, DL,
6326 Op.getOperand(1).getValueType(),
6327 Op.getOperand(1), Op.getOperand(2)));
6328 return SDValue();
6329 case Intrinsic::aarch64_neon_sqrshrn:
6330 if (Op.getValueType().isVector())
6331 return DAG.getNode(ISD::TRUNCATE_SSAT_S, DL, Op.getValueType(),
6332 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6333 Op.getOperand(1).getValueType(),
6334 Op.getOperand(1), Op.getOperand(2)));
6335 return SDValue();
6336 case Intrinsic::aarch64_neon_sqrshrun:
6337 if (Op.getValueType().isVector())
6338 return DAG.getNode(ISD::TRUNCATE_SSAT_U, DL, Op.getValueType(),
6339 DAG.getNode(AArch64ISD::SRSHR_I, DL,
6340 Op.getOperand(1).getValueType(),
6341 Op.getOperand(1), Op.getOperand(2)));
6342 return SDValue();
6343 case Intrinsic::aarch64_neon_uqrshrn:
6344 if (Op.getValueType().isVector())
6345 return DAG.getNode(ISD::TRUNCATE_USAT_U, DL, Op.getValueType(),
6346 DAG.getNode(AArch64ISD::URSHR_I, DL,
6347 Op.getOperand(1).getValueType(),
6348 Op.getOperand(1), Op.getOperand(2)));
6349 return SDValue();
6350 case Intrinsic::aarch64_neon_sqadd:
6351 if (Op.getValueType().isVector())
6352 return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6353 Op.getOperand(2));
6354 return SDValue();
6355 case Intrinsic::aarch64_neon_sqsub:
6356 if (Op.getValueType().isVector())
6357 return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6358 Op.getOperand(2));
6359 return SDValue();
6360 case Intrinsic::aarch64_neon_uqadd:
6361 if (Op.getValueType().isVector())
6362 return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
6363 Op.getOperand(2));
6364 return SDValue();
6365 case Intrinsic::aarch64_neon_uqsub:
6366 if (Op.getValueType().isVector())
6367 return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
6368 Op.getOperand(2));
6369 return SDValue();
6370 case Intrinsic::aarch64_sve_whilelt:
6371 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6372 /*IsEqual=*/false);
6373 case Intrinsic::aarch64_sve_whilels:
6374 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/false,
6375 /*IsEqual=*/true);
6376 case Intrinsic::aarch64_sve_whilele:
6377 return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
6378 /*IsEqual=*/true);
6379 case Intrinsic::aarch64_sve_sunpkhi:
6380 return DAG.getNode(AArch64ISD::SUNPKHI, DL, Op.getValueType(),
6381 Op.getOperand(1));
6382 case Intrinsic::aarch64_sve_sunpklo:
6383 return DAG.getNode(AArch64ISD::SUNPKLO, DL, Op.getValueType(),
6384 Op.getOperand(1));
6385 case Intrinsic::aarch64_sve_uunpkhi:
6386 return DAG.getNode(AArch64ISD::UUNPKHI, DL, Op.getValueType(),
6387 Op.getOperand(1));
6388 case Intrinsic::aarch64_sve_uunpklo:
6389 return DAG.getNode(AArch64ISD::UUNPKLO, DL, Op.getValueType(),
6390 Op.getOperand(1));
6391 case Intrinsic::aarch64_sve_clasta_n:
6392 return DAG.getNode(AArch64ISD::CLASTA_N, DL, Op.getValueType(),
6393 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6394 case Intrinsic::aarch64_sve_clastb_n:
6395 return DAG.getNode(AArch64ISD::CLASTB_N, DL, Op.getValueType(),
6396 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6397 case Intrinsic::aarch64_sve_lasta:
6398 return DAG.getNode(AArch64ISD::LASTA, DL, Op.getValueType(),
6399 Op.getOperand(1), Op.getOperand(2));
6400 case Intrinsic::aarch64_sve_lastb:
6401 return DAG.getNode(AArch64ISD::LASTB, DL, Op.getValueType(),
6402 Op.getOperand(1), Op.getOperand(2));
6403 case Intrinsic::aarch64_sve_rev:
6404 return DAG.getNode(ISD::VECTOR_REVERSE, DL, Op.getValueType(),
6405 Op.getOperand(1));
6406 case Intrinsic::aarch64_sve_tbl:
6407 return DAG.getNode(AArch64ISD::TBL, DL, Op.getValueType(), Op.getOperand(1),
6408 Op.getOperand(2));
6409 case Intrinsic::aarch64_sve_trn1:
6410 return DAG.getNode(AArch64ISD::TRN1, DL, Op.getValueType(),
6411 Op.getOperand(1), Op.getOperand(2));
6412 case Intrinsic::aarch64_sve_trn2:
6413 return DAG.getNode(AArch64ISD::TRN2, DL, Op.getValueType(),
6414 Op.getOperand(1), Op.getOperand(2));
6415 case Intrinsic::aarch64_sve_uzp1:
6416 return DAG.getNode(AArch64ISD::UZP1, DL, Op.getValueType(),
6417 Op.getOperand(1), Op.getOperand(2));
6418 case Intrinsic::aarch64_sve_uzp2:
6419 return DAG.getNode(AArch64ISD::UZP2, DL, Op.getValueType(),
6420 Op.getOperand(1), Op.getOperand(2));
6421 case Intrinsic::aarch64_sve_zip1:
6422 return DAG.getNode(AArch64ISD::ZIP1, DL, Op.getValueType(),
6423 Op.getOperand(1), Op.getOperand(2));
6424 case Intrinsic::aarch64_sve_zip2:
6425 return DAG.getNode(AArch64ISD::ZIP2, DL, Op.getValueType(),
6426 Op.getOperand(1), Op.getOperand(2));
6427 case Intrinsic::aarch64_sve_splice:
6428 return DAG.getNode(AArch64ISD::SPLICE, DL, Op.getValueType(),
6429 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6430 case Intrinsic::aarch64_sve_ptrue:
6431 return getPTrue(DAG, DL, Op.getValueType(), Op.getConstantOperandVal(1));
6432 case Intrinsic::aarch64_sve_clz:
6433 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, DL, Op.getValueType(),
6434 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6435 case Intrinsic::aarch64_sme_cntsd: {
6436 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, DL, Op.getValueType(),
6437 DAG.getConstant(1, DL, MVT::i32));
6438 return DAG.getNode(ISD::SRL, DL, Op.getValueType(), Bytes,
6439 DAG.getConstant(3, DL, MVT::i32), SDNodeFlags::Exact);
6440 }
6441 case Intrinsic::aarch64_sve_cnt: {
6442 SDValue Data = Op.getOperand(3);
6443 // CTPOP only supports integer operands.
6444 if (Data.getValueType().isFloatingPoint())
6445 Data = DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Data);
6446 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, DL, Op.getValueType(),
6447 Op.getOperand(2), Data, Op.getOperand(1));
6448 }
6449 case Intrinsic::aarch64_sve_dupq_lane:
6450 return LowerDUPQLane(Op, DAG);
6451 case Intrinsic::aarch64_sve_convert_from_svbool:
6452 if (Op.getValueType() == MVT::aarch64svcount)
6453 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Op.getOperand(1));
6454 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
6455 case Intrinsic::aarch64_sve_convert_to_svbool:
6456 if (Op.getOperand(1).getValueType() == MVT::aarch64svcount)
6457 return DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, Op.getOperand(1));
6458 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
6459 case Intrinsic::aarch64_sve_fneg:
6460 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6461 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6462 case Intrinsic::aarch64_sve_frintp:
6463 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, DL, Op.getValueType(),
6464 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6465 case Intrinsic::aarch64_sve_frintm:
6466 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, DL, Op.getValueType(),
6467 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6468 case Intrinsic::aarch64_sve_frinti:
6469 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, DL,
6470 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6471 Op.getOperand(1));
6472 case Intrinsic::aarch64_sve_frintx:
6473 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, DL, Op.getValueType(),
6474 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6475 case Intrinsic::aarch64_sve_frinta:
6476 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, DL, Op.getValueType(),
6477 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6478 case Intrinsic::aarch64_sve_frintn:
6479 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, DL,
6480 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6481 Op.getOperand(1));
6482 case Intrinsic::aarch64_sve_frintz:
6483 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, DL, Op.getValueType(),
6484 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6485 case Intrinsic::aarch64_sve_ucvtf:
6486 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, DL,
6487 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6488 Op.getOperand(1));
6489 case Intrinsic::aarch64_sve_scvtf:
6490 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, DL,
6491 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6492 Op.getOperand(1));
6493 case Intrinsic::aarch64_sve_fcvtzu:
6494 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, DL, Op.getValueType(),
6495 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6496 case Intrinsic::aarch64_sve_fcvtzs:
6497 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, DL, Op.getValueType(),
6498 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6499 case Intrinsic::aarch64_sve_fsqrt:
6500 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, DL, Op.getValueType(),
6501 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6502 case Intrinsic::aarch64_sve_frecpx:
6503 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, DL, Op.getValueType(),
6504 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6505 case Intrinsic::aarch64_sve_frecpe_x:
6506 return DAG.getNode(AArch64ISD::FRECPE, DL, Op.getValueType(),
6507 Op.getOperand(1));
6508 case Intrinsic::aarch64_sve_frecps_x:
6509 return DAG.getNode(AArch64ISD::FRECPS, DL, Op.getValueType(),
6510 Op.getOperand(1), Op.getOperand(2));
6511 case Intrinsic::aarch64_sve_frsqrte_x:
6512 return DAG.getNode(AArch64ISD::FRSQRTE, DL, Op.getValueType(),
6513 Op.getOperand(1));
6514 case Intrinsic::aarch64_sve_frsqrts_x:
6515 return DAG.getNode(AArch64ISD::FRSQRTS, DL, Op.getValueType(),
6516 Op.getOperand(1), Op.getOperand(2));
6517 case Intrinsic::aarch64_sve_fabs:
6518 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6519 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6520 case Intrinsic::aarch64_sve_abs:
6521 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, DL, Op.getValueType(),
6522 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6523 case Intrinsic::aarch64_sve_neg:
6524 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, DL, Op.getValueType(),
6525 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6526 case Intrinsic::aarch64_sve_insr: {
6527 SDValue Scalar = Op.getOperand(2);
6528 EVT ScalarTy = Scalar.getValueType();
6529 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
6530 Scalar = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Scalar);
6531
6532 return DAG.getNode(AArch64ISD::INSR, DL, Op.getValueType(),
6533 Op.getOperand(1), Scalar);
6534 }
6535 case Intrinsic::aarch64_sve_rbit:
6536 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, DL,
6537 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
6538 Op.getOperand(1));
6539 case Intrinsic::aarch64_sve_revb:
6540 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, DL, Op.getValueType(),
6541 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6542 case Intrinsic::aarch64_sve_revh:
6543 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, DL, Op.getValueType(),
6544 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6545 case Intrinsic::aarch64_sve_revw:
6546 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, DL, Op.getValueType(),
6547 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6548 case Intrinsic::aarch64_sve_revd:
6549 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, DL, Op.getValueType(),
6550 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
6551 case Intrinsic::aarch64_sve_sxtb:
6552 return DAG.getNode(
6553 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6554 Op.getOperand(2), Op.getOperand(3),
6555 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6556 Op.getOperand(1));
6557 case Intrinsic::aarch64_sve_sxth:
6558 return DAG.getNode(
6559 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6560 Op.getOperand(2), Op.getOperand(3),
6561 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6562 Op.getOperand(1));
6563 case Intrinsic::aarch64_sve_sxtw:
6564 return DAG.getNode(
6565 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6566 Op.getOperand(2), Op.getOperand(3),
6567 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6568 Op.getOperand(1));
6569 case Intrinsic::aarch64_sve_uxtb:
6570 return DAG.getNode(
6571 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6572 Op.getOperand(2), Op.getOperand(3),
6573 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
6574 Op.getOperand(1));
6575 case Intrinsic::aarch64_sve_uxth:
6576 return DAG.getNode(
6577 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6578 Op.getOperand(2), Op.getOperand(3),
6579 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
6580 Op.getOperand(1));
6581 case Intrinsic::aarch64_sve_uxtw:
6582 return DAG.getNode(
6583 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, DL, Op.getValueType(),
6584 Op.getOperand(2), Op.getOperand(3),
6585 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
6586 Op.getOperand(1));
6587 case Intrinsic::localaddress: {
6588 const auto &MF = DAG.getMachineFunction();
6589 const auto *RegInfo = Subtarget->getRegisterInfo();
6590 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
6591 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg,
6592 Op.getSimpleValueType());
6593 }
6594
6595 case Intrinsic::eh_recoverfp: {
6596 // FIXME: This needs to be implemented to correctly handle highly aligned
6597 // stack objects. For now we simply return the incoming FP. Refer D53541
6598 // for more details.
6599 SDValue FnOp = Op.getOperand(1);
6600 SDValue IncomingFPOp = Op.getOperand(2);
6601 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
6602 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
6603 if (!Fn)
6604      report_fatal_error(
6605          "llvm.eh.recoverfp must take a function as the first argument");
6606 return IncomingFPOp;
6607 }
6608 case Intrinsic::aarch64_neon_vsri:
6609 case Intrinsic::aarch64_neon_vsli:
6610 case Intrinsic::aarch64_sve_sri:
6611 case Intrinsic::aarch64_sve_sli: {
6612 EVT Ty = Op.getValueType();
6613
6614 if (!Ty.isVector())
6615 report_fatal_error("Unexpected type for aarch64_neon_vsli");
6616
6617 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
6618
6619 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri ||
6620 IntNo == Intrinsic::aarch64_sve_sri;
6621 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
6622 return DAG.getNode(Opcode, DL, Ty, Op.getOperand(1), Op.getOperand(2),
6623 Op.getOperand(3));
6624 }
6625
6626 case Intrinsic::aarch64_neon_srhadd:
6627 case Intrinsic::aarch64_neon_urhadd:
6628 case Intrinsic::aarch64_neon_shadd:
6629 case Intrinsic::aarch64_neon_uhadd: {
6630 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6631 IntNo == Intrinsic::aarch64_neon_shadd);
6632 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
6633 IntNo == Intrinsic::aarch64_neon_urhadd);
6634 unsigned Opcode = IsSignedAdd
6635 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
6636 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
6637 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6638 Op.getOperand(2));
6639 }
6640 case Intrinsic::aarch64_neon_saddlp:
6641 case Intrinsic::aarch64_neon_uaddlp: {
6642 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
6643 ? AArch64ISD::UADDLP
6644 : AArch64ISD::SADDLP;
6645 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1));
6646 }
6647 case Intrinsic::aarch64_neon_sdot:
6648 case Intrinsic::aarch64_neon_udot:
6649 case Intrinsic::aarch64_sve_sdot:
6650 case Intrinsic::aarch64_sve_udot: {
6651 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
6652 IntNo == Intrinsic::aarch64_sve_udot)
6653 ? AArch64ISD::UDOT
6654 : AArch64ISD::SDOT;
6655 return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(1),
6656 Op.getOperand(2), Op.getOperand(3));
6657 }
6658 case Intrinsic::aarch64_neon_usdot:
6659 case Intrinsic::aarch64_sve_usdot: {
6660 return DAG.getNode(AArch64ISD::USDOT, DL, Op.getValueType(),
6661 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6662 }
6663 case Intrinsic::aarch64_neon_saddlv:
6664 case Intrinsic::aarch64_neon_uaddlv: {
6665 EVT OpVT = Op.getOperand(1).getValueType();
6666 EVT ResVT = Op.getValueType();
6667 assert(
6668 ((ResVT == MVT::i32 && (OpVT == MVT::v8i8 || OpVT == MVT::v16i8 ||
6669 OpVT == MVT::v8i16 || OpVT == MVT::v4i16)) ||
6670 (ResVT == MVT::i64 && (OpVT == MVT::v4i32 || OpVT == MVT::v2i32))) &&
6671 "Unexpected aarch64_neon_u/saddlv type");
6672 (void)OpVT;
6673 // In order to avoid insert_subvector, use v4i32 rather than v2i32.
6674 SDValue ADDLV = DAG.getNode(
6675 IntNo == Intrinsic::aarch64_neon_uaddlv ? AArch64ISD::UADDLV
6676 : AArch64ISD::SADDLV,
6677 DL, ResVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64, Op.getOperand(1));
6678 SDValue EXTRACT_VEC_ELT = DAG.getNode(
6679 ISD::EXTRACT_VECTOR_ELT, DL, ResVT == MVT::i32 ? MVT::i32 : MVT::i64,
6680 ADDLV, DAG.getConstant(0, DL, MVT::i64));
6681 return EXTRACT_VEC_ELT;
6682 }
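  // As a rough illustration of the case above (a hedged sketch, not taken
  // from the original source): for
  //   %r = call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %v)
  // an AArch64ISD::UADDLV node is built with a v4i32 result and lane 0 is
  // then extracted as the scalar, which typically selects to something like
  //   uaddlv h0, v0.8b
  //   fmov   w0, s0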
6683 case Intrinsic::experimental_cttz_elts: {
6684 SDValue CttzOp = Op.getOperand(1);
6685 EVT VT = CttzOp.getValueType();
6686 assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
6687
6688 if (VT.isFixedLengthVector()) {
6689 // We can use SVE instructions to lower this intrinsic by first creating
6690 // an SVE predicate register mask from the fixed-width vector.
6691 EVT NewVT = getTypeToTransformTo(*DAG.getContext(), VT);
6692 SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, CttzOp);
6693 CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
6694 }
6695
6696 SDValue NewCttzElts =
6697 DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, CttzOp);
6698 return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
6699 }
6700 case Intrinsic::experimental_vector_match: {
6701 return LowerVectorMatch(Op, DAG);
6702 }
6703 }
6704}
6705
6706bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
6707 if (VT.getVectorElementType() == MVT::i8 ||
6708 VT.getVectorElementType() == MVT::i16) {
6709 EltTy = MVT::i32;
6710 return true;
6711 }
6712 return false;
6713}
6714
6715bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend,
6716 EVT DataVT) const {
6717 const EVT IndexVT = Extend.getOperand(0).getValueType();
6718 // SVE only supports implicit extension of 32-bit indices.
6719 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
6720 return false;
6721
6722 // Indices cannot be smaller than the main data type.
6723 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
6724 return false;
6725
6726 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
6727 // element container type, which would violate the previous clause.
6728 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
6729}
6730
6731bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
6732 EVT ExtVT = ExtVal.getValueType();
6733 if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors())
6734 return false;
6735
6736 // It may be worth creating extending masked loads if there are multiple
6737 // masked loads using the same predicate. That way we'll end up creating
6738 // extending masked loads that may then get split by the legaliser. This
6739 // results in just one set of predicate unpacks at the start, instead of
6740 // multiple sets of vector unpacks after each load.
6741 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal->getOperand(0))) {
6742 if (!isLoadExtLegalOrCustom(ISD::ZEXTLOAD, ExtVT, Ld->getValueType(0))) {
6743 // Disable extending masked loads for fixed-width for now, since the code
6744 // quality doesn't look great.
6745 if (!ExtVT.isScalableVector())
6746 return false;
6747
6748 unsigned NumExtMaskedLoads = 0;
6749 for (auto *U : Ld->getMask()->users())
6750 if (isa<MaskedLoadSDNode>(U))
6751 NumExtMaskedLoads++;
6752
6753 if (NumExtMaskedLoads <= 1)
6754 return false;
6755 }
6756 }
6757
6758 EVT PreExtScalarVT = ExtVal->getOperand(0).getValueType().getScalarType();
6759 return PreExtScalarVT == MVT::i8 || PreExtScalarVT == MVT::i16 ||
6760 PreExtScalarVT == MVT::i32 || PreExtScalarVT == MVT::i64;
6761}
6762
6763unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
6764 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
6765 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
6766 AArch64ISD::GLD1_MERGE_ZERO},
6767 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
6768 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
6769 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
6770 AArch64ISD::GLD1_MERGE_ZERO},
6771 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
6772 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
6773 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
6774 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6775 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
6776 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
6777 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
6778 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
6779 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
6780 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
6781 };
6782 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
6783 return AddrModes.find(Key)->second;
6784}
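// A small usage sketch for the helper above (illustrative only). The key is
// (IsScaled, IsSigned, NeedsExtend), so for example
//   getGatherVecOpcode(/*IsScaled=*/true, /*IsSigned=*/false,
//                      /*NeedsExtend=*/true)
// returns AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO. Note that the signed and
// unsigned keys map to the same opcode unless the index needs extending.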
6785
6786unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
6787 switch (Opcode) {
6788 default:
6789 llvm_unreachable("unimplemented opcode");
6790 return Opcode;
6791 case AArch64ISD::GLD1_MERGE_ZERO:
6792 return AArch64ISD::GLD1S_MERGE_ZERO;
6793 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
6794 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
6795 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
6796 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
6797 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
6798 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
6799 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
6800 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
6801 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
6802 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
6803 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
6804 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
6805 }
6806}
6807
6808SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
6809 SelectionDAG &DAG) const {
6810 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
6811
6812 SDLoc DL(Op);
6813 SDValue Chain = MGT->getChain();
6814 SDValue PassThru = MGT->getPassThru();
6815 SDValue Mask = MGT->getMask();
6816 SDValue BasePtr = MGT->getBasePtr();
6817 SDValue Index = MGT->getIndex();
6818 SDValue Scale = MGT->getScale();
6819 EVT VT = Op.getValueType();
6820 EVT MemVT = MGT->getMemoryVT();
6821 ISD::LoadExtType ExtType = MGT->getExtensionType();
6822 ISD::MemIndexType IndexType = MGT->getIndexType();
6823
6824  // SVE supports zero (and so undef) passthrough values only; everything else
6825  // must be handled manually by an explicit select on the load's output.
6826 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
6827 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
6828 SDValue Load =
6829 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6830 MGT->getMemOperand(), IndexType, ExtType);
6831 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
6832 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
6833 }
6834
6835 bool IsScaled = MGT->isIndexScaled();
6836 bool IsSigned = MGT->isIndexSigned();
6837
6838 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6839  // must be calculated beforehand.
6840 uint64_t ScaleVal = Scale->getAsZExtVal();
6841 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6842 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6843 EVT IndexVT = Index.getValueType();
6844 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6845 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6846 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6847
6848 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6849 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
6850 MGT->getMemOperand(), IndexType, ExtType);
6851 }
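  // A worked example of the re-scaling above (illustrative, not from the
  // original source): if MemVT has i32 elements (scalar store size 4) but the
  // gather was created with Scale == 8, the index vector is shifted left by
  // Log2_32(8) == 3 and Scale is reset to 1, so the rewritten gather computes
  // BasePtr + (Index << 3) explicitly instead of relying on hardware scaling.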
6852
6853 // Lower fixed length gather to a scalable equivalent.
6854 if (VT.isFixedLengthVector()) {
6855 assert(Subtarget->useSVEForFixedLengthVectors() &&
6856 "Cannot lower when not using SVE for fixed vectors!");
6857
6858 // NOTE: Handle floating-point as if integer then bitcast the result.
6859 EVT DataVT = VT.changeVectorElementTypeToInteger();
6860 MemVT = MemVT.changeVectorElementTypeToInteger();
6861
6862 // Find the smallest integer fixed length vector we can use for the gather.
6863 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6864 if (DataVT.getVectorElementType() == MVT::i64 ||
6865 Index.getValueType().getVectorElementType() == MVT::i64 ||
6866 Mask.getValueType().getVectorElementType() == MVT::i64)
6867 PromotedVT = VT.changeVectorElementType(MVT::i64);
6868
6869 // Promote vector operands except for passthrough, which we know is either
6870 // undef or zero, and thus best constructed directly.
6871 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6872 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6873 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6874
6875 // A promoted result type forces the need for an extending load.
6876 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
6877 ExtType = ISD::EXTLOAD;
6878
6879 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6880
6881 // Convert fixed length vector operands to scalable.
6882 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6883 Index = convertToScalableVector(DAG, ContainerVT, Index);
6884     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6885     PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
6886 : DAG.getConstant(0, DL, ContainerVT);
6887
6888 // Emit equivalent scalable vector gather.
6889 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
6890 SDValue Load =
6891 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
6892 Ops, MGT->getMemOperand(), IndexType, ExtType);
6893
6894 // Extract fixed length data then convert to the required result type.
6895 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
6896 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
6897 if (VT.isFloatingPoint())
6898 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
6899
6900 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
6901 }
6902
6903 // Everything else is legal.
6904 return Op;
6905}
6906
6907SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
6908 SelectionDAG &DAG) const {
6909 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
6910
6911 SDLoc DL(Op);
6912 SDValue Chain = MSC->getChain();
6913 SDValue StoreVal = MSC->getValue();
6914 SDValue Mask = MSC->getMask();
6915 SDValue BasePtr = MSC->getBasePtr();
6916 SDValue Index = MSC->getIndex();
6917 SDValue Scale = MSC->getScale();
6918 EVT VT = StoreVal.getValueType();
6919 EVT MemVT = MSC->getMemoryVT();
6920 ISD::MemIndexType IndexType = MSC->getIndexType();
6921 bool Truncating = MSC->isTruncatingStore();
6922
6923 bool IsScaled = MSC->isIndexScaled();
6924 bool IsSigned = MSC->isIndexSigned();
6925
6926 // SVE supports an index scaled by sizeof(MemVT.elt) only, everything else
6927  // must be calculated beforehand.
6928 uint64_t ScaleVal = Scale->getAsZExtVal();
6929 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
6930 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
6931 EVT IndexVT = Index.getValueType();
6932 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
6933 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
6934 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
6935
6936 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6937 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6938 MSC->getMemOperand(), IndexType, Truncating);
6939 }
6940
6941 // Lower fixed length scatter to a scalable equivalent.
6942 if (VT.isFixedLengthVector()) {
6943 assert(Subtarget->useSVEForFixedLengthVectors() &&
6944 "Cannot lower when not using SVE for fixed vectors!");
6945
6946 // Once bitcast we treat floating-point scatters as if integer.
6947 if (VT.isFloatingPoint()) {
6948       VT = VT.changeVectorElementTypeToInteger();
6949       MemVT = MemVT.changeVectorElementTypeToInteger();
6950 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
6951 }
6952
6953 // Find the smallest integer fixed length vector we can use for the scatter.
6954 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
6955 if (VT.getVectorElementType() == MVT::i64 ||
6956 Index.getValueType().getVectorElementType() == MVT::i64 ||
6957 Mask.getValueType().getVectorElementType() == MVT::i64)
6958 PromotedVT = VT.changeVectorElementType(MVT::i64);
6959
6960 // Promote vector operands.
6961 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
6962 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
6963 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
6964 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
6965
6966 // A promoted value type forces the need for a truncating store.
6967 if (PromotedVT != VT)
6968 Truncating = true;
6969
6970 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
6971
6972 // Convert fixed length vector operands to scalable.
6973 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
6974 Index = convertToScalableVector(DAG, ContainerVT, Index);
6975     Mask = convertFixedMaskToScalableVector(Mask, DAG);
6976     StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
6977
6978 // Emit equivalent scalable vector scatter.
6979 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
6980 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
6981 MSC->getMemOperand(), IndexType, Truncating);
6982 }
6983
6984 // Everything else is legal.
6985 return Op;
6986}
6987
6988SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
6989 SDLoc DL(Op);
6990 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
6991 assert(LoadNode && "Expected custom lowering of a masked load node");
6992 EVT VT = Op->getValueType(0);
6993
6994 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
6995 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
6996
6997 SDValue PassThru = LoadNode->getPassThru();
6998 SDValue Mask = LoadNode->getMask();
6999
7000 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
7001 return Op;
7002
7003   SDValue Load = DAG.getMaskedLoad(
7004       VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
7005 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
7006 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
7007 LoadNode->getExtensionType());
7008
7009 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
7010
7011 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
7012}
7013
7014// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
7015 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
7016                                         EVT VT, EVT MemVT,
7017 SelectionDAG &DAG) {
7018 assert(VT.isVector() && "VT should be a vector type");
7019 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
7020
7021 SDValue Value = ST->getValue();
7022
7023   // It first extends the promoted v4i16 to v8i16, truncates to v8i8, and
7024   // extracts the word lane which represents the v4i8 subvector. It optimizes
7025   // the store to:
7026 //
7027 // xtn v0.8b, v0.8h
7028 // str s0, [x0]
7029
7030 SDValue Undef = DAG.getUNDEF(MVT::i16);
7031 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
7032 {Undef, Undef, Undef, Undef});
7033
7034 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
7035 Value, UndefVec);
7036 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
7037
7038 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
7039 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
7040 Trunc, DAG.getConstant(0, DL, MVT::i64));
7041
7042 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
7043 ST->getBasePtr(), ST->getMemOperand());
7044}
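// A hedged walk-through of the helper above: for a truncating store of
// <4 x i16> to <4 x i8> the DAG becomes, roughly,
//   t0 = concat_vectors v4i16:%val, v4i16:undef   ; v8i16
//   t1 = truncate t0                              ; v8i8
//   t2 = bitcast t1 to v2i32
//   t3 = extract_vector_elt t2, 0                 ; i32
//   store i32 t3, ptr
// which corresponds to the xtn + str s0 sequence shown in the comment above.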
7045
7046 static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
7047   SDLoc DL(Op);
7048 SDValue Src = Op.getOperand(0);
7049 MVT DestVT = Op.getSimpleValueType();
7050 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7051   AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op);
7052
7053 unsigned SrcAS = N->getSrcAddressSpace();
7054 unsigned DestAS = N->getDestAddressSpace();
7055 assert(SrcAS != DestAS &&
7056 "addrspacecast must be between different address spaces");
7057 assert(TLI.getTargetMachine().getPointerSize(SrcAS) !=
7058 TLI.getTargetMachine().getPointerSize(DestAS) &&
7059 "addrspacecast must be between different ptr sizes");
7060 (void)TLI;
7061
7062 if (SrcAS == ARM64AS::PTR32_SPTR) {
7063 return DAG.getNode(ISD::SIGN_EXTEND, DL, DestVT, Src,
7064 DAG.getTargetConstant(0, DL, DestVT));
7065 } else if (SrcAS == ARM64AS::PTR32_UPTR) {
7066 return DAG.getNode(ISD::ZERO_EXTEND, DL, DestVT, Src,
7067 DAG.getTargetConstant(0, DL, DestVT));
7068 } else if ((DestAS == ARM64AS::PTR32_SPTR) ||
7069 (DestAS == ARM64AS::PTR32_UPTR)) {
7070 SDValue Ext = DAG.getAnyExtOrTrunc(Src, DL, DestVT);
7071 SDValue Trunc = DAG.getZeroExtendInReg(Ext, DL, DestVT);
7072 return Trunc;
7073 } else {
7074 return Src;
7075 }
7076}
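// Illustrative behaviour of the lowering above (a hedged summary, assuming
// PTR32_SPTR/PTR32_UPTR model 32-bit signed/unsigned pointers):
//   - 32-bit signed pointer   -> 64-bit pointer: sign-extend the source.
//   - 32-bit unsigned pointer -> 64-bit pointer: zero-extend the source.
//   - 64-bit pointer -> 32-bit pointer: getAnyExtOrTrunc followed by an
//     in-register zero-extension of the result.
//   - Any other pair of address spaces: the source is returned unchanged.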
7077
7078 // Custom lowering for any store, vector or scalar, truncating or not.
7079 // Currently we only custom lower truncating stores from vector v4i16 to
7080 // v4i8 and volatile stores of i128.
7081SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
7082 SelectionDAG &DAG) const {
7083 SDLoc Dl(Op);
7084 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
7085 assert (StoreNode && "Can only custom lower store nodes");
7086
7087 SDValue Value = StoreNode->getValue();
7088
7089 EVT VT = Value.getValueType();
7090 EVT MemVT = StoreNode->getMemoryVT();
7091
7092 if (VT.isVector()) {
7093     if (useSVEForFixedLengthVectorVT(
7094             VT,
7095 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7096 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
7097
7098 unsigned AS = StoreNode->getAddressSpace();
7099 Align Alignment = StoreNode->getAlign();
7100 if (Alignment < MemVT.getStoreSize() &&
7101 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
7102 StoreNode->getMemOperand()->getFlags(),
7103 nullptr)) {
7104 return scalarizeVectorStore(StoreNode, DAG);
7105 }
7106
7107 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
7108 MemVT == MVT::v4i8) {
7109 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
7110 }
7111 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
7112 // the custom lowering, as there are no un-paired non-temporal stores and
7113 // legalization will break up 256 bit inputs.
7114 ElementCount EC = MemVT.getVectorElementCount();
7115 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
7116 EC.isKnownEven() && DAG.getDataLayout().isLittleEndian() &&
7117 (MemVT.getScalarSizeInBits() == 8u ||
7118 MemVT.getScalarSizeInBits() == 16u ||
7119 MemVT.getScalarSizeInBits() == 32u ||
7120 MemVT.getScalarSizeInBits() == 64u)) {
7121 SDValue Lo =
7122           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7123                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7124                       StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
7125 SDValue Hi =
7126           DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
7127                       MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
7128                       StoreNode->getValue(),
7129 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
7130       SDValue Result = DAG.getMemIntrinsicNode(
7131           AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
7132 {StoreNode->getChain(), DAG.getBitcast(MVT::v2i64, Lo),
7133 DAG.getBitcast(MVT::v2i64, Hi), StoreNode->getBasePtr()},
7134 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7135 return Result;
7136 }
7137 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
7138 return LowerStore128(Op, DAG);
7139 } else if (MemVT == MVT::i64x8) {
7140 SDValue Value = StoreNode->getValue();
7141 assert(Value->getValueType(0) == MVT::i64x8);
7142 SDValue Chain = StoreNode->getChain();
7143 SDValue Base = StoreNode->getBasePtr();
7144 EVT PtrVT = Base.getValueType();
7145 for (unsigned i = 0; i < 8; i++) {
7146 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
7147 Value, DAG.getConstant(i, Dl, MVT::i32));
7148 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
7149 DAG.getConstant(i * 8, Dl, PtrVT));
7150 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
7151 StoreNode->getBaseAlign());
7152 }
7153 return Chain;
7154 }
7155
7156 return SDValue();
7157}
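// An illustrative example of the 256-bit non-temporal path above (hedged):
// a non-temporal store of v8i32 is split into two 128-bit halves with
// EXTRACT_SUBVECTOR (elements [0,4) and [4,8)), both halves are bitcast to
// v2i64, and a single AArch64ISD::STNP node is emitted, which typically
// becomes one "stnp q0, q1, [x0]" instruction.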
7158
7159/// Lower atomic or volatile 128-bit stores to a single STP instruction.
7160SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
7161 SelectionDAG &DAG) const {
7162 MemSDNode *StoreNode = cast<MemSDNode>(Op);
7163 assert(StoreNode->getMemoryVT() == MVT::i128);
7164 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
7165
7166 bool IsStoreRelease =
7167       StoreNode->getMergedOrdering() == AtomicOrdering::Release;
7168   if (StoreNode->isAtomic())
7169 assert((Subtarget->hasFeature(AArch64::FeatureLSE2) &&
7170 Subtarget->hasFeature(AArch64::FeatureRCPC3) && IsStoreRelease) ||
7171            StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
7172            StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
7173
7174 SDValue Value = (StoreNode->getOpcode() == ISD::STORE ||
7175 StoreNode->getOpcode() == ISD::ATOMIC_STORE)
7176 ? StoreNode->getOperand(1)
7177 : StoreNode->getOperand(2);
7178 SDLoc DL(Op);
7179 auto StoreValue = DAG.SplitScalar(Value, DL, MVT::i64, MVT::i64);
7180 unsigned Opcode = IsStoreRelease ? AArch64ISD::STILP : AArch64ISD::STP;
7181 if (DAG.getDataLayout().isBigEndian())
7182 std::swap(StoreValue.first, StoreValue.second);
7183   SDValue Result = DAG.getMemIntrinsicNode(
7184       Opcode, DL, DAG.getVTList(MVT::Other),
7185 {StoreNode->getChain(), StoreValue.first, StoreValue.second,
7186 StoreNode->getBasePtr()},
7187 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
7188 return Result;
7189}
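// Illustrative codegen for the helper above (a hedged sketch): a volatile or
// monotonic atomic i128 store is split into two i64 halves and emitted as
// "stp xLo, xHi, [addr]", while a release atomic store on an RCPC3 target
// selects the STILP form instead, matching the IsStoreRelease check above.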
7190
7191SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
7192 SelectionDAG &DAG) const {
7193 SDLoc DL(Op);
7194 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
7195 assert(LoadNode && "Expected custom lowering of a load node");
7196
7197 if (LoadNode->getMemoryVT() == MVT::i64x8) {
7198     SmallVector<SDValue, 8> Ops;
7199     SDValue Base = LoadNode->getBasePtr();
7200 SDValue Chain = LoadNode->getChain();
7201 EVT PtrVT = Base.getValueType();
7202 for (unsigned i = 0; i < 8; i++) {
7203 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
7204 DAG.getConstant(i * 8, DL, PtrVT));
7205 SDValue Part =
7206 DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(),
7207 LoadNode->getBaseAlign());
7208 Ops.push_back(Part);
7209 Chain = SDValue(Part.getNode(), 1);
7210 }
7211 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
7212 return DAG.getMergeValues({Loaded, Chain}, DL);
7213 }
7214
7215 // Custom lowering for extending v4i8 vector loads.
7216 EVT VT = Op->getValueType(0);
7217 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
7218
7219 if (LoadNode->getMemoryVT() != MVT::v4i8)
7220 return SDValue();
7221
7222 // Avoid generating unaligned loads.
7223 if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4))
7224 return SDValue();
7225
7226 unsigned ExtType;
7227 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
7228 ExtType = ISD::SIGN_EXTEND;
7229 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
7230 LoadNode->getExtensionType() == ISD::EXTLOAD)
7231 ExtType = ISD::ZERO_EXTEND;
7232 else
7233 return SDValue();
7234
7235 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
7236 LoadNode->getBasePtr(), MachinePointerInfo());
7237 SDValue Chain = Load.getValue(1);
7238 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
7239 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
7240 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
7241 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
7242 DAG.getConstant(0, DL, MVT::i64));
7243 if (VT == MVT::v4i32)
7244 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
7245 return DAG.getMergeValues({Ext, Chain}, DL);
7246}
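// A hedged sketch of the v4i8 extending-load path above: the four i8 lanes
// are loaded with a single 32-bit scalar load and then widened in vector
// registers, roughly
//   ldr   s0, [x0]            ; all four bytes loaded into lane 0
//   ushll v0.8h, v0.8b, #0    ; zero-extend (sshll for a sign-extending load)
// with only the low four i16 lanes used, plus a further extend to v4i32 when
// the result type requires it.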
7247
7248SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op,
7249 SelectionDAG &DAG) const {
7250 SDLoc DL(Op);
7251 SDValue Vec = Op.getOperand(0);
7252 SDValue Mask = Op.getOperand(1);
7253 SDValue Passthru = Op.getOperand(2);
7254 EVT VecVT = Vec.getValueType();
7255 EVT MaskVT = Mask.getValueType();
7256 EVT ElmtVT = VecVT.getVectorElementType();
7257 const bool IsFixedLength = VecVT.isFixedLengthVector();
7258 const bool HasPassthru = !Passthru.isUndef();
7259 unsigned MinElmts = VecVT.getVectorElementCount().getKnownMinValue();
7260 EVT FixedVecVT = MVT::getVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7261
7262 assert(VecVT.isVector() && "Input to VECTOR_COMPRESS must be vector.");
7263
7264 if (!Subtarget->isSVEAvailable())
7265 return SDValue();
7266
7267 if (IsFixedLength && VecVT.getSizeInBits().getFixedValue() > 128)
7268 return SDValue();
7269
7270 // Only <vscale x {4|2} x {i32|i64}> supported for compact.
7271 if (MinElmts != 2 && MinElmts != 4)
7272 return SDValue();
7273
7274 // We can use the SVE register containing the NEON vector in its lowest bits.
7275 if (IsFixedLength) {
7276 EVT ScalableVecVT =
7277 MVT::getScalableVectorVT(ElmtVT.getSimpleVT(), MinElmts);
7278 EVT ScalableMaskVT = MVT::getScalableVectorVT(
7279 MaskVT.getVectorElementType().getSimpleVT(), MinElmts);
7280
7281 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7282 DAG.getUNDEF(ScalableVecVT), Vec,
7283 DAG.getConstant(0, DL, MVT::i64));
7284 Mask = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableMaskVT,
7285 DAG.getUNDEF(ScalableMaskVT), Mask,
7286 DAG.getConstant(0, DL, MVT::i64));
7287     Mask = DAG.getNode(ISD::TRUNCATE, DL,
7288                        ScalableMaskVT.changeVectorElementType(MVT::i1), Mask);
7289 Passthru = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ScalableVecVT,
7290 DAG.getUNDEF(ScalableVecVT), Passthru,
7291 DAG.getConstant(0, DL, MVT::i64));
7292
7293 VecVT = Vec.getValueType();
7294 MaskVT = Mask.getValueType();
7295 }
7296
7297 // Get legal type for compact instruction
7298 EVT ContainerVT = getSVEContainerType(VecVT);
7299 EVT CastVT = VecVT.changeVectorElementTypeToInteger();
7300
7301 // Convert to i32 or i64 for smaller types, as these are the only supported
7302 // sizes for compact.
7303 if (ContainerVT != VecVT) {
7304 Vec = DAG.getBitcast(CastVT, Vec);
7305 Vec = DAG.getNode(ISD::ANY_EXTEND, DL, ContainerVT, Vec);
7306 }
7307
7308 SDValue Compressed = DAG.getNode(
7309       ISD::INTRINSIC_WO_CHAIN, DL, ContainerVT,
7310       DAG.getTargetConstant(Intrinsic::aarch64_sve_compact, DL, MVT::i64), Mask,
7311 Vec);
7312
7313 // compact fills with 0s, so if our passthru is all 0s, do nothing here.
7314 if (HasPassthru && !ISD::isConstantSplatVectorAllZeros(Passthru.getNode())) {
7315 SDValue Offset = DAG.getNode(
7316 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64,
7317 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64), Mask,
7318 Mask);
7319
7320 SDValue IndexMask = DAG.getNode(
7321 ISD::INTRINSIC_WO_CHAIN, DL, MaskVT,
7322 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64),
7323 DAG.getConstant(0, DL, MVT::i64), Offset);
7324
7325 Compressed =
7326 DAG.getNode(ISD::VSELECT, DL, VecVT, IndexMask, Compressed, Passthru);
7327 }
7328
7329 // Extracting from a legal SVE type before truncating produces better code.
7330 if (IsFixedLength) {
7331 Compressed = DAG.getNode(
7332         ISD::EXTRACT_SUBVECTOR, DL,
7333         FixedVecVT.changeVectorElementType(ContainerVT.getVectorElementType()),
7334 Compressed, DAG.getConstant(0, DL, MVT::i64));
7335 CastVT = FixedVecVT.changeVectorElementTypeToInteger();
7336 VecVT = FixedVecVT;
7337 }
7338
7339 // If we changed the element type before, we need to convert it back.
7340 if (ContainerVT != VecVT) {
7341 Compressed = DAG.getNode(ISD::TRUNCATE, DL, CastVT, Compressed);
7342 Compressed = DAG.getBitcast(VecVT, Compressed);
7343 }
7344
7345 return Compressed;
7346}
7347
7348// Generate SUBS and CSEL for integer abs.
7349SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
7350 MVT VT = Op.getSimpleValueType();
7351
7352 if (VT.isVector())
7353 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
7354
7355 SDLoc DL(Op);
7356 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, VT);
7357
7358 // Generate SUBS & CSEL.
7359 SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, FlagsVT),
7360 Op.getOperand(0), DAG.getConstant(0, DL, VT));
7361 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
7362 getCondCode(DAG, AArch64CC::PL), Cmp.getValue(1));
7363}
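// An illustrative example for the scalar path above (hedged): for an i32 abs
// the SUBS compares the input with zero and the CSEL keeps the original value
// on PL (non-negative) and the negated value otherwise, which typically
// prints as
//   cmp  w0, #0
//   cneg w0, w0, mi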
7364
7366 SDValue Chain = Op.getOperand(0);
7367 SDValue Cond = Op.getOperand(1);
7368 SDValue Dest = Op.getOperand(2);
7369
7370   AArch64CC::CondCode CC;
7371   if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
7372 SDLoc DL(Op);
7373 SDValue CCVal = getCondCode(DAG, CC);
7374 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
7375 Cmp);
7376 }
7377
7378 return SDValue();
7379}
7380
7381 // Treat FSHR with constant shifts as a legal operation; otherwise it is
7382 // expanded. FSHL is converted to FSHR before deciding what to do with it.
7383 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
7384   SDValue Shifts = Op.getOperand(2);
7385 // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
7386 // If opcode is FSHL, convert it to FSHR
7387 if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
7388 SDLoc DL(Op);
7389 MVT VT = Op.getSimpleValueType();
7390 unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
7391
7392 if (Op.getOpcode() == ISD::FSHL) {
7393 if (NewShiftNo == 0)
7394 return Op.getOperand(0);
7395
7396 NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
7397 return DAG.getNode(
7398 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7399 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7400 }
7401
7402 if (Op.getOpcode() == ISD::FSHR) {
7403 if (NewShiftNo == 0)
7404 return Op.getOperand(1);
7405
7406 if (ShiftNo->getZExtValue() == NewShiftNo)
7407 return Op;
7408
7409 // Rewrite using the normalised shift amount.
7410 return DAG.getNode(
7411 ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
7412 DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
7413 }
7414 }
7415
7416 return SDValue();
7417}
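// A worked example of the normalisation above (illustrative only): a 32-bit
// fshl with a constant shift of 40 first reduces the amount modulo 32 to 8
// and is then rewritten as an fshr by 32 - 8 == 24 (the two forms compute
// the same value), so only FSHR nodes with in-range constant shifts reach
// instruction selection.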
7418
7419 static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) {
7420   SDValue X = Op.getOperand(0);
7421 EVT XScalarTy = X.getValueType();
7422 SDValue Exp = Op.getOperand(1);
7423
7424 SDLoc DL(Op);
7425 EVT XVT, ExpVT;
7426 switch (Op.getSimpleValueType().SimpleTy) {
7427 default:
7428 return SDValue();
7429 case MVT::bf16:
7430 case MVT::f16:
7431 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X);
7432 [[fallthrough]];
7433 case MVT::f32:
7434 XVT = MVT::nxv4f32;
7435 ExpVT = MVT::nxv4i32;
7436 break;
7437 case MVT::f64:
7438 XVT = MVT::nxv2f64;
7439 ExpVT = MVT::nxv2i64;
7440 Exp = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Exp);
7441 break;
7442 }
7443
7444 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
7445 SDValue VX =
7446 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, XVT, DAG.getUNDEF(XVT), X, Zero);
7447 SDValue VExp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ExpVT,
7448 DAG.getUNDEF(ExpVT), Exp, Zero);
7449 SDValue VPg = getPTrue(DAG, DL, XVT.changeVectorElementType(MVT::i1),
7450 AArch64SVEPredPattern::all);
7451 SDValue FScale = DAG.getNode(
7452       ISD::INTRINSIC_WO_CHAIN, DL, XVT,
7453       DAG.getTargetConstant(Intrinsic::aarch64_sve_fscale, DL, MVT::i64), VPg,
7454 VX, VExp);
7455 SDValue Final =
7456 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, X.getValueType(), FScale, Zero);
7457 if (X.getValueType() != XScalarTy)
7458 Final = DAG.getNode(ISD::FP_ROUND, DL, XScalarTy, Final,
7459 DAG.getIntPtrConstant(1, SDLoc(Op), /*isTarget=*/true));
7460 return Final;
7461}
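// A hedged outline of the FLDEXP lowering above: the scalar x and the
// exponent are inserted into lane 0 of SVE vectors (nxv4f32 or nxv2f64), an
// all-true predicate is built with ptrue, and aarch64_sve_fscale computes
// x * 2^n per lane; lane 0 is then extracted as the scalar result, with
// f16/bf16 inputs taking an f32 round trip.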
7462
7463SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
7464 SelectionDAG &DAG) const {
7465 return Op.getOperand(0);
7466}
7467
7468SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
7469 SelectionDAG &DAG) const {
7470 SDValue Chain = Op.getOperand(0);
7471 SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes
7472 SDValue FPtr = Op.getOperand(2); // nested function
7473 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
7474
7475 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
7476
7477 // ldr NestReg, .+16
7478 // ldr x17, .+20
7479 // br x17
7480 // .word 0
7481 // .nest: .qword nest
7482 // .fptr: .qword fptr
7483 SDValue OutChains[5];
7484
7485 const Function *Func =
7486 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
7487 CallingConv::ID CC = Func->getCallingConv();
7488 unsigned NestReg;
7489
7490 switch (CC) {
7491 default:
7492 NestReg = 0x0f; // X15
7493 break;
7495 // Must be kept in sync with AArch64CallingConv.td
7496 NestReg = 0x04; // X4
7497 break;
7498 }
7499
7500 const char FptrReg = 0x11; // X17
7501
7502 SDValue Addr = Trmp;
7503
7504 SDLoc DL(Op);
7505 OutChains[0] = DAG.getStore(
7506 Chain, DL, DAG.getConstant(0x58000080u | NestReg, DL, MVT::i32), Addr,
7507 MachinePointerInfo(TrmpAddr));
7508
7509 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7510 DAG.getConstant(4, DL, MVT::i64));
7511 OutChains[1] = DAG.getStore(
7512 Chain, DL, DAG.getConstant(0x580000b0u | FptrReg, DL, MVT::i32), Addr,
7513 MachinePointerInfo(TrmpAddr, 4));
7514
7515 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7516 DAG.getConstant(8, DL, MVT::i64));
7517 OutChains[2] =
7518 DAG.getStore(Chain, DL, DAG.getConstant(0xd61f0220u, DL, MVT::i32), Addr,
7519 MachinePointerInfo(TrmpAddr, 8));
7520
7521 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7522 DAG.getConstant(16, DL, MVT::i64));
7523 OutChains[3] =
7524 DAG.getStore(Chain, DL, Nest, Addr, MachinePointerInfo(TrmpAddr, 16));
7525
7526 Addr = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7527 DAG.getConstant(24, DL, MVT::i64));
7528 OutChains[4] =
7529 DAG.getStore(Chain, DL, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24));
7530
7531 SDValue StoreToken = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
7532
7533 SDValue EndOfTrmp = DAG.getNode(ISD::ADD, DL, MVT::i64, Trmp,
7534 DAG.getConstant(12, DL, MVT::i64));
7535
7536 // Call clear cache on the trampoline instructions.
7537 return DAG.getNode(ISD::CLEAR_CACHE, DL, MVT::Other, StoreToken, Trmp,
7538 EndOfTrmp);
7539}
7540
7541 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
7542                                               SelectionDAG &DAG) const {
7543 LLVM_DEBUG(dbgs() << "Custom lowering: ");
7544 LLVM_DEBUG(Op.dump());
7545
7546 switch (Op.getOpcode()) {
7547 default:
7548 llvm_unreachable("unimplemented operand");
7549 return SDValue();
7550   case ISD::LOOP_DEPENDENCE_WAR_MASK:
7551   case ISD::LOOP_DEPENDENCE_RAW_MASK:
7552     return LowerLOOP_DEPENDENCE_MASK(Op, DAG);
7553 case ISD::BITCAST:
7554 return LowerBITCAST(Op, DAG);
7555 case ISD::GlobalAddress:
7556 return LowerGlobalAddress(Op, DAG);
7557   case ISD::GlobalTLSAddress:
7558     return LowerGlobalTLSAddress(Op, DAG);
7559   case ISD::PtrAuthGlobalAddress:
7560     return LowerPtrAuthGlobalAddress(Op, DAG);
7561 case ISD::ADJUST_TRAMPOLINE:
7562 return LowerADJUST_TRAMPOLINE(Op, DAG);
7563 case ISD::INIT_TRAMPOLINE:
7564 return LowerINIT_TRAMPOLINE(Op, DAG);
7565 case ISD::SETCC:
7566 case ISD::STRICT_FSETCC:
7567   case ISD::STRICT_FSETCCS:
7568     return LowerSETCC(Op, DAG);
7569 case ISD::SETCCCARRY:
7570 return LowerSETCCCARRY(Op, DAG);
7571 case ISD::BRCOND:
7572 return LowerBRCOND(Op, DAG);
7573 case ISD::BR_CC:
7574 return LowerBR_CC(Op, DAG);
7575 case ISD::SELECT:
7576 return LowerSELECT(Op, DAG);
7577 case ISD::SELECT_CC:
7578 return LowerSELECT_CC(Op, DAG);
7579 case ISD::JumpTable:
7580 return LowerJumpTable(Op, DAG);
7581 case ISD::BR_JT:
7582 return LowerBR_JT(Op, DAG);
7583 case ISD::BRIND:
7584 return LowerBRIND(Op, DAG);
7585 case ISD::ConstantPool:
7586 return LowerConstantPool(Op, DAG);
7587 case ISD::BlockAddress:
7588 return LowerBlockAddress(Op, DAG);
7589 case ISD::VASTART:
7590 return LowerVASTART(Op, DAG);
7591 case ISD::VACOPY:
7592 return LowerVACOPY(Op, DAG);
7593 case ISD::VAARG:
7594 return LowerVAARG(Op, DAG);
7595 case ISD::UADDO_CARRY:
7596 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
7597 case ISD::USUBO_CARRY:
7598 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
7599 case ISD::SADDO_CARRY:
7600 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
7601 case ISD::SSUBO_CARRY:
7602 return lowerADDSUBO_CARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
7603 case ISD::SADDO:
7604 case ISD::UADDO:
7605 case ISD::SSUBO:
7606 case ISD::USUBO:
7607 case ISD::SMULO:
7608 case ISD::UMULO:
7609 return LowerXALUO(Op, DAG);
7610 case ISD::FADD:
7611 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
7612 case ISD::FSUB:
7613 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
7614 case ISD::FMUL:
7615 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
7616 case ISD::FMA:
7617 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
7618 case ISD::FDIV:
7619 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
7620 case ISD::FNEG:
7621 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
7622 case ISD::FCEIL:
7623 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
7624 case ISD::FFLOOR:
7625 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
7626 case ISD::FNEARBYINT:
7627 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
7628 case ISD::FRINT:
7629 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
7630 case ISD::FROUND:
7631 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
7632 case ISD::FROUNDEVEN:
7633 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
7634 case ISD::FTRUNC:
7635 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
7636 case ISD::FSQRT:
7637 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
7638 case ISD::FABS:
7639 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
7640 case ISD::FP_ROUND:
7641   case ISD::STRICT_FP_ROUND:
7642     return LowerFP_ROUND(Op, DAG);
7643 case ISD::FP_EXTEND:
7644   case ISD::STRICT_FP_EXTEND:
7645     return LowerFP_EXTEND(Op, DAG);
7646 case ISD::FRAMEADDR:
7647 return LowerFRAMEADDR(Op, DAG);
7648 case ISD::SPONENTRY:
7649 return LowerSPONENTRY(Op, DAG);
7650 case ISD::RETURNADDR:
7651 return LowerRETURNADDR(Op, DAG);
7652   case ISD::ADDROFRETURNADDR:
7653     return LowerADDROFRETURNADDR(Op, DAG);
7654   case ISD::CONCAT_VECTORS:
7655     return LowerCONCAT_VECTORS(Op, DAG);
7656   case ISD::INSERT_VECTOR_ELT:
7657     return LowerINSERT_VECTOR_ELT(Op, DAG);
7658   case ISD::EXTRACT_VECTOR_ELT:
7659     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
7660 case ISD::BUILD_VECTOR:
7661 return LowerBUILD_VECTOR(Op, DAG);
7662   case ISD::ZERO_EXTEND_VECTOR_INREG:
7663     return LowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
7664   case ISD::VECTOR_SHUFFLE:
7665     return LowerVECTOR_SHUFFLE(Op, DAG);
7666 case ISD::SPLAT_VECTOR:
7667 return LowerSPLAT_VECTOR(Op, DAG);
7668   case ISD::EXTRACT_SUBVECTOR:
7669     return LowerEXTRACT_SUBVECTOR(Op, DAG);
7670   case ISD::INSERT_SUBVECTOR:
7671     return LowerINSERT_SUBVECTOR(Op, DAG);
7672 case ISD::SDIV:
7673 case ISD::UDIV:
7674 return LowerDIV(Op, DAG);
7675 case ISD::SMIN:
7676 case ISD::UMIN:
7677 case ISD::SMAX:
7678 case ISD::UMAX:
7679 return LowerMinMax(Op, DAG);
7680 case ISD::SRA:
7681 case ISD::SRL:
7682 case ISD::SHL:
7683 return LowerVectorSRA_SRL_SHL(Op, DAG);
7684 case ISD::SHL_PARTS:
7685 case ISD::SRL_PARTS:
7686 case ISD::SRA_PARTS:
7687 return LowerShiftParts(Op, DAG);
7688 case ISD::CTPOP:
7689 case ISD::PARITY:
7690 return LowerCTPOP_PARITY(Op, DAG);
7691 case ISD::FCOPYSIGN:
7692 return LowerFCOPYSIGN(Op, DAG);
7693 case ISD::OR:
7694 return LowerVectorOR(Op, DAG);
7695 case ISD::XOR:
7696 return LowerXOR(Op, DAG);
7697 case ISD::PREFETCH:
7698 return LowerPREFETCH(Op, DAG);
7699 case ISD::SINT_TO_FP:
7700 case ISD::UINT_TO_FP:
7701   case ISD::STRICT_SINT_TO_FP:
7702   case ISD::STRICT_UINT_TO_FP:
7703     return LowerINT_TO_FP(Op, DAG);
7704 case ISD::FP_TO_SINT:
7705 case ISD::FP_TO_UINT:
7706   case ISD::STRICT_FP_TO_SINT:
7707   case ISD::STRICT_FP_TO_UINT:
7708     return LowerFP_TO_INT(Op, DAG);
7709   case ISD::FP_TO_SINT_SAT:
7710   case ISD::FP_TO_UINT_SAT:
7711     return LowerFP_TO_INT_SAT(Op, DAG);
7712 case ISD::GET_ROUNDING:
7713 return LowerGET_ROUNDING(Op, DAG);
7714 case ISD::SET_ROUNDING:
7715 return LowerSET_ROUNDING(Op, DAG);
7716 case ISD::GET_FPMODE:
7717 return LowerGET_FPMODE(Op, DAG);
7718 case ISD::SET_FPMODE:
7719 return LowerSET_FPMODE(Op, DAG);
7720 case ISD::RESET_FPMODE:
7721 return LowerRESET_FPMODE(Op, DAG);
7722 case ISD::MUL:
7723 return LowerMUL(Op, DAG);
7724 case ISD::MULHS:
7725 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
7726 case ISD::MULHU:
7727 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
7728   case ISD::INTRINSIC_W_CHAIN:
7729     return LowerINTRINSIC_W_CHAIN(Op, DAG);
7730   case ISD::INTRINSIC_WO_CHAIN:
7731     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7732   case ISD::INTRINSIC_VOID:
7733     return LowerINTRINSIC_VOID(Op, DAG);
7734 case ISD::ATOMIC_STORE:
7735 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
7736 assert(Subtarget->hasLSE2() || Subtarget->hasRCPC3());
7737 return LowerStore128(Op, DAG);
7738 }
7739 return SDValue();
7740 case ISD::STORE:
7741 return LowerSTORE(Op, DAG);
7742 case ISD::MSTORE:
7743 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
7744 case ISD::MGATHER:
7745 return LowerMGATHER(Op, DAG);
7746 case ISD::MSCATTER:
7747 return LowerMSCATTER(Op, DAG);
7748 case ISD::VECREDUCE_SEQ_FADD:
7749 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
7750 case ISD::VECREDUCE_ADD:
7751 case ISD::VECREDUCE_AND:
7752 case ISD::VECREDUCE_OR:
7753 case ISD::VECREDUCE_XOR:
7754 case ISD::VECREDUCE_SMAX:
7755 case ISD::VECREDUCE_SMIN:
7756 case ISD::VECREDUCE_UMAX:
7757 case ISD::VECREDUCE_UMIN:
7758 case ISD::VECREDUCE_FADD:
7759 case ISD::VECREDUCE_FMAX:
7760 case ISD::VECREDUCE_FMIN:
7761 case ISD::VECREDUCE_FMAXIMUM:
7762 case ISD::VECREDUCE_FMINIMUM:
7763 return LowerVECREDUCE(Op, DAG);
7764 case ISD::VECREDUCE_MUL:
7765 case ISD::VECREDUCE_FMUL:
7766 return LowerVECREDUCE_MUL(Op, DAG);
7767 case ISD::ATOMIC_LOAD_AND:
7768 return LowerATOMIC_LOAD_AND(Op, DAG);
7769 case ISD::DYNAMIC_STACKALLOC:
7770 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7771 case ISD::VSCALE:
7772 return LowerVSCALE(Op, DAG);
7773   case ISD::VECTOR_COMPRESS:
7774     return LowerVECTOR_COMPRESS(Op, DAG);
7775 case ISD::ANY_EXTEND:
7776 case ISD::SIGN_EXTEND:
7777 case ISD::ZERO_EXTEND:
7778 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
7779 case ISD::ADDRSPACECAST:
7780 return LowerADDRSPACECAST(Op, DAG);
7781   case ISD::SIGN_EXTEND_INREG: {
7782     // Only custom lower when ExtraVT has a legal byte-based element type.
7783 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
7784 EVT ExtraEltVT = ExtraVT.getVectorElementType();
7785 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
7786 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
7787 return SDValue();
7788
7789 return LowerToPredicatedOp(Op, DAG,
7790 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
7791 }
7792 case ISD::TRUNCATE:
7793 return LowerTRUNCATE(Op, DAG);
7794 case ISD::MLOAD:
7795 return LowerMLOAD(Op, DAG);
7796 case ISD::LOAD:
7797 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
7798 !Subtarget->isNeonAvailable()))
7799 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
7800 return LowerLOAD(Op, DAG);
7801 case ISD::ADD:
7802 case ISD::AND:
7803 case ISD::SUB:
7804 return LowerToScalableOp(Op, DAG);
7805 case ISD::FMAXIMUM:
7806 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
7807 case ISD::FMAXNUM:
7808 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
7809 case ISD::FMINIMUM:
7810 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
7811 case ISD::FMINNUM:
7812 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
7813 case ISD::VSELECT:
7814 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
7815 case ISD::ABS:
7816 return LowerABS(Op, DAG);
7817 case ISD::ABDS:
7818 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
7819 case ISD::ABDU:
7820 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
7821 case ISD::AVGFLOORS:
7822 return LowerAVG(Op, DAG, AArch64ISD::HADDS_PRED);
7823 case ISD::AVGFLOORU:
7824 return LowerAVG(Op, DAG, AArch64ISD::HADDU_PRED);
7825 case ISD::AVGCEILS:
7826 return LowerAVG(Op, DAG, AArch64ISD::RHADDS_PRED);
7827 case ISD::AVGCEILU:
7828 return LowerAVG(Op, DAG, AArch64ISD::RHADDU_PRED);
7829 case ISD::BITREVERSE:
7830 return LowerBitreverse(Op, DAG);
7831 case ISD::BSWAP:
7832 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
7833 case ISD::CTLZ:
7834 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
7835 case ISD::CTTZ:
7836 return LowerCTTZ(Op, DAG);
7837 case ISD::VECTOR_SPLICE:
7838 return LowerVECTOR_SPLICE(Op, DAG);
7839   case ISD::VECTOR_DEINTERLEAVE:
7840     return LowerVECTOR_DEINTERLEAVE(Op, DAG);
7841   case ISD::VECTOR_INTERLEAVE:
7842     return LowerVECTOR_INTERLEAVE(Op, DAG);
7843 case ISD::GET_ACTIVE_LANE_MASK:
7844 return LowerGET_ACTIVE_LANE_MASK(Op, DAG);
7845 case ISD::LRINT:
7846 case ISD::LLRINT:
7847 if (Op.getValueType().isVector())
7848 return LowerVectorXRINT(Op, DAG);
7849 [[fallthrough]];
7850 case ISD::LROUND:
7851 case ISD::LLROUND: {
7852 assert((Op.getOperand(0).getValueType() == MVT::f16 ||
7853 Op.getOperand(0).getValueType() == MVT::bf16) &&
7854 "Expected custom lowering of rounding operations only for f16");
7855 SDLoc DL(Op);
7856 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Op.getOperand(0));
7857 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), Ext);
7858 }
7859 case ISD::STRICT_LROUND:
7860   case ISD::STRICT_LLROUND:
7861   case ISD::STRICT_LRINT:
7862 case ISD::STRICT_LLRINT: {
7863 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
7864 Op.getOperand(1).getValueType() == MVT::bf16) &&
7865 "Expected custom lowering of rounding operations only for f16");
7866 SDLoc DL(Op);
7867 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
7868 {Op.getOperand(0), Op.getOperand(1)});
7869 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
7870 {Ext.getValue(1), Ext.getValue(0)});
7871 }
7872 case ISD::WRITE_REGISTER: {
7873 assert(Op.getOperand(2).getValueType() == MVT::i128 &&
7874 "WRITE_REGISTER custom lowering is only for 128-bit sysregs");
7875 SDLoc DL(Op);
7876
7877 SDValue Chain = Op.getOperand(0);
7878 SDValue SysRegName = Op.getOperand(1);
7879 std::pair<SDValue, SDValue> Pair =
7880 DAG.SplitScalar(Op.getOperand(2), DL, MVT::i64, MVT::i64);
7881
7882 // chain = MSRR(chain, sysregname, lo, hi)
7883 SDValue Result = DAG.getNode(AArch64ISD::MSRR, DL, MVT::Other, Chain,
7884 SysRegName, Pair.first, Pair.second);
7885
7886 return Result;
7887 }
7888 case ISD::FSHL:
7889 case ISD::FSHR:
7890 return LowerFunnelShift(Op, DAG);
7891 case ISD::FLDEXP:
7892 return LowerFLDEXP(Op, DAG);
7893 case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
7894 return LowerVECTOR_HISTOGRAM(Op, DAG);
7895 case ISD::PARTIAL_REDUCE_SMLA:
7896 case ISD::PARTIAL_REDUCE_UMLA:
7897 case ISD::PARTIAL_REDUCE_SUMLA:
7898 case ISD::PARTIAL_REDUCE_FMLA:
7899 return LowerPARTIAL_REDUCE_MLA(Op, DAG);
7900 }
7901}
7902
7903 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
7904   return !Subtarget->useSVEForFixedLengthVectors();
7905}
7906
7907 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
7908     EVT VT, bool OverrideNEON) const {
7909 if (!VT.isFixedLengthVector() || !VT.isSimple())
7910 return false;
7911
7912 // Don't use SVE for vectors we cannot scalarize if required.
7913 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
7914 // Fixed length predicates should be promoted to i8.
7915 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
7916 case MVT::i1:
7917 default:
7918 return false;
7919 case MVT::i8:
7920 case MVT::i16:
7921 case MVT::i32:
7922 case MVT::i64:
7923 case MVT::f16:
7924 case MVT::f32:
7925 case MVT::f64:
7926 break;
7927 }
7928
7929 // NEON-sized vectors can be emulated using SVE instructions.
7930 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
7931 return Subtarget->isSVEorStreamingSVEAvailable();
7932
7933 // Ensure NEON MVTs only belong to a single register class.
7934 if (VT.getFixedSizeInBits() <= 128)
7935 return false;
7936
7937 // Ensure wider than NEON code generation is enabled.
7938 if (!Subtarget->useSVEForFixedLengthVectors())
7939 return false;
7940
7941 // Don't use SVE for types that don't fit.
7942 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
7943 return false;
7944
7945 // TODO: Perhaps an artificial restriction, but worth having whilst getting
7946 // the base fixed length SVE support in place.
7947 if (!VT.isPow2VectorType())
7948 return false;
7949
7950 return true;
7951}
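// Illustrative outcomes of the predicate above (hedged, assuming a target
// configured with a larger-than-NEON minimum SVE vector length, e.g. via
// -aarch64-sve-vector-bits-min): a 256-bit v8i32 returns true because it is
// wider than NEON, fits the minimum SVE width and has a power-of-two element
// count; a 128-bit v4i32 returns false unless OverrideNEON is set; and any
// i1-element vector returns false.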
7952
7953//===----------------------------------------------------------------------===//
7954// Calling Convention Implementation
7955//===----------------------------------------------------------------------===//
7956
7957static unsigned getIntrinsicID(const SDNode *N) {
7958 unsigned Opcode = N->getOpcode();
7959 switch (Opcode) {
7960 default:
7961     return Intrinsic::not_intrinsic;
7962   case ISD::INTRINSIC_WO_CHAIN: {
7963     unsigned IID = N->getConstantOperandVal(0);
7964 if (IID < Intrinsic::num_intrinsics)
7965 return IID;
7966     return Intrinsic::not_intrinsic;
7967   }
7968 }
7969}
7970
7971 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
7972                                                 SDValue N1) const {
7973 if (!N0.hasOneUse())
7974 return false;
7975
7976 unsigned IID = getIntrinsicID(N1.getNode());
7977 // Avoid reassociating expressions that can be lowered to smlal/umlal.
7978 if (IID == Intrinsic::aarch64_neon_umull ||
7979 N1.getOpcode() == AArch64ISD::UMULL ||
7980 IID == Intrinsic::aarch64_neon_smull ||
7981 N1.getOpcode() == AArch64ISD::SMULL)
7982 return N0.getOpcode() != ISD::ADD;
7983
7984 return true;
7985}
7986
7987/// Selects the correct CCAssignFn for a given CallingConvention value.
7988 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
7989                                                      bool IsVarArg) const {
7990 switch (CC) {
7991 default:
7992 reportFatalUsageError("unsupported calling convention");
7993 case CallingConv::GHC:
7994 return CC_AArch64_GHC;
7995   case CallingConv::PreserveNone:
7996     // The VarArg implementation makes assumptions about register
7997 // argument passing that do not hold for preserve_none, so we
7998 // instead fall back to C argument passing.
7999 // The non-vararg case is handled in the CC function itself.
8000 if (!IsVarArg)
8001       return CC_AArch64_Preserve_None;
8002     [[fallthrough]];
8003 case CallingConv::C:
8004 case CallingConv::Fast:
8005   case CallingConv::PreserveMost:
8006   case CallingConv::PreserveAll:
8007   case CallingConv::CXX_FAST_TLS:
8008   case CallingConv::Swift:
8009   case CallingConv::SwiftTail:
8010   case CallingConv::Tail:
8011 case CallingConv::GRAAL:
8012 if (Subtarget->isTargetWindows()) {
8013 if (IsVarArg) {
8014 if (Subtarget->isWindowsArm64EC())
8015           return CC_AArch64_Arm64EC_VarArg;
8016         return CC_AArch64_Win64_VarArg;
8017       }
8018 return CC_AArch64_Win64PCS;
8019 }
8020 if (!Subtarget->isTargetDarwin())
8021 return CC_AArch64_AAPCS;
8022 if (!IsVarArg)
8023 return CC_AArch64_DarwinPCS;
8024 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
8025                                       : CC_AArch64_DarwinPCS_VarArg;
8026   case CallingConv::Win64:
8027 if (IsVarArg) {
8028 if (Subtarget->isWindowsArm64EC())
8029         return CC_AArch64_Arm64EC_VarArg;
8030       return CC_AArch64_Win64_VarArg;
8031     }
8032 return CC_AArch64_Win64PCS;
8034 if (Subtarget->isWindowsArm64EC())
8042 return CC_AArch64_AAPCS;
8047 }
8048}
8049
8050CCAssignFn *
8051 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
8052   switch (CC) {
8053 default:
8054 return RetCC_AArch64_AAPCS;
8058 if (Subtarget->isWindowsArm64EC())
8060 return RetCC_AArch64_AAPCS;
8061 }
8062}
8063
8064static bool isPassedInFPR(EVT VT) {
8065 return VT.isFixedLengthVector() ||
8066 (VT.isFloatingPoint() && !VT.isScalableVector());
8067}
8068
8069 static SDValue getZT0FrameIndex(MachineFrameInfo &MFI,
8070                                 AArch64FunctionInfo &FuncInfo,
8071 SelectionDAG &DAG) {
8072 if (!FuncInfo.hasZT0SpillSlotIndex())
8073 FuncInfo.setZT0SpillSlotIndex(MFI.CreateSpillStackObject(64, Align(16)));
8074
8075 return DAG.getFrameIndex(
8076 FuncInfo.getZT0SpillSlotIndex(),
8078}
8079
8080// Emit a call to __arm_sme_save or __arm_sme_restore.
8082 SelectionDAG &DAG,
8084 SDValue Chain, bool IsSave) {
8087 FuncInfo->setSMESaveBufferUsed();
8089 Args.emplace_back(
8090 DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64),
8092
8093 RTLIB::Libcall LC =
8094 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE;
8095 SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
8096 TLI.getPointerTy(DAG.getDataLayout()));
8097 auto *RetTy = Type::getVoidTy(*DAG.getContext());
8098   TargetLowering::CallLoweringInfo CLI(DAG);
8099   CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8100 TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args));
8101 return TLI.LowerCallTo(CLI).second;
8102}
8103
8104 static SDValue emitRestoreZALazySave(SDValue Chain, SDLoc DL,
8105                                      const AArch64TargetLowering &TLI,
8106 const AArch64RegisterInfo &TRI,
8107 AArch64FunctionInfo &FuncInfo,
8108 SelectionDAG &DAG) {
8109 // Conditionally restore the lazy save using a pseudo node.
8110 RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE;
8111 TPIDR2Object &TPIDR2 = FuncInfo.getTPIDR2Obj();
8112 SDValue RegMask = DAG.getRegisterMask(TRI.getCallPreservedMask(
8114 SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
8115 TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout()));
8116 SDValue TPIDR2_EL0 = DAG.getNode(
8117 ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Chain,
8118 DAG.getTargetConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));
8119 // Copy the address of the TPIDR2 block into X0 before 'calling' the
8120 // RESTORE_ZA pseudo.
8121 SDValue Glue;
8122 SDValue TPIDR2Block = DAG.getFrameIndex(
8123 TPIDR2.FrameIndex,
8125 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, TPIDR2Block, Glue);
8126 Chain =
8127 DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
8128 {Chain, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
8129 RestoreRoutine, RegMask, Chain.getValue(1)});
8130 // Finally reset the TPIDR2_EL0 register to 0.
8131 Chain = DAG.getNode(
8132 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8133 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8134 DAG.getConstant(0, DL, MVT::i64));
8135 TPIDR2.Uses++;
8136 return Chain;
8137}
8138
8139SDValue AArch64TargetLowering::lowerEHPadEntry(SDValue Chain, SDLoc const &DL,
8140 SelectionDAG &DAG) const {
8141 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8142 SDValue Glue = Chain.getValue(1);
8143
8144 MachineFunction &MF = DAG.getMachineFunction();
8145 auto &FuncInfo = *MF.getInfo<AArch64FunctionInfo>();
8146 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
8147 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
8148
8149 SMEAttrs SMEFnAttrs = FuncInfo.getSMEFnAttrs();
8150
8151 // The following conditions are true on entry to an exception handler:
8152 // - PSTATE.SM is 0.
8153 // - PSTATE.ZA is 0.
8154 // - TPIDR2_EL0 is null.
8155 // See:
8156 // https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#exceptions
8157 //
8158 // Therefore, if the function that contains this exception handler is a
8159 // streaming[-compatible] function, we must re-enable streaming mode.
8160 //
8161 // These mode changes are usually optimized away in catch blocks as they
8162 // occur before the __cxa_begin_catch (which is a non-streaming function),
8163 // but are necessary in some cases (such as for cleanups).
8164 //
8165 // Additionally, if the function has ZA or ZT0 state, we must restore it.
8166
8167 // [COND_]SMSTART SM
8168 if (SMEFnAttrs.hasStreamingInterfaceOrBody())
8169 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain,
8170 /*Glue*/ Glue, AArch64SME::Always);
8171 else if (SMEFnAttrs.hasStreamingCompatibleInterface())
8172 Chain = changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
8174
8175 if (getTM().useNewSMEABILowering())
8176 return Chain;
8177
8178 if (SMEFnAttrs.hasAgnosticZAInterface()) {
8179 // Restore full ZA
8180 Chain = emitSMEStateSaveRestore(*this, DAG, &FuncInfo, DL, Chain,
8181 /*IsSave=*/false);
8182 } else if (SMEFnAttrs.hasZAState() || SMEFnAttrs.hasZT0State()) {
8183 // SMSTART ZA
8184 Chain = DAG.getNode(
8185 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
8186 DAG.getTargetConstant(int32_t(AArch64SVCR::SVCRZA), DL, MVT::i32));
8187
8188 // Restore ZT0
8189 if (SMEFnAttrs.hasZT0State()) {
8190 SDValue ZT0FrameIndex =
8191 getZT0FrameIndex(MF.getFrameInfo(), FuncInfo, DAG);
8192 Chain =
8193 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
8194 {Chain, DAG.getConstant(0, DL, MVT::i32), ZT0FrameIndex});
8195 }
8196
8197 // Restore ZA
8198 if (SMEFnAttrs.hasZAState())
8199 Chain = emitRestoreZALazySave(Chain, DL, *this, TRI, FuncInfo, DAG);
8200 }
8201
8202 return Chain;
8203}
8204
8205SDValue AArch64TargetLowering::LowerFormalArguments(
8206 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
8207 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
8208 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
8209 MachineFunction &MF = DAG.getMachineFunction();
8210 const Function &F = MF.getFunction();
8211 MachineFrameInfo &MFI = MF.getFrameInfo();
8212 bool IsWin64 =
8213 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8214 bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 ||
8215 (isVarArg && Subtarget->isWindowsArm64EC());
8216 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8217
8219 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
8221 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
8222 FuncInfo->setIsSVECC(true);
8223
8224 // Assign locations to all of the incoming arguments.
8225   SmallVector<CCValAssign, 16> ArgLocs;
8226   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
8227
8228 // At this point, Ins[].VT may already be promoted to i32. To correctly
8229 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
8230 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
8231 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
8232 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
8233 // LocVT.
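// For example (illustrative): for an IR argument of type i8, Ins[i].VT may
// already have been promoted to i32, but the original type recovered below
// is i8, so the assignment runs with ValVT = LocVT = i8 and the argument is
// later loaded/stored as a genuine byte rather than a full i32 slot.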
8234 unsigned NumArgs = Ins.size();
8235 Function::const_arg_iterator CurOrigArg = F.arg_begin();
8236 unsigned CurArgIdx = 0;
8237 bool UseVarArgCC = false;
8238 if (IsWin64)
8239 UseVarArgCC = isVarArg;
8240
8241 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
8242
8243 for (unsigned i = 0; i != NumArgs; ++i) {
8244 MVT ValVT = Ins[i].VT;
8245 if (Ins[i].isOrigArg()) {
8246 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
8247 CurArgIdx = Ins[i].getOrigArgIndex();
8248
8249 // Get type of the original argument.
8250 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
8251 /*AllowUnknown*/ true);
8252 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
8253 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8254 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8255 ValVT = MVT::i8;
8256 else if (ActualMVT == MVT::i16)
8257 ValVT = MVT::i16;
8258 }
8259 bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags,
8260 Ins[i].OrigTy, CCInfo);
8261 assert(!Res && "Call operand has unhandled type");
8262 (void)Res;
8263 }
8264
8265 SMEAttrs Attrs = FuncInfo->getSMEFnAttrs();
8266 bool IsLocallyStreaming =
8267 !Attrs.hasStreamingInterface() && Attrs.hasStreamingBody();
8268 assert(Chain.getOpcode() == ISD::EntryToken && "Unexpected Chain value");
8269 SDValue Glue = Chain.getValue(1);
8270
8271 unsigned ExtraArgLocs = 0;
8272 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
8273 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
8274
8275 if (Ins[i].Flags.isByVal()) {
8276 // Byval is used for HFAs in the PCS, but the system should work in a
8277 // non-compliant manner for larger structs.
8278 EVT PtrVT = getPointerTy(DAG.getDataLayout());
8279 int Size = Ins[i].Flags.getByValSize();
8280 unsigned NumRegs = (Size + 7) / 8;
8281
8282 // FIXME: This works on big-endian for composite byvals, which are the common
8283 // case. It should also work for fundamental types.
8284 unsigned FrameIdx =
8285 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
8286 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
8287 InVals.push_back(FrameIdxN);
8288
8289 continue;
8290 }
8291
8292 if (Ins[i].Flags.isSwiftAsync())
8293 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
8294
8295 SDValue ArgValue;
8296 if (VA.isRegLoc()) {
8297 // Arguments stored in registers.
8298 EVT RegVT = VA.getLocVT();
8299 const TargetRegisterClass *RC;
8300
8301 if (RegVT == MVT::i32)
8302 RC = &AArch64::GPR32RegClass;
8303 else if (RegVT == MVT::i64)
8304 RC = &AArch64::GPR64RegClass;
8305 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
8306 RC = &AArch64::FPR16RegClass;
8307 else if (RegVT == MVT::f32)
8308 RC = &AArch64::FPR32RegClass;
8309 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
8310 RC = &AArch64::FPR64RegClass;
8311 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
8312 RC = &AArch64::FPR128RegClass;
8313 else if (RegVT.isScalableVector() &&
8314 RegVT.getVectorElementType() == MVT::i1) {
8315 FuncInfo->setIsSVECC(true);
8316 RC = &AArch64::PPRRegClass;
8317 } else if (RegVT == MVT::aarch64svcount) {
8318 FuncInfo->setIsSVECC(true);
8319 RC = &AArch64::PPRRegClass;
8320 } else if (RegVT.isScalableVector()) {
8321 FuncInfo->setIsSVECC(true);
8322 RC = &AArch64::ZPRRegClass;
8323 } else
8324 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
8325
8326 // Transform the arguments in physical registers into virtual ones.
8327 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
8328
8329 if (IsLocallyStreaming) {
8330 // LocallyStreamingFunctions must insert the SMSTART in the correct
8331 // position, so we use Glue to ensure no instructions can be scheduled
8332 // between the chain of:
8333 // t0: ch,glue = EntryNode
8334 // t1: res,ch,glue = CopyFromReg
8335 // ...
8336 // tn: res,ch,glue = CopyFromReg t(n-1), ..
8337 // t(n+1): ch, glue = SMSTART t0:0, ...., tn:2
8338 // ^^^^^^
8339 // This will be the new Chain/Root node.
8340 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT, Glue);
8341 Glue = ArgValue.getValue(2);
8342 if (isPassedInFPR(ArgValue.getValueType())) {
8343 ArgValue =
8344 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8345 DAG.getVTList(ArgValue.getValueType(), MVT::Glue),
8346 {ArgValue, Glue});
8347 Glue = ArgValue.getValue(1);
8348 }
8349 } else
8350 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
8351
8352 // If this is an 8, 16 or 32-bit value, it is really passed promoted
8353 // to 64 bits. Insert an assert[sz]ext to capture this, then
8354 // truncate to the right size.
8355 switch (VA.getLocInfo()) {
8356 default:
8357 llvm_unreachable("Unknown loc info!");
8358 case CCValAssign::Full:
8359 break;
8360 case CCValAssign::Indirect:
8361 assert(
8362 (VA.getValVT().isScalableVT() || Subtarget->isWindowsArm64EC()) &&
8363 "Indirect arguments should be scalable on most subtargets");
8364 break;
8365 case CCValAssign::BCvt:
8366 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
8367 break;
8368 case CCValAssign::AExt:
8369 case CCValAssign::SExt:
8370 case CCValAssign::ZExt:
8371 break;
8372 case CCValAssign::AExtUpper:
8373 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
8374 DAG.getConstant(32, DL, RegVT));
8375 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
8376 break;
8377 }
8378 } else { // VA.isRegLoc()
8379 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
8380 unsigned ArgOffset = VA.getLocMemOffset();
8381 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
8382 ? VA.getLocVT().getSizeInBits()
8383 : VA.getValVT().getSizeInBits()) / 8;
8384
8385 uint32_t BEAlign = 0;
8386 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
8387 !Ins[i].Flags.isInConsecutiveRegs())
8388 BEAlign = 8 - ArgSize;
8389
8390 SDValue FIN;
8391 MachinePointerInfo PtrInfo;
8392 if (StackViaX4) {
8393 // In both the ARM64EC varargs convention and the thunk convention,
8394 // arguments on the stack are accessed relative to x4, not sp. In
8395 // the thunk convention, there's an additional offset of 32 bytes
8396 // to account for the shadow store.
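// Illustrative arithmetic (not from the source): for a stack argument at
// ArgOffset 8 reached through an ARM64EC_Thunk_X64 thunk, ObjOffset becomes
// 8 + 32 = 40 and the load address below is computed as x4 + 40, i.e. past
// the 32-byte shadow store reserved by the x64-side caller.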
8397 unsigned ObjOffset = ArgOffset + BEAlign;
8398 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
8399 ObjOffset += 32;
8400 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8401 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8402 FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
8403 DAG.getConstant(ObjOffset, DL, MVT::i64));
8404 PtrInfo = MachinePointerInfo::getUnknownStack(MF);
8405 } else {
8406 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
8407
8408 // Create load nodes to retrieve arguments from the stack.
8409 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
8410 PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
8411 }
8412
8413 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
8414 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
8415 MVT MemVT = VA.getValVT();
8416
8417 switch (VA.getLocInfo()) {
8418 default:
8419 break;
8420 case CCValAssign::Trunc:
8421 case CCValAssign::BCvt:
8422 MemVT = VA.getLocVT();
8423 break;
8424 case CCValAssign::Indirect:
8425 assert((VA.getValVT().isScalableVT() ||
8426 Subtarget->isWindowsArm64EC()) &&
8427 "Indirect arguments should be scalable on most subtargets");
8428 MemVT = VA.getLocVT();
8429 break;
8430 case CCValAssign::SExt:
8431 ExtType = ISD::SEXTLOAD;
8432 break;
8433 case CCValAssign::ZExt:
8434 ExtType = ISD::ZEXTLOAD;
8435 break;
8436 case CCValAssign::AExt:
8437 ExtType = ISD::EXTLOAD;
8438 break;
8439 }
8440
8441 ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, PtrInfo,
8442 MemVT);
8443 }
8444
8445 if (VA.getLocInfo() == CCValAssign::Indirect) {
8446 assert((VA.getValVT().isScalableVT() ||
8447 Subtarget->isWindowsArm64EC()) &&
8448 "Indirect arguments should be scalable on most subtargets");
8449
8450 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinValue();
8451 unsigned NumParts = 1;
8452 if (Ins[i].Flags.isInConsecutiveRegs()) {
8453 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
8454 ++NumParts;
8455 }
8456
8457 MVT PartLoad = VA.getValVT();
8458 SDValue Ptr = ArgValue;
8459
8460 // Ensure we generate all loads for each tuple part, whilst updating the
8461 // pointer after each load correctly using vscale.
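// For example (illustrative): an SVE tuple such as svint32x2_t that had to
// be passed indirectly (e.g. once the Z argument registers are exhausted)
// is reloaded here as two scalable parts, with the pointer advanced by
// vscale * 16 bytes (one SVE vector's worth) between the two loads.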
8462 while (NumParts > 0) {
8463 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
8464 InVals.push_back(ArgValue);
8465 NumParts--;
8466 if (NumParts > 0) {
8467 SDValue BytesIncrement;
8468 if (PartLoad.isScalableVector()) {
8469 BytesIncrement = DAG.getVScale(
8470 DL, Ptr.getValueType(),
8471 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
8472 } else {
8473 BytesIncrement = DAG.getConstant(
8474 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
8475 Ptr.getValueType());
8476 }
8477 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
8478 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
8479 ExtraArgLocs++;
8480 i++;
8481 }
8482 }
8483 } else {
8484 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
8485 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
8486 ArgValue, DAG.getValueType(MVT::i32));
8487
8488 // i1 arguments are zero-extended to i8 by the caller. Emit a
8489 // hint to reflect this.
8490 if (Ins[i].isOrigArg()) {
8491 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
8492 if (OrigArg->getType()->isIntegerTy(1)) {
8493 if (!Ins[i].Flags.isZExt()) {
8494 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
8495 ArgValue.getValueType(), ArgValue);
8496 }
8497 }
8498 }
8499
8500 InVals.push_back(ArgValue);
8501 }
8502 }
8503 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
8504
8505 if (Attrs.hasStreamingCompatibleInterface()) {
8506 SDValue EntryPStateSM =
8507 DAG.getNode(AArch64ISD::ENTRY_PSTATE_SM, DL,
8508 DAG.getVTList(MVT::i64, MVT::Other), {Chain});
8509
8510 // Copy the value to a virtual register, and save that in FuncInfo.
8511 Register EntryPStateSMReg =
8512 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8513 Chain = DAG.getCopyToReg(EntryPStateSM.getValue(1), DL, EntryPStateSMReg,
8514 EntryPStateSM);
8515 FuncInfo->setPStateSMReg(EntryPStateSMReg);
8516 }
8517
8518 // Insert the SMSTART if this is a locally streaming function and
8519 // make sure it is Glued to the last CopyFromReg value.
8520 if (IsLocallyStreaming) {
8521 if (Attrs.hasStreamingCompatibleInterface())
8522 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8523 AArch64SME::IfCallerIsNonStreaming);
8524 else
8525 Chain = changeStreamingMode(DAG, DL, /*Enable*/ true, Chain, Glue,
8526 AArch64SME::Always);
8527
8528 // Ensure that the SMSTART happens after the CopyWithChain such that its
8529 // chain result is used.
8530 for (unsigned I=0; I<InVals.size(); ++I) {
8531 Register Reg = MF.getRegInfo().createVirtualRegister(
8532 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
8533 Chain = DAG.getCopyToReg(Chain, DL, Reg, InVals[I]);
8534 InVals[I] = DAG.getCopyFromReg(Chain, DL, Reg,
8535 InVals[I].getValueType());
8536 }
8537 }
8538
8539 // varargs
8540 if (isVarArg) {
8542 if (!Subtarget->isTargetDarwin() || IsWin64) {
8543 // The AAPCS variadic function ABI is identical to the non-variadic
8544 // one. As a result there may be more arguments in registers and we
8545 // should save them for future reference.
8546 // Win64 variadic functions also pass arguments in registers, but all
8547 // float arguments are passed in integer registers.
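// For example (illustrative): a Win64 variadic callee receives a double
// argument in an x-register rather than a d-register, so the GPR save
// area written below already captures it; this is also why no FPR save
// area is created for Win64 further down.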
8548 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
8549 }
8550
8551 // This will point to the next argument passed via stack.
8552 unsigned VarArgsOffset = CCInfo.getStackSize();
8553 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
8554 VarArgsOffset =
8555 alignTo(VarArgsOffset, Subtarget->isTargetILP32() ? 4 : 8);
8556 FuncInfo->setVarArgsStackOffset(VarArgsOffset);
8557 FuncInfo->setVarArgsStackIndex(
8558 MFI.CreateFixedObject(4, VarArgsOffset, true));
8559 }
8560
8561 if (MFI.hasMustTailInVarArgFunc()) {
8562 SmallVector<MVT, 2> RegParmTypes;
8563 RegParmTypes.push_back(MVT::i64);
8564 RegParmTypes.push_back(MVT::f128);
8565 // Compute the set of forwarded registers. The rest are scratch.
8566 SmallVectorImpl<ForwardedRegister> &Forwards =
8567 FuncInfo->getForwardedMustTailRegParms();
8568 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
8569 CC_AArch64_AAPCS);
8570
8571 // Conservatively forward X8, since it might be used for aggregate return.
8572 if (!CCInfo.isAllocated(AArch64::X8)) {
8573 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
8574 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
8575 }
8576 }
8577 }
8578
8579 // On Windows, InReg pointers must be returned, so record the pointer in a
8580 // virtual register at the start of the function so it can be returned in the
8581 // epilogue.
8582 if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) {
8583 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
8584 if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
8585 Ins[I].Flags.isInReg()) &&
8586 Ins[I].Flags.isSRet()) {
8587 assert(!FuncInfo->getSRetReturnReg());
8588
8589 MVT PtrTy = getPointerTy(DAG.getDataLayout());
8590 Register Reg =
8591 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
8592 FuncInfo->setSRetReturnReg(Reg);
8593
8594 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
8595 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
8596 break;
8597 }
8598 }
8599 }
8600
8601 unsigned StackArgSize = CCInfo.getStackSize();
8602 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
8603 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
8604 // This is a non-standard ABI so by fiat I say we're allowed to make full
8605 // use of the stack area to be popped, which must be aligned to 16 bytes in
8606 // any case:
8607 StackArgSize = alignTo(StackArgSize, 16);
8608
8609 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
8610 // a multiple of 16.
8611 FuncInfo->setArgumentStackToRestore(StackArgSize);
8612
8613 // This realignment carries over to the available bytes below. Our own
8614 // callers will guarantee the space is free by giving an aligned value to
8615 // CALLSEQ_START.
8616 }
8617 // Even if we're not expected to free up the space, it's useful to know how
8618 // much is there while considering tail calls (because we can reuse it).
8619 FuncInfo->setBytesInStackArgArea(StackArgSize);
8620
8621 if (Subtarget->hasCustomCallingConv())
8622 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
8623
8624 if (getTM().useNewSMEABILowering()) {
8625 if (Subtarget->isTargetWindows() || hasInlineStackProbe(MF)) {
8626 SDValue Size;
8627 if (Attrs.hasZAState()) {
8628 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8629 DAG.getConstant(1, DL, MVT::i32));
8630 Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8631 } else if (Attrs.hasAgnosticZAInterface()) {
8632 RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE;
8633 SDValue Callee = DAG.getExternalSymbol(
8634 getLibcallName(LC), getPointerTy(DAG.getDataLayout()));
8635 auto *RetTy = EVT(MVT::i64).getTypeForEVT(*DAG.getContext());
8636 TargetLowering::CallLoweringInfo CLI(DAG);
8637 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
8638 getLibcallCallingConv(LC), RetTy, Callee, {});
8639 std::tie(Size, Chain) = LowerCallTo(CLI);
8640 }
8641 if (Size) {
8642 SDValue Buffer = DAG.getNode(
8643 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8644 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8645 Chain = Buffer.getValue(1);
8646
8647 Register BufferPtr =
8648 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8649 Chain = DAG.getCopyToReg(Chain, DL, BufferPtr, Buffer);
8650 Chain = DAG.getNode(AArch64ISD::SME_STATE_ALLOC, DL,
8651 DAG.getVTList(MVT::Other), Chain);
8652 FuncInfo->setEarlyAllocSMESaveBuffer(BufferPtr);
8653 MFI.CreateVariableSizedObject(Align(16), nullptr);
8654 }
8655 }
8656 } else {
8657 // Old SME ABI lowering (deprecated):
8658 // Create a 16 Byte TPIDR2 object. The dynamic buffer
8659 // will be expanded and stored in the static object later using a
8660 // pseudonode.
8661 if (Attrs.hasZAState()) {
8662 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8663 TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
8664 SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8665 DAG.getConstant(1, DL, MVT::i32));
8666 SDValue Buffer;
8667 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8668 Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
8669 DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
8670 } else {
8671 SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
8672 Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
8673 DAG.getVTList(MVT::i64, MVT::Other),
8674 {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
8675 MFI.CreateVariableSizedObject(Align(16), nullptr);
8676 }
8677 SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
8678 DAG.getConstant(1, DL, MVT::i32));
8679 Chain = DAG.getNode(
8680 AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
8681 {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0),
8682 /*Num save slices*/ NumZaSaveSlices});
8683 } else if (Attrs.hasAgnosticZAInterface()) {
8684 // Call __arm_sme_state_size().
8685 SDValue BufferSize =
8686 DAG.getNode(AArch64ISD::GET_SME_SAVE_SIZE, DL,
8687 DAG.getVTList(MVT::i64, MVT::Other), Chain);
8688 Chain = BufferSize.getValue(1);
8689 SDValue Buffer;
8690 if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
8691 Buffer = DAG.getNode(AArch64ISD::ALLOC_SME_SAVE_BUFFER, DL,
8692 DAG.getVTList(MVT::i64, MVT::Other),
8693 {Chain, BufferSize});
8694 } else {
8695 // Allocate space dynamically.
8696 Buffer = DAG.getNode(
8697 ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other),
8698 {Chain, BufferSize, DAG.getConstant(1, DL, MVT::i64)});
8699 MFI.CreateVariableSizedObject(Align(16), nullptr);
8700 }
8701 // Copy the value to a virtual register, and save that in FuncInfo.
8702 Register BufferPtr =
8703 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
8704 FuncInfo->setSMESaveBufferAddr(BufferPtr);
8705 Chain = DAG.getCopyToReg(Buffer.getValue(1), DL, BufferPtr, Buffer);
8706 }
8707 }
8708
8709 if (CallConv == CallingConv::PreserveNone) {
8710 for (const ISD::InputArg &I : Ins) {
8711 if (I.Flags.isSwiftSelf() || I.Flags.isSwiftError() ||
8712 I.Flags.isSwiftAsync()) {
8713 MachineFunction &MF = DAG.getMachineFunction();
8714 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
8715 MF.getFunction(),
8716 "Swift attributes can't be used with preserve_none",
8717 DL.getDebugLoc()));
8718 break;
8719 }
8720 }
8721 }
8722
8723 if (getTM().useNewSMEABILowering()) {
8724 // Clear new ZT0 state. TODO: Move this to the SME ABI pass.
8725 if (Attrs.isNewZT0())
8726 Chain = DAG.getNode(
8727 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
8728 DAG.getTargetConstant(Intrinsic::aarch64_sme_zero_zt, DL, MVT::i32),
8729 DAG.getTargetConstant(0, DL, MVT::i32));
8730 }
8731
8732 return Chain;
8733}
8734
8735void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
8736 SelectionDAG &DAG,
8737 const SDLoc &DL,
8738 SDValue &Chain) const {
8739 MachineFunction &MF = DAG.getMachineFunction();
8740 MachineFrameInfo &MFI = MF.getFrameInfo();
8741 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8742 auto PtrVT = getPointerTy(DAG.getDataLayout());
8743 Function &F = MF.getFunction();
8744 bool IsWin64 =
8745 Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg());
8746
8747 SmallVector<SDValue, 8> MemOps;
8748
8749 auto GPRArgRegs = AArch64::getGPRArgRegs();
8750 unsigned NumGPRArgRegs = GPRArgRegs.size();
8751 if (Subtarget->isWindowsArm64EC()) {
8752 // In the ARM64EC ABI, only x0-x3 are used to pass arguments to varargs
8753 // functions.
8754 NumGPRArgRegs = 4;
8755 }
8756 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
8757
8758 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
8759 int GPRIdx = 0;
8760 if (GPRSaveSize != 0) {
8761 if (IsWin64) {
8762 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
8763 if (GPRSaveSize & 15)
8764 // The extra size here, if triggered, will always be 8.
8765 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
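// Illustrative arithmetic (not from the source): with FirstVariadicGPR == 3
// and the usual 8 GPR argument registers, GPRSaveSize = 8 * (8 - 3) = 40,
// which is not 16-byte aligned, so an extra 8-byte fixed object is created
// and the Win64 register save area effectively spans 48 bytes below SP.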
8766 } else
8767 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
8768
8769 SDValue FIN;
8770 if (Subtarget->isWindowsArm64EC()) {
8771 // With the Arm64EC ABI, we reserve the save area as usual, but we
8772 // compute its address relative to x4. For a normal AArch64->AArch64
8773 // call, x4 == sp on entry, but calls from an entry thunk can pass in a
8774 // different address.
8775 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
8776 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8777 FIN = DAG.getNode(ISD::SUB, DL, MVT::i64, Val,
8778 DAG.getConstant(GPRSaveSize, DL, MVT::i64));
8779 } else {
8780 FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
8781 }
8782
8783 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
8784 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
8785 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
8786 SDValue Store =
8787 DAG.getStore(Val.getValue(1), DL, Val, FIN,
8788 IsWin64 ? MachinePointerInfo::getFixedStack(
8789 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
8790 : MachinePointerInfo::getStack(MF, i * 8));
8791 MemOps.push_back(Store);
8792 FIN =
8793 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
8794 }
8795 }
8796 FuncInfo->setVarArgsGPRIndex(GPRIdx);
8797 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
8798
8799 if (Subtarget->hasFPARMv8() && !IsWin64) {
8800 auto FPRArgRegs = AArch64::getFPRArgRegs();
8801 const unsigned NumFPRArgRegs = FPRArgRegs.size();
8802 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
8803
8804 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
8805 int FPRIdx = 0;
8806 if (FPRSaveSize != 0) {
8807 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
8808
8809 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
8810
8811 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
8812 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
8813 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
8814
8815 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
8816 MachinePointerInfo::getStack(MF, i * 16));
8817 MemOps.push_back(Store);
8818 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
8819 DAG.getConstant(16, DL, PtrVT));
8820 }
8821 }
8822 FuncInfo->setVarArgsFPRIndex(FPRIdx);
8823 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
8824 }
8825
8826 if (!MemOps.empty()) {
8827 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8828 }
8829}
8830
8831/// LowerCallResult - Lower the result values of a call into the
8832/// appropriate copies out of appropriate physical registers.
8833SDValue AArch64TargetLowering::LowerCallResult(
8834 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
8835 const SmallVectorImpl<CCValAssign> &RVLocs, const SDLoc &DL,
8836 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
8837 SDValue ThisVal, bool RequiresSMChange) const {
8838 DenseMap<unsigned, SDValue> CopiedRegs;
8839 // Copy all of the result registers out of their specified physreg.
8840 for (unsigned i = 0; i != RVLocs.size(); ++i) {
8841 CCValAssign VA = RVLocs[i];
8842
8843 // Pass 'this' value directly from the argument to return value, to avoid
8844 // reg unit interference
8845 if (i == 0 && isThisReturn) {
8846 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
8847 "unexpected return calling convention register assignment");
8848 InVals.push_back(ThisVal);
8849 continue;
8850 }
8851
8852 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
8853 // allows one use of a physreg per block.
8854 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
8855 if (!Val) {
8856 Val =
8857 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
8858 Chain = Val.getValue(1);
8859 InGlue = Val.getValue(2);
8860 CopiedRegs[VA.getLocReg()] = Val;
8861 }
8862
8863 switch (VA.getLocInfo()) {
8864 default:
8865 llvm_unreachable("Unknown loc info!");
8866 case CCValAssign::Full:
8867 break;
8868 case CCValAssign::BCvt:
8869 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
8870 break;
8871 case CCValAssign::AExtUpper:
8872 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
8873 DAG.getConstant(32, DL, VA.getLocVT()));
8874 [[fallthrough]];
8875 case CCValAssign::AExt:
8876 [[fallthrough]];
8877 case CCValAssign::ZExt:
8878 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
8879 break;
8880 }
8881
8882 if (RequiresSMChange && isPassedInFPR(VA.getValVT()))
8883 Val = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
8884 DAG.getVTList(Val.getValueType(), MVT::Glue), Val);
8885
8886 InVals.push_back(Val);
8887 }
8888
8889 return Chain;
8890}
8891
8892/// Return true if the calling convention is one that we can guarantee TCO for.
8893static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
8894 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
8895 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
8896}
8897
8898/// Return true if we might ever do TCO for calls with this calling convention.
8899 static bool mayTailCallThisCC(CallingConv::ID CC) {
8900 switch (CC) {
8901 case CallingConv::C:
8902 case CallingConv::AArch64_SVE_VectorCall:
8903 case CallingConv::PreserveMost:
8904 case CallingConv::PreserveAll:
8905 case CallingConv::PreserveNone:
8906 case CallingConv::Swift:
8907 case CallingConv::SwiftTail:
8908 case CallingConv::Tail:
8909 case CallingConv::Fast:
8910 return true;
8911 default:
8912 return false;
8913 }
8914}
8915
8916/// Return true if the call convention supports varargs
8917/// Currently only those that pass varargs like the C
8918/// calling convention does are eligible
8919/// Calling conventions listed in this function must also
8920/// be properly handled in AArch64Subtarget::isCallingConvWin64
8921 static bool callConvSupportsVarArgs(CallingConv::ID CC) {
8922 switch (CC) {
8923 case CallingConv::C:
8924 case CallingConv::Win64:
8925 // SVE vector call is only partially supported, but it should
8926 // support named arguments being passed. Any arguments being passed
8927 // as varargs, are still unsupported.
8928 case CallingConv::AArch64_SVE_VectorCall:
8929 return true;
8930 default:
8931 return false;
8932 }
8933}
8934
8935 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
8936 const AArch64Subtarget *Subtarget,
8937 const TargetLowering::CallLoweringInfo &CLI,
8938 CCState &CCInfo) {
8939 const SelectionDAG &DAG = CLI.DAG;
8940 CallingConv::ID CalleeCC = CLI.CallConv;
8941 bool IsVarArg = CLI.IsVarArg;
8942 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
8943 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC, IsVarArg);
8944
8945 // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack
8946 // for the shadow store.
8947 if (CalleeCC == CallingConv::ARM64EC_Thunk_X64)
8948 CCInfo.AllocateStack(32, Align(16));
8949
8950 unsigned NumArgs = Outs.size();
8951 for (unsigned i = 0; i != NumArgs; ++i) {
8952 MVT ArgVT = Outs[i].VT;
8953 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
8954
8955 bool UseVarArgCC = false;
8956 if (IsVarArg) {
8957 // On Windows, the fixed arguments in a vararg call are passed in GPRs
8958 // too, so use the vararg CC to force them to integer registers.
8959 if (IsCalleeWin64) {
8960 UseVarArgCC = true;
8961 } else {
8962 UseVarArgCC = ArgFlags.isVarArg();
8963 }
8964 }
8965
8966 if (!UseVarArgCC) {
8967 // Get type of the original argument.
8968 EVT ActualVT =
8969 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
8970 /*AllowUnknown*/ true);
8971 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
8972 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
8973 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
8974 ArgVT = MVT::i8;
8975 else if (ActualMVT == MVT::i16)
8976 ArgVT = MVT::i16;
8977 }
8978
8979 // FIXME: CCAssignFnForCall should be called once, for the call and not per
8980 // argument. This logic should exactly mirror LowerFormalArguments.
8981 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
8982 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
8983 Outs[i].OrigTy, CCInfo);
8984 assert(!Res && "Call operand has unhandled type");
8985 (void)Res;
8986 }
8987}
8988
8989static SMECallAttrs
8990 getSMECallAttrs(const Function &Caller, const RTLIB::RuntimeLibcallsInfo &RTLCI,
8991 const TargetLowering::CallLoweringInfo &CLI) {
8992 if (CLI.CB)
8993 return SMECallAttrs(*CLI.CB, &RTLCI);
8994 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
8995 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), RTLCI));
8996 return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal));
8997}
8998
8999bool AArch64TargetLowering::isEligibleForTailCallOptimization(
9000 const CallLoweringInfo &CLI) const {
9001 CallingConv::ID CalleeCC = CLI.CallConv;
9002 if (!mayTailCallThisCC(CalleeCC))
9003 return false;
9004
9005 SDValue Callee = CLI.Callee;
9006 bool IsVarArg = CLI.IsVarArg;
9007 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9008 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9009 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9010 const SelectionDAG &DAG = CLI.DAG;
9011 MachineFunction &MF = DAG.getMachineFunction();
9012 const Function &CallerF = MF.getFunction();
9013 CallingConv::ID CallerCC = CallerF.getCallingConv();
9014
9015 // SME Streaming functions are not eligible for TCO as they may require
9016 // the streaming mode or ZA/ZT0 to be restored after returning from the call.
9017 SMECallAttrs CallAttrs =
9018 getSMECallAttrs(CallerF, getRuntimeLibcallsInfo(), CLI);
9019 if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() ||
9020 CallAttrs.requiresPreservingAllZAState() ||
9021 CallAttrs.requiresPreservingZT0() ||
9022 CallAttrs.caller().hasStreamingBody())
9023 return false;
9024
9025 // Functions using the C or Fast calling convention that have an SVE signature
9026 // preserve more registers and should assume the SVE_VectorCall CC.
9027 // The check for matching callee-saved regs will determine whether it is
9028 // eligible for TCO.
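// For example (illustrative): a caller whose own signature takes or returns
// scalable vectors (e.g. <vscale x 4 x i32>) under the default C calling
// convention is flagged isSVECC() in LowerFormalArguments, so its preserved
// register set is compared here as if it used AArch64_SVE_VectorCall.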
9029 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
9030 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
9031 CallerCC = CallingConv::AArch64_SVE_VectorCall;
9032
9033 bool CCMatch = CallerCC == CalleeCC;
9034
9035 // When using the Windows calling convention on a non-Windows OS, we want
9036 // to back up and restore X18 in such functions; we can't do a tail call
9037 // from those functions.
9038 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
9039 CalleeCC != CallingConv::Win64)
9040 return false;
9041
9042 // Byval parameters hand the function a pointer directly into the stack area
9043 // we want to reuse during a tail call. Working around this *is* possible (see
9044 // X86) but less efficient and uglier in LowerCall.
9045 for (Function::const_arg_iterator i = CallerF.arg_begin(),
9046 e = CallerF.arg_end();
9047 i != e; ++i) {
9048 if (i->hasByValAttr())
9049 return false;
9050
9051 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
9052 // In this case, it is necessary to save X0/X1 in the callee and return it
9053 // in X0. Tail call opt may interfere with this, so we disable tail call
9054 // opt when the caller has an "inreg" attribute -- except if the callee
9055 // also has that attribute on the same argument, and the same value is
9056 // passed.
9057 if (i->hasInRegAttr()) {
9058 unsigned ArgIdx = i - CallerF.arg_begin();
9059 if (!CLI.CB || CLI.CB->arg_size() <= ArgIdx)
9060 return false;
9061 AttributeSet Attrs = CLI.CB->getParamAttributes(ArgIdx);
9062 if (!Attrs.hasAttribute(Attribute::InReg) ||
9063 !Attrs.hasAttribute(Attribute::StructRet) || !i->hasStructRetAttr() ||
9064 CLI.CB->getArgOperand(ArgIdx) != i) {
9065 return false;
9066 }
9067 }
9068 }
9069
9070 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
9071 return CCMatch;
9072
9073 // Externally-defined functions with weak linkage should not be
9074 // tail-called on AArch64 when the OS does not support dynamic
9075 // pre-emption of symbols, as the AAELF spec requires normal calls
9076 // to undefined weak functions to be replaced with a NOP or jump to the
9077 // next instruction. The behaviour of branch instructions in this
9078 // situation (as used for tail calls) is implementation-defined, so we
9079 // cannot rely on the linker replacing the tail call with a return.
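// For example (illustrative): given a declaration like
//   extern void maybe_missing() __attribute__((weak));
// a call to maybe_missing() in tail position is still emitted as an ordinary
// BL rather than a tail-call branch on ELF targets, for the reason above.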
9080 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9081 const GlobalValue *GV = G->getGlobal();
9082 const Triple &TT = getTargetMachine().getTargetTriple();
9083 if (GV->hasExternalWeakLinkage() &&
9084 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
9085 return false;
9086 }
9087
9088 // Now we search for cases where we can use a tail call without changing the
9089 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
9090 // concept.
9091
9092 // I want anyone implementing a new calling convention to think long and hard
9093 // about this assert.
9094 if (IsVarArg && !callConvSupportsVarArgs(CalleeCC))
9095 report_fatal_error("Unsupported variadic calling convention");
9096
9097 LLVMContext &C = *DAG.getContext();
9098 // Check that the call results are passed in the same way.
9099 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
9100 CCAssignFnForCall(CalleeCC, IsVarArg),
9101 CCAssignFnForCall(CallerCC, IsVarArg)))
9102 return false;
9103 // The callee has to preserve all registers the caller needs to preserve.
9104 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9105 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
9106 if (!CCMatch) {
9107 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
9108 if (Subtarget->hasCustomCallingConv()) {
9109 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
9110 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
9111 }
9112 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
9113 return false;
9114 }
9115
9116 // Nothing more to check if the callee is taking no arguments
9117 if (Outs.empty())
9118 return true;
9119
9120 SmallVector<CCValAssign, 16> ArgLocs;
9121 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
9122
9123 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9124
9125 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
9126 // When we are musttail, additional checks have been done and we can safely ignore this check.
9127 // At least two cases here: if caller is fastcc then we can't have any
9128 // memory arguments (we'd be expected to clean up the stack afterwards). If
9129 // caller is C then we could potentially use its argument area.
9130
9131 // FIXME: for now we take the most conservative of these in both cases:
9132 // disallow all variadic memory operands.
9133 for (const CCValAssign &ArgLoc : ArgLocs)
9134 if (!ArgLoc.isRegLoc())
9135 return false;
9136 }
9137
9138 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9139
9140 // If any of the arguments is passed indirectly, it must be SVE, so the
9141 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
9142 // allocate space on the stack. That is why we determine explicitly here that
9143 // such a call cannot be a tail call.
9144 if (llvm::any_of(ArgLocs, [&](CCValAssign &A) {
9145 assert((A.getLocInfo() != CCValAssign::Indirect ||
9146 A.getValVT().isScalableVector() ||
9147 Subtarget->isWindowsArm64EC()) &&
9148 "Expected value to be scalable");
9149 return A.getLocInfo() == CCValAssign::Indirect;
9150 }))
9151 return false;
9152
9153 // If the stack arguments for this call do not fit into our own save area then
9154 // the call cannot be made tail.
9155 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
9156 return false;
9157
9158 const MachineRegisterInfo &MRI = MF.getRegInfo();
9159 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
9160 return false;
9161
9162 return true;
9163}
9164
9165SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
9166 SelectionDAG &DAG,
9167 MachineFrameInfo &MFI,
9168 int ClobberedFI) const {
9169 SmallVector<SDValue, 8> ArgChains;
9170 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
9171 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
9172
9173 // Include the original chain at the beginning of the list. When this is
9174 // used by target LowerCall hooks, this helps legalize find the
9175 // CALLSEQ_BEGIN node.
9176 ArgChains.push_back(Chain);
9177
9178 // Add a chain value for each stack argument corresponding
9179 for (SDNode *U : DAG.getEntryNode().getNode()->users())
9180 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
9181 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
9182 if (FI->getIndex() < 0) {
9183 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
9184 int64_t InLastByte = InFirstByte;
9185 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
9186
9187 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
9188 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
9189 ArgChains.push_back(SDValue(L, 1));
9190 }
9191
9192 // Build a tokenfactor for all the chains.
9193 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
9194}
9195
9196bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
9197 bool TailCallOpt) const {
9198 return (CallCC == CallingConv::Fast && TailCallOpt) ||
9199 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
9200}
9201
9202// Check if the value is zero-extended from i1 to i8
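// A minimal sketch of the check below: with RequiredZero = 0xFE, the argument
// is accepted only if bits [7:1] are known to be zero, i.e. the value is
// provably 0 or 1, in which case LowerCall can skip the explicit
// truncate-to-i1 + zero-extend-to-i8 sequence required by the AAPCS.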
9203static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
9204 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
9205 if (SizeInBits < 8)
9206 return false;
9207
9208 APInt RequiredZero(SizeInBits, 0xFE);
9209 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
9210 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
9211 return ZExtBool;
9212}
9213
9214void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9215 SDNode *Node) const {
9216 // Live-in physreg copies that are glued to SMSTART are applied as
9217 // implicit-def's in the InstrEmitter. Here we remove them, allowing the
9218 // register allocator to pass call args in callee saved regs, without extra
9219 // copies to avoid these fake clobbers of actually-preserved GPRs.
9220 if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
9221 MI.getOpcode() == AArch64::MSRpstatePseudo) {
9222 for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
9223 if (MachineOperand &MO = MI.getOperand(I);
9224 MO.isReg() && MO.isImplicit() && MO.isDef() &&
9225 (AArch64::GPR32RegClass.contains(MO.getReg()) ||
9226 AArch64::GPR64RegClass.contains(MO.getReg())))
9227 MI.removeOperand(I);
9228
9229 // The SVE vector length can change when entering/leaving streaming mode.
9230 // FPMR is set to 0 when entering/leaving streaming mode.
9231 if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
9232 MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
9233 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9234 /*IsImplicit=*/true));
9235 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
9236 /*IsImplicit=*/true));
9237 MI.addOperand(MachineOperand::CreateReg(AArch64::FPMR, /*IsDef=*/true,
9238 /*IsImplicit=*/true));
9239 }
9240 }
9241
9242 // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
9243 // have nothing to do with VG, were it not that they are used to materialise a
9244 // frame-address. If they contain a frame-index to a scalable vector, this
9245 // will likely require an ADDVL instruction to materialise the address, thus
9246 // reading VG.
9247 const MachineFunction &MF = *MI.getMF();
9248 if (MF.getInfo<AArch64FunctionInfo>()->hasStreamingModeChanges() &&
9249 (MI.getOpcode() == AArch64::ADDXri ||
9250 MI.getOpcode() == AArch64::SUBXri)) {
9251 const MachineOperand &MO = MI.getOperand(1);
9252 if (MO.isFI() && MF.getFrameInfo().hasScalableStackID(MO.getIndex()))
9253 MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
9254 /*IsImplicit=*/true));
9255 }
9256}
9257
9258 SDValue AArch64TargetLowering::changeStreamingMode(
9259 SelectionDAG &DAG, SDLoc DL, bool Enable, SDValue Chain, SDValue InGlue,
9260 unsigned Condition, bool InsertVectorLengthCheck) const {
9261 MachineFunction &MF = DAG.getMachineFunction();
9262 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9263 FuncInfo->setHasStreamingModeChanges(true);
9264
9265 auto GetCheckVL = [&](SDValue Chain, SDValue InGlue = SDValue()) -> SDValue {
9266 SmallVector<SDValue, 2> Ops = {Chain};
9267 if (InGlue)
9268 Ops.push_back(InGlue);
9269 return DAG.getNode(AArch64ISD::CHECK_MATCHING_VL, DL,
9270 DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9271 };
9272
9273 if (InsertVectorLengthCheck && Enable) {
9274 // Non-streaming -> Streaming
9275 // Insert vector length check before smstart
9276 SDValue CheckVL = GetCheckVL(Chain, InGlue);
9277 Chain = CheckVL.getValue(0);
9278 InGlue = CheckVL.getValue(1);
9279 }
9280
9281 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
9282 SDValue RegMask = DAG.getRegisterMask(TRI->getSMStartStopCallPreservedMask());
9283 SDValue MSROp =
9284 DAG.getTargetConstant((int32_t)AArch64SVCR::SVCRSM, DL, MVT::i32);
9285 SmallVector<SDValue> Ops = {Chain, MSROp};
9286 unsigned Opcode;
9287 if (Condition != AArch64SME::Always) {
9288 Register PStateReg = FuncInfo->getPStateSMReg();
9289 assert(PStateReg.isValid() && "PStateSM Register is invalid");
9290 SDValue PStateSM =
9291 DAG.getCopyFromReg(Chain, DL, PStateReg, MVT::i64, InGlue);
9292 // Use chain and glue from the CopyFromReg.
9293 Ops[0] = PStateSM.getValue(1);
9294 InGlue = PStateSM.getValue(2);
9295 SDValue ConditionOp = DAG.getTargetConstant(Condition, DL, MVT::i64);
9296 Opcode = Enable ? AArch64ISD::COND_SMSTART : AArch64ISD::COND_SMSTOP;
9297 Ops.push_back(ConditionOp);
9298 Ops.push_back(PStateSM);
9299 } else {
9300 Opcode = Enable ? AArch64ISD::SMSTART : AArch64ISD::SMSTOP;
9301 }
9302 Ops.push_back(RegMask);
9303
9304 if (InGlue)
9305 Ops.push_back(InGlue);
9306
9307 SDValue SMChange =
9308 DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
9309
9310 if (!InsertVectorLengthCheck || Enable)
9311 return SMChange;
9312
9313 // Streaming -> Non-streaming
9314 // Insert vector length check after smstop since we cannot read VL
9315 // in streaming mode
9316 return GetCheckVL(SMChange.getValue(0), SMChange.getValue(1));
9317}
9318
9321 if (!CallAttrs.caller().hasStreamingCompatibleInterface() ||
9322 CallAttrs.caller().hasStreamingBody())
9323 return AArch64SME::Always;
9324 if (CallAttrs.callee().hasNonStreamingInterface())
9325 return AArch64SME::IfCallerIsStreaming;
9326 if (CallAttrs.callee().hasStreamingInterface())
9327 return AArch64SME::IfCallerIsNonStreaming;
9328
9329 llvm_unreachable("Unsupported attributes");
9330}
9331
9332/// Check whether a stack argument requires lowering in a tail call.
9333 static bool shouldLowerTailCallStackArg(const MachineFunction &MF,
9334 const CCValAssign &VA, SDValue Arg,
9335 ISD::ArgFlagsTy Flags, int CallOffset) {
9336 // FIXME: We should be able to handle this case, but it's not clear how to.
9337 if (Flags.isZExt() || Flags.isSExt())
9338 return true;
9339
9340 for (;;) {
9341 // Look through nodes that don't alter the bits of the incoming value.
9342 unsigned Op = Arg.getOpcode();
9343 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST ||
9344 Arg->isAssert() || Op == AArch64ISD::ASSERT_ZEXT_BOOL) {
9345 Arg = Arg.getOperand(0);
9346 continue;
9347 }
9348 break;
9349 }
9350
9351 // If the argument is a load from the same immutable stack slot, we can reuse
9352 // it.
9353 if (auto *LoadNode = dyn_cast<LoadSDNode>(Arg)) {
9354 if (auto *FINode = dyn_cast<FrameIndexSDNode>(LoadNode->getBasePtr())) {
9355 const MachineFrameInfo &MFI = MF.getFrameInfo();
9356 int FI = FINode->getIndex();
9357 if (!MFI.isImmutableObjectIndex(FI))
9358 return true;
9359 if (CallOffset != MFI.getObjectOffset(FI))
9360 return true;
9361 uint64_t SizeInBits = LoadNode->getMemoryVT().getFixedSizeInBits();
9362 if (SizeInBits / 8 != static_cast<uint64_t>(MFI.getObjectSize(FI)))
9363 return true;
9364 return false;
9365 }
9366 }
9367
9368 return true;
9369}
9370
9371/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
9372/// and add input and output parameter nodes.
9373SDValue
9374AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
9375 SmallVectorImpl<SDValue> &InVals) const {
9376 SelectionDAG &DAG = CLI.DAG;
9377 SDLoc &DL = CLI.DL;
9378 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
9379 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
9380 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
9381 SDValue Chain = CLI.Chain;
9382 SDValue Callee = CLI.Callee;
9383 bool &IsTailCall = CLI.IsTailCall;
9384 CallingConv::ID &CallConv = CLI.CallConv;
9385 bool IsVarArg = CLI.IsVarArg;
9386 const CallBase *CB = CLI.CB;
9387
9388 MachineFunction &MF = DAG.getMachineFunction();
9389 MachineFunction::CallSiteInfo CSInfo;
9390 bool IsThisReturn = false;
9391
9392 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
9393 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
9394 bool IsCFICall = CLI.CB && CLI.CB->isIndirectCall() && CLI.CFIType;
9395 bool IsSibCall = false;
9396 bool GuardWithBTI = false;
9397
9398 if (CLI.CB && CLI.CB->hasFnAttr(Attribute::ReturnsTwice) &&
9399 !Subtarget->noBTIAtReturnTwice()) {
9400 GuardWithBTI = FuncInfo->branchTargetEnforcement();
9401 }
9402
9403 // Analyze operands of the call, assigning locations to each operand.
9404 SmallVector<CCValAssign, 16> ArgLocs;
9405 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
9406
9407 if (IsVarArg) {
9408 unsigned NumArgs = Outs.size();
9409
9410 for (unsigned i = 0; i != NumArgs; ++i) {
9411 if (Outs[i].Flags.isVarArg() && Outs[i].VT.isScalableVector())
9412 report_fatal_error("Passing SVE types to variadic functions is "
9413 "currently not supported");
9414 }
9415 }
9416
9417 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
9418
9419 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
9420 // Assign locations to each value returned by this call.
9421 SmallVector<CCValAssign, 16> RVLocs;
9422 CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
9423 *DAG.getContext());
9424 RetCCInfo.AnalyzeCallResult(Ins, RetCC);
9425
9426 // Set type id for call site info.
9427 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
9428 CSInfo = MachineFunction::CallSiteInfo(*CB);
9429
9430 // Check callee args/returns for SVE registers and set calling convention
9431 // accordingly.
9432 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
9433 auto HasSVERegLoc = [](CCValAssign &Loc) {
9434 if (!Loc.isRegLoc())
9435 return false;
9436 return AArch64::ZPRRegClass.contains(Loc.getLocReg()) ||
9437 AArch64::PPRRegClass.contains(Loc.getLocReg());
9438 };
9439 if (any_of(RVLocs, HasSVERegLoc) || any_of(ArgLocs, HasSVERegLoc))
9440 CallConv = CallingConv::AArch64_SVE_VectorCall;
9441 }
9442
9443 // Determine whether we need any streaming mode changes.
9444 SMECallAttrs CallAttrs =
9445 getSMECallAttrs(MF.getFunction(), getRuntimeLibcallsInfo(), CLI);
9446
9447 std::optional<unsigned> ZAMarkerNode;
9448 bool UseNewSMEABILowering = getTM().useNewSMEABILowering();
9449
9450 if (UseNewSMEABILowering) {
9451 if (CallAttrs.requiresLazySave() ||
9452 CallAttrs.requiresPreservingAllZAState())
9453 ZAMarkerNode = AArch64ISD::REQUIRES_ZA_SAVE;
9454 else if (CallAttrs.caller().hasZAState() ||
9455 CallAttrs.caller().hasZT0State())
9456 ZAMarkerNode = AArch64ISD::INOUT_ZA_USE;
9457 }
9458
9459 if (IsTailCall) {
9460 // Check if it's really possible to do a tail call.
9461 IsTailCall = isEligibleForTailCallOptimization(CLI);
9462
9463 // A sibling call is one where we're under the usual C ABI and not planning
9464 // to change that but can still do a tail call:
9465 if (!ZAMarkerNode && !TailCallOpt && IsTailCall &&
9466 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
9467 IsSibCall = true;
9468
9469 if (IsTailCall)
9470 ++NumTailCalls;
9471 }
9472
9473 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
9474 report_fatal_error("failed to perform tail call elimination on a call "
9475 "site marked musttail");
9476
9477 // Get a count of how many bytes are to be pushed on the stack.
9478 unsigned NumBytes = CCInfo.getStackSize();
9479
9480 if (IsSibCall) {
9481 // Since we're not changing the ABI to make this a tail call, the memory
9482 // operands are already available in the caller's incoming argument space.
9483 NumBytes = 0;
9484 }
9485
9486 // FPDiff is the byte offset of the call's argument area from the callee's.
9487 // Stores to callee stack arguments will be placed in FixedStackSlots offset
9488 // by this amount for a tail call. In a sibling call it must be 0 because the
9489 // caller will deallocate the entire stack and the callee still expects its
9490 // arguments to begin at SP+0. Completely unused for non-tail calls.
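// Illustrative arithmetic (not from the source): if the caller reserved 32
// bytes of incoming stack arguments (NumReusableBytes = 32) and this tail
// call needs 48 bytes after 16-byte alignment, then FPDiff = 32 - 48 = -16
// and TailCallReservedStack is raised to at least 16 below.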
9491 int FPDiff = 0;
9492
9493 if (IsTailCall && !IsSibCall) {
9494 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
9495
9496 // Since callee will pop argument stack as a tail call, we must keep the
9497 // popped size 16-byte aligned.
9498 NumBytes = alignTo(NumBytes, 16);
9499
9500 // FPDiff will be negative if this tail call requires more space than we
9501 // would automatically have in our incoming argument space. Positive if we
9502 // can actually shrink the stack.
9503 FPDiff = NumReusableBytes - NumBytes;
9504
9505 // Update the required reserved area if this is the tail call requiring the
9506 // most argument stack space.
9507 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
9508 FuncInfo->setTailCallReservedStack(-FPDiff);
9509
9510 // The stack pointer must be 16-byte aligned at all times it's used for a
9511 // memory operation, which in practice means at *all* times and in
9512 // particular across call boundaries. Therefore our own arguments started at
9513 // a 16-byte aligned SP and the delta applied for the tail call should
9514 // satisfy the same constraint.
9515 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
9516 }
9517
9518 auto DescribeCallsite =
9519 [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & {
9520 R << "call from '" << ore::NV("Caller", MF.getName()) << "' to '";
9521 if (auto *ES = dyn_cast<ExternalSymbolSDNode>(CLI.Callee))
9522 R << ore::NV("Callee", ES->getSymbol());
9523 else if (CLI.CB && CLI.CB->getCalledFunction())
9524 R << ore::NV("Callee", CLI.CB->getCalledFunction()->getName());
9525 else
9526 R << "unknown callee";
9527 R << "'";
9528 return R;
9529 };
9530
9531 bool RequiresLazySave = !UseNewSMEABILowering && CallAttrs.requiresLazySave();
9532 bool RequiresSaveAllZA =
9533 !UseNewSMEABILowering && CallAttrs.requiresPreservingAllZAState();
9534 if (RequiresLazySave) {
9535 TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
9536 SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
9537 TPIDR2.FrameIndex,
9538 DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9539 Chain = DAG.getNode(
9540 ISD::INTRINSIC_VOID, DL, MVT::Other, Chain,
9541 DAG.getTargetConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
9542 TPIDR2ObjAddr);
9543 OptimizationRemarkEmitter ORE(&MF.getFunction());
9544 ORE.emit([&]() {
9545 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9546 CLI.CB)
9547 : OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
9548 &MF.getFunction());
9549 return DescribeCallsite(R) << " sets up a lazy save for ZA";
9550 });
9551 } else if (RequiresSaveAllZA) {
9552 assert(!CallAttrs.callee().hasSharedZAInterface() &&
9553 "Cannot share state that may not exist");
9554 Chain = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Chain,
9555 /*IsSave=*/true);
9556 }
9557
9558 bool RequiresSMChange = CallAttrs.requiresSMChange();
9559 if (RequiresSMChange) {
9560 OptimizationRemarkEmitter ORE(&MF.getFunction());
9561 ORE.emit([&]() {
9562 auto R = CLI.CB ? OptimizationRemarkAnalysis("sme", "SMETransition",
9563 CLI.CB)
9564 : OptimizationRemarkAnalysis("sme", "SMETransition",
9565 &MF.getFunction());
9566 DescribeCallsite(R) << " requires a streaming mode transition";
9567 return R;
9568 });
9569 }
9570
9571 SDValue ZTFrameIdx;
9572 MachineFrameInfo &MFI = MF.getFrameInfo();
9573 bool ShouldPreserveZT0 = CallAttrs.requiresPreservingZT0();
9574
9575 // If the caller has ZT0 state which will not be preserved by the callee,
9576 // spill ZT0 before the call.
9577 if (ShouldPreserveZT0) {
9578 ZTFrameIdx = getZT0FrameIndex(MFI, *FuncInfo, DAG);
9579
9580 Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other),
9581 {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
9582 }
9583
9584 // If the caller shares ZT0 but the callee does not share ZA, we need to stop
9585 // PSTATE.ZA before the call if there is no lazy-save active.
9586 bool DisableZA = CallAttrs.requiresDisablingZABeforeCall();
9587 assert((!DisableZA || !RequiresLazySave) &&
9588 "Lazy-save should have PSTATE.SM=1 on entry to the function");
9589
9590 if (DisableZA)
9591 Chain = DAG.getNode(
9592 AArch64ISD::SMSTOP, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain,
9593 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
9594
9595 // Adjust the stack pointer for the new arguments... and mark ZA uses.
9596 // These operations are automatically eliminated by the prolog/epilog pass
9597 assert((!IsSibCall || !ZAMarkerNode) && "ZA markers require CALLSEQ_START");
9598 if (!IsSibCall) {
9599 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
9600 if (ZAMarkerNode) {
9601 // Note: We need the CALLSEQ_START to glue the ZAMarkerNode to; simply
9602 // using a chain can result in incorrect scheduling. The markers refer to
9603 // the position just before the CALLSEQ_START (though occur after as
9604 // CALLSEQ_START lacks in-glue).
9605 Chain =
9606 DAG.getNode(*ZAMarkerNode, DL, DAG.getVTList(MVT::Other, MVT::Glue),
9607 {Chain, Chain.getValue(1)});
9608 }
9609 }
9610
9611 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
9612 getPointerTy(DAG.getDataLayout()));
9613
9614 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
9615 SmallSet<unsigned, 8> RegsUsed;
9616 SmallVector<SDValue, 8> MemOpChains;
9617 auto PtrVT = getPointerTy(DAG.getDataLayout());
9618
9619 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
9620 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
9621 for (const auto &F : Forwards) {
9622 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
9623 RegsToPass.emplace_back(F.PReg, Val);
9624 }
9625 }
9626
9627 // Walk the register/memloc assignments, inserting copies/loads.
9628 unsigned ExtraArgLocs = 0;
9629 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
9630 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
9631 SDValue Arg = OutVals[i];
9632 ISD::ArgFlagsTy Flags = Outs[i].Flags;
9633
9634 // Promote the value if needed.
9635 switch (VA.getLocInfo()) {
9636 default:
9637 llvm_unreachable("Unknown loc info!");
9638 case CCValAssign::Full:
9639 break;
9640 case CCValAssign::SExt:
9641 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
9642 break;
9643 case CCValAssign::ZExt:
9644 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
9645 break;
9646 case CCValAssign::AExt:
9647 if (Outs[i].ArgVT == MVT::i1) {
9648 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
9649 //
9650 // Check if we actually have to do this, because the value may
9651 // already be zero-extended.
9652 //
9653 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
9654 // and rely on DAGCombiner to fold this, because the following
9655 // (anyext i32) is combined with (zext i8) in DAG.getNode:
9656 //
9657 // (ext (zext x)) -> (zext x)
9658 //
9659 // This will give us (zext i32), which we cannot remove, so
9660 // try to check this beforehand.
9661 if (!checkZExtBool(Arg, DAG)) {
9662 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
9663 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
9664 }
9665 }
9666 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9667 break;
9668 case CCValAssign::AExtUpper:
9669 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
9670 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
9671 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
9672 DAG.getConstant(32, DL, VA.getLocVT()));
9673 break;
9674 case CCValAssign::BCvt:
9675 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
9676 break;
9677 case CCValAssign::Trunc:
9678 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
9679 break;
9680 case CCValAssign::FPExt:
9681 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
9682 break;
9683 case CCValAssign::Indirect: {
9684 bool isScalable = VA.getValVT().isScalableVT();
9685 assert((isScalable || Subtarget->isWindowsArm64EC()) &&
9686 "Indirect arguments should be scalable on most subtargets");
9687
9688 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinValue();
9689 uint64_t PartSize = StoreSize;
9690 unsigned NumParts = 1;
9691 if (Outs[i].Flags.isInConsecutiveRegs()) {
9692 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
9693 ++NumParts;
9694 StoreSize *= NumParts;
9695 }
9696
9697 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
9698 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
9699 MachineFrameInfo &MFI = MF.getFrameInfo();
9700 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
9701 if (isScalable) {
9702 bool IsPred = VA.getValVT() == MVT::aarch64svcount ||
9703 VA.getValVT().getVectorElementType() == MVT::i1;
9704 MFI.setStackID(FI, IsPred ? TargetStackID::ScalablePredicateVector
9705 : TargetStackID::ScalableVector);
9706 }
9707
9708 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
9709 SDValue Ptr = DAG.getFrameIndex(
9710 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
9711 SDValue SpillSlot = Ptr;
9712
9713 // Ensure we generate all stores for each tuple part, whilst updating the
9714 // pointer after each store correctly using vscale.
9715 while (NumParts) {
9716 SDValue Store = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
9717 MemOpChains.push_back(Store);
9718
9719 NumParts--;
9720 if (NumParts > 0) {
9721 SDValue BytesIncrement;
9722 if (isScalable) {
9723 BytesIncrement = DAG.getVScale(
9724 DL, Ptr.getValueType(),
9725 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize));
9726 } else {
9727 BytesIncrement = DAG.getConstant(
9728 APInt(Ptr.getValueSizeInBits().getFixedValue(), PartSize), DL,
9729 Ptr.getValueType());
9730 }
9731 MPI = MachinePointerInfo(MPI.getAddrSpace());
9732 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9733 BytesIncrement, SDNodeFlags::NoUnsignedWrap);
9734 ExtraArgLocs++;
9735 i++;
9736 }
9737 }
9738
9739 Arg = SpillSlot;
9740 break;
9741 }
9742
9743 if (VA.isRegLoc()) {
9744 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
9745 Outs[0].VT == MVT::i64) {
9746 assert(VA.getLocVT() == MVT::i64 &&
9747 "unexpected calling convention register assignment");
9748 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
9749 "unexpected use of 'returned'");
9750 IsThisReturn = true;
9751 }
9752 if (RegsUsed.count(VA.getLocReg())) {
9753 // If this register has already been used then we're trying to pack
9754 // parts of an [N x i32] into an X-register. The extension type will
9755 // take care of putting the two halves in the right place but we have to
9756 // combine them.
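// Illustrative example (not from the original source): when a [2 x i32]
// argument is packed into x0, the low half arrives as-is and the high half
// was shifted left by 32 in the AExtUpper case above, so the OR below
// reassembles the full 64-bit register value.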
9757 SDValue &Bits =
9758 llvm::find_if(RegsToPass,
9759 [=](const std::pair<unsigned, SDValue> &Elt) {
9760 return Elt.first == VA.getLocReg();
9761 })
9762 ->second;
9763 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
9764 // Call site info is used for a function's parameter entry value
9765 // tracking. For now we track only the simple cases in which a parameter
9766 // is transferred through a whole register.
9767 llvm::erase_if(CSInfo.ArgRegPairs,
9768 [&VA](MachineFunction::ArgRegPair ArgReg) {
9769 return ArgReg.Reg == VA.getLocReg();
9770 });
9771 } else {
9772 // Add an extra level of indirection for streaming mode changes by
9773 // using a pseudo copy node that cannot be rematerialised between a
9774 // smstart/smstop and the call by the simple register coalescer.
9775 if (RequiresSMChange && isPassedInFPR(Arg.getValueType()))
9776 Arg = DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
9777 DAG.getVTList(Arg.getValueType(), MVT::Glue), Arg);
9778 RegsToPass.emplace_back(VA.getLocReg(), Arg);
9779 RegsUsed.insert(VA.getLocReg());
9780 const TargetOptions &Options = DAG.getTarget().Options;
9781 if (Options.EmitCallSiteInfo)
9782 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
9783 }
9784 } else {
9785 assert(VA.isMemLoc());
9786
9787 SDValue DstAddr;
9788 MachinePointerInfo DstInfo;
9789
9790 // FIXME: This works on big-endian for composite byvals, which are the
9791 // common case. It should also work for fundamental types.
9792 uint32_t BEAlign = 0;
9793 unsigned OpSize;
9794 if (VA.getLocInfo() == CCValAssign::Indirect ||
9795 VA.getLocInfo() == CCValAssign::Trunc)
9796 OpSize = VA.getLocVT().getFixedSizeInBits();
9797 else
9798 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
9799 : VA.getValVT().getSizeInBits();
9800 OpSize = (OpSize + 7) / 8;
9801 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
9802 !Flags.isInConsecutiveRegs()) {
9803 if (OpSize < 8)
9804 BEAlign = 8 - OpSize;
9805 }
9806 unsigned LocMemOffset = VA.getLocMemOffset();
9807 int32_t Offset = LocMemOffset + BEAlign;
9808
9809 if (IsTailCall) {
9810 // When the frame pointer is perfectly aligned for the tail call and the
9811 // same stack argument is passed down intact, we can reuse it.
9812 if (!FPDiff && !shouldLowerTailCallStackArg(MF, VA, Arg, Flags, Offset))
9813 continue;
9814
9815 Offset = Offset + FPDiff;
9816 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
9817
9818 DstAddr = DAG.getFrameIndex(FI, PtrVT);
9819 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
9820
9821 // Make sure any stack arguments overlapping with where we're storing
9822 // are loaded before this eventual operation. Otherwise they'll be
9823 // clobbered.
9824 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
9825 } else {
9826 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
9827
9828 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
9829 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
9830 }
9831
9832 if (Outs[i].Flags.isByVal()) {
9833 SDValue SizeNode =
9834 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
9835 SDValue Cpy = DAG.getMemcpy(
9836 Chain, DL, DstAddr, Arg, SizeNode,
9837 Outs[i].Flags.getNonZeroByValAlign(),
9838 /*isVol = */ false, /*AlwaysInline = */ false,
9839 /*CI=*/nullptr, std::nullopt, DstInfo, MachinePointerInfo());
9840
9841 MemOpChains.push_back(Cpy);
9842 } else {
9843 // Since we pass i1/i8/i16 as i1/i8/i16 on the stack and Arg is already
9844 // promoted to the legal register type i32, we should truncate Arg back to
9845 // i1/i8/i16.
9846 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
9847 VA.getValVT() == MVT::i16)
9848 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
9849
9850 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
9851 MemOpChains.push_back(Store);
9852 }
9853 }
9854 }
9855
9856 if (IsVarArg && Subtarget->isWindowsArm64EC() &&
9857 !(CLI.CB && CLI.CB->isMustTailCall())) {
9858 SDValue ParamPtr = StackPtr;
9859 if (IsTailCall) {
9860 // Create a dummy object at the top of the stack that can be used to get
9861 // the SP after the epilogue
9862 int FI = MF.getFrameInfo().CreateFixedObject(1, FPDiff, true);
9863 ParamPtr = DAG.getFrameIndex(FI, PtrVT);
9864 }
9865
9866 // For vararg calls, the Arm64EC ABI requires values in x4 and x5
9867 // describing the argument list. x4 contains the address of the
9868 // first stack parameter. x5 contains the size in bytes of all parameters
9869 // passed on the stack.
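// Illustrative example (not from the original source): for a varargs
// Arm64EC call that passes 32 bytes of arguments on the stack, x4 points at
// the first stack argument (or the dummy frame object for tail calls) and
// x5 holds the constant 32, i.e. the NumBytes value used below.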
9870 RegsToPass.emplace_back(AArch64::X4, ParamPtr);
9871 RegsToPass.emplace_back(AArch64::X5,
9872 DAG.getConstant(NumBytes, DL, MVT::i64));
9873 }
9874
9875 if (!MemOpChains.empty())
9876 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
9877
9878 SDValue InGlue;
9879 if (RequiresSMChange) {
9880 bool InsertVectorLengthCheck =
9882 Chain = changeStreamingMode(
9883 DAG, DL, CallAttrs.callee().hasStreamingInterface(), Chain, InGlue,
9884 getSMToggleCondition(CallAttrs), InsertVectorLengthCheck);
9885 InGlue = Chain.getValue(1);
9886 }
9887
9888 // Build a sequence of copy-to-reg nodes chained together with token chain
9889 // and flag operands which copy the outgoing args into the appropriate regs.
9890 for (auto &RegToPass : RegsToPass) {
9891 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
9892 RegToPass.second, InGlue);
9893 InGlue = Chain.getValue(1);
9894 }
9895
9896 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
9897 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
9898 // node so that legalize doesn't hack it.
9899 const GlobalValue *CalledGlobal = nullptr;
9900 unsigned OpFlags = 0;
9901 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
9902 CalledGlobal = G->getGlobal();
9903 OpFlags = Subtarget->classifyGlobalFunctionReference(CalledGlobal,
9904 getTargetMachine());
9905 if (OpFlags & AArch64II::MO_GOT) {
9906 Callee = DAG.getTargetGlobalAddress(CalledGlobal, DL, PtrVT, 0, OpFlags);
9907 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9908 } else {
9909 const GlobalValue *GV = G->getGlobal();
9910 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
9911 }
9912 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
9913 bool UseGot = (getTargetMachine().getCodeModel() == CodeModel::Large &&
9914 Subtarget->isTargetMachO()) ||
9915 MF.getFunction().getParent()->getRtLibUseGOT();
9916 const char *Sym = S->getSymbol();
9917 if (UseGot) {
9918 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
9919 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
9920 } else {
9921 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
9922 }
9923 }
9924
9925 // We don't usually want to end the call-sequence here because we would tidy
9926 // the frame up *after* the call; however, in the ABI-changing tail-call case
9927 // we've carefully laid out the parameters so that when sp is reset they'll be
9928 // in the correct location.
9929 if (IsTailCall && !IsSibCall) {
9930 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
9931 InGlue = Chain.getValue(1);
9932 }
9933
9934 unsigned Opc = IsTailCall ? AArch64ISD::TC_RETURN : AArch64ISD::CALL;
9935
9936 std::vector<SDValue> Ops;
9937 Ops.push_back(Chain);
9938 Ops.push_back(Callee);
9939
9940 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
9941 // be expanded to the call, directly followed by a special marker sequence and
9942 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
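// Illustrative expansion (not from the original source): a call lowered
// with CALL_RVMARKER typically ends up as something like
//   bl _fn
//   mov x29, x29 ; marker the ObjC runtime recognises
//   bl _objc_retainAutoreleasedReturnValue
// where the marker and the retain/claim call are controlled by the extra
// operands added below.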
9943 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
9944 assert(!IsTailCall &&
9945 "tail calls cannot be marked with clang.arc.attachedcall");
9946 Opc = AArch64ISD::CALL_RVMARKER;
9947
9948 // Add a target global address for the retainRV/claimRV runtime function
9949 // just before the call target.
9950 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
9951 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
9952 Ops.insert(Ops.begin() + 1, GA);
9953
9954 // We may or may not need to emit both the marker and the retain/claim call.
9955 // Tell the pseudo expansion using an additional boolean op.
9956 bool ShouldEmitMarker = objcarc::attachedCallOpBundleNeedsMarker(CLI.CB);
9957 SDValue DoEmitMarker =
9958 DAG.getTargetConstant(ShouldEmitMarker, DL, MVT::i32);
9959 Ops.insert(Ops.begin() + 2, DoEmitMarker);
9960 } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
9961 Opc = AArch64ISD::CALL_ARM64EC_TO_X64;
9962 } else if (GuardWithBTI) {
9963 Opc = AArch64ISD::CALL_BTI;
9964 }
9965
9966 if (IsTailCall) {
9967 // Each tail call may have to adjust the stack by a different amount, so
9968 // this information must travel along with the operation for eventual
9969 // consumption by emitEpilogue.
9970 Ops.push_back(DAG.getSignedTargetConstant(FPDiff, DL, MVT::i32));
9971 }
9972
9973 if (CLI.PAI) {
9974 const uint64_t Key = CLI.PAI->Key;
9976 "Invalid auth call key");
9977
9978 // Split the discriminator into address/integer components.
9979 SDValue AddrDisc, IntDisc;
9980 std::tie(IntDisc, AddrDisc) =
9981 extractPtrauthBlendDiscriminators(CLI.PAI->Discriminator, &DAG);
9982
9983 if (Opc == AArch64ISD::CALL_RVMARKER)
9984 Opc = AArch64ISD::AUTH_CALL_RVMARKER;
9985 else
9986 Opc = IsTailCall ? AArch64ISD::AUTH_TC_RETURN : AArch64ISD::AUTH_CALL;
9987 Ops.push_back(DAG.getTargetConstant(Key, DL, MVT::i32));
9988 Ops.push_back(IntDisc);
9989 Ops.push_back(AddrDisc);
9990 }
9991
9992 // Add argument registers to the end of the list so that they are known live
9993 // into the call.
9994 for (auto &RegToPass : RegsToPass)
9995 Ops.push_back(DAG.getRegister(RegToPass.first,
9996 RegToPass.second.getValueType()));
9997
9998 // Add a register mask operand representing the call-preserved registers.
9999 const uint32_t *Mask;
10000 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10001 if (IsThisReturn) {
10002 // For 'this' returns, use the X0-preserving mask if applicable
10003 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
10004 if (!Mask) {
10005 IsThisReturn = false;
10006 Mask = TRI->getCallPreservedMask(MF, CallConv);
10007 }
10008 } else
10009 Mask = TRI->getCallPreservedMask(MF, CallConv);
10010
10011 if (Subtarget->hasCustomCallingConv())
10012 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
10013
10014 if (TRI->isAnyArgRegReserved(MF))
10015 TRI->emitReservedArgRegCallError(MF);
10016
10017 assert(Mask && "Missing call preserved mask for calling convention");
10018 Ops.push_back(DAG.getRegisterMask(Mask));
10019
10020 if (InGlue.getNode())
10021 Ops.push_back(InGlue);
10022
10023 // If we're doing a tail call, use a TC_RETURN here rather than an
10024 // actual call instruction.
10025 if (IsTailCall) {
10026 MF.getFrameInfo().setHasTailCall();
10027 SDValue Ret = DAG.getNode(Opc, DL, MVT::Other, Ops);
10028 if (IsCFICall)
10029 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10030
10031 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
10032 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
10033 if (CalledGlobal &&
10034 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10035 DAG.addCalledGlobal(Ret.getNode(), CalledGlobal, OpFlags);
10036 return Ret;
10037 }
10038
10039 // Returns a chain and a flag for retval copy to use.
10040 Chain = DAG.getNode(Opc, DL, {MVT::Other, MVT::Glue}, Ops);
10041 if (IsCFICall)
10042 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
10043
10044 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
10045 InGlue = Chain.getValue(1);
10046 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
10047 if (CalledGlobal &&
10048 MF.getFunction().getParent()->getModuleFlag("import-call-optimization"))
10049 DAG.addCalledGlobal(Chain.getNode(), CalledGlobal, OpFlags);
10050
10051 uint64_t CalleePopBytes =
10052 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
10053
10054 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, DL);
10055 InGlue = Chain.getValue(1);
10056
10057 // Handle result values, copying them out of physregs into vregs that we
10058 // return.
10059 SDValue Result = LowerCallResult(
10060 Chain, InGlue, CallConv, IsVarArg, RVLocs, DL, DAG, InVals, IsThisReturn,
10061 IsThisReturn ? OutVals[0] : SDValue(), RequiresSMChange);
10062
10063 if (!Ins.empty())
10064 InGlue = Result.getValue(Result->getNumValues() - 1);
10065
10066 if (RequiresSMChange) {
10067 Result = changeStreamingMode(
10068 DAG, DL, !CallAttrs.callee().hasStreamingInterface(), Result, InGlue,
10069 getSMToggleCondition(CallAttrs));
10070 }
10071
10072 if (RequiresLazySave || CallAttrs.requiresEnablingZAAfterCall())
10073 // Unconditionally resume ZA.
10074 Result = DAG.getNode(
10075 AArch64ISD::SMSTART, DL, DAG.getVTList(MVT::Other, MVT::Glue), Result,
10076 DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32));
10077
10078 if (ShouldPreserveZT0)
10079 Result =
10080 DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other),
10081 {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx});
10082
10083 if (RequiresLazySave) {
10084 Result = emitRestoreZALazySave(Result, DL, *this, *TRI, *FuncInfo, DAG);
10085 } else if (RequiresSaveAllZA) {
10086 Result = emitSMEStateSaveRestore(*this, DAG, FuncInfo, DL, Result,
10087 /*IsSave=*/false);
10088 }
10089
10090 if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0 ||
10091 RequiresSaveAllZA) {
10092 for (unsigned I = 0; I < InVals.size(); ++I) {
10093 // The smstart/smstop is chained as part of the call, but when the
10094 // resulting chain is discarded (which happens when the call is not part
10095 // of a chain, e.g. a call to @llvm.cos()), we need to ensure the
10096 // smstart/smstop is chained to the result value. We can do that by doing
10097 // a vreg -> vreg copy.
10098 Register Reg = MF.getRegInfo().createVirtualRegister(
10099 getRegClassFor(InVals[I].getValueType().getSimpleVT()));
10100 SDValue X = DAG.getCopyToReg(Result, DL, Reg, InVals[I]);
10101 InVals[I] = DAG.getCopyFromReg(X, DL, Reg,
10102 InVals[I].getValueType());
10103 }
10104 }
10105
10106 if (CallConv == CallingConv::PreserveNone) {
10107 for (const ISD::OutputArg &O : Outs) {
10108 if (O.Flags.isSwiftSelf() || O.Flags.isSwiftError() ||
10109 O.Flags.isSwiftAsync()) {
10110 MachineFunction &MF = DAG.getMachineFunction();
10111 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10112 MF.getFunction(),
10113 "Swift attributes can't be used with preserve_none",
10114 DL.getDebugLoc()));
10115 break;
10116 }
10117 }
10118 }
10119
10120 return Result;
10121}
10122
10123bool AArch64TargetLowering::CanLowerReturn(
10124 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
10125 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
10126 const Type *RetTy) const {
10127 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10128 SmallVector<CCValAssign, 16> RVLocs;
10129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
10130 return CCInfo.CheckReturn(Outs, RetCC);
10131}
10132
10133SDValue
10134AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
10135 bool isVarArg,
10136 const SmallVectorImpl<ISD::OutputArg> &Outs,
10137 const SmallVectorImpl<SDValue> &OutVals,
10138 const SDLoc &DL, SelectionDAG &DAG) const {
10139 auto &MF = DAG.getMachineFunction();
10140 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10141
10142 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
10143 SmallVector<CCValAssign, 16> RVLocs;
10144 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
10145 CCInfo.AnalyzeReturn(Outs, RetCC);
10146
10147 // Copy the result values into the output registers.
10148 SDValue Glue;
10149 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
10150 SmallSet<unsigned, 4> RegsUsed;
10151 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
10152 ++i, ++realRVLocIdx) {
10153 CCValAssign &VA = RVLocs[i];
10154 assert(VA.isRegLoc() && "Can only return in registers!");
10155 SDValue Arg = OutVals[realRVLocIdx];
10156
10157 switch (VA.getLocInfo()) {
10158 default:
10159 llvm_unreachable("Unknown loc info!");
10160 case CCValAssign::Full:
10161 if (Outs[i].ArgVT == MVT::i1) {
10162 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
10163 // value. This is strictly redundant on Darwin (which uses "zeroext
10164 // i1"), but will be optimised out before ISel.
10165 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
10166 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
10167 }
10168 break;
10169 case CCValAssign::BCvt:
10170 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
10171 break;
10172 case CCValAssign::AExt:
10173 case CCValAssign::ZExt:
10174 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10175 break;
10176 case CCValAssign::AExtUpper:
10177 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
10178 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
10179 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
10180 DAG.getConstant(32, DL, VA.getLocVT()));
10181 break;
10182 }
10183
10184 if (RegsUsed.count(VA.getLocReg())) {
10185 SDValue &Bits =
10186 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
10187 return Elt.first == VA.getLocReg();
10188 })->second;
10189 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
10190 } else {
10191 RetVals.emplace_back(VA.getLocReg(), Arg);
10192 RegsUsed.insert(VA.getLocReg());
10193 }
10194 }
10195
10196 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10197
10198 // Emit SMSTOP before returning from a locally streaming function
10199 SMEAttrs FuncAttrs = FuncInfo->getSMEFnAttrs();
10200 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface()) {
10201 if (FuncAttrs.hasStreamingCompatibleInterface())
10202 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10203 /*Glue*/ SDValue(),
10205 else
10206 Chain = changeStreamingMode(DAG, DL, /*Enable*/ false, Chain,
10207 /*Glue*/ SDValue(), AArch64SME::Always);
10208 Glue = Chain.getValue(1);
10209 }
10210
10211 SmallVector<SDValue, 4> RetOps(1, Chain);
10212 for (auto &RetVal : RetVals) {
10213 if (FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface() &&
10214 isPassedInFPR(RetVal.second.getValueType()))
10215 RetVal.second =
10216 DAG.getNode(AArch64ISD::COALESCER_BARRIER, DL,
10217 DAG.getVTList(RetVal.second.getValueType(), MVT::Glue),
10218 RetVal.second);
10219 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Glue);
10220 Glue = Chain.getValue(1);
10221 RetOps.push_back(
10222 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
10223 }
10224
10225 // The Windows AArch64 ABIs require that, when returning a struct by value,
10226 // we copy the sret argument into X0 for the return.
10227 // We saved the argument into a virtual register in the entry block,
10228 // so now we copy the value out and into X0.
10229 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
10230 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
10231 getPointerTy(MF.getDataLayout()));
10232
10233 unsigned RetValReg = AArch64::X0;
10234 if (CallConv == CallingConv::ARM64EC_Thunk_X64)
10235 RetValReg = AArch64::X8;
10236 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue);
10237 Glue = Chain.getValue(1);
10238
10239 RetOps.push_back(
10240 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
10241 }
10242
10243 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
10244 if (I) {
10245 for (; *I; ++I) {
10246 if (AArch64::GPR64RegClass.contains(*I))
10247 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
10248 else if (AArch64::FPR64RegClass.contains(*I))
10249 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
10250 else
10251 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
10252 }
10253 }
10254
10255 RetOps[0] = Chain; // Update chain.
10256
10257 // Add the glue if we have it.
10258 if (Glue.getNode())
10259 RetOps.push_back(Glue);
10260
10261 if (CallConv == CallingConv::ARM64EC_Thunk_X64) {
10262 // ARM64EC entry thunks use a special return sequence: instead of a regular
10263 // "ret" instruction, they need to explicitly call the emulator.
10264 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10265 SDValue Arm64ECRetDest =
10266 DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
10267 Arm64ECRetDest =
10268 getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
10269 Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
10270 MachinePointerInfo());
10271 RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
10272 RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
10273 return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
10274 }
10275
10276 return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
10277}
10278
10279//===----------------------------------------------------------------------===//
10280// Other Lowering Code
10281//===----------------------------------------------------------------------===//
10282
10283SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
10284 SelectionDAG &DAG,
10285 unsigned Flag) const {
10286 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
10287 N->getOffset(), Flag);
10288}
10289
10290SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
10291 SelectionDAG &DAG,
10292 unsigned Flag) const {
10293 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
10294}
10295
10296SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
10297 SelectionDAG &DAG,
10298 unsigned Flag) const {
10299 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
10300 N->getOffset(), Flag);
10301}
10302
10303 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
10304 SelectionDAG &DAG,
10305 unsigned Flag) const {
10306 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
10307}
10308
10309SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
10310 SelectionDAG &DAG,
10311 unsigned Flag) const {
10312 return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
10313}
10314
10315// (loadGOT sym)
10316template <class NodeTy>
10317SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
10318 unsigned Flags) const {
10319 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
10320 SDLoc DL(N);
10321 EVT Ty = getPointerTy(DAG.getDataLayout());
10322 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
10323 // FIXME: Once remat is capable of dealing with instructions with register
10324 // operands, expand this into two nodes instead of using a wrapper node.
10325 if (DAG.getMachineFunction()
10326 .getInfo<AArch64FunctionInfo>()
10327 ->hasELFSignedGOT())
10328 return SDValue(DAG.getMachineNode(AArch64::LOADgotAUTH, DL, Ty, GotAddr),
10329 0);
10330 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
10331}
10332
10333// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
10334template <class NodeTy>
10335SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
10336 unsigned Flags) const {
10337 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
10338 SDLoc DL(N);
10339 EVT Ty = getPointerTy(DAG.getDataLayout());
10340 const unsigned char MO_NC = AArch64II::MO_NC;
10341 return DAG.getNode(
10342 AArch64ISD::WrapperLarge, DL, Ty,
10343 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
10344 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
10345 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
10346 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
10347}
10348
10349// (addlow (adrp %hi(sym)) %lo(sym))
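// Illustrative output (not from the original source): under the usual small
// code model this materialises roughly as
//   adrp xN, sym
//   add xN, xN, :lo12:sym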
10350template <class NodeTy>
10351SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
10352 unsigned Flags) const {
10353 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
10354 SDLoc DL(N);
10355 EVT Ty = getPointerTy(DAG.getDataLayout());
10356 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
10357 SDValue Lo = getTargetNode(N, Ty, DAG,
10358 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
10359 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
10360 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
10361}
10362
10363// (adr sym)
10364template <class NodeTy>
10365SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
10366 unsigned Flags) const {
10367 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
10368 SDLoc DL(N);
10369 EVT Ty = getPointerTy(DAG.getDataLayout());
10370 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
10371 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
10372}
10373
10374SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
10375 SelectionDAG &DAG) const {
10376 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
10377 const GlobalValue *GV = GN->getGlobal();
10378 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
10379
10380 if (OpFlags != AArch64II::MO_NO_FLAG)
10382 "unexpected offset in global node");
10383
10384 // This also catches the large code model case for Darwin, and tiny code
10385 // model with got relocations.
10386 if ((OpFlags & AArch64II::MO_GOT) != 0) {
10387 return getGOT(GN, DAG, OpFlags);
10388 }
10389
10390 SDValue Result;
10391 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
10392 !getTargetMachine().isPositionIndependent()) {
10393 Result = getAddrLarge(GN, DAG, OpFlags);
10394 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
10395 Result = getAddrTiny(GN, DAG, OpFlags);
10396 } else {
10397 Result = getAddr(GN, DAG, OpFlags);
10398 }
10399 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10400 SDLoc DL(GN);
10401 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
10402 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
10403 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
10404 return Result;
10405}
10406
10407/// Convert a TLS address reference into the correct sequence of loads
10408/// and calls to compute the variable's address (for Darwin, currently) and
10409/// return an SDValue containing the final node.
10410
10411/// Darwin only has one TLS scheme which must be capable of dealing with the
10412/// fully general situation, in the worst case. This means:
10413/// + "extern __thread" declaration.
10414/// + Defined in a possibly unknown dynamic library.
10415///
10416/// The general system is that each __thread variable has a [3 x i64] descriptor
10417/// which contains information used by the runtime to calculate the address. The
10418/// only part of this the compiler needs to know about is the first xword, which
10419/// contains a function pointer that must be called with the address of the
10420/// entire descriptor in "x0".
10421///
10422/// Since this descriptor may be in a different unit, in general even the
10423/// descriptor must be accessed via an indirect load. The "ideal" code sequence
10424/// is:
10425/// adrp x0, _var@TLVPPAGE
10426/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
10427/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
10428/// ; the function pointer
10429/// blr x1 ; Uses descriptor address in x0
10430/// ; Address of _var is now in x0.
10431///
10432/// If the address of _var's descriptor *is* known to the linker, then it can
10433/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
10434/// a slight efficiency gain.
10435SDValue
10436AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
10437 SelectionDAG &DAG) const {
10438 assert(Subtarget->isTargetDarwin() &&
10439 "This function expects a Darwin target");
10440
10441 SDLoc DL(Op);
10442 MVT PtrVT = getPointerTy(DAG.getDataLayout());
10443 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
10444 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
10445
10446 SDValue TLVPAddr =
10447 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10448 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
10449
10450 // The first entry in the descriptor is a function pointer that we must call
10451 // to obtain the address of the variable.
10452 SDValue Chain = DAG.getEntryNode();
10453 SDValue FuncTLVGet = DAG.getLoad(
10454 PtrMemVT, DL, Chain, DescAddr,
10455 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
10456 Align(PtrMemVT.getSizeInBits() / 8),
10457 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
10458 Chain = FuncTLVGet.getValue(1);
10459
10460 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
10461 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
10462
10463 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10464 MFI.setAdjustsStack(true);
10465
10466 // TLS calls preserve all registers except those that absolutely must be
10467 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
10468 // silly).
10469 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
10470 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
10471 if (Subtarget->hasCustomCallingConv())
10472 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
10473
10474 // Finally, we can make the call. This is just a degenerate version of a
10475 // normal AArch64 call node: x0 takes the address of the descriptor, and
10476 // returns the address of the variable in this thread.
10477 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
10478
10479 unsigned Opcode = AArch64ISD::CALL;
10480 SmallVector<SDValue, 8> Ops;
10481 Ops.push_back(Chain);
10482 Ops.push_back(FuncTLVGet);
10483
10484 // With ptrauth-calls, the tlv access thunk pointer is authenticated (IA, 0).
10485 if (DAG.getMachineFunction().getFunction().hasFnAttribute("ptrauth-calls")) {
10486 Opcode = AArch64ISD::AUTH_CALL;
10487 Ops.push_back(DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32));
10488 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); // Integer Disc.
10489 Ops.push_back(DAG.getRegister(AArch64::NoRegister, MVT::i64)); // Addr Disc.
10490 }
10491
10492 Ops.push_back(DAG.getRegister(AArch64::X0, MVT::i64));
10493 Ops.push_back(DAG.getRegisterMask(Mask));
10494 Ops.push_back(Chain.getValue(1));
10495 Chain = DAG.getNode(Opcode, DL, DAG.getVTList(MVT::Other, MVT::Glue), Ops);
10496 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
10497}
10498
10499/// Convert a thread-local variable reference into a sequence of instructions to
10500/// compute the variable's address for the local exec TLS model of ELF targets.
10501/// The sequence depends on the maximum TLS area size.
10502SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
10503 SDValue ThreadBase,
10504 const SDLoc &DL,
10505 SelectionDAG &DAG) const {
10506 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10507 SDValue TPOff, Addr;
10508
10509 switch (DAG.getTarget().Options.TLSSize) {
10510 default:
10511 llvm_unreachable("Unexpected TLS size");
10512
10513 case 12: {
10514 // mrs x0, TPIDR_EL0
10515 // add x0, x0, :tprel_lo12:a
10516 SDValue Var = DAG.getTargetGlobalAddress(
10517 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
10518 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10519 Var,
10520 DAG.getTargetConstant(0, DL, MVT::i32)),
10521 0);
10522 }
10523
10524 case 24: {
10525 // mrs x0, TPIDR_EL0
10526 // add x0, x0, :tprel_hi12:a
10527 // add x0, x0, :tprel_lo12_nc:a
10528 SDValue HiVar = DAG.getTargetGlobalAddress(
10529 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10530 SDValue LoVar = DAG.getTargetGlobalAddress(
10531 GV, DL, PtrVT, 0,
10532 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10533 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
10534 HiVar,
10535 DAG.getTargetConstant(0, DL, MVT::i32)),
10536 0);
10537 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
10538 LoVar,
10539 DAG.getTargetConstant(0, DL, MVT::i32)),
10540 0);
10541 }
10542
10543 case 32: {
10544 // mrs x1, TPIDR_EL0
10545 // movz x0, #:tprel_g1:a
10546 // movk x0, #:tprel_g0_nc:a
10547 // add x0, x1, x0
10548 SDValue HiVar = DAG.getTargetGlobalAddress(
10549 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
10550 SDValue LoVar = DAG.getTargetGlobalAddress(
10551 GV, DL, PtrVT, 0,
10552 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10553 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10554 DAG.getTargetConstant(16, DL, MVT::i32)),
10555 0);
10556 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10557 DAG.getTargetConstant(0, DL, MVT::i32)),
10558 0);
10559 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10560 }
10561
10562 case 48: {
10563 // mrs x1, TPIDR_EL0
10564 // movz x0, #:tprel_g2:a
10565 // movk x0, #:tprel_g1_nc:a
10566 // movk x0, #:tprel_g0_nc:a
10567 // add x0, x1, x0
10568 SDValue HiVar = DAG.getTargetGlobalAddress(
10569 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
10570 SDValue MiVar = DAG.getTargetGlobalAddress(
10571 GV, DL, PtrVT, 0,
10572 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
10573 SDValue LoVar = DAG.getTargetGlobalAddress(
10574 GV, DL, PtrVT, 0,
10575 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
10576 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
10577 DAG.getTargetConstant(32, DL, MVT::i32)),
10578 0);
10579 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
10580 DAG.getTargetConstant(16, DL, MVT::i32)),
10581 0);
10582 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
10583 DAG.getTargetConstant(0, DL, MVT::i32)),
10584 0);
10585 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10586 }
10587 }
10588}
10589
10590/// When accessing thread-local variables under either the general-dynamic or
10591/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
10592/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
10593/// is a function pointer to carry out the resolution.
10594///
10595/// The sequence is:
10596/// adrp x0, :tlsdesc:var
10597/// ldr x1, [x0, #:tlsdesc_lo12:var]
10598/// add x0, x0, #:tlsdesc_lo12:var
10599/// .tlsdesccall var
10600/// blr x1
10601/// (TPIDR_EL0 offset now in x0)
10602///
10603 /// The above sequence must be produced unscheduled so that the linker can
10604 /// optimize/relax it.
10605 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
10606 /// whole sequence and is expanded very late in the compilation flow, to ensure
10607 /// it is emitted exactly as written above.
10608SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
10609 const SDLoc &DL,
10610 SelectionDAG &DAG) const {
10611 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10612 auto &MF = DAG.getMachineFunction();
10613 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
10614
10615 SDValue Glue;
10616 SDValue Chain = DAG.getEntryNode();
10617 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
10618
10619 SMECallAttrs TLSCallAttrs(FuncInfo->getSMEFnAttrs(), {}, SMEAttrs::Normal);
10620 bool RequiresSMChange = TLSCallAttrs.requiresSMChange();
10621
10622 auto ChainAndGlue = [](SDValue Chain) -> std::pair<SDValue, SDValue> {
10623 return {Chain, Chain.getValue(1)};
10624 };
10625
10626 if (RequiresSMChange)
10627 std::tie(Chain, Glue) =
10628 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/false, Chain, Glue,
10629 getSMToggleCondition(TLSCallAttrs)));
10630
10631 unsigned Opcode =
10632 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>()->hasELFSignedGOT()
10633 ? AArch64ISD::TLSDESC_AUTH_CALLSEQ
10634 : AArch64ISD::TLSDESC_CALLSEQ;
10635 SDValue Ops[] = {Chain, SymAddr, Glue};
10636 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10637 Opcode, DL, NodeTys, Glue ? ArrayRef(Ops) : ArrayRef(Ops).drop_back()));
10638
10639 if (TLSCallAttrs.requiresLazySave())
10640 std::tie(Chain, Glue) = ChainAndGlue(DAG.getNode(
10641 AArch64ISD::REQUIRES_ZA_SAVE, DL, NodeTys, {Chain, Chain.getValue(1)}));
10642
10643 if (RequiresSMChange)
10644 std::tie(Chain, Glue) =
10645 ChainAndGlue(changeStreamingMode(DAG, DL, /*Enable=*/true, Chain, Glue,
10646 getSMToggleCondition(TLSCallAttrs)));
10647
10648 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
10649}
10650
10651SDValue
10652AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
10653 SelectionDAG &DAG) const {
10654 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
10655
10656 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10657 AArch64FunctionInfo *MFI =
10658 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
10659
10663
10665 if (Model == TLSModel::LocalDynamic)
10667 }
10668
10670 Model != TLSModel::LocalExec)
10671 report_fatal_error("ELF TLS only supported in small memory model or "
10672 "in local exec TLS model");
10673 // Different choices can be made for the maximum size of the TLS area for a
10674 // module. For the small address model, the default TLS size is 16MiB and the
10675 // maximum TLS size is 4GiB.
10676 // FIXME: add tiny and large code model support for TLS access models other
10677 // than local exec. We currently generate the same code as small for tiny,
10678 // which may be larger than needed.
10679
10680 SDValue TPOff;
10681 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10682 SDLoc DL(Op);
10683 const GlobalValue *GV = GA->getGlobal();
10684
10685 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
10686
10687 if (Model == TLSModel::LocalExec) {
10688 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
10689 } else if (Model == TLSModel::InitialExec) {
10690 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10691 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
10692 } else if (Model == TLSModel::LocalDynamic) {
10693 // Local-dynamic accesses proceed in two phases: a general-dynamic TLS
10694 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
10695 // the beginning of the module's TLS region, followed by a DTPREL offset
10696 // calculation.
10697
10698 // These accesses will need deduplicating if there's more than one.
10699 MFI->incNumLocalDynamicTLSAccesses();
10700
10701 // The call needs a relocation too for linker relaxation. It doesn't make
10702 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10703 // the address.
10704 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
10706
10707 // Now we can calculate the offset from TPIDR_EL0 to this module's
10708 // thread-local area.
10709 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10710
10711 // Now use :dtprel_whatever: operations to calculate this variable's offset
10712 // in its thread-storage area.
10713 SDValue HiVar = DAG.getTargetGlobalAddress(
10714 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10715 SDValue LoVar = DAG.getTargetGlobalAddress(
10716 GV, DL, MVT::i64, 0,
10717 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10718
10719 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
10720 DAG.getTargetConstant(0, DL, MVT::i32)),
10721 0);
10722 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
10723 DAG.getTargetConstant(0, DL, MVT::i32)),
10724 0);
10725 } else if (Model == TLSModel::GeneralDynamic) {
10726 // The call needs a relocation too for linker relaxation. It doesn't make
10727 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
10728 // the address.
10729 SDValue SymAddr =
10730 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
10731
10732 // Finally we can make a call to calculate the offset from tpidr_el0.
10733 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
10734 } else
10735 llvm_unreachable("Unsupported ELF TLS access model");
10736
10737 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
10738}
10739
10740SDValue
10741AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
10742 SelectionDAG &DAG) const {
10743 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
10744
10745 SDValue Chain = DAG.getEntryNode();
10746 EVT PtrVT = getPointerTy(DAG.getDataLayout());
10747 SDLoc DL(Op);
10748
10749 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
10750
10751 // Load the ThreadLocalStoragePointer from the TEB
10752 // A pointer to the TLS array is located at offset 0x58 from the TEB.
10753 SDValue TLSArray =
10754 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
10755 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
10756 Chain = TLSArray.getValue(1);
10757
10758 // Load the TLS index from the C runtime;
10759 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
10760 // This also does the same as LOADgot, but using a generic i32 load,
10761 // while LOADgot only loads i64.
10762 SDValue TLSIndexHi =
10763 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
10764 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
10765 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10766 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
10767 SDValue TLSIndex =
10768 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
10769 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
10770 Chain = TLSIndex.getValue(1);
10771
10772 // The pointer to the thread's TLS data area is stored in the TLS array at
10773 // the offset TLSIndex * 8.
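// Illustrative summary (not from the original source), in pseudo-C:
//   TLSArray = *(TEB + 0x58);                  // TEB is in x18 on Windows
//   TLSData  = *(TLSArray + _tls_index * 8);
//   Addr     = TLSData + secrel offset of the variable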
10774 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
10775 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
10776 DAG.getConstant(3, DL, PtrVT));
10777 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
10778 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
10779 MachinePointerInfo());
10780 Chain = TLS.getValue(1);
10781
10782 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10783 const GlobalValue *GV = GA->getGlobal();
10784 SDValue TGAHi = DAG.getTargetGlobalAddress(
10785 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
10786 SDValue TGALo = DAG.getTargetGlobalAddress(
10787 GV, DL, PtrVT, 0,
10788 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
10789
10790 // Add the offset from the start of the .tls section (section base).
10791 SDValue Addr =
10792 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
10793 DAG.getTargetConstant(0, DL, MVT::i32)),
10794 0);
10795 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
10796 return Addr;
10797}
10798
10799SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
10800 SelectionDAG &DAG) const {
10801 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
10802 if (DAG.getTarget().useEmulatedTLS())
10803 return LowerToTLSEmulatedModel(GA, DAG);
10804
10805 if (Subtarget->isTargetDarwin())
10806 return LowerDarwinGlobalTLSAddress(Op, DAG);
10807 if (Subtarget->isTargetELF())
10808 return LowerELFGlobalTLSAddress(Op, DAG);
10809 if (Subtarget->isTargetWindows())
10810 return LowerWindowsGlobalTLSAddress(Op, DAG);
10811
10812 llvm_unreachable("Unexpected platform trying to use TLS");
10813}
10814
10815//===----------------------------------------------------------------------===//
10816// PtrAuthGlobalAddress lowering
10817//
10818// We have 3 lowering alternatives to choose from:
10819// - MOVaddrPAC: similar to MOVaddr, with added PAC.
10820// If the GV doesn't need a GOT load (i.e., is locally defined)
10821// materialize the pointer using adrp+add+pac. See LowerMOVaddrPAC.
10822//
10823// - LOADgotPAC: similar to LOADgot, with added PAC.
10824// If the GV needs a GOT load, materialize the pointer using the usual
10825// GOT adrp+ldr, +pac. Pointers in GOT are assumed to be not signed, the GOT
10826// section is assumed to be read-only (for example, via relro mechanism). See
10827// LowerMOVaddrPAC.
10828//
10829// - LOADauthptrstatic: similar to LOADgot, but use a
10830// special stub slot instead of a GOT slot.
10831// Load a signed pointer for symbol 'sym' from a stub slot named
10832// 'sym$auth_ptr$key$disc' filled by dynamic linker during relocation
10833// resolving. This usually lowers to adrp+ldr, but also emits an entry into
10834// .data with an @AUTH relocation. See LowerLOADauthptrstatic.
10835//
10836 // All 3 are pseudos that are expanded late into longer sequences: this lets
10837 // us provide integrity guarantees on the to-be-signed intermediate values.
10838//
10839// LOADauthptrstatic is undesirable because it requires a large section filled
10840// with often similarly-signed pointers, making it a good harvesting target.
10841// Thus, it's only used for ptrauth references to extern_weak to avoid null
10842// checks.
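// Illustrative expansion (not from the original source): for a locally
// defined symbol signed with key IA and a small constant discriminator,
// MOVaddrPAC expands to roughly
//   adrp x16, sym
//   add x16, x16, :lo12:sym
//   mov x17, #disc
//   pacia x16, x17
// while the GOT-based variants first load the raw pointer and then sign it.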
10843
10844 static SDValue LowerPtrAuthGlobalAddressStatically(
10845 SDValue TGA, SDLoc DL, EVT VT, AArch64PACKey::ID KeyC,
10846 SDValue Discriminator, SDValue AddrDiscriminator, SelectionDAG &DAG) {
10847 const auto *TGN = cast<GlobalAddressSDNode>(TGA.getNode());
10848 assert(TGN->getGlobal()->hasExternalWeakLinkage());
10849
10850 // Offsets and extern_weak don't mix well: ptrauth aside, you'd get the
10851 // offset alone as a pointer if the symbol wasn't available, which would
10852 // probably break null checks in users. Ptrauth complicates things further:
10853 // error out.
10854 if (TGN->getOffset() != 0)
10856 "unsupported non-zero offset in weak ptrauth global reference");
10857
10858 if (!isNullConstant(AddrDiscriminator))
10859 report_fatal_error("unsupported weak addr-div ptrauth global");
10860
10861 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10862 return SDValue(DAG.getMachineNode(AArch64::LOADauthptrstatic, DL, MVT::i64,
10863 {TGA, Key, Discriminator}),
10864 0);
10865}
10866
10867SDValue
10868AArch64TargetLowering::LowerPtrAuthGlobalAddress(SDValue Op,
10869 SelectionDAG &DAG) const {
10870 SDValue Ptr = Op.getOperand(0);
10871 uint64_t KeyC = Op.getConstantOperandVal(1);
10872 SDValue AddrDiscriminator = Op.getOperand(2);
10873 uint64_t DiscriminatorC = Op.getConstantOperandVal(3);
10874 EVT VT = Op.getValueType();
10875 SDLoc DL(Op);
10876
10877 if (KeyC > AArch64PACKey::LAST)
10878 report_fatal_error("key in ptrauth global out of range [0, " +
10879 Twine((int)AArch64PACKey::LAST) + "]");
10880
10881 // Blend only works if the integer discriminator is 16 bits wide.
10882 if (!isUInt<16>(DiscriminatorC))
10884 "constant discriminator in ptrauth global out of range [0, 0xffff]");
10885
10886 // Choosing between 3 lowering alternatives is target-specific.
10887 if (!Subtarget->isTargetELF() && !Subtarget->isTargetMachO())
10888 report_fatal_error("ptrauth global lowering only supported on MachO/ELF");
10889
10890 int64_t PtrOffsetC = 0;
10891 if (Ptr.getOpcode() == ISD::ADD) {
10892 PtrOffsetC = Ptr.getConstantOperandVal(1);
10893 Ptr = Ptr.getOperand(0);
10894 }
10895 const auto *PtrN = cast<GlobalAddressSDNode>(Ptr.getNode());
10896 const GlobalValue *PtrGV = PtrN->getGlobal();
10897
10898 // Classify the reference to determine whether it needs a GOT load.
10899 const unsigned OpFlags =
10900 Subtarget->ClassifyGlobalReference(PtrGV, getTargetMachine());
10901 const bool NeedsGOTLoad = ((OpFlags & AArch64II::MO_GOT) != 0);
10902 assert(((OpFlags & (~AArch64II::MO_GOT)) == 0) &&
10903 "unsupported non-GOT op flags on ptrauth global reference");
10904
10905 // Fold any offset into the GV; our pseudos expect it there.
10906 PtrOffsetC += PtrN->getOffset();
10907 SDValue TPtr = DAG.getTargetGlobalAddress(PtrGV, DL, VT, PtrOffsetC,
10908 /*TargetFlags=*/0);
10909 assert(PtrN->getTargetFlags() == 0 &&
10910 "unsupported target flags on ptrauth global");
10911
10912 SDValue Key = DAG.getTargetConstant(KeyC, DL, MVT::i32);
10913 SDValue Discriminator = DAG.getTargetConstant(DiscriminatorC, DL, MVT::i64);
10914 SDValue TAddrDiscriminator = !isNullConstant(AddrDiscriminator)
10915 ? AddrDiscriminator
10916 : DAG.getRegister(AArch64::XZR, MVT::i64);
10917
10918 // No GOT load needed -> MOVaddrPAC
10919 if (!NeedsGOTLoad) {
10920 assert(!PtrGV->hasExternalWeakLinkage() && "extern_weak should use GOT");
10921 return SDValue(
10922 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, MVT::i64,
10923 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10924 0);
10925 }
10926
10927 // GOT load -> LOADgotPAC
10928 // Note that we disallow extern_weak refs to avoid null checks later.
10929 if (!PtrGV->hasExternalWeakLinkage())
10930 return SDValue(
10931 DAG.getMachineNode(AArch64::LOADgotPAC, DL, MVT::i64,
10932 {TPtr, Key, TAddrDiscriminator, Discriminator}),
10933 0);
10934
10935 // extern_weak ref -> LOADauthptrstatic
10936 return LowerPtrAuthGlobalAddressStatically(
10937 TPtr, DL, VT, (AArch64PACKey::ID)KeyC, Discriminator, AddrDiscriminator,
10938 DAG);
10939}
10940
10941// Looks through \param Val to determine the bit that can be used to
10942// check the sign of the value. It returns the unextended value and
10943// the sign bit position.
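// Illustrative example (not from the original source): for
// (sign_extend_inreg i64 %x, i8) this returns {%x, 7}; for a plain i32
// value it returns {value, 31}.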
10944std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
10945 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
10946 return {Val.getOperand(0),
10947 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
10948 1};
10949
10950 if (Val.getOpcode() == ISD::SIGN_EXTEND)
10951 return {Val.getOperand(0),
10952 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
10953
10954 return {Val, Val.getValueSizeInBits() - 1};
10955}
10956
10957SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
10958 SDValue Chain = Op.getOperand(0);
10959 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
10960 SDValue LHS = Op.getOperand(2);
10961 SDValue RHS = Op.getOperand(3);
10962 SDValue Dest = Op.getOperand(4);
10963 SDLoc DL(Op);
10964
10965 MachineFunction &MF = DAG.getMachineFunction();
10966 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
10967 // will not be produced, as they are conditional branch instructions that do
10968 // not set flags.
10969 bool ProduceNonFlagSettingCondBr =
10970 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
10971
10972 // Handle f128 first, since lowering it will result in comparing the return
10973 // value of a libcall against zero, which is just what the rest of LowerBR_CC
10974 // is expecting to deal with.
10975 if (LHS.getValueType() == MVT::f128) {
10976 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
10977
10978 // If softenSetCCOperands returned a scalar, we need to compare the result
10979 // against zero to select between true and false values.
10980 if (!RHS.getNode()) {
10981 RHS = DAG.getConstant(0, DL, LHS.getValueType());
10982 CC = ISD::SETNE;
10983 }
10984 }
10985
10986 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
10987 // instruction.
10988 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
10989 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
10990 // Only lower legal XALUO ops.
10991 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
10992 return SDValue();
10993
10994 // The actual operation with overflow check.
10995 AArch64CC::CondCode OFCC;
10996 SDValue Value, Overflow;
10997 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
10998
10999 if (CC == ISD::SETNE)
11000 OFCC = getInvertedCondCode(OFCC);
11001 SDValue CCVal = getCondCode(DAG, OFCC);
11002
11003 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11004 Overflow);
11005 }
11006
11007 if (LHS.getValueType().isInteger()) {
11008 assert((LHS.getValueType() == RHS.getValueType()) &&
11009 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11010
11011 // If the RHS of the comparison is zero, we can potentially fold this
11012 // to a specialized branch.
11013 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11014 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
11015 if (CC == ISD::SETEQ) {
11016 // See if we can use a TBZ to fold in an AND as well.
11017 // TBZ has a smaller branch displacement than CBZ. If the offset is
11018 // out of bounds, a late MI-layer pass rewrites branches.
11019 // 403.gcc is an example that hits this case.
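// Illustrative example (not from the original source):
//   (brcond (seteq (and x, 8), 0), dest)
// becomes "tbz x, #3, dest" through the fold below.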
11020 if (LHS.getOpcode() == ISD::AND &&
11021 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11022 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11023 SDValue Test = LHS.getOperand(0);
11024 uint64_t Mask = LHS.getConstantOperandVal(1);
11025 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, Test,
11026 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11027 Dest);
11028 }
11029
11030 return DAG.getNode(AArch64ISD::CBZ, DL, MVT::Other, Chain, LHS, Dest);
11031 } else if (CC == ISD::SETNE) {
11032 // See if we can use a TBZ to fold in an AND as well.
11033 // TBZ has a smaller branch displacement than CBZ. If the offset is
11034 // out of bounds, a late MI-layer pass rewrites branches.
11035 // 403.gcc is an example that hits this case.
11036 if (LHS.getOpcode() == ISD::AND &&
11037 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11038 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
11039 SDValue Test = LHS.getOperand(0);
11040 uint64_t Mask = LHS.getConstantOperandVal(1);
11041 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, Test,
11042 DAG.getConstant(Log2_64(Mask), DL, MVT::i64),
11043 Dest);
11044 }
11045
11046 return DAG.getNode(AArch64ISD::CBNZ, DL, MVT::Other, Chain, LHS, Dest);
11047 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
11048 // Don't combine AND since emitComparison converts the AND to an ANDS
11049 // (a.k.a. TST) and the test in the test bit and branch instruction
11050 // becomes redundant. This would also increase register pressure.
11051 uint64_t SignBitPos;
11052 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11053 return DAG.getNode(AArch64ISD::TBNZ, DL, MVT::Other, Chain, LHS,
11054 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11055 }
11056 }
11057 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
11058 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
11059 // Don't combine AND since emitComparison converts the AND to an ANDS
11060 // (a.k.a. TST) and the test in the test bit and branch instruction
11061 // becomes redundant. This would also increase register pressure.
11062 uint64_t SignBitPos;
11063 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11064 return DAG.getNode(AArch64ISD::TBZ, DL, MVT::Other, Chain, LHS,
11065 DAG.getConstant(SignBitPos, DL, MVT::i64), Dest);
11066 }
11067
11068 // Try to emit Armv9.6 CB instructions. We prefer tb{n}z/cb{n}z due to their
11069 // larger branch displacement, but we do prefer CB over cmp + br.
11070 if (Subtarget->hasCMPBR() &&
11072 ProduceNonFlagSettingCondBr) {
11073 SDValue Cond =
11075 return DAG.getNode(AArch64ISD::CB, DL, MVT::Other, Chain, Cond, LHS, RHS,
11076 Dest);
11077 }
11078
11079 SDValue CCVal;
11080 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11081 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CCVal,
11082 Cmp);
11083 }
11084
11085 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
11086 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11087
11088 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11089 // clean. Some of them require two branches to implement.
11090 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11091 AArch64CC::CondCode CC1, CC2;
11092 changeFPCCToAArch64CC(CC, CC1, CC2);
11093 SDValue CC1Val = getCondCode(DAG, CC1);
11094 SDValue BR1 =
11095 DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, Chain, Dest, CC1Val, Cmp);
11096 if (CC2 != AArch64CC::AL) {
11097 SDValue CC2Val = getCondCode(DAG, CC2);
11098 return DAG.getNode(AArch64ISD::BRCOND, DL, MVT::Other, BR1, Dest, CC2Val,
11099 Cmp);
11100 }
11101
11102 return BR1;
11103}
11104
11105SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
11106 SelectionDAG &DAG) const {
11107 if (!Subtarget->isNeonAvailable() &&
11108 !Subtarget->useSVEForFixedLengthVectors())
11109 return SDValue();
11110
11111 EVT VT = Op.getValueType();
11112 EVT IntVT = VT.changeTypeToInteger();
11113 SDLoc DL(Op);
11114
11115 SDValue In1 = Op.getOperand(0);
11116 SDValue In2 = Op.getOperand(1);
11117 EVT SrcVT = In2.getValueType();
11118
11119 if (!SrcVT.bitsEq(VT))
11120 In2 = DAG.getFPExtendOrRound(In2, DL, VT);
11121
11122 if (VT.isScalableVector())
11123 IntVT =
11124 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
11125
11126 if (VT.isFixedLengthVector() &&
11127 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
11128 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11129
11130 In1 = convertToScalableVector(DAG, ContainerVT, In1);
11131 In2 = convertToScalableVector(DAG, ContainerVT, In2);
11132
11133 SDValue Res = DAG.getNode(ISD::FCOPYSIGN, DL, ContainerVT, In1, In2);
11134 return convertFromScalableVector(DAG, VT, Res);
11135 }
11136
11137 // With SVE, but without Neon, extend the scalars to scalable vectors and use
11138 // an SVE FCOPYSIGN.
11139 if (!VT.isVector() && !Subtarget->isNeonAvailable() &&
11140 Subtarget->isSVEorStreamingSVEAvailable()) {
11141 if (VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64 && VT != MVT::bf16)
11142 return SDValue();
11143 EVT SVT = getPackedSVEVectorVT(VT);
11144
11145 SDValue Ins1 =
11146 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In1,
11147 DAG.getConstant(0, DL, MVT::i64));
11148 SDValue Ins2 =
11149 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SVT, DAG.getUNDEF(SVT), In2,
11150 DAG.getConstant(0, DL, MVT::i64));
11151 SDValue FCS = DAG.getNode(ISD::FCOPYSIGN, DL, SVT, Ins1, Ins2);
11152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, FCS,
11153 DAG.getConstant(0, DL, MVT::i64));
11154 }
11155
11156 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
11157 if (VT.isScalableVector())
11158 return getSVESafeBitCast(VT, Op, DAG);
11159
11160 return DAG.getBitcast(VT, Op);
11161 };
11162
11163 SDValue VecVal1, VecVal2;
11164 EVT VecVT;
11165 auto SetVecVal = [&](int Idx = -1) {
11166 if (!VT.isVector()) {
11167 VecVal1 =
11168 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
11169 VecVal2 =
11170 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
11171 } else {
11172 VecVal1 = BitCast(VecVT, In1, DAG);
11173 VecVal2 = BitCast(VecVT, In2, DAG);
11174 }
11175 };
11176 if (VT.isVector()) {
11177 VecVT = IntVT;
11178 SetVecVal();
11179 } else if (VT == MVT::f64) {
11180 VecVT = MVT::v2i64;
11181 SetVecVal(AArch64::dsub);
11182 } else if (VT == MVT::f32) {
11183 VecVT = MVT::v4i32;
11184 SetVecVal(AArch64::ssub);
11185 } else if (VT == MVT::f16 || VT == MVT::bf16) {
11186 VecVT = MVT::v8i16;
11187 SetVecVal(AArch64::hsub);
11188 } else {
11189 llvm_unreachable("Invalid type for copysign!");
11190 }
11191
11192 unsigned BitWidth = In1.getScalarValueSizeInBits();
11193 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
11194
11195 // We want to materialize a mask with every bit but the high bit set, but the
11196 // AdvSIMD immediate moves cannot materialize that in a single instruction for
11197 // 64-bit elements. Instead, materialize all bits set and then negate that.
11198 if (VT == MVT::f64 || VT == MVT::v2f64) {
11199 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
11200 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
11201 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
11202 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
11203 }
11204
11205 SDValue BSP =
11206 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
11207 if (VT == MVT::f16 || VT == MVT::bf16)
11208 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
11209 if (VT == MVT::f32)
11210 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
11211 if (VT == MVT::f64)
11212 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
11213
11214 return BitCast(VT, BSP, DAG);
11215}
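// Illustrative sketch (not part of the original source): the bitwise select
// that the BSP node above performs, modelled on 64-bit scalar bit patterns.
// With the mask holding every bit except the sign bit, the result takes its
// magnitude bits from In1 and its sign bit from In2, which is FCOPYSIGN.
// Hypothetical helper, exposition only:
static inline uint64_t refCopySignBits64(uint64_t Mag, uint64_t Sign) {
  const uint64_t NotSignMask = ~(1ULL << 63); // all bits set except the sign bit
  return (Mag & NotSignMask) | (Sign & ~NotSignMask); // BSP(NotSignMask, Mag, Sign)
}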
11216
11217SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
11218 SelectionDAG &DAG) const {
11219 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
11220 Attribute::NoImplicitFloat))
11221 return SDValue();
11222
11223 EVT VT = Op.getValueType();
11224 if (VT.isScalableVector() ||
11225 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11226 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
11227
11228 bool IsParity = Op.getOpcode() == ISD::PARITY;
11229 SDValue Val = Op.getOperand(0);
11230 SDLoc DL(Op);
11231
11232 // For i32, a general parity computation using EORs is more efficient than
11233 // going through floating point.
11234 if (VT == MVT::i32 && IsParity)
11235 return SDValue();
11236
11237 if (Subtarget->isSVEorStreamingSVEAvailable()) {
11238 if (VT == MVT::i32 || VT == MVT::i64) {
11239 EVT ContainerVT = VT == MVT::i32 ? MVT::nxv4i32 : MVT::nxv2i64;
11240 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
11241 DAG.getUNDEF(ContainerVT), Val,
11242 DAG.getVectorIdxConstant(0, DL));
11243 Val = DAG.getNode(ISD::CTPOP, DL, ContainerVT, Val);
11244 Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Val,
11245 DAG.getVectorIdxConstant(0, DL));
11246 if (IsParity)
11247 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11248 return Val;
11249 }
11250
11251 if (VT == MVT::i128) {
11252 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Val);
11253 Val = convertToScalableVector(DAG, MVT::nxv2i64, Val);
11254 Val = DAG.getNode(ISD::CTPOP, DL, MVT::nxv2i64, Val);
11255 Val = convertFromScalableVector(DAG, MVT::v2i64, Val);
11256 Val = DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i64, Val);
11257 Val = DAG.getZExtOrTrunc(Val, DL, VT);
11258 if (IsParity)
11259 Val = DAG.getNode(ISD::AND, DL, VT, Val, DAG.getConstant(1, DL, VT));
11260 return Val;
11261 }
11262 }
11263
11264 if (!Subtarget->isNeonAvailable())
11265 return SDValue();
11266
11267 // There is no scalar CNT instruction for GPRs, but popcount can
11268 // be more efficiently lowered to the following sequence that uses
11269 // AdvSIMD registers/instructions as long as the copies to/from
11270 // the AdvSIMD registers are cheap.
11271 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
11272 // CNT V0.8B, V0.8B // 8xbyte pop-counts
11273 // ADDV B0, V0.8B // sum 8xbyte pop-counts
11274 // FMOV X0, D0 // copy result back to integer reg
11275 if (VT == MVT::i32 || VT == MVT::i64) {
11276 if (VT == MVT::i32)
11277 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
11278 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
11279
11280 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
11281 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v8i8, CtPop);
11282 AddV = DAG.getNode(AArch64ISD::NVCAST, DL,
11283 VT == MVT::i32 ? MVT::v2i32 : MVT::v1i64, AddV);
11284 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, AddV,
11285 DAG.getConstant(0, DL, MVT::i64));
11286 if (IsParity)
11287 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11288 return AddV;
11289 } else if (VT == MVT::i128) {
11290 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
11291
11292 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
11293 SDValue AddV = DAG.getNode(AArch64ISD::UADDV, DL, MVT::v16i8, CtPop);
11294 AddV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64,
11295 DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v2i64, AddV),
11296 DAG.getConstant(0, DL, MVT::i64));
11297 AddV = DAG.getZExtOrTrunc(AddV, DL, VT);
11298 if (IsParity)
11299 AddV = DAG.getNode(ISD::AND, DL, VT, AddV, DAG.getConstant(1, DL, VT));
11300 return AddV;
11301 }
11302
11303 assert(!IsParity && "ISD::PARITY of vector types not supported");
11304
11305 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
11306 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
11307 "Unexpected type for custom ctpop lowering");
11308
11309 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
11310 Val = DAG.getBitcast(VT8Bit, Val);
11311 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
11312
11313 if (Subtarget->hasDotProd() && VT.getScalarSizeInBits() != 16 &&
11314 VT.getVectorNumElements() >= 2) {
11315 EVT DT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
11316 SDValue Zeros = DAG.getConstant(0, DL, DT);
11317 SDValue Ones = DAG.getConstant(1, DL, VT8Bit);
11318
11319 if (VT == MVT::v2i64) {
11320 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11321 Val = DAG.getNode(AArch64ISD::UADDLP, DL, VT, Val);
11322 } else if (VT == MVT::v2i32) {
11323 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11324 } else if (VT == MVT::v4i32) {
11325 Val = DAG.getNode(AArch64ISD::UDOT, DL, DT, Zeros, Ones, Val);
11326 } else {
11327 llvm_unreachable("Unexpected type for custom ctpop lowering");
11328 }
11329
11330 return Val;
11331 }
11332
11333 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
11334 unsigned EltSize = 8;
11335 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
11336 while (EltSize != VT.getScalarSizeInBits()) {
11337 EltSize *= 2;
11338 NumElts /= 2;
11339 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
11340 Val = DAG.getNode(AArch64ISD::UADDLP, DL, WidenVT, Val);
11341 }
11342
11343 return Val;
11344}
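// Illustrative sketch (not part of the original source): the scalar identities
// the CTPOP/PARITY lowering above relies on. The vector CNT/ADDV pair computes
// a population count, and parity is just its low bit, which is why the PARITY
// paths end with an AND against 1. Hypothetical helpers, exposition only:
static inline unsigned refPopCount64(uint64_t X) {
  unsigned Count = 0;
  for (; X; X &= X - 1) // clears the lowest set bit each iteration
    ++Count;
  return Count;
}
static inline unsigned refParity64(uint64_t X) {
  return refPopCount64(X) & 1; // matches the trailing AND-with-1 above
}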
11345
11346SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
11347 EVT VT = Op.getValueType();
11348 assert(VT.isScalableVector() ||
11349 useSVEForFixedLengthVectorVT(
11350 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
11351
11352 SDLoc DL(Op);
11353 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
11354 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
11355}
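// Illustrative sketch (not part of the original source): LowerCTTZ uses the
// identity cttz(x) == ctlz(bitreverse(x)); reversing the bits moves the lowest
// set bit to the highest position, so a leading-zero count of the reversed
// value counts the trailing zeros of the original. Reference model over a
// byte, with hypothetical helpers for exposition only:
static inline uint8_t refBitReverse8(uint8_t X) {
  uint8_t R = 0;
  for (int I = 0; I < 8; ++I)
    R = (uint8_t)((R << 1) | ((X >> I) & 1));
  return R;
}
static inline unsigned refCttz8(uint8_t X) {
  unsigned N = 0;
  for (uint8_t R = refBitReverse8(X); N < 8 && !((R >> (7 - N)) & 1); ++N)
    ; // counting leading zeros of the bit-reversed value
  return N; // 8 when X == 0
}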
11356
11357SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
11358 SelectionDAG &DAG) const {
11359
11360 EVT VT = Op.getValueType();
11361 SDLoc DL(Op);
11362 unsigned Opcode = Op.getOpcode();
11363 ISD::CondCode CC;
11364 switch (Opcode) {
11365 default:
11366 llvm_unreachable("Wrong instruction");
11367 case ISD::SMAX:
11368 CC = ISD::SETGT;
11369 break;
11370 case ISD::SMIN:
11371 CC = ISD::SETLT;
11372 break;
11373 case ISD::UMAX:
11374 CC = ISD::SETUGT;
11375 break;
11376 case ISD::UMIN:
11377 CC = ISD::SETULT;
11378 break;
11379 }
11380
11381 // Note: This lowering only overrides NEON for v1i64 and v2i64, where we
11382 // prefer using SVE if available.
11383 if (VT.isScalableVector() ||
11384 useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true)) {
11385 switch (Opcode) {
11386 default:
11387 llvm_unreachable("Wrong instruction");
11388 case ISD::SMAX:
11389 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
11390 case ISD::SMIN:
11391 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
11392 case ISD::UMAX:
11393 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
11394 case ISD::UMIN:
11395 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
11396 }
11397 }
11398
11399 SDValue Op0 = Op.getOperand(0);
11400 SDValue Op1 = Op.getOperand(1);
11401 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
11402 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
11403}
11404
11405SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
11406 SelectionDAG &DAG) const {
11407 EVT VT = Op.getValueType();
11408
11409 if (VT.isScalableVector() ||
11410 useSVEForFixedLengthVectorVT(
11411 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
11412 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
11413
11414 SDLoc DL(Op);
11415 SDValue REVB;
11416 MVT VST;
11417
11418 switch (VT.getSimpleVT().SimpleTy) {
11419 default:
11420 llvm_unreachable("Invalid type for bitreverse!");
11421
11422 case MVT::v2i32: {
11423 VST = MVT::v8i8;
11424 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11425
11426 break;
11427 }
11428
11429 case MVT::v4i32: {
11430 VST = MVT::v16i8;
11431 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
11432
11433 break;
11434 }
11435
11436 case MVT::v1i64: {
11437 VST = MVT::v8i8;
11438 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11439
11440 break;
11441 }
11442
11443 case MVT::v2i64: {
11444 VST = MVT::v16i8;
11445 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
11446
11447 break;
11448 }
11449 }
11450
11451 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
11452 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
11453}
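// Illustrative sketch (not part of the original source): the decomposition used
// above. A full bit reversal of a wide lane equals reversing the byte order of
// the lane (REV32/REV64) and then reversing the bits inside each byte (the
// v8i8/v16i8 BITREVERSE); the two steps commute. Reference model for one
// 32-bit lane, hypothetical helper for exposition only:
static inline uint32_t refBitReverse32(uint32_t X) {
  uint32_t R = 0;
  for (int I = 0; I < 32; ++I)
    R = (R << 1) | ((X >> I) & 1);
  return R;
}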
11454
11455 // Check whether N forms a continuous comparison sequence, i.e. a chain of
11455 // ORs whose leaves are XORs.
11456static bool
11457isOrXorChain(SDValue N, unsigned &Num,
11458 SmallVector<std::pair<SDValue, SDValue>, 16> &WorkList) {
11459 if (Num == MaxXors)
11460 return false;
11461
11462 // Skip the one-use zext
11463 if (N->getOpcode() == ISD::ZERO_EXTEND && N->hasOneUse())
11464 N = N->getOperand(0);
11465
11466 // The leaf node must be XOR
11467 if (N->getOpcode() == ISD::XOR) {
11468 WorkList.push_back(std::make_pair(N->getOperand(0), N->getOperand(1)));
11469 Num++;
11470 return true;
11471 }
11472
11473 // All the non-leaf nodes must be OR.
11474 if (N->getOpcode() != ISD::OR || !N->hasOneUse())
11475 return false;
11476
11477 if (isOrXorChain(N->getOperand(0), Num, WorkList) &&
11478 isOrXorChain(N->getOperand(1), Num, WorkList))
11479 return true;
11480 return false;
11481}
11482
11483 // Transform chains of ORs and XORs, which are usually produced by memcmp/bcmp
11483 // expansion.
11484 static SDValue performOrXorChainCombine(SDNode *N, SelectionDAG &DAG) {
11485 SDValue LHS = N->getOperand(0);
11486 SDValue RHS = N->getOperand(1);
11487 SDLoc DL(N);
11488 EVT VT = N->getValueType(0);
11489 SmallVector<std::pair<SDValue, SDValue>, 16> WorkList;
11490
11491 // Only handle integer compares.
11492 if (N->getOpcode() != ISD::SETCC)
11493 return SDValue();
11494
11495 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11496 // Try to express conjunction "cmp 0 (or (xor A0 A1) (xor B0 B1))" as:
11497 // sub A0, A1; ccmp B0, B1, 0, eq; cmp inv(Cond) flag
11498 unsigned NumXors = 0;
11499 if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
11500 LHS->getOpcode() == ISD::OR && LHS->hasOneUse() &&
11501 isOrXorChain(LHS, NumXors, WorkList)) {
11502 SDValue XOR0, XOR1;
11503 std::tie(XOR0, XOR1) = WorkList[0];
11504 unsigned LogicOp = (Cond == ISD::SETEQ) ? ISD::AND : ISD::OR;
11505 SDValue Cmp = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11506 for (unsigned I = 1; I < WorkList.size(); I++) {
11507 std::tie(XOR0, XOR1) = WorkList[I];
11508 SDValue CmpChain = DAG.getSetCC(DL, VT, XOR0, XOR1, Cond);
11509 Cmp = DAG.getNode(LogicOp, DL, VT, Cmp, CmpChain);
11510 }
11511
11512 // Exit early by inverting the condition, which helps reduce indentation.
11513 return Cmp;
11514 }
11515
11516 return SDValue();
11517}
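// Illustrative sketch (not part of the original source): the equivalence the
// OR/XOR chain combine above exploits. For a memcmp/bcmp-style equality test,
// (a0 ^ b0) | (a1 ^ b1) | ... is zero iff every pair of blocks is equal, so
// the chain can be re-expressed as a conjunction of per-block compares and
// later matched to cmp + ccmp. Hypothetical reference helper, exposition only:
static inline bool refBlocksEqual(const uint64_t *A, const uint64_t *B,
                                  unsigned NumBlocks) {
  uint64_t Acc = 0;
  for (unsigned I = 0; I < NumBlocks; ++I)
    Acc |= A[I] ^ B[I]; // one XOR leaf per block, OR-ed into the accumulator
  return Acc == 0;      // equivalent to: A[I] == B[I] for every I
}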
11518
11519SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
11520
11521 if (Op.getValueType().isVector())
11522 return LowerVSETCC(Op, DAG);
11523
11524 bool IsStrict = Op->isStrictFPOpcode();
11525 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
11526 unsigned OpNo = IsStrict ? 1 : 0;
11527 SDValue Chain;
11528 if (IsStrict)
11529 Chain = Op.getOperand(0);
11530 SDValue LHS = Op.getOperand(OpNo + 0);
11531 SDValue RHS = Op.getOperand(OpNo + 1);
11532 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
11533 SDLoc DL(Op);
11534
11535 // We chose ZeroOrOneBooleanContents, so use zero and one.
11536 EVT VT = Op.getValueType();
11537 SDValue TVal = DAG.getConstant(1, DL, VT);
11538 SDValue FVal = DAG.getConstant(0, DL, VT);
11539
11540 // Handle f128 first, since one possible outcome is a normal integer
11541 // comparison which gets picked up by the next if statement.
11542 if (LHS.getValueType() == MVT::f128) {
11543 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS, Chain,
11544 IsSignaling);
11545
11546 // If softenSetCCOperands returned a scalar, use it.
11547 if (!RHS.getNode()) {
11548 assert(LHS.getValueType() == Op.getValueType() &&
11549 "Unexpected setcc expansion!");
11550 return IsStrict ? DAG.getMergeValues({LHS, Chain}, DL) : LHS;
11551 }
11552 }
11553
11554 if (LHS.getValueType().isInteger()) {
11555
11556 simplifySetCCIntoEq(CC, LHS, RHS, DAG, DL);
11557
11558 SDValue CCVal;
11559 SDValue Cmp = getAArch64Cmp(
11560 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, DL);
11561
11562 // Note that we inverted the condition above, so we reverse the order of
11563 // the true and false operands here. This will allow the setcc to be
11564 // matched to a single CSINC instruction.
11565 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CCVal, Cmp);
11566 return IsStrict ? DAG.getMergeValues({Res, Chain}, DL) : Res;
11567 }
11568
11569 // Now we know we're dealing with FP values.
11570 assert(LHS.getValueType() == MVT::bf16 || LHS.getValueType() == MVT::f16 ||
11571 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
11572
11573 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
11574 // and do the comparison.
11575 SDValue Cmp;
11576 if (IsStrict)
11577 Cmp = emitStrictFPComparison(LHS, RHS, DL, DAG, Chain, IsSignaling);
11578 else
11579 Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11580
11581 AArch64CC::CondCode CC1, CC2;
11582 changeFPCCToAArch64CC(CC, CC1, CC2);
11583 SDValue Res;
11584 if (CC2 == AArch64CC::AL) {
11585 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
11586 CC2);
11587 SDValue CC1Val = getCondCode(DAG, CC1);
11588
11589 // Note that we inverted the condition above, so we reverse the order of
11590 // the true and false operands here. This will allow the setcc to be
11591 // matched to a single CSINC instruction.
11592 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, FVal, TVal, CC1Val, Cmp);
11593 } else {
11594 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
11595 // totally clean. Some of them require two CSELs to implement. As is in
11596 // this case, we emit the first CSEL and then emit a second using the output
11597 // of the first as the RHS. We're effectively OR'ing the two CC's together.
11598
11599 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
11600 SDValue CC1Val = getCondCode(DAG, CC1);
11601 SDValue CS1 =
11602 DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
11603
11604 SDValue CC2Val = getCondCode(DAG, CC2);
11605 Res = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
11606 }
11607 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, DL) : Res;
11608}
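// Illustrative note (not part of the original source): the operands are given
// to the CSEL as (0, 1, inv(cc)) so that the node can be matched to the single
// instruction CSINC Wd, WZR, WZR, inv(cc), whose alias is CSET Wd, cc. An
// integer setcc therefore selects to, for example:
//   cmp  w0, w1
//   cset w0, lt        ; alias of csinc w0, wzr, wzr, ge
// (assembly shown only to illustrate the intended single-instruction match).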
11609
11610SDValue AArch64TargetLowering::LowerSETCCCARRY(SDValue Op,
11611 SelectionDAG &DAG) const {
11612
11613 SDValue LHS = Op.getOperand(0);
11614 SDValue RHS = Op.getOperand(1);
11615 EVT VT = LHS.getValueType();
11616 if (VT != MVT::i32 && VT != MVT::i64)
11617 return SDValue();
11618
11619 SDLoc DL(Op);
11620 SDValue Carry = Op.getOperand(2);
11621 // SBCS uses a carry not a borrow so the carry flag should be inverted first.
11622 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
11623 SDValue Cmp = DAG.getNode(AArch64ISD::SBCS, DL, DAG.getVTList(VT, FlagsVT),
11624 LHS, RHS, InvCarry);
11625
11626 EVT OpVT = Op.getValueType();
11627 SDValue TVal = DAG.getConstant(1, DL, OpVT);
11628 SDValue FVal = DAG.getConstant(0, DL, OpVT);
11629
11630 ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(3))->get();
11631 ISD::CondCode CondInv = ISD::getSetCCInverse(Cond, VT);
11632 SDValue CCVal = getCondCode(DAG, changeIntCCToAArch64CC(CondInv));
11633 // Inputs are swapped because the condition is inverted. This will allow
11634 // matching with a single CSINC instruction.
11635 return DAG.getNode(AArch64ISD::CSEL, DL, OpVT, FVal, TVal, CCVal,
11636 Cmp.getValue(1));
11637}
11638
11639/// Emit vector comparison for floating-point values, producing a mask.
11640 static SDValue emitVectorComparison(SDValue LHS, SDValue RHS,
11641 AArch64CC::CondCode CC, bool NoNans, EVT VT,
11642 const SDLoc &DL, SelectionDAG &DAG) {
11643 assert(VT.getSizeInBits() == LHS.getValueType().getSizeInBits() &&
11644 "function only supposed to emit natural comparisons");
11645
11646 switch (CC) {
11647 default:
11648 return SDValue();
11649 case AArch64CC::NE: {
11650 SDValue Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11651 // Use vector semantics for the inversion to potentially save a copy between
11652 // SIMD and regular registers.
11653 if (!LHS.getValueType().isVector()) {
11654 EVT VecVT =
11655 EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11656 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11657 SDValue MaskVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT,
11658 DAG.getUNDEF(VecVT), Fcmeq, Zero);
11659 SDValue InvertedMask = DAG.getNOT(DL, MaskVec, VecVT);
11660 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, InvertedMask, Zero);
11661 }
11662 return DAG.getNOT(DL, Fcmeq, VT);
11663 }
11664 case AArch64CC::EQ:
11665 return DAG.getNode(AArch64ISD::FCMEQ, DL, VT, LHS, RHS);
11666 case AArch64CC::GE:
11667 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, LHS, RHS);
11668 case AArch64CC::GT:
11669 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, LHS, RHS);
11670 case AArch64CC::LE:
11671 if (!NoNans)
11672 return SDValue();
11673 // If we ignore NaNs then we can use the LS implementation.
11674 [[fallthrough]];
11675 case AArch64CC::LS:
11676 return DAG.getNode(AArch64ISD::FCMGE, DL, VT, RHS, LHS);
11677 case AArch64CC::LT:
11678 if (!NoNans)
11679 return SDValue();
11680 // If we ignore NaNs then we can use the MI implementation.
11681 [[fallthrough]];
11682 case AArch64CC::MI:
11683 return DAG.getNode(AArch64ISD::FCMGT, DL, VT, RHS, LHS);
11684 }
11685}
11686
11687/// For SELECT_CC, when the true/false values are (-1, 0) and the compared
11688/// values are scalars, try to emit a mask generating vector instruction.
11689 static SDValue emitFloatCompareMask(SDValue LHS, SDValue RHS, SDValue TVal,
11690 SDValue FVal, ISD::CondCode CC, bool NoNaNs,
11691 const SDLoc &DL, SelectionDAG &DAG) {
11692 assert(!LHS.getValueType().isVector());
11693 assert(!RHS.getValueType().isVector());
11694
11695 auto *CTVal = dyn_cast<ConstantSDNode>(TVal);
11696 auto *CFVal = dyn_cast<ConstantSDNode>(FVal);
11697 if (!CTVal || !CFVal)
11698 return {};
11699 if (!(CTVal->isAllOnes() && CFVal->isZero()) &&
11700 !(CTVal->isZero() && CFVal->isAllOnes()))
11701 return {};
11702
11703 if (CTVal->isZero())
11704 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11705
11706 EVT VT = TVal.getValueType();
11707 if (VT.getSizeInBits() != LHS.getValueType().getSizeInBits())
11708 return {};
11709
11710 if (!NoNaNs && (CC == ISD::SETUO || CC == ISD::SETO)) {
11711 bool OneNaN = false;
11712 if (LHS == RHS) {
11713 OneNaN = true;
11714 } else if (DAG.isKnownNeverNaN(RHS)) {
11715 OneNaN = true;
11716 RHS = LHS;
11717 } else if (DAG.isKnownNeverNaN(LHS)) {
11718 OneNaN = true;
11719 LHS = RHS;
11720 }
11721 if (OneNaN)
11722 CC = (CC == ISD::SETUO) ? ISD::SETUNE : ISD::SETOEQ;
11723 }
11724
11727 bool ShouldInvert = false;
11728 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
11729 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, VT, DL, DAG);
11730 SDValue Cmp2;
11731 if (CC2 != AArch64CC::AL) {
11732 Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, VT, DL, DAG);
11733 if (!Cmp2)
11734 return {};
11735 }
11736 if (!Cmp2 && !ShouldInvert)
11737 return Cmp;
11738
11739 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), VT, 128 / VT.getSizeInBits());
11740 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
11741 Cmp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT), Cmp,
11742 Zero);
11743 if (Cmp2) {
11744 Cmp2 = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT, DAG.getUNDEF(VecVT),
11745 Cmp2, Zero);
11746 Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp, Cmp2);
11747 }
11748 if (ShouldInvert)
11749 Cmp = DAG.getNOT(DL, Cmp, VecVT);
11750 Cmp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cmp, Zero);
11751 return Cmp;
11752}
11753
11754SDValue AArch64TargetLowering::LowerSELECT_CC(
11755 ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal,
11756 iterator_range<SDNode::user_iterator> Users, SDNodeFlags Flags,
11757 const SDLoc &DL, SelectionDAG &DAG) const {
11758 // Handle f128 first, because it will result in a comparison of some RTLIB
11759 // call result against zero.
11760 if (LHS.getValueType() == MVT::f128) {
11761 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, DL, LHS, RHS);
11762
11763 // If softenSetCCOperands returned a scalar, we need to compare the result
11764 // against zero to select between true and false values.
11765 if (!RHS.getNode()) {
11766 RHS = DAG.getConstant(0, DL, LHS.getValueType());
11767 CC = ISD::SETNE;
11768 }
11769 }
11770
11771 // Also handle f16, for which we need to do a f32 comparison.
11772 if ((LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
11773 LHS.getValueType() == MVT::bf16) {
11774 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
11775 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
11776 }
11777
11778 // Next, handle integers.
11779 if (LHS.getValueType().isInteger()) {
11780 assert((LHS.getValueType() == RHS.getValueType()) &&
11781 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
11782
11783 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
11784 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
11785 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
11786
11787 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
11788 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
11789 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
11790 // Both require less instructions than compare and conditional select.
11791 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TVal &&
11792 RHSC && RHSC->isZero() && CFVal && CFVal->isZero() &&
11793 LHS.getValueType() == RHS.getValueType()) {
11794 EVT VT = LHS.getValueType();
11795 SDValue Shift =
11796 DAG.getNode(ISD::SRA, DL, VT, LHS,
11797 DAG.getConstant(VT.getSizeInBits() - 1, DL, VT));
11798
11799 if (CC == ISD::SETGT)
11800 Shift = DAG.getNOT(DL, Shift, VT);
11801
11802 return DAG.getNode(ISD::AND, DL, VT, LHS, Shift);
11803 }
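    // Worked example (illustrative, not from the original source): for i32 and
    // lhs = -5, (lhs >> 31) is all ones, so lhs & ~(lhs >> 31) == 0 == smax(-5, 0)
    // and lhs & (lhs >> 31) == -5 == smin(-5, 0); for lhs = 7 the shift is zero
    // and the two expressions give 7 and 0 respectively.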
11804
11805 // Check for sign bit test patterns that can use TST optimization.
11806 // (SELECT_CC setlt, sign_extend_inreg, 0, tval, fval)
11807 // -> TST %operand, sign_bit; CSEL
11808 // (SELECT_CC setlt, sign_extend, 0, tval, fval)
11809 // -> TST %operand, sign_bit; CSEL
11810 if (CC == ISD::SETLT && RHSC && RHSC->isZero() && LHS.hasOneUse() &&
11811 (LHS.getOpcode() == ISD::SIGN_EXTEND_INREG ||
11812 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11813
11814 uint64_t SignBitPos;
11815 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
11816 EVT TestVT = LHS.getValueType();
11817 SDValue SignBitConst = DAG.getConstant(1ULL << SignBitPos, DL, TestVT);
11818 SDValue TST =
11819 DAG.getNode(AArch64ISD::ANDS, DL, DAG.getVTList(TestVT, MVT::i32),
11820 LHS, SignBitConst);
11821
11822 SDValue Flags = TST.getValue(1);
11823 return DAG.getNode(AArch64ISD::CSEL, DL, TVal.getValueType(), TVal, FVal,
11824 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), Flags);
11825 }
11826
11827 // Canonicalise absolute difference patterns:
11828 // select_cc lhs, rhs, sub(lhs, rhs), sub(rhs, lhs), cc ->
11829 // select_cc lhs, rhs, sub(lhs, rhs), neg(sub(lhs, rhs)), cc
11830 //
11831 // select_cc lhs, rhs, sub(rhs, lhs), sub(lhs, rhs), cc ->
11832 // select_cc lhs, rhs, neg(sub(lhs, rhs)), sub(lhs, rhs), cc
11833 // The second forms can be matched into subs+cneg.
11834 // NOTE: Drop poison generating flags from the negated operand to avoid
11835 // inadvertently propagating poison after the canonicalisation.
11836 if (TVal.getOpcode() == ISD::SUB && FVal.getOpcode() == ISD::SUB) {
11837 if (TVal.getOperand(0) == LHS && TVal.getOperand(1) == RHS &&
11838 FVal.getOperand(0) == RHS && FVal.getOperand(1) == LHS) {
11840 FVal = DAG.getNegative(TVal, DL, TVal.getValueType());
11841 } else if (TVal.getOperand(0) == RHS && TVal.getOperand(1) == LHS &&
11842 FVal.getOperand(0) == LHS && FVal.getOperand(1) == RHS) {
11844 TVal = DAG.getNegative(FVal, DL, FVal.getValueType());
11845 }
11846 }
11847
11848 unsigned Opcode = AArch64ISD::CSEL;
11849
11850 // If both the TVal and the FVal are constants, see if we can swap them in
11851 // order to form a CSINV or CSINC out of them.
11852 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
11853 std::swap(TVal, FVal);
11854 std::swap(CTVal, CFVal);
11855 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11856 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
11857 std::swap(TVal, FVal);
11858 std::swap(CTVal, CFVal);
11859 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11860 } else if (TVal.getOpcode() == ISD::XOR) {
11861 // If TVal is a NOT we want to swap TVal and FVal so that we can match
11862 // with a CSINV rather than a CSEL.
11863 if (isAllOnesConstant(TVal.getOperand(1))) {
11864 std::swap(TVal, FVal);
11865 std::swap(CTVal, CFVal);
11866 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11867 }
11868 } else if (TVal.getOpcode() == ISD::SUB) {
11869 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
11870 // that we can match with a CSNEG rather than a CSEL.
11871 if (isNullConstant(TVal.getOperand(0))) {
11872 std::swap(TVal, FVal);
11873 std::swap(CTVal, CFVal);
11874 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11875 }
11876 } else if (CTVal && CFVal) {
11877 const int64_t TrueVal = CTVal->getSExtValue();
11878 const int64_t FalseVal = CFVal->getSExtValue();
11879 bool Swap = false;
11880
11881 // If both TVal and FVal are constants, see if FVal is the
11882 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
11883 // instead of a CSEL in that case.
11884 if (TrueVal == ~FalseVal) {
11885 Opcode = AArch64ISD::CSINV;
11886 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
11887 TrueVal == -FalseVal) {
11888 Opcode = AArch64ISD::CSNEG;
11889 } else if (TVal.getValueType() == MVT::i32) {
11890 // If our operands are only 32-bit wide, make sure we use 32-bit
11891 // arithmetic for the check whether we can use CSINC. This ensures that
11892 // the addition in the check will wrap around properly in case there is
11893 // an overflow (which would not be the case if we do the check with
11894 // 64-bit arithmetic).
11895 const uint32_t TrueVal32 = CTVal->getZExtValue();
11896 const uint32_t FalseVal32 = CFVal->getZExtValue();
11897
11898 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
11899 Opcode = AArch64ISD::CSINC;
11900
11901 if (TrueVal32 > FalseVal32) {
11902 Swap = true;
11903 }
11904 }
11905 } else {
11906 // 64-bit check whether we can use CSINC.
11907 const uint64_t TrueVal64 = TrueVal;
11908 const uint64_t FalseVal64 = FalseVal;
11909
11910 if ((TrueVal64 == FalseVal64 + 1) || (TrueVal64 + 1 == FalseVal64)) {
11911 Opcode = AArch64ISD::CSINC;
11912
11913 if (TrueVal > FalseVal) {
11914 Swap = true;
11915 }
11916 }
11917 }
11918
11919 // Swap TVal and FVal if necessary.
11920 if (Swap) {
11921 std::swap(TVal, FVal);
11922 std::swap(CTVal, CFVal);
11923 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
11924 }
11925
11926 if (Opcode != AArch64ISD::CSEL) {
11927 // Drop FVal since we can get its value by simply inverting/negating
11928 // TVal.
11929 FVal = TVal;
11930 }
11931 }
11932
11933 // Avoid materializing a constant when possible by reusing a known value in
11934 // a register. However, don't perform this optimization if the known value
11935 // is one, zero or negative one in the case of a CSEL. We can always
11936 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
11937 // FVal, respectively.
11938 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
11939 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
11940 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
11941 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11942 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
11943 // "a != C ? x : a" to avoid materializing C.
11944 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
11945 TVal = LHS;
11946 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
11947 FVal = LHS;
11948 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
11949 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
11950 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
11951 // avoid materializing C.
11952 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
11953 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
11954 Opcode = AArch64ISD::CSINV;
11955 TVal = LHS;
11956 FVal = DAG.getConstant(0, DL, FVal.getValueType());
11957 }
11958 }
11959
11960 SDValue CCVal;
11961 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, DL);
11962 EVT VT = TVal.getValueType();
11963 return DAG.getNode(Opcode, DL, VT, TVal, FVal, CCVal, Cmp);
11964 }
11965
11966 // Now we know we're dealing with FP values.
11967 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
11968 LHS.getValueType() == MVT::f64);
11969 assert(LHS.getValueType() == RHS.getValueType());
11970 EVT VT = TVal.getValueType();
11971
11972 // If the purpose of the comparison is to select between all ones
11973 // or all zeros, try to use a vector comparison because the operands are
11974 // already stored in SIMD registers.
11975 if (Subtarget->isNeonAvailable() && all_of(Users, [](const SDNode *U) {
11976 switch (U->getOpcode()) {
11977 default:
11978 return false;
11981 case AArch64ISD::DUP:
11982 return true;
11983 }
11984 })) {
11985 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Flags.hasNoNaNs();
11986 SDValue VectorCmp =
11987 emitFloatCompareMask(LHS, RHS, TVal, FVal, CC, NoNaNs, DL, DAG);
11988 if (VectorCmp)
11989 return VectorCmp;
11990 }
11991
11992 SDValue Cmp = emitComparison(LHS, RHS, CC, DL, DAG);
11993
11994 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
11995 // clean. Some of them require two CSELs to implement.
11996 AArch64CC::CondCode CC1, CC2;
11997 changeFPCCToAArch64CC(CC, CC1, CC2);
11998
11999 if (Flags.hasNoSignedZeros()) {
12000 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
12001 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
12002 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
12003 if (RHSVal && RHSVal->isZero()) {
12004 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
12005 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
12006
12007 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
12008 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
12009 TVal = LHS;
12010 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
12011 CFVal && CFVal->isZero() &&
12012 FVal.getValueType() == LHS.getValueType())
12013 FVal = LHS;
12014 }
12015 }
12016
12017 // Emit first, and possibly only, CSEL.
12018 SDValue CC1Val = getCondCode(DAG, CC1);
12019 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, FVal, CC1Val, Cmp);
12020
12021 // If we need a second CSEL, emit it, using the output of the first as the
12022 // RHS. We're effectively OR'ing the two CC's together.
12023 if (CC2 != AArch64CC::AL) {
12024 SDValue CC2Val = getCondCode(DAG, CC2);
12025 return DAG.getNode(AArch64ISD::CSEL, DL, VT, TVal, CS1, CC2Val, Cmp);
12026 }
12027
12028 // Otherwise, return the output of the first CSEL.
12029 return CS1;
12030}
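// Illustrative sketch (not part of the original source): the constant
// relationships checked in the integer path above when a CSEL of two
// immediates can be strengthened to CSINV, CSNEG or CSINC. The real code also
// handles the 32-bit wrap-around case separately; this hypothetical helper
// only mirrors the 64-bit checks, for exposition:
static inline bool refFoldsToSingleCondSelect(int64_t TrueVal, int64_t FalseVal) {
  return TrueVal == ~FalseVal ||                         // CSINV: bitwise NOT
         (FalseVal > std::numeric_limits<int64_t>::min() &&
          TrueVal == -FalseVal) ||                       // CSNEG: negation
         (uint64_t)TrueVal == (uint64_t)FalseVal + 1 ||  // CSINC: differ by one
         (uint64_t)TrueVal + 1 == (uint64_t)FalseVal;
}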
12031
12032SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
12033 SelectionDAG &DAG) const {
12034 EVT Ty = Op.getValueType();
12035 auto Idx = Op.getConstantOperandAPInt(2);
12036 int64_t IdxVal = Idx.getSExtValue();
12037 assert(Ty.isScalableVector() &&
12038 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
12039
12040 // We can use the splice instruction for certain index values where we are
12041 // able to efficiently generate the correct predicate. The index will be
12042 // inverted and used directly as the input to the ptrue instruction, i.e.
12043 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
12044 // splice predicate. However, we can only do this if we can guarantee that
12045 // there are enough elements in the vector, hence we check the index <= min
12046 // number of elements.
12047 std::optional<unsigned> PredPattern;
12048 if (Ty.isScalableVector() && IdxVal < 0 &&
12049 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
12050 std::nullopt) {
12051 SDLoc DL(Op);
12052
12053 // Create a predicate where all but the last -IdxVal elements are false.
12054 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
12055 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
12056 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
12057
12058 // Now splice the two inputs together using the predicate.
12059 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
12060 Op.getOperand(1));
12061 }
12062
12063 // We can select to an EXT instruction when indexing the first 256 bytes.
12064 unsigned BlockSize = AArch64::SVEBitsPerBlock / Ty.getVectorMinNumElements();
12065 if (IdxVal >= 0 && (IdxVal * BlockSize / 8) < 256)
12066 return Op;
12067
12068 return SDValue();
12069}
12070
12071SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
12072 SelectionDAG &DAG) const {
12073 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
12074 SDValue LHS = Op.getOperand(0);
12075 SDValue RHS = Op.getOperand(1);
12076 SDValue TVal = Op.getOperand(2);
12077 SDValue FVal = Op.getOperand(3);
12078 SDNodeFlags Flags = Op->getFlags();
12079 SDLoc DL(Op);
12080 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(), Flags, DL, DAG);
12081}
12082
12083SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
12084 SelectionDAG &DAG) const {
12085 SDValue CCVal = Op->getOperand(0);
12086 SDValue TVal = Op->getOperand(1);
12087 SDValue FVal = Op->getOperand(2);
12088 SDLoc DL(Op);
12089
12090 EVT Ty = Op.getValueType();
12091 if (Ty == MVT::aarch64svcount) {
12092 TVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, TVal);
12093 FVal = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i1, FVal);
12094 SDValue Sel =
12095 DAG.getNode(ISD::SELECT, DL, MVT::nxv16i1, CCVal, TVal, FVal);
12096 return DAG.getNode(ISD::BITCAST, DL, Ty, Sel);
12097 }
12098
12099 if (Ty.isScalableVector()) {
12100 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
12101 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, CCVal);
12102 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12103 }
12104
12105 if (useSVEForFixedLengthVectorVT(Ty, !Subtarget->isNeonAvailable())) {
12106 // FIXME: Ideally this would be the same as above using i1 types, however
12107 // for the moment we can't deal with fixed i1 vector types properly, so
12108 // instead extend the predicate to a result type sized integer vector.
12109 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
12110 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
12111 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
12112 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
12113 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
12114 }
12115
12116 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
12117 // instruction.
12118 if (ISD::isOverflowIntrOpRes(CCVal)) {
12119 // Only lower legal XALUO ops.
12120 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
12121 return SDValue();
12122
12123 AArch64CC::CondCode OFCC;
12124 SDValue Value, Overflow;
12125 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
12126 SDValue CCVal = getCondCode(DAG, OFCC);
12127
12128 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
12129 CCVal, Overflow);
12130 }
12131
12132 // Lower it the same way as we would lower a SELECT_CC node.
12133 ISD::CondCode CC;
12134 SDValue LHS, RHS;
12135 if (CCVal.getOpcode() == ISD::SETCC) {
12136 LHS = CCVal.getOperand(0);
12137 RHS = CCVal.getOperand(1);
12138 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
12139 } else {
12140 LHS = CCVal;
12141 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
12142 CC = ISD::SETNE;
12143 }
12144
12145 // If we are lowering an f16 or bf16 and we do not have full fp16 support,
12146 // convert to an f32 in order to use FCSELSrrr.
12147 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12148 TVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12149 DAG.getUNDEF(MVT::f32), TVal);
12150 FVal = DAG.getTargetInsertSubreg(AArch64::hsub, DL, MVT::f32,
12151 DAG.getUNDEF(MVT::f32), FVal);
12152 }
12153
12154 SDValue Res = LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, Op->users(),
12155 Op->getFlags(), DL, DAG);
12156
12157 if ((Ty == MVT::f16 || Ty == MVT::bf16) && !Subtarget->hasFullFP16()) {
12158 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, Ty, Res);
12159 }
12160
12161 return Res;
12162}
12163
12164SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
12165 SelectionDAG &DAG) const {
12166 // Jump table entries as PC relative offsets. No additional tweaking
12167 // is necessary here. Just get the address of the jump table.
12168 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
12169
12170 CodeModel::Model CM = getTargetMachine().getCodeModel();
12171 if (CM == CodeModel::Large && !getTargetMachine().isPositionIndependent() &&
12172 !Subtarget->isTargetMachO())
12173 return getAddrLarge(JT, DAG);
12174 if (CM == CodeModel::Tiny)
12175 return getAddrTiny(JT, DAG);
12176 return getAddr(JT, DAG);
12177}
12178
12179SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
12180 SelectionDAG &DAG) const {
12181 // Jump table entries as PC relative offsets. No additional tweaking
12182 // is necessary here. Just get the address of the jump table.
12183 SDLoc DL(Op);
12184 SDValue JT = Op.getOperand(1);
12185 SDValue Entry = Op.getOperand(2);
12186 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
12187
12188 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12189 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
12190
12191 // With aarch64-jump-table-hardening, we only expand the jump table dispatch
12192 // sequence later, to guarantee the integrity of the intermediate values.
12193 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12194 "aarch64-jump-table-hardening")) {
12195 CodeModel::Model CM = getTargetMachine().getCodeModel();
12196 if (Subtarget->isTargetMachO()) {
12197 if (CM != CodeModel::Small && CM != CodeModel::Large)
12198 report_fatal_error("Unsupported code-model for hardened jump-table");
12199 } else {
12200 // Note that COFF support would likely also need JUMP_TABLE_DEBUG_INFO.
12201 assert(Subtarget->isTargetELF() &&
12202 "jump table hardening only supported on MachO/ELF");
12203 if (CM != CodeModel::Small)
12204 report_fatal_error("Unsupported code-model for hardened jump-table");
12205 }
12206
12207 SDValue X16Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X16,
12208 Entry, SDValue());
12209 SDNode *B = DAG.getMachineNode(AArch64::BR_JumpTable, DL, MVT::Other,
12210 DAG.getTargetJumpTable(JTI, MVT::i32),
12211 X16Copy.getValue(0), X16Copy.getValue(1));
12212 return SDValue(B, 0);
12213 }
12214
12215 SDNode *Dest =
12216 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
12217 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
12218 SDValue JTInfo = DAG.getJumpTableDebugInfo(JTI, Op.getOperand(0), DL);
12219 return DAG.getNode(ISD::BRIND, DL, MVT::Other, JTInfo, SDValue(Dest, 0));
12220}
12221
12222SDValue AArch64TargetLowering::LowerBRIND(SDValue Op, SelectionDAG &DAG) const {
12223 SDValue Chain = Op.getOperand(0);
12224 SDValue Dest = Op.getOperand(1);
12225
12226 // BR_JT is lowered to BRIND, but the latter lowering is specific to indirectbr.
12227 // Skip over the jump-table BRINDs, where the destination is JumpTableDest32.
12228 if (Dest->isMachineOpcode() &&
12229 Dest->getMachineOpcode() == AArch64::JumpTableDest32)
12230 return SDValue();
12231
12232 const MachineFunction &MF = DAG.getMachineFunction();
12233 std::optional<uint16_t> BADisc =
12234 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(MF.getFunction());
12235 if (!BADisc)
12236 return SDValue();
12237
12238 SDLoc DL(Op);
12239
12240 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12241 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12242 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12243
12244 SDNode *BrA = DAG.getMachineNode(AArch64::BRA, DL, MVT::Other,
12245 {Dest, Key, Disc, AddrDisc, Chain});
12246 return SDValue(BrA, 0);
12247}
12248
12249SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
12250 SelectionDAG &DAG) const {
12251 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
12252 CodeModel::Model CM = getTargetMachine().getCodeModel();
12253 if (CM == CodeModel::Large) {
12254 // Use the GOT for the large code model on iOS.
12255 if (Subtarget->isTargetMachO()) {
12256 return getGOT(CP, DAG);
12257 }
12258 if (!getTargetMachine().isPositionIndependent())
12259 return getAddrLarge(CP, DAG);
12260 } else if (CM == CodeModel::Tiny) {
12261 return getAddrTiny(CP, DAG);
12262 }
12263 return getAddr(CP, DAG);
12264}
12265
12266SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
12267 SelectionDAG &DAG) const {
12268 BlockAddressSDNode *BAN = cast<BlockAddressSDNode>(Op);
12269 const BlockAddress *BA = BAN->getBlockAddress();
12270
12271 if (std::optional<uint16_t> BADisc =
12272 Subtarget->getPtrAuthBlockAddressDiscriminatorIfEnabled(
12273 *BA->getFunction())) {
12274 SDLoc DL(Op);
12275
12276 // This isn't cheap, but BRIND is rare.
12277 SDValue TargetBA = DAG.getTargetBlockAddress(BA, BAN->getValueType(0));
12278
12279 SDValue Disc = DAG.getTargetConstant(*BADisc, DL, MVT::i64);
12280
12281 SDValue Key = DAG.getTargetConstant(AArch64PACKey::IA, DL, MVT::i32);
12282 SDValue AddrDisc = DAG.getRegister(AArch64::XZR, MVT::i64);
12283
12284 SDNode *MOV =
12285 DAG.getMachineNode(AArch64::MOVaddrPAC, DL, {MVT::Other, MVT::Glue},
12286 {TargetBA, Key, AddrDisc, Disc});
12287 return DAG.getCopyFromReg(SDValue(MOV, 0), DL, AArch64::X16, MVT::i64,
12288 SDValue(MOV, 1));
12289 }
12290
12291 CodeModel::Model CM = getTargetMachine().getCodeModel();
12292 if (CM == CodeModel::Large && !Subtarget->isTargetMachO()) {
12293 if (!getTargetMachine().isPositionIndependent())
12294 return getAddrLarge(BAN, DAG);
12295 } else if (CM == CodeModel::Tiny) {
12296 return getAddrTiny(BAN, DAG);
12297 }
12298 return getAddr(BAN, DAG);
12299}
12300
12301SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
12302 SelectionDAG &DAG) const {
12303 AArch64FunctionInfo *FuncInfo =
12304 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
12305
12306 SDLoc DL(Op);
12307 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
12308 getPointerTy(DAG.getDataLayout()));
12309 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
12310 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12311 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12312 MachinePointerInfo(SV));
12313}
12314
12315SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
12316 SelectionDAG &DAG) const {
12317 MachineFunction &MF = DAG.getMachineFunction();
12318 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12319
12320 SDLoc DL(Op);
12321 SDValue FR;
12322 if (Subtarget->isWindowsArm64EC()) {
12323 // With the Arm64EC ABI, we compute the address of the varargs save area
12324 // relative to x4. For a normal AArch64->AArch64 call, x4 == sp on entry,
12325 // but calls from an entry thunk can pass in a different address.
12326 Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass);
12327 SDValue Val = DAG.getCopyFromReg(DAG.getEntryNode(), DL, VReg, MVT::i64);
12328 uint64_t StackOffset;
12329 if (FuncInfo->getVarArgsGPRSize() > 0)
12330 StackOffset = -(uint64_t)FuncInfo->getVarArgsGPRSize();
12331 else
12332 StackOffset = FuncInfo->getVarArgsStackOffset();
12333 FR = DAG.getNode(ISD::ADD, DL, MVT::i64, Val,
12334 DAG.getConstant(StackOffset, DL, MVT::i64));
12335 } else {
12336 FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
12337 ? FuncInfo->getVarArgsGPRIndex()
12338 : FuncInfo->getVarArgsStackIndex(),
12339 getPointerTy(DAG.getDataLayout()));
12340 }
12341 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12342 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
12343 MachinePointerInfo(SV));
12344}
12345
12346SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
12347 SelectionDAG &DAG) const {
12348 // The layout of the va_list struct is specified in the AArch64 Procedure Call
12349 // Standard, section B.3.
12350 MachineFunction &MF = DAG.getMachineFunction();
12351 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
12352 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12353 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12354 auto PtrVT = getPointerTy(DAG.getDataLayout());
12355 SDLoc DL(Op);
12356
12357 SDValue Chain = Op.getOperand(0);
12358 SDValue VAList = Op.getOperand(1);
12359 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12360 SmallVector<SDValue, 8> MemOps;
12361
12362 // void *__stack at offset 0
12363 unsigned Offset = 0;
12364 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
12365 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
12366 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
12367 MachinePointerInfo(SV), Align(PtrSize)));
12368
12369 // void *__gr_top at offset 8 (4 on ILP32)
12370 Offset += PtrSize;
12371 int GPRSize = FuncInfo->getVarArgsGPRSize();
12372 if (GPRSize > 0) {
12373 SDValue GRTop, GRTopAddr;
12374
12375 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12376 DAG.getConstant(Offset, DL, PtrVT));
12377
12378 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
12379 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
12380 DAG.getSignedConstant(GPRSize, DL, PtrVT));
12381 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
12382
12383 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
12384 MachinePointerInfo(SV, Offset),
12385 Align(PtrSize)));
12386 }
12387
12388 // void *__vr_top at offset 16 (8 on ILP32)
12389 Offset += PtrSize;
12390 int FPRSize = FuncInfo->getVarArgsFPRSize();
12391 if (FPRSize > 0) {
12392 SDValue VRTop, VRTopAddr;
12393 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12394 DAG.getConstant(Offset, DL, PtrVT));
12395
12396 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
12397 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
12398 DAG.getSignedConstant(FPRSize, DL, PtrVT));
12399 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
12400
12401 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
12402 MachinePointerInfo(SV, Offset),
12403 Align(PtrSize)));
12404 }
12405
12406 // int __gr_offs at offset 24 (12 on ILP32)
12407 Offset += PtrSize;
12408 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12409 DAG.getConstant(Offset, DL, PtrVT));
12410 MemOps.push_back(
12411 DAG.getStore(Chain, DL, DAG.getSignedConstant(-GPRSize, DL, MVT::i32),
12412 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12413
12414 // int __vr_offs at offset 28 (16 on ILP32)
12415 Offset += 4;
12416 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12417 DAG.getConstant(Offset, DL, PtrVT));
12418 MemOps.push_back(
12419 DAG.getStore(Chain, DL, DAG.getSignedConstant(-FPRSize, DL, MVT::i32),
12420 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
12421
12422 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
12423}
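// Illustrative reference (not part of the original source): the va_list record
// that the stores built above populate, following AAPCS64 section B.3. The
// parenthesised offsets are the ILP32 variant written by this function.
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4)
//     void *__vr_top;  // offset 16 (8)
//     int   __gr_offs; // offset 24 (12)
//     int   __vr_offs; // offset 28 (16)
//   };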
12424
12425SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
12426 SelectionDAG &DAG) const {
12427 MachineFunction &MF = DAG.getMachineFunction();
12428 Function &F = MF.getFunction();
12429
12430 if (Subtarget->isCallingConvWin64(F.getCallingConv(), F.isVarArg()))
12431 return LowerWin64_VASTART(Op, DAG);
12432 else if (Subtarget->isTargetDarwin())
12433 return LowerDarwin_VASTART(Op, DAG);
12434 else
12435 return LowerAAPCS_VASTART(Op, DAG);
12436}
12437
12438SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
12439 SelectionDAG &DAG) const {
12440 // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single
12441 // pointer.
12442 SDLoc DL(Op);
12443 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
12444 unsigned VaListSize =
12445 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
12446 ? PtrSize
12447 : Subtarget->isTargetILP32() ? 20 : 32;
12448 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
12449 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
12450
12451 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
12452 DAG.getConstant(VaListSize, DL, MVT::i32),
12453 Align(PtrSize), false, false, /*CI=*/nullptr,
12454 std::nullopt, MachinePointerInfo(DestSV),
12455 MachinePointerInfo(SrcSV));
12456}
12457
12458SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
12459 assert(Subtarget->isTargetDarwin() &&
12460 "automatic va_arg instruction only works on Darwin");
12461
12462 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
12463 EVT VT = Op.getValueType();
12464 SDLoc DL(Op);
12465 SDValue Chain = Op.getOperand(0);
12466 SDValue Addr = Op.getOperand(1);
12467 MaybeAlign Align(Op.getConstantOperandVal(3));
12468 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
12469 auto PtrVT = getPointerTy(DAG.getDataLayout());
12470 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
12471 SDValue VAList =
12472 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
12473 Chain = VAList.getValue(1);
12474 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
12475
12476 if (VT.isScalableVector())
12477 report_fatal_error("Passing SVE types to variadic functions is "
12478 "currently not supported");
12479
12480 if (Align && *Align > MinSlotSize) {
12481 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12482 DAG.getConstant(Align->value() - 1, DL, PtrVT));
12483 VAList =
12484 DAG.getNode(ISD::AND, DL, PtrVT, VAList,
12485 DAG.getSignedConstant(-(int64_t)Align->value(), DL, PtrVT));
12486 }
12487
12488 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
12489 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
12490
12491 // Scalar integer and FP values smaller than 64 bits are implicitly extended
12492 // up to 64 bits. At the very least, we have to increase the striding of the
12493 // vaargs list to match this, and for FP values we need to introduce
12494 // FP_ROUND nodes as well.
12495 if (VT.isInteger() && !VT.isVector())
12496 ArgSize = std::max(ArgSize, MinSlotSize);
12497 bool NeedFPTrunc = false;
12498 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
12499 ArgSize = 8;
12500 NeedFPTrunc = true;
12501 }
12502
12503 // Increment the pointer, VAList, to the next vaarg
12504 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
12505 DAG.getConstant(ArgSize, DL, PtrVT));
12506 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
12507
12508 // Store the incremented VAList to the legalized pointer
12509 SDValue APStore =
12510 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
12511
12512 // Load the actual argument out of the pointer VAList
12513 if (NeedFPTrunc) {
12514 // Load the value as an f64.
12515 SDValue WideFP =
12516 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
12517 // Round the value down to an f32.
12518 SDValue NarrowFP =
12519 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
12520 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
12521 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
12522 // Merge the rounded value with the chain output of the load.
12523 return DAG.getMergeValues(Ops, DL);
12524 }
12525
12526 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
12527}
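// Illustrative sketch (not part of the original source): the pointer round-up
// performed above when a va_arg slot is over-aligned, modelled on plain
// integers. Hypothetical helper, exposition only:
static inline uint64_t refAlignUp(uint64_t Ptr, uint64_t Alignment) {
  // Assumes Alignment is a power of two: (Ptr + Alignment - 1) & -Alignment
  // rounds Ptr up to the next multiple, matching the ADD/AND pair emitted.
  return (Ptr + Alignment - 1) & ~(Alignment - 1);
}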
12528
12529SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
12530 SelectionDAG &DAG) const {
12531 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12532 MFI.setFrameAddressIsTaken(true);
12533
12534 EVT VT = Op.getValueType();
12535 SDLoc DL(Op);
12536 unsigned Depth = Op.getConstantOperandVal(0);
12537 SDValue FrameAddr =
12538 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
12539 while (Depth--)
12540 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
12541 MachinePointerInfo());
12542
12543 if (Subtarget->isTargetILP32())
12544 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
12545 DAG.getValueType(VT));
12546
12547 return FrameAddr;
12548}
12549
12550SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
12551 SelectionDAG &DAG) const {
12552 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
12553
12554 EVT VT = getPointerTy(DAG.getDataLayout());
12555 int FI = MFI.CreateFixedObject(4, 0, false);
12556 return DAG.getFrameIndex(FI, VT);
12557}
12558
12559#define GET_REGISTER_MATCHER
12560#include "AArch64GenAsmMatcher.inc"
12561
12562// FIXME? Maybe this could be a TableGen attribute on some registers and
12563// this table could be generated automatically from RegInfo.
12564Register AArch64TargetLowering::
12565getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
12566 Register Reg = MatchRegisterName(RegName);
12567 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
12568 const AArch64RegisterInfo *MRI = Subtarget->getRegisterInfo();
12569 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
12570 if (!Subtarget->isXRegisterReserved(DwarfRegNum) &&
12571 !MRI->isReservedReg(MF, Reg))
12572 Reg = Register();
12573 }
12574 return Reg;
12575}
12576
12577SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
12578 SelectionDAG &DAG) const {
12579 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
12580
12581 EVT VT = Op.getValueType();
12582 SDLoc DL(Op);
12583
12584 SDValue FrameAddr =
12585 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
12586 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12587
12588 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
12589}
12590
12591SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
12592 SelectionDAG &DAG) const {
12593 MachineFunction &MF = DAG.getMachineFunction();
12594 MachineFrameInfo &MFI = MF.getFrameInfo();
12595 MFI.setReturnAddressIsTaken(true);
12596
12597 EVT VT = Op.getValueType();
12598 SDLoc DL(Op);
12599 unsigned Depth = Op.getConstantOperandVal(0);
12600 SDValue ReturnAddress;
12601 if (Depth) {
12602 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
12603 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
12604 ReturnAddress = DAG.getLoad(
12605 VT, DL, DAG.getEntryNode(),
12606 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
12607 } else {
12608 // Return LR, which contains the return address. Mark it an implicit
12609 // live-in.
12610 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
12611 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
12612 }
12613
12614 // The XPACLRI instruction assembles to a hint-space instruction before
12615 // Armv8.3-A; therefore this instruction can be safely used for any
12616 // pre-Armv8.3-A architecture. On Armv8.3-A and onwards XPACI is available, so use
12617 // that instead.
12618 SDNode *St;
12619 if (Subtarget->hasPAuth()) {
12620 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
12621 } else {
12622 // XPACLRI operates on LR therefore we must move the operand accordingly.
12623 SDValue Chain =
12624 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
12625 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
12626 }
12627 return SDValue(St, 0);
12628}
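// Note that when pointer authentication is enabled the value live in LR may
// carry a PAC in its upper bits; the XPACI/XPACLRI above strips it so that
// __builtin_return_address(0) yields a canonical code address.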
12629
12630/// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which returns two
12631/// i32 values and take a 2 x i32 value to shift plus a shift amount.
12632SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
12633 SelectionDAG &DAG) const {
12634 SDValue Lo, Hi;
12635 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
12636 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
12637}
12638
12639 bool AArch64TargetLowering::isOffsetFoldingLegal(
12640 const GlobalAddressSDNode *GA) const {
12641 // Offsets are folded in the DAG combine rather than here so that we can
12642 // intelligently choose an offset based on the uses.
12643 return false;
12644}
12645
12646 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
12647 bool OptForSize) const {
12648 bool IsLegal = false;
12649 // We can materialize #0.0 as fmov $Rd, XZR for 64-bit, 32-bit cases, and
12650 // 16-bit case when target has full fp16 support.
12651 // We encode bf16 bit patterns as if they were fp16. This results in very
12652 // strange looking assembly but should populate the register with appropriate
12653 // values. Let's say we wanted to encode 0xR3FC0 which is 1.5 in BF16. We will
12654 // end up encoding this as the imm8 0x7f. This imm8 will be expanded to the
12655 // FP16 1.9375 which shares the same bit pattern as BF16 1.5.
12656 // FIXME: We should be able to handle f128 as well with a clever lowering.
12657 const APInt ImmInt = Imm.bitcastToAPInt();
12658 if (VT == MVT::f64)
12659 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
12660 else if (VT == MVT::f32)
12661 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
12662 else if (VT == MVT::f16 || VT == MVT::bf16)
12663 IsLegal =
12664 (Subtarget->hasFullFP16() && AArch64_AM::getFP16Imm(ImmInt) != -1) ||
12665 Imm.isPosZero();
12666
12667 // If we cannot materialize the value in the fmov immediate field, check if
12668 // it can be encoded as the immediate operand of a logical instruction.
12669 // The immediate value will be created with either MOVZ, MOVN, or ORR.
12670 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
12671 // generate that fmov.
12672 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
12673 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
12674 // however the mov+fmov sequence is always better because of the reduced
12675 // cache pressure. The timings are still the same if you consider
12676 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
12677 // movw+movk is fused). So we limit to at most 2 instructions.
12678 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
12679 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(), Insn);
12680 assert(Insn.size() <= 4 &&
12681 "Should be able to build any value with at most 4 moves");
12682 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 4 : 2));
12683 IsLegal = Insn.size() <= Limit;
12684 }
12685
12686 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT
12687 << " imm value: "; Imm.dump(););
12688 return IsLegal;
12689}
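// Worked example for f32: 1.0f and -0.5f fit the 8-bit FMOV immediate
// encoding, so they are legal with a single instruction. 0.1f (0x3DCCCCCD)
// does not, but its bit pattern expands to MOVZ+MOVK+FMOV; the two integer
// moves are within the default limit of 2, so it is still reported legal
// unless optimizing for size (where the limit drops to 1).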
12690
12691//===----------------------------------------------------------------------===//
12692// AArch64 Optimization Hooks
12693//===----------------------------------------------------------------------===//
12694
12695static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
12696 SDValue Operand, SelectionDAG &DAG,
12697 int &ExtraSteps) {
12698 EVT VT = Operand.getValueType();
12699 if ((ST->hasNEON() &&
12700 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
12701 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
12702 VT == MVT::v4f32)) ||
12703 (ST->hasSVE() &&
12704 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
12705 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
12706 // For the reciprocal estimates, convergence is quadratic, so the number
12707 // of digits is doubled after each iteration. In ARMv8, the accuracy of
12708 // the initial estimate is 2^-8. Thus the number of extra steps to refine
12709 // the result for float (23 mantissa bits) is 2 and for double (52
12710 // mantissa bits) is 3.
12711 constexpr unsigned AccurateBits = 8;
12712 unsigned DesiredBits = APFloat::semanticsPrecision(VT.getFltSemantics());
12713 ExtraSteps = DesiredBits <= AccurateBits
12714 ? 0
12715 : Log2_64_Ceil(DesiredBits) - Log2_64_Ceil(AccurateBits);
12716 }
12717
12718 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
12719 }
12720
12721 return SDValue();
12722}
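// For example, with the 8-bit accurate hardware estimate this yields
// ExtraSteps = ceil(log2(24)) - ceil(log2(8)) = 5 - 3 = 2 for f32 (24-bit
// significand) and ceil(log2(53)) - ceil(log2(8)) = 6 - 3 = 3 for f64.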
12723
12724SDValue
12725AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
12726 const DenormalMode &Mode) const {
12727 SDLoc DL(Op);
12728 EVT VT = Op.getValueType();
12729 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
12730 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
12731 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
12732}
12733
12734SDValue
12735AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
12736 SelectionDAG &DAG) const {
12737 return Op;
12738}
12739
12740SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
12741 SelectionDAG &DAG, int Enabled,
12742 int &ExtraSteps,
12743 bool &UseOneConst,
12744 bool Reciprocal) const {
12745 if (Enabled == ReciprocalEstimate::Enabled ||
12746 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
12747 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
12748 DAG, ExtraSteps)) {
12749 SDLoc DL(Operand);
12750 EVT VT = Operand.getValueType();
12751
12752 // Ensure nodes can be recognized by isAssociativeAndCommutative.
12753 SDNodeFlags Flags =
12754 SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
12755
12756 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
12757 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
12758 for (int i = ExtraSteps; i > 0; --i) {
12759 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
12760 Flags);
12761 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
12762 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12763 }
12764 if (!Reciprocal)
12765 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
12766
12767 ExtraSteps = 0;
12768 return Estimate;
12769 }
12770
12771 return SDValue();
12772}
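// For an f32 square root with the default 2 refinement steps this emits:
//   e = FRSQRTE(x); e = e * FRSQRTS(x, e*e); e = e * FRSQRTS(x, e*e);
//   result = x * e   (the final multiply is skipped when Reciprocal is set)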
12773
12774SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
12775 SelectionDAG &DAG, int Enabled,
12776 int &ExtraSteps) const {
12777 if (Enabled == ReciprocalEstimate::Enabled)
12778 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
12779 DAG, ExtraSteps)) {
12780 SDLoc DL(Operand);
12781 EVT VT = Operand.getValueType();
12782
12783 SDNodeFlags Flags = SDNodeFlags::AllowReassociation | SDNodeFlags::NoSignedZeros;
12784
12785 // Newton reciprocal iteration: E * (2 - X * E)
12786 // AArch64 reciprocal iteration instruction: (2 - M * N)
12787 for (int i = ExtraSteps; i > 0; --i) {
12788 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
12789 Estimate, Flags);
12790 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
12791 }
12792
12793 ExtraSteps = 0;
12794 return Estimate;
12795 }
12796
12797 return SDValue();
12798}
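// Likewise, an f32 reciprocal with the default 2 refinement steps becomes:
//   e = FRECPE(x); e = e * FRECPS(x, e); e = e * FRECPS(x, e)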
12799
12800//===----------------------------------------------------------------------===//
12801// AArch64 Inline Assembly Support
12802//===----------------------------------------------------------------------===//
12803
12804// Table of Constraints
12805 // TODO: This is the current set of constraints supported by ARM for the
12806 // compiler; not all of them may make sense.
12807//
12808// r - A general register
12809// w - An FP/SIMD register of some size in the range v0-v31
12810// x - An FP/SIMD register of some size in the range v0-v15
12811// I - Constant that can be used with an ADD instruction
12812// J - Constant that can be used with a SUB instruction
12813// K - Constant that can be used with a 32-bit logical instruction
12814// L - Constant that can be used with a 64-bit logical instruction
12815// M - Constant that can be used as a 32-bit MOV immediate
12816// N - Constant that can be used as a 64-bit MOV immediate
12817// Q - A memory reference with base register and no offset
12818// S - A symbolic address
12819// Y - Floating point constant zero
12820// Z - Integer constant zero
12821//
12822// Note that general register operands will be output using their 64-bit x
12823// register name, whatever the size of the variable, unless the asm operand
12824// is prefixed by the %w modifier. Floating-point and SIMD register operands
12825// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
12826// %q modifier.
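//
// A typical use from C inline assembly (illustrative example, not taken from
// the sources): asm("add %w0, %w1, %2" : "=r"(res) : "r"(a), "I"(4095));
// Here "I" accepts 0..4095 (optionally shifted left by 12) and the %w modifier
// selects the 32-bit register name for the output.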
12827const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
12828 // At this point, we have to lower this constraint to something else, so we
12829 // lower it to an "r" or "w". However, by doing this we will force the result
12830 // to be in register, while the X constraint is much more permissive.
12831 //
12832 // Although we are correct (we are free to emit anything, without
12833 // constraints), we might break use cases that would expect us to be more
12834 // efficient and emit something else.
12835 if (!Subtarget->hasFPARMv8())
12836 return "r";
12837
12838 if (ConstraintVT.isFloatingPoint())
12839 return "w";
12840
12841 if (ConstraintVT.isVector() &&
12842 (ConstraintVT.getSizeInBits() == 64 ||
12843 ConstraintVT.getSizeInBits() == 128))
12844 return "w";
12845
12846 return "r";
12847}
12848
12849 enum class PredicateConstraint { Uph, Upl, Upa };
12850
12851// Returns a {Reg, RegisterClass} tuple if the constraint is
12852// a specific predicate register.
12853//
12854// For some constraint like "{pn3}" the default path in
12855// TargetLowering::getRegForInlineAsmConstraint() leads it to determine that a
12856// suitable register class for this register is "PPRorPNR", after which it
12857// determines that nxv16i1 is an appropriate type for the constraint, which is
12858// not what we want. The code here pre-empts this by matching the register
12859// explicitly.
12860static std::optional<std::pair<unsigned, const TargetRegisterClass *>>
12861 parseSVERegAsConstraint(StringRef Constraint) {
12862 if (!Constraint.starts_with('{') || !Constraint.ends_with('}') ||
12863 (Constraint[1] != 'p' && Constraint[1] != 'z'))
12864 return std::nullopt;
12865
12866 bool IsPredicate = Constraint[1] == 'p';
12867 Constraint = Constraint.substr(2, Constraint.size() - 3);
12868 bool IsPredicateAsCount = IsPredicate && Constraint.starts_with("n");
12869 if (IsPredicateAsCount)
12870 Constraint = Constraint.drop_front(1);
12871
12872 unsigned V;
12873 if (Constraint.getAsInteger(10, V) || V > 31)
12874 return std::nullopt;
12875
12876 if (IsPredicateAsCount)
12877 return std::make_pair(AArch64::PN0 + V, &AArch64::PNRRegClass);
12878 if (IsPredicate)
12879 return std::make_pair(AArch64::P0 + V, &AArch64::PPRRegClass);
12880 return std::make_pair(AArch64::Z0 + V, &AArch64::ZPRRegClass);
12881}
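// For example, "{pn8}" maps to {AArch64::PN8, &AArch64::PNRRegClass}, "{p0}"
// maps to {AArch64::P0, &AArch64::PPRRegClass} and "{z31}" maps to
// {AArch64::Z31, &AArch64::ZPRRegClass}; anything else returns std::nullopt
// and is left to the generic constraint handling.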
12882
12883 static std::optional<PredicateConstraint>
12884 parsePredicateConstraint(StringRef Constraint) {
12885 return StringSwitch<std::optional<PredicateConstraint>>(Constraint)
12886 .Case("Uph", PredicateConstraint::Uph)
12887 .Case("Upl", PredicateConstraint::Upl)
12888 .Case("Upa", PredicateConstraint::Upa)
12889 .Default(std::nullopt);
12890}
12891
12892 static const TargetRegisterClass *
12893 getPredicateRegisterClass(PredicateConstraint Constraint, EVT VT) {
12894 if (VT != MVT::aarch64svcount &&
12895 (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1))
12896 return nullptr;
12897
12898 switch (Constraint) {
12899 case PredicateConstraint::Uph:
12900 return VT == MVT::aarch64svcount ? &AArch64::PNR_p8to15RegClass
12901 : &AArch64::PPR_p8to15RegClass;
12902 case PredicateConstraint::Upl:
12903 return VT == MVT::aarch64svcount ? &AArch64::PNR_3bRegClass
12904 : &AArch64::PPR_3bRegClass;
12905 case PredicateConstraint::Upa:
12906 return VT == MVT::aarch64svcount ? &AArch64::PNRRegClass
12907 : &AArch64::PPRRegClass;
12908 }
12909
12910 llvm_unreachable("Missing PredicateConstraint!");
12911}
12912
12913 enum class ReducedGprConstraint { Uci, Ucj };
12914
12915 static std::optional<ReducedGprConstraint>
12916 parseReducedGprConstraint(StringRef Constraint) {
12917 return StringSwitch<std::optional<ReducedGprConstraint>>(Constraint)
12918 .Case("Uci", ReducedGprConstraint::Uci)
12919 .Case("Ucj", ReducedGprConstraint::Ucj)
12920 .Default(std::nullopt);
12921}
12922
12923 static const TargetRegisterClass *
12924 getReducedGprRegisterClass(ReducedGprConstraint Constraint, EVT VT) {
12925 if (!VT.isScalarInteger() || VT.getFixedSizeInBits() > 64)
12926 return nullptr;
12927
12928 switch (Constraint) {
12929 case ReducedGprConstraint::Uci:
12930 return &AArch64::MatrixIndexGPR32_8_11RegClass;
12931 case ReducedGprConstraint::Ucj:
12932 return &AArch64::MatrixIndexGPR32_12_15RegClass;
12933 }
12934
12935 llvm_unreachable("Missing ReducedGprConstraint!");
12936}
12937
12938// The set of cc code supported is from
12939// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Flag-Output-Operands
12940 static AArch64CC::CondCode parseConstraintCode(llvm::StringRef Constraint) {
12941 AArch64CC::CondCode Cond = StringSwitch<AArch64CC::CondCode>(Constraint)
12942 .Case("{@cchi}", AArch64CC::HI)
12943 .Case("{@cccs}", AArch64CC::HS)
12944 .Case("{@cclo}", AArch64CC::LO)
12945 .Case("{@ccls}", AArch64CC::LS)
12946 .Case("{@cccc}", AArch64CC::LO)
12947 .Case("{@cceq}", AArch64CC::EQ)
12948 .Case("{@ccgt}", AArch64CC::GT)
12949 .Case("{@ccge}", AArch64CC::GE)
12950 .Case("{@cclt}", AArch64CC::LT)
12951 .Case("{@ccle}", AArch64CC::LE)
12952 .Case("{@cchs}", AArch64CC::HS)
12953 .Case("{@ccne}", AArch64CC::NE)
12954 .Case("{@ccvc}", AArch64CC::VC)
12955 .Case("{@ccpl}", AArch64CC::PL)
12956 .Case("{@ccvs}", AArch64CC::VS)
12957 .Case("{@ccmi}", AArch64CC::MI)
12958 .Default(AArch64CC::Invalid);
12959 return Cond;
12960}
12961
12962/// Helper function to create 'CSET', which is equivalent to 'CSINC <Wd>, WZR,
12963/// WZR, invert(<cond>)'.
12964 static SDValue getSETCC(AArch64CC::CondCode CC, SDValue NZCV, const SDLoc &DL,
12965 SelectionDAG &DAG) {
12966 return DAG.getNode(AArch64ISD::CSINC, DL, MVT::i32,
12967 DAG.getConstant(0, DL, MVT::i32),
12968 DAG.getConstant(0, DL, MVT::i32),
12969 getCondCode(DAG, getInvertedCondCode(CC)), NZCV);
12970}
12971
12972// Lower @cc flag output via getSETCC.
12973SDValue AArch64TargetLowering::LowerAsmOutputForConstraint(
12974 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
12975 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
12976 AArch64CC::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
12977 if (Cond == AArch64CC::Invalid)
12978 return SDValue();
12979 // The output variable should be a scalar integer.
12980 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
12981 OpInfo.ConstraintVT.getSizeInBits() < 8)
12982 report_fatal_error("Flag output operand is of invalid type");
12983
12984 // Get NZCV register. Only update chain when copyfrom is glued.
12985 if (Glue.getNode()) {
12986 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT, Glue);
12987 Chain = Glue.getValue(1);
12988 } else
12989 Glue = DAG.getCopyFromReg(Chain, DL, AArch64::NZCV, FlagsVT);
12990 // Extract CC code.
12991 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
12992
12993 SDValue Result;
12994
12995 // Truncate or ZERO_EXTEND based on value types.
12996 if (OpInfo.ConstraintVT.getSizeInBits() <= 32)
12997 Result = DAG.getNode(ISD::TRUNCATE, DL, OpInfo.ConstraintVT, CC);
12998 else
12999 Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
13000
13001 return Result;
13002}
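// This implements GCC-style flag output operands, e.g. (illustrative only):
//   asm("cmp %w1, %w2" : "=@cceq"(is_eq) : "r"(a), "r"(b));
// The "=@cceq" output is materialized as a CSINC on NZCV (see getSETCC above)
// and then truncated or zero-extended to the type of the C variable.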
13003
13004/// getConstraintType - Given a constraint letter, return the type of
13005/// constraint it is for this target.
13006 AArch64TargetLowering::ConstraintType
13007 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
13008 if (Constraint.size() == 1) {
13009 switch (Constraint[0]) {
13010 default:
13011 break;
13012 case 'x':
13013 case 'w':
13014 case 'y':
13015 return C_RegisterClass;
13016 // An address with a single base register. Due to the way we
13017 // currently handle addresses it is the same as 'r'.
13018 case 'Q':
13019 return C_Memory;
13020 case 'I':
13021 case 'J':
13022 case 'K':
13023 case 'L':
13024 case 'M':
13025 case 'N':
13026 case 'Y':
13027 case 'Z':
13028 return C_Immediate;
13029 case 'z':
13030 case 'S': // A symbol or label reference with a constant offset
13031 return C_Other;
13032 }
13033 } else if (parsePredicateConstraint(Constraint))
13034 return C_RegisterClass;
13035 else if (parseReducedGprConstraint(Constraint))
13036 return C_RegisterClass;
13037 else if (parseConstraintCode(Constraint) != AArch64CC::Invalid)
13038 return C_Other;
13039 return TargetLowering::getConstraintType(Constraint);
13040}
13041
13042/// Examine constraint type and operand type and determine a weight value.
13043/// This object must already have been set up with the operand type
13044/// and the current alternative constraint selected.
13045 TargetLowering::ConstraintWeight
13046 AArch64TargetLowering::getSingleConstraintMatchWeight(
13047 AsmOperandInfo &info, const char *constraint) const {
13048 ConstraintWeight weight = CW_Invalid;
13049 Value *CallOperandVal = info.CallOperandVal;
13050 // If we don't have a value, we can't do a match,
13051 // but allow it at the lowest weight.
13052 if (!CallOperandVal)
13053 return CW_Default;
13054 Type *type = CallOperandVal->getType();
13055 // Look at the constraint type.
13056 switch (*constraint) {
13057 default:
13058 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
13059 break;
13060 case 'x':
13061 case 'w':
13062 case 'y':
13063 if (type->isFloatingPointTy() || type->isVectorTy())
13064 weight = CW_Register;
13065 break;
13066 case 'z':
13067 weight = CW_Constant;
13068 break;
13069 case 'U':
13070 if (parsePredicateConstraint(constraint) ||
13071 parseReducedGprConstraint(constraint))
13072 weight = CW_Register;
13073 break;
13074 }
13075 return weight;
13076}
13077
13078std::pair<unsigned, const TargetRegisterClass *>
13079AArch64TargetLowering::getRegForInlineAsmConstraint(
13080 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
13081 if (Constraint.size() == 1) {
13082 switch (Constraint[0]) {
13083 case 'r':
13084 if (VT.isScalableVector())
13085 return std::make_pair(0U, nullptr);
13086 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
13087 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
13088 if (VT.getFixedSizeInBits() == 64)
13089 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
13090 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
13091 case 'w': {
13092 if (!Subtarget->hasFPARMv8())
13093 break;
13094 if (VT.isScalableVector()) {
13095 if (VT.getVectorElementType() != MVT::i1)
13096 return std::make_pair(0U, &AArch64::ZPRRegClass);
13097 return std::make_pair(0U, nullptr);
13098 }
13099 if (VT == MVT::Other)
13100 break;
13101 uint64_t VTSize = VT.getFixedSizeInBits();
13102 if (VTSize == 16)
13103 return std::make_pair(0U, &AArch64::FPR16RegClass);
13104 if (VTSize == 32)
13105 return std::make_pair(0U, &AArch64::FPR32RegClass);
13106 if (VTSize == 64)
13107 return std::make_pair(0U, &AArch64::FPR64RegClass);
13108 if (VTSize == 128)
13109 return std::make_pair(0U, &AArch64::FPR128RegClass);
13110 break;
13111 }
13112 // The instructions that this constraint is designed for can
13113 // only take 128-bit registers so just use that regclass.
13114 case 'x':
13115 if (!Subtarget->hasFPARMv8())
13116 break;
13117 if (VT.isScalableVector())
13118 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
13119 if (VT.getSizeInBits() == 128)
13120 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
13121 break;
13122 case 'y':
13123 if (!Subtarget->hasFPARMv8())
13124 break;
13125 if (VT.isScalableVector())
13126 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
13127 break;
13128 }
13129 } else {
13130 if (const auto P = parseSVERegAsConstraint(Constraint)) {
13131 // SME functions that are not in streaming mode, should
13132 // still observe clobbers of Z-registers by clobbering
13133 // the lower 128bits of those registers.
13134 if (AArch64::ZPRRegClass.hasSubClassEq(P->second) &&
13135 !Subtarget->isSVEorStreamingSVEAvailable())
13136 return std::make_pair(TRI->getSubReg(P->first, AArch64::zsub),
13137 &AArch64::FPR128RegClass);
13138 return *P;
13139 }
13140 if (const auto PC = parsePredicateConstraint(Constraint))
13141 if (const auto *RegClass = getPredicateRegisterClass(*PC, VT))
13142 return std::make_pair(0U, RegClass);
13143
13144 if (const auto RGC = parseReducedGprConstraint(Constraint))
13145 if (const auto *RegClass = getReducedGprRegisterClass(*RGC, VT))
13146 return std::make_pair(0U, RegClass);
13147 }
13148 if (StringRef("{cc}").equals_insensitive(Constraint) ||
13149 parseConstraintCode(Constraint) != AArch64CC::Invalid)
13150 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
13151
13152 if (Constraint == "{za}") {
13153 return std::make_pair(unsigned(AArch64::ZA), &AArch64::MPRRegClass);
13154 }
13155
13156 if (Constraint == "{zt0}") {
13157 return std::make_pair(unsigned(AArch64::ZT0), &AArch64::ZTRRegClass);
13158 }
13159
13160 // Use the default implementation in TargetLowering to convert the register
13161 // constraint into a member of a register class.
13162 std::pair<unsigned, const TargetRegisterClass *> Res;
13163 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
13164
13165 // Not found as a standard register?
13166 if (!Res.second) {
13167 unsigned Size = Constraint.size();
13168 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
13169 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
13170 int RegNo;
13171 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
13172 if (!Failed && RegNo >= 0 && RegNo <= 31) {
13173 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
13174 // By default we'll emit v0-v31 for this unless there's a modifier where
13175 // we'll emit the correct register as well.
13176 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
13177 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
13178 Res.second = &AArch64::FPR64RegClass;
13179 } else {
13180 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
13181 Res.second = &AArch64::FPR128RegClass;
13182 }
13183 }
13184 }
13185 }
13186
13187 if (Res.second && !Subtarget->hasFPARMv8() &&
13188 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
13189 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
13190 return std::make_pair(0U, nullptr);
13191
13192 return Res;
13193}
13194
13195 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
13196 llvm::Type *Ty,
13197 bool AllowUnknown) const {
13198 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
13199 return EVT(MVT::i64x8);
13200
13201 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
13202}
13203
13204/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
13205/// vector. If it is invalid, don't add anything to Ops.
13206void AArch64TargetLowering::LowerAsmOperandForConstraint(
13207 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
13208 SelectionDAG &DAG) const {
13209 SDValue Result;
13210
13211 // Currently only support length 1 constraints.
13212 if (Constraint.size() != 1)
13213 return;
13214
13215 char ConstraintLetter = Constraint[0];
13216 switch (ConstraintLetter) {
13217 default:
13218 break;
13219
13220 // This set of constraints deal with valid constants for various instructions.
13221 // Validate and return a target constant for them if we can.
13222 case 'z': {
13223 // 'z' maps to xzr or wzr so it needs an input of 0.
13224 if (!isNullConstant(Op))
13225 return;
13226
13227 if (Op.getValueType() == MVT::i64)
13228 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
13229 else
13230 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
13231 break;
13232 }
13233 case 'S':
13234 // Use the generic code path for "s". In GCC's aarch64 port, "S" is
13235 // supported for PIC while "s" isn't, making "s" less useful. We implement
13236 // "S" but not "s".
13237 TargetLowering::LowerAsmOperandForConstraint(Op, "s", Ops, DAG);
13238 break;
13239
13240 case 'I':
13241 case 'J':
13242 case 'K':
13243 case 'L':
13244 case 'M':
13245 case 'N':
13246 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
13247 if (!C)
13248 return;
13249
13250 // Grab the value and do some validation.
13251 uint64_t CVal = C->getZExtValue();
13252 switch (ConstraintLetter) {
13253 // The I constraint applies only to simple ADD or SUB immediate operands:
13254 // i.e. 0 to 4095 with optional shift by 12
13255 // The J constraint applies only to ADD or SUB immediates that would be
13256 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
13257 // instruction [or vice versa], in other words -1 to -4095 with optional
13258 // left shift by 12.
13259 case 'I':
13260 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
13261 break;
13262 return;
13263 case 'J': {
13264 uint64_t NVal = -C->getSExtValue();
13265 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
13266 CVal = C->getSExtValue();
13267 break;
13268 }
13269 return;
13270 }
13271 // The K and L constraints apply *only* to logical immediates, including
13272 // what used to be the MOVI alias for ORR (though the MOVI alias has now
13273 // been removed and MOV should be used). So these constraints have to
13274 // distinguish between bit patterns that are valid 32-bit or 64-bit
13275 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
13276 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
13277 // versa.
13278 case 'K':
13279 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13280 break;
13281 return;
13282 case 'L':
13283 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13284 break;
13285 return;
13286 // The M and N constraints are a superset of K and L respectively, for use
13287 // with the MOV (immediate) alias. As well as the logical immediates they
13288 // also match 32 or 64-bit immediates that can be loaded either using a
13289 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
13290 // (M) or 64-bit 0x1234000000000000 (N) etc.
13291 // As a note some of this code is liberally stolen from the asm parser.
13292 case 'M': {
13293 if (!isUInt<32>(CVal))
13294 return;
13295 if (AArch64_AM::isLogicalImmediate(CVal, 32))
13296 break;
13297 if ((CVal & 0xFFFF) == CVal)
13298 break;
13299 if ((CVal & 0xFFFF0000ULL) == CVal)
13300 break;
13301 uint64_t NCVal = ~(uint32_t)CVal;
13302 if ((NCVal & 0xFFFFULL) == NCVal)
13303 break;
13304 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13305 break;
13306 return;
13307 }
13308 case 'N': {
13309 if (AArch64_AM::isLogicalImmediate(CVal, 64))
13310 break;
13311 if ((CVal & 0xFFFFULL) == CVal)
13312 break;
13313 if ((CVal & 0xFFFF0000ULL) == CVal)
13314 break;
13315 if ((CVal & 0xFFFF00000000ULL) == CVal)
13316 break;
13317 if ((CVal & 0xFFFF000000000000ULL) == CVal)
13318 break;
13319 uint64_t NCVal = ~CVal;
13320 if ((NCVal & 0xFFFFULL) == NCVal)
13321 break;
13322 if ((NCVal & 0xFFFF0000ULL) == NCVal)
13323 break;
13324 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
13325 break;
13326 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
13327 break;
13328 return;
13329 }
13330 default:
13331 return;
13332 }
13333
13334 // All assembler immediates are 64-bit integers.
13335 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
13336 break;
13337 }
13338
13339 if (Result.getNode()) {
13340 Ops.push_back(Result);
13341 return;
13342 }
13343
13344 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
13345}
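// For instance (illustrative values), "K"(0xff00ff00) is accepted because
// 0xff00ff00 is a valid 32-bit bitmask immediate, and "N"(0x1234000000000000)
// is accepted because it can be materialized with a single MOVZ; constants
// matching neither rule are not added to Ops, so the constraint is rejected.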
13346
13347//===----------------------------------------------------------------------===//
13348// AArch64 Advanced SIMD Support
13349//===----------------------------------------------------------------------===//
13350
13351/// WidenVector - Given a value in the V64 register class, produce the
13352/// equivalent value in the V128 register class.
13353 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
13354 EVT VT = V64Reg.getValueType();
13355 unsigned NarrowSize = VT.getVectorNumElements();
13356 MVT EltTy = VT.getVectorElementType().getSimpleVT();
13357 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
13358 SDLoc DL(V64Reg);
13359
13360 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
13361 V64Reg, DAG.getConstant(0, DL, MVT::i64));
13362}
13363
13364/// getExtFactor - Determine the adjustment factor for the position when
13365/// generating an "extract from vector registers" instruction.
13366static unsigned getExtFactor(SDValue &V) {
13367 EVT EltType = V.getValueType().getVectorElementType();
13368 return EltType.getSizeInBits() / 8;
13369}
13370
13371// Check if a vector is built from one vector via extracted elements of
13372// another together with an AND mask, ensuring that all elements fit
13373// within range. This can be reconstructed using AND and NEON's TBL1.
13375 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13376 SDLoc DL(Op);
13377 EVT VT = Op.getValueType();
13378 assert(!VT.isScalableVector() &&
13379 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13380
13381 // Can only recreate a shuffle with 16xi8 or 8xi8 elements, as they map
13382 // directly to TBL1.
13383 if (VT != MVT::v16i8 && VT != MVT::v8i8)
13384 return SDValue();
13385
13386 unsigned NumElts = VT.getVectorNumElements();
13387 assert((NumElts == 8 || NumElts == 16) &&
13388 "Need to have exactly 8 or 16 elements in vector.");
13389
13390 SDValue SourceVec;
13391 SDValue MaskSourceVec;
13392 SmallVector<SDValue, 16> AndMaskConstants;
13393
13394 for (unsigned i = 0; i < NumElts; ++i) {
13395 SDValue V = Op.getOperand(i);
13396 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13397 return SDValue();
13398
13399 SDValue OperandSourceVec = V.getOperand(0);
13400 if (!SourceVec)
13401 SourceVec = OperandSourceVec;
13402 else if (SourceVec != OperandSourceVec)
13403 return SDValue();
13404
13405 // This only looks at shuffles with elements that are
13406 // a) truncated by a constant AND mask extracted from a mask vector, or
13407 // b) extracted directly from a mask vector.
13408 SDValue MaskSource = V.getOperand(1);
13409 if (MaskSource.getOpcode() == ISD::AND) {
13410 if (!isa<ConstantSDNode>(MaskSource.getOperand(1)))
13411 return SDValue();
13412
13413 AndMaskConstants.push_back(MaskSource.getOperand(1));
13414 MaskSource = MaskSource->getOperand(0);
13415 } else if (!AndMaskConstants.empty()) {
13416 // Either all or no operands should have an AND mask.
13417 return SDValue();
13418 }
13419
13420 // An ANY_EXTEND may be inserted between the AND and the source vector
13421 // extraction. We don't care about that, so we can just skip it.
13422 if (MaskSource.getOpcode() == ISD::ANY_EXTEND)
13423 MaskSource = MaskSource.getOperand(0);
13424
13425 if (MaskSource.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
13426 return SDValue();
13427
13428 SDValue MaskIdx = MaskSource.getOperand(1);
13429 if (!isa<ConstantSDNode>(MaskIdx) ||
13430 !cast<ConstantSDNode>(MaskIdx)->getConstantIntValue()->equalsInt(i))
13431 return SDValue();
13432
13433 // We only apply this if all elements come from the same vector with the
13434 // same vector type.
13435 if (!MaskSourceVec) {
13436 MaskSourceVec = MaskSource->getOperand(0);
13437 if (MaskSourceVec.getValueType() != VT)
13438 return SDValue();
13439 } else if (MaskSourceVec != MaskSource->getOperand(0)) {
13440 return SDValue();
13441 }
13442 }
13443
13444 // We need a v16i8 for TBL, so we extend the source with a placeholder vector
13445 // for v8i8 to get a v16i8. As the pattern we are replacing is extract +
13446 // insert, we know that the index in the mask must be smaller than the number
13447 // of elements in the source, or we would have an out-of-bounds access.
13448 if (NumElts == 8)
13449 SourceVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, SourceVec,
13450 DAG.getUNDEF(VT));
13451
13452 // Preconditions met, so we can use a vector (AND +) TBL to build this vector.
13453 if (!AndMaskConstants.empty())
13454 MaskSourceVec = DAG.getNode(ISD::AND, DL, VT, MaskSourceVec,
13455 DAG.getBuildVector(VT, DL, AndMaskConstants));
13456
13457 return DAG.getNode(
13458 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13459 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
13460 SourceVec, MaskSourceVec);
13461}
13462
13463// Gather data to see if the operation can be modelled as a
13464// shuffle in combination with VEXTs.
13465 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
13466 SelectionDAG &DAG) const {
13467 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
13468 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
13469 SDLoc DL(Op);
13470 EVT VT = Op.getValueType();
13471 assert(!VT.isScalableVector() &&
13472 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
13473 unsigned NumElts = VT.getVectorNumElements();
13474
13475 struct ShuffleSourceInfo {
13476 SDValue Vec;
13477 unsigned MinElt;
13478 unsigned MaxElt;
13479
13480 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
13481 // be compatible with the shuffle we intend to construct. As a result
13482 // ShuffleVec will be some sliding window into the original Vec.
13483 SDValue ShuffleVec;
13484
13485 // Code should guarantee that element i in Vec starts at element "WindowBase
13486 // + i * WindowScale in ShuffleVec".
13487 int WindowBase;
13488 int WindowScale;
13489
13490 ShuffleSourceInfo(SDValue Vec)
13491 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
13492 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
13493
13494 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
13495 };
13496
13497 // First gather all vectors used as an immediate source for this BUILD_VECTOR
13498 // node.
13499 SmallVector<ShuffleSourceInfo, 2> Sources;
13500 for (unsigned i = 0; i < NumElts; ++i) {
13501 SDValue V = Op.getOperand(i);
13502 if (V.isUndef())
13503 continue;
13504 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13505 !isa<ConstantSDNode>(V.getOperand(1)) ||
13506 V.getOperand(0).getValueType().isScalableVector()) {
13507 LLVM_DEBUG(
13508 dbgs() << "Reshuffle failed: "
13509 "a shuffle can only come from building a vector from "
13510 "various elements of other fixed-width vectors, provided "
13511 "their indices are constant\n");
13512 return SDValue();
13513 }
13514
13515 // Add this element source to the list if it's not already there.
13516 SDValue SourceVec = V.getOperand(0);
13517 auto Source = find(Sources, SourceVec);
13518 if (Source == Sources.end())
13519 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
13520
13521 // Update the minimum and maximum lane number seen.
13522 unsigned EltNo = V.getConstantOperandVal(1);
13523 Source->MinElt = std::min(Source->MinElt, EltNo);
13524 Source->MaxElt = std::max(Source->MaxElt, EltNo);
13525 }
13526
13527 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
13528 // better than moving to/from gpr registers for larger vectors.
13529 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
13530 // Construct a mask for the tbl. We may need to adjust the index for types
13531 // larger than i8.
13532 SmallVector<unsigned, 16> Mask;
13533 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
13534 for (unsigned I = 0; I < NumElts; ++I) {
13535 SDValue V = Op.getOperand(I);
13536 if (V.isUndef()) {
13537 for (unsigned OF = 0; OF < OutputFactor; OF++)
13538 Mask.push_back(-1);
13539 continue;
13540 }
13541 // Set the Mask lanes adjusted for the size of the input and output
13542 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
13543 // output element, adjusted in their positions per input and output types.
13544 unsigned Lane = V.getConstantOperandVal(1);
13545 for (unsigned S = 0; S < Sources.size(); S++) {
13546 if (V.getOperand(0) == Sources[S].Vec) {
13547 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
13548 unsigned InputBase = 16 * S + Lane * InputSize / 8;
13549 for (unsigned OF = 0; OF < OutputFactor; OF++)
13550 Mask.push_back(InputBase + OF);
13551 break;
13552 }
13553 }
13554 }
13555
13556 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
13557 // v16i8, and the TBLMask
13558 SmallVector<SDValue, 16> TBLOperands;
13559 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
13560 ? Intrinsic::aarch64_neon_tbl3
13561 : Intrinsic::aarch64_neon_tbl4,
13562 DL, MVT::i32));
13563 for (unsigned i = 0; i < Sources.size(); i++) {
13564 SDValue Src = Sources[i].Vec;
13565 EVT SrcVT = Src.getValueType();
13566 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
13567 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
13568 "Expected a legally typed vector");
13569 if (SrcVT.is64BitVector())
13570 Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Src,
13571 DAG.getUNDEF(MVT::v8i8));
13572 TBLOperands.push_back(Src);
13573 }
13574
13575 SmallVector<SDValue, 16> TBLMask;
13576 for (unsigned i = 0; i < Mask.size(); i++)
13577 TBLMask.push_back(DAG.getConstant(Mask[i], DL, MVT::i32));
13578 assert((Mask.size() == 8 || Mask.size() == 16) &&
13579 "Expected a v8i8 or v16i8 Mask");
13580 TBLOperands.push_back(DAG.getBuildVector(
13581 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, DL, TBLMask));
13582
13583 SDValue Shuffle =
13584 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL,
13585 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
13586 return DAG.getBitcast(VT, Shuffle);
13587 }
13588
13589 if (Sources.size() > 2) {
13590 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
13591 << "sensible when at most two source vectors are "
13592 << "involved\n");
13593 return SDValue();
13594 }
13595
13596 // Find out the smallest element size among result and two sources, and use
13597 // it as element size to build the shuffle_vector.
13598 EVT SmallestEltTy = VT.getVectorElementType();
13599 for (auto &Source : Sources) {
13600 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
13601 if (SrcEltTy.bitsLT(SmallestEltTy)) {
13602 SmallestEltTy = SrcEltTy;
13603 }
13604 }
13605 unsigned ResMultiplier =
13606 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13607 uint64_t VTSize = VT.getFixedSizeInBits();
13608 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
13609 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
13610
13611 // If the source vector is too wide or too narrow, we may nevertheless be able
13612 // to construct a compatible shuffle either by concatenating it with UNDEF or
13613 // extracting a suitable range of elements.
13614 for (auto &Src : Sources) {
13615 EVT SrcVT = Src.ShuffleVec.getValueType();
13616
13617 TypeSize SrcVTSize = SrcVT.getSizeInBits();
13618 if (SrcVTSize == TypeSize::getFixed(VTSize))
13619 continue;
13620
13621 // This stage of the search produces a source with the same element type as
13622 // the original, but with a total width matching the BUILD_VECTOR output.
13623 EVT EltVT = SrcVT.getVectorElementType();
13624 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
13625 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
13626
13627 if (SrcVTSize.getFixedValue() < VTSize) {
13628 assert(2 * SrcVTSize == VTSize);
13629 // We can pad out the smaller vector for free, so if it's part of a
13630 // shuffle...
13631 Src.ShuffleVec =
13632 DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Src.ShuffleVec,
13633 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
13634 continue;
13635 }
13636
13637 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
13638 LLVM_DEBUG(
13639 dbgs() << "Reshuffle failed: result vector too small to extract\n");
13640 return SDValue();
13641 }
13642
13643 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
13644 LLVM_DEBUG(
13645 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
13646 return SDValue();
13647 }
13648
13649 if (Src.MinElt >= NumSrcElts) {
13650 // The extraction can just take the second half
13651 Src.ShuffleVec =
13652 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13653 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13654 Src.WindowBase = -NumSrcElts;
13655 } else if (Src.MaxElt < NumSrcElts) {
13656 // The extraction can just take the first half
13657 Src.ShuffleVec =
13658 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13659 DAG.getConstant(0, DL, MVT::i64));
13660 } else {
13661 // An actual VEXT is needed
13662 SDValue VEXTSrc1 =
13663 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13664 DAG.getConstant(0, DL, MVT::i64));
13665 SDValue VEXTSrc2 =
13666 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, DestVT, Src.ShuffleVec,
13667 DAG.getConstant(NumSrcElts, DL, MVT::i64));
13668 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
13669
13670 if (!SrcVT.is64BitVector()) {
13671 LLVM_DEBUG(
13672 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
13673 "for SVE vectors.");
13674 return SDValue();
13675 }
13676
13677 Src.ShuffleVec =
13678 DAG.getNode(AArch64ISD::EXT, DL, DestVT, VEXTSrc1, VEXTSrc2,
13679 DAG.getConstant(Imm, DL, MVT::i32));
13680 Src.WindowBase = -Src.MinElt;
13681 }
13682 }
13683
13684 // Another possible incompatibility occurs from the vector element types. We
13685 // can fix this by bitcasting the source vectors to the same type we intend
13686 // for the shuffle.
13687 for (auto &Src : Sources) {
13688 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
13689 if (SrcEltTy == SmallestEltTy)
13690 continue;
13691 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
13692 if (DAG.getDataLayout().isBigEndian()) {
13693 Src.ShuffleVec =
13694 DAG.getNode(AArch64ISD::NVCAST, DL, ShuffleVT, Src.ShuffleVec);
13695 } else {
13696 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Src.ShuffleVec);
13697 }
13698 Src.WindowScale =
13699 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
13700 Src.WindowBase *= Src.WindowScale;
13701 }
13702
13703 // Final check before we try to actually produce a shuffle.
13704 LLVM_DEBUG({
13705 for (auto Src : Sources)
13706 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
13707 });
13708
13709 // The stars all align, our next step is to produce the mask for the shuffle.
13710 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
13711 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
13712 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
13713 SDValue Entry = Op.getOperand(i);
13714 if (Entry.isUndef())
13715 continue;
13716
13717 auto Src = find(Sources, Entry.getOperand(0));
13718 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
13719
13720 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
13721 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
13722 // segment.
13723 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
13724 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
13725 VT.getScalarSizeInBits());
13726 int LanesDefined = BitsDefined / BitsPerShuffleLane;
13727
13728 // This source is expected to fill ResMultiplier lanes of the final shuffle,
13729 // starting at the appropriate offset.
13730 int *LaneMask = &Mask[i * ResMultiplier];
13731
13732 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
13733 ExtractBase += NumElts * (Src - Sources.begin());
13734 for (int j = 0; j < LanesDefined; ++j)
13735 LaneMask[j] = ExtractBase + j;
13736 }
13737
13738 // Final check before we try to produce nonsense...
13739 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
13740 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
13741 return SDValue();
13742 }
13743
13744 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
13745 for (unsigned i = 0; i < Sources.size(); ++i)
13746 ShuffleOps[i] = Sources[i].ShuffleVec;
13747
13748 SDValue Shuffle =
13749 DAG.getVectorShuffle(ShuffleVT, DL, ShuffleOps[0], ShuffleOps[1], Mask);
13750 SDValue V;
13751 if (DAG.getDataLayout().isBigEndian()) {
13752 V = DAG.getNode(AArch64ISD::NVCAST, DL, VT, Shuffle);
13753 } else {
13754 V = DAG.getNode(ISD::BITCAST, DL, VT, Shuffle);
13755 }
13756
13757 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
13758 dbgs() << "Reshuffle, creating node: "; V.dump(););
13759
13760 return V;
13761}
13762
13763// check if an EXT instruction can handle the shuffle mask when the
13764// vector sources of the shuffle are the same.
13765static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
13766 unsigned NumElts = VT.getVectorNumElements();
13767
13768 // Assume that the first shuffle index is not UNDEF. Fail if it is.
13769 if (M[0] < 0)
13770 return false;
13771
13772 Imm = M[0];
13773
13774 // If this is a VEXT shuffle, the immediate value is the index of the first
13775 // element. The other shuffle indices must be the successive elements after
13776 // the first one.
13777 unsigned ExpectedElt = Imm;
13778 for (unsigned i = 1; i < NumElts; ++i) {
13779 // Increment the expected index. If it wraps around, just follow it
13780 // back to index zero and keep going.
13781 ++ExpectedElt;
13782 if (ExpectedElt == NumElts)
13783 ExpectedElt = 0;
13784
13785 if (M[i] < 0)
13786 continue; // ignore UNDEF indices
13787 if (ExpectedElt != static_cast<unsigned>(M[i]))
13788 return false;
13789 }
13790
13791 return true;
13792}
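// For example, for a single-source v4i16 shuffle the mask <2, 3, 0, 1>
// (wrapping around the end of the vector) is accepted with Imm = 2; the
// caller scales this by the element size (getExtFactor) to form the byte
// immediate of the EXT instruction.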
13793
13794// Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
13795// v4i32s. This is really a truncate, which we can construct out of (legal)
13796// concats and truncate nodes.
13798 if (V.getValueType() != MVT::v16i8)
13799 return SDValue();
13800 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
13801
13802 for (unsigned X = 0; X < 4; X++) {
13803 // Check the first item in each group is an extract from lane 0 of a v4i32
13804 // or v4i16.
13805 SDValue BaseExt = V.getOperand(X * 4);
13806 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13807 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
13808 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
13809 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
13810 BaseExt.getConstantOperandVal(1) != 0)
13811 return SDValue();
13812 SDValue Base = BaseExt.getOperand(0);
13813 // And check the other items are extracts from the same vector.
13814 for (unsigned Y = 1; Y < 4; Y++) {
13815 SDValue Ext = V.getOperand(X * 4 + Y);
13816 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
13817 Ext.getOperand(0) != Base ||
13818 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
13819 Ext.getConstantOperandVal(1) != Y)
13820 return SDValue();
13821 }
13822 }
13823
13824 // Turn the buildvector into a series of truncates and concats, which will
13825 // become uzip1s. Any v4i32s we found get truncated to v4i16, which are
13826 // concatenated together to produce 2 v8i16s. These are both truncated and
13827 // concatenated together.
13828 SDLoc DL(V);
13829 SDValue Trunc[4] = {
13830 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
13831 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
13832 for (SDValue &V : Trunc)
13833 if (V.getValueType() == MVT::v4i32)
13834 V = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, V);
13835 SDValue Concat0 =
13836 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
13837 SDValue Concat1 =
13838 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
13839 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
13840 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
13841 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
13842}
13843
13844/// Check if a vector shuffle corresponds to a DUP instructions with a larger
13845/// element width than the vector lane type. If that is the case the function
13846/// returns true and writes the value of the DUP instruction lane operand into
13847/// DupLaneOp
13848static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
13849 unsigned &DupLaneOp) {
13850 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
13851 "Only possible block sizes for wide DUP are: 16, 32, 64");
13852
13853 if (BlockSize <= VT.getScalarSizeInBits())
13854 return false;
13855 if (BlockSize % VT.getScalarSizeInBits() != 0)
13856 return false;
13857 if (VT.getSizeInBits() % BlockSize != 0)
13858 return false;
13859
13860 size_t SingleVecNumElements = VT.getVectorNumElements();
13861 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
13862 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
13863
13864 // We are looking for masks like
13865 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
13866 // might be replaced by 'undefined'. BlockIndices will eventually contain
13867 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
13868 // for the above examples)
13869 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
13870 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
13871 for (size_t I = 0; I < NumEltsPerBlock; I++) {
13872 int Elt = M[BlockIndex * NumEltsPerBlock + I];
13873 if (Elt < 0)
13874 continue;
13875 // For now we don't support shuffles that use the second operand
13876 if ((unsigned)Elt >= SingleVecNumElements)
13877 return false;
13878 if (BlockElts[I] < 0)
13879 BlockElts[I] = Elt;
13880 else if (BlockElts[I] != Elt)
13881 return false;
13882 }
13883
13884 // We found a candidate block (possibly with some undefs). It must be a
13885 // sequence of consecutive integers starting with a value divisible by
13886 // NumEltsPerBlock with some values possibly replaced by undef-s.
13887
13888 // Find first non-undef element
13889 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
13890 assert(FirstRealEltIter != BlockElts.end() &&
13891 "Shuffle with all-undefs must have been caught by previous cases, "
13892 "e.g. isSplat()");
13893 if (FirstRealEltIter == BlockElts.end()) {
13894 DupLaneOp = 0;
13895 return true;
13896 }
13897
13898 // Index of FirstRealElt in BlockElts
13899 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
13900
13901 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
13902 return false;
13903 // BlockElts[0] must have the following value if it isn't undef:
13904 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
13905
13906 // Check the first element
13907 if (Elt0 % NumEltsPerBlock != 0)
13908 return false;
13909 // Check that the sequence indeed consists of consecutive integers (modulo
13910 // undefs)
13911 for (size_t I = 0; I < NumEltsPerBlock; I++)
13912 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
13913 return false;
13914
13915 DupLaneOp = Elt0 / NumEltsPerBlock;
13916 return true;
13917}
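// For example, the v4i16 mask <2, 3, 2, 3> with BlockSize == 32 is recognized
// with DupLaneOp = 1: the shuffle duplicates 32-bit block 1 of the source and
// can therefore be emitted as a single DUP of the wider element type.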
13918
13919// check if an EXT instruction can handle the shuffle mask when the
13920// vector sources of the shuffle are different.
13921static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
13922 unsigned &Imm) {
13923 // Look for the first non-undef element.
13924 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
13925
13926 // Benefit from APInt to handle overflow when calculating expected element.
13927 unsigned NumElts = VT.getVectorNumElements();
13928 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
13929 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false,
13930 /*implicitTrunc=*/true);
13931 // The following shuffle indices must be the successive elements after the
13932 // first real element.
13933 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
13934 return Elt != ExpectedElt++ && Elt >= 0;
13935 });
13936 if (FoundWrongElt)
13937 return false;
13938
13939 // The index of an EXT is the first element if it is not UNDEF.
13940 // Watch out for the beginning UNDEFs. The EXT index should be the expected
13941 // value of the first element. E.g.
13942 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
13943 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
13944 // ExpectedElt is the last mask index plus 1.
13945 Imm = ExpectedElt.getZExtValue();
13946
13947 // There are two different cases that require reversing the input vectors.
13948 // For example, for vector <4 x i32> we have the following cases:
13949 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
13950 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
13951 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
13952 // reversing the two input vectors.
13953 if (Imm < NumElts)
13954 ReverseEXT = true;
13955 else
13956 Imm -= NumElts;
13957
13958 return true;
13959}
13960
13961/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
13962/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13963/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
13964static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13965 unsigned NumElts = VT.getVectorNumElements();
13966 if (NumElts % 2 != 0)
13967 return false;
13968 WhichResult = (M[0] == 0 ? 0 : 1);
13969 unsigned Idx = WhichResult * NumElts / 2;
13970 for (unsigned i = 0; i != NumElts; i += 2) {
13971 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
13972 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
13973 return false;
13974 Idx += 1;
13975 }
13976
13977 return true;
13978}
13979
13980/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
13981/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
13982/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
13983static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
13984 unsigned Half = VT.getVectorNumElements() / 2;
13985 WhichResult = (M[0] == 0 ? 0 : 1);
13986 for (unsigned j = 0; j != 2; ++j) {
13987 unsigned Idx = WhichResult;
13988 for (unsigned i = 0; i != Half; ++i) {
13989 int MIdx = M[i + j * Half];
13990 if (MIdx >= 0 && (unsigned)MIdx != Idx)
13991 return false;
13992 Idx += 2;
13993 }
13994 }
13995
13996 return true;
13997}
13998
13999/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
14000/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
14001/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
14002static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
14003 unsigned NumElts = VT.getVectorNumElements();
14004 if (NumElts % 2 != 0)
14005 return false;
14006 WhichResult = (M[0] == 0 ? 0 : 1);
14007 for (unsigned i = 0; i < NumElts; i += 2) {
14008 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
14009 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
14010 return false;
14011 }
14012 return true;
14013}
14014
14015static bool isINSMask(ArrayRef<int> M, int NumInputElements,
14016 bool &DstIsLeft, int &Anomaly) {
14017 if (M.size() != static_cast<size_t>(NumInputElements))
14018 return false;
14019
14020 int NumLHSMatch = 0, NumRHSMatch = 0;
14021 int LastLHSMismatch = -1, LastRHSMismatch = -1;
14022
14023 for (int i = 0; i < NumInputElements; ++i) {
14024 if (M[i] == -1) {
14025 ++NumLHSMatch;
14026 ++NumRHSMatch;
14027 continue;
14028 }
14029
14030 if (M[i] == i)
14031 ++NumLHSMatch;
14032 else
14033 LastLHSMismatch = i;
14034
14035 if (M[i] == i + NumInputElements)
14036 ++NumRHSMatch;
14037 else
14038 LastRHSMismatch = i;
14039 }
14040
14041 if (NumLHSMatch == NumInputElements - 1) {
14042 DstIsLeft = true;
14043 Anomaly = LastLHSMismatch;
14044 return true;
14045 } else if (NumRHSMatch == NumInputElements - 1) {
14046 DstIsLeft = false;
14047 Anomaly = LastRHSMismatch;
14048 return true;
14049 }
14050
14051 return false;
14052}
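// For example, with 4 input elements the mask <0, 1, 6, 3> matches the LHS in
// every lane except lane 2, so this returns DstIsLeft = true and Anomaly = 2:
// the shuffle is a single INS of an RHS element into lane 2 of the LHS.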
14053
14054static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
14055 if (VT.getSizeInBits() != 128)
14056 return false;
14057
14058 unsigned NumElts = VT.getVectorNumElements();
14059
14060 for (int I = 0, E = NumElts / 2; I != E; I++) {
14061 if (Mask[I] != I)
14062 return false;
14063 }
14064
14065 int Offset = NumElts / 2;
14066 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
14067 if (Mask[I] != I + SplitLHS * Offset)
14068 return false;
14069 }
14070
14071 return true;
14072}
14073
14074 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
14075 SDLoc DL(Op);
14076 EVT VT = Op.getValueType();
14077 SDValue V0 = Op.getOperand(0);
14078 SDValue V1 = Op.getOperand(1);
14079 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14080
14081 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
14082 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
14083 return SDValue();
14084
14085 bool SplitV0 = V0.getValueSizeInBits() == 128;
14086
14087 if (!isConcatMask(Mask, VT, SplitV0))
14088 return SDValue();
14089
14090 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
14091 if (SplitV0) {
14092 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
14093 DAG.getConstant(0, DL, MVT::i64));
14094 }
14095 if (V1.getValueSizeInBits() == 128) {
14096 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
14097 DAG.getConstant(0, DL, MVT::i64));
14098 }
14099 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
14100}
14101
14102/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
14103/// the specified operations to build the shuffle. ID is the perfect-shuffle
14104 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the Perfect shuffle
14105 /// table entry and LHS/RHS are the immediate inputs for this stage of the
14106 /// shuffle.
14107 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2,
14108 unsigned PFEntry, SDValue LHS,
14109 SDValue RHS, SelectionDAG &DAG,
14110 const SDLoc &DL) {
14111 unsigned OpNum = (PFEntry >> 26) & 0x0F;
14112 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
14113 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
14114
14115 enum {
14116 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
14117 OP_VREV,
14118 OP_VDUP0,
14119 OP_VDUP1,
14120 OP_VDUP2,
14121 OP_VDUP3,
14122 OP_VEXT1,
14123 OP_VEXT2,
14124 OP_VEXT3,
14125 OP_VUZPL, // VUZP, left result
14126 OP_VUZPR, // VUZP, right result
14127 OP_VZIPL, // VZIP, left result
14128 OP_VZIPR, // VZIP, right result
14129 OP_VTRNL, // VTRN, left result
14130 OP_VTRNR, // VTRN, right result
14131 OP_MOVLANE // Move lane. RHSID is the lane to move into
14132 };
14133
14134 if (OpNum == OP_COPY) {
14135 if (LHSID == (1 * 9 + 2) * 9 + 3)
14136 return LHS;
14137 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
14138 return RHS;
14139 }
14140
14141 if (OpNum == OP_MOVLANE) {
14142 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
14143 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
14144 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
14145 Elt = 3 - Elt;
14146 while (Elt > 0) {
14147 ID /= 9;
14148 Elt--;
14149 }
14150 return (ID % 9 == 8) ? -1 : ID % 9;
14151 };
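 // For example, getPFIDLane(102, 2) == 2: 102 encodes the mask <0,1,2,3>
 // (base-9 digits, lane 0 first), so lane 2's source is element 2.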
14152
14153 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
14154 // get the lane to move from the PFID, which is always from the
14155 // original vectors (V1 or V2).
14156 SDValue OpLHS = GeneratePerfectShuffle(
14157 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, DL);
14158 EVT VT = OpLHS.getValueType();
14159 assert(RHSID < 8 && "Expected a lane index for RHSID!");
14160 unsigned ExtLane = 0;
14161 SDValue Input;
14162
14163 // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
14164 // convert into a higher type.
14165 if (RHSID & 0x4) {
14166 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
14167 if (MaskElt == -1)
14168 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
14169 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14170 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
14171 Input = MaskElt < 2 ? V1 : V2;
14172 if (VT.getScalarSizeInBits() == 16) {
14173 Input = DAG.getBitcast(MVT::v2f32, Input);
14174 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
14175 } else {
14176 assert(VT.getScalarSizeInBits() == 32 &&
14177 "Expected 16 or 32 bit shuffle elements");
14178 Input = DAG.getBitcast(MVT::v2f64, Input);
14179 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
14180 }
14181 } else {
14182 int MaskElt = getPFIDLane(ID, RHSID);
14183 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
14184 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
14185 Input = MaskElt < 4 ? V1 : V2;
14186 // Be careful about creating illegal types. Use f16 instead of i16.
14187 if (VT == MVT::v4i16) {
14188 Input = DAG.getBitcast(MVT::v4f16, Input);
14189 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
14190 }
14191 }
14192 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
14193 Input.getValueType().getVectorElementType(),
14194 Input, DAG.getVectorIdxConstant(ExtLane, DL));
14195 SDValue Ins =
14196 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Input.getValueType(), OpLHS,
14197 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, DL));
14198 return DAG.getBitcast(VT, Ins);
14199 }
14200
14201 SDValue OpLHS, OpRHS;
14202 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
14203 RHS, DAG, DL);
14204 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
14205 RHS, DAG, DL);
14206 EVT VT = OpLHS.getValueType();
14207
14208 switch (OpNum) {
14209 default:
14210 llvm_unreachable("Unknown shuffle opcode!");
14211 case OP_VREV:
14212 // VREV divides the vector in half and swaps within the half.
14213 if (VT.getVectorElementType() == MVT::i32 ||
14214 VT.getVectorElementType() == MVT::f32)
14215 return DAG.getNode(AArch64ISD::REV64, DL, VT, OpLHS);
14216 // vrev <4 x i16> -> REV32
14217 if (VT.getVectorElementType() == MVT::i16 ||
14218 VT.getVectorElementType() == MVT::f16 ||
14219 VT.getVectorElementType() == MVT::bf16)
14220 return DAG.getNode(AArch64ISD::REV32, DL, VT, OpLHS);
14221 // vrev <4 x i8> -> REV16
14222 assert(VT.getVectorElementType() == MVT::i8);
14223 return DAG.getNode(AArch64ISD::REV16, DL, VT, OpLHS);
14224 case OP_VDUP0:
14225 case OP_VDUP1:
14226 case OP_VDUP2:
14227 case OP_VDUP3: {
14228 EVT EltTy = VT.getVectorElementType();
14229 unsigned Opcode;
14230 if (EltTy == MVT::i8)
14231 Opcode = AArch64ISD::DUPLANE8;
14232 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
14233 Opcode = AArch64ISD::DUPLANE16;
14234 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
14235 Opcode = AArch64ISD::DUPLANE32;
14236 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
14237 Opcode = AArch64ISD::DUPLANE64;
14238 else
14239 llvm_unreachable("Invalid vector element type?");
14240
14241 if (VT.getSizeInBits() == 64)
14242 OpLHS = WidenVector(OpLHS, DAG);
14243 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, DL, MVT::i64);
14244 return DAG.getNode(Opcode, DL, VT, OpLHS, Lane);
14245 }
14246 case OP_VEXT1:
14247 case OP_VEXT2:
14248 case OP_VEXT3: {
14249 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
14250 return DAG.getNode(AArch64ISD::EXT, DL, VT, OpLHS, OpRHS,
14251 DAG.getConstant(Imm, DL, MVT::i32));
14252 }
14253 case OP_VUZPL:
14254 return DAG.getNode(AArch64ISD::UZP1, DL, VT, OpLHS, OpRHS);
14255 case OP_VUZPR:
14256 return DAG.getNode(AArch64ISD::UZP2, DL, VT, OpLHS, OpRHS);
14257 case OP_VZIPL:
14258 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, OpLHS, OpRHS);
14259 case OP_VZIPR:
14260 return DAG.getNode(AArch64ISD::ZIP2, DL, VT, OpLHS, OpRHS);
14261 case OP_VTRNL:
14262 return DAG.getNode(AArch64ISD::TRN1, DL, VT, OpLHS, OpRHS);
14263 case OP_VTRNR:
14264 return DAG.getNode(AArch64ISD::TRN2, DL, VT, OpLHS, OpRHS);
14265 }
14266}
14267
14268static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
14269 SelectionDAG &DAG) {
14270 // Check to see if we can use the TBL instruction.
14271 SDValue V1 = Op.getOperand(0);
14272 SDValue V2 = Op.getOperand(1);
14273 SDLoc DL(Op);
14274
14275 EVT EltVT = Op.getValueType().getVectorElementType();
14276 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
14277
14278 bool Swap = false;
14279 if (V1.isUndef() || isZerosVector(V1.getNode())) {
14280 std::swap(V1, V2);
14281 Swap = true;
14282 }
14283
14284 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
14285 // out of range values with 0s. We do need to make sure that any out-of-range
14286 // values are really out-of-range for a v16i8 vector.
14287 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
14288 MVT IndexVT = MVT::v8i8;
14289 unsigned IndexLen = 8;
14290 if (Op.getValueSizeInBits() == 128) {
14291 IndexVT = MVT::v16i8;
14292 IndexLen = 16;
14293 }
14294
14295 SmallVector<SDValue, 8> TBLMask;
14296 for (int Val : ShuffleMask) {
14297 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
14298 unsigned Offset = Byte + Val * BytesPerElt;
14299 if (Swap)
14300 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
14301 if (IsUndefOrZero && Offset >= IndexLen)
14302 Offset = 255;
14303 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
14304 }
14305 }
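 // For example, a v4i16 shuffle with mask <0,5,1,6> (and neither source
 // undef/zero) expands to the byte-level TBL mask {0,1,10,11,2,3,12,13},
 // where byte indices of 8 and above select from the second source once the
 // two sources are concatenated.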
14306
14307 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
14308 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
14309
14310 SDValue Shuffle;
14311 if (IsUndefOrZero) {
14312 if (IndexLen == 8)
14313 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
14314 Shuffle = DAG.getNode(
14315 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14316 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14317 V1Cst,
14318 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14319 } else {
14320 if (IndexLen == 8) {
14321 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
14322 Shuffle = DAG.getNode(
14323 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14324 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32),
14325 V1Cst,
14326 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14327 } else {
14328 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
14329 // cannot currently represent the register constraints on the input
14330 // table registers.
14331 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
14332 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
14333 // IndexLen));
14334 Shuffle = DAG.getNode(
14335 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
14336 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32),
14337 V1Cst, V2Cst,
14338 DAG.getBuildVector(IndexVT, DL, ArrayRef(TBLMask.data(), IndexLen)));
14339 }
14340 }
14341 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
14342}
14343
14344static unsigned getDUPLANEOp(EVT EltType) {
14345 if (EltType == MVT::i8)
14346 return AArch64ISD::DUPLANE8;
14347 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
14348 return AArch64ISD::DUPLANE16;
14349 if (EltType == MVT::i32 || EltType == MVT::f32)
14350 return AArch64ISD::DUPLANE32;
14351 if (EltType == MVT::i64 || EltType == MVT::f64)
14352 return AArch64ISD::DUPLANE64;
14353
14354 llvm_unreachable("Invalid vector element type?");
14355}
14356
14357static SDValue constructDup(SDValue V, int Lane, SDLoc DL, EVT VT,
14358 unsigned Opcode, SelectionDAG &DAG) {
14359 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
14360 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
14361 // Match: dup (bitcast (extract_subv X, C)), LaneC
14362 if (BitCast.getOpcode() != ISD::BITCAST ||
14363 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
14364 return false;
14365
14366 // The extract index must align in the destination type. That may not
14367 // happen if the bitcast is from narrow to wide type.
14368 SDValue Extract = BitCast.getOperand(0);
14369 unsigned ExtIdx = Extract.getConstantOperandVal(1);
14370 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
14371 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
14372 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
14373 if (ExtIdxInBits % CastedEltBitWidth != 0)
14374 return false;
14375
14376 // Can't handle cases where vector size is not 128-bit
14377 if (!Extract.getOperand(0).getValueType().is128BitVector())
14378 return false;
14379
14380 // Update the lane value by offsetting with the scaled extract index.
14381 LaneC += ExtIdxInBits / CastedEltBitWidth;
14382
14383 // Determine the casted vector type of the wide vector input.
14384 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
14385 // Examples:
14386 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
14387 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
14388 unsigned SrcVecNumElts =
14389 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
14390 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
14391 SrcVecNumElts);
14392 return true;
14393 };
14394 MVT CastVT;
14395 if (getScaledOffsetDup(V, Lane, CastVT)) {
14396 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
14397 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
14398 V.getOperand(0).getValueType().is128BitVector()) {
14399 // The lane is incremented by the index of the extract.
14400 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
14401 Lane += V.getConstantOperandVal(1);
14402 V = V.getOperand(0);
14403 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
14404 // The lane is decremented if we are splatting from the 2nd operand.
14405 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
14406 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
14407 Lane -= Idx * VT.getVectorNumElements() / 2;
14408 V = WidenVector(V.getOperand(Idx), DAG);
14409 } else if (VT.getSizeInBits() == 64) {
14410 // Widen the operand to 128-bit register with undef.
14411 V = WidenVector(V, DAG);
14412 }
14413 return DAG.getNode(Opcode, DL, VT, V, DAG.getConstant(Lane, DL, MVT::i64));
14414}
14415
14416// Try to widen element type to get a new mask value for a better permutation
14417// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
14418// UZP1/2, TRN1/2, REV, INS, etc.
14419// For example:
14420// shufflevector <4 x i32> %a, <4 x i32> %b,
14421// <4 x i32> <i32 6, i32 7, i32 2, i32 3>
14422// is equivalent to:
14423// shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
14424// Finally, we can get:
14425// mov v0.d[0], v1.d[1]
14426static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
14427 SDLoc DL(Op);
14428 EVT VT = Op.getValueType();
14429 EVT ScalarVT = VT.getVectorElementType();
14430 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
14431 SDValue V0 = Op.getOperand(0);
14432 SDValue V1 = Op.getOperand(1);
14433 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
14434
14435 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
14436 // We need to make sure the wider element type is legal. Thus, ElementSize
14437 // should not be larger than 32 bits, and the i1 type should also be excluded.
14438 if (ElementSize > 32 || ElementSize == 1)
14439 return SDValue();
14440
14441 SmallVector<int, 8> NewMask;
14442 if (widenShuffleMaskElts(Mask, NewMask)) {
14443 MVT NewEltVT = VT.isFloatingPoint()
14444 ? MVT::getFloatingPointVT(ElementSize * 2)
14445 : MVT::getIntegerVT(ElementSize * 2);
14446 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14447 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14448 V0 = DAG.getBitcast(NewVT, V0);
14449 V1 = DAG.getBitcast(NewVT, V1);
14450 return DAG.getBitcast(VT,
14451 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
14452 }
14453 }
14454
14455 return SDValue();
14456}
14457
14458// Try to fold shuffle (tbl2, tbl2) into a single tbl4.
14459static SDValue tryToConvertShuffleOfTbl2ToTbl4(SDValue Op,
14460 ArrayRef<int> ShuffleMask,
14461 SelectionDAG &DAG) {
14462 SDValue Tbl1 = Op->getOperand(0);
14463 SDValue Tbl2 = Op->getOperand(1);
14464 SDLoc DL(Op);
14465 SDValue Tbl2ID =
14466 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i64);
14467
14468 EVT VT = Op.getValueType();
14469 if (Tbl1.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14470 Tbl1.getOperand(0) != Tbl2ID ||
14471 Tbl2.getOpcode() != ISD::INTRINSIC_WO_CHAIN ||
14472 Tbl2.getOperand(0) != Tbl2ID)
14473 return SDValue();
14474
14475 if (Tbl1.getValueType() != MVT::v16i8 || Tbl2.getValueType() != MVT::v16i8)
14476 return SDValue();
14477
14478 SDValue Mask1 = Tbl1.getOperand(3);
14479 SDValue Mask2 = Tbl2.getOperand(3);
14480 if (Mask1.getOpcode() != ISD::BUILD_VECTOR ||
14481 Mask2.getOpcode() != ISD::BUILD_VECTOR)
14482 return SDValue();
14483
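 // Build a tbl4 mask from the two tbl2 masks: shuffle elements below 16 take
 // their byte index straight from Tbl1's mask, while elements of 16 or more
 // take Tbl2's mask entry plus 32, since Tbl2's two table registers become
 // tables three and four of the tbl4 (their bytes start at index 32).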
14484 SmallVector<SDValue, 16> TBLMaskParts(16, SDValue());
14485 for (unsigned I = 0; I < 16; I++) {
14486 if (ShuffleMask[I] < 16)
14487 TBLMaskParts[I] = Mask1.getOperand(ShuffleMask[I]);
14488 else {
14489 auto *C = dyn_cast<ConstantSDNode>(Mask2.getOperand(ShuffleMask[I] - 16));
14490 if (!C)
14491 return SDValue();
14492 TBLMaskParts[I] = DAG.getConstant(C->getSExtValue() + 32, DL, MVT::i32);
14493 }
14494 }
14495
14496 SDValue TBLMask = DAG.getBuildVector(VT, DL, TBLMaskParts);
14497 SDValue ID =
14498 DAG.getTargetConstant(Intrinsic::aarch64_neon_tbl4, DL, MVT::i64);
14499
14500 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::v16i8,
14501 {ID, Tbl1->getOperand(1), Tbl1->getOperand(2),
14502 Tbl2->getOperand(1), Tbl2->getOperand(2), TBLMask});
14503}
14504
14505// Baseline legalization for ZERO_EXTEND_VECTOR_INREG will blend-in zeros,
14506// but we don't have an appropriate instruction,
14507// so custom-lower it as ZIP1-with-zeros.
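// For example, (v8i16 zero_extend_vector_inreg (v16i8 X)) becomes
// bitcast(ZIP1(X, zeroes)): interleaving the low source lanes with zero lanes
// leaves each of those lanes zero-extended once the result is reinterpreted
// at the wider element width.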
14508SDValue
14509AArch64TargetLowering::LowerZERO_EXTEND_VECTOR_INREG(SDValue Op,
14510 SelectionDAG &DAG) const {
14511 SDLoc DL(Op);
14512 EVT VT = Op.getValueType();
14513 SDValue SrcOp = Op.getOperand(0);
14514 EVT SrcVT = SrcOp.getValueType();
14515 assert(VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits() == 0 &&
14516 "Unexpected extension factor.");
14517 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
14518 // FIXME: support multi-step zipping?
14519 if (Scale != 2)
14520 return SDValue();
14521 SDValue Zeros = DAG.getConstant(0, DL, SrcVT);
14522 return DAG.getBitcast(VT,
14523 DAG.getNode(AArch64ISD::ZIP1, DL, SrcVT, SrcOp, Zeros));
14524}
14525
14526SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
14527 SelectionDAG &DAG) const {
14528 SDLoc DL(Op);
14529 EVT VT = Op.getValueType();
14530
14531 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
14532
14533 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14534 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
14535
14536 // Convert shuffles that are directly supported on NEON to target-specific
14537 // DAG nodes, instead of keeping them as shuffles and matching them again
14538 // during code selection. This is more efficient and avoids the possibility
14539 // of inconsistencies between legalization and selection.
14540 ArrayRef<int> ShuffleMask = SVN->getMask();
14541
14542 SDValue V1 = Op.getOperand(0);
14543 SDValue V2 = Op.getOperand(1);
14544
14545 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
14546 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
14547 "Unexpected VECTOR_SHUFFLE mask size!");
14548
14549 if (SDValue Res = tryToConvertShuffleOfTbl2ToTbl4(Op, ShuffleMask, DAG))
14550 return Res;
14551
14552 if (SVN->isSplat()) {
14553 int Lane = SVN->getSplatIndex();
14554 // If this is undef splat, generate it via "just" vdup, if possible.
14555 if (Lane == -1)
14556 Lane = 0;
14557
14558 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
14559 return DAG.getNode(AArch64ISD::DUP, DL, V1.getValueType(),
14560 V1.getOperand(0));
14561 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
14562 // constant. If so, we can just reference the lane's definition directly.
14563 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
14564 !isa<ConstantSDNode>(V1.getOperand(Lane)))
14565 return DAG.getNode(AArch64ISD::DUP, DL, VT, V1.getOperand(Lane));
14566
14567 // Otherwise, duplicate from the lane of the input vector.
14568 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
14569 return constructDup(V1, Lane, DL, VT, Opcode, DAG);
14570 }
14571
14572 // Check if the mask matches a DUP for a wider element
14573 for (unsigned LaneSize : {64U, 32U, 16U}) {
14574 unsigned Lane = 0;
14575 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
14576 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
14577 : LaneSize == 32 ? AArch64ISD::DUPLANE32
14578 : AArch64ISD::DUPLANE16;
14579 // Cast V1 to an integer vector with required lane size
14580 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
14581 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
14582 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
14583 V1 = DAG.getBitcast(NewVecTy, V1);
14584 // Construct the DUP instruction
14585 V1 = constructDup(V1, Lane, DL, NewVecTy, Opcode, DAG);
14586 // Cast back to the original type
14587 return DAG.getBitcast(VT, V1);
14588 }
14589 }
14590
14591 unsigned NumElts = VT.getVectorNumElements();
14592 unsigned EltSize = VT.getScalarSizeInBits();
14593 if (isREVMask(ShuffleMask, EltSize, NumElts, 64))
14594 return DAG.getNode(AArch64ISD::REV64, DL, V1.getValueType(), V1);
14595 if (isREVMask(ShuffleMask, EltSize, NumElts, 32))
14596 return DAG.getNode(AArch64ISD::REV32, DL, V1.getValueType(), V1);
14597 if (isREVMask(ShuffleMask, EltSize, NumElts, 16))
14598 return DAG.getNode(AArch64ISD::REV16, DL, V1.getValueType(), V1);
14599
14600 if (((NumElts == 8 && EltSize == 16) || (NumElts == 16 && EltSize == 8)) &&
14601 ShuffleVectorInst::isReverseMask(ShuffleMask, ShuffleMask.size())) {
14602 SDValue Rev = DAG.getNode(AArch64ISD::REV64, DL, VT, V1);
14603 return DAG.getNode(AArch64ISD::EXT, DL, VT, Rev, Rev,
14604 DAG.getConstant(8, DL, MVT::i32));
14605 }
14606
14607 bool ReverseEXT = false;
14608 unsigned Imm;
14609 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
14610 if (ReverseEXT)
14611 std::swap(V1, V2);
14612 Imm *= getExtFactor(V1);
14613 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V2,
14614 DAG.getConstant(Imm, DL, MVT::i32));
14615 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
14616 Imm *= getExtFactor(V1);
14617 return DAG.getNode(AArch64ISD::EXT, DL, V1.getValueType(), V1, V1,
14618 DAG.getConstant(Imm, DL, MVT::i32));
14619 }
14620
14621 unsigned WhichResult;
14622 if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
14623 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14624 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14625 }
14626 if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
14627 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14628 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14629 }
14630 if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
14631 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14632 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
14633 }
14634
14635 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14636 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
14637 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14638 }
14639 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14640 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
14641 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14642 }
14643 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
14644 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
14645 return DAG.getNode(Opc, DL, V1.getValueType(), V1, V1);
14646 }
14647
14648 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
14649 return Concat;
14650
14651 bool DstIsLeft;
14652 int Anomaly;
14653 int NumInputElements = V1.getValueType().getVectorNumElements();
14654 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
14655 SDValue DstVec = DstIsLeft ? V1 : V2;
14656 SDValue DstLaneV = DAG.getConstant(Anomaly, DL, MVT::i64);
14657
14658 SDValue SrcVec = V1;
14659 int SrcLane = ShuffleMask[Anomaly];
14660 if (SrcLane >= NumInputElements) {
14661 SrcVec = V2;
14662 SrcLane -= NumElts;
14663 }
14664 SDValue SrcLaneV = DAG.getConstant(SrcLane, DL, MVT::i64);
14665
14666 EVT ScalarVT = VT.getVectorElementType();
14667
14668 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
14669 ScalarVT = MVT::i32;
14670
14671 return DAG.getNode(
14672 ISD::INSERT_VECTOR_ELT, DL, VT, DstVec,
14673 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SrcVec, SrcLaneV),
14674 DstLaneV);
14675 }
14676
14677 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
14678 return NewSD;
14679
14680 // If the shuffle is not directly supported and it has 4 elements, use
14681 // the PerfectShuffle-generated table to synthesize it from other shuffles.
14682 if (NumElts == 4) {
14683 unsigned PFIndexes[4];
14684 for (unsigned i = 0; i != 4; ++i) {
14685 if (ShuffleMask[i] < 0)
14686 PFIndexes[i] = 8;
14687 else
14688 PFIndexes[i] = ShuffleMask[i];
14689 }
14690
14691 // Compute the index in the perfect shuffle table.
14692 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
14693 PFIndexes[2] * 9 + PFIndexes[3];
14694 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
14695 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
14696 DL);
14697 }
14698
14699 // Check for a "select shuffle", generating a BSL to pick between lanes in
14700 // V1/V2.
14701 if (ShuffleVectorInst::isSelectMask(ShuffleMask, NumElts)) {
14702 assert(VT.getScalarSizeInBits() <= 32 &&
14703 "Expected larger vector element sizes to be handled already");
14704 SmallVector<SDValue> MaskElts;
14705 for (int M : ShuffleMask)
14706 MaskElts.push_back(DAG.getConstant(
14707 M >= static_cast<int>(NumElts) ? 0 : 0xffffffff, DL, MVT::i32));
14708 EVT IVT = VT.changeVectorElementTypeToInteger();
14709 SDValue MaskConst = DAG.getBuildVector(IVT, DL, MaskElts);
14710 return DAG.getBitcast(VT, DAG.getNode(AArch64ISD::BSP, DL, IVT, MaskConst,
14711 DAG.getBitcast(IVT, V1),
14712 DAG.getBitcast(IVT, V2)));
14713 }
14714
14715 // Fall back to generating a TBL
14716 return GenerateTBL(Op, ShuffleMask, DAG);
14717}
14718
14719SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
14720 SelectionDAG &DAG) const {
14721 EVT VT = Op.getValueType();
14722
14723 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
14724 return LowerToScalableOp(Op, DAG);
14725
14726 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
14727 "Unexpected vector type!");
14728
14729 // We can handle the constant cases during isel.
14730 if (isa<ConstantSDNode>(Op.getOperand(0)))
14731 return Op;
14732
14733 // There isn't a natural way to handle the general i1 case, so we use some
14734 // trickery with whilelo.
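 // The splat value is sign-extended from i1 to i64, so it is either 0 or -1;
 // whilelo(0, 0) yields an all-false predicate, while whilelo(0, -1) compares
 // unsigned against UINT64_MAX and so yields an all-true one.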
14735 SDLoc DL(Op);
14736 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
14737 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
14738 DAG.getValueType(MVT::i1));
14739 SDValue ID =
14740 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
14741 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
14742 if (VT == MVT::nxv1i1)
14743 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
14744 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
14745 Zero, SplatVal),
14746 Zero);
14747 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
14748}
14749
14750SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
14751 SelectionDAG &DAG) const {
14752 SDLoc DL(Op);
14753
14754 EVT VT = Op.getValueType();
14755 if (!isTypeLegal(VT) || !VT.isScalableVector())
14756 return SDValue();
14757
14758 // Current lowering only supports the SVE-ACLE types.
14759 if (VT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
14760 return SDValue();
14761
14762 // The DUPQ operation is independent of element type so normalise to i64s.
14763 SDValue Idx128 = Op.getOperand(2);
14764
14765 // DUPQ can be used when idx is in range.
14766 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
14767 if (CIdx && (CIdx->getZExtValue() <= 3)) {
14768 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
14769 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
14770 }
14771
14772 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
14773
14774 // The ACLE says this must produce the same result as:
14775 // svtbl(data, svadd_x(svptrue_b64(),
14776 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
14777 // index * 2))
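 // For example, with index 1 the mask below becomes <2,3,2,3,...>, so the TBL
 // copies 64-bit elements 2 and 3 (the second 128-bit quadword) into every
 // quadword of the result.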
14778 SDValue One = DAG.getConstant(1, DL, MVT::i64);
14779 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
14780
14781 // create the vector 0,1,0,1,...
14782 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
14783 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
14784
14785 // create the vector idx64,idx64+1,idx64,idx64+1,...
14786 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
14787 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
14788 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
14789
14790 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
14791 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
14792 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
14793}
14794
14795
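// resolveBuildVector packs a constant-splat BUILD_VECTOR into two bit patterns
// the width of the whole vector: CnstBits is the splat value repeated across
// the vector, and UndefBits is the same pattern with the bits contributed by
// undef lanes flipped, giving callers a second candidate pattern when trying
// to match a SIMD immediate.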
14796static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
14797 APInt &UndefBits) {
14798 EVT VT = BVN->getValueType(0);
14799 APInt SplatBits, SplatUndef;
14800 unsigned SplatBitSize;
14801 bool HasAnyUndefs;
14802 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14803 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
14804
14805 for (unsigned i = 0; i < NumSplats; ++i) {
14806 CnstBits <<= SplatBitSize;
14807 UndefBits <<= SplatBitSize;
14808 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
14809 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
14810 }
14811
14812 return true;
14813 }
14814
14815 return false;
14816}
14817
14818// Try 64-bit splatted SIMD immediate.
14819static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14820 const APInt &Bits) {
14821 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14822 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14823 EVT VT = Op.getValueType();
14824 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
14825
14826 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
14827 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
14828
14829 SDLoc DL(Op);
14830 SDValue Mov =
14831 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14832 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14833 }
14834 }
14835
14836 return SDValue();
14837}
14838
14839// Try 32-bit splatted SIMD immediate.
14840static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14841 const APInt &Bits,
14842 const SDValue *LHS = nullptr) {
14843 EVT VT = Op.getValueType();
14844 if (VT.isFixedLengthVector() &&
14845 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14846 return SDValue();
14847
14848 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14849 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14850 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14851 bool isAdvSIMDModImm = false;
14852 uint64_t Shift;
14853
14854 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
14855 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
14856 Shift = 0;
14857 }
14858 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
14859 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
14860 Shift = 8;
14861 }
14862 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
14863 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
14864 Shift = 16;
14865 }
14866 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
14867 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
14868 Shift = 24;
14869 }
14870
14871 if (isAdvSIMDModImm) {
14872 SDLoc DL(Op);
14873 SDValue Mov;
14874
14875 if (LHS)
14876 Mov = DAG.getNode(NewOp, DL, MovTy,
14877 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14878 DAG.getConstant(Value, DL, MVT::i32),
14879 DAG.getConstant(Shift, DL, MVT::i32));
14880 else
14881 Mov =
14882 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14883 DAG.getConstant(Shift, DL, MVT::i32));
14884
14885 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14886 }
14887 }
14888
14889 return SDValue();
14890}
14891
14892// Try 16-bit splatted SIMD immediate.
14893static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14894 const APInt &Bits,
14895 const SDValue *LHS = nullptr) {
14896 EVT VT = Op.getValueType();
14897 if (VT.isFixedLengthVector() &&
14898 !DAG.getSubtarget<AArch64Subtarget>().isNeonAvailable())
14899 return SDValue();
14900
14901 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14902 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14903 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
14904 bool isAdvSIMDModImm = false;
14905 uint64_t Shift;
14906
14907 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
14908 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
14909 Shift = 0;
14910 }
14911 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
14912 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
14913 Shift = 8;
14914 }
14915
14916 if (isAdvSIMDModImm) {
14917 SDLoc DL(Op);
14918 SDValue Mov;
14919
14920 if (LHS)
14921 Mov = DAG.getNode(NewOp, DL, MovTy,
14922 DAG.getNode(AArch64ISD::NVCAST, DL, MovTy, *LHS),
14923 DAG.getConstant(Value, DL, MVT::i32),
14924 DAG.getConstant(Shift, DL, MVT::i32));
14925 else
14926 Mov =
14927 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14928 DAG.getConstant(Shift, DL, MVT::i32));
14929
14930 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14931 }
14932 }
14933
14934 return SDValue();
14935}
14936
14937// Try 32-bit splatted SIMD immediate with shifted ones.
14938static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
14939 SelectionDAG &DAG, const APInt &Bits) {
14940 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14941 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14942 EVT VT = Op.getValueType();
14943 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
14944 bool isAdvSIMDModImm = false;
14945 uint64_t Shift;
14946
14947 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
14948 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
14949 Shift = 264;
14950 }
14951 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
14952 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
14953 Shift = 272;
14954 }
14955
14956 if (isAdvSIMDModImm) {
14957 SDLoc DL(Op);
14958 SDValue Mov =
14959 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32),
14960 DAG.getConstant(Shift, DL, MVT::i32));
14961 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14962 }
14963 }
14964
14965 return SDValue();
14966}
14967
14968// Try 8-bit splatted SIMD immediate.
14969static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14970 const APInt &Bits) {
14971 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14972 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14973 EVT VT = Op.getValueType();
14974 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
14975
14976 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
14977 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
14978
14979 SDLoc DL(Op);
14980 SDValue Mov =
14981 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
14982 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
14983 }
14984 }
14985
14986 return SDValue();
14987}
14988
14989// Try FP splatted SIMD immediate.
14990static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
14991 const APInt &Bits) {
14992 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
14993 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
14994 EVT VT = Op.getValueType();
14995 bool isWide = (VT.getSizeInBits() == 128);
14996 MVT MovTy;
14997 bool isAdvSIMDModImm = false;
14998
14999 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
15000 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
15001 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
15002 }
15003 else if (isWide &&
15004 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
15005 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
15006 MovTy = MVT::v2f64;
15007 }
15008
15009 if (isAdvSIMDModImm) {
15010 SDLoc DL(Op);
15011 SDValue Mov =
15012 DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Value, DL, MVT::i32));
15013 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Mov);
15014 }
15015 }
15016
15017 return SDValue();
15018}
15019
15020// Specialized code to quickly check whether PotentialBVec is a BUILD_VECTOR
15021// whose elements are all the same constant integer value, which is returned
15022// in the reference argument ConstVal.
15023static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
15024 uint64_t &ConstVal) {
15025 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
15026 if (!Bvec)
15027 return false;
15028 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
15029 if (!FirstElt)
15030 return false;
15031 EVT VT = Bvec->getValueType(0);
15032 unsigned NumElts = VT.getVectorNumElements();
15033 for (unsigned i = 1; i < NumElts; ++i)
15034 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
15035 return false;
15036 ConstVal = FirstElt->getZExtValue();
15037 return true;
15038}
15039
15040static bool isAllInactivePredicate(SDValue N) {
15041 // Look through cast.
15042 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
15043 N = N.getOperand(0);
15044
15045 return ISD::isConstantSplatVectorAllZeros(N.getNode());
15046}
15047
15048static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
15049 unsigned NumElts = N.getValueType().getVectorMinNumElements();
15050
15051 // Look through cast.
15052 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
15053 N = N.getOperand(0);
15054 // When reinterpreting from a type with fewer elements the "new" elements
15055 // are not active, so bail if they're likely to be used.
15056 if (N.getValueType().getVectorMinNumElements() < NumElts)
15057 return false;
15058 }
15059
15060 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
15061 return true;
15062
15063 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
15064 // or smaller than the implicit element type represented by N.
15065 // NOTE: A larger element count implies a smaller element type.
15066 if (N.getOpcode() == AArch64ISD::PTRUE &&
15067 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
15068 return N.getValueType().getVectorMinNumElements() >= NumElts;
15069
15070 // If we're compiling for a specific vector-length, we can check if the
15071 // pattern's VL equals that of the scalable vector at runtime.
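 // For example, if the minimum and maximum SVE vector lengths are both known
 // to be 256 bits, VScale is 2, so "ptrue p.s, vl8" is all active for an
 // nxv4i1 predicate (4 elements * 2 == 8).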
15072 if (N.getOpcode() == AArch64ISD::PTRUE) {
15073 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15074 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
15075 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
15076 if (MaxSVESize && MinSVESize == MaxSVESize) {
15077 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
15078 unsigned PatNumElts =
15079 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
15080 return PatNumElts == (NumElts * VScale);
15081 }
15082 }
15083
15084 return false;
15085}
15086
15087// Attempt to form a vector S[LR]I instruction from (or (and X, BvecC1), (lsl Y, C2)),
15088// i.e. (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
15089// BUILD_VECTOR with constant element C1, C2 is a constant, and:
15090// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
15091// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
15092// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
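// For example, on v4i32 with C2 == 8:
//   (or (and X, splat(0x000000ff)), (VSHL Y, #8))  --> (VSLI X, Y, #8)
//   (or (and X, splat(0xff000000)), (VLSHR Y, #8)) --> (VSRI X, Y, #8)
// In each lane the masked bits of X are kept and the remaining bits come from
// the shifted Y.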
15093static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
15094 EVT VT = N->getValueType(0);
15095
15096 if (!VT.isVector())
15097 return SDValue();
15098
15099 SDLoc DL(N);
15100
15101 SDValue And;
15102 SDValue Shift;
15103
15104 SDValue FirstOp = N->getOperand(0);
15105 unsigned FirstOpc = FirstOp.getOpcode();
15106 SDValue SecondOp = N->getOperand(1);
15107 unsigned SecondOpc = SecondOp.getOpcode();
15108
15109 // Is one of the operands an AND or a BICi? The AND may have been optimised to
15110 // a BICi in order to use an immediate instead of a register.
15111 // Is the other operand a shl or lshr? This will have been turned into:
15112 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift
15113 // or (AArch64ISD::SHL_PRED || AArch64ISD::SRL_PRED) mask, vector, #shiftVec.
15114 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
15115 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR ||
15116 SecondOpc == AArch64ISD::SHL_PRED ||
15117 SecondOpc == AArch64ISD::SRL_PRED)) {
15118 And = FirstOp;
15119 Shift = SecondOp;
15120
15121 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
15122 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR ||
15123 FirstOpc == AArch64ISD::SHL_PRED ||
15124 FirstOpc == AArch64ISD::SRL_PRED)) {
15125 And = SecondOp;
15126 Shift = FirstOp;
15127 } else
15128 return SDValue();
15129
15130 bool IsAnd = And.getOpcode() == ISD::AND;
15131 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR ||
15132 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15133 bool ShiftHasPredOp = Shift.getOpcode() == AArch64ISD::SHL_PRED ||
15134 Shift.getOpcode() == AArch64ISD::SRL_PRED;
15135
15136 // Is the shift amount constant and are all lanes active?
15137 uint64_t C2;
15138 if (ShiftHasPredOp) {
15139 if (!isAllActivePredicate(DAG, Shift.getOperand(0)))
15140 return SDValue();
15141 APInt C;
15142 if (!ISD::isConstantSplatVector(Shift.getOperand(2).getNode(), C))
15143 return SDValue();
15144 C2 = C.getZExtValue();
15145 } else if (ConstantSDNode *C2node =
15146 dyn_cast<ConstantSDNode>(Shift.getOperand(1)))
15147 C2 = C2node->getZExtValue();
15148 else
15149 return SDValue();
15150
15151 APInt C1AsAPInt;
15152 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
15153 if (IsAnd) {
15154 // Is the and mask vector all constant?
15155 if (!ISD::isConstantSplatVector(And.getOperand(1).getNode(), C1AsAPInt))
15156 return SDValue();
15157 } else {
15158 // Reconstruct the corresponding AND immediate from the two BICi immediates.
15159 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
15160 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
15161 assert(C1nodeImm && C1nodeShift);
15162 C1AsAPInt = ~(C1nodeImm->getAPIntValue() << C1nodeShift->getAPIntValue());
15163 C1AsAPInt = C1AsAPInt.zextOrTrunc(ElemSizeInBits);
15164 }
15165
15166 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
15167 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
15168 // how much one can shift elements of a particular size?
15169 if (C2 > ElemSizeInBits)
15170 return SDValue();
15171
15172 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
15173 : APInt::getLowBitsSet(ElemSizeInBits, C2);
15174 if (C1AsAPInt != RequiredC1)
15175 return SDValue();
15176
15177 SDValue X = And.getOperand(0);
15178 SDValue Y = ShiftHasPredOp ? Shift.getOperand(1) : Shift.getOperand(0);
15179 SDValue Imm = ShiftHasPredOp ? DAG.getTargetConstant(C2, DL, MVT::i32)
15180 : Shift.getOperand(1);
15181
15182 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
15183 return DAG.getNode(Inst, DL, VT, X, Y, Imm);
15184}
15185
15186static SDValue tryLowerToBSL(SDValue N, SelectionDAG &DAG) {
15187 EVT VT = N->getValueType(0);
15188 assert(VT.isVector() && "Expected vector type in tryLowerToBSL\n");
15189 SDLoc DL(N);
15190 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
15191
15192 if (VT.isScalableVector() && !Subtarget.hasSVE2())
15193 return SDValue();
15194
15195 SDValue N0 = N->getOperand(0);
15196 if (N0.getOpcode() != ISD::AND)
15197 return SDValue();
15198
15199 SDValue N1 = N->getOperand(1);
15200 if (N1.getOpcode() != ISD::AND)
15201 return SDValue();
15202
15203 // InstCombine does (not (neg a)) => (add a -1).
15204 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
15205 // Loop over all combinations of AND operands.
15206 for (int i = 1; i >= 0; --i) {
15207 for (int j = 1; j >= 0; --j) {
15208 SDValue O0 = N0->getOperand(i);
15209 SDValue O1 = N1->getOperand(j);
15210 SDValue Sub, Add, SubSibling, AddSibling;
15211
15212 // Find a SUB and an ADD operand, one from each AND.
15213 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
15214 Sub = O0;
15215 Add = O1;
15216 SubSibling = N0->getOperand(1 - i);
15217 AddSibling = N1->getOperand(1 - j);
15218 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
15219 Add = O0;
15220 Sub = O1;
15221 AddSibling = N0->getOperand(1 - i);
15222 SubSibling = N1->getOperand(1 - j);
15223 } else
15224 continue;
15225
15226 if (!ISD::isConstantSplatVectorAllZeros(Sub.getOperand(0).getNode()))
15227 continue;
15228
15229 // The all-ones constant is always the right-hand operand of the Add.
15230 if (!ISD::isConstantSplatVectorAllOnes(Add.getOperand(1).getNode()))
15231 continue;
15232
15233 if (Sub.getOperand(1) != Add.getOperand(0))
15234 continue;
15235
15236 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
15237 }
15238 }
15239
15240 // (or (and a b) (and (not a) c)) => (bsl a b c)
15241 // We only have to look for constant vectors here since the general, variable
15242 // case can be handled in TableGen.
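 // Note that BSP(mask, b, c) selects bits from b where the mask bit is set and
 // from c where it is clear, which is exactly the OR-of-ANDs pattern above.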
15243 unsigned Bits = VT.getScalarSizeInBits();
15244 for (int i = 1; i >= 0; --i)
15245 for (int j = 1; j >= 0; --j) {
15246 APInt Val1, Val2;
15247
15248 if (ISD::isConstantSplatVector(N0->getOperand(i).getNode(), Val1) &&
15249 ISD::isConstantSplatVector(N1->getOperand(j).getNode(), Val2) &&
15250 ~Val1.trunc(Bits) == Val2.trunc(Bits)) {
15251 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15252 N0->getOperand(1 - i), N1->getOperand(1 - j));
15253 }
15254 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
15255 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
15256 if (!BVN0 || !BVN1)
15257 continue;
15258
15259 bool FoundMatch = true;
15260 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
15261 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
15262 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
15263 if (!CN0 || !CN1 ||
15264 CN0->getAPIntValue().trunc(Bits) !=
15265 ~CN1->getAsAPIntVal().trunc(Bits)) {
15266 FoundMatch = false;
15267 break;
15268 }
15269 }
15270 if (FoundMatch)
15271 return DAG.getNode(AArch64ISD::BSP, DL, VT, N0->getOperand(i),
15272 N0->getOperand(1 - i), N1->getOperand(1 - j));
15273 }
15274
15275 return SDValue();
15276}
15277
15278SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
15279 SelectionDAG &DAG) const {
15280 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15281 !Subtarget->isNeonAvailable()))
15282 return LowerToScalableOp(Op, DAG);
15283
15284 if (SDValue Res = tryLowerToBSL(Op, DAG))
15285 return Res;
15286
15287 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
15288 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
15289 return Res;
15290
15291 EVT VT = Op.getValueType();
15292 if (VT.isScalableVector())
15293 return Op;
15294
15295 SDValue LHS = Op.getOperand(0);
15296 BuildVectorSDNode *BVN =
15297 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
15298 if (!BVN) {
15299 // OR commutes, so try swapping the operands.
15300 LHS = Op.getOperand(1);
15301 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
15302 }
15303 if (!BVN)
15304 return Op;
15305
15306 APInt DefBits(VT.getSizeInBits(), 0);
15307 APInt UndefBits(VT.getSizeInBits(), 0);
15308 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15309 SDValue NewOp;
15310
15311 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15312 DefBits, &LHS)) ||
15313 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15314 DefBits, &LHS)))
15315 return NewOp;
15316
15317 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
15318 UndefBits, &LHS)) ||
15319 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
15320 UndefBits, &LHS)))
15321 return NewOp;
15322 }
15323
15324 // We can always fall back to a non-immediate OR.
15325 return Op;
15326}
15327
15328// Normalize the operands of BUILD_VECTOR. The value of constant operands will
15329// be truncated to fit element width.
15330static SDValue NormalizeBuildVector(SDValue Op,
15331 SelectionDAG &DAG) {
15332 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
15333 SDLoc DL(Op);
15334 EVT VT = Op.getValueType();
15335 EVT EltTy = VT.getVectorElementType();
15336
15337 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
15338 return Op;
15339
15340 SmallVector<SDValue, 16> Ops;
15341 for (SDValue Lane : Op->ops()) {
15342 // For integer vectors, type legalization would have promoted the
15343 // operands already. Otherwise, if Op is a floating-point splat
15344 // (with operands cast to integers), then the only possibilities
15345 // are constants and UNDEFs.
15346 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
15347 Lane = DAG.getConstant(
15348 CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(),
15349 DL, MVT::i32);
15350 } else if (Lane.getNode()->isUndef()) {
15351 Lane = DAG.getUNDEF(MVT::i32);
15352 } else {
15353 assert(Lane.getValueType() == MVT::i32 &&
15354 "Unexpected BUILD_VECTOR operand type");
15355 }
15356 Ops.push_back(Lane);
15357 }
15358 return DAG.getBuildVector(VT, DL, Ops);
15359}
15360
15361static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
15362 const AArch64Subtarget *ST, APInt &DefBits) {
15363 EVT VT = Op.getValueType();
15364 // TODO: We should be able to support 64-bit destinations too
15365 if (!ST->hasSVE() || !VT.is128BitVector() ||
15366 DefBits.getHiBits(64) != DefBits.getLoBits(64))
15367 return SDValue();
15368
15369 // See if we can make use of the SVE dup instruction.
15370 APInt Val64 = DefBits.trunc(64);
15371 int32_t ImmVal, ShiftVal;
15372 if (!AArch64_AM::isSVECpyDupImm(64, Val64.getSExtValue(), ImmVal, ShiftVal))
15373 return SDValue();
15374
15375 SDLoc DL(Op);
15376 SDValue SplatVal = DAG.getSplatVector(MVT::nxv2i64, DL,
15377 DAG.getConstant(Val64, DL, MVT::i64));
15378 SDValue Res = convertFromScalableVector(DAG, MVT::v2i64, SplatVal);
15379 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, Res);
15380}
15381
15382static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
15383 const AArch64Subtarget *ST) {
15384 EVT VT = Op.getValueType();
15385 assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
15386 "Expected a legal NEON vector");
15387
15388 APInt DefBits(VT.getSizeInBits(), 0);
15389 APInt UndefBits(VT.getSizeInBits(), 0);
15390 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15391 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15392 auto TryMOVIWithBits = [&](APInt DefBits) {
15393 SDValue NewOp;
15394 if ((NewOp =
15395 tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
15396 (NewOp =
15397 tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15398 (NewOp =
15399 tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
15400 (NewOp =
15401 tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
15402 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
15403 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
15404 return NewOp;
15405
15406 APInt NotDefBits = ~DefBits;
15407 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
15408 NotDefBits)) ||
15409 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
15410 NotDefBits)) ||
15411 (NewOp =
15412 tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
15413 return NewOp;
15414 return SDValue();
15415 };
15416 if (SDValue R = TryMOVIWithBits(DefBits))
15417 return R;
15418 if (SDValue R = TryMOVIWithBits(UndefBits))
15419 return R;
15420
15421 // Try to materialise the constant using SVE when available.
15422 if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15423 return R;
15424
15425 // See if a fneg of the constant can be materialized with a MOVI, etc
15426 auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
15427 // FNegate each sub-element of the constant
15428 assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
15429 APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
15430 .zext(VT.getSizeInBits());
15431 APInt NegBits(VT.getSizeInBits(), 0);
15432 unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
15433 for (unsigned i = 0; i < NumElts; i++)
15434 NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
15435 NegBits = DefBits ^ NegBits;
15436
15437 // Try to create the new constants with MOVI, and if so generate a fneg
15438 // for it.
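 // For example, a splat of -0.0f (0x80000000 in every lane) has its sign
 // bits cleared to an all-zero pattern, which a MOVI of #0 can materialize,
 // so the build_vector becomes an FNEG of that MOVI.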
15439 if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
15440 SDLoc DL(Op);
15441 MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
15442 return DAG.getNode(
15443 AArch64ISD::NVCAST, DL, VT,
15444 DAG.getNode(ISD::FNEG, DL, VFVT,
15445 DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
15446 }
15447 return SDValue();
15448 };
15449 SDValue R;
15450 if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
15451 (R = TryWithFNeg(DefBits, MVT::f64)) ||
15452 (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
15453 return R;
15454 }
15455
15456 return SDValue();
15457}
15458
15459SDValue AArch64TargetLowering::LowerFixedLengthBuildVectorToSVE(
15460 SDValue Op, SelectionDAG &DAG) const {
15461 EVT VT = Op.getValueType();
15462 SDLoc DL(Op);
15463 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
15464 auto *BVN = cast<BuildVectorSDNode>(Op);
15465
15466 if (auto SeqInfo = BVN->isConstantSequence()) {
15467 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
15468 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
15469 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
15470 return convertFromScalableVector(DAG, VT, Seq);
15471 }
15472
15473 unsigned NumElems = VT.getVectorNumElements();
15474 if (!VT.isPow2VectorType() || VT.getFixedSizeInBits() > 128 ||
15475 NumElems <= 1 || BVN->isConstant())
15476 return SDValue();
15477
15478 auto IsExtractElt = [](SDValue Op) {
15479 return Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
15480 };
15481
15482 // For integer types that are not already in vectors, limit to at most four
15483 // elements. This is an arbitrary restriction to avoid many fmovs from GPRs.
15484 if (VT.getScalarType().isInteger() &&
15485 NumElems - count_if(Op->op_values(), IsExtractElt) > 4)
15486 return SDValue();
15487
15488 // Lower (pow2) BUILD_VECTORS that are <= 128-bit to a sequence of ZIP1s.
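 // For example, for v4i32 {a,b,c,d}: each element is first placed in lane 0 of
 // its own vector, the first round zips (a,b) and (c,d) at 32-bit granularity,
 // and the second round zips those two results at 64-bit granularity, leaving
 // {a,b,c,d} in the low 128 bits.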
15489 SDValue ZeroI64 = DAG.getConstant(0, DL, MVT::i64);
15490 SmallVector<SDValue, 16> Intermediates = map_to_vector<16>(
15491 Op->op_values(), [&, Undef = DAG.getUNDEF(ContainerVT)](SDValue Op) {
15492 return Op.isUndef() ? Undef
15493 : DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
15494 ContainerVT, Undef, Op, ZeroI64);
15495 });
15496
15497 ElementCount ZipEC = ContainerVT.getVectorElementCount();
15498 while (Intermediates.size() > 1) {
15499 EVT ZipVT = getPackedSVEVectorVT(ZipEC);
15500
15501 for (unsigned I = 0; I < Intermediates.size(); I += 2) {
15502 SDValue Op0 = DAG.getBitcast(ZipVT, Intermediates[I + 0]);
15503 SDValue Op1 = DAG.getBitcast(ZipVT, Intermediates[I + 1]);
15504 Intermediates[I / 2] =
15505 Op1.isUndef() ? Op0
15506 : DAG.getNode(AArch64ISD::ZIP1, DL, ZipVT, Op0, Op1);
15507 }
15508
15509 Intermediates.resize(Intermediates.size() / 2);
15510 ZipEC = ZipEC.divideCoefficientBy(2);
15511 }
15512
15513 assert(Intermediates.size() == 1);
15514 SDValue Vec = DAG.getBitcast(ContainerVT, Intermediates[0]);
15515 return convertFromScalableVector(DAG, VT, Vec);
15516}
15517
15518SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
15519 SelectionDAG &DAG) const {
15520 EVT VT = Op.getValueType();
15521
15522 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
15523 cast<BuildVectorSDNode>(Op)->isConstantSequence();
15524 if (useSVEForFixedLengthVectorVT(VT, OverrideNEON))
15525 return LowerFixedLengthBuildVectorToSVE(Op, DAG);
15526
15527 // Try to build a simple constant vector.
15528 Op = NormalizeBuildVector(Op, DAG);
15529 // Though this might return a non-BUILD_VECTOR (e.g. CONCAT_VECTORS); if so,
15530 // abort.
15531 if (Op.getOpcode() != ISD::BUILD_VECTOR)
15532 return SDValue();
15533
15534 // Certain vector constants, used to express things like logical NOT and
15535 // arithmetic NEG, are passed through unmodified. This allows special
15536 // patterns for these operations to match, which will lower these constants
15537 // to whatever is proven necessary.
15538 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
15539 if (BVN->isConstant()) {
15540 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
15541 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
15542 APInt Val(BitSize,
15543 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
15544 if (Val.isZero() || (VT.isInteger() && Val.isAllOnes()))
15545 return Op;
15546 }
15547 if (ConstantFPSDNode *Const = BVN->getConstantFPSplatNode())
15548 if (Const->isZero() && !Const->isNegative())
15549 return Op;
15550 }
15551
15552 if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
15553 return V;
15554
15555 // Scan through the operands to find some interesting properties we can
15556 // exploit:
15557 // 1) If only one value is used, we can use a DUP, or
15558 // 2) if only the low element is not undef, we can just insert that, or
15559 // 3) if only one constant value is used (w/ some non-constant lanes),
15560 // we can splat the constant value into the whole vector then fill
15561 // in the non-constant lanes.
15562 // 4) FIXME: If different constant values are used, but we can intelligently
15563 // select the values we'll be overwriting for the non-constant
15564 // lanes such that we can directly materialize the vector
15565 // some other way (MOVI, e.g.), we can be sneaky.
15566 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
15567 SDLoc DL(Op);
15568 unsigned NumElts = VT.getVectorNumElements();
15569 bool isOnlyLowElement = true;
15570 bool usesOnlyOneValue = true;
15571 bool usesOnlyOneConstantValue = true;
15572 bool isConstant = true;
15573 bool AllLanesExtractElt = true;
15574 unsigned NumConstantLanes = 0;
15575 unsigned NumDifferentLanes = 0;
15576 unsigned NumUndefLanes = 0;
15577 SDValue Value;
15578 SDValue ConstantValue;
15579 SmallMapVector<SDValue, unsigned, 16> DifferentValueMap;
15580 unsigned ConsecutiveValCount = 0;
15581 SDValue PrevVal;
15582 for (unsigned i = 0; i < NumElts; ++i) {
15583 SDValue V = Op.getOperand(i);
15584 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15585 AllLanesExtractElt = false;
15586 if (V.isUndef()) {
15587 ++NumUndefLanes;
15588 continue;
15589 }
15590 if (i > 0)
15591 isOnlyLowElement = false;
15592 if (!isIntOrFPConstant(V))
15593 isConstant = false;
15594
15595 if (isIntOrFPConstant(V)) {
15596 ++NumConstantLanes;
15597 if (!ConstantValue.getNode())
15598 ConstantValue = V;
15599 else if (ConstantValue != V)
15600 usesOnlyOneConstantValue = false;
15601 }
15602
15603 if (!Value.getNode())
15604 Value = V;
15605 else if (V != Value) {
15606 usesOnlyOneValue = false;
15607 ++NumDifferentLanes;
15608 }
15609
15610 if (PrevVal != V) {
15611 ConsecutiveValCount = 0;
15612 PrevVal = V;
15613 }
15614
15615 // Keep the different values and their last consecutive counts. For example,
15616 //
15617 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15618 // t24, t24, t24, t24, t24, t24, t24, t24
15619 // t23 = consecutive count 8
15620 // t24 = consecutive count 8
15621 // ------------------------------------------------------------------
15622 // t22: v16i8 = build_vector t24, t24, t23, t23, t23, t23, t23, t24,
15623 // t24, t24, t24, t24, t24, t24, t24, t24
15624 // t23 = consecutive count 5
15625 // t24 = consecutive count 9
15626 DifferentValueMap[V] = ++ConsecutiveValCount;
15627 }
15628
15629 if (!Value.getNode()) {
15630 LLVM_DEBUG(
15631 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
15632 return DAG.getUNDEF(VT);
15633 }
15634
15635 // Convert BUILD_VECTOR where all elements but the lowest are undef into
15636 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
15637 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
15638 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
15639 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
15640 "SCALAR_TO_VECTOR node\n");
15641 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
15642 }
15643
15644 if (AllLanesExtractElt) {
15645 SDNode *Vector = nullptr;
15646 bool Even = false;
15647 bool Odd = false;
15648 // Check whether the extract elements match the Even pattern <0,2,4,...> or
15649 // the Odd pattern <1,3,5,...>.
15650 for (unsigned i = 0; i < NumElts; ++i) {
15651 SDValue V = Op.getOperand(i);
15652 const SDNode *N = V.getNode();
15653 if (!isa<ConstantSDNode>(N->getOperand(1))) {
15654 Even = false;
15655 Odd = false;
15656 break;
15657 }
15658 SDValue N0 = N->getOperand(0);
15659
15660 // All elements are extracted from the same vector.
15661 if (!Vector) {
15662 Vector = N0.getNode();
15663 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
15664 // BUILD_VECTOR.
15665 if (VT.getVectorElementType() !=
15666 N0.getValueType().getVectorElementType())
15667 break;
15668 } else if (Vector != N0.getNode()) {
15669 Odd = false;
15670 Even = false;
15671 break;
15672 }
15673
15674 // Extracted values are either at Even indices <0,2,4,...> or at Odd
15675 // indices <1,3,5,...>.
15676 uint64_t Val = N->getConstantOperandVal(1);
15677 if (Val == 2 * i) {
15678 Even = true;
15679 continue;
15680 }
15681 if (Val - 1 == 2 * i) {
15682 Odd = true;
15683 continue;
15684 }
15685
15686 // Something does not match: abort.
15687 Odd = false;
15688 Even = false;
15689 break;
15690 }
15691 if (Even || Odd) {
15692      SDValue LHS =
15693          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15694                      DAG.getConstant(0, DL, MVT::i64));
15695      SDValue RHS =
15696          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(Vector, 0),
15697                      DAG.getConstant(NumElts, DL, MVT::i64));
15698
15699 if (Even && !Odd)
15700 return DAG.getNode(AArch64ISD::UZP1, DL, VT, LHS, RHS);
15701 if (Odd && !Even)
15702 return DAG.getNode(AArch64ISD::UZP2, DL, VT, LHS, RHS);
15703 }
15704 }
15705
15706  // Use DUP for non-constant splats. For floating-point constant splats,
15707  // reduce to the equivalent integer type and try again.
15708 if (usesOnlyOneValue) {
15709 if (!isConstant) {
15710 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15711 Value.getValueType() != VT) {
15712 LLVM_DEBUG(
15713 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
15714 return DAG.getNode(AArch64ISD::DUP, DL, VT, Value);
15715 }
15716
15717      // This is actually a DUPLANExx operation, which keeps everything in vector registers.
15718
15719 SDValue Lane = Value.getOperand(1);
15720 Value = Value.getOperand(0);
15721 if (Value.getValueSizeInBits() == 64) {
15722 LLVM_DEBUG(
15723 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
15724 "widening it\n");
15725 Value = WidenVector(Value, DAG);
15726 }
15727
15728 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
15729 return DAG.getNode(Opcode, DL, VT, Value, Lane);
15730 }
15731
15732    if (VT.getVectorElementType().isFloatingPoint()) {
15733      SmallVector<SDValue, 8> Ops;
15734      EVT EltTy = VT.getVectorElementType();
15735 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
15736 EltTy == MVT::f64) && "Unsupported floating-point vector type");
15737 LLVM_DEBUG(
15738 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
15739 "BITCASTS, and try again\n");
15740 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
15741 for (unsigned i = 0; i < NumElts; ++i)
15742 Ops.push_back(DAG.getNode(ISD::BITCAST, DL, NewType, Op.getOperand(i)));
15743 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
15744 SDValue Val = DAG.getBuildVector(VecVT, DL, Ops);
15745 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
15746 Val.dump(););
15747 Val = LowerBUILD_VECTOR(Val, DAG);
15748 if (Val.getNode())
15749 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
15750 }
15751 }
15752
15753 // If we need to insert a small number of different non-constant elements and
15754 // the vector width is sufficiently large, prefer using DUP with the common
15755 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
15756 // skip the constant lane handling below.
15757 bool PreferDUPAndInsert =
15758 !isConstant && NumDifferentLanes >= 1 &&
15759 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
15760 NumDifferentLanes >= NumConstantLanes;
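  // For example (illustrative): a v8i8 BUILD_VECTOR with seven lanes equal to
  // a non-constant value X and one differing lane Y has NumDifferentLanes == 1,
  // so the heuristic above prefers a DUP of X followed by a single
  // INSERT_VECTOR_ELT of Y over eight separate lane initializations.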
15761
15762  // If only one constant value was used, and it was used for more than one
15763  // lane, start by splatting that value, then replace the non-constant lanes. This
15764 // is better than the default, which will perform a separate initialization
15765 // for each lane.
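  // For example (illustrative): <4 x i32> <7, 7, X, 7> with a non-constant X
  // is first materialized as a splat of 7 (e.g. via a MOVI/DUP) and then
  // patched with a single INSERT_VECTOR_ELT for the X lane.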
15766 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
15767 // Firstly, try to materialize the splat constant.
15768 SDValue Val = DAG.getSplatBuildVector(VT, DL, ConstantValue);
15769 unsigned BitSize = VT.getScalarSizeInBits();
15770 APInt ConstantValueAPInt(1, 0);
15771 if (auto *C = dyn_cast<ConstantSDNode>(ConstantValue))
15772 ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
15773 if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
15774 !ConstantValueAPInt.isAllOnes()) {
15775 Val = ConstantBuildVector(Val, DAG, Subtarget);
15776 if (!Val)
15777 // Otherwise, materialize the constant and splat it.
15778 Val = DAG.getNode(AArch64ISD::DUP, DL, VT, ConstantValue);
15779 }
15780
15781 // Now insert the non-constant lanes.
15782 for (unsigned i = 0; i < NumElts; ++i) {
15783 SDValue V = Op.getOperand(i);
15784 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15785 if (!isIntOrFPConstant(V) && !V.isUndef())
15786 // Note that type legalization likely mucked about with the VT of the
15787 // source operand, so we may have to convert it here before inserting.
15788 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Val, V, LaneIdx);
15789 }
15790 return Val;
15791 }
15792
15793 // This will generate a load from the constant pool.
15794 if (isConstant) {
15795 LLVM_DEBUG(
15796 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
15797 "expansion\n");
15798 return SDValue();
15799 }
15800
15801 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
15802 // v4i32s. This is really a truncate, which we can construct out of (legal)
15803 // concats and truncate nodes.
15804  if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
15805    return M;
15806
15807 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
15808 if (NumElts >= 4) {
15809 if (SDValue Shuffle = ReconstructShuffle(Op, DAG))
15810 return Shuffle;
15811
15812 if (SDValue Shuffle = ReconstructShuffleWithRuntimeMask(Op, DAG))
15813 return Shuffle;
15814 }
15815
15816 if (PreferDUPAndInsert) {
15817 // First, build a constant vector with the common element.
15818    SmallVector<SDValue, 8> Ops(NumElts, Value);
15819    SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, DL, Ops), DAG);
15820 // Next, insert the elements that do not match the common value.
15821 for (unsigned I = 0; I < NumElts; ++I)
15822 if (Op.getOperand(I) != Value)
15823 NewVector =
15824 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NewVector,
15825 Op.getOperand(I), DAG.getConstant(I, DL, MVT::i64));
15826
15827 return NewVector;
15828 }
15829
15830 // If vector consists of two different values, try to generate two DUPs and
15831 // (CONCAT_VECTORS or VECTOR_SHUFFLE).
15832  if (DifferentValueMap.size() == 2 && NumUndefLanes == 0) {
15833    SmallVector<SDValue, 2> Vals;
15834 // Check the consecutive count of the value is the half number of vector
15835 // elements. In this case, we can use CONCAT_VECTORS. For example,
15836 //
15837 // canUseVECTOR_CONCAT = true;
15838 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t23, t23, t23,
15839 // t24, t24, t24, t24, t24, t24, t24, t24
15840 //
15841 // canUseVECTOR_CONCAT = false;
15842 // t22: v16i8 = build_vector t23, t23, t23, t23, t23, t24, t24, t24,
15843 // t24, t24, t24, t24, t24, t24, t24, t24
15844 bool canUseVECTOR_CONCAT = true;
15845 for (auto Pair : DifferentValueMap) {
15846 // Check different values have same length which is NumElts / 2.
15847 if (Pair.second != NumElts / 2)
15848 canUseVECTOR_CONCAT = false;
15849 Vals.push_back(Pair.first);
15850 }
15851
15852 // If canUseVECTOR_CONCAT is true, we can generate two DUPs and
15853 // CONCAT_VECTORs. For example,
15854 //
15855 // t22: v16i8 = BUILD_VECTOR t23, t23, t23, t23, t23, t23, t23, t23,
15856 // t24, t24, t24, t24, t24, t24, t24, t24
15857 // ==>
15858 // t26: v8i8 = AArch64ISD::DUP t23
15859 // t28: v8i8 = AArch64ISD::DUP t24
15860 // t29: v16i8 = concat_vectors t26, t28
15861 if (canUseVECTOR_CONCAT) {
15862 EVT SubVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
15863 if (isTypeLegal(SubVT) && SubVT.isVector() &&
15864 SubVT.getVectorNumElements() >= 2) {
15865 SmallVector<SDValue, 8> Ops1(NumElts / 2, Vals[0]);
15866 SmallVector<SDValue, 8> Ops2(NumElts / 2, Vals[1]);
15867 SDValue DUP1 =
15868 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops1), DAG);
15869 SDValue DUP2 =
15870 LowerBUILD_VECTOR(DAG.getBuildVector(SubVT, DL, Ops2), DAG);
15871        SDValue CONCAT_VECTORS =
15872            DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, DUP1, DUP2);
15873 return CONCAT_VECTORS;
15874 }
15875 }
15876
15877 // Let's try to generate VECTOR_SHUFFLE. For example,
15878 //
15879 // t24: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t26, t26, t26, t26
15880 // ==>
15881 // t27: v8i8 = BUILD_VECTOR t26, t26, t26, t26, t26, t26, t26, t26
15882 // t28: v8i8 = BUILD_VECTOR t25, t25, t25, t25, t25, t25, t25, t25
15883 // t29: v8i8 = vector_shuffle<0,1,2,3,12,13,14,15> t27, t28
15884 if (NumElts >= 8) {
15885 SmallVector<int, 16> MaskVec;
15886      // Build mask for VECTOR_SHUFFLE.
15887 SDValue FirstLaneVal = Op.getOperand(0);
15888 for (unsigned i = 0; i < NumElts; ++i) {
15889 SDValue Val = Op.getOperand(i);
15890 if (FirstLaneVal == Val)
15891 MaskVec.push_back(i);
15892 else
15893 MaskVec.push_back(i + NumElts);
15894 }
15895
15896 SmallVector<SDValue, 8> Ops1(NumElts, Vals[0]);
15897 SmallVector<SDValue, 8> Ops2(NumElts, Vals[1]);
15898 SDValue VEC1 = DAG.getBuildVector(VT, DL, Ops1);
15899 SDValue VEC2 = DAG.getBuildVector(VT, DL, Ops2);
15900      SDValue VECTOR_SHUFFLE =
15901          DAG.getVectorShuffle(VT, DL, VEC1, VEC2, MaskVec);
15902 return VECTOR_SHUFFLE;
15903 }
15904 }
15905
15906 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
15907 // know the default expansion would otherwise fall back on something even
15908 // worse. For a vector with one or two non-undef values, that's
15909 // scalar_to_vector for the elements followed by a shuffle (provided the
15910 // shuffle is valid for the target) and materialization element by element
15911 // on the stack followed by a load for everything else.
15912 if (!isConstant && !usesOnlyOneValue) {
15913 LLVM_DEBUG(
15914 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
15915 "of INSERT_VECTOR_ELT\n");
15916
15917 SDValue Vec = DAG.getUNDEF(VT);
15918 SDValue Op0 = Op.getOperand(0);
15919 unsigned i = 0;
15920
15921 // Use SCALAR_TO_VECTOR for lane zero to
15922 // a) Avoid a RMW dependency on the full vector register, and
15923 // b) Allow the register coalescer to fold away the copy if the
15924 // value is already in an S or D register, and we're forced to emit an
15925 // INSERT_SUBREG that we can't fold anywhere.
15926 //
15927 // We also allow types like i8 and i16 which are illegal scalar but legal
15928 // vector element types. After type-legalization the inserted value is
15929 // extended (i32) and it is safe to cast them to the vector type by ignoring
15930 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
15931 if (!Op0.isUndef()) {
15932 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
15933 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Op0);
15934 ++i;
15935 }
15936 LLVM_DEBUG({
15937 if (i < NumElts)
15938 dbgs() << "Creating nodes for the other vector elements:\n";
15939 });
15940 for (; i < NumElts; ++i) {
15941 SDValue V = Op.getOperand(i);
15942 if (V.isUndef())
15943 continue;
15944 SDValue LaneIdx = DAG.getConstant(i, DL, MVT::i64);
15945 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
15946 }
15947 return Vec;
15948 }
15949
15950 LLVM_DEBUG(
15951 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
15952 "better alternative\n");
15953 return SDValue();
15954}
15955
15956SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
15957 SelectionDAG &DAG) const {
15958 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15959 !Subtarget->isNeonAvailable()))
15960 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
15961
15962 assert(Op.getValueType().isScalableVector() &&
15963 isTypeLegal(Op.getValueType()) &&
15964 "Expected legal scalable vector type!");
15965
15966 if (isTypeLegal(Op.getOperand(0).getValueType())) {
15967 unsigned NumOperands = Op->getNumOperands();
15968 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
15969 "Unexpected number of operands in CONCAT_VECTORS");
15970
15971 if (NumOperands == 2)
15972 return Op;
15973
15974 // Concat each pair of subvectors and pack into the lower half of the array.
15975 SmallVector<SDValue> ConcatOps(Op->ops());
15976 while (ConcatOps.size() > 1) {
15977 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
15978 SDValue V1 = ConcatOps[I];
15979 SDValue V2 = ConcatOps[I + 1];
15980 EVT SubVT = V1.getValueType();
15981 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
15982 ConcatOps[I / 2] =
15983 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
15984 }
15985 ConcatOps.resize(ConcatOps.size() / 2);
15986 }
15987 return ConcatOps[0];
15988 }
15989
15990 return SDValue();
15991}
15992
15993SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15994 SelectionDAG &DAG) const {
15995 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
15996
15997 if (useSVEForFixedLengthVectorVT(Op.getValueType(),
15998 !Subtarget->isNeonAvailable()))
15999 return LowerFixedLengthInsertVectorElt(Op, DAG);
16000
16001 EVT VT = Op.getOperand(0).getValueType();
16002
16003 if (VT.getScalarType() == MVT::i1) {
16004 EVT VectorVT = getPromotedVTForPredicate(VT);
16005 SDLoc DL(Op);
16006 SDValue ExtendedVector =
16007 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
16008 SDValue ExtendedValue =
16009 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
16010 VectorVT.getScalarType().getSizeInBits() < 32
16011 ? MVT::i32
16012 : VectorVT.getScalarType());
16013 ExtendedVector =
16014 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
16015 ExtendedValue, Op.getOperand(2));
16016 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
16017 }
16018
16019 // Check for non-constant or out of range lane.
16020 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
16021 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16022 return SDValue();
16023
16024 return Op;
16025}
16026
16027SDValue
16028AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
16029 SelectionDAG &DAG) const {
16030 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
16031 EVT VT = Op.getOperand(0).getValueType();
16032
16033 if (VT.getScalarType() == MVT::i1) {
16034 // We can't directly extract from an SVE predicate; extend it first.
16035 // (This isn't the only possible lowering, but it's straightforward.)
16036 EVT VectorVT = getPromotedVTForPredicate(VT);
16037 SDLoc DL(Op);
16038 SDValue Extend =
16039 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
16040 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
16041 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
16042 Extend, Op.getOperand(1));
16043 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
16044 }
16045
16046 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16047 return LowerFixedLengthExtractVectorElt(Op, DAG);
16048
16049 // Check for non-constant or out of range lane.
16050 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
16051 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
16052 return SDValue();
16053
16054 // Insertion/extraction are legal for V128 types.
16055 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
16056 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
16057 VT == MVT::v8f16 || VT == MVT::v8bf16)
16058 return Op;
16059
16060 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
16061 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
16062 VT != MVT::v4bf16)
16063 return SDValue();
16064
16065 // For V64 types, we perform extraction by expanding the value
16066 // to a V128 type and perform the extraction on that.
16067 SDLoc DL(Op);
16068 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
16069 EVT WideTy = WideVec.getValueType();
16070
16071 EVT ExtrTy = WideTy.getVectorElementType();
16072 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
16073 ExtrTy = MVT::i32;
16074
16075 // For extractions, we just return the result directly.
16076 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
16077 Op.getOperand(1));
16078}
16079
16080SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
16081 SelectionDAG &DAG) const {
16082 EVT VT = Op.getValueType();
16083  assert(VT.isFixedLengthVector() &&
16084         "Only cases that extract a fixed length vector are supported!");
16085 EVT InVT = Op.getOperand(0).getValueType();
16086
16087 // If we don't have legal types yet, do nothing
16088 if (!isTypeLegal(InVT))
16089 return SDValue();
16090
16091 if (InVT.is128BitVector()) {
16092 assert(VT.is64BitVector() && "Extracting unexpected vector type!");
16093 unsigned Idx = Op.getConstantOperandVal(1);
16094
16095 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
16096 if (Idx == 0)
16097 return Op;
16098
16099 // If this is extracting the upper 64-bits of a 128-bit vector, we match
16100 // that directly.
16101 if (Idx * InVT.getScalarSizeInBits() == 64 && Subtarget->isNeonAvailable())
16102 return Op;
16103 }
16104
16105 if (InVT.isScalableVector() ||
16106 useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable())) {
16107 SDLoc DL(Op);
16108 SDValue Vec = Op.getOperand(0);
16109 SDValue Idx = Op.getOperand(1);
16110
16111 EVT PackedVT = getPackedSVEVectorVT(InVT.getVectorElementType());
16112 if (PackedVT != InVT) {
16113 // Pack input into the bottom part of an SVE register and try again.
16114 SDValue Container = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PackedVT,
16115 DAG.getUNDEF(PackedVT), Vec,
16116 DAG.getVectorIdxConstant(0, DL));
16117 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Container, Idx);
16118 }
16119
16120 // This will get matched by custom code during ISelDAGToDAG.
16121 if (isNullConstant(Idx))
16122 return Op;
16123
16124 assert(InVT.isScalableVector() && "Unexpected vector type!");
16125 // Move requested subvector to the start of the vector and try again.
16126 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, InVT, Vec, Vec, Idx);
16127 return convertFromScalableVector(DAG, VT, Splice);
16128 }
16129
16130 return SDValue();
16131}
16132
16133SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
16134 SelectionDAG &DAG) const {
16135 assert(Op.getValueType().isScalableVector() &&
16136 "Only expect to lower inserts into scalable vectors!");
16137
16138 EVT InVT = Op.getOperand(1).getValueType();
16139 unsigned Idx = Op.getConstantOperandVal(2);
16140
16141 SDValue Vec0 = Op.getOperand(0);
16142 SDValue Vec1 = Op.getOperand(1);
16143 SDLoc DL(Op);
16144 EVT VT = Op.getValueType();
16145
16146 if (InVT.isScalableVector()) {
16147 if (!isTypeLegal(VT))
16148 return SDValue();
16149
16150 // Break down insert_subvector into simpler parts.
16151 if (VT.getVectorElementType() == MVT::i1) {
16152 unsigned NumElts = VT.getVectorMinNumElements();
16153 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
16154
16155 SDValue Lo, Hi;
16156 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16157 DAG.getVectorIdxConstant(0, DL));
16158 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
16159 DAG.getVectorIdxConstant(NumElts / 2, DL));
16160 if (Idx < (NumElts / 2))
16161 Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
16162 DAG.getVectorIdxConstant(Idx, DL));
16163 else
16164 Hi = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
16165 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
16166
16167 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16168 }
16169
16170 // We can select these directly.
16171 if (isTypeLegal(InVT) && Vec0.isUndef())
16172 return Op;
16173
16174 // Ensure the subvector is half the size of the main vector.
16175 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
16176 return SDValue();
16177
16178 // Here narrow and wide refers to the vector element types. After "casting"
16179 // both vectors must have the same bit length and so because the subvector
16180 // has fewer elements, those elements need to be bigger.
16181 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
16182 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
16183
16184 // NOP cast operands to the largest legal vector of the same element count.
16185 if (VT.isFloatingPoint()) {
16186 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
16187 Vec1 = getSVESafeBitCast(NarrowVT, Vec1, DAG);
16188 } else {
16189 // Legal integer vectors are already their largest so Vec0 is fine as is.
16190 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
16191 Vec1 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, Vec1);
16192 }
16193
16194 // To replace the top/bottom half of vector V with vector SubV we widen the
16195 // preserved half of V, concatenate this to SubV (the order depending on the
16196 // half being replaced) and then narrow the result.
16197 SDValue Narrow;
16198 if (Idx == 0) {
16199 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
16200 HiVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, HiVec0);
16201 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
16202 } else {
16203 assert(Idx == InVT.getVectorMinNumElements() &&
16204 "Invalid subvector index!");
16205 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
16206 LoVec0 = DAG.getNode(AArch64ISD::NVCAST, DL, NarrowVT, LoVec0);
16207 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
16208 }
16209
16210 return getSVESafeBitCast(VT, Narrow, DAG);
16211 }
16212
16213 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
16214 // This will be matched by custom code during ISelDAGToDAG.
16215 if (Vec0.isUndef())
16216 return Op;
16217
16218 std::optional<unsigned> PredPattern =
16219        getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
16220    auto PredTy = VT.changeVectorElementType(MVT::i1);
16221 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
16222 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
16223 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
16224 }
16225
16226 return SDValue();
16227}
16228
16229static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
16230 if (Op.getOpcode() != AArch64ISD::DUP &&
16231 Op.getOpcode() != ISD::SPLAT_VECTOR &&
16232 Op.getOpcode() != ISD::BUILD_VECTOR)
16233 return false;
16234
16235 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
16236 !isAllConstantBuildVector(Op, SplatVal))
16237 return false;
16238
16239 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
16240 !isa<ConstantSDNode>(Op->getOperand(0)))
16241 return false;
16242
16243 SplatVal = Op->getConstantOperandVal(0);
16244 if (Op.getValueType().getVectorElementType() != MVT::i64)
16245 SplatVal = (int32_t)SplatVal;
16246
16247 Negated = false;
16248 if (isPowerOf2_64(SplatVal))
16249 return true;
16250
16251 Negated = true;
16252 if (isPowerOf2_64(-SplatVal)) {
16253 SplatVal = -SplatVal;
16254 return true;
16255 }
16256
16257 return false;
16258}
16259
16260SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
16261 EVT VT = Op.getValueType();
16262 SDLoc DL(Op);
16263
16264 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
16265 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
16266
16267 assert(VT.isScalableVector() && "Expected a scalable vector.");
16268
16269 bool Signed = Op.getOpcode() == ISD::SDIV;
16270 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
16271
16272 bool Negated;
16273 uint64_t SplatVal;
16274 // NOTE: SRAD cannot be used to represent sdiv-by-one.
16275 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated) &&
16276 SplatVal > 1) {
16277    SDValue Pg = getPredicateForScalableVector(DAG, DL, VT);
16278    SDValue Res =
16279 DAG.getNode(AArch64ISD::ASRD_MERGE_OP1, DL, VT, Pg, Op->getOperand(0),
16280 DAG.getTargetConstant(Log2_64(SplatVal), DL, MVT::i32));
16281 if (Negated)
16282 Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
16283
16284 return Res;
16285 }
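  // For example (illustrative): an sdiv of an nxv4i32 value by a splat of 8
  // becomes a single predicated ASRD by 3 (log2(8)); a divisor splat of -8
  // uses the same ASRD followed by a subtraction from zero to negate the
  // result.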
16286
16287 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
16288 return LowerToPredicatedOp(Op, DAG, PredOpcode);
16289
16290 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
16291 // operations, and truncate the result.
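  // For example (illustrative): an nxv16i8 division is unpacked with
  // [S,U]UNPKLO/[S,U]UNPKHI into two nxv8i16 halves (each of which is widened
  // again to nxv4i32 the same way), the halves are divided separately, and the
  // narrowed results are packed back together with UZP1.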
16292 EVT WidenedVT;
16293 if (VT == MVT::nxv16i8)
16294 WidenedVT = MVT::nxv8i16;
16295 else if (VT == MVT::nxv8i16)
16296 WidenedVT = MVT::nxv4i32;
16297 else
16298 llvm_unreachable("Unexpected Custom DIV operation");
16299
16300 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
16301 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
16302 SDValue Op0Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(0));
16303 SDValue Op1Lo = DAG.getNode(UnpkLo, DL, WidenedVT, Op.getOperand(1));
16304 SDValue Op0Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(0));
16305 SDValue Op1Hi = DAG.getNode(UnpkHi, DL, WidenedVT, Op.getOperand(1));
16306 SDValue ResultLo = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Lo, Op1Lo);
16307 SDValue ResultHi = DAG.getNode(Op.getOpcode(), DL, WidenedVT, Op0Hi, Op1Hi);
16308 SDValue ResultLoCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultLo);
16309 SDValue ResultHiCast = DAG.getNode(AArch64ISD::NVCAST, DL, VT, ResultHi);
16310 return DAG.getNode(AArch64ISD::UZP1, DL, VT, ResultLoCast, ResultHiCast);
16311}
16312
16313bool AArch64TargetLowering::shouldExpandBuildVectorWithShuffles(
16314 EVT VT, unsigned DefinedValues) const {
16315 if (!Subtarget->isNeonAvailable())
16316 return false;
16317  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
16318}
16319
16320bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
16321  // Currently no fixed length shuffles that require SVE are legal.
16322 if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16323 return false;
16324
16325 if (VT.getVectorNumElements() == 4 &&
16326 (VT.is128BitVector() || VT.is64BitVector())) {
16327 unsigned Cost = getPerfectShuffleCost(M);
16328 if (Cost <= 1)
16329 return true;
16330 }
16331
16332 bool DummyBool;
16333 int DummyInt;
16334 unsigned DummyUnsigned;
16335
16336 unsigned EltSize = VT.getScalarSizeInBits();
16337 unsigned NumElts = VT.getVectorNumElements();
16338  return (ShuffleVectorSDNode::isSplatMask(M) ||
16339          isREVMask(M, EltSize, NumElts, 64) ||
16340 isREVMask(M, EltSize, NumElts, 32) ||
16341 isREVMask(M, EltSize, NumElts, 16) ||
16342 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
16343 isSingletonEXTMask(M, VT, DummyUnsigned) ||
16344 isTRNMask(M, NumElts, DummyUnsigned) ||
16345 isUZPMask(M, NumElts, DummyUnsigned) ||
16346 isZIPMask(M, NumElts, DummyUnsigned) ||
16347 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
16348 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
16349 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
16350 isINSMask(M, NumElts, DummyBool, DummyInt) ||
16351 isConcatMask(M, VT, VT.getSizeInBits() == 128));
16352}
16353
16354bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
16355                                                  EVT VT) const {
16356 // Just delegate to the generic legality, clear masks aren't special.
16357 return isShuffleMaskLegal(M, VT);
16358}
16359
16360/// getVShiftImm - Check if this is a valid build_vector for the immediate
16361/// operand of a vector shift operation, where all the elements of the
16362/// build_vector must have the same constant integer value.
16363static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
16364 // Ignore bit_converts.
16365 while (Op.getOpcode() == ISD::BITCAST)
16366 Op = Op.getOperand(0);
16367  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
16368  APInt SplatBits, SplatUndef;
16369 unsigned SplatBitSize;
16370 bool HasAnyUndefs;
16371 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
16372 HasAnyUndefs, ElementBits) ||
16373 SplatBitSize > ElementBits)
16374 return false;
16375 Cnt = SplatBits.getSExtValue();
16376 return true;
16377}
16378
16379/// isVShiftLImm - Check if this is a valid build_vector for the immediate
16380/// operand of a vector shift left operation. That value must be in the range:
16381/// 0 <= Value < ElementBits for a left shift; or
16382/// 0 <= Value <= ElementBits for a long left shift.
16383static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
16384 assert(VT.isVector() && "vector shift count is not a vector type");
16385 int64_t ElementBits = VT.getScalarSizeInBits();
16386 if (!getVShiftImm(Op, ElementBits, Cnt))
16387 return false;
16388 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
16389}
16390
16391/// isVShiftRImm - Check if this is a valid build_vector for the immediate
16392/// operand of a vector shift right operation. The value must be in the range:
16393///   1 <= Value <= ElementBits for a right shift; or 1 <= Value <= ElementBits/2 for a narrowing right shift.
16394static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
16395 assert(VT.isVector() && "vector shift count is not a vector type");
16396 int64_t ElementBits = VT.getScalarSizeInBits();
16397 if (!getVShiftImm(Op, ElementBits, Cnt))
16398 return false;
16399 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
16400}
16401
16402SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
16403 SelectionDAG &DAG) const {
16404 EVT VT = Op.getValueType();
16405
16406 if (VT.getScalarType() == MVT::i1) {
16407 // Lower i1 truncate to `(x & 1) != 0`.
16408 SDLoc DL(Op);
16409 EVT OpVT = Op.getOperand(0).getValueType();
16410 SDValue Zero = DAG.getConstant(0, DL, OpVT);
16411 SDValue One = DAG.getConstant(1, DL, OpVT);
16412 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Op.getOperand(0), One);
16413 return DAG.getSetCC(DL, VT, And, Zero, ISD::SETNE);
16414 }
16415
16416 if (!VT.isVector() || VT.isScalableVector())
16417 return SDValue();
16418
16419 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16420 !Subtarget->isNeonAvailable()))
16421 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
16422
16423 return SDValue();
16424}
16425
16426// Check if we can lower this SRL to a rounding shift instruction. ResVT is
16427// possibly a truncated type; it tells how many bits of the value are to be
16428// used.
16429static bool canLowerSRLToRoundingShiftForVT(SDValue Shift, EVT ResVT,
16430 SelectionDAG &DAG,
16431 unsigned &ShiftValue,
16432 SDValue &RShOperand) {
16433 if (Shift->getOpcode() != ISD::SRL)
16434 return false;
16435
16436 EVT VT = Shift.getValueType();
16437 assert(VT.isScalableVT());
16438
16439  auto ShiftOp1 =
16440      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Shift->getOperand(1)));
16441 if (!ShiftOp1)
16442 return false;
16443
16444 ShiftValue = ShiftOp1->getZExtValue();
16445 if (ShiftValue < 1 || ShiftValue > ResVT.getScalarSizeInBits())
16446 return false;
16447
16448 SDValue Add = Shift->getOperand(0);
16449 if (Add->getOpcode() != ISD::ADD || !Add->hasOneUse())
16450 return false;
16451
16452  assert(ResVT.getScalarSizeInBits() <= VT.getScalarSizeInBits() &&
16453         "ResVT must be truncated or same type as the shift.");
16454 // Check if an overflow can lead to incorrect results.
16455 uint64_t ExtraBits = VT.getScalarSizeInBits() - ResVT.getScalarSizeInBits();
16456 if (ShiftValue > ExtraBits && !Add->getFlags().hasNoUnsignedWrap())
16457 return false;
16458
16459  auto AddOp1 =
16460      dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(Add->getOperand(1)));
16461 if (!AddOp1)
16462 return false;
16463 uint64_t AddValue = AddOp1->getZExtValue();
16464 if (AddValue != 1ULL << (ShiftValue - 1))
16465 return false;
16466
16467 RShOperand = Add->getOperand(0);
16468 return true;
16469}
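// For example (illustrative): if VT is nxv8i16 but only the low 8 bits of each
// lane are used (ResVT has i8 elements), then (srl (add X, splat(8)), splat(4))
// passes the checks in canLowerSRLToRoundingShiftForVT, since 8 == 1 << (4 - 1)
// and the shift amount 4 does not exceed the 8 discarded bits, so the caller
// can select a rounding shift right by 4 instead of the add+shift pair.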
16470
16471SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
16472 SelectionDAG &DAG) const {
16473 EVT VT = Op.getValueType();
16474 SDLoc DL(Op);
16475 int64_t Cnt;
16476
16477 if (!Op.getOperand(1).getValueType().isVector())
16478 return Op;
16479 unsigned EltSize = VT.getScalarSizeInBits();
16480
16481 switch (Op.getOpcode()) {
16482 case ISD::SHL:
16483 if (VT.isScalableVector() ||
16484 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()))
16485 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
16486
16487 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
16488 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
16489 DAG.getTargetConstant(Cnt, DL, MVT::i32));
16490    return DAG.getNode(
16491        ISD::INTRINSIC_WO_CHAIN, DL, VT,
16492 DAG.getTargetConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32),
16493 Op.getOperand(0), Op.getOperand(1));
16494 case ISD::SRA:
16495 case ISD::SRL:
16496 if (VT.isScalableVector() &&
16497 (Subtarget->hasSVE2() ||
16498 (Subtarget->hasSME() && Subtarget->isStreaming()))) {
16499 SDValue RShOperand;
16500 unsigned ShiftValue;
16501 if (canLowerSRLToRoundingShiftForVT(Op, VT, DAG, ShiftValue, RShOperand))
16502 return DAG.getNode(AArch64ISD::URSHR_I_PRED, DL, VT,
16503 getPredicateForVector(DAG, DL, VT), RShOperand,
16504 DAG.getTargetConstant(ShiftValue, DL, MVT::i32));
16505 }
16506
16507 if (VT.isScalableVector() ||
16508 useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) {
16509 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
16510 : AArch64ISD::SRL_PRED;
16511 return LowerToPredicatedOp(Op, DAG, Opc);
16512 }
16513
16514 // Right shift immediate
16515 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
16516 unsigned Opc =
16517 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
16518 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
16519 DAG.getTargetConstant(Cnt, DL, MVT::i32),
16520 Op->getFlags());
16521 }
16522
16523    // Right shift register. Note that there is no shift-right-register
16524    // instruction, but the shift-left-register instruction takes a signed
16525    // value, where negative numbers specify a right shift.
16526 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
16527 : Intrinsic::aarch64_neon_ushl;
16528 // negate the shift amount
16529 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
16530 Op.getOperand(1));
16531    SDValue NegShiftLeft =
16532        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
16533 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
16534 NegShift);
16535 return NegShiftLeft;
16536 }
16537
16538 llvm_unreachable("unexpected shift opcode");
16539}
16540
16541SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
16542 SelectionDAG &DAG) const {
16543 if (Op.getValueType().isScalableVector())
16544 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
16545
16546 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
16547 !Subtarget->isNeonAvailable()))
16548 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
16549
16550 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16551 SDValue LHS = Op.getOperand(0);
16552 SDValue RHS = Op.getOperand(1);
16553 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
16554 SDLoc DL(Op);
16555
16556 if (LHS.getValueType().getVectorElementType().isInteger())
16557 return Op;
16558
16559 assert(((!Subtarget->hasFullFP16() &&
16560 LHS.getValueType().getVectorElementType() != MVT::f16) ||
16561 LHS.getValueType().getVectorElementType() != MVT::bf16 ||
16562 LHS.getValueType().getVectorElementType() != MVT::f128) &&
16563 "Unexpected type!");
16564
16565 // Lower isnan(x) | isnan(never-nan) to x != x.
16566 // Lower !isnan(x) & !isnan(never-nan) to x == x.
16567 if (CC == ISD::SETUO || CC == ISD::SETO) {
16568 bool OneNaN = false;
16569 if (LHS == RHS) {
16570 OneNaN = true;
16571 } else if (DAG.isKnownNeverNaN(RHS)) {
16572 OneNaN = true;
16573 RHS = LHS;
16574 } else if (DAG.isKnownNeverNaN(LHS)) {
16575 OneNaN = true;
16576 LHS = RHS;
16577 }
16578 if (OneNaN) {
16579 CC = CC == ISD::SETUO ? ISD::SETUNE : ISD::SETOEQ;
16580 }
16581 }
16582
16583 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
16584 // clean. Some of them require two branches to implement.
16585 AArch64CC::CondCode CC1, CC2;
16586 bool ShouldInvert;
16587 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
16588
16589 bool NoNaNs =
16590 getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
16591 SDValue Cmp = emitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, DL, DAG);
16592 if (!Cmp.getNode())
16593 return SDValue();
16594
16595 if (CC2 != AArch64CC::AL) {
16596 SDValue Cmp2 = emitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, DL, DAG);
16597 if (!Cmp2.getNode())
16598 return SDValue();
16599
16600 Cmp = DAG.getNode(ISD::OR, DL, CmpVT, Cmp, Cmp2);
16601 }
16602
16603 Cmp = DAG.getSExtOrTrunc(Cmp, DL, Op.getValueType());
16604
16605 if (ShouldInvert)
16606 Cmp = DAG.getNOT(DL, Cmp, Cmp.getValueType());
16607
16608 return Cmp;
16609}
16610
16611static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
16612 SelectionDAG &DAG) {
16613 SDValue VecOp = ScalarOp.getOperand(0);
16614 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
16615 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
16616 DAG.getConstant(0, DL, MVT::i64));
16617}
16618
16619static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
16620 SDLoc DL, SelectionDAG &DAG) {
16621 unsigned ScalarOpcode;
16622 switch (Opcode) {
16623 case ISD::VECREDUCE_AND:
16624 ScalarOpcode = ISD::AND;
16625 break;
16626 case ISD::VECREDUCE_OR:
16627 ScalarOpcode = ISD::OR;
16628 break;
16629 case ISD::VECREDUCE_XOR:
16630 ScalarOpcode = ISD::XOR;
16631 break;
16632 default:
16633 llvm_unreachable("Expected bitwise vector reduction");
16634 return SDValue();
16635 }
16636
16637 EVT VecVT = Vec.getValueType();
16638 assert(VecVT.isFixedLengthVector() && VecVT.isPow2VectorType() &&
16639 "Expected power-of-2 length vector");
16640
16641 EVT ElemVT = VecVT.getVectorElementType();
16642
16643 SDValue Result;
16644 unsigned NumElems = VecVT.getVectorNumElements();
16645
16646 // Special case for boolean reductions
16647 if (ElemVT == MVT::i1) {
16648 // Split large vectors into smaller ones
16649 if (NumElems > 16) {
16650 SDValue Lo, Hi;
16651 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16652 EVT HalfVT = Lo.getValueType();
16653 SDValue HalfVec = DAG.getNode(ScalarOpcode, DL, HalfVT, Lo, Hi);
16654 return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
16655 }
16656
16657 // Results of setcc operations get widened to 128 bits if their input
16658 // operands are 128 bits wide, otherwise vectors that are less than 64 bits
16659 // get widened to neatly fit a 64 bit register, so e.g. <4 x i1> gets
16660 // lowered to either <4 x i16> or <4 x i32>. Sign extending to this element
16661 // size leads to the best codegen, since e.g. setcc results might need to be
16662 // truncated otherwise.
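    // For example (illustrative): a v4i1 AND reduction of a setcc on v4i32
    // inputs is sign extended to v4i32 here and reduced with an unsigned-min
    // reduction (UMINV); the result is all-ones only if every lane was true,
    // and is then truncated back to i1.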
16663 unsigned ExtendedWidth = 64;
16664 if (Vec.getOpcode() == ISD::SETCC &&
16665 Vec.getOperand(0).getValueSizeInBits() >= 128) {
16666 ExtendedWidth = 128;
16667 }
16668 EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
16669
16670 // any_ext doesn't work with umin/umax, so only use it for uadd.
16671 unsigned ExtendOp =
16672 ScalarOpcode == ISD::XOR ? ISD::ANY_EXTEND : ISD::SIGN_EXTEND;
16673 SDValue Extended = DAG.getNode(
16674 ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
16675 // The uminp/uminv and umaxp/umaxv instructions don't have .2d variants, so
16676 // in that case we bitcast the sign extended values from v2i64 to v4i32
16677 // before reduction for optimal code generation.
16678 if ((ScalarOpcode == ISD::AND || ScalarOpcode == ISD::OR) &&
16679 NumElems == 2 && ExtendedWidth == 128) {
16680 Extended = DAG.getBitcast(MVT::v4i32, Extended);
16681 ExtendedVT = MVT::i32;
16682 }
16683 switch (ScalarOpcode) {
16684 case ISD::AND:
16685 Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
16686 break;
16687 case ISD::OR:
16688 Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
16689 break;
16690 case ISD::XOR:
16691 Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);
16692 break;
16693 default:
16694 llvm_unreachable("Unexpected Opcode");
16695 }
16696
16697 Result = DAG.getAnyExtOrTrunc(Result, DL, MVT::i1);
16698 } else {
16699 // Iteratively split the vector in half and combine using the bitwise
16700 // operation until it fits in a 64 bit register.
16701 while (VecVT.getSizeInBits() > 64) {
16702 SDValue Lo, Hi;
16703 std::tie(Lo, Hi) = DAG.SplitVector(Vec, DL);
16704 VecVT = Lo.getValueType();
16705 NumElems = VecVT.getVectorNumElements();
16706 Vec = DAG.getNode(ScalarOpcode, DL, VecVT, Lo, Hi);
16707 }
16708
16709 EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), VecVT.getSizeInBits());
16710
16711 // Do the remaining work on a scalar since it allows the code generator to
16712 // combine the shift and bitwise operation into one instruction and since
16713 // integer instructions can have higher throughput than vector instructions.
16714 SDValue Scalar = DAG.getBitcast(ScalarVT, Vec);
16715
16716 // Iteratively combine the lower and upper halves of the scalar using the
16717 // bitwise operation, halving the relevant region of the scalar in each
16718 // iteration, until the relevant region is just one element of the original
16719 // vector.
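    // For example (illustrative): a v8i8 XOR reduction bitcasts the vector to
    // an i64 scalar X and folds it as
    //   X ^= X >> 32; X ^= X >> 16; X ^= X >> 8;
    // after which the low byte of X holds the reduction result.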
16720 for (unsigned Shift = NumElems / 2; Shift > 0; Shift /= 2) {
16721 SDValue ShiftAmount =
16722 DAG.getConstant(Shift * ElemVT.getSizeInBits(), DL, MVT::i64);
16723 SDValue Shifted =
16724 DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmount);
16725 Scalar = DAG.getNode(ScalarOpcode, DL, ScalarVT, Scalar, Shifted);
16726 }
16727
16728 Result = DAG.getAnyExtOrTrunc(Scalar, DL, ElemVT);
16729 }
16730
16731 return DAG.getAnyExtOrTrunc(Result, DL, VT);
16732}
16733
16734SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
16735 SelectionDAG &DAG) const {
16736 SDValue Src = Op.getOperand(0);
16737 EVT SrcVT = Src.getValueType();
16738
16739 // Scalarize v2f16 to turn it into a faddp. This will be more efficient than
16740 // widening by inserting zeroes.
16741 if (Subtarget->hasFullFP16() && Op.getOpcode() == ISD::VECREDUCE_FADD &&
16742 SrcVT == MVT::v2f16) {
16743 SDLoc DL(Op);
16744 return DAG.getNode(ISD::FADD, DL, MVT::f16,
16745 DAG.getExtractVectorElt(DL, MVT::f16, Src, 0),
16746 DAG.getExtractVectorElt(DL, MVT::f16, Src, 1));
16747 }
16748
16749 // Try to lower fixed length reductions to SVE.
16750 bool OverrideNEON = !Subtarget->isNeonAvailable() ||
16751 Op.getOpcode() == ISD::VECREDUCE_AND ||
16752 Op.getOpcode() == ISD::VECREDUCE_OR ||
16753 Op.getOpcode() == ISD::VECREDUCE_XOR ||
16754 Op.getOpcode() == ISD::VECREDUCE_FADD ||
16755 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
16756 SrcVT.getVectorElementType() == MVT::i64);
16757  if (SrcVT.isScalableVector() ||
16758      useSVEForFixedLengthVectorVT(
16759 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
16760
16761 if (SrcVT.getVectorElementType() == MVT::i1)
16762 return LowerPredReductionToSVE(Op, DAG);
16763
16764 switch (Op.getOpcode()) {
16765 case ISD::VECREDUCE_ADD:
16766 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
16767 case ISD::VECREDUCE_AND:
16768 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
16769 case ISD::VECREDUCE_OR:
16770 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
16771 case ISD::VECREDUCE_SMAX:
16772 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
16773 case ISD::VECREDUCE_SMIN:
16774 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
16775 case ISD::VECREDUCE_UMAX:
16776 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
16777 case ISD::VECREDUCE_UMIN:
16778 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
16779 case ISD::VECREDUCE_XOR:
16780 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
16781 case ISD::VECREDUCE_FADD:
16782 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
16783 case ISD::VECREDUCE_FMAX:
16784 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
16785 case ISD::VECREDUCE_FMIN:
16786 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
16787 case ISD::VECREDUCE_FMAXIMUM:
16788 return LowerReductionToSVE(AArch64ISD::FMAXV_PRED, Op, DAG);
16789 case ISD::VECREDUCE_FMINIMUM:
16790 return LowerReductionToSVE(AArch64ISD::FMINV_PRED, Op, DAG);
16791 default:
16792 llvm_unreachable("Unhandled fixed length reduction");
16793 }
16794 }
16795
16796 // Lower NEON reductions.
16797 SDLoc DL(Op);
16798 switch (Op.getOpcode()) {
16799 case ISD::VECREDUCE_AND:
16800 case ISD::VECREDUCE_OR:
16801 case ISD::VECREDUCE_XOR:
16802 return getVectorBitwiseReduce(Op.getOpcode(), Op.getOperand(0),
16803 Op.getValueType(), DL, DAG);
16804 case ISD::VECREDUCE_ADD:
16805 return getReductionSDNode(AArch64ISD::UADDV, DL, Op, DAG);
16806 case ISD::VECREDUCE_SMAX:
16807 return getReductionSDNode(AArch64ISD::SMAXV, DL, Op, DAG);
16808 case ISD::VECREDUCE_SMIN:
16809 return getReductionSDNode(AArch64ISD::SMINV, DL, Op, DAG);
16810 case ISD::VECREDUCE_UMAX:
16811 return getReductionSDNode(AArch64ISD::UMAXV, DL, Op, DAG);
16812 case ISD::VECREDUCE_UMIN:
16813 return getReductionSDNode(AArch64ISD::UMINV, DL, Op, DAG);
16814 default:
16815 llvm_unreachable("Unhandled reduction");
16816 }
16817}
16818
16819SDValue AArch64TargetLowering::LowerVECREDUCE_MUL(SDValue Op,
16820 SelectionDAG &DAG) const {
16821 SDLoc DL(Op);
16822 SDValue Src = Op.getOperand(0);
16823 EVT SrcVT = Src.getValueType();
16824 assert(SrcVT.isScalableVector() && "Unexpected operand type!");
16825
16826 SDVTList SrcVTs = DAG.getVTList(SrcVT, SrcVT);
16827 unsigned BaseOpc = ISD::getVecReduceBaseOpcode(Op.getOpcode());
16828 SDValue Identity = DAG.getNeutralElement(BaseOpc, DL, SrcVT, Op->getFlags());
16829
16830  // Whilst we don't know the size of the vector we do know the maximum size,
16831  // so we can perform a tree reduction with an identity vector, which means
16832  // that once we arrive at the result the remaining stages (when the vector
16833  // is smaller than the maximum) have no effect.
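  // For example (illustrative): for an nxv4i32 multiply reduction, each stage
  // deinterleaves the vector into its even and odd lanes (padded with the
  // identity element, 1 for integer multiplies) and multiplies the two halves,
  // so after log2(maximum element count) stages lane 0 holds the product of
  // every element regardless of the runtime vector length.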
16834
16835  unsigned Segments = AArch64::SVEMaxBitsPerVector / AArch64::SVEBitsPerBlock;
16836  unsigned Stages = llvm::Log2_32(Segments * SrcVT.getVectorMinNumElements());
16837
16838 for (unsigned I = 0; I < Stages; ++I) {
16839 Src = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, SrcVTs, Src, Identity);
16840 Src = DAG.getNode(BaseOpc, DL, SrcVT, Src.getValue(0), Src.getValue(1));
16841 }
16842
16843 return DAG.getExtractVectorElt(DL, Op.getValueType(), Src, 0);
16844}
16845
16846SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
16847 SelectionDAG &DAG) const {
16848 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16849 // No point replacing if we don't have the relevant instruction/libcall anyway
16850 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
16851 return SDValue();
16852
16853 // LSE has an atomic load-clear instruction, but not a load-and.
16854 SDLoc DL(Op);
16855 MVT VT = Op.getSimpleValueType();
16856 assert(VT != MVT::i128 && "Handled elsewhere, code replicated.");
16857 SDValue RHS = Op.getOperand(2);
16858 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
16859 RHS = DAG.getNode(ISD::XOR, DL, VT, DAG.getAllOnesConstant(DL, VT), RHS);
16860 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, DL, AN->getMemoryVT(),
16861 Op.getOperand(0), Op.getOperand(1), RHS,
16862 AN->getMemOperand());
16863}
16864
16865SDValue
16866AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(SDValue Op,
16867 SelectionDAG &DAG) const {
16868
16869 SDLoc DL(Op);
16870 // Get the inputs.
16871 SDNode *Node = Op.getNode();
16872 SDValue Chain = Op.getOperand(0);
16873 SDValue Size = Op.getOperand(1);
16874 MaybeAlign Align =
16875 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16876 EVT VT = Node->getValueType(0);
16877
16878  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
16879          "no-stack-arg-probe")) {
16880 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16881 Chain = SP.getValue(1);
16882 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16883 if (Align)
16884 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16885 DAG.getSignedConstant(-Align->value(), DL, VT));
16886 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16887 SDValue Ops[2] = {SP, Chain};
16888 return DAG.getMergeValues(Ops, DL);
16889 }
16890
16891 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16892
16893 EVT PtrVT = getPointerTy(DAG.getDataLayout());
16894 SDValue Callee = DAG.getTargetExternalSymbol(Subtarget->getChkStkName(),
16895 PtrVT, 0);
16896
16897 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
16898 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
16899 if (Subtarget->hasCustomCallingConv())
16900 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
16901
16902 Size = DAG.getNode(ISD::SRL, DL, MVT::i64, Size,
16903 DAG.getConstant(4, DL, MVT::i64));
16904 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X15, Size, SDValue());
16905 Chain =
16906 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
16907 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
16908 DAG.getRegisterMask(Mask), Chain.getValue(1));
16909 // To match the actual intent better, we should read the output from X15 here
16910 // again (instead of potentially spilling it to the stack), but rereading Size
16911 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
16912 // here.
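  // For example (illustrative): the probe helper expects the allocation size
  // in 16-byte units in X15, which is why Size is shifted right by 4 before
  // the call and scaled back up by 4 afterwards; a 4096-byte allocation is
  // passed as 256.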
16913
16914 Size = DAG.getNode(ISD::SHL, DL, MVT::i64, Size,
16915 DAG.getConstant(4, DL, MVT::i64));
16916
16917 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16918 Chain = SP.getValue(1);
16919 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16920 if (Align)
16921 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16922 DAG.getSignedConstant(-Align->value(), DL, VT));
16923 Chain = DAG.getCopyToReg(Chain, DL, AArch64::SP, SP);
16924
16925 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), DL);
16926
16927 SDValue Ops[2] = {SP, Chain};
16928 return DAG.getMergeValues(Ops, DL);
16929}
16930
16931SDValue
16932AArch64TargetLowering::LowerInlineDYNAMIC_STACKALLOC(SDValue Op,
16933 SelectionDAG &DAG) const {
16934 // Get the inputs.
16935 SDNode *Node = Op.getNode();
16936 SDValue Chain = Op.getOperand(0);
16937 SDValue Size = Op.getOperand(1);
16938
16939 MaybeAlign Align =
16940 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
16941 SDLoc DL(Op);
16942 EVT VT = Node->getValueType(0);
16943
16944 // Construct the new SP value in a GPR.
16945 SDValue SP = DAG.getCopyFromReg(Chain, DL, AArch64::SP, MVT::i64);
16946 Chain = SP.getValue(1);
16947 SP = DAG.getNode(ISD::SUB, DL, MVT::i64, SP, Size);
16948 if (Align)
16949 SP = DAG.getNode(ISD::AND, DL, VT, SP.getValue(0),
16950 DAG.getSignedConstant(-Align->value(), DL, VT));
16951
16952 // Set the real SP to the new value with a probing loop.
16953 Chain = DAG.getNode(AArch64ISD::PROBED_ALLOCA, DL, MVT::Other, Chain, SP);
16954 SDValue Ops[2] = {SP, Chain};
16955 return DAG.getMergeValues(Ops, DL);
16956}
16957
16958SDValue
16959AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
16960 SelectionDAG &DAG) const {
16961 MachineFunction &MF = DAG.getMachineFunction();
16962
16963 if (Subtarget->isTargetWindows())
16964 return LowerWindowsDYNAMIC_STACKALLOC(Op, DAG);
16965 else if (hasInlineStackProbe(MF))
16966 return LowerInlineDYNAMIC_STACKALLOC(Op, DAG);
16967 else
16968 return SDValue();
16969}
16970
16971SDValue AArch64TargetLowering::LowerAVG(SDValue Op, SelectionDAG &DAG,
16972 unsigned NewOp) const {
16973 if (Subtarget->hasSVE2())
16974 return LowerToPredicatedOp(Op, DAG, NewOp);
16975
16976 // Default to expand.
16977 return SDValue();
16978}
16979
16980SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
16981 SelectionDAG &DAG) const {
16982 EVT VT = Op.getValueType();
16983 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
16984
16985 SDLoc DL(Op);
16986 APInt MulImm = Op.getConstantOperandAPInt(0);
16987 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
16988 VT);
16989}
16990
16991/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
16992template <unsigned NumVecs>
16993static bool
16994setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
16995              AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
16996  Info.opc = ISD::INTRINSIC_VOID;
16997  // Retrieve EC from first vector argument.
16998  const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
16999  ElementCount EC = VT.getVectorElementCount();
17000#ifndef NDEBUG
17001 // Check the assumption that all input vectors are the same type.
17002 for (unsigned I = 0; I < NumVecs; ++I)
17003 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
17004 "Invalid type.");
17005#endif
17006 // memVT is `NumVecs * VT`.
17007  Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
17008                                EC * NumVecs);
17009 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
17010 Info.offset = 0;
17011 Info.align.reset();
17012  Info.flags = MachineMemOperand::MOStore;
17013  return true;
17014}
17015
17016/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
17017/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
17018/// specified in the intrinsic calls.
17019bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
17020                                               const CallInst &I,
17021 MachineFunction &MF,
17022 unsigned Intrinsic) const {
17023 auto &DL = I.getDataLayout();
17024 switch (Intrinsic) {
17025 case Intrinsic::aarch64_sve_st2:
17026 return setInfoSVEStN<2>(*this, DL, Info, I);
17027 case Intrinsic::aarch64_sve_st3:
17028 return setInfoSVEStN<3>(*this, DL, Info, I);
17029 case Intrinsic::aarch64_sve_st4:
17030 return setInfoSVEStN<4>(*this, DL, Info, I);
17031 case Intrinsic::aarch64_neon_ld2:
17032 case Intrinsic::aarch64_neon_ld3:
17033 case Intrinsic::aarch64_neon_ld4:
17034 case Intrinsic::aarch64_neon_ld1x2:
17035 case Intrinsic::aarch64_neon_ld1x3:
17036 case Intrinsic::aarch64_neon_ld1x4: {
17037 Info.opc = ISD::INTRINSIC_W_CHAIN;
17038 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
17039 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17040 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17041 Info.offset = 0;
17042 Info.align.reset();
17043 // volatile loads with NEON intrinsics not supported
17044 Info.flags = MachineMemOperand::MOLoad;
17045 return true;
17046 }
17047 case Intrinsic::aarch64_neon_ld2lane:
17048 case Intrinsic::aarch64_neon_ld3lane:
17049 case Intrinsic::aarch64_neon_ld4lane:
17050 case Intrinsic::aarch64_neon_ld2r:
17051 case Intrinsic::aarch64_neon_ld3r:
17052 case Intrinsic::aarch64_neon_ld4r: {
17053 Info.opc = ISD::INTRINSIC_W_CHAIN;
17054    // The ldN intrinsics return a struct of vectors, all with the same type.
17055 Type *RetTy = I.getType();
17056 auto *StructTy = cast<StructType>(RetTy);
17057 unsigned NumElts = StructTy->getNumElements();
17058 Type *VecTy = StructTy->getElementType(0);
17059 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17060 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17061 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17062 Info.offset = 0;
17063 Info.align.reset();
17064 // volatile loads with NEON intrinsics not supported
17065 Info.flags = MachineMemOperand::MOLoad;
17066 return true;
17067 }
17068 case Intrinsic::aarch64_neon_st2:
17069 case Intrinsic::aarch64_neon_st3:
17070 case Intrinsic::aarch64_neon_st4:
17071 case Intrinsic::aarch64_neon_st1x2:
17072 case Intrinsic::aarch64_neon_st1x3:
17073 case Intrinsic::aarch64_neon_st1x4: {
17074 Info.opc = ISD::INTRINSIC_VOID;
17075 unsigned NumElts = 0;
17076 for (const Value *Arg : I.args()) {
17077 Type *ArgTy = Arg->getType();
17078 if (!ArgTy->isVectorTy())
17079 break;
17080 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
17081 }
17082 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
17083 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17084 Info.offset = 0;
17085 Info.align.reset();
17086 // volatile stores with NEON intrinsics not supported
17087 Info.flags = MachineMemOperand::MOStore;
17088 return true;
17089 }
17090 case Intrinsic::aarch64_neon_st2lane:
17091 case Intrinsic::aarch64_neon_st3lane:
17092 case Intrinsic::aarch64_neon_st4lane: {
17093 Info.opc = ISD::INTRINSIC_VOID;
17094 unsigned NumElts = 0;
17095    // All vector arguments have the same type.
17096 Type *VecTy = I.getArgOperand(0)->getType();
17097 MVT EleVT = MVT::getVT(VecTy).getVectorElementType();
17098
17099 for (const Value *Arg : I.args()) {
17100 Type *ArgTy = Arg->getType();
17101 if (!ArgTy->isVectorTy())
17102 break;
17103 NumElts += 1;
17104 }
17105
17106 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), EleVT, NumElts);
17107 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
17108 Info.offset = 0;
17109 Info.align.reset();
17110 // volatile stores with NEON intrinsics not supported
17111 Info.flags = MachineMemOperand::MOStore;
17112 return true;
17113 }
17114 case Intrinsic::aarch64_ldaxr:
17115 case Intrinsic::aarch64_ldxr: {
17116 Type *ValTy = I.getParamElementType(0);
17117 Info.opc = ISD::INTRINSIC_W_CHAIN;
17118 Info.memVT = MVT::getVT(ValTy);
17119 Info.ptrVal = I.getArgOperand(0);
17120 Info.offset = 0;
17121 Info.align = DL.getABITypeAlign(ValTy);
17122    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17123    return true;
17124 }
17125 case Intrinsic::aarch64_stlxr:
17126 case Intrinsic::aarch64_stxr: {
17127 Type *ValTy = I.getParamElementType(1);
17128 Info.opc = ISD::INTRINSIC_W_CHAIN;
17129 Info.memVT = MVT::getVT(ValTy);
17130 Info.ptrVal = I.getArgOperand(1);
17131 Info.offset = 0;
17132 Info.align = DL.getABITypeAlign(ValTy);
17133    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17134    return true;
17135 }
17136 case Intrinsic::aarch64_ldaxp:
17137 case Intrinsic::aarch64_ldxp:
17138 Info.opc = ISD::INTRINSIC_W_CHAIN;
17139 Info.memVT = MVT::i128;
17140 Info.ptrVal = I.getArgOperand(0);
17141 Info.offset = 0;
17142 Info.align = Align(16);
17143    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
17144    return true;
17145 case Intrinsic::aarch64_stlxp:
17146 case Intrinsic::aarch64_stxp:
17147 Info.opc = ISD::INTRINSIC_W_CHAIN;
17148 Info.memVT = MVT::i128;
17149 Info.ptrVal = I.getArgOperand(2);
17150 Info.offset = 0;
17151 Info.align = Align(16);
17152    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
17153    return true;
17154 case Intrinsic::aarch64_sve_ldnt1: {
17155 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
17156 Info.opc = ISD::INTRINSIC_W_CHAIN;
17157 Info.memVT = MVT::getVT(I.getType());
17158 Info.ptrVal = I.getArgOperand(1);
17159 Info.offset = 0;
17160 Info.align = DL.getABITypeAlign(ElTy);
17161    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
17162    return true;
17163 }
17164 case Intrinsic::aarch64_sve_stnt1: {
17165 Type *ElTy =
17166 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
17167 Info.opc = ISD::INTRINSIC_W_CHAIN;
17168 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
17169 Info.ptrVal = I.getArgOperand(2);
17170 Info.offset = 0;
17171 Info.align = DL.getABITypeAlign(ElTy);
17172    Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
17173    return true;
17174 }
17175 case Intrinsic::aarch64_mops_memset_tag: {
17176 Value *Dst = I.getArgOperand(0);
17177 Value *Val = I.getArgOperand(1);
17178 Info.opc = ISD::INTRINSIC_W_CHAIN;
17179 Info.memVT = MVT::getVT(Val->getType());
17180 Info.ptrVal = Dst;
17181 Info.offset = 0;
17182 Info.align = I.getParamAlign(0).valueOrOne();
17183 Info.flags = MachineMemOperand::MOStore;
17184 // The size of the memory being operated on is unknown at this point
17185 Info.size = MemoryLocation::UnknownSize;
17186 return true;
17187 }
17188 default:
17189 break;
17190 }
17191
17192 return false;
17193}
17194
17195bool AArch64TargetLowering::shouldReduceLoadWidth(
17196 SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT,
17197 std::optional<unsigned> ByteOffset) const {
17198 // TODO: This may be worth removing. Check regression tests for diffs.
17199 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT,
17200 ByteOffset))
17201 return false;
17202
17203 // If we're reducing the load width in order to avoid having to use an extra
17204 // instruction to do extension then it's probably a good idea.
17205 if (ExtTy != ISD::NON_EXTLOAD)
17206 return true;
17207 // Don't reduce load width if it would prevent us from combining a shift into
17208 // the offset.
17209 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
17210 assert(Mem);
17211 const SDValue &Base = Mem->getBasePtr();
17212 if (Base.getOpcode() == ISD::ADD &&
17213 Base.getOperand(1).getOpcode() == ISD::SHL &&
17214 Base.getOperand(1).hasOneUse() &&
17215 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
17216 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
17217 if (Mem->getMemoryVT().isScalableVector())
17218 return false;
17219 // The shift can be combined if it matches the size of the value being
17220 // loaded (and so reducing the width would make it not match).
17221 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
17222 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
17223 if (ShiftAmount == Log2_32(LoadBytes))
17224 return false;
17225 }
17226 // We have no reason to disallow reducing the load width, so allow it.
17227 return true;
17228}
17229
17230// Treat a sext_inreg(extract(..)) as free if it has multiple uses.
17231bool AArch64TargetLowering::shouldRemoveRedundantExtend(SDValue Extend) const {
17232 EVT VT = Extend.getValueType();
17233 if ((VT == MVT::i64 || VT == MVT::i32) && Extend->use_size()) {
17234 SDValue Extract = Extend.getOperand(0);
17235 if (Extract.getOpcode() == ISD::ANY_EXTEND && Extract.hasOneUse())
17236 Extract = Extract.getOperand(0);
17237 if (Extract.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Extract.hasOneUse()) {
17238 EVT VecVT = Extract.getOperand(0).getValueType();
17239 if (VecVT.getScalarType() == MVT::i8 || VecVT.getScalarType() == MVT::i16)
17240 return false;
17241 }
17242 }
17243 return true;
17244}
17245
17246// Truncations from a 64-bit GPR to a 32-bit GPR are free.
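// For example, 'trunc i64 %x to i32' needs no instruction: the result is simply
// read from the W sub-register of the X register that holds %x.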
17247bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
17248 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17249 return false;
17250 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedValue();
17251 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedValue();
17252 return NumBits1 > NumBits2;
17253}
17254bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
17255 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17256 return false;
17257 uint64_t NumBits1 = VT1.getFixedSizeInBits();
17258 uint64_t NumBits2 = VT2.getFixedSizeInBits();
17259 return NumBits1 > NumBits2;
17260}
17261
17262/// Check if it is profitable to hoist an instruction in then/else to if.
17263/// Not profitable if I and its user can form an FMA instruction
17264/// because we prefer FMSUB/FMADD.
17265bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
17266 if (I->getOpcode() != Instruction::FMul)
17267 return true;
17268
17269 if (!I->hasOneUse())
17270 return true;
17271
17272 Instruction *User = I->user_back();
17273
17274 if (!(User->getOpcode() == Instruction::FSub ||
17275 User->getOpcode() == Instruction::FAdd))
17276 return true;
17277
17278 const TargetOptions &Options = getTargetMachine().Options;
17279 const Function *F = I->getFunction();
17280 const DataLayout &DL = F->getDataLayout();
17281 Type *Ty = User->getOperand(0)->getType();
17282
17283 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
17284 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
17285 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17286 I->getFastMathFlags().allowContract()));
17287}
17288
17289// All 32-bit GPR operations implicitly zero the high-half of the corresponding
17290// 64-bit GPR.
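// For example, 'zext i32 %x to i64' is free: any instruction writing w0 already
// clears bits [63:32] of x0, so no extra UXTW or AND is needed.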
17291bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
17292 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
17293 return false;
17294 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
17295 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
17296 return NumBits1 == 32 && NumBits2 == 64;
17297}
17298bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
17299 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
17300 return false;
17301 unsigned NumBits1 = VT1.getSizeInBits();
17302 unsigned NumBits2 = VT2.getSizeInBits();
17303 return NumBits1 == 32 && NumBits2 == 64;
17304}
17305
17306bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
17307 EVT VT1 = Val.getValueType();
17308 if (isZExtFree(VT1, VT2)) {
17309 return true;
17310 }
17311
17312 if (Val.getOpcode() != ISD::LOAD)
17313 return false;
17314
17315 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
17316 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
17317 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
17318 VT1.getSizeInBits() <= 32);
17319}
17320
17321bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
17322 if (isa<FPExtInst>(Ext))
17323 return false;
17324
17325 // Vector types are not free.
17326 if (Ext->getType()->isVectorTy())
17327 return false;
17328
17329 for (const Use &U : Ext->uses()) {
17330 // The extension is free if we can fold it with a left shift in an
17331 // addressing mode or an arithmetic operation: add, sub, and cmp.
17332
17333 // Is there a shift?
17334 const Instruction *Instr = cast<Instruction>(U.getUser());
17335
17336 // Is this a constant shift?
17337 switch (Instr->getOpcode()) {
17338 case Instruction::Shl:
17339 if (!isa<ConstantInt>(Instr->getOperand(1)))
17340 return false;
17341 break;
17342 case Instruction::GetElementPtr: {
17343 gep_type_iterator GTI = gep_type_begin(Instr);
17344 auto &DL = Ext->getDataLayout();
17345 std::advance(GTI, U.getOperandNo()-1);
17346 Type *IdxTy = GTI.getIndexedType();
17347 // This extension will end up with a shift because of the scaling factor.
17348 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
17349 // Get the shift amount based on the scaling factor:
17350 // log2(sizeof(IdxTy)) - log2(8).
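// For example, an i32 index type gives log2(32) - 3 = 2, which matches the
// 'LSL #2' available in the register-offset addressing mode; an i8 element
// type gives 0 and is rejected below.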
17351 if (IdxTy->isScalableTy())
17352 return false;
17353 uint64_t ShiftAmt =
17354 llvm::countr_zero(DL.getTypeStoreSizeInBits(IdxTy).getFixedValue()) -
17355 3;
17356 // Is the constant foldable in the shift of the addressing mode?
17357 // I.e., shift amount is between 1 and 4 inclusive.
17358 if (ShiftAmt == 0 || ShiftAmt > 4)
17359 return false;
17360 break;
17361 }
17362 case Instruction::Trunc:
17363 // Check if this is a noop.
17364 // trunc(sext ty1 to ty2) to ty1.
17365 if (Instr->getType() == Ext->getOperand(0)->getType())
17366 continue;
17367 [[fallthrough]];
17368 default:
17369 return false;
17370 }
17371
17372 // At this point we can use the bfm family, so this extension is free
17373 // for that use.
17374 }
17375 return true;
17376}
17377
17378static bool createTblShuffleMask(unsigned SrcWidth, unsigned DstWidth,
17379 unsigned NumElts, bool IsLittleEndian,
17380 SmallVectorImpl<int> &Mask) {
17381 if (DstWidth % 8 != 0 || DstWidth <= 16 || DstWidth > 64)
17382 return false;
17383
17384 assert(DstWidth % SrcWidth == 0 &&
17385 "TBL lowering is not supported for a conversion instruction with this "
17386 "source and destination element type.");
17387
17388 unsigned Factor = DstWidth / SrcWidth;
17389 unsigned MaskLen = NumElts * Factor;
17390
17391 Mask.clear();
17392 Mask.resize(MaskLen, NumElts);
17393
17394 unsigned SrcIndex = 0;
17395 for (unsigned I = IsLittleEndian ? 0 : Factor - 1; I < MaskLen; I += Factor)
17396 Mask[I] = SrcIndex++;
17397
17398 return true;
17399}
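// For example, createTblShuffleMask(8, 32, 8, /*IsLittleEndian=*/true, Mask)
// sets Factor = 4 and produces
//   <0,8,8,8, 1,8,8,8, 2,8,8,8, 3,8,8,8, 4,8,8,8, 5,8,8,8, 6,8,8,8, 7,8,8,8>,
// where index 8 (== NumElts) selects the zero element supplied by the second
// shuffle operand in the callers below.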
17400
17401static Value *createTblShuffleForZExt(IRBuilder<> &Builder, Value *Op,
17402 FixedVectorType *ZExtTy,
17403 FixedVectorType *DstTy,
17404 bool IsLittleEndian) {
17405 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17406 unsigned NumElts = SrcTy->getNumElements();
17407 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17408 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17409
17410 SmallVector<int> Mask;
17411 if (!createTblShuffleMask(SrcWidth, DstWidth, NumElts, IsLittleEndian, Mask))
17412 return nullptr;
17413
17414 auto *FirstEltZero = Builder.CreateInsertElement(
17415 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17416 Value *Result = Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17417 Result = Builder.CreateBitCast(Result, DstTy);
17418 if (DstTy != ZExtTy)
17419 Result = Builder.CreateZExt(Result, ZExtTy);
17420 return Result;
17421}
17422
17423static Value *createTblShuffleForSExt(IRBuilder<> &Builder, Value *Op,
17424 FixedVectorType *DstTy,
17425 bool IsLittleEndian) {
17426 auto *SrcTy = cast<FixedVectorType>(Op->getType());
17427 auto SrcWidth = cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17428 auto DstWidth = cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17429
17430 SmallVector<int> Mask;
17431 if (!createTblShuffleMask(SrcWidth, DstWidth, SrcTy->getNumElements(),
17432 !IsLittleEndian, Mask))
17433 return nullptr;
17434
17435 auto *FirstEltZero = Builder.CreateInsertElement(
17436 PoisonValue::get(SrcTy), Builder.getIntN(SrcWidth, 0), uint64_t(0));
17437
17438 return Builder.CreateShuffleVector(Op, FirstEltZero, Mask);
17439}
17440
17441static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
17442 IRBuilder<> Builder(TI);
17443 SmallVector<Value *> Parts;
17444 int NumElements = cast<FixedVectorType>(TI->getType())->getNumElements();
17445 auto *SrcTy = cast<FixedVectorType>(TI->getOperand(0)->getType());
17446 auto *DstTy = cast<FixedVectorType>(TI->getType());
17447 assert(SrcTy->getElementType()->isIntegerTy() &&
17448 "Non-integer type source vector element is not supported");
17449 assert(DstTy->getElementType()->isIntegerTy(8) &&
17450 "Unsupported destination vector element type");
17451 unsigned SrcElemTySz =
17452 cast<IntegerType>(SrcTy->getElementType())->getBitWidth();
17453 unsigned DstElemTySz =
17454 cast<IntegerType>(DstTy->getElementType())->getBitWidth();
17455 assert((SrcElemTySz % DstElemTySz == 0) &&
17456 "Cannot lower truncate to tbl instructions for a source element size "
17457 "that is not divisible by the destination element size");
17458 unsigned TruncFactor = SrcElemTySz / DstElemTySz;
17459 assert((SrcElemTySz == 16 || SrcElemTySz == 32 || SrcElemTySz == 64) &&
17460 "Unsupported source vector element type size");
17461 Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), 16);
17462
17463 // Create a mask to choose every nth byte from the source vector table of
17464 // bytes to create the truncated destination vector, where 'n' is the truncate
17465 // ratio. For example, for a truncate from Yxi64 to Yxi8, choose
17466 // 0,8,16,..Y*8th bytes for the little-endian format
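// For example, truncating <8 x i64> to <8 x i8> on little-endian selects bytes
// 0, 8, 16, ..., 56 from the 64-byte table; the remaining 8 mask lanes are set
// to 255, which is out of range and therefore yields 0 from TBL.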
17467 SmallVector<Constant *, 16> MaskConst;
17468 for (int Itr = 0; Itr < 16; Itr++) {
17469 if (Itr < NumElements)
17470 MaskConst.push_back(Builder.getInt8(
17471 IsLittleEndian ? Itr * TruncFactor
17472 : Itr * TruncFactor + (TruncFactor - 1)));
17473 else
17474 MaskConst.push_back(Builder.getInt8(255));
17475 }
17476
17477 int MaxTblSz = 128 * 4;
17478 int MaxSrcSz = SrcElemTySz * NumElements;
17479 int ElemsPerTbl =
17480 (MaxTblSz > MaxSrcSz) ? NumElements : (MaxTblSz / SrcElemTySz);
17481 assert(ElemsPerTbl <= 16 &&
17482 "Maximum elements selected using TBL instruction cannot exceed 16!");
17483
17484 int ShuffleCount = 128 / SrcElemTySz;
17485 SmallVector<int> ShuffleLanes;
17486 for (int i = 0; i < ShuffleCount; ++i)
17487 ShuffleLanes.push_back(i);
17488
17489 // Create TBL's table of bytes in 1,2,3 or 4 FP/SIMD registers using shuffles
17490 // over the source vector. If TBL's maximum 4 FP/SIMD registers are saturated,
17491 // call TBL & save the result in a vector of TBL results for combining later.
17492 SmallVector<Value *> Results;
17493 while (ShuffleLanes.back() < NumElements) {
17494 Parts.push_back(Builder.CreateBitCast(
17495 Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy));
17496
17497 if (Parts.size() == 4) {
17498 Parts.push_back(ConstantVector::get(MaskConst));
17499 Results.push_back(
17500 Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
17501 Parts.clear();
17502 }
17503
17504 for (int i = 0; i < ShuffleCount; ++i)
17505 ShuffleLanes[i] += ShuffleCount;
17506 }
17507
17508 assert((Parts.empty() || Results.empty()) &&
17509 "Lowering trunc for vectors requiring different TBL instructions is "
17510 "not supported!");
17511 // Call TBL for the residual table bytes present in 1,2, or 3 FP/SIMD
17512 // registers
17513 if (!Parts.empty()) {
17514 Intrinsic::ID TblID;
17515 switch (Parts.size()) {
17516 case 1:
17517 TblID = Intrinsic::aarch64_neon_tbl1;
17518 break;
17519 case 2:
17520 TblID = Intrinsic::aarch64_neon_tbl2;
17521 break;
17522 case 3:
17523 TblID = Intrinsic::aarch64_neon_tbl3;
17524 break;
17525 }
17526
17527 Parts.push_back(ConstantVector::get(MaskConst));
17528 Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
17529 }
17530
17531 // Extract the destination vector from TBL result(s) after combining them
17532 // where applicable. Currently, at most two TBLs are supported.
17533 assert(Results.size() <= 2 && "Trunc lowering does not support generation of "
17534 "more than 2 tbl instructions!");
17535 Value *FinalResult = Results[0];
17536 if (Results.size() == 1) {
17537 if (ElemsPerTbl < 16) {
17538 SmallVector<int> FinalMask(ElemsPerTbl);
17539 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17540 FinalResult = Builder.CreateShuffleVector(Results[0], FinalMask);
17541 }
17542 } else {
17543 SmallVector<int> FinalMask(ElemsPerTbl * Results.size());
17544 if (ElemsPerTbl < 16) {
17545 std::iota(FinalMask.begin(), FinalMask.begin() + ElemsPerTbl, 0);
17546 std::iota(FinalMask.begin() + ElemsPerTbl, FinalMask.end(), 16);
17547 } else {
17548 std::iota(FinalMask.begin(), FinalMask.end(), 0);
17549 }
17550 FinalResult =
17551 Builder.CreateShuffleVector(Results[0], Results[1], FinalMask);
17552 }
17553
17554 TI->replaceAllUsesWith(FinalResult);
17555 TI->eraseFromParent();
17556}
17557
17558bool AArch64TargetLowering::optimizeExtendOrTruncateConversion(
17559 Instruction *I, Loop *L, const TargetTransformInfo &TTI) const {
17560 // shuffle_vector instructions are serialized when targeting SVE,
17561 // see LowerSPLAT_VECTOR. This peephole is not beneficial.
17562 if (!EnableExtToTBL || Subtarget->useSVEForFixedLengthVectors())
17563 return false;
17564
17565 // Try to optimize conversions using tbl. This requires materializing constant
17566 // index vectors, which can increase code size and add loads. Skip the
17567 // transform unless the conversion is in a loop block guaranteed to execute
17568 // and we are not optimizing for size.
17569 Function *F = I->getParent()->getParent();
17570 if (!L || L->getHeader() != I->getParent() || F->hasOptSize())
17571 return false;
17572
17573 auto *SrcTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
17574 auto *DstTy = dyn_cast<FixedVectorType>(I->getType());
17575 if (!SrcTy || !DstTy)
17576 return false;
17577
17578 // Convert 'zext <Y x i8> %x to <Y x i8X>' to a shuffle that can be
17579 // lowered to tbl instructions to insert the original i8 elements
17580 // into i8x lanes. This is enabled for cases where it is beneficial.
17581 auto *ZExt = dyn_cast<ZExtInst>(I);
17582 if (ZExt && SrcTy->getElementType()->isIntegerTy(8)) {
17583 auto DstWidth = DstTy->getElementType()->getScalarSizeInBits();
17584 if (DstWidth % 8 != 0)
17585 return false;
17586
17587 auto *TruncDstType =
17589 // If the ZExt can be lowered to a single ZExt to the next power-of-2 and
17590 // the remaining ZExt folded into the user, don't use tbl lowering.
17591 auto SrcWidth = SrcTy->getElementType()->getScalarSizeInBits();
17592 if (TTI.getCastInstrCost(I->getOpcode(), DstTy, TruncDstType,
17595 if (SrcWidth * 2 >= TruncDstType->getElementType()->getScalarSizeInBits())
17596 return false;
17597
17598 DstTy = TruncDstType;
17599 }
17600
17601 // mul(zext(i8), sext) can be transformed into smull(zext, sext) which
17602 // performs one extend implicitly. If DstWidth is at most 4 * SrcWidth, at
17603 // most one extra extend step is needed and using tbl is not profitable.
17604 // Similarly, bail out if partial_reduce(acc, zext(i8)) can be lowered to a
17605 // udot instruction.
17606 if (SrcWidth * 4 <= DstWidth) {
17607 if (all_of(I->users(), [&](auto *U) {
17608 using namespace llvm::PatternMatch;
17609 auto *SingleUser = cast<Instruction>(&*U);
17610 if (match(SingleUser, m_c_Mul(m_Specific(I), m_SExt(m_Value()))))
17611 return true;
17612 if (match(SingleUser,
17613 m_Intrinsic<Intrinsic::vector_partial_reduce_add>(
17614 m_Value(), m_Specific(I))))
17615 return true;
17616 return false;
17617 }))
17618 return false;
17619 }
17620
17621 if (DstTy->getScalarSizeInBits() >= 64)
17622 return false;
17623
17624 IRBuilder<> Builder(ZExt);
17625 Value *Result = createTblShuffleForZExt(
17626 Builder, ZExt->getOperand(0), cast<FixedVectorType>(ZExt->getType()),
17627 DstTy, Subtarget->isLittleEndian());
17628 if (!Result)
17629 return false;
17630 ZExt->replaceAllUsesWith(Result);
17631 ZExt->eraseFromParent();
17632 return true;
17633 }
17634
17635 auto *UIToFP = dyn_cast<UIToFPInst>(I);
17636 if (UIToFP && ((SrcTy->getElementType()->isIntegerTy(8) &&
17637 DstTy->getElementType()->isFloatTy()) ||
17638 (SrcTy->getElementType()->isIntegerTy(16) &&
17639 DstTy->getElementType()->isDoubleTy()))) {
17640 IRBuilder<> Builder(I);
17641 Value *ZExt = createTblShuffleForZExt(
17642 Builder, I->getOperand(0), FixedVectorType::getInteger(DstTy),
17643 FixedVectorType::getInteger(DstTy), Subtarget->isLittleEndian());
17644 assert(ZExt && "Cannot fail for the i8 to float conversion");
17645 auto *UI = Builder.CreateUIToFP(ZExt, DstTy);
17646 I->replaceAllUsesWith(UI);
17647 I->eraseFromParent();
17648 return true;
17649 }
17650
17651 auto *SIToFP = dyn_cast<SIToFPInst>(I);
17652 if (SIToFP && SrcTy->getElementType()->isIntegerTy(8) &&
17653 DstTy->getElementType()->isFloatTy()) {
17654 IRBuilder<> Builder(I);
17655 auto *Shuffle = createTblShuffleForSExt(Builder, I->getOperand(0),
17656 FixedVectorType::getInteger(DstTy),
17657 Subtarget->isLittleEndian());
17658 assert(Shuffle && "Cannot fail for the i8 to float conversion");
17659 auto *Cast = Builder.CreateBitCast(Shuffle, VectorType::getInteger(DstTy));
17660 auto *AShr = Builder.CreateAShr(Cast, 24, "", true);
17661 auto *SI = Builder.CreateSIToFP(AShr, DstTy);
17662 I->replaceAllUsesWith(SI);
17663 I->eraseFromParent();
17664 return true;
17665 }
17666
17667 // Convert 'fptoui <(8|16) x float> to <(8|16) x i8>' to a wide fptoui
17668// followed by a truncate lowered using tbl.4.
17669 auto *FPToUI = dyn_cast<FPToUIInst>(I);
17670 if (FPToUI &&
17671 (SrcTy->getNumElements() == 8 || SrcTy->getNumElements() == 16) &&
17672 SrcTy->getElementType()->isFloatTy() &&
17673 DstTy->getElementType()->isIntegerTy(8)) {
17674 IRBuilder<> Builder(I);
17675 auto *WideConv = Builder.CreateFPToUI(FPToUI->getOperand(0),
17676 VectorType::getInteger(SrcTy));
17677 auto *TruncI = Builder.CreateTrunc(WideConv, DstTy);
17678 I->replaceAllUsesWith(TruncI);
17679 I->eraseFromParent();
17680 createTblForTrunc(cast<TruncInst>(TruncI), Subtarget->isLittleEndian());
17681 return true;
17682 }
17683
17684 // Convert 'trunc <(8|16) x (i32|i64)> %x to <(8|16) x i8>' to an appropriate
17685 // tbl instruction selecting the lowest/highest (little/big endian) 8 bits
17686 // per lane of the input that is represented using 1,2,3 or 4 128-bit table
17687 // registers
17688 auto *TI = dyn_cast<TruncInst>(I);
17689 if (TI && DstTy->getElementType()->isIntegerTy(8) &&
17690 ((SrcTy->getElementType()->isIntegerTy(32) ||
17691 SrcTy->getElementType()->isIntegerTy(64)) &&
17692 (SrcTy->getNumElements() == 16 || SrcTy->getNumElements() == 8))) {
17693 createTblForTrunc(TI, Subtarget->isLittleEndian());
17694 return true;
17695 }
17696
17697 return false;
17698}
17699
17700bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
17701 Align &RequiredAlignment) const {
17702 if (!LoadedType.isSimple() ||
17703 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
17704 return false;
17705 // Cyclone supports unaligned accesses.
17706 RequiredAlignment = Align(1);
17707 unsigned NumBits = LoadedType.getSizeInBits();
17708 return NumBits == 32 || NumBits == 64;
17709}
17710
17711/// A helper function for determining the number of interleaved accesses we
17712/// will generate when lowering accesses of the given type.
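/// For example, a <16 x i32> access on NEON requires (16 * 32 + 127) / 128 = 4
/// interleaved accesses, whereas an <8 x i16> access requires only one.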
17713unsigned AArch64TargetLowering::getNumInterleavedAccesses(
17714 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
17715 unsigned VecSize = 128;
17716 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17717 unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
17718 if (UseScalable && isa<FixedVectorType>(VecTy))
17719 VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17720 return std::max<unsigned>(1, (MinElts * ElSize + 127) / VecSize);
17721}
17722
17723MachineMemOperand::Flags
17724AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
17725 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
17726 I.hasMetadata(FALKOR_STRIDED_ACCESS_MD))
17727 return MOStridedAccess;
17728 return MachineMemOperand::MONone;
17729}
17730
17731bool AArch64TargetLowering::isLegalInterleavedAccessType(
17732 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
17733 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
17734 auto EC = VecTy->getElementCount();
17735 unsigned MinElts = EC.getKnownMinValue();
17736
17737 UseScalable = false;
17738
17739 if (isa<FixedVectorType>(VecTy) && !Subtarget->isNeonAvailable() &&
17740 (!Subtarget->useSVEForFixedLengthVectors() ||
17742 return false;
17743
17744 if (isa<ScalableVectorType>(VecTy) &&
17745 !Subtarget->isSVEorStreamingSVEAvailable())
17746 return false;
17747
17748 // Ensure the number of vector elements is greater than 1.
17749 if (MinElts < 2)
17750 return false;
17751
17752 // Ensure the element type is legal.
17753 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
17754 return false;
17755
17756 if (EC.isScalable()) {
17757 UseScalable = true;
17758 return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
17759 }
17760
17761 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
17762 if (Subtarget->useSVEForFixedLengthVectors()) {
17763 unsigned MinSVEVectorSize =
17764 std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
17765 if (VecSize % MinSVEVectorSize == 0 ||
17766 (VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
17767 (!Subtarget->isNeonAvailable() || VecSize > 128))) {
17768 UseScalable = true;
17769 return true;
17770 }
17771 }
17772
17773 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
17774 // 128 will be split into multiple interleaved accesses.
17775 return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
17776}
17777
17778static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
17779 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
17780 return ScalableVectorType::get(VTy->getElementType(), 2);
17781
17782 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
17783 return ScalableVectorType::get(VTy->getElementType(), 4);
17784
17785 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
17786 return ScalableVectorType::get(VTy->getElementType(), 8);
17787
17788 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
17789 return ScalableVectorType::get(VTy->getElementType(), 8);
17790
17791 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
17792 return ScalableVectorType::get(VTy->getElementType(), 2);
17793
17794 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
17795 return ScalableVectorType::get(VTy->getElementType(), 4);
17796
17797 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
17798 return ScalableVectorType::get(VTy->getElementType(), 8);
17799
17800 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
17801 return ScalableVectorType::get(VTy->getElementType(), 16);
17802
17803 llvm_unreachable("Cannot handle input vector type");
17804}
17805
17806static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
17807 bool Scalable, Type *LDVTy,
17808 Type *PtrTy) {
17809 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17810 static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
17811 Intrinsic::aarch64_sve_ld3_sret,
17812 Intrinsic::aarch64_sve_ld4_sret};
17813 static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
17814 Intrinsic::aarch64_neon_ld3,
17815 Intrinsic::aarch64_neon_ld4};
17816 if (Scalable)
17817 return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy});
17818
17819 return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2],
17820 {LDVTy, PtrTy});
17821}
17822
17823static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
17824 bool Scalable, Type *STVTy,
17825 Type *PtrTy) {
17826 assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
17827 static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
17828 Intrinsic::aarch64_sve_st3,
17829 Intrinsic::aarch64_sve_st4};
17830 static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
17831 Intrinsic::aarch64_neon_st3,
17832 Intrinsic::aarch64_neon_st4};
17833 if (Scalable)
17834 return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy});
17835
17836 return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2],
17837 {STVTy, PtrTy});
17838}
17839
17840/// Lower an interleaved load into a ldN intrinsic.
17841///
17842/// E.g. Lower an interleaved load (Factor = 2):
17843/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
17844/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
17845/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
17846///
17847/// Into:
17848/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
17849/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
17850/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
17851bool AArch64TargetLowering::lowerInterleavedLoad(
17852 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
17853 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
17854 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
17855 "Invalid interleave factor");
17856 assert(!Shuffles.empty() && "Empty shufflevector input");
17857 assert(Shuffles.size() == Indices.size() &&
17858 "Unmatched number of shufflevectors and indices");
17859
17860 auto *LI = dyn_cast<LoadInst>(Load);
17861 if (!LI)
17862 return false;
17863 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
17864
17865 const DataLayout &DL = LI->getDataLayout();
17866
17867 VectorType *VTy = Shuffles[0]->getType();
17868
17869 // Skip if we do not have NEON and skip illegal vector types. We can
17870 // "legalize" wide vector types into multiple interleaved accesses as long as
17871 // the vector types are divisible by 128.
17872 bool UseScalable;
17873 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
17874 return false;
17875
17876 // Check if the interleave is a zext(shuffle), that can be better optimized
17877 // into shift / and masks. For the moment we do this just for uitofp (not
17878 // zext) to avoid issues with widening instructions.
17879 if (Shuffles.size() == 4 && all_of(Shuffles, [](ShuffleVectorInst *SI) {
17880 using namespace llvm::PatternMatch;
17881 return SI->hasOneUse() && match(SI->user_back(), m_UIToFP(m_Value())) &&
17882 SI->getType()->getScalarSizeInBits() * 4 ==
17883 SI->user_back()->getType()->getScalarSizeInBits();
17884 }))
17885 return false;
17886
17887 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
17888
17889 auto *FVTy = cast<FixedVectorType>(VTy);
17890
17891 // A pointer vector can not be the return type of the ldN intrinsics. Need to
17892 // load integer vectors first and then convert to pointer vectors.
17893 Type *EltTy = FVTy->getElementType();
17894 if (EltTy->isPointerTy())
17895 FVTy =
17896 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
17897
17898 // If we're going to generate more than one load, reset the sub-vector type
17899 // to something legal.
17900 FVTy = FixedVectorType::get(FVTy->getElementType(),
17901 FVTy->getNumElements() / NumLoads);
17902
17903 auto *LDVTy =
17904 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
17905
17906 IRBuilder<> Builder(LI);
17907
17908 // The base address of the load.
17909 Value *BaseAddr = LI->getPointerOperand();
17910
17911 Type *PtrTy = LI->getPointerOperandType();
17912 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
17913 LDVTy->getElementCount());
17914
17915 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
17916 UseScalable, LDVTy, PtrTy);
17917
17918 // Holds sub-vectors extracted from the load intrinsic return values. The
17919 // sub-vectors are associated with the shufflevector instructions they will
17920 // replace.
17921 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
17922
17923 Value *PTrue = nullptr;
17924 if (UseScalable) {
17925 std::optional<unsigned> PgPattern =
17926 getSVEPredPatternFromNumElements(FVTy->getNumElements());
17927 if (Subtarget->getMinSVEVectorSizeInBits() ==
17928 Subtarget->getMaxSVEVectorSizeInBits() &&
17929 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
17930 PgPattern = AArch64SVEPredPattern::all;
17931
17932 auto *PTruePat =
17933 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
17934 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
17935 {PTruePat});
17936 }
17937
17938 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
17939
17940 // If we're generating more than one load, compute the base address of
17941 // subsequent loads as an offset from the previous.
17942 if (LoadCount > 0)
17943 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
17944 FVTy->getNumElements() * Factor);
17945
17946 CallInst *LdN;
17947 if (UseScalable)
17948 LdN = Builder.CreateCall(LdNFunc, {PTrue, BaseAddr}, "ldN");
17949 else
17950 LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
17951
17952 // Extract and store the sub-vectors returned by the load intrinsic.
17953 for (unsigned i = 0; i < Shuffles.size(); i++) {
17954 ShuffleVectorInst *SVI = Shuffles[i];
17955 unsigned Index = Indices[i];
17956
17957 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
17958
17959 if (UseScalable)
17960 SubVec = Builder.CreateExtractVector(FVTy, SubVec, uint64_t(0));
17961
17962 // Convert the integer vector to pointer vector if the element is pointer.
17963 if (EltTy->isPointerTy())
17964 SubVec = Builder.CreateIntToPtr(
17965 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
17966 FVTy->getNumElements()));
17967
17968 SubVecs[SVI].push_back(SubVec);
17969 }
17970 }
17971
17972 // Replace uses of the shufflevector instructions with the sub-vectors
17973 // returned by the load intrinsic. If a shufflevector instruction is
17974 // associated with more than one sub-vector, those sub-vectors will be
17975 // concatenated into a single wide vector.
17976 for (ShuffleVectorInst *SVI : Shuffles) {
17977 auto &SubVec = SubVecs[SVI];
17978 auto *WideVec =
17979 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
17980 SVI->replaceAllUsesWith(WideVec);
17981 }
17982
17983 return true;
17984}
17985
17986template <typename Iter>
17987bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
17988 int MaxLookupDist = 20;
17989 unsigned IdxWidth = DL.getIndexSizeInBits(0);
17990 APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
17991 const Value *PtrA1 =
17992 Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
17993
17994 while (++It != End) {
17995 if (It->isDebugOrPseudoInst())
17996 continue;
17997 if (MaxLookupDist-- == 0)
17998 break;
17999 if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
18000 const Value *PtrB1 =
18001 SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
18002 DL, OffsetB);
18003 if (PtrA1 == PtrB1 &&
18004 (OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
18005 .abs() == 16)
18006 return true;
18007 }
18008 }
18009
18010 return false;
18011}
18012
18013/// Lower an interleaved store into a stN intrinsic.
18014///
18015/// E.g. Lower an interleaved store (Factor = 3):
18016/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
18017/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
18018/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18019///
18020/// Into:
18021/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
18022/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
18023/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
18024/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18025///
18026/// Note that the new shufflevectors will be removed and we'll only generate one
18027/// st3 instruction in CodeGen.
18028///
18029/// Example for a more general valid mask (Factor 3). Lower:
18030/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
18031/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
18032/// store <12 x i32> %i.vec, <12 x i32>* %ptr
18033///
18034/// Into:
18035/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
18036/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
18037/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
18038/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
18039bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
18040 Value *LaneMask,
18041 ShuffleVectorInst *SVI,
18042 unsigned Factor,
18043 const APInt &GapMask) const {
18044
18045 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18046 "Invalid interleave factor");
18047 auto *SI = dyn_cast<StoreInst>(Store);
18048 if (!SI)
18049 return false;
18050 assert(!LaneMask && GapMask.popcount() == Factor &&
18051 "Unexpected mask on store");
18052
18053 auto *VecTy = cast<FixedVectorType>(SVI->getType());
18054 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18055
18056 unsigned LaneLen = VecTy->getNumElements() / Factor;
18057 Type *EltTy = VecTy->getElementType();
18058 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
18059
18060 const DataLayout &DL = SI->getDataLayout();
18061 bool UseScalable;
18062
18063 // Skip if we do not have NEON and skip illegal vector types. We can
18064 // "legalize" wide vector types into multiple interleaved accesses as long as
18065 // the vector types are divisible by 128.
18066 if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18067 return false;
18068
18069 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
18070
18071 Value *Op0 = SVI->getOperand(0);
18072 Value *Op1 = SVI->getOperand(1);
18073 IRBuilder<> Builder(SI);
18074
18075 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
18076 // vectors to integer vectors.
18077 if (EltTy->isPointerTy()) {
18078 Type *IntTy = DL.getIntPtrType(EltTy);
18079 unsigned NumOpElts =
18080 cast<FixedVectorType>(Op0->getType())->getNumElements();
18081
18082 // Convert to the corresponding integer vector.
18083 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
18084 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
18085 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
18086
18087 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
18088 }
18089
18090 // If we're going to generate more than one store, reset the lane length
18091 // and sub-vector type to something legal.
18092 LaneLen /= NumStores;
18093 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18094
18095 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
18096 : SubVecTy;
18097
18098 // The base address of the store.
18099 Value *BaseAddr = SI->getPointerOperand();
18100
18101 auto Mask = SVI->getShuffleMask();
18102
18103 // Sanity check: bail out if all the indices are out of range.
18104 // If the mask is `poison`, `Mask` may be a vector of -1s.
18105 // If all of them are `poison`, an out-of-bounds read would happen later.
18106 if (llvm::all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
18107 return false;
18108 }
18109 // A 64-bit st2 which does not start at element 0 will involve adding extra
18110 // ext elements making the st2 unprofitable, and if there is a nearby store
18111 // that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
18112 // zip;ldp pair which has higher throughput.
18113 if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
18114 (Mask[0] != 0 ||
18115 hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
18116 DL) ||
18117 hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
18118 BaseAddr, DL)))
18119 return false;
18120
18121 Type *PtrTy = SI->getPointerOperandType();
18122 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
18123 STVTy->getElementCount());
18124
18125 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18126 UseScalable, STVTy, PtrTy);
18127
18128 Value *PTrue = nullptr;
18129 if (UseScalable) {
18130 std::optional<unsigned> PgPattern =
18131 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
18132 if (Subtarget->getMinSVEVectorSizeInBits() ==
18133 Subtarget->getMaxSVEVectorSizeInBits() &&
18134 Subtarget->getMinSVEVectorSizeInBits() ==
18135 DL.getTypeSizeInBits(SubVecTy))
18136 PgPattern = AArch64SVEPredPattern::all;
18137
18138 auto *PTruePat =
18139 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
18140 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
18141 {PTruePat});
18142 }
18143
18144 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
18145
18146 SmallVector<Value *, 4> Ops;
18147
18148 // Split the shufflevector operands into sub vectors for the new stN call.
18149 for (unsigned i = 0; i < Factor; i++) {
18150 Value *Shuffle;
18151 unsigned IdxI = StoreCount * LaneLen * Factor + i;
18152 if (Mask[IdxI] >= 0) {
18153 Shuffle = Builder.CreateShuffleVector(
18154 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
18155 } else {
18156 unsigned StartMask = 0;
18157 for (unsigned j = 1; j < LaneLen; j++) {
18158 unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
18159 if (Mask[IdxJ] >= 0) {
18160 StartMask = Mask[IdxJ] - j;
18161 break;
18162 }
18163 }
18164 // Note: Filling undef gaps with random elements is ok, since
18165 // those elements were being written anyway (with undefs).
18166 // In the case of all undefs, we default to using elements from 0.
18167 // Note: StartMask cannot be negative, it's checked in
18168 // isReInterleaveMask
18169 Shuffle = Builder.CreateShuffleVector(
18170 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
18171 }
18172
18173 if (UseScalable)
18174 Shuffle = Builder.CreateInsertVector(STVTy, PoisonValue::get(STVTy),
18175 Shuffle, uint64_t(0));
18176
18177 Ops.push_back(Shuffle);
18178 }
18179
18180 if (UseScalable)
18181 Ops.push_back(PTrue);
18182
18183 // If we're generating more than one store, compute the base address of
18184 // subsequent stores as an offset from the previous.
18185 if (StoreCount > 0)
18186 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
18187 BaseAddr, LaneLen * Factor);
18188
18189 Ops.push_back(BaseAddr);
18190 Builder.CreateCall(StNFunc, Ops);
18191 }
18192 return true;
18193}
18194
18195bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
18196 Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
18197 const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
18198 if (Factor != 2 && Factor != 3 && Factor != 4) {
18199 LLVM_DEBUG(dbgs() << "Matching ld2, ld3 and ld4 patterns failed\n");
18200 return false;
18201 }
18202 auto *LI = dyn_cast<LoadInst>(Load);
18203 if (!LI)
18204 return false;
18205 assert(!Mask && "Unexpected mask on a load\n");
18206
18208
18209 const DataLayout &DL = LI->getModule()->getDataLayout();
18210 bool UseScalable;
18211 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18212 return false;
18213
18214 // TODO: Add support for using SVE instructions with fixed types later, using
18215 // the code from lowerInterleavedLoad to obtain the correct container type.
18216 if (UseScalable && !VTy->isScalableTy())
18217 return false;
18218
18219 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
18220 VectorType *LdTy =
18221 VectorType::get(VTy->getElementType(),
18222 VTy->getElementCount().divideCoefficientBy(NumLoads));
18223
18224 Type *PtrTy = LI->getPointerOperandType();
18225 Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
18226 UseScalable, LdTy, PtrTy);
18227
18228 IRBuilder<> Builder(LI);
18229 Value *Pred = nullptr;
18230 if (UseScalable)
18231 Pred =
18232 Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
18233
18234 Value *BaseAddr = LI->getPointerOperand();
18235 Value *Result = nullptr;
18236 if (NumLoads > 1) {
18237 // Create multiple legal small ldN.
18238 SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
18239 for (unsigned I = 0; I < NumLoads; ++I) {
18240 Value *Offset = Builder.getInt64(I * Factor);
18241
18242 Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
18243 Value *LdN = nullptr;
18244 if (UseScalable)
18245 LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
18246 else
18247 LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
18248 Value *Idx =
18249 Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
18250 for (unsigned J = 0; J < Factor; ++J) {
18251 ExtractedLdValues[J] = Builder.CreateInsertVector(
18252 VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
18253 }
18254 LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
18255 }
18256
18257 // Merge the values from different factors.
18258 Result = PoisonValue::get(DI->getType());
18259 for (unsigned J = 0; J < Factor; ++J)
18260 Result = Builder.CreateInsertValue(Result, ExtractedLdValues[J], J);
18261 } else {
18262 if (UseScalable)
18263 Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
18264 else
18265 Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
18266 }
18267
18268 // Replace output of deinterleave2 intrinsic by output of ldN2/ldN4
18269 DI->replaceAllUsesWith(Result);
18270 return true;
18271}
18272
18273bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
18274 Instruction *Store, Value *Mask,
18275 ArrayRef<Value *> InterleavedValues) const {
18276 unsigned Factor = InterleavedValues.size();
18277 if (Factor != 2 && Factor != 3 && Factor != 4) {
18278 LLVM_DEBUG(dbgs() << "Matching st2, st3 and st4 patterns failed\n");
18279 return false;
18280 }
18281 auto *SI = dyn_cast<StoreInst>(Store);
18282 if (!SI)
18283 return false;
18284 assert(!Mask && "Unexpected mask on plain store");
18285
18286 VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
18287 const DataLayout &DL = SI->getModule()->getDataLayout();
18288
18289 bool UseScalable;
18290 if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
18291 return false;
18292
18293 // TODO: Add support for using SVE instructions with fixed types later, using
18294 // the code from lowerInterleavedStore to obtain the correct container type.
18295 if (UseScalable && !VTy->isScalableTy())
18296 return false;
18297
18298 unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
18299
18300 VectorType *StTy =
18301 VectorType::get(VTy->getElementType(),
18302 VTy->getElementCount().divideCoefficientBy(NumStores));
18303
18304 Type *PtrTy = SI->getPointerOperandType();
18305 Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
18306 UseScalable, StTy, PtrTy);
18307
18308 IRBuilder<> Builder(SI);
18309
18310 Value *BaseAddr = SI->getPointerOperand();
18311 Value *Pred = nullptr;
18312
18313 if (UseScalable)
18314 Pred =
18315 Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
18316
18317 auto ExtractedValues = InterleavedValues;
18318 SmallVector<Value *, 4> StoreOperands(InterleavedValues);
18319 if (UseScalable)
18320 StoreOperands.push_back(Pred);
18321 StoreOperands.push_back(BaseAddr);
18322 for (unsigned I = 0; I < NumStores; ++I) {
18323 Value *Address = BaseAddr;
18324 if (NumStores > 1) {
18325 Value *Offset = Builder.getInt64(I * Factor);
18326 Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
18327 Value *Idx =
18328 Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
18329 for (unsigned J = 0; J < Factor; J++) {
18330 StoreOperands[J] =
18331 Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
18332 }
18333 // update the address
18334 StoreOperands[StoreOperands.size() - 1] = Address;
18335 }
18336 Builder.CreateCall(StNFunc, StoreOperands);
18337 }
18338 return true;
18339}
18340
18341EVT AArch64TargetLowering::getOptimalMemOpType(
18342 LLVMContext &Context, const MemOp &Op,
18343 const AttributeList &FuncAttributes) const {
18344 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18345 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18346 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18347 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18348 // taken one instruction to materialize the v2i64 zero and one store (with
18349 // restrictive addressing mode). Just do i64 stores.
18350 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18351 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18352 if (Op.isAligned(AlignCheck))
18353 return true;
18354 unsigned Fast;
18355 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18356 MachineMemOperand::MONone, &Fast) &&
18357 Fast;
18358 };
18359
18360 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18361 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
18362 return MVT::v16i8;
18363 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18364 return MVT::f128;
18365 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18366 return MVT::i64;
18367 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18368 return MVT::i32;
18369 return MVT::Other;
18370}
18371
18372LLT AArch64TargetLowering::getOptimalMemOpLLT(
18373 const MemOp &Op, const AttributeList &FuncAttributes) const {
18374 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
18375 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
18376 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
18377 // Only use AdvSIMD to implement memset of 32-byte and above. It would have
18378 // taken one instruction to materialize the v2i64 zero and one store (with
18379 // restrictive addressing mode). Just do i64 stores.
18380 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
18381 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
18382 if (Op.isAligned(AlignCheck))
18383 return true;
18384 unsigned Fast;
18385 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
18386 MachineMemOperand::MONone, &Fast) &&
18387 Fast;
18388 };
18389
18390 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
18391 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
18392 return LLT::fixed_vector(2, 64);
18393 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
18394 return LLT::scalar(128);
18395 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
18396 return LLT::scalar(64);
18397 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
18398 return LLT::scalar(32);
18399 return LLT();
18400}
18401
18402// 12-bit optionally shifted immediates are legal for adds.
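// For example, 0xabc and 0xabc000 (0xabc << 12) are legal add immediates,
// whereas 0xabcd or 0x123456 are not and must be materialized separately.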
18403bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
18404 if (Immed == std::numeric_limits<int64_t>::min()) {
18405 return false;
18406 }
18407 // Same encoding for add/sub, just flip the sign.
18408 return isLegalArithImmed((uint64_t)std::abs(Immed));
18409}
18410
18411bool AArch64TargetLowering::isLegalAddScalableImmediate(int64_t Imm) const {
18412 // We will only emit addvl/inc* instructions for SVE2.
18413 if (!Subtarget->hasSVE2())
18414 return false;
18415
18416 // addvl's immediates are in terms of the number of bytes in a register.
18417 // Since there are 16 in the base supported size (128bits), we need to
18418 // divide the immediate by that much to give us a useful immediate to
18419 // multiply by vscale. We can't have a remainder as a result of this.
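// For example, a scalable offset of 64 * vscale bytes maps to 'addvl x0, x0, #4'
// (64 / 16 == 4); the resulting multiplier must fit in a signed 6-bit field.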
18420 if (Imm % 16 == 0)
18421 return isInt<6>(Imm / 16);
18422
18423 // Inc[b|h|w|d] instructions take a pattern and a positive immediate
18424 // multiplier. For now, assume a pattern of 'all'. Incb would be a subset
18425 // of addvl as a result, so only take h|w|d into account.
18426 // Dec[h|w|d] will cover subtractions.
18427 // Immediates are in the range [1,16], so we can't do a 2's complement check.
18428 // FIXME: Can we make use of other patterns to cover other immediates?
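// For example, 40 * vscale bytes is not a multiple of 16 but equals 5 * 8, so it
// can be handled by 'inch xN, all, mul #5'; -12 * vscale bytes maps to
// 'decw xN, all, mul #3'.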
18429
18430 // inch|dech
18431 if (Imm % 8 == 0)
18432 return std::abs(Imm / 8) <= 16;
18433 // incw|decw
18434 if (Imm % 4 == 0)
18435 return std::abs(Imm / 4) <= 16;
18436 // incd|decd
18437 if (Imm % 2 == 0)
18438 return std::abs(Imm / 2) <= 16;
18439
18440 return false;
18441}
18442
18443// Return false to prevent folding
18444// (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
18445// if the folding leads to worse code.
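// For example, with c1 == 1 and c2 == 0x111111 the fold is rejected: c1 is a
// legal add immediate, but c1*c2 == 0x111111 needs a MOVZ plus a MOVK to
// materialize.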
18446bool AArch64TargetLowering::isMulAddWithConstProfitable(
18447 SDValue AddNode, SDValue ConstNode) const {
18448 // Let the DAGCombiner decide for vector types and large types.
18449 const EVT VT = AddNode.getValueType();
18450 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
18451 return true;
18452
18453 // It is worse if c1 is a legal add immediate while c1*c2 is not, and the
18454 // product has to be composed of at least two instructions.
18455 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
18456 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
18457 const int64_t C1 = C1Node->getSExtValue();
18458 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
18459 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
18460 return true;
18461 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
18462 // Adapt to the width of a register.
18463 unsigned BitSize = VT.getSizeInBits() <= 32 ? 32 : 64;
18464 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), BitSize, Insn);
18465 if (Insn.size() > 1)
18466 return false;
18467
18468 // Default to true and let the DAGCombiner decide.
18469 return true;
18470}
18471
18472// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
18473// immediates is the same as for an add or a sub.
18474bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
18475 return isLegalAddImmediate(Immed);
18476}
18477
18478/// isLegalAddressingMode - Return true if the addressing mode represented
18479/// by AM is legal for this target, for a load/store of the specified type.
18480bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
18481 const AddrMode &AMode, Type *Ty,
18482 unsigned AS, Instruction *I) const {
18483 // AArch64 has five basic addressing modes:
18484 // reg
18485 // reg + 9-bit signed offset
18486 // reg + SIZE_IN_BYTES * 12-bit unsigned offset
18487 // reg1 + reg2
18488 // reg + SIZE_IN_BYTES * reg
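// For example, for an i64 access both [x0, #32760] (8 * 4095) and
// [x0, x1, lsl #3] are legal, whereas 'x0 + x1 + 8' is not.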
18489
18490 // No global is ever allowed as a base.
18491 if (AMode.BaseGV)
18492 return false;
18493
18494 // No reg+reg+imm addressing.
18495 if (AMode.HasBaseReg && AMode.BaseOffs && AMode.Scale)
18496 return false;
18497
18498 // Canonicalise `1*ScaledReg + imm` into `BaseReg + imm` and
18499 // `2*ScaledReg` into `BaseReg + ScaledReg`
18500 AddrMode AM = AMode;
18501 if (AM.Scale && !AM.HasBaseReg) {
18502 if (AM.Scale == 1) {
18503 AM.HasBaseReg = true;
18504 AM.Scale = 0;
18505 } else if (AM.Scale == 2) {
18506 AM.HasBaseReg = true;
18507 AM.Scale = 1;
18508 } else {
18509 return false;
18510 }
18511 }
18512
18513 // A base register is required in all addressing modes.
18514 if (!AM.HasBaseReg)
18515 return false;
18516
18517 if (Ty->isScalableTy()) {
18518 if (isa<ScalableVectorType>(Ty)) {
18519 // See if we have a foldable vscale-based offset, for vector types which
18520 // are either legal or smaller than the minimum; more work will be
18521 // required if we need to consider addressing for types which need
18522 // legalization by splitting.
18523 uint64_t VecNumBytes = DL.getTypeSizeInBits(Ty).getKnownMinValue() / 8;
18524 if (AM.HasBaseReg && !AM.BaseOffs && AM.ScalableOffset && !AM.Scale &&
18525 (AM.ScalableOffset % VecNumBytes == 0) && VecNumBytes <= 16 &&
18526 isPowerOf2_64(VecNumBytes))
18527 return isInt<4>(AM.ScalableOffset / (int64_t)VecNumBytes);
18528
18529 uint64_t VecElemNumBytes =
18530 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
18531 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset &&
18532 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
18533 }
18534
18535 return AM.HasBaseReg && !AM.BaseOffs && !AM.ScalableOffset && !AM.Scale;
18536 }
18537
18538 // No scalable offsets allowed for non-scalable types.
18539 if (AM.ScalableOffset)
18540 return false;
18541
18542 // check reg + imm case:
18543 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
18544 uint64_t NumBytes = 0;
18545 if (Ty->isSized()) {
18546 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
18547 NumBytes = NumBits / 8;
18548 if (!isPowerOf2_64(NumBits))
18549 NumBytes = 0;
18550 }
18551
18552 return Subtarget->getInstrInfo()->isLegalAddressingMode(NumBytes, AM.BaseOffs,
18553 AM.Scale);
18554}
18555
18556// Check whether the two offsets belong to the same imm24 range and their high
18557// 12 bits are the same; if so, the common high part can be encoded in an add immediate.
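// For example, MinOffset = 0x3008 and MaxOffset = 0x3ff0 share the high part
// 0x3000, which is itself a legal add immediate, so 0x3000 is returned and the
// accesses can be rebased onto base + 0x3000.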
18558int64_t
18559AArch64TargetLowering::getPreferredLargeGEPBaseOffset(int64_t MinOffset,
18560 int64_t MaxOffset) const {
18561 int64_t HighPart = MinOffset & ~0xfffULL;
18562 if (MinOffset >> 12 == MaxOffset >> 12 && isLegalAddImmediate(HighPart)) {
18563 // Rebase the value to an integer multiple of imm12.
18564 return HighPart;
18565 }
18566
18567 return 0;
18568}
18569
18570bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
18571 // Consider splitting a large offset of a struct or array.
18572 return true;
18573}
18574
18575bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
18576 const MachineFunction &MF, EVT VT) const {
18577 EVT ScalarVT = VT.getScalarType();
18578
18579 if (!ScalarVT.isSimple())
18580 return false;
18581
18582 switch (ScalarVT.getSimpleVT().SimpleTy) {
18583 case MVT::f16:
18584 return Subtarget->hasFullFP16();
18585 case MVT::f32:
18586 case MVT::f64:
18587 return true;
18588 case MVT::bf16:
18589 return VT.isScalableVector() && Subtarget->hasBF16() &&
18590 Subtarget->isNonStreamingSVEorSME2Available();
18591 default:
18592 break;
18593 }
18594
18595 return false;
18596}
18597
18598bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
18599 Type *Ty) const {
18600 switch (Ty->getScalarType()->getTypeID()) {
18601 case Type::FloatTyID:
18602 case Type::DoubleTyID:
18603 return true;
18604 default:
18605 return false;
18606 }
18607}
18608
18609bool AArch64TargetLowering::generateFMAsInMachineCombiner(
18610 EVT VT, CodeGenOptLevel OptLevel) const {
18611 return (OptLevel >= CodeGenOptLevel::Aggressive) && !VT.isScalableVector() &&
18612 !useSVEForFixedLengthVectorVT(VT);
18613}
18614
18615const MCPhysReg *
18616AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
18617 // LR is a callee-save register, but we must treat it as clobbered by any call
18618 // site. Hence we include LR in the scratch registers, which are in turn added
18619 // as implicit-defs for stackmaps and patchpoints.
18620 static const MCPhysReg ScratchRegs[] = {
18621 AArch64::X16, AArch64::X17, AArch64::LR, 0
18622 };
18623 return ScratchRegs;
18624}
18625
18626ArrayRef<MCPhysReg> AArch64TargetLowering::getRoundingControlRegisters() const {
18627 static const MCPhysReg RCRegs[] = {AArch64::FPCR};
18628 return RCRegs;
18629}
18630
18631bool
18632AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
18633 CombineLevel Level) const {
18634 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
18635 N->getOpcode() == ISD::SRL) &&
18636 "Expected shift op");
18637
18638 SDValue ShiftLHS = N->getOperand(0);
18639 EVT VT = N->getValueType(0);
18640
18641 if (!ShiftLHS->hasOneUse())
18642 return false;
18643
18644 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
18645 !ShiftLHS.getOperand(0)->hasOneUse())
18646 return false;
18647
18648 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not
18649 // combine it with shift 'N' to let it be lowered to UBFX except:
18650 // ((x >> C) & mask) << C.
18651 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
18652 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
18653 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
18654 if (isMask_64(TruncMask)) {
18655 SDValue AndLHS = ShiftLHS.getOperand(0);
18656 if (AndLHS.getOpcode() == ISD::SRL) {
18657 if (auto *SRLC = dyn_cast<ConstantSDNode>(AndLHS.getOperand(1))) {
18658 if (N->getOpcode() == ISD::SHL)
18659 if (auto *SHLC = dyn_cast<ConstantSDNode>(N->getOperand(1)))
18660 return SRLC->getZExtValue() == SHLC->getZExtValue();
18661 return false;
18662 }
18663 }
18664 }
18665 }
18666 return true;
18667}
18668
18669bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
18670 const SDNode *N) const {
18671 assert(N->getOpcode() == ISD::XOR &&
18672 (N->getOperand(0).getOpcode() == ISD::SHL ||
18673 N->getOperand(0).getOpcode() == ISD::SRL) &&
18674 "Expected XOR(SHIFT) pattern");
18675
18676 // Only commute if the entire NOT mask is a hidden shifted mask.
18677 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
18678 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18679 if (XorC && ShiftC) {
18680 unsigned MaskIdx, MaskLen;
18681 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
18682 unsigned ShiftAmt = ShiftC->getZExtValue();
18683 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
18684 if (N->getOperand(0).getOpcode() == ISD::SHL)
18685 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
18686 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
18687 }
18688 }
18689
18690 return false;
18691}
18692
18693bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
18694 const SDNode *N) const {
18695 assert(((N->getOpcode() == ISD::SHL &&
18696 N->getOperand(0).getOpcode() == ISD::SRL) ||
18697 (N->getOpcode() == ISD::SRL &&
18698 N->getOperand(0).getOpcode() == ISD::SHL)) &&
18699 "Expected shift-shift mask");
18700 // Don't allow multiuse shift folding with the same shift amount.
18701 if (!N->getOperand(0)->hasOneUse())
18702 return false;
18703
18704 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
18705 EVT VT = N->getValueType(0);
18706 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
18707 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
18708 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
18709 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
18710 }
18711
18712 // We do not need to fold when this shift is used in the specific load case:
18713 // (ldr x, (add x, (shl (srl x, c1) 2)))
18714 if (N->getOpcode() == ISD::SHL && N->hasOneUse()) {
18715 if (auto C2 = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
18716 unsigned ShlAmt = C2->getZExtValue();
18717 if (auto ShouldADD = *N->user_begin();
18718 ShouldADD->getOpcode() == ISD::ADD && ShouldADD->hasOneUse()) {
18719 if (auto Load = dyn_cast<LoadSDNode>(*ShouldADD->user_begin())) {
18720 EVT MemVT = Load->getMemoryVT();
18721
18722 if (Load->getValueType(0).isScalableVector())
18723 return (8ULL << ShlAmt) != MemVT.getScalarSizeInBits();
18724
18725 if (isIndexedLoadLegal(ISD::PRE_INC, MemVT))
18726 return (8ULL << ShlAmt) != MemVT.getFixedSizeInBits();
18727 }
18728 }
18729 }
18730 }
18731
18732 return true;
18733}
18734
18735 bool AArch64TargetLowering::shouldFoldSelectWithIdentityConstant(
18736 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
18737 SDValue Y) const {
18738 return VT.isScalableVector() && isTypeLegal(VT) &&
18739 SelectOpcode == ISD::VSELECT;
18740}
18741
18742 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
18743 Type *Ty) const {
18744 assert(Ty->isIntegerTy());
18745
18746 unsigned BitSize = Ty->getPrimitiveSizeInBits();
18747 if (BitSize == 0)
18748 return false;
18749
18750 int64_t Val = Imm.getSExtValue();
18751 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
18752 return true;
18753
18754 if (Val < 0)
18755 Val = ~Val;
18756 if (BitSize == 32)
18757 Val &= (1LL << 32) - 1;
18758
18759 unsigned Shift = llvm::Log2_64((uint64_t)Val) / 16;
18760 // MOVZ is free so return true for one or fewer MOVK.
18761 return Shift < 3;
18762}
18763
18764 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
18765 unsigned Index) const {
18766 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
18767 return false;
18768
18769 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
18770}
18771
18772/// Turn vector tests of the signbit in the form of:
18773/// xor (sra X, elt_size(X)-1), -1
18774/// into:
18775/// cmge X, X, #0
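/// Worked example (editorial, illustrative): for a v4i32 X, each lane of
/// xor(sra(X, 31), -1) is NOT(sign-smeared X), i.e. all-ones exactly when the
/// lane is non-negative; that is the same lane mask as the setcc(X, 0, SETGE)
/// built below, which selects CMGE X, X, #0.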
18776 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
18777 const AArch64Subtarget *Subtarget) {
18778 EVT VT = N->getValueType(0);
18779 if (!Subtarget->hasNEON() || !VT.isVector())
18780 return SDValue();
18781
18782 // There must be a shift right algebraic before the xor, and the xor must be a
18783 // 'not' operation.
18784 SDValue Shift = N->getOperand(0);
18785 SDValue Ones = N->getOperand(1);
18786 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
18787 !ISD::isBuildVectorAllOnes(Ones.getNode()))
18788 return SDValue();
18789
18790 // The shift should be smearing the sign bit across each vector element.
18791 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
18792 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
18793 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
18794 return SDValue();
18795
18796 SDLoc DL(N);
18797 SDValue Zero = DAG.getConstant(0, DL, Shift.getValueType());
18798 return DAG.getSetCC(DL, VT, Shift.getOperand(0), Zero, ISD::SETGE);
18799}
18800
18801// Given a vecreduce_add node, detect the below pattern and convert it to the
18802 // node sequence with UABDL, [S|U]ABD and UADDLP.
18803//
18804// i32 vecreduce_add(
18805// v16i32 abs(
18806// v16i32 sub(
18807// v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
18808//
18809// or
18810//
18811// i32 vecreduce_add(
18812// v16i32 zext(
18813// v16i16 abs(
18814// v16i16 sub(
18815// v16i16 [sign|zero]_extend(v16i8 a), v16i16 [sign|zero]_extend(v16i8 b))))
18816//
18817// =================>
18818// i32 vecreduce_add(
18819// v4i32 UADDLP(
18820// v8i16 add(
18821// v8i16 zext(
18822// v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
18823// v8i16 zext(
18824// v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
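//
// Editorial note: the rewrite is lossless because each [S|U]ABD lane is at
// most 255, so the v8i16 add of the two zero-extended ABD halves is at most
// 510 per lane and cannot overflow before the UADDLP/VECREDUCE_ADD steps.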
18825 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
18826 SelectionDAG &DAG) {
18827 // Assumed i32 vecreduce_add
18828 if (N->getValueType(0) != MVT::i32)
18829 return SDValue();
18830
18831 SDValue VecReduceOp0 = N->getOperand(0);
18832 bool SawTrailingZext = false;
18833 // Look through an optional post-ABS ZEXT from v16i16 -> v16i32.
18834 if (VecReduceOp0.getOpcode() == ISD::ZERO_EXTEND &&
18835 VecReduceOp0->getValueType(0) == MVT::v16i32 &&
18836 VecReduceOp0->getOperand(0)->getOpcode() == ISD::ABS &&
18837 VecReduceOp0->getOperand(0)->getValueType(0) == MVT::v16i16) {
18838 SawTrailingZext = true;
18839 VecReduceOp0 = VecReduceOp0.getOperand(0);
18840 }
18841
18842 // The ABS input is v16i16 if we looked through such a zext, else v16i32.
18843 MVT AbsInputVT = SawTrailingZext ? MVT::v16i16 : MVT::v16i32;
18844 // Assumed v16i16 or v16i32 abs input
18845 unsigned Opcode = VecReduceOp0.getOpcode();
18846 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != AbsInputVT)
18847 return SDValue();
18848
18849 SDValue ABS = VecReduceOp0;
18850 // Assumed v16i16 or v16i32 sub
18851 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
18852 ABS->getOperand(0)->getValueType(0) != AbsInputVT)
18853 return SDValue();
18854
18855 SDValue SUB = ABS->getOperand(0);
18856 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
18857 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
18858 // Assumed v16i16 or v16i32 type
18859 if (SUB->getOperand(0)->getValueType(0) != AbsInputVT ||
18860 SUB->getOperand(1)->getValueType(0) != AbsInputVT)
18861 return SDValue();
18862
18863 // Assumed zext or sext
18864 bool IsZExt = false;
18865 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
18866 IsZExt = true;
18867 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
18868 IsZExt = false;
18869 } else
18870 return SDValue();
18871
18872 SDValue EXT0 = SUB->getOperand(0);
18873 SDValue EXT1 = SUB->getOperand(1);
18874 // Assumed zext's operand has v16i8 type
18875 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
18876 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
18877 return SDValue();
18878
18879 // Pattern is detected. Let's convert it to sequence of nodes.
18880 SDLoc DL(N);
18881
18882 // First, create the node pattern of UABD/SABD.
18883 SDValue UABDHigh8Op0 =
18884 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18885 DAG.getConstant(8, DL, MVT::i64));
18886 SDValue UABDHigh8Op1 =
18887 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18888 DAG.getConstant(8, DL, MVT::i64));
18889 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18890 UABDHigh8Op0, UABDHigh8Op1);
18891 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
18892
18893 // Second, create the node pattern of UABAL.
18894 SDValue UABDLo8Op0 =
18895 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
18896 DAG.getConstant(0, DL, MVT::i64));
18897 SDValue UABDLo8Op1 =
18898 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
18899 DAG.getConstant(0, DL, MVT::i64));
18900 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
18901 UABDLo8Op0, UABDLo8Op1);
18902 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
18903 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
18904
18905 // Third, create the node of UADDLP.
18906 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
18907
18908 // Fourth, create the node of VECREDUCE_ADD.
18909 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
18910}
18911
18912static SDValue
18913 performActiveLaneMaskCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
18914 const AArch64Subtarget *ST) {
18915 if (DCI.isBeforeLegalize())
18916 return SDValue();
18917
18918 if (SDValue While = optimizeIncrementingWhile(N, DCI.DAG, /*IsSigned=*/false,
18919 /*IsEqual=*/false))
18920 return While;
18921
18922 if (!N->getValueType(0).isScalableVector() ||
18923 (!ST->hasSVE2p1() && !(ST->hasSME2() && ST->isStreaming())))
18924 return SDValue();
18925
18926 // Count the number of users which are EXTRACT_SUBVECTOR nodes.
18927 unsigned NumExts = count_if(N->users(), [](SDNode *Use) {
18928 return Use->getOpcode() == ISD::EXTRACT_SUBVECTOR;
18929 });
18930
18931 auto MaskEC = N->getValueType(0).getVectorElementCount();
18932 if (!MaskEC.isKnownMultipleOf(NumExts))
18933 return SDValue();
18934
18935 ElementCount ExtMinEC = MaskEC.divideCoefficientBy(NumExts);
18936 if (ExtMinEC.getKnownMinValue() < 2)
18937 return SDValue();
18938
18939 SmallVector<SDNode *> Extracts(NumExts, nullptr);
18940 for (SDNode *Use : N->users()) {
18941 if (Use->getOpcode() != ISD::EXTRACT_SUBVECTOR)
18942 continue;
18943
18944 // Ensure the extract type is correct (e.g. if NumExts is 4 and
18945 // the mask return type is nxv8i1, each extract should be nxv2i1).
18946 if (Use->getValueType(0).getVectorElementCount() != ExtMinEC)
18947 return SDValue();
18948
18949 // There should be exactly one extract for each part of the mask.
18950 unsigned Offset = Use->getConstantOperandVal(1);
18951 unsigned Part = Offset / ExtMinEC.getKnownMinValue();
18952 if (Extracts[Part] != nullptr)
18953 return SDValue();
18954
18955 Extracts[Part] = Use;
18956 }
18957
18958 SelectionDAG &DAG = DCI.DAG;
18959 SDLoc DL(N);
18960 SDValue ID =
18961 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo_x2, DL, MVT::i64);
18962
18963 SDValue Idx = N->getOperand(0);
18964 SDValue TC = N->getOperand(1);
18965 if (Idx.getValueType() != MVT::i64) {
18966 Idx = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Idx);
18967 TC = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, TC);
18968 }
18969
18970 // Create the whilelo_x2 intrinsics from each pair of extracts
18971 EVT ExtVT = Extracts[0]->getValueType(0);
18972 EVT DoubleExtVT = ExtVT.getDoubleNumVectorElementsVT(*DAG.getContext());
18973 auto R =
18974 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18975 DCI.CombineTo(Extracts[0], R.getValue(0));
18976 DCI.CombineTo(Extracts[1], R.getValue(1));
18977 SmallVector<SDValue> Concats = {DAG.getNode(
18978 ISD::CONCAT_VECTORS, DL, DoubleExtVT, R.getValue(0), R.getValue(1))};
18979
18980 if (NumExts == 2) {
18981 assert(N->getValueType(0) == DoubleExtVT);
18982 return Concats[0];
18983 }
18984
18985 auto Elts =
18986 DAG.getElementCount(DL, MVT::i64, ExtVT.getVectorElementCount() * 2);
18987 for (unsigned I = 2; I < NumExts; I += 2) {
18988 // After the first whilelo_x2, we need to increment the starting value.
18989 Idx = DAG.getNode(ISD::UADDSAT, DL, MVT::i64, Idx, Elts);
18990 R = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, {ExtVT, ExtVT}, {ID, Idx, TC});
18991 DCI.CombineTo(Extracts[I], R.getValue(0));
18992 DCI.CombineTo(Extracts[I + 1], R.getValue(1));
18993 Concats.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, DoubleExtVT,
18994 R.getValue(0), R.getValue(1)));
18995 }
18996
18997 return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), Concats);
18998}
18999
19000// Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
19001// vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
19002// vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
19003// If we have vectors larger than v16i8 we extract v16i8 vectors,
19004 // follow the same steps above to get DOT instructions, concatenate them,
19005// and generate vecreduce.add(concat_vector(DOT, DOT2, ..)).
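// Illustrative example (editorial): for i32 vecreduce.add(zext(v32i8 A)),
// this emits two v4i32 UDOTs, one per v16i8 half of A with B a splat of 1,
// concatenates them into v8i32 and reduces that with a single vecreduce.add.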
19006 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
19007 const AArch64Subtarget *ST) {
19008 if (!ST->isNeonAvailable())
19009 return SDValue();
19010
19011 if (!ST->hasDotProd())
19012 return performVecReduceAddCombineWithUADDLP(N, DAG);
19013
19014 SDValue Op0 = N->getOperand(0);
19015 if (N->getValueType(0) != MVT::i32 || Op0.getValueType().isScalableVT() ||
19016 Op0.getValueType().getVectorElementType() != MVT::i32)
19017 return SDValue();
19018
19019 unsigned ExtOpcode = Op0.getOpcode();
19020 SDValue A = Op0;
19021 SDValue B;
19022 unsigned DotOpcode;
19023 if (ExtOpcode == ISD::MUL) {
19024 A = Op0.getOperand(0);
19025 B = Op0.getOperand(1);
19026 if (A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
19027 return SDValue();
19028 auto OpCodeA = A.getOpcode();
19029 if (OpCodeA != ISD::ZERO_EXTEND && OpCodeA != ISD::SIGN_EXTEND)
19030 return SDValue();
19031
19032 auto OpCodeB = B.getOpcode();
19033 if (OpCodeB != ISD::ZERO_EXTEND && OpCodeB != ISD::SIGN_EXTEND)
19034 return SDValue();
19035
19036 if (OpCodeA == OpCodeB) {
19037 DotOpcode =
19038 OpCodeA == ISD::ZERO_EXTEND ? AArch64ISD::UDOT : AArch64ISD::SDOT;
19039 } else {
19040 // Check USDOT support
19041 if (!ST->hasMatMulInt8())
19042 return SDValue();
19043 DotOpcode = AArch64ISD::USDOT;
19044 if (OpCodeA == ISD::SIGN_EXTEND)
19045 std::swap(A, B);
19046 }
19047 } else if (ExtOpcode == ISD::ZERO_EXTEND) {
19048 DotOpcode = AArch64ISD::UDOT;
19049 } else if (ExtOpcode == ISD::SIGN_EXTEND) {
19050 DotOpcode = AArch64ISD::SDOT;
19051 } else {
19052 return SDValue();
19053 }
19054
19055 EVT Op0VT = A.getOperand(0).getValueType();
19056 bool IsValidElementCount = Op0VT.getVectorNumElements() % 8 == 0;
19057 bool IsValidSize = Op0VT.getScalarSizeInBits() == 8;
19058 if (!IsValidElementCount || !IsValidSize)
19059 return SDValue();
19060
19061 SDLoc DL(Op0);
19062 // For non-mla reductions B can be set to 1. For MLA we take the operand of
19063 // the extend B.
19064 if (!B)
19065 B = DAG.getConstant(1, DL, Op0VT);
19066 else
19067 B = B.getOperand(0);
19068
19069 unsigned IsMultipleOf16 = Op0VT.getVectorNumElements() % 16 == 0;
19070 unsigned NumOfVecReduce;
19071 EVT TargetType;
19072 if (IsMultipleOf16) {
19073 NumOfVecReduce = Op0VT.getVectorNumElements() / 16;
19074 TargetType = MVT::v4i32;
19075 } else {
19076 NumOfVecReduce = Op0VT.getVectorNumElements() / 8;
19077 TargetType = MVT::v2i32;
19078 }
19079 // Handle the case where we need to generate only one Dot operation.
19080 if (NumOfVecReduce == 1) {
19081 SDValue Zeros = DAG.getConstant(0, DL, TargetType);
19082 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
19083 A.getOperand(0), B);
19084 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19085 }
19086 // Generate Dot instructions that are multiple of 16.
19087 unsigned VecReduce16Num = Op0VT.getVectorNumElements() / 16;
19088 SmallVector<SDValue, 4> SDotVec16;
19089 unsigned I = 0;
19090 for (; I < VecReduce16Num; I += 1) {
19091 SDValue Zeros = DAG.getConstant(0, DL, MVT::v4i32);
19092 SDValue Op0 =
19093 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, A.getOperand(0),
19094 DAG.getConstant(I * 16, DL, MVT::i64));
19095 SDValue Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v16i8, B,
19096 DAG.getConstant(I * 16, DL, MVT::i64));
19097 SDValue Dot =
19098 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Op0, Op1);
19099 SDotVec16.push_back(Dot);
19100 }
19101 // Concatenate dot operations.
19102 EVT SDot16EVT =
19103 EVT::getVectorVT(*DAG.getContext(), MVT::i32, 4 * VecReduce16Num);
19104 SDValue ConcatSDot16 =
19105 DAG.getNode(ISD::CONCAT_VECTORS, DL, SDot16EVT, SDotVec16);
19106 SDValue VecReduceAdd16 =
19107 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), ConcatSDot16);
19108 unsigned VecReduce8Num = (Op0VT.getVectorNumElements() % 16) / 8;
19109 if (VecReduce8Num == 0)
19110 return VecReduceAdd16;
19111
19112 // Generate the remainder Dot operation that is multiple of 8.
19113 SDValue Zeros = DAG.getConstant(0, DL, MVT::v2i32);
19114 SDValue Vec8Op0 =
19115 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, A.getOperand(0),
19116 DAG.getConstant(I * 16, DL, MVT::i64));
19117 SDValue Vec8Op1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, B,
19118 DAG.getConstant(I * 16, DL, MVT::i64));
19119 SDValue Dot =
19120 DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros, Vec8Op0, Vec8Op1);
19121 SDValue VecReduceAdd8 =
19122 DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
19123 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), VecReduceAdd16,
19124 VecReduceAdd8);
19125}
19126
19127// Given an (integer) vecreduce, we know the order of the inputs does not
19128// matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
19129// into UADDV(UADDLP(x)). This can also happen through an extra add, where we
19130// transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
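// Worked example (editorial): with x : v16i8 and VT = v8i16,
// add(zext(lo(x)), zext(hi(x))) has lanes x[i] + x[i+8] while UADDLP(x) has
// lanes x[2i] + x[2i+1]; the vectors differ, but both sum to "all bytes of
// x", which is the only thing the enclosing UADDV consumes.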
19131 static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) {
19132 auto DetectAddExtract = [&](SDValue A) {
19133 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
19134 // UADDLP(x) if found.
19135 assert(A.getOpcode() == ISD::ADD);
19136 EVT VT = A.getValueType();
19137 SDValue Op0 = A.getOperand(0);
19138 SDValue Op1 = A.getOperand(1);
19139 if (Op0.getOpcode() != Op1.getOpcode() ||
19140 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
19141 Op0.getOpcode() != ISD::SIGN_EXTEND))
19142 return SDValue();
19143 SDValue Ext0 = Op0.getOperand(0);
19144 SDValue Ext1 = Op1.getOperand(0);
19145 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19146 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
19147 Ext0.getOperand(0) != Ext1.getOperand(0) ||
19149 return SDValue();
19150 // Check that the type is twice the add types, and the extract are from
19151 // upper/lower parts of the same source.
19152 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
19153 VT.getVectorNumElements() * 2)
19154 return SDValue();
19155 if ((Ext0.getConstantOperandVal(1) != 0 ||
19156 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
19157 (Ext1.getConstantOperandVal(1) != 0 ||
19158 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
19159 return SDValue();
19160 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
19161 : AArch64ISD::SADDLP;
19162 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
19163 };
19164
19165 if (SDValue R = DetectAddExtract(A))
19166 return R;
19167
19168 if (A.getOperand(0).getOpcode() == ISD::ADD && A.getOperand(0).hasOneUse())
19169 if (SDValue R = performUADDVAddCombine(A.getOperand(0), DAG))
19170 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19171 A.getOperand(1));
19172 if (A.getOperand(1).getOpcode() == ISD::ADD && A.getOperand(1).hasOneUse())
19173 if (SDValue R = performUADDVAddCombine(A.getOperand(1), DAG))
19174 return DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
19175 A.getOperand(0));
19176 return SDValue();
19177}
19178
19179// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into
19180// UADDLV(concat), where the concat represents the 64-bit zext sources.
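// Worked example (editorial): UADDV(add(zext(v8i8 a), zext(v8i8 b))) sums all
// sixteen bytes of a and b, which is exactly what UADDLV computes on
// concat(a, b) : v16i8; the v8i16 case below re-casts the v4i32 UADDLV result
// with an NVCAST.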
19181 static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) {
19182 // Look for add(zext(64-bit source), zext(64-bit source)), returning
19183 // UADDLV(concat(zext, zext)) if found.
19184 assert(A.getOpcode() == ISD::ADD);
19185 EVT VT = A.getValueType();
19186 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19187 return SDValue();
19188 SDValue Op0 = A.getOperand(0);
19189 SDValue Op1 = A.getOperand(1);
19190 if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode())
19191 return SDValue();
19192 SDValue Ext0 = Op0.getOperand(0);
19193 SDValue Ext1 = Op1.getOperand(0);
19194 EVT ExtVT0 = Ext0.getValueType();
19195 EVT ExtVT1 = Ext1.getValueType();
19196 // Check zext VTs are the same and 64-bit length.
19197 if (ExtVT0 != ExtVT1 ||
19198 VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits()))
19199 return SDValue();
19200 // Get VT for concat of zext sources.
19201 EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext());
19202 SDValue Concat =
19203 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1);
19204
19205 switch (VT.getSimpleVT().SimpleTy) {
19206 case MVT::v2i64:
19207 case MVT::v4i32:
19208 return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat);
19209 case MVT::v8i16: {
19210 SDValue Uaddlv =
19211 DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat);
19212 return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv);
19213 }
19214 default:
19215 llvm_unreachable("Unhandled vector type");
19216 }
19217}
19218
19219 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
19220 SDValue A = N->getOperand(0);
19221 if (A.getOpcode() == ISD::ADD) {
19222 if (SDValue R = performUADDVAddCombine(A, DAG))
19223 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
19224 else if (SDValue R = performUADDVZextCombine(A, DAG))
19225 return R;
19226 }
19227
19228 // uaddv(A) --> A if all lanes of A are known to be zeros except the 0th lane.
19229 MVT OpVT = A.getSimpleValueType();
19230 assert(N->getSimpleValueType(0) == OpVT &&
19231 "The operand type should be consistent with the result type of UADDV");
19232 APInt Mask = APInt::getAllOnes(OpVT.getVectorNumElements());
19233 Mask.clearBit(0);
19234 KnownBits KnownLeadingLanes = DAG.computeKnownBits(A, Mask);
19235 if (KnownLeadingLanes.isZero())
19236 return A;
19237
19238 return SDValue();
19239}
19240
19241 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
19242 TargetLowering::DAGCombinerInfo &DCI,
19243 const AArch64Subtarget *Subtarget) {
19244 if (DCI.isBeforeLegalizeOps())
19245 return SDValue();
19246
19247 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
19248}
19249
19250SDValue
19251AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
19252 SelectionDAG &DAG,
19253 SmallVectorImpl<SDNode *> &Created) const {
19254 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19255 if (isIntDivCheap(N->getValueType(0), Attr))
19256 return SDValue(N, 0); // Lower SDIV as SDIV
19257
19258 EVT VT = N->getValueType(0);
19259
19260 // If SVE is available, we can generate
19261 // sdiv(x,y) -> ptrue + asrd , where 'y' is positive pow-2 divisor.
19262 // sdiv(x,y) -> ptrue + asrd + subr , where 'y' is negative pow-2 divisor.
19263 if (VT.isVector() && Subtarget->isSVEorStreamingSVEAvailable())
19264 return SDValue(N, 0);
19265
19266 // fold (sdiv X, pow2)
19267 if ((VT != MVT::i32 && VT != MVT::i64) ||
19268 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19269 return SDValue();
19270
19271 // If the divisor is 2 or -2, the default expansion is better. It will add
19272 // (N->getValueType(0) >> (BitWidth - 1)) to it before shifting right.
19273 if (Divisor == 2 ||
19274 Divisor == APInt(Divisor.getBitWidth(), -2, /*isSigned*/ true))
19275 return SDValue();
19276
19277 return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
19278}
19279
19280SDValue
19281AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
19282 SelectionDAG &DAG,
19283 SmallVectorImpl<SDNode *> &Created) const {
19284 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
19285 if (isIntDivCheap(N->getValueType(0), Attr))
19286 return SDValue(N, 0); // Lower SREM as SREM
19287
19288 EVT VT = N->getValueType(0);
19289
19290 // For scalable and fixed types, mark them as cheap so we can handle it much
19291 // later. This allows us to handle larger than legal types.
19292 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
19293 return SDValue(N, 0);
19294
19295 // fold (srem X, pow2)
19296 if ((VT != MVT::i32 && VT != MVT::i64) ||
19297 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
19298 return SDValue();
19299
19300 unsigned Lg2 = Divisor.countr_zero();
19301 if (Lg2 == 0)
19302 return SDValue();
19303
19304 SDLoc DL(N);
19305 SDValue N0 = N->getOperand(0);
19306 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
19307 SDValue Zero = DAG.getConstant(0, DL, VT);
19308 SDValue CCVal, CSNeg;
19309 if (Lg2 == 1) {
19310 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
19311 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19312 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
19313
19314 Created.push_back(Cmp.getNode());
19315 Created.push_back(And.getNode());
19316 } else {
19317 SDValue CCVal = getCondCode(DAG, AArch64CC::MI);
19318 SDVTList VTs = DAG.getVTList(VT, FlagsVT);
19319
19320 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
19321 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
19322 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
19323 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
19324 Negs.getValue(1));
19325
19326 Created.push_back(Negs.getNode());
19327 Created.push_back(AndPos.getNode());
19328 Created.push_back(AndNeg.getNode());
19329 }
19330
19331 return CSNeg;
19332}
19333
19334static std::optional<unsigned> IsSVECntIntrinsic(SDValue S) {
19335 switch(getIntrinsicID(S.getNode())) {
19336 default:
19337 break;
19338 case Intrinsic::aarch64_sve_cntb:
19339 return 8;
19340 case Intrinsic::aarch64_sve_cnth:
19341 return 16;
19342 case Intrinsic::aarch64_sve_cntw:
19343 return 32;
19344 case Intrinsic::aarch64_sve_cntd:
19345 return 64;
19346 }
19347 return {};
19348}
19349
19350/// Calculates what the pre-extend type is, based on the extension
19351/// operation node provided by \p Extend.
19352///
19353/// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
19354/// pre-extend type is pulled directly from the operand, while other extend
19355/// operations need a bit more inspection to get this information.
19356///
19357/// \param Extend The SDNode from the DAG that represents the extend operation
19358///
19359/// \returns The type representing the \p Extend source type, or \p MVT::Other
19360/// if no valid type can be determined
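///
/// For example (editorial, illustrative): a ZERO_EXTEND from i8 and an
/// (and X, 0xFF) both yield MVT::i8, while (AssertSext X, i16) yields
/// MVT::i16.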
19361 static EVT calculatePreExtendType(SDValue Extend) {
19362 switch (Extend.getOpcode()) {
19363 case ISD::SIGN_EXTEND:
19364 case ISD::ZERO_EXTEND:
19365 case ISD::ANY_EXTEND:
19366 return Extend.getOperand(0).getValueType();
19367 case ISD::AssertSext:
19368 case ISD::AssertZext:
19369 case ISD::SIGN_EXTEND_INREG: {
19370 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
19371 if (!TypeNode)
19372 return MVT::Other;
19373 return TypeNode->getVT();
19374 }
19375 case ISD::AND: {
19376 ConstantSDNode *Constant =
19377 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
19378 if (!Constant)
19379 return MVT::Other;
19380
19381 uint32_t Mask = Constant->getZExtValue();
19382
19383 if (Mask == UCHAR_MAX)
19384 return MVT::i8;
19385 else if (Mask == USHRT_MAX)
19386 return MVT::i16;
19387 else if (Mask == UINT_MAX)
19388 return MVT::i32;
19389
19390 return MVT::Other;
19391 }
19392 default:
19393 return MVT::Other;
19394 }
19395}
19396
19397/// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
19398/// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
19399/// SExt/ZExt rather than the scalar SExt/ZExt
19400 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
19401 EVT VT = BV.getValueType();
19402 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
19403 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
19404 return SDValue();
19405
19406 // Use the first item in the buildvector/shuffle to get the size of the
19407 // extend, and make sure it looks valid.
19408 SDValue Extend = BV->getOperand(0);
19409 unsigned ExtendOpcode = Extend.getOpcode();
19410 bool IsAnyExt = ExtendOpcode == ISD::ANY_EXTEND;
19411 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
19412 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
19413 ExtendOpcode == ISD::AssertSext;
19414 if (!IsAnyExt && !IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
19415 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
19416 return SDValue();
19417 // Shuffle inputs are vector, limit to SIGN_EXTEND/ZERO_EXTEND/ANY_EXTEND to
19418 // ensure calculatePreExtendType will work without issue.
19419 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
19420 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
19421 return SDValue();
19422
19423 // Restrict valid pre-extend data type
19424 EVT PreExtendType = calculatePreExtendType(Extend);
19425 if (PreExtendType == MVT::Other ||
19426 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
19427 return SDValue();
19428
19429 // Make sure all other operands are equally extended.
19430 bool SeenZExtOrSExt = !IsAnyExt;
19431 for (SDValue Op : drop_begin(BV->ops())) {
19432 if (Op.isUndef())
19433 continue;
19434
19435 if (calculatePreExtendType(Op) != PreExtendType)
19436 return SDValue();
19437
19438 unsigned Opc = Op.getOpcode();
19439 if (Opc == ISD::ANY_EXTEND)
19440 continue;
19441
19442 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
19443 Opc == ISD::AssertSext;
19444
19445 if (SeenZExtOrSExt && OpcIsSExt != IsSExt)
19446 return SDValue();
19447
19448 IsSExt = OpcIsSExt;
19449 SeenZExtOrSExt = true;
19450 }
19451
19452 SDValue NBV;
19453 SDLoc DL(BV);
19454 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
19455 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
19456 EVT PreExtendLegalType =
19457 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
19458 SmallVector<SDValue, 8> NewOps;
19459 for (SDValue Op : BV->ops())
19460 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
19461 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
19462 PreExtendLegalType));
19463 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
19464 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
19465 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
19466 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
19467 BV.getOperand(1).isUndef()
19468 ? DAG.getUNDEF(PreExtendVT)
19469 : BV.getOperand(1).getOperand(0),
19470 cast<ShuffleVectorSDNode>(BV)->getMask());
19471 }
19472 unsigned ExtOpc = !SeenZExtOrSExt
19473 ? ISD::ANY_EXTEND
19474 : (IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND);
19475 return DAG.getNode(ExtOpc, DL, VT, NBV);
19476}
19477
19478/// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
19479/// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
19480 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
19481 // If the value type isn't a vector, none of the operands are going to be dups
19482 EVT VT = Mul->getValueType(0);
19483 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
19484 return SDValue();
19485
19486 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
19487 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
19488
19489 // Neither operands have been changed, don't make any further changes
19490 if (!Op0 && !Op1)
19491 return SDValue();
19492
19493 SDLoc DL(Mul);
19494 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
19495 Op1 ? Op1 : Mul->getOperand(1));
19496}
19497
19498// Multiplying an RDSVL value by a constant can sometimes be done cheaper by
19499// folding a power-of-two factor of the constant into the RDSVL immediate and
19500// compensating with an extra shift.
19501//
19502// We rewrite:
19503// (mul (srl (rdsvl 1), w), x)
19504// to one of:
19505// (shl (rdsvl y), z) if z > 0
19506// (srl (rdsvl y), abs(z)) if z < 0
19507// where integers y, z satisfy x = y * 2^(w + z) and y ∈ [-32, 31].
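// Worked example (editorial): with w = 1 and x = 6, x = 3 * 2^(1 + 0), so
// y = 3 and z = 0, and (mul (srl (rdsvl 1), 1), 6) simplifies to (rdsvl 3)
// with no extra shift. With x = 3 instead, z = -1 and y = 3, giving
// (srl (rdsvl 3), 1).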
19508 static SDValue performMulRdsvlCombine(SDNode *Mul, SelectionDAG &DAG) {
19509 SDLoc DL(Mul);
19510 EVT VT = Mul->getValueType(0);
19511 SDValue MulOp0 = Mul->getOperand(0);
19512 int ConstMultiplier =
19513 cast<ConstantSDNode>(Mul->getOperand(1))->getSExtValue();
19514 if ((MulOp0->getOpcode() != ISD::SRL) ||
19515 (MulOp0->getOperand(0).getOpcode() != AArch64ISD::RDSVL))
19516 return SDValue();
19517
19518 unsigned AbsConstValue = abs(ConstMultiplier);
19519 unsigned OperandShift =
19520 cast<ConstantSDNode>(MulOp0->getOperand(1))->getZExtValue();
19521
19522 // z ≤ ctz(|x|) - w (largest extra shift we can take while keeping y
19523 // integral)
19524 int UpperBound = llvm::countr_zero(AbsConstValue) - OperandShift;
19525
19526 // To keep y in range, with B = 31 for x > 0 and B = 32 for x < 0, we need:
19527 // 2^(w + z) ≥ ceil(x / B) ⇒ z ≥ ceil_log2(ceil(x / B)) - w (LowerBound).
19528 unsigned B = ConstMultiplier < 0 ? 32 : 31;
19529 unsigned CeilAxOverB = (AbsConstValue + (B - 1)) / B; // ceil(|x|/B)
19530 int LowerBound = llvm::Log2_32_Ceil(CeilAxOverB) - OperandShift;
19531
19532 // No valid solution found.
19533 if (LowerBound > UpperBound)
19534 return SDValue();
19535
19536 // Any value of z in [LowerBound, UpperBound] is valid. Prefer no extra
19537 // shift if possible.
19538 int Shift = std::min(std::max(/*prefer*/ 0, LowerBound), UpperBound);
19539
19540 // y = x / 2^(w + z)
19541 int32_t RdsvlMul = (AbsConstValue >> (OperandShift + Shift)) *
19542 (ConstMultiplier < 0 ? -1 : 1);
19543 auto Rdsvl = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
19544 DAG.getSignedConstant(RdsvlMul, DL, MVT::i32));
19545
19546 if (Shift == 0)
19547 return Rdsvl;
19548 return DAG.getNode(Shift < 0 ? ISD::SRL : ISD::SHL, DL, VT, Rdsvl,
19549 DAG.getConstant(abs(Shift), DL, MVT::i32),
19551}
19552
19553// Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
19554// Same for other types with equivalent constants.
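// Editorial note on why this works: viewing each i32 lane as two i16 halves,
// (srl X, 15) & 0x10001 isolates the sign bit of each half, and the multiply
// by 0xffff turns every set sign bit into an all-ones i16 half. That is the
// v8i16 compare 0 > X (CMLT X, #0), which is what the setcc below builds.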
19556 EVT VT = N->getValueType(0);
19557 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
19558 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
19559 return SDValue();
19560 if (N->getOperand(0).getOpcode() != ISD::AND ||
19561 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
19562 return SDValue();
19563
19564 SDValue And = N->getOperand(0);
19565 SDValue Srl = And.getOperand(0);
19566
19567 APInt V1, V2, V3;
19568 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
19569 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
19570 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
19571 return SDValue();
19572
19573 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
19574 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
19575 V3 != (HalfSize - 1))
19576 return SDValue();
19577
19578 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19579 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
19580 VT.getVectorElementCount() * 2);
19581
19582 SDLoc DL(N);
19583 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
19584 SDValue Zero = DAG.getConstant(0, DL, In.getValueType());
19585 SDValue CM = DAG.getSetCC(DL, HalfVT, Zero, In, ISD::SETGT);
19586 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
19587}
19588
19589// Transform vector add(zext i8 to i32, zext i8 to i32)
19590// into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
19591 // This allows extra uses of saddl/uaddl at the lower vector widths, and fewer
19592// extends.
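// Editorial note on soundness: values extended from i8 (or i16) cannot
// overflow the half-width type under a single add or mul (e.g. an unsigned i8
// sum is at most 510 and a product at most 65025), so extending the narrow
// result back up to VT reproduces the original wide result.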
19593 static SDValue performVectorExtCombine(SDNode *N, SelectionDAG &DAG) {
19594 EVT VT = N->getValueType(0);
19595 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
19596 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
19597 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
19598 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
19599 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
19600 N->getOperand(0).getOperand(0).getValueType() !=
19601 N->getOperand(1).getOperand(0).getValueType())
19602 return SDValue();
19603
19604 if (N->getOpcode() == ISD::MUL &&
19605 N->getOperand(0).getOpcode() != N->getOperand(1).getOpcode())
19606 return SDValue();
19607
19608 SDValue N0 = N->getOperand(0).getOperand(0);
19609 SDValue N1 = N->getOperand(1).getOperand(0);
19610 EVT InVT = N0.getValueType();
19611
19612 EVT S1 = InVT.getScalarType();
19613 EVT S2 = VT.getScalarType();
19614 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
19615 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
19616 SDLoc DL(N);
19617 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
19620 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
19621 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
19622 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
19623 return DAG.getNode(N->getOpcode() == ISD::MUL ? N->getOperand(0).getOpcode()
19624 : (unsigned)ISD::SIGN_EXTEND,
19625 DL, VT, NewOp);
19626 }
19627 return SDValue();
19628}
19629
19630 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
19631 TargetLowering::DAGCombinerInfo &DCI,
19632 const AArch64Subtarget *Subtarget) {
19633
19634 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
19635 return Ext;
19637 return Ext;
19638 if (SDValue Ext = performVectorExtCombine(N, DAG))
19639 return Ext;
19640
19641 if (DCI.isBeforeLegalizeOps())
19642 return SDValue();
19643
19644 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
19645 // and in MachineCombiner pass, add+mul will be combined into madd.
19646 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
19647 SDLoc DL(N);
19648 EVT VT = N->getValueType(0);
19649 SDValue N0 = N->getOperand(0);
19650 SDValue N1 = N->getOperand(1);
19651 SDValue MulOper;
19652 unsigned AddSubOpc;
19653
19654 auto IsAddSubWith1 = [&](SDValue V) -> bool {
19655 AddSubOpc = V->getOpcode();
19656 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
19657 SDValue Opnd = V->getOperand(1);
19658 MulOper = V->getOperand(0);
19659 if (AddSubOpc == ISD::SUB)
19660 std::swap(Opnd, MulOper);
19661 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
19662 return C->isOne();
19663 }
19664 return false;
19665 };
19666
19667 if (IsAddSubWith1(N0)) {
19668 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
19669 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
19670 }
19671
19672 if (IsAddSubWith1(N1)) {
19673 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
19674 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
19675 }
19676
19677 // The below optimizations require a constant RHS.
19678 if (!isa<ConstantSDNode>(N1))
19679 return SDValue();
19680
19681 if (SDValue Ext = performMulRdsvlCombine(N, DAG))
19682 return Ext;
19683
19684 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
19685 const APInt &ConstValue = C->getAPIntValue();
19686
19687 // Allow the scaling to be folded into the `cnt` instruction by preventing
19688 // the scaling from being obscured here. This makes it easier to pattern match.
19689 if (IsSVECntIntrinsic(N0) ||
19690 (N0->getOpcode() == ISD::TRUNCATE &&
19691 (IsSVECntIntrinsic(N0->getOperand(0)))))
19692 if (ConstValue.sge(1) && ConstValue.sle(16))
19693 return SDValue();
19694
19695 // Multiplication of a power of two plus/minus one can be done more
19696 // cheaply as shift+add/sub. For now, this is true unilaterally. If
19697 // future CPUs have a cheaper MADD instruction, this may need to be
19698 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
19699 // 64-bit is 5 cycles, so this is always a win.
19700 // More aggressively, some multiplications N0 * C can be lowered to
19701 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
19702 // e.g. 6=3*2=(2+1)*2, 45=(1+4)*(1+8)
19703 // TODO: lower more cases.
19704
19705 // TrailingZeroes is used to test if the mul can be lowered to
19706 // shift+add+shift.
19707 unsigned TrailingZeroes = ConstValue.countr_zero();
19708 if (TrailingZeroes) {
19709 // Conservatively do not lower to shift+add+shift if the mul might be
19710 // folded into smul or umul.
19711 if (N0->hasOneUse() && (isSignExtended(N0, DAG) ||
19712 isZeroExtended(N0, DAG)))
19713 return SDValue();
19714 // Conservatively do not lower to shift+add+shift if the mul might be
19715 // folded into madd or msub.
19716 if (N->hasOneUse() && (N->user_begin()->getOpcode() == ISD::ADD ||
19717 N->user_begin()->getOpcode() == ISD::SUB))
19718 return SDValue();
19719 }
19720 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
19721 // and shift+add+shift.
19722 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
19723 unsigned ShiftAmt;
19724
19725 auto Shl = [&](SDValue N0, unsigned N1) {
19726 if (!N0.getNode())
19727 return SDValue();
19728 // If shift causes overflow, ignore this combine.
19729 if (N1 >= N0.getValueSizeInBits())
19730 return SDValue();
19731 SDValue RHS = DAG.getConstant(N1, DL, MVT::i64);
19732 return DAG.getNode(ISD::SHL, DL, VT, N0, RHS);
19733 };
19734 auto Add = [&](SDValue N0, SDValue N1) {
19735 if (!N0.getNode() || !N1.getNode())
19736 return SDValue();
19737 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
19738 };
19739 auto Sub = [&](SDValue N0, SDValue N1) {
19740 if (!N0.getNode() || !N1.getNode())
19741 return SDValue();
19742 return DAG.getNode(ISD::SUB, DL, VT, N0, N1);
19743 };
19744 auto Negate = [&](SDValue N) {
19745 if (!N0.getNode())
19746 return SDValue();
19747 SDValue Zero = DAG.getConstant(0, DL, VT);
19748 return DAG.getNode(ISD::SUB, DL, VT, Zero, N);
19749 };
19750
19751 // Can the const C be decomposed into (1+2^M1)*(1+2^N1), eg:
19752 // C = 45 is equal to (1+4)*(1+8), we don't decompose it into (1+2)*(16-1) as
19753 // the (2^N - 1) can't be executed via a single instruction.
19754 auto isPowPlusPlusConst = [](APInt C, APInt &M, APInt &N) {
19755 unsigned BitWidth = C.getBitWidth();
19756 for (unsigned i = 1; i < BitWidth / 2; i++) {
19757 APInt Rem;
19758 APInt X(BitWidth, (1 << i) + 1);
19759 APInt::sdivrem(C, X, N, Rem);
19760 APInt NVMinus1 = N - 1;
19761 if (Rem == 0 && NVMinus1.isPowerOf2()) {
19762 M = X;
19763 return true;
19764 }
19765 }
19766 return false;
19767 };
19768
19769 // Can the const C be decomposed into (2^M + 1) * 2^N + 1), eg:
19770 // C = 11 is equal to (1+4)*2+1, we don't decompose it into (1+2)*4-1 as
19771 // the (2^N - 1) can't be executed via a single instruction.
19772 auto isPowPlusPlusOneConst = [](APInt C, APInt &M, APInt &N) {
19773 APInt CVMinus1 = C - 1;
19774 if (CVMinus1.isNegative())
19775 return false;
19776 unsigned TrailingZeroes = CVMinus1.countr_zero();
19777 APInt SCVMinus1 = CVMinus1.ashr(TrailingZeroes) - 1;
19778 if (SCVMinus1.isPowerOf2()) {
19779 unsigned BitWidth = SCVMinus1.getBitWidth();
19780 M = APInt(BitWidth, SCVMinus1.logBase2());
19781 N = APInt(BitWidth, TrailingZeroes);
19782 return true;
19783 }
19784 return false;
19785 };
19786
19787 // Can the const C be decomposed into (1 - (1 - 2^M) * 2^N), eg:
19788 // C = 29 is equal to 1 - (1 - 2^3) * 2^2.
19789 auto isPowMinusMinusOneConst = [](APInt C, APInt &M, APInt &N) {
19790 APInt CVMinus1 = C - 1;
19791 if (CVMinus1.isNegative())
19792 return false;
19793 unsigned TrailingZeroes = CVMinus1.countr_zero();
19794 APInt CVPlus1 = CVMinus1.ashr(TrailingZeroes) + 1;
19795 if (CVPlus1.isPowerOf2()) {
19796 unsigned BitWidth = CVPlus1.getBitWidth();
19797 M = APInt(BitWidth, CVPlus1.logBase2());
19798 N = APInt(BitWidth, TrailingZeroes);
19799 return true;
19800 }
19801 return false;
19802 };
19803
19804 if (ConstValue.isNonNegative()) {
19805 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
19806 // (mul x, 2^N - 1) => (sub (shl x, N), x)
19807 // (mul x, (2^(N-M) - 1) * 2^M) => (sub (shl x, N), (shl x, M))
19808 // (mul x, (2^M + 1) * (2^N + 1))
19809 // => MV = (add (shl x, M), x); (add (shl MV, N), MV)
19810 // (mul x, (2^M + 1) * 2^N + 1)
19811 // => MV = add (shl x, M), x); add (shl MV, N), x)
19812 // (mul x, 1 - (1 - 2^M) * 2^N)
19813 // => MV = sub (x - (shl x, M)); sub (x - (shl MV, N))
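// Worked example (editorial): C = 45 = (1 + 4) * (1 + 8) is handled by the
// isPowPlusPlusConst path below (guarded by hasALULSLFast):
// MV = (x << 2) + x and the result is (MV << 3) + MV, i.e. two ADD-with-LSL
// instructions instead of a MOV immediate plus MUL.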
19814 APInt SCVMinus1 = ShiftedConstValue - 1;
19815 APInt SCVPlus1 = ShiftedConstValue + 1;
19816 APInt CVPlus1 = ConstValue + 1;
19817 APInt CVM, CVN;
19818 if (SCVMinus1.isPowerOf2()) {
19819 ShiftAmt = SCVMinus1.logBase2();
19820 return Shl(Add(Shl(N0, ShiftAmt), N0), TrailingZeroes);
19821 } else if (CVPlus1.isPowerOf2()) {
19822 ShiftAmt = CVPlus1.logBase2();
19823 return Sub(Shl(N0, ShiftAmt), N0);
19824 } else if (SCVPlus1.isPowerOf2()) {
19825 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19826 return Sub(Shl(N0, ShiftAmt), Shl(N0, TrailingZeroes));
19827 }
19828 if (Subtarget->hasALULSLFast() &&
19829 isPowPlusPlusConst(ConstValue, CVM, CVN)) {
19830 APInt CVMMinus1 = CVM - 1;
19831 APInt CVNMinus1 = CVN - 1;
19832 unsigned ShiftM1 = CVMMinus1.logBase2();
19833 unsigned ShiftN1 = CVNMinus1.logBase2();
19834 // ALULSLFast implies that shifts by <= 4 places are fast
19835 if (ShiftM1 <= 4 && ShiftN1 <= 4) {
19836 SDValue MVal = Add(Shl(N0, ShiftM1), N0);
19837 return Add(Shl(MVal, ShiftN1), MVal);
19838 }
19839 }
19840 if (Subtarget->hasALULSLFast() &&
19841 isPowPlusPlusOneConst(ConstValue, CVM, CVN)) {
19842 unsigned ShiftM = CVM.getZExtValue();
19843 unsigned ShiftN = CVN.getZExtValue();
19844 // ALULSLFast implies that shifts by <= 4 places are fast
19845 if (ShiftM <= 4 && ShiftN <= 4) {
19846 SDValue MVal = Add(Shl(N0, CVM.getZExtValue()), N0);
19847 return Add(Shl(MVal, CVN.getZExtValue()), N0);
19848 }
19849 }
19850
19851 if (Subtarget->hasALULSLFast() &&
19852 isPowMinusMinusOneConst(ConstValue, CVM, CVN)) {
19853 unsigned ShiftM = CVM.getZExtValue();
19854 unsigned ShiftN = CVN.getZExtValue();
19855 // ALULSLFast implies that shifts by <= 4 places are fast
19856 if (ShiftM <= 4 && ShiftN <= 4) {
19857 SDValue MVal = Sub(N0, Shl(N0, CVM.getZExtValue()));
19858 return Sub(N0, Shl(MVal, CVN.getZExtValue()));
19859 }
19860 }
19861 } else {
19862 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
19863 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
19864 // (mul x, -(2^(N-M) - 1) * 2^M) => (sub (shl x, M), (shl x, N))
19865 APInt SCVPlus1 = -ShiftedConstValue + 1;
19866 APInt CVNegPlus1 = -ConstValue + 1;
19867 APInt CVNegMinus1 = -ConstValue - 1;
19868 if (CVNegPlus1.isPowerOf2()) {
19869 ShiftAmt = CVNegPlus1.logBase2();
19870 return Sub(N0, Shl(N0, ShiftAmt));
19871 } else if (CVNegMinus1.isPowerOf2()) {
19872 ShiftAmt = CVNegMinus1.logBase2();
19873 return Negate(Add(Shl(N0, ShiftAmt), N0));
19874 } else if (SCVPlus1.isPowerOf2()) {
19875 ShiftAmt = SCVPlus1.logBase2() + TrailingZeroes;
19876 return Sub(Shl(N0, TrailingZeroes), Shl(N0, ShiftAmt));
19877 }
19878 }
19879
19880 return SDValue();
19881}
19882
19883 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
19884 SelectionDAG &DAG) {
19885 // Take advantage of vector comparisons producing 0 or -1 in each lane to
19886 // optimize away operation when it's from a constant.
19887 //
19888 // The general transformation is:
19889 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
19890 // AND(VECTOR_CMP(x,y), constant2)
19891 // constant2 = UNARYOP(constant)
19892
19893 // Early exit if this isn't a vector operation, the operand of the
19894 // unary operation isn't a bitwise AND, or if the sizes of the operations
19895 // aren't the same.
19896 EVT VT = N->getValueType(0);
19897 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
19898 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
19899 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
19900 return SDValue();
19901
19902 // Now check that the other operand of the AND is a constant. We could
19903 // make the transformation for non-constant splats as well, but it's unclear
19904 // that would be a benefit as it would not eliminate any operations, just
19905 // perform one more step in scalar code before moving to the vector unit.
19906 if (BuildVectorSDNode *BV =
19907 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
19908 // Bail out if the vector isn't a constant.
19909 if (!BV->isConstant())
19910 return SDValue();
19911
19912 // Everything checks out. Build up the new and improved node.
19913 SDLoc DL(N);
19914 EVT IntVT = BV->getValueType(0);
19915 // Create a new constant of the appropriate type for the transformed
19916 // DAG.
19917 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
19918 // The AND node needs bitcasts to/from an integer vector type around it.
19919 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
19920 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
19921 N->getOperand(0)->getOperand(0), MaskConst);
19922 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
19923 return Res;
19924 }
19925
19926 return SDValue();
19927}
19928
19929/// Tries to replace scalar FP <-> INT conversions with SVE in streaming
19930/// functions, this can help to reduce the number of fmovs to/from GPRs.
19931static SDValue
19932 tryToReplaceScalarFPConversionWithSVE(SDNode *N, SelectionDAG &DAG,
19933 TargetLowering::DAGCombinerInfo &DCI,
19934 const AArch64Subtarget *Subtarget) {
19935 if (N->isStrictFPOpcode())
19936 return SDValue();
19937
19938 if (DCI.isBeforeLegalizeOps())
19939 return SDValue();
19940
19941 if (!Subtarget->isSVEorStreamingSVEAvailable() ||
19942 (!Subtarget->isStreaming() && !Subtarget->isStreamingCompatible()))
19943 return SDValue();
19944
19945 auto isSupportedType = [](EVT VT) {
19946 return !VT.isVector() && VT != MVT::bf16 && VT != MVT::f128;
19947 };
19948
19949 SDValue SrcVal = N->getOperand(0);
19950 EVT SrcTy = SrcVal.getValueType();
19951 EVT DestTy = N->getValueType(0);
19952
19953 if (!isSupportedType(SrcTy) || !isSupportedType(DestTy))
19954 return SDValue();
19955
19956 EVT SrcVecTy;
19957 EVT DestVecTy;
19958 if (DestTy.bitsGT(SrcTy)) {
19959 DestVecTy = getPackedSVEVectorVT(DestTy);
19960 SrcVecTy = DestVecTy.changeVectorElementType(SrcTy);
19961 } else {
19962 SrcVecTy = getPackedSVEVectorVT(SrcTy);
19963 DestVecTy = SrcVecTy.changeVectorElementType(DestTy);
19964 }
19965
19966 // Ensure the resulting src/dest vector type is legal.
19967 if (SrcVecTy == MVT::nxv2i32 || DestVecTy == MVT::nxv2i32)
19968 return SDValue();
19969
19970 SDLoc DL(N);
19971 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
19972 SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, SrcVecTy,
19973 DAG.getUNDEF(SrcVecTy), SrcVal, ZeroIdx);
19974 SDValue Convert = DAG.getNode(N->getOpcode(), DL, DestVecTy, Vec);
19975 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestTy, Convert, ZeroIdx);
19976}
19977
19978 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
19979 TargetLowering::DAGCombinerInfo &DCI,
19980 const AArch64Subtarget *Subtarget) {
19981 // First try to optimize away the conversion when it's conditionally from
19982 // a constant. Vectors only.
19983 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
19984 return Res;
19985
19986 if (SDValue Res =
19987 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
19988 return Res;
19989
19990 EVT VT = N->getValueType(0);
19991 if (VT != MVT::f32 && VT != MVT::f64)
19992 return SDValue();
19993
19994 // Only optimize when the source and destination types have the same width.
19995 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
19996 return SDValue();
19997
19998 // If the result of an integer load is only used by an integer-to-float
19999 // conversion, use an fp load and an AdvSIMD scalar {S|U}CVTF instead.
20000 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
20001 SDValue N0 = N->getOperand(0);
20002 if (Subtarget->isNeonAvailable() && ISD::isNormalLoad(N0.getNode()) &&
20003 N0.hasOneUse() &&
20004 // Do not change the width of a volatile load.
20005 !cast<LoadSDNode>(N0)->isVolatile()) {
20006 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
20007 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
20008 LN0->getPointerInfo(), LN0->getAlign(),
20009 LN0->getMemOperand()->getFlags());
20010
20011 // Make sure successors of the original load stay after it by updating them
20012 // to use the new Chain.
20013 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
20014
20015 unsigned Opcode =
20016 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
20017 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
20018 }
20019
20020 return SDValue();
20021}
20022
20023/// Fold a floating-point multiply by power of two into floating-point to
20024/// fixed-point conversion.
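///
/// For example (editorial, illustrative): (fp_to_sint (fmul v4f32 X, 8.0))
/// becomes a vcvtfp2fxs conversion with 3 fractional bits (an fcvtzs with
/// #3), since multiplying by 8.0 == 2^3 before truncation is the same as
/// reading the result as a fixed-point value with 3 fractional bits.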
20025 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
20026 TargetLowering::DAGCombinerInfo &DCI,
20027 const AArch64Subtarget *Subtarget) {
20028 if (SDValue Res =
20029 tryToReplaceScalarFPConversionWithSVE(N, DAG, DCI, Subtarget))
20030 return Res;
20031
20032 if (!Subtarget->isNeonAvailable())
20033 return SDValue();
20034
20035 if (!N->getValueType(0).isSimple())
20036 return SDValue();
20037
20038 SDValue Op = N->getOperand(0);
20039 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
20040 return SDValue();
20041
20042 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
20043 return SDValue();
20044
20045 SDValue ConstVec = Op->getOperand(1);
20046 if (!isa<BuildVectorSDNode>(ConstVec))
20047 return SDValue();
20048
20049 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
20050 uint32_t FloatBits = FloatTy.getSizeInBits();
20051 if (FloatBits != 32 && FloatBits != 64 &&
20052 (FloatBits != 16 || !Subtarget->hasFullFP16()))
20053 return SDValue();
20054
20055 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
20056 uint32_t IntBits = IntTy.getSizeInBits();
20057 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
20058 return SDValue();
20059
20060 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
20061 if (IntBits > FloatBits)
20062 return SDValue();
20063
20064 BitVector UndefElements;
20065 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
20066 int32_t Bits = IntBits == 64 ? 64 : 32;
20067 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
20068 if (C == -1 || C == 0 || C > Bits)
20069 return SDValue();
20070
20071 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
20072 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
20073 return SDValue();
20074
20075 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
20076 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
20077 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
20078 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
20079 return SDValue();
20080 }
20081
20082 SDLoc DL(N);
20083 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
20084 N->getOpcode() == ISD::FP_TO_SINT_SAT);
20085 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
20086 : Intrinsic::aarch64_neon_vcvtfp2fxu;
20087 SDValue FixConv =
20089 DAG.getTargetConstant(IntrinsicOpcode, DL, MVT::i32),
20090 Op->getOperand(0), DAG.getTargetConstant(C, DL, MVT::i32));
20091 // We can handle smaller integers by generating an extra trunc.
20092 if (IntBits < FloatBits)
20093 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
20094
20095 return FixConv;
20096}
20097
20098// Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
20099// convert to csel(ccmp(.., cc0)), depending on cc1:
20100
20101// (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20102// =>
20103// (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
20104//
20105// (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
20106// =>
20107// (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
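//
// Editorial note: the NZCV immediate on the CCMP is what preserves the
// short-circuit semantics: when the first comparison already decides the
// and/or on its own, the injected flags make the final CSET produce that
// same result.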
20108 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
20109 EVT VT = N->getValueType(0);
20110 SDValue CSel0 = N->getOperand(0);
20111 SDValue CSel1 = N->getOperand(1);
20112
20113 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
20114 CSel1.getOpcode() != AArch64ISD::CSEL)
20115 return SDValue();
20116
20117 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
20118 return SDValue();
20119
20120 if (!isNullConstant(CSel0.getOperand(0)) ||
20121 !isOneConstant(CSel0.getOperand(1)) ||
20122 !isNullConstant(CSel1.getOperand(0)) ||
20123 !isOneConstant(CSel1.getOperand(1)))
20124 return SDValue();
20125
20126 SDValue Cmp0 = CSel0.getOperand(3);
20127 SDValue Cmp1 = CSel1.getOperand(3);
20130 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
20131 return SDValue();
20132 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
20133 Cmp0.getOpcode() == AArch64ISD::SUBS) {
20134 std::swap(Cmp0, Cmp1);
20135 std::swap(CC0, CC1);
20136 }
20137
20138 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
20139 return SDValue();
20140
20141 SDLoc DL(N);
20142 SDValue CCmp, Condition;
20143 unsigned NZCV;
20144
20145 if (N->getOpcode() == ISD::AND) {
20147 Condition = getCondCode(DAG, InvCC0);
20149 } else {
20151 Condition = getCondCode(DAG, CC0);
20153 }
20154
20155 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
20156
20157 auto *Op1 = dyn_cast<ConstantSDNode>(Cmp1.getOperand(1));
20158 if (Op1 && Op1->getAPIntValue().isNegative() &&
20159 Op1->getAPIntValue().sgt(-32)) {
20160 // CCMP accepts constants in the range [0, 31].
20161 // If Op1 is a constant in the range [-31, -1], we
20162 // can select CCMN to avoid the extra mov.
20163 SDValue AbsOp1 =
20164 DAG.getConstant(Op1->getAPIntValue().abs(), DL, Op1->getValueType(0));
20165 CCmp = DAG.getNode(AArch64ISD::CCMN, DL, FlagsVT, Cmp1.getOperand(0),
20166 AbsOp1, NZCVOp, Condition, Cmp0);
20167 } else {
20168 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, FlagsVT, Cmp1.getOperand(0),
20169 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
20170 }
20171 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
20172 CSel0.getOperand(1), getCondCode(DAG, CC1), CCmp);
20173}
20174
20176 const AArch64Subtarget *Subtarget,
20177 const AArch64TargetLowering &TLI) {
20178 SelectionDAG &DAG = DCI.DAG;
20179
20180 if (SDValue R = performANDORCSELCombine(N, DAG))
20181 return R;
20182
20183 return SDValue();
20184}
20185
20187 if (!MemVT.getVectorElementType().isSimple())
20188 return false;
20189
20190 uint64_t MaskForTy = 0ull;
20191 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
20192 case MVT::i8:
20193 MaskForTy = 0xffull;
20194 break;
20195 case MVT::i16:
20196 MaskForTy = 0xffffull;
20197 break;
20198 case MVT::i32:
20199 MaskForTy = 0xffffffffull;
20200 break;
20201 default:
20202 return false;
20203 break;
20204 }
20205
20206 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
20207 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
20208 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
20209
20210 return false;
20211}
20212
20214 SDValue LeafOp = SDValue(N, 0);
20215 SDValue Op = N->getOperand(0);
20216 while (Op.getOpcode() == AArch64ISD::REINTERPRET_CAST &&
20217 LeafOp.getValueType() != Op.getValueType())
20218 Op = Op->getOperand(0);
20219 if (LeafOp.getValueType() == Op.getValueType())
20220 return Op;
20221 return SDValue();
20222}
20223
20226 SelectionDAG &DAG = DCI.DAG;
20227 SDValue Src = N->getOperand(0);
20228 unsigned Opc = Src->getOpcode();
20229
20230 // Zero/any extend of an unsigned unpack
20231 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
20232 SDValue UnpkOp = Src->getOperand(0);
20233 SDValue Dup = N->getOperand(1);
20234
20235 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
20236 return SDValue();
20237
20238 SDLoc DL(N);
20239 auto *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
20240 if (!C)
20241 return SDValue();
20242
20243 uint64_t ExtVal = C->getZExtValue();
20244
20245 auto MaskAndTypeMatch = [ExtVal](EVT VT) -> bool {
20246 return ((ExtVal == 0xFF && VT == MVT::i8) ||
20247 (ExtVal == 0xFFFF && VT == MVT::i16) ||
20248 (ExtVal == 0xFFFFFFFF && VT == MVT::i32));
20249 };
20250
20251 // If the mask is fully covered by the unpack, we don't need to push
20252 // a new AND onto the operand
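// For example, (and (uunpklo (nxv16i8 X)), splat(0xff)) is redundant: the
// unpack already zero-extends each i8 lane into an i16 lane, so the high bits
// the mask would clear are known to be zero.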
20253 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
20254 if (MaskAndTypeMatch(EltTy))
20255 return Src;
20256
20257 // If this is 'and (uunpklo/hi (extload MemTy -> ExtTy)), mask', then check
20258 // to see if the mask is all-ones of size MemTy.
20259 auto MaskedLoadOp = dyn_cast<MaskedLoadSDNode>(UnpkOp);
20260 if (MaskedLoadOp && (MaskedLoadOp->getExtensionType() == ISD::ZEXTLOAD ||
20261 MaskedLoadOp->getExtensionType() == ISD::EXTLOAD)) {
20262 EVT EltTy = MaskedLoadOp->getMemoryVT().getVectorElementType();
20263 if (MaskAndTypeMatch(EltTy))
20264 return Src;
20265 }
20266
20267 // Truncate to prevent a DUP with an over-wide constant
20268 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
20269
20270 // Otherwise, make sure we propagate the AND to the operand
20271 // of the unpack
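// For example, (and (uunpklo X), splat(0xf)) becomes
// (uunpklo (and X, splat(0xf))), where the narrower AND can often be combined
// further.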
20272 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
20273 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
20274
20275 SDValue And = DAG.getNode(ISD::AND, DL,
20276 UnpkOp->getValueType(0), UnpkOp, Dup);
20277
20278 return DAG.getNode(Opc, DL, N->getValueType(0), And);
20279 }
20280
20281 if (DCI.isBeforeLegalizeOps())
20282 return SDValue();
20283
20284 // If either operand of the AND is an all-active i1 predicate, the AND is a
20285 // no-op and we can simply return the other operand.
20286 if (isAllActivePredicate(DAG, N->getOperand(0)))
20287 return N->getOperand(1);
20288 if (isAllActivePredicate(DAG, N->getOperand(1)))
20289 return N->getOperand(0);
20290
20291 if (!EnableCombineMGatherIntrinsics)
20292 return SDValue();
20293
20294 SDValue Mask = N->getOperand(1);
20295
20296 if (!Src.hasOneUse())
20297 return SDValue();
20298
20299 EVT MemVT;
20300
20301 // SVE load instructions perform an implicit zero-extend, which makes them
20302 // perfect candidates for combining.
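// For example, an LD1B into .h lanes already zero-extends each element from
// i8, so an AND of its result with splat(0xff) is a no-op.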
20303 switch (Opc) {
20304 case AArch64ISD::LD1_MERGE_ZERO:
20305 case AArch64ISD::LDNF1_MERGE_ZERO:
20306 case AArch64ISD::LDFF1_MERGE_ZERO:
20307 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
20308 break;
20309 case AArch64ISD::GLD1_MERGE_ZERO:
20310 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
20311 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
20312 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
20313 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
20314 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
20315 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
20316 case AArch64ISD::GLDFF1_MERGE_ZERO:
20317 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
20318 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
20319 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
20320 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
20321 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
20322 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
20323 case AArch64ISD::GLDNT1_MERGE_ZERO:
20324 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
20325 break;
20326 default:
20327 return SDValue();
20328 }
20329
20330 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
20331 return Src;
20332
20333 return SDValue();
20334}
20335
20336 // Transform and(fcmp(a, b), fcmp(c, d)) into fccmp(fcmp(a, b), c, d)
20337 static SDValue performANDSETCCCombine(SDNode *N,
20338 TargetLowering::DAGCombinerInfo &DCI) {
20339
20340 // This function optimizes a specific pattern: an AND operation whose first
20341 // operand is a SETCC (Set Condition Code) node on floating-point values.
20342
20343 SDValue SetCC = N->getOperand(0);
20344 EVT VT = N->getValueType(0);
20345 SelectionDAG &DAG = DCI.DAG;
20346
20347 // If the current node (N) is used by any SELECT, return an empty SDValue
20348 // and skip the optimization, since applying it there would produce
20349 // incorrect results.
20350 for (auto U : N->users())
20351 if (U->getOpcode() == ISD::SELECT)
20352 return SDValue();
20353
20354 // Check if the operand is a SETCC node with floating-point comparison
20355 if (SetCC.getOpcode() == ISD::SETCC &&
20356 SetCC.getOperand(0).getValueType() == MVT::f32) {
20357
20358 SDValue Cmp;
20359 AArch64CC::CondCode CC;
20360
20361 // Check if the DAG is after legalization and if we can emit the conjunction
20362 if (!DCI.isBeforeLegalize() &&
20363 (Cmp = emitConjunction(DAG, SDValue(N, 0), CC))) {
20364
20365 AArch64CC::CondCode InvertedCC = AArch64CC::getInvertedCondCode(CC);
20366
20367 SDLoc DL(N);
20368 return DAG.getNode(AArch64ISD::CSINC, DL, VT, DAG.getConstant(0, DL, VT),
20369 DAG.getConstant(0, DL, VT),
20370 getCondCode(DAG, InvertedCC), Cmp);
20371 }
20372 }
20373 return SDValue();
20374}
20375
20376 static SDValue performANDCombine(SDNode *N,
20377 TargetLowering::DAGCombinerInfo &DCI) {
20378 SelectionDAG &DAG = DCI.DAG;
20379 SDValue LHS = N->getOperand(0);
20380 SDValue RHS = N->getOperand(1);
20381 EVT VT = N->getValueType(0);
20382
20383 if (SDValue R = performANDORCSELCombine(N, DAG))
20384 return R;
20385
20386 if (SDValue R = performANDSETCCCombine(N, DCI))
20387 return R;
20388
20389 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
20390 return SDValue();
20391
20392 if (VT.isScalableVector())
20393 return performSVEAndCombine(N, DCI);
20394
20395 // The combining code below works only for NEON vectors. In particular, it
20396 // does not work for SVE when dealing with vectors wider than 128 bits.
20397 if (!VT.is64BitVector() && !VT.is128BitVector())
20398 return SDValue();
20399
20400 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
20401 if (!BVN)
20402 return SDValue();
20403
20404 // AND does not accept an immediate, so check if we can use a BIC immediate
20405 // instruction instead. We do this here instead of using a (and x, (mvni imm))
20406 // pattern in isel, because some immediates may be lowered to the preferred
20407 // (and x, (movi imm)) form, even though an mvni representation also exists.
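// For example, a v4i32 AND with splat(0xffffff00) has no MOVI encoding, but
// its complement 0xff does, so it can be selected as (BIC v, #0xff).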
20408 APInt DefBits(VT.getSizeInBits(), 0);
20409 APInt UndefBits(VT.getSizeInBits(), 0);
20410 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
20411 SDValue NewOp;
20412
20413 // Any bits known to already be 0 need not be cleared again, which can help
20414 // reduce the size of the immediate to one supported by the instruction.
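// For example, if the upper 16 bits of every lane are already known to be
// zero, (and x, 0x0000ff00) only has to clear bits 0-7 and can therefore be
// selected as (BIC x, #0xff).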
20415 KnownBits Known = DAG.computeKnownBits(LHS);
20416 APInt ZeroSplat(VT.getSizeInBits(), 0);
20417 for (unsigned I = 0; I < VT.getSizeInBits() / Known.Zero.getBitWidth(); I++)
20418 ZeroSplat |= Known.Zero.zext(VT.getSizeInBits())
20419 << (Known.Zero.getBitWidth() * I);
20420
20421 DefBits = ~(DefBits | ZeroSplat);
20422 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20423 DefBits, &LHS)) ||
20424 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20425 DefBits, &LHS)))
20426 return NewOp;
20427
20428 UndefBits = ~(UndefBits | ZeroSplat);
20429 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
20430 UndefBits, &LHS)) ||
20431 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
20432 UndefBits, &LHS)))
20433 return NewOp;
20434 }
20435
20436 return SDValue();
20437}
20438
20439 static SDValue performFADDCombine(SDNode *N,
20440 TargetLowering::DAGCombinerInfo &DCI) {
20441 SelectionDAG &DAG = DCI.DAG;
20442 SDValue LHS = N->getOperand(0);
20443 SDValue RHS = N->getOperand(1);
20444 EVT VT = N->getValueType(0);
20445 SDLoc DL(N);
20446
20447 if (!N->getFlags().hasAllowReassociation())
20448 return SDValue();
20449
20450 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
20451 auto ReassocComplex = [&](SDValue A, SDValue B) {
20452 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
20453 return SDValue();
20454 unsigned Opc = A.getConstantOperandVal(0);
20455 if (Opc != Intrinsic::aarch64_neon_vcmla_rot0 &&
20456 Opc != Intrinsic::aarch64_neon_vcmla_rot90 &&
20457 Opc != Intrinsic::aarch64_neon_vcmla_rot180 &&
20458 Opc != Intrinsic::aarch64_neon_vcmla_rot270)
20459 return SDValue();
20460 SDValue VCMLA = DAG.getNode(
20461 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0),
20462 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(1), B, N->getFlags()),
20463 A.getOperand(2), A.getOperand(3));
20464 VCMLA->setFlags(A->getFlags());
20465 return VCMLA;
20466 };
20467 if (SDValue R = ReassocComplex(LHS, RHS))
20468 return R;
20469 if (SDValue R = ReassocComplex(RHS, LHS))
20470 return R;
20471
20472 return SDValue();
20473}
20474
20475 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
20476 switch (Opcode) {
20477 case ISD::STRICT_FADD:
20478 case ISD::FADD:
20479 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
20480 case ISD::ADD:
20481 return VT == MVT::i64;
20482 default:
20483 return false;
20484 }
20485}
20486
20487static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
20488 AArch64CC::CondCode Cond);
20489
20490 static bool isPredicateCCSettingOp(SDValue N) {
20491 if ((N.getOpcode() == ISD::SETCC) ||
20492 // get_active_lane_mask is lowered to a whilelo instruction.
20493 (N.getOpcode() == ISD::GET_ACTIVE_LANE_MASK) ||
20494 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
20495 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
20496 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege_x2 ||
20497 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
20498 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt_x2 ||
20499 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
20500 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi_x2 ||
20501 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
20502 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs_x2 ||
20503 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
20504 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele_x2 ||
20505 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
20506 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo_x2 ||
20507 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
20508 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels_x2 ||
20509 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
20510 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt_x2)))
20511 return true;
20512
20513 return false;
20514}
20515
20516 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
20517 // ... into: "ptrue p, all" + PTEST
20518 static SDValue
20519 performFirstTrueTestVectorCombine(SDNode *N,
20520 TargetLowering::DAGCombinerInfo &DCI,
20521 const AArch64Subtarget *Subtarget) {
20522 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20523 // Make sure PTEST can be legalised with illegal types.
20524 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20525 return SDValue();
20526
20527 SDValue N0 = N->getOperand(0);
20528 EVT VT = N0.getValueType();
20529
20530 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
20531 !isNullConstant(N->getOperand(1)))
20532 return SDValue();
20533
20534 // Restrict the DAG combine to only cases where we're extracting from a
20535 // flag-setting operation.
20536 if (!isPredicateCCSettingOp(N0) || N0.getResNo() != 0)
20537 return SDValue();
20538
20539 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
20540 SelectionDAG &DAG = DCI.DAG;
20541 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
20542 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
20543}
20544
20545 // Materialize : Idx = (add (mul vscale, NumEls), -1)
20546 // i1 = extract_vector_elt t37, Constant:i64<Idx>
20547 // ... into: "ptrue p, all" + PTEST
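// For example, for an nxv4i1 predicate the index (add (mul vscale, 4), -1) is
// always the last lane, so the extract can be expressed as a PTEST of an
// all-true governing predicate with the LAST_ACTIVE condition.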
20548 static SDValue
20549 performLastTrueTestVectorCombine(SDNode *N,
20550 TargetLowering::DAGCombinerInfo &DCI,
20551 const AArch64Subtarget *Subtarget) {
20552 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20553 // Make sure PTEST can be legalised with illegal types.
20554 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
20555 return SDValue();
20556
20557 SDValue N0 = N->getOperand(0);
20558 EVT OpVT = N0.getValueType();
20559
20560 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
20561 return SDValue();
20562
20563 // Idx == (add (mul vscale, NumEls), -1)
20564 SDValue Idx = N->getOperand(1);
20565 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
20566 return SDValue();
20567
20568 SDValue VS = Idx.getOperand(0);
20569 if (VS.getOpcode() != ISD::VSCALE)
20570 return SDValue();
20571
20572 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
20573 if (VS.getConstantOperandVal(0) != NumEls)
20574 return SDValue();
20575
20576 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
20577 SelectionDAG &DAG = DCI.DAG;
20578 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
20579 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
20580}
20581
20582 static SDValue
20583 performExtractLastActiveCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20584 const AArch64Subtarget *Subtarget) {
20585 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20586 SelectionDAG &DAG = DCI.DAG;
20587 SDValue Vec = N->getOperand(0);
20588 SDValue Idx = N->getOperand(1);
20589
20590 if (DCI.isBeforeLegalize() || Idx.getOpcode() != ISD::VECTOR_FIND_LAST_ACTIVE)
20591 return SDValue();
20592
20593 // Only legal for 8, 16, 32, and 64 bit element types.
20594 EVT EltVT = Vec.getValueType().getVectorElementType();
20595 if (!is_contained(ArrayRef({MVT::i8, MVT::i16, MVT::i32, MVT::i64, MVT::f16,
20596 MVT::bf16, MVT::f32, MVT::f64}),
20597 EltVT.getSimpleVT().SimpleTy))
20598 return SDValue();
20599
20600 SDValue Mask = Idx.getOperand(0);
20601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20602 if (!TLI.isOperationLegal(ISD::VECTOR_FIND_LAST_ACTIVE, Mask.getValueType()))
20603 return SDValue();
20604
20605 return DAG.getNode(AArch64ISD::LASTB, SDLoc(N), N->getValueType(0), Mask,
20606 Vec);
20607}
20608
20609 static SDValue
20610 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
20611 const AArch64Subtarget *Subtarget) {
20612 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
20613 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
20614 return Res;
20615 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
20616 return Res;
20617 if (SDValue Res = performExtractLastActiveCombine(N, DCI, Subtarget))
20618 return Res;
20619
20620 SelectionDAG &DAG = DCI.DAG;
20621 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20622
20623 EVT VT = N->getValueType(0);
20624 const bool FullFP16 = Subtarget->hasFullFP16();
20625 bool IsStrict = N0->isStrictFPOpcode();
20626
20627 // extract(dup x) -> x
20628 if (N0.getOpcode() == AArch64ISD::DUP)
20629 return VT.isInteger() ? DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT)
20630 : N0.getOperand(0);
20631
20632 // Rewrite for pairwise fadd pattern
20633 // (f32 (extract_vector_elt
20634 // (fadd (vXf32 Other)
20635 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
20636 // ->
20637 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
20638 // (extract_vector_elt (vXf32 Other) 1))
20639 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
20640 // we can only do this when it's used only by the extract_vector_elt.
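// The resulting scalar fadd of lanes 0 and 1 can then be matched to a single
// FADDP (pairwise add of the bottom two lanes).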
20641 if (isNullConstant(N1) && hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
20642 (!IsStrict || N0.hasOneUse())) {
20643 SDLoc DL(N0);
20644 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
20645 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
20646
20647 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
20648 SDValue Other = N00;
20649
20650 // And handle the commutative case.
20651 if (!Shuffle) {
20652 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
20653 Other = N01;
20654 }
20655
20656 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
20657 Other == Shuffle->getOperand(0)) {
20658 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20659 DAG.getConstant(0, DL, MVT::i64));
20660 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
20661 DAG.getConstant(1, DL, MVT::i64));
20662 if (!IsStrict)
20663 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
20664
20665 // For strict_fadd we need uses of the final extract_vector to be replaced
20666 // with the strict_fadd, but we also need uses of the chain output of the
20667 // original strict_fadd to use the chain output of the new strict_fadd as
20668 // otherwise it may not be deleted.
20669 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
20670 {VT, MVT::Other},
20671 {N0->getOperand(0), Extract1, Extract2});
20672 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
20673 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
20674 return SDValue(N, 0);
20675 }
20676 }
20677
20678 // Given an extract(load) or extract(extend(load)), produce a scalar load
20679 // instead to avoid the cross-register-bank copies.
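// For example, (i32 (extract_vector_elt (v4i32 (load p)), 1)) can be turned
// into an i32 load from p+4, avoiding a GPR<->FPR transfer.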
20680 if (DCI.isAfterLegalizeDAG() && Subtarget->isLittleEndian() &&
20681 VT.isInteger() && isa<ConstantSDNode>(N1)) {
20682 SDValue LoadN0 = N0;
20683 // Look through sext/zext and extract_subvector / insert_subvector if
20684 // required.
20685 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
20686 N0.getOpcode() == ISD::SIGN_EXTEND ||
20687 N0.getOpcode() == ISD::ANY_EXTEND) &&
20688 N0.getOperand(0).hasOneUse())
20689 LoadN0 = N0.getOperand(0);
20690 unsigned OffsetElts = 0;
20691 if (LoadN0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
20692 OffsetElts = LoadN0.getConstantOperandVal(1);
20693 LoadN0 = LoadN0.getOperand(0);
20694 }
20695 if (LoadN0.getOpcode() == ISD::INSERT_SUBVECTOR &&
20696 LoadN0.getOperand(0).isUndef() &&
20697 isNullConstant(LoadN0.getOperand(2)) &&
20698 LoadN0.getOperand(1).hasOneUse())
20699 LoadN0 = LoadN0.getOperand(1);
20700
20701 // Check all the uses are valid and can be scalarized. We check that all the
20702 // uses are extracts and those extracts are not re-inserted into an
20703 // operation best treated as a vector register.
20704 auto Load = dyn_cast<LoadSDNode>(LoadN0);
20705 if (Load && Load->isSimple() && ISD::isNormalLoad(Load) &&
20706 Load->getMemoryVT().isByteSized() &&
20707 all_of(N0->uses(), [&](const SDUse &U) {
20708 return U.getResNo() != N0.getResNo() ||
20709 (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20710 !any_of(U.getUser()->uses(), [](const SDUse &U2) {
20711 return U2.getUser()->getOpcode() ==
20712 ISD::INSERT_VECTOR_ELT ||
20713 U2.getUser()->getOpcode() == ISD::BUILD_VECTOR ||
20714 U2.getUser()->getOpcode() == ISD::SCALAR_TO_VECTOR;
20715 }));
20716 })) {
20717
20718 SDLoc DL(Load);
20719
20720 // Generate a new scalar load.
20721 unsigned Offset = (OffsetElts + N->getConstantOperandVal(1)) *
20722 Load->getValueType(0).getScalarSizeInBits() / 8;
20723 SDValue BasePtr = DAG.getObjectPtrOffset(
20724 DL, Load->getBasePtr(), DAG.getConstant(Offset, DL, MVT::i64));
20725 ISD::LoadExtType ExtType =
20726 N0.getOpcode() == ISD::ZERO_EXTEND ? ISD::ZEXTLOAD
20727 : (N0.getOpcode() == ISD::SIGN_EXTEND
20728 ? ISD::SEXTLOAD
20729 : ISD::EXTLOAD);
20730 SDValue ScalarLoad =
20731 DAG.getExtLoad(ExtType, DL, VT, Load->getChain(), BasePtr,
20732 Load->getPointerInfo().getWithOffset(Offset),
20733 Load->getValueType(0).getScalarType(),
20734 commonAlignment(Load->getAlign(), Offset),
20735 Load->getMemOperand()->getFlags(), Load->getAAInfo());
20736 DAG.makeEquivalentMemoryOrdering(Load, ScalarLoad);
20737 return ScalarLoad;
20738 }
20739 }
20740
20741 return SDValue();
20742}
20743
20744 static SDValue performConcatVectorsCombine(SDNode *N,
20745 TargetLowering::DAGCombinerInfo &DCI,
20746 SelectionDAG &DAG) {
20747 SDLoc DL(N);
20748 EVT VT = N->getValueType(0);
20749 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
20750 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
20751
20752 if (VT.isScalableVector())
20753 return SDValue();
20754
20755 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20756 N1Opc == ISD::TRUNCATE) {
20757 SDValue N00 = N0->getOperand(0);
20758 SDValue N10 = N1->getOperand(0);
20759 EVT N00VT = N00.getValueType();
20760 unsigned N00Opc = N00.getOpcode(), N10Opc = N10.getOpcode();
20761
20762 // Optimize concat_vectors of truncated vectors, where the intermediate
20763 // type is illegal, to avoid said illegality, e.g.,
20764 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
20765 // (v2i16 (truncate (v2i64)))))
20766 // ->
20767 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
20768 // (v4i32 (bitcast (v2i64))),
20769 // <0, 2, 4, 6>)))
20770 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
20771 // on both input and result type, so we might generate worse code.
20772 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
20773 if (N00VT == N10.getValueType() &&
20774 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
20775 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
20776 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
20777 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
20778 for (size_t i = 0; i < Mask.size(); ++i)
20779 Mask[i] = i * 2;
20780 return DAG.getNode(ISD::TRUNCATE, DL, VT,
20781 DAG.getVectorShuffle(
20782 MidVT, DL,
20783 DAG.getNode(ISD::BITCAST, DL, MidVT, N00),
20784 DAG.getNode(ISD::BITCAST, DL, MidVT, N10), Mask));
20785 }
20786
20787 // Optimize two large shifts and a combine into a single combine and shift
20788 // For AArch64 architectures, sequences like the following:
20789 //
20790 // ushr v0.4s, v0.4s, #20
20791 // ushr v1.4s, v1.4s, #20
20792 // uzp1 v0.8h, v0.8h, v1.8h
20793 //
20794 // Can be optimized to:
20795 //
20796 // uzp2 v0.8h, v0.8h, v1.8h
20797 // ushr v0.8h, v0.8h, #4
20798 //
20799 // This optimization reduces instruction count.
20800 if (N00Opc == AArch64ISD::VLSHR && N10Opc == AArch64ISD::VLSHR &&
20801 N00->getOperand(1) == N10->getOperand(1)) {
20802 SDValue N000 = N00->getOperand(0);
20803 SDValue N100 = N10->getOperand(0);
20804 uint64_t N001ConstVal = N00->getConstantOperandVal(1),
20805 N101ConstVal = N10->getConstantOperandVal(1),
20806 NScalarSize = N->getValueType(0).getScalarSizeInBits();
20807
20808 if (N001ConstVal == N101ConstVal && N001ConstVal > NScalarSize) {
20809 N000 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N000);
20810 N100 = DAG.getNode(AArch64ISD::NVCAST, DL, VT, N100);
20811 SDValue Uzp = DAG.getNode(AArch64ISD::UZP2, DL, VT, N000, N100);
20812 SDValue NewShiftConstant =
20813 DAG.getTargetConstant(N001ConstVal - NScalarSize, DL, MVT::i32);
20814
20815 return DAG.getNode(AArch64ISD::VLSHR, DL, VT, Uzp, NewShiftConstant);
20816 }
20817 }
20818 }
20819
20820 if (N->getOperand(0).getValueType() == MVT::v4i8 ||
20821 N->getOperand(0).getValueType() == MVT::v2i16 ||
20822 N->getOperand(0).getValueType() == MVT::v2i8) {
20823 EVT SrcVT = N->getOperand(0).getValueType();
20824 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
20825 // loads to prevent having to go through the v4i8 load legalization that
20826 // needs to extend each element into a larger type.
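// For example, (v8i8 (concat_vectors (v4i8 (load p)), (v4i8 (load q)))) can
// be rebuilt as (bitcast (build_vector (f32 (load p)), (f32 (load q)))).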
20827 if (N->getNumOperands() % 2 == 0 &&
20828 all_of(N->op_values(), [SrcVT](SDValue V) {
20829 if (V.getValueType() != SrcVT)
20830 return false;
20831 if (V.isUndef())
20832 return true;
20833 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
20834 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
20835 LD->getExtensionType() == ISD::NON_EXTLOAD;
20836 })) {
20837 EVT FVT = SrcVT == MVT::v2i8 ? MVT::f16 : MVT::f32;
20838 EVT NVT = EVT::getVectorVT(*DAG.getContext(), FVT, N->getNumOperands());
20839 SmallVector<SDValue> Ops;
20840
20841 for (unsigned i = 0; i < N->getNumOperands(); i++) {
20842 SDValue V = N->getOperand(i);
20843 if (V.isUndef())
20844 Ops.push_back(DAG.getUNDEF(FVT));
20845 else {
20846 LoadSDNode *LD = cast<LoadSDNode>(V);
20847 SDValue NewLoad = DAG.getLoad(FVT, DL, LD->getChain(),
20848 LD->getBasePtr(), LD->getMemOperand());
20849 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
20850 Ops.push_back(NewLoad);
20851 }
20852 }
20853 return DAG.getBitcast(N->getValueType(0),
20854 DAG.getBuildVector(NVT, DL, Ops));
20855 }
20856 }
20857
20858 // Canonicalise concat_vectors to replace concatenations of truncated nots
20859 // with nots of concatenated truncates. This in some cases allows for multiple
20860 // redundant negations to be eliminated.
20861 // (concat_vectors (v4i16 (truncate (not (v4i32)))),
20862 // (v4i16 (truncate (not (v4i32)))))
20863 // ->
20864 // (not (concat_vectors (v4i16 (truncate (v4i32))),
20865 // (v4i16 (truncate (v4i32)))))
20866 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
20867 N1Opc == ISD::TRUNCATE && N->isOnlyUserOf(N0.getNode()) &&
20868 N->isOnlyUserOf(N1.getNode())) {
20869 auto isBitwiseVectorNegate = [](SDValue V) {
20870 return V->getOpcode() == ISD::XOR &&
20871 ISD::isConstantSplatVectorAllOnes(V.getOperand(1).getNode());
20872 };
20873 SDValue N00 = N0->getOperand(0);
20874 SDValue N10 = N1->getOperand(0);
20875 if (isBitwiseVectorNegate(N00) && N0->isOnlyUserOf(N00.getNode()) &&
20876 isBitwiseVectorNegate(N10) && N1->isOnlyUserOf(N10.getNode())) {
20877 return DAG.getNOT(
20878 DL,
20879 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20880 DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(),
20881 N00->getOperand(0)),
20882 DAG.getNode(ISD::TRUNCATE, DL, N1.getValueType(),
20883 N10->getOperand(0))),
20884 VT);
20885 }
20886 }
20887
20888 // Wait till after everything is legalized to try this. That way we have
20889 // legal vector types and such.
20890 if (DCI.isBeforeLegalizeOps())
20891 return SDValue();
20892
20893 // Optimise concat_vectors of two identical binops with a 128-bit destination
20894 // size, combining them into a binop of two concats of the source vectors, e.g.:
20895 // concat(uhadd(a, b), uhadd(c, d)) -> uhadd(concat(a, c), concat(b, d))
20896 if (N->getNumOperands() == 2 && N0Opc == N1Opc && VT.is128BitVector() &&
20897 (DAG.getTargetLoweringInfo().isBinOp(N0Opc) ||
20898 isVectorizedBinOp(N0Opc)) &&
20899 N0->hasOneUse() && N1->hasOneUse()) {
20900 SDValue N00 = N0->getOperand(0);
20901 SDValue N01 = N0->getOperand(1);
20902 SDValue N10 = N1->getOperand(0);
20903 SDValue N11 = N1->getOperand(1);
20904
20905 if (!N00.isUndef() && !N01.isUndef() && !N10.isUndef() && !N11.isUndef()) {
20906 SDValue Concat0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N00, N10);
20907 SDValue Concat1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N01, N11);
20908 return DAG.getNode(N0Opc, DL, VT, Concat0, Concat1);
20909 }
20910 }
20911
20912 auto IsRSHRN = [](SDValue Shr) {
20913 if (Shr.getOpcode() != AArch64ISD::VLSHR)
20914 return false;
20915 SDValue Op = Shr.getOperand(0);
20916 EVT VT = Op.getValueType();
20917 unsigned ShtAmt = Shr.getConstantOperandVal(1);
20918 if (ShtAmt > VT.getScalarSizeInBits() / 2 || Op.getOpcode() != ISD::ADD)
20919 return false;
20920
20921 APInt Imm;
20922 if (Op.getOperand(1).getOpcode() == AArch64ISD::MOVIshift)
20923 Imm = APInt(VT.getScalarSizeInBits(),
20924 Op.getOperand(1).getConstantOperandVal(0)
20925 << Op.getOperand(1).getConstantOperandVal(1));
20926 else if (Op.getOperand(1).getOpcode() == AArch64ISD::DUP &&
20927 isa<ConstantSDNode>(Op.getOperand(1).getOperand(0)))
20928 Imm = APInt(VT.getScalarSizeInBits(),
20929 Op.getOperand(1).getConstantOperandVal(0));
20930 else
20931 return false;
20932
20933 if (Imm != 1ULL << (ShtAmt - 1))
20934 return false;
20935 return true;
20936 };
20937
20938 // concat(rshrn(x), rshrn(y)) -> rshrn(concat(x, y))
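// When both halves are rounding shifts by the same amount (or the second half
// is undef), the rounding add and the shift can be performed once on the
// double-width concatenation.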
20939 if (N->getNumOperands() == 2 && IsRSHRN(N0) &&
20940 ((IsRSHRN(N1) &&
20941 N0.getConstantOperandVal(1) == N1.getConstantOperandVal(1)) ||
20942 N1.isUndef())) {
20943 SDValue X = N0.getOperand(0).getOperand(0);
20944 SDValue Y = N1.isUndef() ? DAG.getUNDEF(X.getValueType())
20945 : N1.getOperand(0).getOperand(0);
20946 EVT BVT =
20947 X.getValueType().getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
20948 SDValue CC = DAG.getNode(ISD::CONCAT_VECTORS, DL, BVT, X, Y);
20949 SDValue Add = DAG.getNode(
20950 ISD::ADD, DL, BVT, CC,
20951 DAG.getConstant(1ULL << (N0.getConstantOperandVal(1) - 1), DL, BVT));
20952 SDValue Shr =
20953 DAG.getNode(AArch64ISD::VLSHR, DL, BVT, Add, N0.getOperand(1));
20954 return Shr;
20955 }
20956
20957 // concat(zip1(a, b), zip2(a, b)) is zip1(a, b)
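// For example, for v8i8 inputs a and b, concat(zip1(a, b), zip2(a, b)) is the
// full interleave of a and b, which is zip1 of the two 128-bit widened
// operands (with their top halves left undefined).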
20958 if (N->getNumOperands() == 2 && N0Opc == AArch64ISD::ZIP1 &&
20959 N1Opc == AArch64ISD::ZIP2 && N0.getOperand(0) == N1.getOperand(0) &&
20960 N0.getOperand(1) == N1.getOperand(1)) {
20961 SDValue E0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
20962 DAG.getUNDEF(N0.getValueType()));
20963 SDValue E1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(1),
20964 DAG.getUNDEF(N0.getValueType()));
20965 return DAG.getNode(AArch64ISD::ZIP1, DL, VT, E0, E1);
20966 }
20967
20968 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
20969 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
20970 // canonicalise to that.
20971 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
20972 assert(VT.getScalarSizeInBits() == 64);
20973 return DAG.getNode(AArch64ISD::DUPLANE64, DL, VT, WidenVector(N0, DAG),
20974 DAG.getConstant(0, DL, MVT::i64));
20975 }
20976
20977 // Canonicalise concat_vectors so that the right-hand vector has as few
20978 // bit-casts as possible before its real operation. The primary matching
20979