AArch64ISelLowering.cpp
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/LoopInfo.h"
37 #include "llvm/CodeGen/Analysis.h"
53 #include "llvm/IR/Attributes.h"
54 #include "llvm/IR/Constants.h"
55 #include "llvm/IR/DataLayout.h"
56 #include "llvm/IR/DebugLoc.h"
57 #include "llvm/IR/DerivedTypes.h"
58 #include "llvm/IR/Function.h"
60 #include "llvm/IR/GlobalValue.h"
61 #include "llvm/IR/IRBuilder.h"
62 #include "llvm/IR/Instruction.h"
63 #include "llvm/IR/Instructions.h"
64 #include "llvm/IR/IntrinsicInst.h"
65 #include "llvm/IR/Intrinsics.h"
66 #include "llvm/IR/IntrinsicsAArch64.h"
67 #include "llvm/IR/Module.h"
68 #include "llvm/IR/OperandTraits.h"
69 #include "llvm/IR/PatternMatch.h"
70 #include "llvm/IR/Type.h"
71 #include "llvm/IR/Use.h"
72 #include "llvm/IR/Value.h"
73 #include "llvm/MC/MCRegisterInfo.h"
74 #include "llvm/Support/Casting.h"
75 #include "llvm/Support/CodeGen.h"
77 #include "llvm/Support/Compiler.h"
78 #include "llvm/Support/Debug.h"
81 #include "llvm/Support/KnownBits.h"
87 #include <algorithm>
88 #include <bitset>
89 #include <cassert>
90 #include <cctype>
91 #include <cstdint>
92 #include <cstdlib>
93 #include <iterator>
94 #include <limits>
95 #include <tuple>
96 #include <utility>
97 #include <vector>
98 
99 using namespace llvm;
100 using namespace llvm::PatternMatch;
101 
102 #define DEBUG_TYPE "aarch64-lower"
103 
104 STATISTIC(NumTailCalls, "Number of tail calls");
105 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
106 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
107 
108 // FIXME: The necessary dtprel relocations don't seem to be supported
109 // well in the GNU bfd and gold linkers at the moment. Therefore, by
110 // default, for now, fall back to GeneralDynamic code generation.
112  "aarch64-elf-ldtls-generation", cl::Hidden,
113  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
114  cl::init(false));
115 
116 static cl::opt<bool>
117 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
118  cl::desc("Enable AArch64 logical imm instruction "
119  "optimization"),
120  cl::init(true));
121 
122 // Temporary option added for the purpose of testing functionality added
123 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
124 // in the future, when both implementations are based on MGATHER rather
125 // than the GLD1 nodes added for the SVE gather load intrinsics.
126 static cl::opt<bool>
127 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
128  cl::desc("Combine extends of AArch64 masked "
129  "gather intrinsics"),
130  cl::init(true));
131 
132 /// Value type used for condition codes.
133 static const MVT MVT_CC = MVT::i32;
134 
135 static inline EVT getPackedSVEVectorVT(EVT VT) {
136  switch (VT.getSimpleVT().SimpleTy) {
137  default:
138  llvm_unreachable("unexpected element type for vector");
139  case MVT::i8:
140  return MVT::nxv16i8;
141  case MVT::i16:
142  return MVT::nxv8i16;
143  case MVT::i32:
144  return MVT::nxv4i32;
145  case MVT::i64:
146  return MVT::nxv2i64;
147  case MVT::f16:
148  return MVT::nxv8f16;
149  case MVT::f32:
150  return MVT::nxv4f32;
151  case MVT::f64:
152  return MVT::nxv2f64;
153  case MVT::bf16:
154  return MVT::nxv8bf16;
155  }
156 }
157 
158 // NOTE: Currently there's only a need to return integer vector types. If this
159 // changes then just add an extra "type" parameter.
161  switch (EC.getKnownMinValue()) {
162  default:
163  llvm_unreachable("unexpected element count for vector");
164  case 16:
165  return MVT::nxv16i8;
166  case 8:
167  return MVT::nxv8i16;
168  case 4:
169  return MVT::nxv4i32;
170  case 2:
171  return MVT::nxv2i64;
172  }
173 }
174 
175 static inline EVT getPromotedVTForPredicate(EVT VT) {
177  "Expected scalable predicate vector type!");
178  switch (VT.getVectorMinNumElements()) {
179  default:
180  llvm_unreachable("unexpected element count for vector");
181  case 2:
182  return MVT::nxv2i64;
183  case 4:
184  return MVT::nxv4i32;
185  case 8:
186  return MVT::nxv8i16;
187  case 16:
188  return MVT::nxv16i8;
189  }
190 }
191 
192 /// Returns true if VT's elements occupy the lowest bit positions of its
193 /// associated register class without any intervening space.
194 ///
195 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
196 /// same register class, but only nxv8f16 can be treated as a packed vector.
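/// A packed scalable vector completely fills one SVE register: its size has a
/// known minimum of AArch64::SVEBitsPerBlock (128) bits.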
197 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
198  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
199  "Expected legal vector type!");
200  return VT.isFixedLengthVector() ||
202 }
203 
204 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
205 // predicate and end with a passthru value matching the result type.
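// For example, a unary merging node typically has the form
//   (OPC_MERGE_PASSTHRU Pg, Zn, Passthru)
// where inactive lanes of the result take their value from Passthru (operand
// order shown for illustration; see the corresponding node definitions).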
206 static bool isMergePassthruOpcode(unsigned Opc) {
207  switch (Opc) {
208  default:
209  return false;
239  return true;
240  }
241 }
242 
243 // Returns true if inactive lanes are known to be zeroed by construction.
245  switch (Op.getOpcode()) {
246  default:
247  // We guarantee i1 splat_vectors to zero the other lanes by
248  // implementing it with ptrue and possibly a punpklo for nxv1i1.
249  if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
250  return true;
251  return false;
252  case AArch64ISD::PTRUE:
254  return true;
256  switch (Op.getConstantOperandVal(0)) {
257  default:
258  return false;
259  case Intrinsic::aarch64_sve_ptrue:
260  case Intrinsic::aarch64_sve_pnext:
261  case Intrinsic::aarch64_sve_cmpeq:
262  case Intrinsic::aarch64_sve_cmpne:
263  case Intrinsic::aarch64_sve_cmpge:
264  case Intrinsic::aarch64_sve_cmpgt:
265  case Intrinsic::aarch64_sve_cmphs:
266  case Intrinsic::aarch64_sve_cmphi:
267  case Intrinsic::aarch64_sve_cmpeq_wide:
268  case Intrinsic::aarch64_sve_cmpne_wide:
269  case Intrinsic::aarch64_sve_cmpge_wide:
270  case Intrinsic::aarch64_sve_cmpgt_wide:
271  case Intrinsic::aarch64_sve_cmplt_wide:
272  case Intrinsic::aarch64_sve_cmple_wide:
273  case Intrinsic::aarch64_sve_cmphs_wide:
274  case Intrinsic::aarch64_sve_cmphi_wide:
275  case Intrinsic::aarch64_sve_cmplo_wide:
276  case Intrinsic::aarch64_sve_cmpls_wide:
277  case Intrinsic::aarch64_sve_fcmpeq:
278  case Intrinsic::aarch64_sve_fcmpne:
279  case Intrinsic::aarch64_sve_fcmpge:
280  case Intrinsic::aarch64_sve_fcmpgt:
281  case Intrinsic::aarch64_sve_fcmpuo:
282  return true;
283  }
284  }
285 }
286 
288  const AArch64Subtarget &STI)
289  : TargetLowering(TM), Subtarget(&STI) {
290  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
291  // we have to make something up. Arbitrarily, choose ZeroOrOne.
293  // When comparing vectors the result sets the different elements in the
294  // vector to all-one or all-zero.
296 
297  // Set up the register classes.
298  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
299  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
300 
301  if (Subtarget->hasLS64()) {
302  addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
305  }
306 
307  if (Subtarget->hasFPARMv8()) {
308  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
309  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
310  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
311  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
312  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
313  }
314 
315  if (Subtarget->hasNEON()) {
316  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
317  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
318  // Someone set us up the NEON.
319  addDRTypeForNEON(MVT::v2f32);
320  addDRTypeForNEON(MVT::v8i8);
321  addDRTypeForNEON(MVT::v4i16);
322  addDRTypeForNEON(MVT::v2i32);
323  addDRTypeForNEON(MVT::v1i64);
324  addDRTypeForNEON(MVT::v1f64);
325  addDRTypeForNEON(MVT::v4f16);
326  if (Subtarget->hasBF16())
327  addDRTypeForNEON(MVT::v4bf16);
328 
329  addQRTypeForNEON(MVT::v4f32);
330  addQRTypeForNEON(MVT::v2f64);
331  addQRTypeForNEON(MVT::v16i8);
332  addQRTypeForNEON(MVT::v8i16);
333  addQRTypeForNEON(MVT::v4i32);
334  addQRTypeForNEON(MVT::v2i64);
335  addQRTypeForNEON(MVT::v8f16);
336  if (Subtarget->hasBF16())
337  addQRTypeForNEON(MVT::v8bf16);
338  }
339 
340  if (Subtarget->hasSVE() || Subtarget->hasSME()) {
341  // Add legal sve predicate types
342  addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
343  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
344  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
345  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
346  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
347 
348  // Add legal sve data types
349  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
350  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
351  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
352  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
353 
354  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
355  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
356  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
357  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
358  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
359  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
360 
361  if (Subtarget->hasBF16()) {
362  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
363  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
364  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
365  }
366 
367  if (Subtarget->useSVEForFixedLengthVectors()) {
369  if (useSVEForFixedLengthVectorVT(VT))
370  addRegisterClass(VT, &AArch64::ZPRRegClass);
371 
373  if (useSVEForFixedLengthVectorVT(VT))
374  addRegisterClass(VT, &AArch64::ZPRRegClass);
375  }
376  }
377 
378  // Compute derived properties from the register classes
380 
381  // Provide all sorts of operation actions
417 
421 
425 
427 
428  // Custom lowering hooks are needed for XOR
429  // to fold it into CSINC/CSINV.
432 
433  // Virtually no operation on f128 is legal, but LLVM can't expand them when
434  // there's a valid register class, so we need custom operations in most cases.
458  // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
459  // aren't handled.
460 
461  // Lowering for many of the conversions is actually specified by the non-f128
462  // type. The LowerXXX function will be trivial when f128 isn't involved.
493 
498 
499  // Variable arguments.
504 
505  // Variable-sized objects.
508 
509  if (Subtarget->isTargetWindows())
511  else
513 
514  // Constant pool entries
516 
517  // BlockAddress
519 
520  // AArch64 lacks both left-rotate and popcount instructions.
523  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
526  }
527 
528  // AArch64 doesn't have i32 MULH{S|U}.
531 
532  // AArch64 doesn't have {U|S}MUL_LOHI.
535 
539 
542 
545 
548  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
551  }
558 
559  // Custom lower Add/Sub/Mul with overflow.
572 
581 
590  if (Subtarget->hasFullFP16())
592  else
594 
595  for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
605  }
606 
607  if (!Subtarget->hasFullFP16()) {
608  for (auto Op :
624 
625  // Round-to-integer need custom lowering for fp16, as Promote doesn't work
626  // because the result type is integer.
630 
631  // promote v4f16 to v4f32 when that is known to be safe.
636 
653 
675  }
676 
677  // AArch64 has implementations of a lot of rounding-like FP operations.
678  for (auto Op :
689  for (MVT Ty : {MVT::f32, MVT::f64})
691  if (Subtarget->hasFullFP16())
693  }
694 
695  // Basic strict FP operations are legal
698  for (MVT Ty : {MVT::f32, MVT::f64})
700  if (Subtarget->hasFullFP16())
702  }
703 
704  // Strict conversion to a larger type is legal
705  for (auto VT : {MVT::f32, MVT::f64})
707 
709 
712 
718 
719  // Generate outline atomics library calls only if LSE was not specified for
720  // subtarget
721  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
747 #define LCALLNAMES(A, B, N) \
748  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
749  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
750  setLibcallName(A##N##_REL, #B #N "_rel"); \
751  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
752 #define LCALLNAME4(A, B) \
753  LCALLNAMES(A, B, 1) \
754  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
755 #define LCALLNAME5(A, B) \
756  LCALLNAMES(A, B, 1) \
757  LCALLNAMES(A, B, 2) \
758  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
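  // For example, LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas) registers
  // the names "__aarch64_cas1_relax" through "__aarch64_cas16_acq_rel" for the
  // 1/2/4/8/16-byte outline-atomic compare-and-swap helpers.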
759  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
760  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
761  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
762  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
763  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
764  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
765 #undef LCALLNAMES
766 #undef LCALLNAME4
767 #undef LCALLNAME5
768  }
769 
770  // 128-bit loads and stores can be done without expanding
773 
774  // Aligned 128-bit loads and stores are single-copy atomic according to the
775  // v8.4a spec.
776  if (Subtarget->hasLSE2()) {
779  }
780 
781  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
782  // custom lowering, as there are no un-paired non-temporal stores and
783  // legalization will break up 256 bit inputs.
791 
792  // 256 bit non-temporal loads can be lowered to LDNP. This is done using
793 // custom lowering, as there are no un-paired non-temporal loads and
794 // legalization will break up 256 bit inputs.
802 
803  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
804  // This requires the Performance Monitors extension.
805  if (Subtarget->hasPerfMon())
807 
808  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
809  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
810  // Issue __sincos_stret if available.
813  } else {
816  }
817 
818  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
819  // MSVCRT doesn't have powi; fall back to pow
820  setLibcallName(RTLIB::POWI_F32, nullptr);
821  setLibcallName(RTLIB::POWI_F64, nullptr);
822  }
823 
824  // Make floating-point constants legal for the large code model, so they don't
825  // become loads from the constant pool.
826  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
829  }
830 
831  // AArch64 does not have floating-point extending loads, i1 sign-extending
832  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
833  for (MVT VT : MVT::fp_valuetypes()) {
838  }
839  for (MVT VT : MVT::integer_valuetypes())
841 
849 
853 
854  // Indexed loads and stores are supported.
855  for (unsigned im = (unsigned)ISD::PRE_INC;
856  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
873  }
874 
875  // Trap.
879 
880  // We combine OR nodes for bitfield operations.
882  // Try to create BICs for vector ANDs.
884 
885  // Vector add and sub nodes may conceal a high-half opportunity.
886  // Also, try to fold ADD into CSINC/CSINV.
888  ISD::UINT_TO_FP});
889 
892 
893  // Try and combine setcc with csel
895 
897 
902  if (Subtarget->supportsAddressTopByteIgnored())
904 
906 
908 
910 
914 
916 
918 
920 
921  // In case of strict alignment, avoid an excessive number of byte wide stores.
924  Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
925 
929  Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
930 
933 
936  Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
937 
939 
941 
942  EnableExtLdPromotion = true;
943 
944  // Set required alignment.
946  // Set preferred alignments.
950 
951  // Only change the limit for entries in a jump table if specified by
952  // the sub target, but not at the command line.
953  unsigned MaxJT = STI.getMaximumJumpTableSize();
954  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
956 
957  setHasExtractBitsInsn(true);
958 
960 
962 
963  if (Subtarget->hasNEON()) {
964  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
965  // silliness like this:
966  for (auto Op :
982 
983  for (auto Op :
989 
990  // AArch64 doesn't have direct vector->f32 conversion instructions for
991  // elements smaller than i32, so promote the input to i32 first.
994 
995  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
996  // Nor a direct i32 -> f16 vector conversion. Set it to custom, so the
997  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1000  for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
1002 
1003  if (Subtarget->hasFullFP16()) {
1005 
1014  } else {
1015  // when AArch64 doesn't have fullfp16 support, promote the input
1016  // to i32 first.
1025  }
1026 
1035  for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1040  }
1041 
1042  // AArch64 doesn't have MUL.2d:
1044  // Custom handling for some quad-vector types to detect MULL.
1048 
1049  // Saturates
1050  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1056  }
1057 
1059  MVT::v4i32}) {
1066  }
1067 
1068  // Vector reductions
1069  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1071  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1074 
1076  }
1077  }
1078  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1085  }
1087 
1090  // Likewise, narrowing and extending vector loads/stores aren't handled
1091  // directly.
1092  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1094 
1095  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1098  } else {
1101  }
1104 
1107 
1108  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1109  setTruncStoreAction(VT, InnerVT, Expand);
1110  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1111  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1112  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1113  }
1114  }
1115 
1116  // AArch64 has implementations of a lot of rounding-like FP operations.
1117  for (auto Op :
1122  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1123  setOperationAction(Op, Ty, Legal);
1124  if (Subtarget->hasFullFP16())
1125  for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1126  setOperationAction(Op, Ty, Legal);
1127  }
1128 
1130 
1137 
1138  // ADDP custom lowering
1139  for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1141  // FADDP custom lowering
1142  for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1144  }
1145 
1146  if (Subtarget->hasSME()) {
1148  }
1149 
1150  // FIXME: Move lowering for more nodes here if those are common between
1151  // SVE and SME.
1152  if (Subtarget->hasSVE() || Subtarget->hasSME()) {
1153  for (auto VT :
1157  }
1158  }
1159 
1160  if (Subtarget->hasSME())
1162 
1163  if (Subtarget->hasSVE()) {
1164  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1205 
1211 
1220  }
1221 
1222  // Illegal unpacked integer vector types.
1223  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1226  }
1227 
1228  // Legalize unpacked bitcasts to REINTERPRET_CAST.
1232 
1233  for (auto VT :
1237 
1238  for (auto VT :
1247 
1251 
1252  // There are no legal MVT::nxv16f## based types.
1253  if (VT != MVT::nxv16i1) {
1256  }
1257  }
1258 
1259  // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1267  }
1268 
1269  // Firstly, exclude all scalable vector extending loads/truncating stores,
1270  // including both integer and floating-point scalable vectors.
1271  for (MVT VT : MVT::scalable_vector_valuetypes()) {
1272  for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1273  setTruncStoreAction(VT, InnerVT, Expand);
1274  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1275  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1276  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1277  }
1278  }
1279 
1280  // Then, selectively enable those which we directly support.
1287  for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1294  }
1295 
1296  // SVE supports truncating stores of 64 and 128-bit vectors
1302 
1339 
1352 
1364  }
1365 
1366  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1373  }
1374 
1377 
1378  // NEON doesn't support integer divides, but SVE does
1379  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1383  }
1384 
1385  // NEON doesn't support 64-bit vector integer muls, but SVE does.
1388 
1389  // NEON doesn't support across-vector reductions, but SVE does.
1390  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v2f64})
1392 
1393  // NOTE: Currently this has to happen after computeRegisterProperties rather
1394  // than the preferred option of combining it with the addRegisterClass call.
1395  if (Subtarget->useSVEForFixedLengthVectors()) {
1397  if (useSVEForFixedLengthVectorVT(VT))
1398  addTypeForFixedLengthSVE(VT);
1400  if (useSVEForFixedLengthVectorVT(VT))
1401  addTypeForFixedLengthSVE(VT);
1402 
1403  // 64bit results can mean a bigger than NEON input.
1404  for (auto VT : {MVT::v8i8, MVT::v4i16})
1407 
1408  // 128bit results imply a bigger than NEON input.
1409  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1411  for (auto VT : {MVT::v8f16, MVT::v4f32})
1413 
1414  // These operations are not supported on NEON but SVE can do them.
1435 
1436  // Int operations with no NEON support.
1437  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1444  }
1445 
1446 
1447  // Use SVE for vectors with more than 2 elements.
1448  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1450  }
1451 
1456 
1458  }
1459 
1460  if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1461  // Only required for llvm.aarch64.mops.memset.tag
1463  }
1464 
1465  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1466 
1467  IsStrictFPEnabled = true;
1468 }
1469 
1470 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1471  assert(VT.isVector() && "VT should be a vector type");
1472 
1473  if (VT.isFloatingPoint()) {
1475  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1476  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1477  }
1478 
1479  // Mark vector float intrinsics as expand.
1480  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1489  }
1490 
1491  // But we do support custom-lowering for FCOPYSIGN.
1492  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1493  ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1495 
1507 
1511  for (MVT InnerVT : MVT::all_valuetypes())
1512  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1513 
1514  // CNT supports only B element sizes, then use UADDLP to widen.
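  // For example, a v4i32 CTPOP can be done as CNT on the v16i8 bitcast of the
  // input followed by two UADDLP steps (v16i8 -> v8i16 -> v4i32); this is a
  // rough sketch of the expansion rather than the exact lowering.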
1515  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1517 
1523 
1524  for (unsigned Opcode :
1527  setOperationAction(Opcode, VT, Custom);
1528 
1529  if (!VT.isFloatingPoint())
1531 
1532  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1533  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1534  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1535  setOperationAction(Opcode, VT, Legal);
1536 
1537  // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
1538  // NEON types.
1539  if (VT.isFloatingPoint() &&
1540  VT.getVectorElementType() != MVT::bf16 &&
1541  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1542  for (unsigned Opcode :
1548  setOperationAction(Opcode, VT, Legal);
1549 
1550  // Strict fp extend and trunc are legal
1551  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1553  if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1555 
1556  // FIXME: We could potentially make use of the vector comparison instructions
1557  // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of
1558  // complications:
1559  // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1560  // so we would need to expand when the condition code doesn't match the
1561  // kind of comparison.
1562  // * Some kinds of comparison require more than one FCMXY instruction so
1563  // would need to be expanded instead.
1564  // * The lowering of the non-strict versions involves target-specific ISD
1565  // nodes so we would likely need to add strict versions of all of them and
1566  // handle them appropriately.
1569 
1570  if (Subtarget->isLittleEndian()) {
1571  for (unsigned im = (unsigned)ISD::PRE_INC;
1572  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1575  }
1576  }
1577 }
1578 
1580  EVT OpVT) const {
1581  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1582  if (!Subtarget->hasSVE())
1583  return true;
1584 
1585  // We can only support legal predicate result types. We can use the SVE
1586  // whilelo instruction for generating fixed-width predicates too.
1587  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1588  ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1589  ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1590  return true;
1591 
1592  // The whilelo instruction only works with i32 or i64 scalar inputs.
1593  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1594  return true;
1595 
1596  return false;
1597 }
1598 
1599 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1600  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1601 
1602  // By default everything must be expanded.
1603  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1605 
1606  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1608 
1609  if (VT.isFloatingPoint()) {
1621  }
1622 
1623  // Mark integer truncating stores/extending loads as having custom lowering
1624  if (VT.isInteger()) {
1625  MVT InnerVT = VT.changeVectorElementType(MVT::i8);
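  // Walk the integer element types i8, i16, ... up to (but not including)
  // VT's own element type, marking each truncating-store / extending-load
  // pair as Custom so it can be lowered to its SVE equivalent.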
1626  while (InnerVT != VT) {
1627  setTruncStoreAction(VT, InnerVT, Custom);
1628  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1629  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1630  InnerVT = InnerVT.changeVectorElementType(
1631  MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1632  }
1633  }
1634 
1635  // Mark floating-point truncating stores/extending loads as having custom
1636  // lowering
1637  if (VT.isFloatingPoint()) {
1638  MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1639  while (InnerVT != VT) {
1640  setTruncStoreAction(VT, InnerVT, Custom);
1641  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1642  InnerVT = InnerVT.changeVectorElementType(
1644  }
1645  }
1646 
1647  // Lower fixed length vector operations to scalable equivalents.
1731 }
1732 
1733 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1734  addRegisterClass(VT, &AArch64::FPR64RegClass);
1735  addTypeForNEON(VT);
1736 }
1737 
1738 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1739  addRegisterClass(VT, &AArch64::FPR128RegClass);
1740  addTypeForNEON(VT);
1741 }
1742 
1744  LLVMContext &C, EVT VT) const {
1745  if (!VT.isVector())
1746  return MVT::i32;
1747  if (VT.isScalableVector())
1750 }
1751 
1752 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1753  const APInt &Demanded,
1755  unsigned NewOpc) {
1756  uint64_t OldImm = Imm, NewImm, Enc;
1757  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1758 
1759  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1760  // bimm64.
1761  if (Imm == 0 || Imm == Mask ||
1763  return false;
1764 
1765  unsigned EltSize = Size;
1766  uint64_t DemandedBits = Demanded.getZExtValue();
1767 
1768  // Clear bits that are not demanded.
1769  Imm &= DemandedBits;
1770 
1771  while (true) {
1772  // The goal here is to set the non-demanded bits in a way that minimizes
1773  // the number of switching between 0 and 1. In order to achieve this goal,
1774  // we set the non-demanded bits to the value of the preceding demanded bits.
1775  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1776  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1777  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1778  // The final result is 0b11000011.
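    // The code below performs this copy without iterating over individual
    // bits: RotatedImm marks the bottom of each non-demanded run whose
    // preceding demanded bit is 0; adding NonDemandedBits then carries through
    // exactly those runs, so Ones ends up set only in the non-demanded runs
    // that follow a demanded 1 (Carry adjusts for the run that crosses the
    // element's top bit).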
1779  uint64_t NonDemandedBits = ~DemandedBits;
1780  uint64_t InvertedImm = ~Imm & DemandedBits;
1781  uint64_t RotatedImm =
1782  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1783  NonDemandedBits;
1784  uint64_t Sum = RotatedImm + NonDemandedBits;
1785  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1786  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1787  NewImm = (Imm | Ones) & Mask;
1788 
1789  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1790  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1791  // we halve the element size and continue the search.
1792  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1793  break;
1794 
1795  // We cannot shrink the element size any further if it is 2-bits.
1796  if (EltSize == 2)
1797  return false;
1798 
1799  EltSize /= 2;
1800  Mask >>= EltSize;
1801  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1802 
1803  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1804  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1805  return false;
1806 
1807  // Merge the upper and lower halves of Imm and DemandedBits.
1808  Imm |= Hi;
1809  DemandedBits |= DemandedBitsHi;
1810  }
1811 
1812  ++NumOptimizedImms;
1813 
1814  // Replicate the element across the register width.
1815  while (EltSize < Size) {
1816  NewImm |= NewImm << EltSize;
1817  EltSize *= 2;
1818  }
1819 
1820  (void)OldImm;
1821  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1822  "demanded bits should never be altered");
1823  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1824 
1825  // Create the new constant immediate node.
1826  EVT VT = Op.getValueType();
1827  SDLoc DL(Op);
1828  SDValue New;
1829 
1830  // If the new constant immediate is all-zeros or all-ones, let the target
1831  // independent DAG combine optimize this node.
1832  if (NewImm == 0 || NewImm == OrigMask) {
1833  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1834  TLO.DAG.getConstant(NewImm, DL, VT));
1835  // Otherwise, create a machine node so that target independent DAG combine
1836  // doesn't undo this optimization.
1837  } else {
1838  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1839  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1840  New = SDValue(
1841  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1842  }
1843 
1844  return TLO.CombineTo(Op, New);
1845 }
1846 
1848  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1849  TargetLoweringOpt &TLO) const {
1850  // Delay this optimization to as late as possible.
1851  if (!TLO.LegalOps)
1852  return false;
1853 
1855  return false;
1856 
1857  EVT VT = Op.getValueType();
1858  if (VT.isVector())
1859  return false;
1860 
1861  unsigned Size = VT.getSizeInBits();
1862  assert((Size == 32 || Size == 64) &&
1863  "i32 or i64 is expected after legalization.");
1864 
1865  // Exit early if we demand all bits.
1866  if (DemandedBits.countPopulation() == Size)
1867  return false;
1868 
1869  unsigned NewOpc;
1870  switch (Op.getOpcode()) {
1871  default:
1872  return false;
1873  case ISD::AND:
1874  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1875  break;
1876  case ISD::OR:
1877  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1878  break;
1879  case ISD::XOR:
1880  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1881  break;
1882  }
1883  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1884  if (!C)
1885  return false;
1886  uint64_t Imm = C->getZExtValue();
1887  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1888 }
1889 
1890 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1891 /// Mask are known to be either zero or one and return them in Known.
1893  const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
1894  const SelectionDAG &DAG, unsigned Depth) const {
1895  switch (Op.getOpcode()) {
1896  default:
1897  break;
1898  case AArch64ISD::DUP: {
1899  SDValue SrcOp = Op.getOperand(0);
1900  Known = DAG.computeKnownBits(SrcOp, Depth + 1);
1901  if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
1902  assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
1903  "Expected DUP implicit truncation");
1904  Known = Known.trunc(Op.getScalarValueSizeInBits());
1905  }
1906  break;
1907  }
1908  case AArch64ISD::CSEL: {
1909  KnownBits Known2;
1910  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1911  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1912  Known = KnownBits::commonBits(Known, Known2);
1913  break;
1914  }
1915  case AArch64ISD::BICi: {
1916  // Compute the bit cleared value.
1917  uint64_t Mask =
1918  ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1919  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1920  Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
1921  break;
1922  }
1923  case AArch64ISD::VLSHR: {
1924  KnownBits Known2;
1925  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1926  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1927  Known = KnownBits::lshr(Known, Known2);
1928  break;
1929  }
1930  case AArch64ISD::VASHR: {
1931  KnownBits Known2;
1932  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1933  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1934  Known = KnownBits::ashr(Known, Known2);
1935  break;
1936  }
1937  case AArch64ISD::LOADgot:
1938  case AArch64ISD::ADDlow: {
1939  if (!Subtarget->isTargetILP32())
1940  break;
1941  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1942  Known.Zero = APInt::getHighBitsSet(64, 32);
1943  break;
1944  }
1946  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1947  Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1948  break;
1949  }
1950  case ISD::INTRINSIC_W_CHAIN: {
1951  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1952  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1953  switch (IntID) {
1954  default: return;
1955  case Intrinsic::aarch64_ldaxr:
1956  case Intrinsic::aarch64_ldxr: {
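    // ldxr/ldaxr zero-extend the loaded value to the full register width, so
    // every bit above the memory access width is known to be zero.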
1957  unsigned BitWidth = Known.getBitWidth();
1958  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1959  unsigned MemBits = VT.getScalarSizeInBits();
1960  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1961  return;
1962  }
1963  }
1964  break;
1965  }
1967  case ISD::INTRINSIC_VOID: {
1968  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1969  switch (IntNo) {
1970  default:
1971  break;
1972  case Intrinsic::aarch64_neon_umaxv:
1973  case Intrinsic::aarch64_neon_uminv: {
1974  // Figure out the datatype of the vector operand. The UMINV instruction
1975  // will zero extend the result, so we can mark as known zero all the
1976  // bits larger than the element datatype. 32-bit or larger doesn't need
1977  // this as those are legal types and will be handled by isel directly.
1978  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1979  unsigned BitWidth = Known.getBitWidth();
1980  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1981  assert(BitWidth >= 8 && "Unexpected width!");
1983  Known.Zero |= Mask;
1984  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1985  assert(BitWidth >= 16 && "Unexpected width!");
1987  Known.Zero |= Mask;
1988  }
1989  break;
1990  } break;
1991  }
1992  }
1993  }
1994 }
1995 
1997  EVT) const {
1998  return MVT::i64;
1999 }
2000 
2002  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2003  bool *Fast) const {
2004  if (Subtarget->requiresStrictAlign())
2005  return false;
2006 
2007  if (Fast) {
2008  // Some CPUs are fine with unaligned stores except for 128-bit ones.
2009  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
2010  // See comments in performSTORECombine() for more details about
2011  // these conditions.
2012 
2013  // Code that uses clang vector extensions can mark that it
2014  // wants unaligned accesses to be treated as fast by
2015  // underspecifying alignment to be 1 or 2.
2016  Alignment <= 2 ||
2017 
2018  // Disregard v2i64. Memcpy lowering produces those and splitting
2019  // them regresses performance on micro-benchmarks and olden/bh.
2020  VT == MVT::v2i64;
2021  }
2022  return true;
2023 }
2024 
2025 // Same as above but handling LLTs instead.
2027  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2028  bool *Fast) const {
2029  if (Subtarget->requiresStrictAlign())
2030  return false;
2031 
2032  if (Fast) {
2033  // Some CPUs are fine with unaligned stores except for 128-bit ones.
2034  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2035  Ty.getSizeInBytes() != 16 ||
2036  // See comments in performSTORECombine() for more details about
2037  // these conditions.
2038 
2039  // Code that uses clang vector extensions can mark that it
2040  // wants unaligned accesses to be treated as fast by
2041  // underspecifying alignment to be 1 or 2.
2042  Alignment <= 2 ||
2043 
2044  // Disregard v2i64. Memcpy lowering produces those and splitting
2045  // them regresses performance on micro-benchmarks and olden/bh.
2046  Ty == LLT::fixed_vector(2, 64);
2047  }
2048  return true;
2049 }
2050 
2051 FastISel *
2053  const TargetLibraryInfo *libInfo) const {
2054  return AArch64::createFastISel(funcInfo, libInfo);
2055 }
2056 
2057 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2058 #define MAKE_CASE(V) \
2059  case V: \
2060  return #V;
2061  switch ((AArch64ISD::NodeType)Opcode) {
2063  break;
2355  }
2356 #undef MAKE_CASE
2357  return nullptr;
2358 }
2359 
2362  MachineBasicBlock *MBB) const {
2363  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2364  // phi node:
2365 
2366  // OrigBB:
2367  // [... previous instrs leading to comparison ...]
2368  // b.ne TrueBB
2369  // b EndBB
2370  // TrueBB:
2371  // ; Fallthrough
2372  // EndBB:
2373  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2374 
2375  MachineFunction *MF = MBB->getParent();
2376  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2377  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2378  DebugLoc DL = MI.getDebugLoc();
2380 
2381  Register DestReg = MI.getOperand(0).getReg();
2382  Register IfTrueReg = MI.getOperand(1).getReg();
2383  Register IfFalseReg = MI.getOperand(2).getReg();
2384  unsigned CondCode = MI.getOperand(3).getImm();
2385  bool NZCVKilled = MI.getOperand(4).isKill();
2386 
2387  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2388  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2389  MF->insert(It, TrueBB);
2390  MF->insert(It, EndBB);
2391 
2392  // Transfer rest of current basic-block to EndBB
2393  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2394  MBB->end());
2396 
2397  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2398  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2399  MBB->addSuccessor(TrueBB);
2400  MBB->addSuccessor(EndBB);
2401 
2402  // TrueBB falls through to the end.
2403  TrueBB->addSuccessor(EndBB);
2404 
2405  if (!NZCVKilled) {
2406  TrueBB->addLiveIn(AArch64::NZCV);
2407  EndBB->addLiveIn(AArch64::NZCV);
2408  }
2409 
2410  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2411  .addReg(IfTrueReg)
2412  .addMBB(TrueBB)
2413  .addReg(IfFalseReg)
2414  .addMBB(MBB);
2415 
2416  MI.eraseFromParent();
2417  return EndBB;
2418 }
2419 
2421  MachineInstr &MI, MachineBasicBlock *BB) const {
2423  BB->getParent()->getFunction().getPersonalityFn())) &&
2424  "SEH does not use catchret!");
2425  return BB;
2426 }
2427 
2429 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2430  MachineInstr &MI,
2431  MachineBasicBlock *BB) const {
2432  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2433  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2434 
2435  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2436  MIB.add(MI.getOperand(1)); // slice index register
2437  MIB.add(MI.getOperand(2)); // slice index offset
2438  MIB.add(MI.getOperand(3)); // pg
2439  MIB.add(MI.getOperand(4)); // base
2440  MIB.add(MI.getOperand(5)); // offset
2441 
2442  MI.eraseFromParent(); // The pseudo is gone now.
2443  return BB;
2444 }
2445 
2448  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2449  MachineInstrBuilder MIB =
2450  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2451 
2452  MIB.addReg(AArch64::ZA, RegState::Define);
2453  MIB.add(MI.getOperand(0)); // Vector select register
2454  MIB.add(MI.getOperand(1)); // Vector select offset
2455  MIB.add(MI.getOperand(2)); // Base
2456  MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2457 
2458  MI.eraseFromParent(); // The pseudo is gone now.
2459  return BB;
2460 }
2461 
2463 AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
2464  MachineInstr &MI, MachineBasicBlock *BB) const {
2465  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2466  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2467 
2468  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2469  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2470  MIB.add(MI.getOperand(1)); // pn
2471  MIB.add(MI.getOperand(2)); // pm
2472  MIB.add(MI.getOperand(3)); // zn
2473  MIB.add(MI.getOperand(4)); // zm
2474 
2475  MI.eraseFromParent(); // The pseudo is gone now.
2476  return BB;
2477 }
2478 
2480 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
2481  MachineInstr &MI,
2482  MachineBasicBlock *BB) const {
2483  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2484  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2485 
2486  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2487  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2488  MIB.add(MI.getOperand(1)); // Slice index register
2489  MIB.add(MI.getOperand(2)); // Slice index offset
2490  MIB.add(MI.getOperand(3)); // pg
2491  MIB.add(MI.getOperand(4)); // zn
2492 
2493  MI.eraseFromParent(); // The pseudo is gone now.
2494  return BB;
2495 }
2496 
2499  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2500  MachineInstrBuilder MIB =
2501  BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2502  MIB.add(MI.getOperand(0)); // Mask
2503 
2504  unsigned Mask = MI.getOperand(0).getImm();
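  // Each bit of the 8-bit mask selects one 64-bit-element ZA tile
  // (ZAD0..ZAD7); mark every selected tile as implicitly defined.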
2505  for (unsigned I = 0; I < 8; I++) {
2506  if (Mask & (1 << I))
2507  MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2508  }
2509 
2510  MI.eraseFromParent(); // The pseudo is gone now.
2511  return BB;
2512 }
2513 
2515 AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
2516  MachineInstr &MI,
2517  MachineBasicBlock *BB) const {
2518  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2519  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2520 
2521  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2522  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2523  MIB.add(MI.getOperand(1)); // pn
2524  MIB.add(MI.getOperand(2)); // pm
2525  MIB.add(MI.getOperand(3)); // zn
2526 
2527  MI.eraseFromParent(); // The pseudo is gone now.
2528  return BB;
2529 }
2530 
2532  MachineInstr &MI, MachineBasicBlock *BB) const {
2533  switch (MI.getOpcode()) {
2534  default:
2535 #ifndef NDEBUG
2536  MI.dump();
2537 #endif
2538  llvm_unreachable("Unexpected instruction for custom inserter!");
2539 
2540  case AArch64::F128CSEL:
2541  return EmitF128CSEL(MI, BB);
2542  case TargetOpcode::STATEPOINT:
2543  // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2544  // while the bl call instruction (where the statepoint will be lowered at the
2545  // end) has an implicit def. This def is early-clobber as it will be set at
2546  // the moment of the call and earlier than any use is read.
2547  // Add this implicit dead def here as a workaround.
2548  MI.addOperand(*MI.getMF(),
2550  AArch64::LR, /*isDef*/ true,
2551  /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2552  /*isUndef*/ false, /*isEarlyClobber*/ true));
2553  [[fallthrough]];
2556  return emitPatchPoint(MI, BB);
2557 
2558  case AArch64::CATCHRET:
2559  return EmitLoweredCatchRet(MI, BB);
2560  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2561  return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2562  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2563  return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2564  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2565  return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2566  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2567  return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2568  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2569  return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2570  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2571  return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2572  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2573  return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2574  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2575  return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2576  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2577  return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2578  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2579  return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2580  case AArch64::LDR_ZA_PSEUDO:
2581  return EmitFill(MI, BB);
2582  case AArch64::BFMOPA_MPPZZ_PSEUDO:
2583  return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
2584  case AArch64::BFMOPS_MPPZZ_PSEUDO:
2585  return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
2586  case AArch64::FMOPAL_MPPZZ_PSEUDO:
2587  return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
2588  case AArch64::FMOPSL_MPPZZ_PSEUDO:
2589  return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
2590  case AArch64::FMOPA_MPPZZ_S_PSEUDO:
2591  return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2592  case AArch64::FMOPS_MPPZZ_S_PSEUDO:
2593  return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2594  case AArch64::FMOPA_MPPZZ_D_PSEUDO:
2595  return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2596  case AArch64::FMOPS_MPPZZ_D_PSEUDO:
2597  return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2598  case AArch64::SMOPA_MPPZZ_S_PSEUDO:
2599  return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2600  case AArch64::SMOPS_MPPZZ_S_PSEUDO:
2601  return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2602  case AArch64::UMOPA_MPPZZ_S_PSEUDO:
2603  return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2604  case AArch64::UMOPS_MPPZZ_S_PSEUDO:
2605  return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2606  case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
2607  return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2608  case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
2609  return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2610  case AArch64::USMOPA_MPPZZ_S_PSEUDO:
2611  return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2612  case AArch64::USMOPS_MPPZZ_S_PSEUDO:
2613  return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2614  case AArch64::SMOPA_MPPZZ_D_PSEUDO:
2615  return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2616  case AArch64::SMOPS_MPPZZ_D_PSEUDO:
2617  return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2618  case AArch64::UMOPA_MPPZZ_D_PSEUDO:
2619  return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2620  case AArch64::UMOPS_MPPZZ_D_PSEUDO:
2621  return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2622  case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
2623  return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2624  case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
2625  return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2626  case AArch64::USMOPA_MPPZZ_D_PSEUDO:
2627  return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2628  case AArch64::USMOPS_MPPZZ_D_PSEUDO:
2629  return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2630  case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
2631  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
2632  BB);
2633  case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
2634  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
2635  BB);
2636  case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
2637  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
2638  BB);
2639  case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
2640  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
2641  BB);
2642  case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
2643  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
2644  BB);
2645  case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
2646  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
2647  BB);
2648  case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
2649  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
2650  BB);
2651  case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
2652  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
2653  BB);
2654  case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
2655  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
2656  BB);
2657  case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
2658  return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
2659  BB);
2660  case AArch64::ZERO_M_PSEUDO:
2661  return EmitZero(MI, BB);
2662  case AArch64::ADDHA_MPPZ_PSEUDO_S:
2663  return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
2664  case AArch64::ADDVA_MPPZ_PSEUDO_S:
2665  return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
2666  case AArch64::ADDHA_MPPZ_PSEUDO_D:
2667  return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
2668  case AArch64::ADDVA_MPPZ_PSEUDO_D:
2669  return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
2670  }
2671 }
2672 
2673 //===----------------------------------------------------------------------===//
2674 // AArch64 Lowering private implementation.
2675 //===----------------------------------------------------------------------===//
2676 
2677 //===----------------------------------------------------------------------===//
2678 // Lowering Code
2679 //===----------------------------------------------------------------------===//
2680 
2681 // Forward declarations of SVE fixed length lowering helpers
2686  SelectionDAG &DAG);
2688  EVT VT);
2689 
2690 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2691 static bool isZerosVector(const SDNode *N) {
2692  // Look through a bit convert.
2693  while (N->getOpcode() == ISD::BITCAST)
2694  N = N->getOperand(0).getNode();
2695 
2697  return true;
2698 
2699  if (N->getOpcode() != AArch64ISD::DUP)
2700  return false;
2701 
2702  auto Opnd0 = N->getOperand(0);
2703  return isNullConstant(Opnd0) || isNullFPConstant(Opnd0);
2704 }
2705 
2706 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2707 /// CC
2709  switch (CC) {
2710  default:
2711  llvm_unreachable("Unknown condition code!");
2712  case ISD::SETNE:
2713  return AArch64CC::NE;
2714  case ISD::SETEQ:
2715  return AArch64CC::EQ;
2716  case ISD::SETGT:
2717  return AArch64CC::GT;
2718  case ISD::SETGE:
2719  return AArch64CC::GE;
2720  case ISD::SETLT:
2721  return AArch64CC::LT;
2722  case ISD::SETLE:
2723  return AArch64CC::LE;
2724  case ISD::SETUGT:
2725  return AArch64CC::HI;
2726  case ISD::SETUGE:
2727  return AArch64CC::HS;
2728  case ISD::SETULT:
2729  return AArch64CC::LO;
2730  case ISD::SETULE:
2731  return AArch64CC::LS;
2732  }
2733 }
2734 
2735 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2738  AArch64CC::CondCode &CondCode2) {
2739  CondCode2 = AArch64CC::AL;
2740  switch (CC) {
2741  default:
2742  llvm_unreachable("Unknown FP condition!");
2743  case ISD::SETEQ:
2744  case ISD::SETOEQ:
2746  break;
2747  case ISD::SETGT:
2748  case ISD::SETOGT:
2750  break;
2751  case ISD::SETGE:
2752  case ISD::SETOGE:
2754  break;
2755  case ISD::SETOLT:
2757  break;
2758  case ISD::SETOLE:
2760  break;
2761  case ISD::SETONE:
2763  CondCode2 = AArch64CC::GT;
2764  break;
2765  case ISD::SETO:
2767  break;
2768  case ISD::SETUO:
2770  break;
2771  case ISD::SETUEQ:
2773  CondCode2 = AArch64CC::VS;
2774  break;
2775  case ISD::SETUGT:
2777  break;
2778  case ISD::SETUGE:
2780  break;
2781  case ISD::SETLT:
2782  case ISD::SETULT:
2784  break;
2785  case ISD::SETLE:
2786  case ISD::SETULE:
2788  break;
2789  case ISD::SETNE:
2790  case ISD::SETUNE:
2792  break;
2793  }
2794 }
2795 
2796 /// Convert a DAG fp condition code to an AArch64 CC.
2797 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2798 /// should be AND'ed instead of OR'ed.
2801  AArch64CC::CondCode &CondCode2) {
2802  CondCode2 = AArch64CC::AL;
2803  switch (CC) {
2804  default:
2805  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2806  assert(CondCode2 == AArch64CC::AL);
2807  break;
2808  case ISD::SETONE:
2809  // (a one b)
2810  // == ((a olt b) || (a ogt b))
2811  // == ((a ord b) && (a une b))
2813  CondCode2 = AArch64CC::NE;
2814  break;
2815  case ISD::SETUEQ:
2816  // (a ueq b)
2817  // == ((a uno b) || (a oeq b))
2818  // == ((a ule b) && (a uge b))
2820  CondCode2 = AArch64CC::LE;
2821  break;
2822  }
2823 }
2824 
2825 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2826 /// CC usable with the vector instructions. Fewer operations are available
2827 /// without a real NZCV register, so we have to use less efficient combinations
2828 /// to get the same effect.
2831  AArch64CC::CondCode &CondCode2,
2832  bool &Invert) {
2833  Invert = false;
2834  switch (CC) {
2835  default:
2836  // Mostly the scalar mappings work fine.
2837  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2838  break;
2839  case ISD::SETUO:
2840  Invert = true;
2841  [[fallthrough]];
2842  case ISD::SETO:
2844  CondCode2 = AArch64CC::GE;
2845  break;
2846  case ISD::SETUEQ:
2847  case ISD::SETULT:
2848  case ISD::SETULE:
2849  case ISD::SETUGT:
2850  case ISD::SETUGE:
2851  // All of the compare-mask comparisons are ordered, but we can switch
2852  // between the two by a double inversion. E.g. ULE == !OGT.
2853  Invert = true;
2854  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2855  CondCode, CondCode2);
2856  break;
2857  }
2858 }
2859 
2861  // Matches AArch64DAGToDAGISel::SelectArithImmed().
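  // Arithmetic immediates are an unsigned 12-bit value, optionally shifted
  // left by 12, i.e. 0..4095 or a multiple of 4096 up to 0xFFF000.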
2862  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2863  LLVM_DEBUG(dbgs() << "Is imm " << C
2864  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2865  return IsLegal;
2866 }
2867 
2868 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2869 // the grounds that "op1 - (-op2) == op1 + op2"? Not always, the C and V flags
2870 // can be set differently by this operation. It comes down to whether
2871 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2872 // everything is fine. If not then the optimization is wrong. Thus general
2873 // comparisons are only valid if op2 != 0.
2874 //
2875 // So, finally, the only LLVM-native comparisons that don't mention C and V
2876 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2877 // the absence of information about op2.
2878 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2879  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2880  (CC == ISD::SETEQ || CC == ISD::SETNE);
2881 }
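// A concrete illustration of why only EQ/NE are safe: with i32 operands and
// op2 == INT32_MIN, (sub 0, op2) wraps back to INT32_MIN, so
//   CMP 1, INT32_MIN   computes 1 - INT32_MIN, which overflows (V set), while
//   CMN 1, INT32_MIN   computes 1 + INT32_MIN, which does not (V clear),
// and a SETGT test would read different answers from the two. The result bits,
// and therefore Z and the SETEQ/SETNE tests, always agree between the two forms.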
2882 
2883 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2884  SelectionDAG &DAG, SDValue Chain,
2885  bool IsSignaling) {
2886  EVT VT = LHS.getValueType();
2887  assert(VT != MVT::f128);
2888 
2889  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2890 
2891  if (VT == MVT::f16 && !FullFP16) {
2892  LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2893  {Chain, LHS});
2894  RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2895  {LHS.getValue(1), RHS});
2896  Chain = RHS.getValue(1);
2897  VT = MVT::f32;
2898  }
2899  unsigned Opcode =
2900  IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2901  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2902 }
2903 
2904 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2905  const SDLoc &dl, SelectionDAG &DAG) {
2906  EVT VT = LHS.getValueType();
2907  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2908 
2909  if (VT.isFloatingPoint()) {
2910  assert(VT != MVT::f128);
2911  if (VT == MVT::f16 && !FullFP16) {
2912  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2913  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2914  VT = MVT::f32;
2915  }
2916  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2917  }
2918 
2919  // The CMP instruction is just an alias for SUBS, and representing it as
2920  // SUBS means that it's possible to get CSE with subtract operations.
2921  // A later phase can perform the optimization of setting the destination
2922  // register to WZR/XZR if it ends up being unused.
2923  unsigned Opcode = AArch64ISD::SUBS;
2924 
2925  if (isCMN(RHS, CC)) {
2926  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2927  Opcode = AArch64ISD::ADDS;
2928  RHS = RHS.getOperand(1);
2929  } else if (isCMN(LHS, CC)) {
2930  // As we are looking for EQ/NE compares, the operands can be commuted; can
2931  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2932  Opcode = AArch64ISD::ADDS;
2933  LHS = LHS.getOperand(1);
2934  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2935  if (LHS.getOpcode() == ISD::AND) {
2936  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2937  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2938  // of the signed comparisons.
2939  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2940  DAG.getVTList(VT, MVT_CC),
2941  LHS.getOperand(0),
2942  LHS.getOperand(1));
2943  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2944  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2945  return ANDSNode.getValue(1);
2946  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2947  // Use result of ANDS
2948  return LHS.getValue(1);
2949  }
2950  }
2951 
2952  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2953  .getValue(1);
2954 }
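// Illustrative selections made by the code above (the assembly is a sketch,
// not captured backend output):
//   (seteq (and w0, #0xff), 0)  ->  tst w0, #0xff   // ANDS into the zero register
//   (seteq w0, (sub 0, w1))     ->  cmn w0, w1      // ADDS into the zero register
//   anything else               ->  cmp (SUBS), possibly CSE'd with a real subtract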
2955 
2956 /// \defgroup AArch64CCMP CMP;CCMP matching
2957 ///
2958 /// These functions deal with the formation of CMP;CCMP;... sequences.
2959 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2960 /// a comparison. They set the NZCV flags to a predefined value if their
2961 /// predicate is false. This allows expressing arbitrary conjunctions; for
2962 /// example, "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2963 /// can be expressed as:
2964 /// cmp A
2965 /// ccmp B, inv(CB), CA
2966 /// check for CB flags
2967 ///
2968 /// This naturally lets us implement chains of AND operations with SETCC
2969 /// operands. And we can even implement some other situations by transforming
2970 /// them:
2971 /// - We can implement (NEG SETCC), i.e. negating a single comparison, by
2972 /// negating the flags used in the CCMP/FCCMP operations.
2973 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2974 /// by negating the flags we test for afterwards. i.e.
2975 /// NEG (CMP CCMP CCMP ...) can be implemented.
2976 /// - Note that we can only ever negate all previously processed results.
2977 /// What we can not implement by flipping the flags to test is a negation
2978 /// of two sub-trees (because the negation affects all sub-trees emitted so
2979 /// far, so the 2nd sub-tree we emit would also affect the first).
2980 /// With those tools we can implement some OR operations:
2981 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2982 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2983 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2984 /// elimination rules from earlier to implement the whole thing as a
2985 /// CCMP/FCCMP chain.
2986 ///
2987 /// As complete example:
2988 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2989 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2990 /// can be reassociated to:
2991 /// or (and (setCC (cmp C)) (setCD (cmp D)))
2992 /// (or (setCA (cmp A)) (setCB (cmp B)))
2993 /// can be transformed to:
2994 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2995 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
2996 /// which can be implemented as:
2997 /// cmp C
2998 /// ccmp D, inv(CD), CC
2999 /// ccmp A, CA, inv(CD)
3000 /// ccmp B, CB, inv(CA)
3001 /// check for CB flags
3002 ///
3003 /// A counterexample is "or (and A B) (and C D)", which translates to
3004 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
3005 /// can implement only one of the two inner (not) operations, not both.
3006 /// @{
3007 
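// Worked example (hedged; register allocation, immediates and condition choices
// are illustrative, not output captured from this backend): a source condition
// such as
//   if (a == 17 || b == 29) ...
// can be lowered to a single flag-setting chain instead of two branches:
//   cmp  w0, #17
//   ccmp w1, #29, #4, ne   // only compares b when a != 17; otherwise forces Z=1
//   b.eq <taken>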
3008 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
3009 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
3010  ISD::CondCode CC, SDValue CCOp,
3011  AArch64CC::CondCode Predicate,
3012  AArch64CC::CondCode OutCC,
3013  const SDLoc &DL, SelectionDAG &DAG) {
3014  unsigned Opcode = 0;
3015  const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3016 
3017  if (LHS.getValueType().isFloatingPoint()) {
3018  assert(LHS.getValueType() != MVT::f128);
3019  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3020  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3021  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3022  }
3023  Opcode = AArch64ISD::FCCMP;
3024  } else if (RHS.getOpcode() == ISD::SUB) {
3025  SDValue SubOp0 = RHS.getOperand(0);
3026  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3027  // See emitComparison() on why we can only do this for SETEQ and SETNE.
3028  Opcode = AArch64ISD::CCMN;
3029  RHS = RHS.getOperand(1);
3030  }
3031  }
3032  if (Opcode == 0)
3033  Opcode = AArch64ISD::CCMP;
3034 
3035  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3036  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3037  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3038  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3039  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3040 }
3041 
3042 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3043 /// expressed as a conjunction. See \ref AArch64CCMP.
3044 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
3045 /// changing the conditions on the SETCC tests.
3046 /// (this means we can call emitConjunctionRec() with
3047 /// Negate==true on this sub-tree)
3048 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
3049 /// cannot do the negation naturally. We are required to
3050 /// emit the subtree first in this case.
3051 /// \param WillNegate Is true if we are called when the result of this
3052 /// subexpression must be negated. This happens when the
3053 /// outer expression is an OR. We can use this fact to know
3054 /// that we have a double negation (or (or ...) ...) that
3055 /// can be implemented for free.
3056 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3057  bool &MustBeFirst, bool WillNegate,
3058  unsigned Depth = 0) {
3059  if (!Val.hasOneUse())
3060  return false;
3061  unsigned Opcode = Val->getOpcode();
3062  if (Opcode == ISD::SETCC) {
3063  if (Val->getOperand(0).getValueType() == MVT::f128)
3064  return false;
3065  CanNegate = true;
3066  MustBeFirst = false;
3067  return true;
3068  }
3069  // Protect against exponential runtime and stack overflow.
3070  if (Depth > 6)
3071  return false;
3072  if (Opcode == ISD::AND || Opcode == ISD::OR) {
3073  bool IsOR = Opcode == ISD::OR;
3074  SDValue O0 = Val->getOperand(0);
3075  SDValue O1 = Val->getOperand(1);
3076  bool CanNegateL;
3077  bool MustBeFirstL;
3078  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3079  return false;
3080  bool CanNegateR;
3081  bool MustBeFirstR;
3082  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3083  return false;
3084 
3085  if (MustBeFirstL && MustBeFirstR)
3086  return false;
3087 
3088  if (IsOR) {
3089  // For an OR expression we need to be able to naturally negate at least
3090  // one side or we cannot do the transformation at all.
3091  if (!CanNegateL && !CanNegateR)
3092  return false;
3093  // If the result of the OR will be negated and we can naturally negate
3094  // the leaves, then this sub-tree as a whole negates naturally.
3095  CanNegate = WillNegate && CanNegateL && CanNegateR;
3096  // If we cannot naturally negate the whole sub-tree, then this must be
3097  // emitted first.
3098  MustBeFirst = !CanNegate;
3099  } else {
3100  assert(Opcode == ISD::AND && "Must be OR or AND");
3101  // We cannot naturally negate an AND operation.
3102  CanNegate = false;
3103  MustBeFirst = MustBeFirstL || MustBeFirstR;
3104  }
3105  return true;
3106  }
3107  return false;
3108 }
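// Examples of the classification (illustrative, with WillNegate == false):
//   (and (setcc a, b, olt) (setcc c, d, eq))  -> true  (plain conjunction)
//   (or  (setcc a, b, olt) (setcc c, d, eq))  -> true  (both leaves negatable,
//                                                       so MustBeFirst is set)
//   (or (and (setcc ..) (setcc ..))
//       (and (setcc ..) (setcc ..)))          -> false (neither AND side can be
//                                                       negated naturally)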
3109 
3110 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3111 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3112 /// Tries to transform the given i1-producing node @p Val into a series of
3113 /// compare and conditional compare operations. @returns an NZCV-flags-producing
3114 /// node and sets @p OutCC to the flags that should be tested, or returns
3115 /// SDValue() if the transformation was not possible.
3116 /// \p Negate is true if we want this sub-tree to be negated just by changing
3117 /// SETCC conditions.
3118 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3119  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3120  AArch64CC::CondCode Predicate) {
3121  // We're at a tree leaf, produce a conditional comparison operation.
3122  unsigned Opcode = Val->getOpcode();
3123  if (Opcode == ISD::SETCC) {
3124  SDValue LHS = Val->getOperand(0);
3125  SDValue RHS = Val->getOperand(1);
3126  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3127  bool isInteger = LHS.getValueType().isInteger();
3128  if (Negate)
3129  CC = getSetCCInverse(CC, LHS.getValueType());
3130  SDLoc DL(Val);
3131  // Determine OutCC and handle FP special case.
3132  if (isInteger) {
3133  OutCC = changeIntCCToAArch64CC(CC);
3134  } else {
3135  assert(LHS.getValueType().isFloatingPoint());
3136  AArch64CC::CondCode ExtraCC;
3137  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3138  // Some floating point conditions can't be tested with a single condition
3139  // code. Construct an additional comparison in this case.
3140  if (ExtraCC != AArch64CC::AL) {
3141  SDValue ExtraCmp;
3142  if (!CCOp.getNode())
3143  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3144  else
3145  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3146  ExtraCC, DL, DAG);
3147  CCOp = ExtraCmp;
3148  Predicate = ExtraCC;
3149  }
3150  }
3151 
3152  // Produce a normal comparison if we are first in the chain
3153  if (!CCOp)
3154  return emitComparison(LHS, RHS, CC, DL, DAG);
3155  // Otherwise produce a ccmp.
3156  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3157  DAG);
3158  }
3159  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3160 
3161  bool IsOR = Opcode == ISD::OR;
3162 
3163  SDValue LHS = Val->getOperand(0);
3164  bool CanNegateL;
3165  bool MustBeFirstL;
3166  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3167  assert(ValidL && "Valid conjunction/disjunction tree");
3168  (void)ValidL;
3169 
3170  SDValue RHS = Val->getOperand(1);
3171  bool CanNegateR;
3172  bool MustBeFirstR;
3173  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3174  assert(ValidR && "Valid conjunction/disjunction tree");
3175  (void)ValidR;
3176 
3177  // Swap sub-tree that must come first to the right side.
3178  if (MustBeFirstL) {
3179  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3180  std::swap(LHS, RHS);
3181  std::swap(CanNegateL, CanNegateR);
3182  std::swap(MustBeFirstL, MustBeFirstR);
3183  }
3184 
3185  bool NegateR;
3186  bool NegateAfterR;
3187  bool NegateL;
3188  bool NegateAfterAll;
3189  if (Opcode == ISD::OR) {
3190  // Swap the sub-tree that we can negate naturally to the left.
3191  if (!CanNegateL) {
3192  assert(CanNegateR && "at least one side must be negatable");
3193  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3194  assert(!Negate);
3195  std::swap(LHS, RHS);
3196  NegateR = false;
3197  NegateAfterR = true;
3198  } else {
3199  // Negate the left sub-tree if possible, otherwise negate the result.
3200  NegateR = CanNegateR;
3201  NegateAfterR = !CanNegateR;
3202  }
3203  NegateL = true;
3204  NegateAfterAll = !Negate;
3205  } else {
3206  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3207  assert(!Negate && "Valid conjunction/disjunction tree");
3208 
3209  NegateL = false;
3210  NegateR = false;
3211  NegateAfterR = false;
3212  NegateAfterAll = false;
3213  }
3214 
3215  // Emit sub-trees.
3216  AArch64CC::CondCode RHSCC;
3217  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3218  if (NegateAfterR)
3219  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3220  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3221  if (NegateAfterAll)
3222  OutCC = AArch64CC::getInvertedCondCode(OutCC);
3223  return CmpL;
3224 }
3225 
3226 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3227 /// In some cases this is even possible with OR operations in the expression.
3228 /// See \ref AArch64CCMP.
3229 /// \see emitConjunctionRec().
3230 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3231  AArch64CC::CondCode &OutCC) {
3232  bool DummyCanNegate;
3233  bool DummyMustBeFirst;
3234  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3235  return SDValue();
3236 
3237  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3238 }
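// Usage sketch (hedged; the surrounding lowering code is not shown here): a
// caller that wants NZCV flags for an i1 expression tries the conjunction path
// first and falls back to an ordinary comparison when SDValue() is returned:
//   AArch64CC::CondCode OutCC;
//   if (SDValue Flags = emitConjunction(DAG, Val, OutCC)) {
//     // use Flags together with OutCC, e.g. in a CSEL/CSINC or a conditional
//     // branch on the flags
//   }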
3239 
3240 /// @}
3241 
3242 /// Returns how profitable it is to fold the shift and/or extension operations
3243 /// of a comparison's operand into the comparison itself.
3244 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3245  auto isSupportedExtend = [&](SDValue V) {
3246  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3247  return true;
3248 
3249  if (V.getOpcode() == ISD::AND)
3250  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3251  uint64_t Mask = MaskCst->getZExtValue();
3252  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3253  }
3254 
3255  return false;
3256  };
3257 
3258  if (!Op.hasOneUse())
3259  return 0;
3260 
3261  if (isSupportedExtend(Op))
3262  return 1;
3263 
3264  unsigned Opc = Op.getOpcode();
3265  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3266  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3267  uint64_t Shift = ShiftCst->getZExtValue();
3268  if (isSupportedExtend(Op.getOperand(0)))
3269  return (Shift <= 4) ? 2 : 1;
3270  EVT VT = Op.getValueType();
3271  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3272  return 1;
3273  }
3274 
3275  return 0;
3276 }
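// Examples of the scoring (illustrative):
//   (sign_extend_inreg x, i8)        -> 1  (folds as an extended register operand)
//   (shl (and x, 0xffff), 2)         -> 2  (extend plus a shift of at most 4)
//   (shl x, 3)                       -> 1  (plain shifted register)
//   any operand with multiple uses   -> 0  (the node cannot be folded away)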
3277 
3278 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3279  SDValue &AArch64cc, SelectionDAG &DAG,
3280  const SDLoc &dl) {
3281  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3282  EVT VT = RHS.getValueType();
3283  uint64_t C = RHSC->getZExtValue();
3284  if (!isLegalArithImmed(C)) {
3285  // Constant does not fit, try adjusting it by one?
3286  switch (CC) {
3287  default:
3288  break;
3289  case ISD::SETLT:
3290  case ISD::SETGE:
3291  if ((VT == MVT::i32 && C != 0x80000000 &&
3292  isLegalArithImmed((uint32_t)(C - 1))) ||
3293  (VT == MVT::i64 && C != 0x80000000ULL &&
3294  isLegalArithImmed(C - 1ULL))) {
3295  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3296  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3297  RHS = DAG.getConstant(C, dl, VT);
3298  }
3299  break;
3300  case ISD::SETULT:
3301  case ISD::SETUGE:
3302  if ((VT == MVT::i32 && C != 0 &&
3303  isLegalArithImmed((uint32_t)(C - 1))) ||
3304  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3305  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3306  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3307  RHS = DAG.getConstant(C, dl, VT);
3308  }
3309  break;
3310  case ISD::SETLE:
3311  case ISD::SETGT:
3312  if ((VT == MVT::i32 && C != INT32_MAX &&
3313  isLegalArithImmed((uint32_t)(C + 1))) ||
3314  (VT == MVT::i64 && C != INT64_MAX &&
3315  isLegalArithImmed(C + 1ULL))) {
3316  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3317  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3318  RHS = DAG.getConstant(C, dl, VT);
3319  }
3320  break;
3321  case ISD::SETULE:
3322  case ISD::SETUGT:
3323  if ((VT == MVT::i32 && C != UINT32_MAX &&
3324  isLegalArithImmed((uint32_t)(C + 1))) ||
3325  (VT ==