1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
34 #include "llvm/CodeGen/Analysis.h"
49 #include "llvm/IR/Attributes.h"
50 #include "llvm/IR/Constants.h"
51 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/DebugLoc.h"
53 #include "llvm/IR/DerivedTypes.h"
54 #include "llvm/IR/Function.h"
56 #include "llvm/IR/GlobalValue.h"
57 #include "llvm/IR/IRBuilder.h"
58 #include "llvm/IR/Instruction.h"
59 #include "llvm/IR/Instructions.h"
60 #include "llvm/IR/IntrinsicInst.h"
61 #include "llvm/IR/Intrinsics.h"
62 #include "llvm/IR/IntrinsicsAArch64.h"
63 #include "llvm/IR/Module.h"
64 #include "llvm/IR/OperandTraits.h"
65 #include "llvm/IR/PatternMatch.h"
66 #include "llvm/IR/Type.h"
67 #include "llvm/IR/Use.h"
68 #include "llvm/IR/Value.h"
69 #include "llvm/MC/MCRegisterInfo.h"
70 #include "llvm/Support/Casting.h"
71 #include "llvm/Support/CodeGen.h"
73 #include "llvm/Support/Compiler.h"
74 #include "llvm/Support/Debug.h"
76 #include "llvm/Support/KnownBits.h"
82 #include <algorithm>
83 #include <bitset>
84 #include <cassert>
85 #include <cctype>
86 #include <cstdint>
87 #include <cstdlib>
88 #include <iterator>
89 #include <limits>
90 #include <tuple>
91 #include <utility>
92 #include <vector>
93 
94 using namespace llvm;
95 using namespace llvm::PatternMatch;
96 
97 #define DEBUG_TYPE "aarch64-lower"
98 
99 STATISTIC(NumTailCalls, "Number of tail calls");
100 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
101 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
102 
103 // FIXME: The necessary dtprel relocations don't seem to be supported
104 // well in the GNU bfd and gold linkers at the moment. Therefore, by
105 // default, for now, fall back to GeneralDynamic code generation.
106 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
107  "aarch64-elf-ldtls-generation", cl::Hidden,
108  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
109  cl::init(false));
110 
111 static cl::opt<bool>
112 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
113  cl::desc("Enable AArch64 logical imm instruction "
114  "optimization"),
115  cl::init(true));
116 
117 // Temporary option added for the purpose of testing functionality added
118 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
119 // in the future, once both implementations are based on MGATHER rather
120 // than the GLD1 nodes added for the SVE gather load intrinsics.
121 static cl::opt<bool>
122 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
123  cl::desc("Combine extends of AArch64 masked "
124  "gather intrinsics"),
125  cl::init(true));
126 
127 /// Value type used for condition codes.
128 static const MVT MVT_CC = MVT::i32;
129 
130 static inline EVT getPackedSVEVectorVT(EVT VT) {
131  switch (VT.getSimpleVT().SimpleTy) {
132  default:
133  llvm_unreachable("unexpected element type for vector");
134  case MVT::i8:
135  return MVT::nxv16i8;
136  case MVT::i16:
137  return MVT::nxv8i16;
138  case MVT::i32:
139  return MVT::nxv4i32;
140  case MVT::i64:
141  return MVT::nxv2i64;
142  case MVT::f16:
143  return MVT::nxv8f16;
144  case MVT::f32:
145  return MVT::nxv4f32;
146  case MVT::f64:
147  return MVT::nxv2f64;
148  case MVT::bf16:
149  return MVT::nxv8bf16;
150  }
151 }
152 
153 // NOTE: Currently there's only a need to return integer vector types. If this
154 // changes then just add an extra "type" parameter.
155 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
156  switch (EC.getKnownMinValue()) {
157  default:
158  llvm_unreachable("unexpected element count for vector");
159  case 16:
160  return MVT::nxv16i8;
161  case 8:
162  return MVT::nxv8i16;
163  case 4:
164  return MVT::nxv4i32;
165  case 2:
166  return MVT::nxv2i64;
167  }
168 }
169 
170 static inline EVT getPromotedVTForPredicate(EVT VT) {
171  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
172  "Expected scalable predicate vector type!");
173  switch (VT.getVectorMinNumElements()) {
174  default:
175  llvm_unreachable("unexpected element count for vector");
176  case 2:
177  return MVT::nxv2i64;
178  case 4:
179  return MVT::nxv4i32;
180  case 8:
181  return MVT::nxv8i16;
182  case 16:
183  return MVT::nxv16i8;
184  }
185 }
186 
187 /// Returns true if VT's elements occupy the lowest bit positions of its
188 /// associated register class without any intervening space.
189 ///
190 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
191 /// same register class, but only nxv8f16 can be treated as a packed vector.
192 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
193  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
194  "Expected legal vector type!");
195  return VT.isFixedLengthVector() ||
196  VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
197 }
198 
199 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
200 // predicate and end with a passthru value matching the result type.
201 static bool isMergePassthruOpcode(unsigned Opc) {
202  switch (Opc) {
203  default:
204  return false;
231  return true;
232  }
233 }
234 
235 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
236  const AArch64Subtarget &STI)
237  : TargetLowering(TM), Subtarget(&STI) {
238  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
239  // we have to make something up. Arbitrarily, choose ZeroOrOne.
241  // When comparing vectors the result sets the different elements in the
242  // vector to all-one or all-zero.
244 
245  // Set up the register classes.
246  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
247  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
248 
249  if (Subtarget->hasLS64()) {
250  addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
253  }
254 
255  if (Subtarget->hasFPARMv8()) {
256  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
257  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
258  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
259  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
260  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
261  }
262 
263  if (Subtarget->hasNEON()) {
264  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
265  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
266  // Someone set us up the NEON.
267  addDRTypeForNEON(MVT::v2f32);
268  addDRTypeForNEON(MVT::v8i8);
269  addDRTypeForNEON(MVT::v4i16);
270  addDRTypeForNEON(MVT::v2i32);
271  addDRTypeForNEON(MVT::v1i64);
272  addDRTypeForNEON(MVT::v1f64);
273  addDRTypeForNEON(MVT::v4f16);
274  if (Subtarget->hasBF16())
275  addDRTypeForNEON(MVT::v4bf16);
276 
277  addQRTypeForNEON(MVT::v4f32);
278  addQRTypeForNEON(MVT::v2f64);
279  addQRTypeForNEON(MVT::v16i8);
280  addQRTypeForNEON(MVT::v8i16);
281  addQRTypeForNEON(MVT::v4i32);
282  addQRTypeForNEON(MVT::v2i64);
283  addQRTypeForNEON(MVT::v8f16);
284  if (Subtarget->hasBF16())
285  addQRTypeForNEON(MVT::v8bf16);
286  }
287 
288  if (Subtarget->hasSVE()) {
289  // Add legal sve predicate types
290  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
291  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
292  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
293  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
294 
295  // Add legal sve data types
296  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
297  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
298  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
299  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
300 
301  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
302  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
303  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
304  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
305  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
306  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
307 
308  if (Subtarget->hasBF16()) {
309  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
310  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
311  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
312  }
313 
314  if (Subtarget->useSVEForFixedLengthVectors()) {
315  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
316  if (useSVEForFixedLengthVectorVT(VT))
317  addRegisterClass(VT, &AArch64::ZPRRegClass);
318 
319  for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
320  if (useSVEForFixedLengthVectorVT(VT))
321  addRegisterClass(VT, &AArch64::ZPRRegClass);
322  }
323 
324  for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
333  }
334 
335  for (auto VT :
339 
340  for (auto VT :
342  MVT::nxv2f64 }) {
354 
366  }
367  }
368 
369  // Compute derived properties from the register classes
371 
372  // Provide all sorts of operation actions
406 
410 
414 
416 
417  // Custom lowering hooks are needed for XOR
418  // to fold it into CSINC/CSINV.
421 
422  // Virtually no operation on f128 is legal, but LLVM can't expand them when
423  // there's a valid register class, so we need custom operations in most cases.
447 
448  // Lowering for many of the conversions is actually specified by the non-f128
449  // type. The LowerXXX function will be trivial when f128 isn't involved.
480 
485 
486  // Variable arguments.
491 
492  // Variable-sized objects.
495 
496  if (Subtarget->isTargetWindows())
498  else
500 
501  // Constant pool entries
503 
504  // BlockAddress
506 
507  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
516 
517  // AArch64 lacks both left-rotate and popcount instructions.
520  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
523  }
524 
525  // AArch64 doesn't have i32 MULH{S|U}.
528 
529  // AArch64 doesn't have {U|S}MUL_LOHI.
532 
536 
539 
542  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
545  }
552 
553  // Custom lower Add/Sub/Mul with overflow.
566 
575  if (Subtarget->hasFullFP16())
577  else
579 
613 
614  if (!Subtarget->hasFullFP16()) {
638 
639  // promote v4f16 to v4f32 when that is known to be safe.
648 
665 
687  }
688 
689  // AArch64 has implementations of a lot of rounding-like FP operations.
690  for (MVT Ty : {MVT::f32, MVT::f64}) {
706  }
707 
708  if (Subtarget->hasFullFP16()) {
720  }
721 
723 
726 
732 
733  // Generate outline atomics library calls only if LSE was not specified for
734  // subtarget
735  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
761 #define LCALLNAMES(A, B, N) \
762  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
763  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
764  setLibcallName(A##N##_REL, #B #N "_rel"); \
765  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
766 #define LCALLNAME4(A, B) \
767  LCALLNAMES(A, B, 1) \
768  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
769 #define LCALLNAME5(A, B) \
770  LCALLNAMES(A, B, 1) \
771  LCALLNAMES(A, B, 2) \
772  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
773  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
774  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
775  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
776  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
777  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
778  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
779 #undef LCALLNAMES
780 #undef LCALLNAME4
781 #undef LCALLNAME5
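// Illustrative expansion (not part of the original source): for example,
// LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp) registers the four
// memory-ordering variants of each access size; for the 4-byte case it
// expands to:
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_RELAX, "__aarch64_swp4_relax");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ, "__aarch64_swp4_acq");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_REL, "__aarch64_swp4_rel");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ_REL, "__aarch64_swp4_acq_rel");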
782  }
783 
784  // 128-bit loads and stores can be done without expanding
787 
788  // Aligned 128-bit loads and stores are single-copy atomic according to the
789  // v8.4a spec.
790  if (Subtarget->hasLSE2()) {
793  }
794 
795  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
796  // custom lowering, as there are no un-paired non-temporal stores and
797  // legalization will break up 256 bit inputs.
805 
806  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
807  // This requires the Performance Monitors extension.
808  if (Subtarget->hasPerfMon())
810 
811  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
812  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
813  // Issue __sincos_stret if available.
816  } else {
819  }
820 
821  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
822  // MSVCRT doesn't have powi; fall back to pow
823  setLibcallName(RTLIB::POWI_F32, nullptr);
824  setLibcallName(RTLIB::POWI_F64, nullptr);
825  }
826 
827  // Make floating-point constants legal for the large code model, so they don't
828  // become loads from the constant pool.
829  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
832  }
833 
834  // AArch64 does not have floating-point extending loads, i1 sign-extending
835  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
836  for (MVT VT : MVT::fp_valuetypes()) {
841  }
842  for (MVT VT : MVT::integer_valuetypes())
844 
852 
856 
857  // Indexed loads and stores are supported.
858  for (unsigned im = (unsigned)ISD::PRE_INC;
859  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
876  }
877 
878  // Trap.
882 
883  // We combine OR nodes for bitfield operations.
885  // Try to create BICs for vector ANDs.
887 
888  // Vector add and sub nodes may conceal a high-half opportunity.
889  // Also, try to fold ADD into CSINC/CSINV.
897 
898  // TODO: Do the same for FP_TO_*INT_SAT.
902 
903  // Try and combine setcc with csel
905 
907 
917  if (Subtarget->supportsAddressTopByteIgnored())
919 
921 
924 
931 
933 
934  // In case of strict alignment, avoid an excessive number of byte wide stores.
938 
943 
945 
949 
951 
953 
954  EnableExtLdPromotion = true;
955 
956  // Set required alignment.
958  // Set preferred alignments.
961 
962  // Only change the limit for entries in a jump table if specified by
963  // the subtarget, but not at the command line.
964  unsigned MaxJT = STI.getMaximumJumpTableSize();
965  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
967 
968  setHasExtractBitsInsn(true);
969 
971 
972  if (Subtarget->hasNEON()) {
973  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
974  // silliness like this:
1001 
1007 
1010 
1012 
1013  // AArch64 doesn't have a direct vector ->f32 conversion instructions for
1014  // elements smaller than i32, so promote the input to i32 first.
1021 
1022  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1027  // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
1028  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1031 
1032  if (Subtarget->hasFullFP16()) {
1037  } else {
1038  // when AArch64 doesn't have fullfp16 support, promote the input
1039  // to i32 first.
1044  }
1045 
1054  for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1059  }
1060 
1061  // AArch64 doesn't have MUL.2d:
1063  // Custom handling for some quad-vector types to detect MULL.
1067 
1068  // Saturates
1069  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1075  }
1076 
1078  MVT::v4i32}) {
1081  }
1082 
1083  // Vector reductions
1084  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1086  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1089 
1091  }
1092  }
1093  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1100  }
1102 
1105  // Likewise, narrowing and extending vector loads/stores aren't handled
1106  // directly.
1107  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1109 
1110  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1113  } else {
1116  }
1119 
1122 
1123  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1124  setTruncStoreAction(VT, InnerVT, Expand);
1125  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1126  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1127  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1128  }
1129  }
1130 
1131  // AArch64 has implementations of a lot of rounding-like FP operations.
1132  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1140  }
1141 
1142  if (Subtarget->hasFullFP16()) {
1143  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1151  }
1152  }
1153 
1154  if (Subtarget->hasSVE())
1156 
1158 
1165  }
1166 
1167  if (Subtarget->hasSVE()) {
1168  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1207 
1213  }
1214 
1215  // Illegal unpacked integer vector types.
1216  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1219  }
1220 
1221  // Legalize unpacked bitcasts to REINTERPRET_CAST.
1225 
1226  for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1235 
1239 
1240  // There are no legal MVT::nxv16f## based types.
1241  if (VT != MVT::nxv16i1) {
1244  }
1245  }
1246 
1247  // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1255  }
1256 
1258  for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
1259  // Avoid marking truncating FP stores as legal to prevent the
1260  // DAGCombiner from creating unsupported truncating stores.
1261  setTruncStoreAction(VT, InnerVT, Expand);
1262  // SVE does not have floating-point extending loads.
1263  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1264  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1265  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1266  }
1267  }
1268 
1269  // SVE supports truncating stores of 64 and 128-bit vectors
1275 
1312 
1314  }
1315 
1316  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1321  }
1322 
1324 
1327 
1328  // NOTE: Currently this has to happen after computeRegisterProperties rather
1329  // than the preferred option of combining it with the addRegisterClass call.
1330  if (Subtarget->useSVEForFixedLengthVectors()) {
1331  for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1332  if (useSVEForFixedLengthVectorVT(VT))
1333  addTypeForFixedLengthSVE(VT);
1334  for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1335  if (useSVEForFixedLengthVectorVT(VT))
1336  addTypeForFixedLengthSVE(VT);
1337 
1338  // 64bit results can mean a bigger than NEON input.
1339  for (auto VT : {MVT::v8i8, MVT::v4i16})
1342 
1343  // 128bit results imply a bigger than NEON input.
1344  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1346  for (auto VT : {MVT::v8f16, MVT::v4f32})
1348 
1349  // These operations are not supported on NEON but SVE can do them.
1388 
1389  // Int operations with no NEON support.
1390  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1397  }
1398 
1399  // FP operations with no NEON support.
1400  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1403 
1404  // Use SVE for vectors with more than 2 elements.
1405  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1407  }
1408 
1413  }
1414 
1416 }
1417 
1418 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1419  assert(VT.isVector() && "VT should be a vector type");
1420 
1421  if (VT.isFloatingPoint()) {
1422  MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1423  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1424  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1425  }
1426 
1427  // Mark vector float intrinsics as expand.
1428  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1437  }
1438 
1439  // But we do support custom-lowering for FCOPYSIGN.
1440  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1441  ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1443 
1455 
1459  for (MVT InnerVT : MVT::all_valuetypes())
1460  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1461 
1462  // CNT supports only B element sizes, then use UADDLP to widen.
1463  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1465 
1471 
1476 
1477  if (!VT.isFloatingPoint())
1479 
1480  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1481  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1482  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1483  setOperationAction(Opcode, VT, Legal);
1484 
1485  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1486  if (VT.isFloatingPoint() &&
1487  VT.getVectorElementType() != MVT::bf16 &&
1488  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1489  for (unsigned Opcode :
1491  setOperationAction(Opcode, VT, Legal);
1492 
1493  if (Subtarget->isLittleEndian()) {
1494  for (unsigned im = (unsigned)ISD::PRE_INC;
1495  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1498  }
1499  }
1500 }
1501 
1502 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1503  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1504 
1505  // By default everything must be expanded.
1506  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1508 
1509  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1511 
1512  if (VT.isFloatingPoint()) {
1524  }
1525 
1526  // Mark integer truncating stores as having custom lowering
1527  if (VT.isInteger()) {
1528  MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1529  while (InnerVT != VT) {
1530  setTruncStoreAction(VT, InnerVT, Custom);
1531  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1532  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1533  InnerVT = InnerVT.changeVectorElementType(
1534  MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1535  }
1536  }
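// Worked example (illustrative, not from the original source): for
// VT == v8i32 the loop above starts at InnerVT == v8i8 and doubles the
// element width each iteration, so it marks the truncating stores
// v8i32 -> v8i8 and v8i32 -> v8i16, plus the matching ZEXTLOAD/SEXTLOAD
// extending loads, as Custom before InnerVT reaches v8i32 and the loop
// terminates.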
1537 
1538  // Lower fixed length vector operations to scalable equivalents.
1620 }
1621 
1622 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1623  addRegisterClass(VT, &AArch64::FPR64RegClass);
1624  addTypeForNEON(VT);
1625 }
1626 
1627 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1628  addRegisterClass(VT, &AArch64::FPR128RegClass);
1629  addTypeForNEON(VT);
1630 }
1631 
1632 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1633  LLVMContext &C, EVT VT) const {
1634  if (!VT.isVector())
1635  return MVT::i32;
1636  if (VT.isScalableVector())
1637  return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1638  return VT.changeVectorElementTypeToInteger();
1639 }
1640 
1641 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1642  const APInt &Demanded,
1643  TargetLowering::TargetLoweringOpt &TLO,
1644  unsigned NewOpc) {
1645  uint64_t OldImm = Imm, NewImm, Enc;
1646  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1647 
1648  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1649  // bimm64.
1650  if (Imm == 0 || Imm == Mask ||
1652  return false;
1653 
1654  unsigned EltSize = Size;
1655  uint64_t DemandedBits = Demanded.getZExtValue();
1656 
1657  // Clear bits that are not demanded.
1658  Imm &= DemandedBits;
1659 
1660  while (true) {
1661  // The goal here is to set the non-demanded bits in a way that minimizes
1662  // the number of switching between 0 and 1. In order to achieve this goal,
1663  // we set the non-demanded bits to the value of the preceding demanded bits.
1664  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1665  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1666  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1667  // The final result is 0b11000011.
1668  uint64_t NonDemandedBits = ~DemandedBits;
1669  uint64_t InvertedImm = ~Imm & DemandedBits;
1670  uint64_t RotatedImm =
1671  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1672  NonDemandedBits;
1673  uint64_t Sum = RotatedImm + NonDemandedBits;
1674  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1675  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1676  NewImm = (Imm | Ones) & Mask;
1677 
1678  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1679  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1680  // we halve the element size and continue the search.
1681  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1682  break;
1683 
1684  // We cannot shrink the element size any further if it is 2-bits.
1685  if (EltSize == 2)
1686  return false;
1687 
1688  EltSize /= 2;
1689  Mask >>= EltSize;
1690  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1691 
1692  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1693  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1694  return false;
1695 
1696  // Merge the upper and lower halves of Imm and DemandedBits.
1697  Imm |= Hi;
1698  DemandedBits |= DemandedBitsHi;
1699  }
1700 
1701  ++NumOptimizedImms;
1702 
1703  // Replicate the element across the register width.
1704  while (EltSize < Size) {
1705  NewImm |= NewImm << EltSize;
1706  EltSize *= 2;
1707  }
1708 
1709  (void)OldImm;
1710  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1711  "demanded bits should never be altered");
1712  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1713 
1714  // Create the new constant immediate node.
1715  EVT VT = Op.getValueType();
1716  SDLoc DL(Op);
1717  SDValue New;
1718 
1719  // If the new constant immediate is all-zeros or all-ones, let the target
1720  // independent DAG combine optimize this node.
1721  if (NewImm == 0 || NewImm == OrigMask) {
1722  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1723  TLO.DAG.getConstant(NewImm, DL, VT));
1724  // Otherwise, create a machine node so that target independent DAG combine
1725  // doesn't undo this optimization.
1726  } else {
1727  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1728  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1729  New = SDValue(
1730  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1731  }
1732 
1733  return TLO.CombineTo(Op, New);
1734 }
1735 
1736 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1737  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1738  TargetLoweringOpt &TLO) const {
1739  // Delay this optimization to as late as possible.
1740  if (!TLO.LegalOps)
1741  return false;
1742 
1743  if (!EnableOptimizeLogicalImm)
1744  return false;
1745 
1746  EVT VT = Op.getValueType();
1747  if (VT.isVector())
1748  return false;
1749 
1750  unsigned Size = VT.getSizeInBits();
1751  assert((Size == 32 || Size == 64) &&
1752  "i32 or i64 is expected after legalization.");
1753 
1754  // Exit early if we demand all bits.
1755  if (DemandedBits.countPopulation() == Size)
1756  return false;
1757 
1758  unsigned NewOpc;
1759  switch (Op.getOpcode()) {
1760  default:
1761  return false;
1762  case ISD::AND:
1763  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1764  break;
1765  case ISD::OR:
1766  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1767  break;
1768  case ISD::XOR:
1769  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1770  break;
1771  }
1772  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1773  if (!C)
1774  return false;
1775  uint64_t Imm = C->getZExtValue();
1776  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1777 }
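// Illustrative note (not from the original source): if the mask of an AND is
// not encodable as an AArch64 logical immediate but some of its bits are
// never read by any user, optimizeLogicalImm() above may flip those
// undemanded bits so that the constant either becomes encodable (and is
// emitted directly as ANDWri/ANDXri) or collapses to all-zeros/all-ones
// (and is left to the generic DAG combiner), avoiding a separate MOV/MOVK
// materialization of the mask.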
1778 
1779 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1780 /// Mask are known to be either zero or one and return them Known.
1781 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1782  const SDValue Op, KnownBits &Known,
1783  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1784  switch (Op.getOpcode()) {
1785  default:
1786  break;
1787  case AArch64ISD::CSEL: {
1788  KnownBits Known2;
1789  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1790  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1791  Known = KnownBits::commonBits(Known, Known2);
1792  break;
1793  }
1794  case AArch64ISD::LOADgot:
1795  case AArch64ISD::ADDlow: {
1796  if (!Subtarget->isTargetILP32())
1797  break;
1798  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1799  Known.Zero = APInt::getHighBitsSet(64, 32);
1800  break;
1801  }
1802  case ISD::INTRINSIC_W_CHAIN: {
1803  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1804  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1805  switch (IntID) {
1806  default: return;
1807  case Intrinsic::aarch64_ldaxr:
1808  case Intrinsic::aarch64_ldxr: {
1809  unsigned BitWidth = Known.getBitWidth();
1810  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1811  unsigned MemBits = VT.getScalarSizeInBits();
1812  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1813  return;
1814  }
1815  }
1816  break;
1817  }
1818  case ISD::INTRINSIC_WO_CHAIN:
1819  case ISD::INTRINSIC_VOID: {
1820  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1821  switch (IntNo) {
1822  default:
1823  break;
1824  case Intrinsic::aarch64_neon_umaxv:
1825  case Intrinsic::aarch64_neon_uminv: {
1826  // Figure out the datatype of the vector operand. The UMINV instruction
1827  // will zero extend the result, so we can mark as known zero all the
1828  // bits larger than the element datatype. 32-bit or larger doesn't need
1829  // this as those are legal types and will be handled by isel directly.
1830  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1831  unsigned BitWidth = Known.getBitWidth();
1832  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1833  assert(BitWidth >= 8 && "Unexpected width!");
1834  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1835  Known.Zero |= Mask;
1836  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1837  assert(BitWidth >= 16 && "Unexpected width!");
1838  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1839  Known.Zero |= Mask;
1840  }
1841  break;
1842  } break;
1843  }
1844  }
1845  }
1846 }
1847 
1848 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1849  EVT) const {
1850  return MVT::i64;
1851 }
1852 
1853 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1854  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1855  bool *Fast) const {
1856  if (Subtarget->requiresStrictAlign())
1857  return false;
1858 
1859  if (Fast) {
1860  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1861  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1862  // See comments in performSTORECombine() for more details about
1863  // these conditions.
1864 
1865  // Code that uses clang vector extensions can mark that it
1866  // wants unaligned accesses to be treated as fast by
1867  // underspecifying alignment to be 1 or 2.
1868  Alignment <= 2 ||
1869 
1870  // Disregard v2i64. Memcpy lowering produces those and splitting
1871  // them regresses performance on micro-benchmarks and olden/bh.
1872  VT == MVT::v2i64;
1873  }
1874  return true;
1875 }
1876 
1877 // Same as above but handling LLTs instead.
1878 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1879  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1880  bool *Fast) const {
1881  if (Subtarget->requiresStrictAlign())
1882  return false;
1883 
1884  if (Fast) {
1885  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1886  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1887  Ty.getSizeInBytes() != 16 ||
1888  // See comments in performSTORECombine() for more details about
1889  // these conditions.
1890 
1891  // Code that uses clang vector extensions can mark that it
1892  // wants unaligned accesses to be treated as fast by
1893  // underspecifying alignment to be 1 or 2.
1894  Alignment <= 2 ||
1895 
1896  // Disregard v2i64. Memcpy lowering produces those and splitting
1897  // them regresses performance on micro-benchmarks and olden/bh.
1898  Ty == LLT::fixed_vector(2, 64);
1899  }
1900  return true;
1901 }
1902 
1903 FastISel *
1904 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1905  const TargetLibraryInfo *libInfo) const {
1906  return AArch64::createFastISel(funcInfo, libInfo);
1907 }
1908 
1909 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1910 #define MAKE_CASE(V) \
1911  case V: \
1912  return #V;
1913  switch ((AArch64ISD::NodeType)Opcode) {
1914  case AArch64ISD::FIRST_NUMBER:
1915  break;
2193  }
2194 #undef MAKE_CASE
2195  return nullptr;
2196 }
2197 
2198 MachineBasicBlock *
2199 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2200  MachineBasicBlock *MBB) const {
2201  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2202  // phi node:
2203 
2204  // OrigBB:
2205  // [... previous instrs leading to comparison ...]
2206  // b.ne TrueBB
2207  // b EndBB
2208  // TrueBB:
2209  // ; Fallthrough
2210  // EndBB:
2211  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2212 
2213  MachineFunction *MF = MBB->getParent();
2214  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2215  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2216  DebugLoc DL = MI.getDebugLoc();
2217  MachineFunction::iterator It = ++MBB->getIterator();
2218 
2219  Register DestReg = MI.getOperand(0).getReg();
2220  Register IfTrueReg = MI.getOperand(1).getReg();
2221  Register IfFalseReg = MI.getOperand(2).getReg();
2222  unsigned CondCode = MI.getOperand(3).getImm();
2223  bool NZCVKilled = MI.getOperand(4).isKill();
2224 
2225  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2226  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2227  MF->insert(It, TrueBB);
2228  MF->insert(It, EndBB);
2229 
2230  // Transfer rest of current basic-block to EndBB
2231  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2232  MBB->end());
2233  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2234 
2235  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2236  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2237  MBB->addSuccessor(TrueBB);
2238  MBB->addSuccessor(EndBB);
2239 
2240  // TrueBB falls through to the end.
2241  TrueBB->addSuccessor(EndBB);
2242 
2243  if (!NZCVKilled) {
2244  TrueBB->addLiveIn(AArch64::NZCV);
2245  EndBB->addLiveIn(AArch64::NZCV);
2246  }
2247 
2248  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2249  .addReg(IfTrueReg)
2250  .addMBB(TrueBB)
2251  .addReg(IfFalseReg)
2252  .addMBB(MBB);
2253 
2254  MI.eraseFromParent();
2255  return EndBB;
2256 }
2257 
2258 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2259  MachineInstr &MI, MachineBasicBlock *BB) const {
2260  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2261  BB->getParent()->getFunction().getPersonalityFn())) &&
2262  "SEH does not use catchret!");
2263  return BB;
2264 }
2265 
2266 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2267  MachineInstr &MI, MachineBasicBlock *BB) const {
2268  switch (MI.getOpcode()) {
2269  default:
2270 #ifndef NDEBUG
2271  MI.dump();
2272 #endif
2273  llvm_unreachable("Unexpected instruction for custom inserter!");
2274 
2275  case AArch64::F128CSEL:
2276  return EmitF128CSEL(MI, BB);
2277 
2278  case TargetOpcode::STACKMAP:
2279  case TargetOpcode::PATCHPOINT:
2280  case TargetOpcode::STATEPOINT:
2281  return emitPatchPoint(MI, BB);
2282 
2283  case AArch64::CATCHRET:
2284  return EmitLoweredCatchRet(MI, BB);
2285  }
2286 }
2287 
2288 //===----------------------------------------------------------------------===//
2289 // AArch64 Lowering private implementation.
2290 //===----------------------------------------------------------------------===//
2291 
2292 //===----------------------------------------------------------------------===//
2293 // Lowering Code
2294 //===----------------------------------------------------------------------===//
2295 
2296 // Forward declarations of SVE fixed length lowering helpers
2301  SelectionDAG &DAG);
2302 
2303 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2304 static bool isZerosVector(const SDNode *N) {
2305  // Look through a bit convert.
2306  while (N->getOpcode() == ISD::BITCAST)
2307  N = N->getOperand(0).getNode();
2308 
2309  if (ISD::isConstantSplatVectorAllZeros(N))
2310  return true;
2311 
2312  if (N->getOpcode() != AArch64ISD::DUP)
2313  return false;
2314 
2315  auto Opnd0 = N->getOperand(0);
2316  auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2317  auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2318  return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
2319 }
2320 
2321 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2322 /// CC
2323 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2324  switch (CC) {
2325  default:
2326  llvm_unreachable("Unknown condition code!");
2327  case ISD::SETNE:
2328  return AArch64CC::NE;
2329  case ISD::SETEQ:
2330  return AArch64CC::EQ;
2331  case ISD::SETGT:
2332  return AArch64CC::GT;
2333  case ISD::SETGE:
2334  return AArch64CC::GE;
2335  case ISD::SETLT:
2336  return AArch64CC::LT;
2337  case ISD::SETLE:
2338  return AArch64CC::LE;
2339  case ISD::SETUGT:
2340  return AArch64CC::HI;
2341  case ISD::SETUGE:
2342  return AArch64CC::HS;
2343  case ISD::SETULT:
2344  return AArch64CC::LO;
2345  case ISD::SETULE:
2346  return AArch64CC::LS;
2347  }
2348 }
2349 
2350 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2351 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2352  AArch64CC::CondCode &CondCode,
2353  AArch64CC::CondCode &CondCode2) {
2354  CondCode2 = AArch64CC::AL;
2355  switch (CC) {
2356  default:
2357  llvm_unreachable("Unknown FP condition!");
2358  case ISD::SETEQ:
2359  case ISD::SETOEQ:
2360  CondCode = AArch64CC::EQ;
2361  break;
2362  case ISD::SETGT:
2363  case ISD::SETOGT:
2364  CondCode = AArch64CC::GT;
2365  break;
2366  case ISD::SETGE:
2367  case ISD::SETOGE:
2368  CondCode = AArch64CC::GE;
2369  break;
2370  case ISD::SETOLT:
2371  CondCode = AArch64CC::MI;
2372  break;
2373  case ISD::SETOLE:
2374  CondCode = AArch64CC::LS;
2375  break;
2376  case ISD::SETONE:
2377  CondCode = AArch64CC::MI;
2378  CondCode2 = AArch64CC::GT;
2379  break;
2380  case ISD::SETO:
2381  CondCode = AArch64CC::VC;
2382  break;
2383  case ISD::SETUO:
2384  CondCode = AArch64CC::VS;
2385  break;
2386  case ISD::SETUEQ:
2387  CondCode = AArch64CC::EQ;
2388  CondCode2 = AArch64CC::VS;
2389  break;
2390  case ISD::SETUGT:
2391  CondCode = AArch64CC::HI;
2392  break;
2393  case ISD::SETUGE:
2394  CondCode = AArch64CC::PL;
2395  break;
2396  case ISD::SETLT:
2397  case ISD::SETULT:
2398  CondCode = AArch64CC::LT;
2399  break;
2400  case ISD::SETLE:
2401  case ISD::SETULE:
2402  CondCode = AArch64CC::LE;
2403  break;
2404  case ISD::SETNE:
2405  case ISD::SETUNE:
2406  CondCode = AArch64CC::NE;
2407  break;
2408  }
2409 }
2410 
2411 /// Convert a DAG fp condition code to an AArch64 CC.
2412 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2413 /// should be AND'ed instead of OR'ed.
2414 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2415  AArch64CC::CondCode &CondCode,
2416  AArch64CC::CondCode &CondCode2) {
2417  CondCode2 = AArch64CC::AL;
2418  switch (CC) {
2419  default:
2420  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2421  assert(CondCode2 == AArch64CC::AL);
2422  break;
2423  case ISD::SETONE:
2424  // (a one b)
2425  // == ((a olt b) || (a ogt b))
2426  // == ((a ord b) && (a une b))
2427  CondCode = AArch64CC::VC;
2428  CondCode2 = AArch64CC::NE;
2429  break;
2430  case ISD::SETUEQ:
2431  // (a ueq b)
2432  // == ((a uno b) || (a oeq b))
2433  // == ((a ule b) && (a uge b))
2434  CondCode = AArch64CC::PL;
2435  CondCode2 = AArch64CC::LE;
2436  break;
2437  }
2438 }
2439 
2440 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2441 /// CC usable with the vector instructions. Fewer operations are available
2442 /// without a real NZCV register, so we have to use less efficient combinations
2443 /// to get the same effect.
2444 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2445  AArch64CC::CondCode &CondCode,
2446  AArch64CC::CondCode &CondCode2,
2447  bool &Invert) {
2448  Invert = false;
2449  switch (CC) {
2450  default:
2451  // Mostly the scalar mappings work fine.
2452  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2453  break;
2454  case ISD::SETUO:
2455  Invert = true;
2456  LLVM_FALLTHROUGH;
2457  case ISD::SETO:
2458  CondCode = AArch64CC::MI;
2459  CondCode2 = AArch64CC::GE;
2460  break;
2461  case ISD::SETUEQ:
2462  case ISD::SETULT:
2463  case ISD::SETULE:
2464  case ISD::SETUGT:
2465  case ISD::SETUGE:
2466  // All of the compare-mask comparisons are ordered, but we can switch
2467  // between the two by a double inversion. E.g. ULE == !OGT.
2468  Invert = true;
2469  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2470  CondCode, CondCode2);
2471  break;
2472  }
2473 }
2474 
2475 static bool isLegalArithImmed(uint64_t C) {
2476  // Matches AArch64DAGToDAGISel::SelectArithImmed().
2477  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2478  LLVM_DEBUG(dbgs() << "Is imm " << C
2479  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2480  return IsLegal;
2481 }
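// Illustrative values (not from the original source): the test above accepts
// any unsigned 12-bit value, optionally shifted left by 12 bits, e.g.
//   isLegalArithImmed(0xabc)    -> true  (fits in 12 bits)
//   isLegalArithImmed(0xabc000) -> true  (12-bit value shifted left by 12)
//   isLegalArithImmed(0x1001)   -> false (needs 13 bits and the low 12 bits
//                                         are not all zero)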
2482 
2483 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2484 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2485 // can be set differently by this operation. It comes down to whether
2486 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2487 // everything is fine. If not then the optimization is wrong. Thus general
2488 // comparisons are only valid if op2 != 0.
2489 //
2490 // So, finally, the only LLVM-native comparisons that don't mention C and V
2491 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2492 // the absence of information about op2.
2493 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2494  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2495  (CC == ISD::SETEQ || CC == ISD::SETNE);
2496 }
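// Illustrative example (not from the original source): for a comparison such
// as (seteq x, (sub 0, y)), isCMN returns true for the RHS, so the code below
// emits AArch64ISD::ADDS (i.e. a CMN) and the result is "cmn x, y" rather
// than a separate negate followed by "cmp".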
2497 
2498 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2499  SelectionDAG &DAG, SDValue Chain,
2500  bool IsSignaling) {
2501  EVT VT = LHS.getValueType();
2502  assert(VT != MVT::f128);
2503  assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2504  unsigned Opcode =
2505  IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2506  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2507 }
2508 
2509 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2510  const SDLoc &dl, SelectionDAG &DAG) {
2511  EVT VT = LHS.getValueType();
2512  const bool FullFP16 =
2513  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2514 
2515  if (VT.isFloatingPoint()) {
2516  assert(VT != MVT::f128);
2517  if (VT == MVT::f16 && !FullFP16) {
2518  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2519  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2520  VT = MVT::f32;
2521  }
2522  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2523  }
2524 
2525  // The CMP instruction is just an alias for SUBS, and representing it as
2526  // SUBS means that it's possible to get CSE with subtract operations.
2527  // A later phase can perform the optimization of setting the destination
2528  // register to WZR/XZR if it ends up being unused.
2529  unsigned Opcode = AArch64ISD::SUBS;
2530 
2531  if (isCMN(RHS, CC)) {
2532  // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2533  Opcode = AArch64ISD::ADDS;
2534  RHS = RHS.getOperand(1);
2535  } else if (isCMN(LHS, CC)) {
2536  // As we are looking for EQ/NE compares, the operands can be commuted; can
2537  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2538  Opcode = AArch64ISD::ADDS;
2539  LHS = LHS.getOperand(1);
2540  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2541  if (LHS.getOpcode() == ISD::AND) {
2542  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2543  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2544  // of the signed comparisons.
2545  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2546  DAG.getVTList(VT, MVT_CC),
2547  LHS.getOperand(0),
2548  LHS.getOperand(1));
2549  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2550  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2551  return ANDSNode.getValue(1);
2552  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2553  // Use result of ANDS
2554  return LHS.getValue(1);
2555  }
2556  }
2557 
2558  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2559  .getValue(1);
2560 }
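// Illustrative example (not from the original source): for a comparison such
// as (setne (and w0, w1), 0), the ISD::AND path above reuses the flag-setting
// ANDS form, so the final code is a single "tst w0, w1" instead of an "and"
// followed by "cmp wN, #0".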
2561 
2562 /// \defgroup AArch64CCMP CMP;CCMP matching
2563 ///
2564 /// These functions deal with the formation of CMP;CCMP;... sequences.
2565 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2566 /// a comparison. They set the NZCV flags to a predefined value if their
2567 /// predicate is false. This allows expressing arbitrary conjunctions, for
2568 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2569 /// expressed as:
2570 /// cmp A
2571 /// ccmp B, inv(CB), CA
2572 /// check for CB flags
2573 ///
2574 /// This naturally lets us implement chains of AND operations with SETCC
2575 /// operands. And we can even implement some other situations by transforming
2576 /// them:
2577 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2578 /// negating the flags used in a CCMP/FCCMP operations.
2579 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2580 /// by negating the flags we test for afterwards. i.e.
2581 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2582 /// - Note that we can only ever negate all previously processed results.
2583 /// What we can not implement by flipping the flags to test is a negation
2584 /// of two sub-trees (because the negation affects all sub-trees emitted so
2585 /// far, so the 2nd sub-tree we emit would also affect the first).
2586 /// With those tools we can implement some OR operations:
2587 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2588 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2589 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2590 /// elimination rules from earlier to implement the whole thing as a
2591 /// CCMP/FCCMP chain.
2592 ///
2593 /// As complete example:
2594 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2595 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2596 /// can be reassociated to:
2597 /// or (and (setCC (cmp C)) setCD (cmp D))
2598 // (or (setCA (cmp A)) (setCB (cmp B)))
2599 /// can be transformed to:
2600 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2601 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2602 /// which can be implemented as:
2603 /// cmp C
2604 /// ccmp D, inv(CD), CC
2605 /// ccmp A, CA, inv(CD)
2606 /// ccmp B, CB, inv(CA)
2607 /// check for CB flags
2608 ///
2609 /// A counterexample is "or (and A B) (and C D)" which translates to
2610 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2611 /// can only implement 1 of the inner (not) operations, but not both!
2612 /// @{
2613 
2614 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2615 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2616  ISD::CondCode CC, SDValue CCOp,
2617  AArch64CC::CondCode Predicate,
2618  AArch64CC::CondCode OutCC,
2619  const SDLoc &DL, SelectionDAG &DAG) {
2620  unsigned Opcode = 0;
2621  const bool FullFP16 =
2622  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2623 
2624  if (LHS.getValueType().isFloatingPoint()) {
2625  assert(LHS.getValueType() != MVT::f128);
2626  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2627  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2628  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2629  }
2630  Opcode = AArch64ISD::FCCMP;
2631  } else if (RHS.getOpcode() == ISD::SUB) {
2632  SDValue SubOp0 = RHS.getOperand(0);
2633  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2634  // See emitComparison() on why we can only do this for SETEQ and SETNE.
2635  Opcode = AArch64ISD::CCMN;
2636  RHS = RHS.getOperand(1);
2637  }
2638  }
2639  if (Opcode == 0)
2640  Opcode = AArch64ISD::CCMP;
2641 
2642  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2643  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2644  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2645  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2646  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2647 }
2648 
2649 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2650 /// expressed as a conjunction. See \ref AArch64CCMP.
2651 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2652 /// changing the conditions on the SETCC tests.
2653 /// (this means we can call emitConjunctionRec() with
2654 /// Negate==true on this sub-tree)
2655 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2656 /// cannot do the negation naturally. We are required to
2657 /// emit the subtree first in this case.
2658 /// \param WillNegate Is true if we are called when the result of this
2659 /// subexpression must be negated. This happens when the
2660 /// outer expression is an OR. We can use this fact to know
2661 /// that we have a double negation (or (or ...) ...) that
2662 /// can be implemented for free.
2663 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2664  bool &MustBeFirst, bool WillNegate,
2665  unsigned Depth = 0) {
2666  if (!Val.hasOneUse())
2667  return false;
2668  unsigned Opcode = Val->getOpcode();
2669  if (Opcode == ISD::SETCC) {
2670  if (Val->getOperand(0).getValueType() == MVT::f128)
2671  return false;
2672  CanNegate = true;
2673  MustBeFirst = false;
2674  return true;
2675  }
2676  // Protect against exponential runtime and stack overflow.
2677  if (Depth > 6)
2678  return false;
2679  if (Opcode == ISD::AND || Opcode == ISD::OR) {
2680  bool IsOR = Opcode == ISD::OR;
2681  SDValue O0 = Val->getOperand(0);
2682  SDValue O1 = Val->getOperand(1);
2683  bool CanNegateL;
2684  bool MustBeFirstL;
2685  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2686  return false;
2687  bool CanNegateR;
2688  bool MustBeFirstR;
2689  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2690  return false;
2691 
2692  if (MustBeFirstL && MustBeFirstR)
2693  return false;
2694 
2695  if (IsOR) {
2696  // For an OR expression we need to be able to naturally negate at least
2697  // one side or we cannot do the transformation at all.
2698  if (!CanNegateL && !CanNegateR)
2699  return false;
2700  // If the result of the OR will be negated and we can naturally negate
2701  // the leaves, then this sub-tree as a whole negates naturally.
2702  CanNegate = WillNegate && CanNegateL && CanNegateR;
2703  // If we cannot naturally negate the whole sub-tree, then this must be
2704  // emitted first.
2705  MustBeFirst = !CanNegate;
2706  } else {
2707  assert(Opcode == ISD::AND && "Must be OR or AND");
2708  // We cannot naturally negate an AND operation.
2709  CanNegate = false;
2710  MustBeFirst = MustBeFirstL || MustBeFirstR;
2711  }
2712  return true;
2713  }
2714  return false;
2715 }
2716 
2717 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2718 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2719 /// Tries to transform the given i1 producing node @p Val to a series compare
2720 /// and conditional compare operations. @returns an NZCV flags producing node
2721 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
2722 /// transformation was not possible.
2723 /// \p Negate is true if we want this sub-tree being negated just by changing
2724 /// SETCC conditions.
2725 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2726  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2727  AArch64CC::CondCode Predicate) {
2728  // We're at a tree leaf, produce a conditional comparison operation.
2729  unsigned Opcode = Val->getOpcode();
2730  if (Opcode == ISD::SETCC) {
2731  SDValue LHS = Val->getOperand(0);
2732  SDValue RHS = Val->getOperand(1);
2733  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2734  bool isInteger = LHS.getValueType().isInteger();
2735  if (Negate)
2736  CC = getSetCCInverse(CC, LHS.getValueType());
2737  SDLoc DL(Val);
2738  // Determine OutCC and handle FP special case.
2739  if (isInteger) {
2740  OutCC = changeIntCCToAArch64CC(CC);
2741  } else {
2743  AArch64CC::CondCode ExtraCC;
2744  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2745  // Some floating point conditions can't be tested with a single condition
2746  // code. Construct an additional comparison in this case.
2747  if (ExtraCC != AArch64CC::AL) {
2748  SDValue ExtraCmp;
2749  if (!CCOp.getNode())
2750  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2751  else
2752  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2753  ExtraCC, DL, DAG);
2754  CCOp = ExtraCmp;
2755  Predicate = ExtraCC;
2756  }
2757  }
2758 
2759  // Produce a normal comparison if we are first in the chain
2760  if (!CCOp)
2761  return emitComparison(LHS, RHS, CC, DL, DAG);
2762  // Otherwise produce a ccmp.
2763  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2764  DAG);
2765  }
2766  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2767 
2768  bool IsOR = Opcode == ISD::OR;
2769 
2770  SDValue LHS = Val->getOperand(0);
2771  bool CanNegateL;
2772  bool MustBeFirstL;
2773  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2774  assert(ValidL && "Valid conjunction/disjunction tree");
2775  (void)ValidL;
2776 
2777  SDValue RHS = Val->getOperand(1);
2778  bool CanNegateR;
2779  bool MustBeFirstR;
2780  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2781  assert(ValidR && "Valid conjunction/disjunction tree");
2782  (void)ValidR;
2783 
2784  // Swap sub-tree that must come first to the right side.
2785  if (MustBeFirstL) {
2786  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2787  std::swap(LHS, RHS);
2788  std::swap(CanNegateL, CanNegateR);
2789  std::swap(MustBeFirstL, MustBeFirstR);
2790  }
2791 
2792  bool NegateR;
2793  bool NegateAfterR;
2794  bool NegateL;
2795  bool NegateAfterAll;
2796  if (Opcode == ISD::OR) {
2797  // Swap the sub-tree that we can negate naturally to the left.
2798  if (!CanNegateL) {
2799  assert(CanNegateR && "at least one side must be negatable");
2800  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2801  assert(!Negate);
2802  std::swap(LHS, RHS);
2803  NegateR = false;
2804  NegateAfterR = true;
2805  } else {
2806  // Negate the left sub-tree if possible, otherwise negate the result.
2807  NegateR = CanNegateR;
2808  NegateAfterR = !CanNegateR;
2809  }
2810  NegateL = true;
2811  NegateAfterAll = !Negate;
2812  } else {
2813  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2814  assert(!Negate && "Valid conjunction/disjunction tree");
2815 
2816  NegateL = false;
2817  NegateR = false;
2818  NegateAfterR = false;
2819  NegateAfterAll = false;
2820  }
2821 
2822  // Emit sub-trees.
2823  AArch64CC::CondCode RHSCC;
2824  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2825  if (NegateAfterR)
2826  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2827  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2828  if (NegateAfterAll)
2829  OutCC = AArch64CC::getInvertedCondCode(OutCC);
2830  return CmpL;
2831 }
2832 
2833 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2834 /// In some cases this is even possible with OR operations in the expression.
2835 /// See \ref AArch64CCMP.
2836 /// \see emitConjunctionRec().
2837 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2838  AArch64CC::CondCode &OutCC) {
2839  bool DummyCanNegate;
2840  bool DummyMustBeFirst;
2841  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2842  return SDValue();
2843 
2844  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2845 }
2846 
2847 /// @}
2848 
2849 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2850 /// extension operations.
2851 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
2852  auto isSupportedExtend = [&](SDValue V) {
2853  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2854  return true;
2855 
2856  if (V.getOpcode() == ISD::AND)
2857  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2858  uint64_t Mask = MaskCst->getZExtValue();
2859  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2860  }
2861 
2862  return false;
2863  };
2864 
2865  if (!Op.hasOneUse())
2866  return 0;
2867 
2868  if (isSupportedExtend(Op))
2869  return 1;
2870 
2871  unsigned Opc = Op.getOpcode();
2872  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2873  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2874  uint64_t Shift = ShiftCst->getZExtValue();
2875  if (isSupportedExtend(Op.getOperand(0)))
2876  return (Shift <= 4) ? 2 : 1;
2877  EVT VT = Op.getValueType();
2878  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2879  return 1;
2880  }
2881 
2882  return 0;
2883 }
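// Illustrative scoring (not from the original source), assuming each operand
// below has a single use and "x" is not itself an extend:
//   (shl (and x, 0xff), 3) -> 2 (both the uxtb-style mask and the shift can
//                                fold into the compare operand)
//   (shl x, 5) on i32      -> 1 (only the shift folds)
//   anything else          -> 0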
2884 
2885 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2886  SDValue &AArch64cc, SelectionDAG &DAG,
2887  const SDLoc &dl) {
2888  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2889  EVT VT = RHS.getValueType();
2890  uint64_t C = RHSC->getZExtValue();
2891  if (!isLegalArithImmed(C)) {
2892  // Constant does not fit, try adjusting it by one?
2893  switch (CC) {
2894  default:
2895  break;
2896  case ISD::SETLT:
2897  case ISD::SETGE:
2898  if ((VT == MVT::i32 && C != 0x80000000 &&
2899  isLegalArithImmed((uint32_t)(C - 1))) ||
2900  (VT == MVT::i64 && C != 0x80000000ULL &&
2901  isLegalArithImmed(C - 1ULL))) {
2902  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2903  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2904  RHS = DAG.getConstant(C, dl, VT);
2905  }
2906  break;
2907  case ISD::SETULT:
2908  case ISD::SETUGE:
2909  if ((VT == MVT::i32 && C != 0 &&
2910  isLegalArithImmed((uint32_t)(C - 1))) ||
2911  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2912  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2913  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2914  RHS = DAG.getConstant(C, dl, VT);
2915  }
2916  break;
2917  case ISD::SETLE:
2918  case ISD::SETGT:
2919  if ((VT == MVT::i32 && C != INT32_MAX &&
2920  isLegalArithImmed((uint32_t)(C + 1))) ||
2921  (VT == MVT::i64 && C != INT64_MAX &&
2922  isLegalArithImmed(C + 1ULL))) {
2923  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2924  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2925  RHS = DAG.getConstant(C, dl, VT);
2926  }
2927  break;
2928  case ISD::SETULE:
2929  case ISD::SETUGT:
2930  if ((VT == MVT::i32 && C != UINT32_MAX &&
2931  isLegalArithImmed((uint32_t)(C + 1))) ||
2932  (VT == MVT::i64 && C != UINT64_MAX &&
2933  isLegalArithImmed(C + 1ULL))) {
2934  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2935  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2936  RHS = DAG.getConstant(C, dl, VT);
2937  }
2938  break;
2939  }
2940  }
2941  }
2942 
2943  // Comparisons are canonicalized so that the RHS operand is simpler than the
2944  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2945  // can fold some shift+extend operations on the RHS operand, so swap the
2946  // operands if that can be done.
2947  //
2948  // For example:
2949  // lsl w13, w11, #1
2950  // cmp w13, w12
2951  // can be turned into:
2952  // cmp w12, w11, lsl #1
2953  if (!isa<ConstantSDNode>(RHS) ||
2954  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2955  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2956 
2957  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
2958  std::swap(LHS, RHS);
2959  CC = ISD::getSetCCSwappedOperands(CC);
2960  }
2961  }
2962 
2963  SDValue Cmp;
2964  AArch64CC::CondCode AArch64CC;
2965  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2966  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2967 
2968  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2969  // For the i8 operand, the largest immediate is 255, so this can be easily
2970  // encoded in the compare instruction. For the i16 operand, however, the
2971  // largest immediate cannot be encoded in the compare.
2972  // Therefore, use a sign extending load and cmn to avoid materializing the
2973  // -1 constant. For example,
2974  // movz w1, #65535
2975  // ldrh w0, [x0, #0]
2976  // cmp w0, w1
2977  // >
2978  // ldrsh w0, [x0, #0]
2979  // cmn w0, #1
2980  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2981  // if and only if (sext LHS) == (sext RHS). The checks are in place to
2982  // ensure both the LHS and RHS are truly zero extended and to make sure the
2983  // transformation is profitable.
2984  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2985  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2986  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2987  LHS.getNode()->hasNUsesOfValue(1, 0)) {
2988  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2989  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2990  SDValue SExt =
2991  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2992  DAG.getValueType(MVT::i16));
2993  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2994  RHS.getValueType()),
2995  CC, dl, DAG);
2996  AArch64CC = changeIntCCToAArch64CC(CC);
2997  }
2998  }
2999 
3000  if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3001  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3002  if ((CC == ISD::SETNE) ^ RHSC->isZero())
3003  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3004  }
3005  }
3006  }
3007 
3008  if (!Cmp) {
3009  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3010  AArch64CC = changeIntCCToAArch64CC(CC);
3011  }
3012  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3013  return Cmp;
3014 }
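// Illustrative sketch of the immediate adjustment performed above: 0x1001 is
// not a legal ADD/SUB immediate (it is neither a 12-bit value nor a 12-bit
// value shifted left by 12), but 0x1000 is, so a signed "x < 0x1001" can be
// compared as "x <= 0x1000". The helper name is hypothetical and the assembly
// in the comment is an assumed, typical result.
static bool exampleAdjustedImmediateCompare(int32_t x) {
  // Assumed codegen:
  //   cmp  w0, #1, lsl #12   // i.e. cmp w0, #4096
  //   cset w0, le
  return x < 0x1001;
}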
3015 
3016 static std::pair<SDValue, SDValue>
3017 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3018  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3019  "Unsupported value type");
3020  SDValue Value, Overflow;
3021  SDLoc DL(Op);
3022  SDValue LHS = Op.getOperand(0);
3023  SDValue RHS = Op.getOperand(1);
3024  unsigned Opc = 0;
3025  switch (Op.getOpcode()) {
3026  default:
3027  llvm_unreachable("Unknown overflow instruction!");
3028  case ISD::SADDO:
3029  Opc = AArch64ISD::ADDS;
3030  CC = AArch64CC::VS;
3031  break;
3032  case ISD::UADDO:
3033  Opc = AArch64ISD::ADDS;
3034  CC = AArch64CC::HS;
3035  break;
3036  case ISD::SSUBO:
3037  Opc = AArch64ISD::SUBS;
3038  CC = AArch64CC::VS;
3039  break;
3040  case ISD::USUBO:
3041  Opc = AArch64ISD::SUBS;
3042  CC = AArch64CC::LO;
3043  break;
3044  // Multiply needs a little bit of extra work.
3045  case ISD::SMULO:
3046  case ISD::UMULO: {
3047  CC = AArch64CC::NE;
3048  bool IsSigned = Op.getOpcode() == ISD::SMULO;
3049  if (Op.getValueType() == MVT::i32) {
3050  // Extend to 64-bits, then perform a 64-bit multiply.
3051  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3052  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3053  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3054  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3055  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3056 
3057  // Check that the result fits into a 32-bit integer.
3058  SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3059  if (IsSigned) {
3060  // cmp xreg, wreg, sxtw
3061  SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3062  Overflow =
3063  DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3064  } else {
3065  // tst xreg, #0xffffffff00000000
3066  SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3067  Overflow =
3068  DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3069  }
3070  break;
3071  }
3072  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3073  // For the 64-bit multiply:
3074  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3075  if (IsSigned) {
3076  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3077  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3078  DAG.getConstant(63, DL, MVT::i64));
3079  // It is important that LowerBits is last, otherwise the arithmetic
3080  // shift will not be folded into the compare (SUBS).
3081  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3082  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3083  .getValue(1);
3084  } else {
3085  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3086  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3087  Overflow =
3088  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3089  DAG.getConstant(0, DL, MVT::i64),
3090  UpperBits).getValue(1);
3091  }
3092  break;
3093  }
3094  } // switch (...)
3095 
3096  if (Opc) {
3097  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3098 
3099  // Emit the AArch64 operation with overflow check.
3100  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3101  Overflow = Value.getValue(1);
3102  }
3103  return std::make_pair(Value, Overflow);
3104 }
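// Illustrative sketch of the i32 multiply-overflow checks above, expressed on
// plain integer types (uses only <cstdint>, which this file already includes;
// the helper names are hypothetical): the unsigned case widens, multiplies,
// and tests the upper 32 bits; the signed case checks that the 64-bit product
// sign-extends from its low half.
static bool exampleUMul32Overflows(uint32_t a, uint32_t b) {
  uint64_t Wide = (uint64_t)a * (uint64_t)b;    // 64-bit multiply (UMULL)
  return (Wide & 0xFFFFFFFF00000000ULL) != 0;   // tst xreg, #0xffffffff00000000
}
static bool exampleSMul32Overflows(int32_t a, int32_t b) {
  int64_t Wide = (int64_t)a * (int64_t)b;       // 64-bit multiply (SMULL)
  return Wide != (int64_t)(int32_t)Wide;        // cmp xreg, wreg, sxtw
}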
3105 
3106 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3107  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3108  return LowerToScalableOp(Op, DAG);
3109 
3110  SDValue Sel = Op.getOperand(0);
3111  SDValue Other = Op.getOperand(1);
3112  SDLoc dl(Sel);
3113 
3114  // If the operand is an overflow checking operation, invert the condition
3115  // code and kill the Not operation. I.e., transform:
3116  // (xor (overflow_op_bool, 1))
3117  // -->
3118  // (csel 1, 0, invert(cc), overflow_op_bool)
3119  // ... which later gets transformed to just a cset instruction with an
3120  // inverted condition code, rather than a cset + eor sequence.
3121  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3122  // Only lower legal XALUO ops.
3123  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3124  return SDValue();
3125 
3126  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3127  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3128  AArch64CC::CondCode CC;
3129  SDValue Value, Overflow;
3130  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3131  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3132  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3133  CCVal, Overflow);
3134  }
3135  // If neither operand is a SELECT_CC, give up.
3136  if (Sel.getOpcode() != ISD::SELECT_CC)
3137  std::swap(Sel, Other);
3138  if (Sel.getOpcode() != ISD::SELECT_CC)
3139  return Op;
3140 
3141  // The folding we want to perform is:
3142  // (xor x, (select_cc a, b, cc, 0, -1) )
3143  // -->
3144  // (csel x, (xor x, -1), cc ...)
3145  //
3146  // The latter will get matched to a CSINV instruction.
3147 
3148  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3149  SDValue LHS = Sel.getOperand(0);
3150  SDValue RHS = Sel.getOperand(1);
3151  SDValue TVal = Sel.getOperand(2);
3152  SDValue FVal = Sel.getOperand(3);
3153 
3154  // FIXME: This could be generalized to non-integer comparisons.
3155  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3156  return Op;
3157 
3158  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3159  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3160 
3161  // The values aren't constants, this isn't the pattern we're looking for.
3162  if (!CFVal || !CTVal)
3163  return Op;
3164 
3165  // We can commute the SELECT_CC by inverting the condition. This
3166  // might be needed to make this fit into a CSINV pattern.
3167  if (CTVal->isAllOnes() && CFVal->isZero()) {
3168  std::swap(TVal, FVal);
3169  std::swap(CTVal, CFVal);
3170  CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3171  }
3172 
3173  // If the constants line up, perform the transform!
3174  if (CTVal->isZero() && CFVal->isAllOnes()) {
3175  SDValue CCVal;
3176  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3177 
3178  FVal = Other;
3179  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3180  DAG.getConstant(-1ULL, dl, Other.getValueType()));
3181 
3182  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3183  CCVal, Cmp);
3184  }
3185 
3186  return Op;
3187 }
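// Illustrative sketch of the fold above at the source level: x ^ (c ? 0 : -1)
// is x when c holds and ~x otherwise, which is exactly what CSINV computes.
// Whether this exact pattern survives the middle end is an assumption; the
// helper name is hypothetical and the assembly in the comment is only an
// assumed result.
static unsigned exampleCsinvFold(unsigned x, int a, int b) {
  // Assumed codegen:
  //   cmp   w1, w2
  //   csinv w0, w0, w0, lt   // x if a < b, ~x otherwise
  return x ^ (a < b ? 0u : ~0u);
}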
3188 
3189 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3190  EVT VT = Op.getValueType();
3191 
3192  // Let legalize expand this if it isn't a legal type yet.
3193  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3194  return SDValue();
3195 
3196  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3197 
3198  unsigned Opc;
3199  bool ExtraOp = false;
3200  switch (Op.getOpcode()) {
3201  default:
3202  llvm_unreachable("Invalid code");
3203  case ISD::ADDC:
3204  Opc = AArch64ISD::ADDS;
3205  break;
3206  case ISD::SUBC:
3207  Opc = AArch64ISD::SUBS;
3208  break;
3209  case ISD::ADDE:
3210  Opc = AArch64ISD::ADCS;
3211  ExtraOp = true;
3212  break;
3213  case ISD::SUBE:
3214  Opc = AArch64ISD::SBCS;
3215  ExtraOp = true;
3216  break;
3217  }
3218 
3219  if (!ExtraOp)
3220  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3221  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3222  Op.getOperand(2));
3223 }
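// Illustrative sketch: ADDC/ADDE typically arise from multi-word arithmetic,
// and the ADDS/ADCS mapping above is what lets the carry flow between the two
// halves. The example relies on the Clang/GCC __int128 extension, the helper
// name is hypothetical, and the assembly in the comment is an assumed, typical
// result.
static unsigned __int128 exampleAdd128(unsigned __int128 a, unsigned __int128 b) {
  // Assumed codegen:
  //   adds x0, x0, x2            // low halves, sets the carry
  //   adcs x1, x1, x3            // high halves, consumes the carry
  return a + b;
}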
3224 
3225 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3226  // Let legalize expand this if it isn't a legal type yet.
3227  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3228  return SDValue();
3229 
3230  SDLoc dl(Op);
3231  AArch64CC::CondCode CC;
3232  // The actual operation that sets the overflow or carry flag.
3233  SDValue Value, Overflow;
3234  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3235 
3236  // We use 0 and 1 as false and true values.
3237  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3238  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3239 
3240  // We use an inverted condition, because the conditional select is inverted
3241  // too. This will allow it to be selected to a single instruction:
3242  // CSINC Wd, WZR, WZR, invert(cond).
3243  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3244  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3245  CCVal, Overflow);
3246 
3247  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3248  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3249 }
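// Illustrative sketch: an overflow-checked addition is the kind of operation
// LowerXALUO handles; the boolean result is expected to come out as a single
// CSET (an alias of CSINC Wd, WZR, WZR, invert(cond)). The example uses the
// Clang/GCC __builtin_add_overflow builtin; the helper name and the register
// choices in the comment are assumed.
static bool exampleUAddOverflows(uint32_t a, uint32_t b, uint32_t *Sum) {
  // Assumed codegen:
  //   adds w8, w0, w1
  //   cset w9, hs                // carry set => unsigned overflow
  //   str  w8, [x2]
  return __builtin_add_overflow(a, b, Sum);
}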
3250 
3251 // Prefetch operands are:
3252 // 1: Address to prefetch
3253 // 2: bool isWrite
3254 // 3: int locality (0 = no locality ... 3 = extreme locality)
3255 // 4: bool isDataCache
3256 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3257  SDLoc DL(Op);
3258  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3259  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3260  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3261 
3262  bool IsStream = !Locality;
3263  // When the locality number is set
3264  if (Locality) {
3265  // The front-end should have filtered out the out-of-range values
3266  assert(Locality <= 3 && "Prefetch locality out-of-range");
3267  // The locality degree is the inverse of the cache level: higher locality
3268  // means a faster (inner) cache, and the encoding starts at 0 for L1,
3269  // so flip the value.
3270  Locality = 3 - Locality;
3271  }
3272 
3273  // Build the mask value encoding the expected behavior.
3274  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3275  (!IsData << 3) | // IsDataCache bit
3276  (Locality << 1) | // Cache level bits
3277  (unsigned)IsStream; // Stream bit
3278  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3279  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3280 }
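// Illustrative sketch (hypothetical helper mirroring the encoding above, on
// plain integer types): the PRFM operand packs the load/store bit, the
// instruction/data bit, the cache level, and the streaming policy into five
// bits. For a data read prefetch with locality 3 (keep in L1, non-streaming)
// this yields 0, i.e. "pldl1keep".
static unsigned examplePrfmOperand(bool IsWrite, bool IsData, unsigned Locality) {
  bool IsStream = (Locality == 0);
  unsigned CacheLevel = IsStream ? 0 : 3 - Locality; // locality 3 -> L1 (0)
  return (IsWrite << 4) | (!IsData << 3) | (CacheLevel << 1) | (unsigned)IsStream;
}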
3281 
3282 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3283  SelectionDAG &DAG) const {
3284  EVT VT = Op.getValueType();
3285  if (VT.isScalableVector())
3286  return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3287 
3288  if (useSVEForFixedLengthVectorVT(VT))
3289  return LowerFixedLengthFPExtendToSVE(Op, DAG);
3290 
3291  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3292  return SDValue();
3293 }
3294 
3295 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3296  SelectionDAG &DAG) const {
3297  if (Op.getValueType().isScalableVector())
3298  return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3299 
3300  bool IsStrict = Op->isStrictFPOpcode();
3301  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3302  EVT SrcVT = SrcVal.getValueType();
3303 
3304  if (useSVEForFixedLengthVectorVT(SrcVT))
3305  return LowerFixedLengthFPRoundToSVE(Op, DAG);
3306 
3307  if (SrcVT != MVT::f128) {
3308  // Expand cases where the input is a vector bigger than NEON.
3309  if (useSVEForFixedLengthVectorVT(SrcVT))
3310  return SDValue();
3311 
3312  // It's legal except when f128 is involved
3313  return Op;
3314  }
3315 
3316  return SDValue();
3317 }
3318 
3319 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3320  SelectionDAG &DAG) const {
3321  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3322  // Any additional optimization in this function should be recorded
3323  // in the cost tables.
3324  EVT InVT = Op.getOperand(0).getValueType();
3325  EVT VT = Op.getValueType();
3326 
3327  if (VT.isScalableVector()) {
3328  unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3329  ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3330  : AAr