LLVM 14.0.0git - AArch64ISelLowering.cpp
1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
34 #include "llvm/CodeGen/Analysis.h"
50 #include "llvm/IR/Attributes.h"
51 #include "llvm/IR/Constants.h"
52 #include "llvm/IR/DataLayout.h"
53 #include "llvm/IR/DebugLoc.h"
54 #include "llvm/IR/DerivedTypes.h"
55 #include "llvm/IR/Function.h"
57 #include "llvm/IR/GlobalValue.h"
58 #include "llvm/IR/IRBuilder.h"
59 #include "llvm/IR/Instruction.h"
60 #include "llvm/IR/Instructions.h"
61 #include "llvm/IR/IntrinsicInst.h"
62 #include "llvm/IR/Intrinsics.h"
63 #include "llvm/IR/IntrinsicsAArch64.h"
64 #include "llvm/IR/Module.h"
65 #include "llvm/IR/OperandTraits.h"
66 #include "llvm/IR/PatternMatch.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/IR/Use.h"
69 #include "llvm/IR/Value.h"
70 #include "llvm/MC/MCRegisterInfo.h"
71 #include "llvm/Support/Casting.h"
72 #include "llvm/Support/CodeGen.h"
74 #include "llvm/Support/Compiler.h"
75 #include "llvm/Support/Debug.h"
77 #include "llvm/Support/KnownBits.h"
83 #include <algorithm>
84 #include <bitset>
85 #include <cassert>
86 #include <cctype>
87 #include <cstdint>
88 #include <cstdlib>
89 #include <iterator>
90 #include <limits>
91 #include <tuple>
92 #include <utility>
93 #include <vector>
94 
95 using namespace llvm;
96 using namespace llvm::PatternMatch;
97 
98 #define DEBUG_TYPE "aarch64-lower"
99 
100 STATISTIC(NumTailCalls, "Number of tail calls");
101 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
102 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
103 
104 // FIXME: The necessary dtprel relocations don't seem to be supported
105 // well in the GNU bfd and gold linkers at the moment. Therefore, by
106 // default, for now, fall back to GeneralDynamic code generation.
107 static cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
108  "aarch64-elf-ldtls-generation", cl::Hidden,
109  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
110  cl::init(false));
111 
112 static cl::opt<bool>
113 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
114  cl::desc("Enable AArch64 logical imm instruction "
115  "optimization"),
116  cl::init(true));
117 
118 // Temporary option added for the purpose of testing functionality added
119 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
120 // in the future once both implementations are based on MGATHER rather
121 // than the GLD1 nodes added for the SVE gather load intrinsics.
122 static cl::opt<bool>
123 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
124  cl::desc("Combine extends of AArch64 masked "
125  "gather intrinsics"),
126  cl::init(true));
127 
128 /// Value type used for condition codes.
129 static const MVT MVT_CC = MVT::i32;
130 
131 static inline EVT getPackedSVEVectorVT(EVT VT) {
132  switch (VT.getSimpleVT().SimpleTy) {
133  default:
134  llvm_unreachable("unexpected element type for vector");
135  case MVT::i8:
136  return MVT::nxv16i8;
137  case MVT::i16:
138  return MVT::nxv8i16;
139  case MVT::i32:
140  return MVT::nxv4i32;
141  case MVT::i64:
142  return MVT::nxv2i64;
143  case MVT::f16:
144  return MVT::nxv8f16;
145  case MVT::f32:
146  return MVT::nxv4f32;
147  case MVT::f64:
148  return MVT::nxv2f64;
149  case MVT::bf16:
150  return MVT::nxv8bf16;
151  }
152 }
153 
154 // NOTE: Currently there's only a need to return integer vector types. If this
155 // changes then just add an extra "type" parameter.
156 static inline MVT getPackedSVEVectorVT(ElementCount EC) {
157  switch (EC.getKnownMinValue()) {
158  default:
159  llvm_unreachable("unexpected element count for vector");
160  case 16:
161  return MVT::nxv16i8;
162  case 8:
163  return MVT::nxv8i16;
164  case 4:
165  return MVT::nxv4i32;
166  case 2:
167  return MVT::nxv2i64;
168  }
169 }
170 
171 static inline EVT getPromotedVTForPredicate(EVT VT) {
172  assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
173  "Expected scalable predicate vector type!");
174  switch (VT.getVectorMinNumElements()) {
175  default:
176  llvm_unreachable("unexpected element count for vector");
177  case 2:
178  return MVT::nxv2i64;
179  case 4:
180  return MVT::nxv4i32;
181  case 8:
182  return MVT::nxv8i16;
183  case 16:
184  return MVT::nxv16i8;
185  }
186 }
187 
188 /// Returns true if VT's elements occupy the lowest bit positions of its
189 /// associated register class without any intervening space.
190 ///
191 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
192 /// same register class, but only nxv8f16 can be treated as a packed vector.
193 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
194  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
195  "Expected legal vector type!");
196  return VT.isFixedLengthVector() ||
197  VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
198 }
199 
200 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
201 // predicate and end with a passthru value matching the result type.
202 static bool isMergePassthruOpcode(unsigned Opc) {
203  switch (Opc) {
204  default:
205  return false;
234  return true;
235  }
236 }
237 
238 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
239  const AArch64Subtarget &STI)
240  : TargetLowering(TM), Subtarget(&STI) {
241  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
242  // we have to make something up. Arbitrarily, choose ZeroOrOne.
243  setBooleanContents(ZeroOrOneBooleanContent);
244  // When comparing vectors the result sets the different elements in the
245  // vector to all-one or all-zero.
246  setBooleanVectorContents(ZeroAndNegativeOneBooleanContent);
247 
248  // Set up the register classes.
249  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
250  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
251 
252  if (Subtarget->hasLS64()) {
253  addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
256  }
257 
258  if (Subtarget->hasFPARMv8()) {
259  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
260  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
261  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
262  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
263  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
264  }
265 
266  if (Subtarget->hasNEON()) {
267  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
268  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
269  // Someone set us up the NEON.
270  addDRTypeForNEON(MVT::v2f32);
271  addDRTypeForNEON(MVT::v8i8);
272  addDRTypeForNEON(MVT::v4i16);
273  addDRTypeForNEON(MVT::v2i32);
274  addDRTypeForNEON(MVT::v1i64);
275  addDRTypeForNEON(MVT::v1f64);
276  addDRTypeForNEON(MVT::v4f16);
277  if (Subtarget->hasBF16())
278  addDRTypeForNEON(MVT::v4bf16);
279 
280  addQRTypeForNEON(MVT::v4f32);
281  addQRTypeForNEON(MVT::v2f64);
282  addQRTypeForNEON(MVT::v16i8);
283  addQRTypeForNEON(MVT::v8i16);
284  addQRTypeForNEON(MVT::v4i32);
285  addQRTypeForNEON(MVT::v2i64);
286  addQRTypeForNEON(MVT::v8f16);
287  if (Subtarget->hasBF16())
288  addQRTypeForNEON(MVT::v8bf16);
289  }
290 
291  if (Subtarget->hasSVE()) {
292  // Add legal sve predicate types
293  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
294  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
295  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
296  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
297 
298  // Add legal sve data types
299  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
300  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
301  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
302  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
303 
304  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
305  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
306  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
307  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
308  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
309  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
310 
311  if (Subtarget->hasBF16()) {
312  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
313  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
314  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
315  }
316 
317  if (Subtarget->useSVEForFixedLengthVectors()) {
319  if (useSVEForFixedLengthVectorVT(VT))
320  addRegisterClass(VT, &AArch64::ZPRRegClass);
321 
323  if (useSVEForFixedLengthVectorVT(VT))
324  addRegisterClass(VT, &AArch64::ZPRRegClass);
325  }
326 
327  for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
336  }
337 
338  for (auto VT :
342 
343  for (auto VT :
345  MVT::nxv2f64 }) {
357 
369  }
370  }
371 
372  // Compute derived properties from the register classes
374 
375  // Provide all sorts of operation actions
409 
413 
417 
419 
420  // Custom lowering hooks are needed for XOR
421  // to fold it into CSINC/CSINV.
424 
425  // Virtually no operation on f128 is legal, but LLVM can't expand them when
426  // there's a valid register class, so we need custom operations in most cases.
450 
451  // Lowering for many of the conversions is actually specified by the non-f128
452  // type. The LowerXXX function will be trivial when f128 isn't involved.
483 
488 
489  // Variable arguments.
494 
495  // Variable-sized objects.
498 
499  if (Subtarget->isTargetWindows())
501  else
503 
504  // Constant pool entries
506 
507  // BlockAddress
509 
510  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
519 
520  // AArch64 lacks both left-rotate and popcount instructions.
523  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
526  }
527 
528  // AArch64 doesn't have i32 MULH{S|U}.
531 
532  // AArch64 doesn't have {U|S}MUL_LOHI.
535 
539 
542 
545  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
548  }
555 
556  // Custom lower Add/Sub/Mul with overflow.
569 
578  if (Subtarget->hasFullFP16())
580  else
582 
616 
617  if (!Subtarget->hasFullFP16()) {
641 
642  // promote v4f16 to v4f32 when that is known to be safe.
651 
668 
690  }
691 
692  // AArch64 has implementations of a lot of rounding-like FP operations.
693  for (MVT Ty : {MVT::f32, MVT::f64}) {
709  }
710 
711  if (Subtarget->hasFullFP16()) {
723  }
724 
726 
729 
735 
736  // Generate outline atomics library calls only if LSE was not specified for
737  // subtarget
738  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
764 #define LCALLNAMES(A, B, N) \
765  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
766  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
767  setLibcallName(A##N##_REL, #B #N "_rel"); \
768  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
769 #define LCALLNAME4(A, B) \
770  LCALLNAMES(A, B, 1) \
771  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
772 #define LCALLNAME5(A, B) \
773  LCALLNAMES(A, B, 1) \
774  LCALLNAMES(A, B, 2) \
775  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
776  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
777  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
778  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
779  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
780  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
781  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
782 #undef LCALLNAMES
783 #undef LCALLNAME4
784 #undef LCALLNAME5
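 // Worked example (editor's illustration, not part of the upstream source):
 // the macros above stamp out one libcall name per access size and memory
 // order. Assuming the usual RTLIB enumerator spelling, the expansion of
 // LCALLNAMES(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp, 4) is:
 //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_RELAX, "__aarch64_swp4_relax");
 //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ, "__aarch64_swp4_acq");
 //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_REL, "__aarch64_swp4_rel");
 //   setLibcallName(RTLIB::OUTLINE_ATOMIC_SWP4_ACQ_REL, "__aarch64_swp4_acq_rel");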
785  }
786 
787  // 128-bit loads and stores can be done without expanding
790 
791  // Aligned 128-bit loads and stores are single-copy atomic according to the
792  // v8.4a spec.
793  if (Subtarget->hasLSE2()) {
796  }
797 
798  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
799  // custom lowering, as there are no un-paired non-temporal stores and
800  // legalization will break up 256 bit inputs.
808 
809  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
810  // This requires the Performance Monitors extension.
811  if (Subtarget->hasPerfMon())
813 
814  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
815  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
816  // Issue __sincos_stret if available.
819  } else {
822  }
823 
824  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
825  // MSVCRT doesn't have powi; fall back to pow
826  setLibcallName(RTLIB::POWI_F32, nullptr);
827  setLibcallName(RTLIB::POWI_F64, nullptr);
828  }
829 
830  // Make floating-point constants legal for the large code model, so they don't
831  // become loads from the constant pool.
832  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
835  }
836 
837  // AArch64 does not have floating-point extending loads, i1 sign-extending
838  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
839  for (MVT VT : MVT::fp_valuetypes()) {
844  }
845  for (MVT VT : MVT::integer_valuetypes())
847 
855 
859 
860  // Indexed loads and stores are supported.
861  for (unsigned im = (unsigned)ISD::PRE_INC;
862  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
879  }
880 
881  // Trap.
885 
886  // We combine OR nodes for bitfield operations.
888  // Try to create BICs for vector ANDs.
890 
891  // Vector add and sub nodes may conceal a high-half opportunity.
892  // Also, try to fold ADD into CSINC/CSINV..
899 
905 
906  // Try and combine setcc with csel
908 
910 
920  if (Subtarget->supportsAddressTopByteIgnored())
922 
924 
927 
934 
936 
938 
939  // In case of strict alignment, avoid an excessive number of byte wide stores.
943 
948 
950 
954 
956 
958 
959  EnableExtLdPromotion = true;
960 
961  // Set required alignment.
963  // Set preferred alignments.
967 
968  // Only change the limit for entries in a jump table if specified by
969  // the subtarget, but not at the command line.
970  unsigned MaxJT = STI.getMaximumJumpTableSize();
971  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
973 
974  setHasExtractBitsInsn(true);
975 
977 
978  if (Subtarget->hasNEON()) {
979  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
980  // silliness like this:
1007 
1013 
1016 
1018 
1019  // AArch64 doesn't have direct vector->f32 conversion instructions for
1020  // elements smaller than i32, so promote the input to i32 first.
1023 
1024  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
1029  // Or, direct i32 -> f16 vector conversion. Set it to custom, so the
1030  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1033 
1034  if (Subtarget->hasFullFP16()) {
1043  } else {
1044  // when AArch64 doesn't have fullfp16 support, promote the input
1045  // to i32 first.
1054  }
1055 
1064  for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1069  }
1070 
1071  // AArch64 doesn't have MUL.2d:
1073  // Custom handling for some quad-vector types to detect MULL.
1077 
1078  // Saturates
1079  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1085  }
1086 
1088  MVT::v4i32}) {
1091  }
1092 
1093  // Vector reductions
1094  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1096  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1099 
1101  }
1102  }
1103  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1110  }
1112 
1115  // Likewise, narrowing and extending vector loads/stores aren't handled
1116  // directly.
1117  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1119 
1120  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1123  } else {
1126  }
1129 
1132 
1133  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1134  setTruncStoreAction(VT, InnerVT, Expand);
1135  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1136  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1137  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1138  }
1139  }
1140 
1141  // AArch64 has implementations of a lot of rounding-like FP operations.
1142  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1150  }
1151 
1152  if (Subtarget->hasFullFP16()) {
1153  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1161  }
1162  }
1163 
1164  if (Subtarget->hasSVE())
1166 
1168 
1175  }
1176 
1177  if (Subtarget->hasSVE()) {
1178  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1219 
1225  }
1226 
1227  // Illegal unpacked integer vector types.
1228  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1231  }
1232 
1233  // Legalize unpacked bitcasts to REINTERPRET_CAST.
1237 
1238  for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1247 
1251 
1252  // There are no legal MVT::nxv16f## based types.
1253  if (VT != MVT::nxv16i1) {
1256  }
1257  }
1258 
1259  // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1267  }
1268 
1270  for (MVT InnerVT : MVT::fp_scalable_vector_valuetypes()) {
1271  // Avoid marking truncating FP stores as legal to prevent the
1272  // DAGCombiner from creating unsupported truncating stores.
1273  setTruncStoreAction(VT, InnerVT, Expand);
1274  // SVE does not have floating-point extending loads.
1275  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1276  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1277  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1278  }
1279  }
1280 
1281  // SVE supports truncating stores of 64 and 128-bit vectors
1287 
1324 
1326  }
1327 
1328  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1334  }
1335 
1337 
1340 
1341  // NOTE: Currently this has to happen after computeRegisterProperties rather
1342  // than the preferred option of combining it with the addRegisterClass call.
1343  if (Subtarget->useSVEForFixedLengthVectors()) {
1345  if (useSVEForFixedLengthVectorVT(VT))
1346  addTypeForFixedLengthSVE(VT);
1348  if (useSVEForFixedLengthVectorVT(VT))
1349  addTypeForFixedLengthSVE(VT);
1350 
1351  // 64-bit results can mean a bigger-than-NEON input.
1352  for (auto VT : {MVT::v8i8, MVT::v4i16})
1355 
1356  // 128-bit results imply a bigger-than-NEON input.
1357  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1359  for (auto VT : {MVT::v8f16, MVT::v4f32})
1361 
1362  // These operations are not supported on NEON but SVE can do them.
1401 
1402  // Int operations with no NEON support.
1403  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1410  }
1411 
1412  // FP operations with no NEON support.
1413  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1416 
1417  // Use SVE for vectors with more than 2 elements.
1418  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1420  }
1421 
1426  }
1427 
1429 }
1430 
1431 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1432  assert(VT.isVector() && "VT should be a vector type");
1433 
1434  if (VT.isFloatingPoint()) {
1435  MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1436  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1437  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1438  }
1439 
1440  // Mark vector float intrinsics as expand.
1441  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1450  }
1451 
1452  // But we do support custom-lowering for FCOPYSIGN.
1453  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1454  ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1456 
1468 
1472  for (MVT InnerVT : MVT::all_valuetypes())
1473  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1474 
1475  // CNT supports only B element sizes, then use UADDLP to widen.
1476  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1478 
1484 
1489 
1490  if (!VT.isFloatingPoint())
1492 
1493  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1494  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1495  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1496  setOperationAction(Opcode, VT, Legal);
1497 
1498  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1499  if (VT.isFloatingPoint() &&
1500  VT.getVectorElementType() != MVT::bf16 &&
1501  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1502  for (unsigned Opcode :
1503  {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM})
1504  setOperationAction(Opcode, VT, Legal);
1505 
1506  if (Subtarget->isLittleEndian()) {
1507  for (unsigned im = (unsigned)ISD::PRE_INC;
1508  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1509  setIndexedLoadAction(im, VT, Legal);
1510  setIndexedStoreAction(im, VT, Legal);
1511  }
1512  }
1513 }
1514 
1515 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1516  EVT OpVT) const {
1517  // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1518  if (!Subtarget->hasSVE())
1519  return true;
1520 
1521  // We can only support legal predicate result types.
1522  if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1523  ResVT != MVT::nxv16i1)
1524  return true;
1525 
1526  // The whilelo instruction only works with i32 or i64 scalar inputs.
1527  if (OpVT != MVT::i32 && OpVT != MVT::i64)
1528  return true;
1529 
1530  return false;
1531 }
1532 
1533 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1534  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1535 
1536  // By default everything must be expanded.
1537  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1539 
1540  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1542 
1543  if (VT.isFloatingPoint()) {
1555  }
1556 
1557  // Mark integer truncating stores/extending loads as having custom lowering
1558  if (VT.isInteger()) {
1559  MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1560  while (InnerVT != VT) {
1561  setTruncStoreAction(VT, InnerVT, Custom);
1562  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1563  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1564  InnerVT = InnerVT.changeVectorElementType(
1565  MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1566  }
1567  }
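  // Example (editor's illustration, not part of the upstream source): for
  // VT = v8i32 the loop above marks truncating stores and sign/zero extending
  // loads to/from v8i8 and v8i16 as Custom, stopping once InnerVT widens to
  // v8i32 itself.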
1568 
1569  // Mark floating-point truncating stores/extending loads as having custom
1570  // lowering
1571  if (VT.isFloatingPoint()) {
1572  MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1573  while (InnerVT != VT) {
1574  setTruncStoreAction(VT, InnerVT, Custom);
1575  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1576  InnerVT = InnerVT.changeVectorElementType(
1577  MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1578  }
1579  }
1580 
1581  // Lower fixed length vector operations to scalable equivalents.
1663 }
1664 
1665 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1666  addRegisterClass(VT, &AArch64::FPR64RegClass);
1667  addTypeForNEON(VT);
1668 }
1669 
1670 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1671  addRegisterClass(VT, &AArch64::FPR128RegClass);
1672  addTypeForNEON(VT);
1673 }
1674 
1675 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1676  LLVMContext &C, EVT VT) const {
1677  if (!VT.isVector())
1678  return MVT::i32;
1679  if (VT.isScalableVector())
1680  return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1681  return VT.changeVectorElementTypeToInteger();
1682 }
1683 
1684 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1685  const APInt &Demanded,
1686  TargetLowering::TargetLoweringOpt &TLO,
1687  unsigned NewOpc) {
1688  uint64_t OldImm = Imm, NewImm, Enc;
1689  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1690 
1691  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1692  // bimm64.
1693  if (Imm == 0 || Imm == Mask ||
1694  AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1695  return false;
1696 
1697  unsigned EltSize = Size;
1698  uint64_t DemandedBits = Demanded.getZExtValue();
1699 
1700  // Clear bits that are not demanded.
1701  Imm &= DemandedBits;
1702 
1703  while (true) {
1704  // The goal here is to set the non-demanded bits in a way that minimizes
1705  // the number of switching between 0 and 1. In order to achieve this goal,
1706  // we set the non-demanded bits to the value of the preceding demanded bits.
1707  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1708  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1709  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1710  // The final result is 0b11000011.
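  // Worked example (editor's illustration, not part of the upstream source),
  // for an 8-bit element where the demanded immediate is 0bx10xx0x1 as above,
  // i.e. Imm = 0b01000001 and DemandedBits = 0b01100101:
  //   NonDemandedBits = 0b10011010
  //   InvertedImm     = 0b00100100
  //   RotatedImm      = 0b00001000
  //   Sum = 0b10100010, Carry = 0, Ones = 0b10000010
  //   NewImm = 0b11000011, matching the result described above.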
1711  uint64_t NonDemandedBits = ~DemandedBits;
1712  uint64_t InvertedImm = ~Imm & DemandedBits;
1713  uint64_t RotatedImm =
1714  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1715  NonDemandedBits;
1716  uint64_t Sum = RotatedImm + NonDemandedBits;
1717  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1718  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1719  NewImm = (Imm | Ones) & Mask;
1720 
1721  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1722  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1723  // we halve the element size and continue the search.
1724  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1725  break;
1726 
1727  // We cannot shrink the element size any further if it is 2-bits.
1728  if (EltSize == 2)
1729  return false;
1730 
1731  EltSize /= 2;
1732  Mask >>= EltSize;
1733  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1734 
1735  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1736  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1737  return false;
1738 
1739  // Merge the upper and lower halves of Imm and DemandedBits.
1740  Imm |= Hi;
1741  DemandedBits |= DemandedBitsHi;
1742  }
1743 
1744  ++NumOptimizedImms;
1745 
1746  // Replicate the element across the register width.
1747  while (EltSize < Size) {
1748  NewImm |= NewImm << EltSize;
1749  EltSize *= 2;
1750  }
1751 
1752  (void)OldImm;
1753  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1754  "demanded bits should never be altered");
1755  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1756 
1757  // Create the new constant immediate node.
1758  EVT VT = Op.getValueType();
1759  SDLoc DL(Op);
1760  SDValue New;
1761 
1762  // If the new constant immediate is all-zeros or all-ones, let the target
1763  // independent DAG combine optimize this node.
1764  if (NewImm == 0 || NewImm == OrigMask) {
1765  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1766  TLO.DAG.getConstant(NewImm, DL, VT));
1767  // Otherwise, create a machine node so that target independent DAG combine
1768  // doesn't undo this optimization.
1769  } else {
1770  Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1771  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1772  New = SDValue(
1773  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1774  }
1775 
1776  return TLO.CombineTo(Op, New);
1777 }
1778 
1779 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1780  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1781  TargetLoweringOpt &TLO) const {
1782  // Delay this optimization to as late as possible.
1783  if (!TLO.LegalOps)
1784  return false;
1785 
1786  if (!EnableOptimizeLogicalImm)
1787  return false;
1788 
1789  EVT VT = Op.getValueType();
1790  if (VT.isVector())
1791  return false;
1792 
1793  unsigned Size = VT.getSizeInBits();
1794  assert((Size == 32 || Size == 64) &&
1795  "i32 or i64 is expected after legalization.");
1796 
1797  // Exit early if we demand all bits.
1798  if (DemandedBits.countPopulation() == Size)
1799  return false;
1800 
1801  unsigned NewOpc;
1802  switch (Op.getOpcode()) {
1803  default:
1804  return false;
1805  case ISD::AND:
1806  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1807  break;
1808  case ISD::OR:
1809  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1810  break;
1811  case ISD::XOR:
1812  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1813  break;
1814  }
1815  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1816  if (!C)
1817  return false;
1818  uint64_t Imm = C->getZExtValue();
1819  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1820 }
1821 
1822 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1823 /// Mask are known to be either zero or one and return them Known.
1824 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1825  const SDValue Op, KnownBits &Known,
1826  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1827  switch (Op.getOpcode()) {
1828  default:
1829  break;
1830  case AArch64ISD::CSEL: {
1831  KnownBits Known2;
1832  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1833  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1834  Known = KnownBits::commonBits(Known, Known2);
1835  break;
1836  }
1837  case AArch64ISD::BICi: {
1838  // Compute the bit cleared value.
1839  uint64_t Mask =
1840  ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1841  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1842  Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
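  // Example (editor's illustration, not part of the upstream source): for
  // (BICi X, #0xff, #8) the computed Mask is ~0xff00, so bits 15..8 of each
  // element become known zero regardless of what is known about X.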
1843  break;
1844  }
1845  case AArch64ISD::VLSHR: {
1846  KnownBits Known2;
1847  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1848  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1849  Known = KnownBits::lshr(Known, Known2);
1850  break;
1851  }
1852  case AArch64ISD::VASHR: {
1853  KnownBits Known2;
1854  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1855  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1856  Known = KnownBits::ashr(Known, Known2);
1857  break;
1858  }
1859  case AArch64ISD::LOADgot:
1860  case AArch64ISD::ADDlow: {
1861  if (!Subtarget->isTargetILP32())
1862  break;
1863  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1864  Known.Zero = APInt::getHighBitsSet(64, 32);
1865  break;
1866  }
1867  case AArch64ISD::ASSERT_ZEXT_BOOL: {
1868  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1869  Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1870  break;
1871  }
1872  case ISD::INTRINSIC_W_CHAIN: {
1873  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1874  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1875  switch (IntID) {
1876  default: return;
1877  case Intrinsic::aarch64_ldaxr:
1878  case Intrinsic::aarch64_ldxr: {
1879  unsigned BitWidth = Known.getBitWidth();
1880  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1881  unsigned MemBits = VT.getScalarSizeInBits();
1882  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1883  return;
1884  }
1885  }
1886  break;
1887  }
1888  case ISD::INTRINSIC_WO_CHAIN:
1889  case ISD::INTRINSIC_VOID: {
1890  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1891  switch (IntNo) {
1892  default:
1893  break;
1894  case Intrinsic::aarch64_neon_umaxv:
1895  case Intrinsic::aarch64_neon_uminv: {
1896  // Figure out the datatype of the vector operand. The UMINV instruction
1897  // will zero extend the result, so we can mark as known zero all the
1898  // bits larger than the element datatype. 32-bit or larger doesn't need
1899  // this as those are legal types and will be handled by isel directly.
1900  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1901  unsigned BitWidth = Known.getBitWidth();
1902  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1903  assert(BitWidth >= 8 && "Unexpected width!");
1904  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1905  Known.Zero |= Mask;
1906  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1907  assert(BitWidth >= 16 && "Unexpected width!");
1908  APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1909  Known.Zero |= Mask;
1910  }
1911  break;
1912  } break;
1913  }
1914  }
1915  }
1916 }
1917 
1918 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1919  EVT) const {
1920  return MVT::i64;
1921 }
1922 
1923 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1924  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1925  bool *Fast) const {
1926  if (Subtarget->requiresStrictAlign())
1927  return false;
1928 
1929  if (Fast) {
1930  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1931  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1932  // See comments in performSTORECombine() for more details about
1933  // these conditions.
1934 
1935  // Code that uses clang vector extensions can mark that it
1936  // wants unaligned accesses to be treated as fast by
1937  // underspecifying alignment to be 1 or 2.
1938  Alignment <= 2 ||
1939 
1940  // Disregard v2i64. Memcpy lowering produces those and splitting
1941  // them regresses performance on micro-benchmarks and olden/bh.
1942  VT == MVT::v2i64;
1943  }
1944  return true;
1945 }
1946 
1947 // Same as above but handling LLTs instead.
1948 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1949  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1950  bool *Fast) const {
1951  if (Subtarget->requiresStrictAlign())
1952  return false;
1953 
1954  if (Fast) {
1955  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1956  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1957  Ty.getSizeInBytes() != 16 ||
1958  // See comments in performSTORECombine() for more details about
1959  // these conditions.
1960 
1961  // Code that uses clang vector extensions can mark that it
1962  // wants unaligned accesses to be treated as fast by
1963  // underspecifying alignment to be 1 or 2.
1964  Alignment <= 2 ||
1965 
1966  // Disregard v2i64. Memcpy lowering produces those and splitting
1967  // them regresses performance on micro-benchmarks and olden/bh.
1968  Ty == LLT::fixed_vector(2, 64);
1969  }
1970  return true;
1971 }
1972 
1973 FastISel *
1974 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
1975  const TargetLibraryInfo *libInfo) const {
1976  return AArch64::createFastISel(funcInfo, libInfo);
1977 }
1978 
1979 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1980 #define MAKE_CASE(V) \
1981  case V: \
1982  return #V;
1983  switch ((AArch64ISD::NodeType)Opcode) {
1984  case AArch64ISD::FIRST_NUMBER:
1985  break;
2269  }
2270 #undef MAKE_CASE
2271  return nullptr;
2272 }
2273 
2274 MachineBasicBlock *
2275 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2276  MachineBasicBlock *MBB) const {
2277  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2278  // phi node:
2279 
2280  // OrigBB:
2281  // [... previous instrs leading to comparison ...]
2282  // b.ne TrueBB
2283  // b EndBB
2284  // TrueBB:
2285  // ; Fallthrough
2286  // EndBB:
2287  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2288 
2289  MachineFunction *MF = MBB->getParent();
2290  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2291  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2292  DebugLoc DL = MI.getDebugLoc();
2293  MachineFunction::iterator It = ++MBB->getIterator();
2294 
2295  Register DestReg = MI.getOperand(0).getReg();
2296  Register IfTrueReg = MI.getOperand(1).getReg();
2297  Register IfFalseReg = MI.getOperand(2).getReg();
2298  unsigned CondCode = MI.getOperand(3).getImm();
2299  bool NZCVKilled = MI.getOperand(4).isKill();
2300 
2301  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2302  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2303  MF->insert(It, TrueBB);
2304  MF->insert(It, EndBB);
2305 
2306  // Transfer rest of current basic-block to EndBB
2307  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2308  MBB->end());
2309  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2310 
2311  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2312  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2313  MBB->addSuccessor(TrueBB);
2314  MBB->addSuccessor(EndBB);
2315 
2316  // TrueBB falls through to the end.
2317  TrueBB->addSuccessor(EndBB);
2318 
2319  if (!NZCVKilled) {
2320  TrueBB->addLiveIn(AArch64::NZCV);
2321  EndBB->addLiveIn(AArch64::NZCV);
2322  }
2323 
2324  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2325  .addReg(IfTrueReg)
2326  .addMBB(TrueBB)
2327  .addReg(IfFalseReg)
2328  .addMBB(MBB);
2329 
2330  MI.eraseFromParent();
2331  return EndBB;
2332 }
2333 
2334 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2335  MachineInstr &MI, MachineBasicBlock *BB) const {
2336  assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2337  BB->getParent()->getFunction().getPersonalityFn())) &&
2338  "SEH does not use catchret!");
2339  return BB;
2340 }
2341 
2342 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2343  MachineInstr &MI, MachineBasicBlock *BB) const {
2344  switch (MI.getOpcode()) {
2345  default:
2346 #ifndef NDEBUG
2347  MI.dump();
2348 #endif
2349  llvm_unreachable("Unexpected instruction for custom inserter!");
2350 
2351  case AArch64::F128CSEL:
2352  return EmitF128CSEL(MI, BB);
2353 
2354  case TargetOpcode::STATEPOINT:
2355  // STATEPOINT is a pseudo instruction which has no implicit defs/uses
2356  // while the BL call instruction (to which the statepoint is eventually
2357  // lowered) has an implicit def. Add this implicit dead def here as a workaround.
2358  MI.addOperand(*MI.getMF(), MachineOperand::CreateReg(AArch64::LR, true,
2359  true, false, true));
2360  LLVM_FALLTHROUGH;
2361  case TargetOpcode::STACKMAP:
2362  case TargetOpcode::PATCHPOINT:
2363  return emitPatchPoint(MI, BB);
2364 
2365  case AArch64::CATCHRET:
2366  return EmitLoweredCatchRet(MI, BB);
2367  }
2368 }
2369 
2370 //===----------------------------------------------------------------------===//
2371 // AArch64 Lowering private implementation.
2372 //===----------------------------------------------------------------------===//
2373 
2374 //===----------------------------------------------------------------------===//
2375 // Lowering Code
2376 //===----------------------------------------------------------------------===//
2377 
2378 // Forward declarations of SVE fixed length lowering helpers
2383  SelectionDAG &DAG);
2385  EVT VT);
2386 
2387 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2388 static bool isZerosVector(const SDNode *N) {
2389  // Look through a bit convert.
2390  while (N->getOpcode() == ISD::BITCAST)
2391  N = N->getOperand(0).getNode();
2392 
2393  if (ISD::isConstantSplatVectorAllZeros(N))
2394  return true;
2395 
2396  if (N->getOpcode() != AArch64ISD::DUP)
2397  return false;
2398 
2399  auto Opnd0 = N->getOperand(0);
2400  auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2401  auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2402  return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
2403 }
2404 
2405 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2406 /// CC
2407 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2408  switch (CC) {
2409  default:
2410  llvm_unreachable("Unknown condition code!");
2411  case ISD::SETNE:
2412  return AArch64CC::NE;
2413  case ISD::SETEQ:
2414  return AArch64CC::EQ;
2415  case ISD::SETGT:
2416  return AArch64CC::GT;
2417  case ISD::SETGE:
2418  return AArch64CC::GE;
2419  case ISD::SETLT:
2420  return AArch64CC::LT;
2421  case ISD::SETLE:
2422  return AArch64CC::LE;
2423  case ISD::SETUGT:
2424  return AArch64CC::HI;
2425  case ISD::SETUGE:
2426  return AArch64CC::HS;
2427  case ISD::SETULT:
2428  return AArch64CC::LO;
2429  case ISD::SETULE:
2430  return AArch64CC::LS;
2431  }
2432 }
2433 
2434 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2435 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2436  AArch64CC::CondCode &CondCode,
2437  AArch64CC::CondCode &CondCode2) {
2438  CondCode2 = AArch64CC::AL;
2439  switch (CC) {
2440  default:
2441  llvm_unreachable("Unknown FP condition!");
2442  case ISD::SETEQ:
2443  case ISD::SETOEQ:
2444  CondCode = AArch64CC::EQ;
2445  break;
2446  case ISD::SETGT:
2447  case ISD::SETOGT:
2448  CondCode = AArch64CC::GT;
2449  break;
2450  case ISD::SETGE:
2451  case ISD::SETOGE:
2452  CondCode = AArch64CC::GE;
2453  break;
2454  case ISD::SETOLT:
2455  CondCode = AArch64CC::MI;
2456  break;
2457  case ISD::SETOLE:
2458  CondCode = AArch64CC::LS;
2459  break;
2460  case ISD::SETONE:
2461  CondCode = AArch64CC::MI;
2462  CondCode2 = AArch64CC::GT;
2463  break;
2464  case ISD::SETO:
2465  CondCode = AArch64CC::VC;
2466  break;
2467  case ISD::SETUO:
2468  CondCode = AArch64CC::VS;
2469  break;
2470  case ISD::SETUEQ:
2471  CondCode = AArch64CC::EQ;
2472  CondCode2 = AArch64CC::VS;
2473  break;
2474  case ISD::SETUGT:
2475  CondCode = AArch64CC::HI;
2476  break;
2477  case ISD::SETUGE:
2478  CondCode = AArch64CC::PL;
2479  break;
2480  case ISD::SETLT:
2481  case ISD::SETULT:
2482  CondCode = AArch64CC::LT;
2483  break;
2484  case ISD::SETLE:
2485  case ISD::SETULE:
2486  CondCode = AArch64CC::LE;
2487  break;
2488  case ISD::SETNE:
2489  case ISD::SETUNE:
2490  CondCode = AArch64CC::NE;
2491  break;
2492  }
2493 }
2494 
2495 /// Convert a DAG fp condition code to an AArch64 CC.
2496 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2497 /// should be AND'ed instead of OR'ed.
2498 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2499  AArch64CC::CondCode &CondCode,
2500  AArch64CC::CondCode &CondCode2) {
2501  CondCode2 = AArch64CC::AL;
2502  switch (CC) {
2503  default:
2504  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2505  assert(CondCode2 == AArch64CC::AL);
2506  break;
2507  case ISD::SETONE:
2508  // (a one b)
2509  // == ((a olt b) || (a ogt b))
2510  // == ((a ord b) && (a une b))
2511  CondCode = AArch64CC::VC;
2512  CondCode2 = AArch64CC::NE;
2513  break;
2514  case ISD::SETUEQ:
2515  // (a ueq b)
2516  // == ((a uno b) || (a oeq b))
2517  // == ((a ule b) && (a uge b))
2518  CondCode = AArch64CC::PL;
2519  CondCode2 = AArch64CC::LE;
2520  break;
2521  }
2522 }
2523 
2524 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2525 /// CC usable with the vector instructions. Fewer operations are available
2526 /// without a real NZCV register, so we have to use less efficient combinations
2527 /// to get the same effect.
2528 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2529  AArch64CC::CondCode &CondCode,
2530  AArch64CC::CondCode &CondCode2,
2531  bool &Invert) {
2532  Invert = false;
2533  switch (CC) {
2534  default:
2535  // Mostly the scalar mappings work fine.
2536  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2537  break;
2538  case ISD::SETUO:
2539  Invert = true;
2540  LLVM_FALLTHROUGH;
2541  case ISD::SETO:
2542  CondCode = AArch64CC::MI;
2543  CondCode2 = AArch64CC::GE;
2544  break;
2545  case ISD::SETUEQ:
2546  case ISD::SETULT:
2547  case ISD::SETULE:
2548  case ISD::SETUGT:
2549  case ISD::SETUGE:
2550  // All of the compare-mask comparisons are ordered, but we can switch
2551  // between the two by a double inversion. E.g. ULE == !OGT.
2552  Invert = true;
2553  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2554  CondCode, CondCode2);
2555  break;
2556  }
2557 }
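// Example (editor's illustration, not part of the upstream source): a vector
// SETULT sets Invert = true and then maps its ordered inverse SETOGE to GE,
// so "a u< b" is materialised as NOT(a >= b), i.e. an FCMGE followed by a NOT.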
2558 
2560  // Matches AArch64DAGToDAGISel::SelectArithImmed().
2561  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2562  LLVM_DEBUG(dbgs() << "Is imm " << C
2563  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2564  return IsLegal;
2565 }
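// Examples (editor's illustration, not part of the upstream source):
//   isLegalArithImmed(0xfff)    -> true  (fits in 12 bits)
//   isLegalArithImmed(0x123000) -> true  (12-bit value shifted left by 12)
//   isLegalArithImmed(0x1001)   -> false (needs 13 significant bits and is not
//                                         a multiple of 0x1000)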
2566 
2567 // Can a (CMP op1, (sub 0, op2) be turned into a CMN instruction on
2568 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2569 // can be set differently by this operation. It comes down to whether
2570 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2571 // everything is fine. If not then the optimization is wrong. Thus general
2572 // comparisons are only valid if op2 != 0.
2573 //
2574 // So, finally, the only LLVM-native comparisons that don't mention C and V
2575 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2576 // the absence of information about op2.
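// For example (editor's illustration, not part of the upstream source), take
// 32-bit op1 = 1 and op2 = INT32_MIN: "cmp" against the materialised 0 - op2
// (which wraps back to INT32_MIN) computes 1 - INT32_MIN with N = 1, V = 1, so
// a signed "lt" test fails, whereas "cmn op1, op2" computes 1 + INT32_MIN with
// N = 1, V = 0 and the same "lt" test passes. The result bits, and hence Z,
// are identical either way, which is why only SETEQ/SETNE are safe.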
2577 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2578  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2579  (CC == ISD::SETEQ || CC == ISD::SETNE);
2580 }
2581 
2582 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2583  SelectionDAG &DAG, SDValue Chain,
2584  bool IsSignaling) {
2585  EVT VT = LHS.getValueType();
2586  assert(VT != MVT::f128);
2587  assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2588  unsigned Opcode =
2589  IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2590  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2591 }
2592 
2593 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2594  const SDLoc &dl, SelectionDAG &DAG) {
2595  EVT VT = LHS.getValueType();
2596  const bool FullFP16 =
2597  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2598 
2599  if (VT.isFloatingPoint()) {
2600  assert(VT != MVT::f128);
2601  if (VT == MVT::f16 && !FullFP16) {
2602  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2603  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2604  VT = MVT::f32;
2605  }
2606  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2607  }
2608 
2609  // The CMP instruction is just an alias for SUBS, and representing it as
2610  // SUBS means that it's possible to get CSE with subtract operations.
2611  // A later phase can perform the optimization of setting the destination
2612  // register to WZR/XZR if it ends up being unused.
2613  unsigned Opcode = AArch64ISD::SUBS;
2614 
2615  if (isCMN(RHS, CC)) {
2616  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2617  Opcode = AArch64ISD::ADDS;
2618  RHS = RHS.getOperand(1);
2619  } else if (isCMN(LHS, CC)) {
2620  // As we are looking for EQ/NE compares, the operands can be commuted; can
2621  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2622  Opcode = AArch64ISD::ADDS;
2623  LHS = LHS.getOperand(1);
2624  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2625  if (LHS.getOpcode() == ISD::AND) {
2626  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2627  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2628  // of the signed comparisons.
2629  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2630  DAG.getVTList(VT, MVT_CC),
2631  LHS.getOperand(0),
2632  LHS.getOperand(1));
2633  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2634  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2635  return ANDSNode.getValue(1);
2636  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2637  // Use result of ANDS
2638  return LHS.getValue(1);
2639  }
2640  }
2641 
2642  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2643  .getValue(1);
2644 }
2645 
2646 /// \defgroup AArch64CCMP CMP;CCMP matching
2647 ///
2648 /// These functions deal with the formation of CMP;CCMP;... sequences.
2649 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2650 /// a comparison. They set the NZCV flags to a predefined value if their
2651 /// predicate is false. This allows to express arbitrary conjunctions, for
2652 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2653 /// expressed as:
2654 /// cmp A
2655 /// ccmp B, inv(CB), CA
2656 /// check for CB flags
2657 ///
2658 /// This naturally lets us implement chains of AND operations with SETCC
2659 /// operands. And we can even implement some other situations by transforming
2660 /// them:
2661 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2662 /// negating the flags used in a CCMP/FCCMP operations.
2663 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2664 /// by negating the flags we test for afterwards. i.e.
2665 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2666 /// - Note that we can only ever negate all previously processed results.
2667 /// What we can not implement by flipping the flags to test is a negation
2668 /// of two sub-trees (because the negation affects all sub-trees emitted so
2669 /// far, so the 2nd sub-tree we emit would also affect the first).
2670 /// With those tools we can implement some OR operations:
2671 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2672 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2673 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2674 /// elimination rules from earlier to implement the whole thing as a
2675 /// CCMP/FCCMP chain.
2676 ///
2677 /// As complete example:
2678 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2679 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2680 /// can be reassociated to:
2681 /// or (and (setCC (cmp C)) setCD (cmp D))
2682 /// (or (setCA (cmp A)) (setCB (cmp B)))
2683 /// can be transformed to:
2684 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2685 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2686 /// which can be implemented as:
2687 /// cmp C
2688 /// ccmp D, inv(CD), CC
2689 /// ccmp A, CA, inv(CD)
2690 /// ccmp B, CB, inv(CA)
2691 /// check for CB flags
2692 ///
2693 /// A counterexample is "or (and A B) (and C D)" which translates to
2694 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2695 /// can only implement 1 of the inner (not) operations, but not both!
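/// As a small concrete case (editor's illustration, not part of the upstream
/// source), "a == 0 && b > 5" is typically emitted as:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   // if a != 0, NZCV is forced to 0b0100 so "gt" fails
///   cset w0, gt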
2696 /// @{
2697 
2698 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2699 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2700  ISD::CondCode CC, SDValue CCOp,
2701  AArch64CC::CondCode Predicate,
2702  AArch64CC::CondCode OutCC,
2703  const SDLoc &DL, SelectionDAG &DAG) {
2704  unsigned Opcode = 0;
2705  const bool FullFP16 =
2706  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2707 
2708  if (LHS.getValueType().isFloatingPoint()) {
2709  assert(LHS.getValueType() != MVT::f128);
2710  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2711  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2712  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2713  }
2714  Opcode = AArch64ISD::FCCMP;
2715  } else if (RHS.getOpcode() == ISD::SUB) {
2716  SDValue SubOp0 = RHS.getOperand(0);
2717  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2718  // See emitComparison() on why we can only do this for SETEQ and SETNE.
2719  Opcode = AArch64ISD::CCMN;
2720  RHS = RHS.getOperand(1);
2721  }
2722  }
2723  if (Opcode == 0)
2724  Opcode = AArch64ISD::CCMP;
2725 
2726  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2727  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
2728  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2729  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2730  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2731 }
2732 
2733 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2734 /// expressed as a conjunction. See \ref AArch64CCMP.
2735 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2736 /// changing the conditions on the SETCC tests.
2737 /// (this means we can call emitConjunctionRec() with
2738 /// Negate==true on this sub-tree)
2739 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2740 /// cannot do the negation naturally. We are required to
2741 /// emit the subtree first in this case.
2742 /// \param WillNegate Is true if we are called when the result of this
2743 /// subexpression must be negated. This happens when the
2744 /// outer expression is an OR. We can use this fact to know
2745 /// that we have a double negation (or (or ...) ...) that
2746 /// can be implemented for free.
2747 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2748  bool &MustBeFirst, bool WillNegate,
2749  unsigned Depth = 0) {
2750  if (!Val.hasOneUse())
2751  return false;
2752  unsigned Opcode = Val->getOpcode();
2753  if (Opcode == ISD::SETCC) {
2754  if (Val->getOperand(0).getValueType() == MVT::f128)
2755  return false;
2756  CanNegate = true;
2757  MustBeFirst = false;
2758  return true;
2759  }
2760  // Protect against exponential runtime and stack overflow.
2761  if (Depth > 6)
2762  return false;
2763  if (Opcode == ISD::AND || Opcode == ISD::OR) {
2764  bool IsOR = Opcode == ISD::OR;
2765  SDValue O0 = Val->getOperand(0);
2766  SDValue O1 = Val->getOperand(1);
2767  bool CanNegateL;
2768  bool MustBeFirstL;
2769  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2770  return false;
2771  bool CanNegateR;
2772  bool MustBeFirstR;
2773  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2774  return false;
2775 
2776  if (MustBeFirstL && MustBeFirstR)
2777  return false;
2778 
2779  if (IsOR) {
2780  // For an OR expression we need to be able to naturally negate at least
2781  // one side or we cannot do the transformation at all.
2782  if (!CanNegateL && !CanNegateR)
2783  return false;
2784  // If the result of the OR will be negated and we can naturally negate
2785  // the leaves, then this sub-tree as a whole negates naturally.
2786  CanNegate = WillNegate && CanNegateL && CanNegateR;
2787  // If we cannot naturally negate the whole sub-tree, then this must be
2788  // emitted first.
2789  MustBeFirst = !CanNegate;
2790  } else {
2791  assert(Opcode == ISD::AND && "Must be OR or AND");
2792  // We cannot naturally negate an AND operation.
2793  CanNegate = false;
2794  MustBeFirst = MustBeFirstL || MustBeFirstR;
2795  }
2796  return true;
2797  }
2798  return false;
2799 }
2800 
2801 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2802 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2803 /// Tries to transform the given i1-producing node @p Val to a series of compare
2804 /// and conditional compare operations. @returns an NZCV flags producing node
2805 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
2806 /// transformation was not possible.
2807 /// \p Negate is true if we want this sub-tree being negated just by changing
2808 /// SETCC conditions.
2809 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
2810  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2811  AArch64CC::CondCode Predicate) {
2812  // We're at a tree leaf, produce a conditional comparison operation.
2813  unsigned Opcode = Val->getOpcode();
2814  if (Opcode == ISD::SETCC) {
2815  SDValue LHS = Val->getOperand(0);
2816  SDValue RHS = Val->getOperand(1);
2817  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2818  bool isInteger = LHS.getValueType().isInteger();
2819  if (Negate)
2820  CC = getSetCCInverse(CC, LHS.getValueType());
2821  SDLoc DL(Val);
2822  // Determine OutCC and handle FP special case.
2823  if (isInteger) {
2824  OutCC = changeIntCCToAArch64CC(CC);
2825  } else {
2826  assert(LHS.getValueType().isFloatingPoint());
2827  AArch64CC::CondCode ExtraCC;
2828  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2829  // Some floating point conditions can't be tested with a single condition
2830  // code. Construct an additional comparison in this case.
2831  if (ExtraCC != AArch64CC::AL) {
2832  SDValue ExtraCmp;
2833  if (!CCOp.getNode())
2834  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2835  else
2836  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2837  ExtraCC, DL, DAG);
2838  CCOp = ExtraCmp;
2839  Predicate = ExtraCC;
2840  }
2841  }
2842 
2843  // Produce a normal comparison if we are first in the chain
2844  if (!CCOp)
2845  return emitComparison(LHS, RHS, CC, DL, DAG);
2846  // Otherwise produce a ccmp.
2847  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2848  DAG);
2849  }
2850  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2851 
2852  bool IsOR = Opcode == ISD::OR;
2853 
2854  SDValue LHS = Val->getOperand(0);
2855  bool CanNegateL;
2856  bool MustBeFirstL;
2857  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2858  assert(ValidL && "Valid conjunction/disjunction tree");
2859  (void)ValidL;
2860 
2861  SDValue RHS = Val->getOperand(1);
2862  bool CanNegateR;
2863  bool MustBeFirstR;
2864  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2865  assert(ValidR && "Valid conjunction/disjunction tree");
2866  (void)ValidR;
2867 
2868  // Swap sub-tree that must come first to the right side.
2869  if (MustBeFirstL) {
2870  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2871  std::swap(LHS, RHS);
2872  std::swap(CanNegateL, CanNegateR);
2873  std::swap(MustBeFirstL, MustBeFirstR);
2874  }
2875 
2876  bool NegateR;
2877  bool NegateAfterR;
2878  bool NegateL;
2879  bool NegateAfterAll;
2880  if (Opcode == ISD::OR) {
2881  // Swap the sub-tree that we can negate naturally to the left.
2882  if (!CanNegateL) {
2883  assert(CanNegateR && "at least one side must be negatable");
2884  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2885  assert(!Negate);
2886  std::swap(LHS, RHS);
2887  NegateR = false;
2888  NegateAfterR = true;
2889  } else {
2890  // Negate the left sub-tree if possible, otherwise negate the result.
2891  NegateR = CanNegateR;
2892  NegateAfterR = !CanNegateR;
2893  }
2894  NegateL = true;
2895  NegateAfterAll = !Negate;
2896  } else {
2897  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2898  assert(!Negate && "Valid conjunction/disjunction tree");
2899 
2900  NegateL = false;
2901  NegateR = false;
2902  NegateAfterR = false;
2903  NegateAfterAll = false;
2904  }
2905 
2906  // Emit sub-trees.
2907  AArch64CC::CondCode RHSCC;
2908  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2909  if (NegateAfterR)
2910  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2911  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2912  if (NegateAfterAll)
2913  OutCC = AArch64CC::getInvertedCondCode(OutCC);
2914  return CmpL;
2915 }
2916 
2917 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2918 /// In some cases this is even possible with OR operations in the expression.
2919 /// See \ref AArch64CCMP.
2920 /// \see emitConjunctionRec().
2921 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
2922  AArch64CC::CondCode &OutCC) {
2923  bool DummyCanNegate;
2924  bool DummyMustBeFirst;
2925  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2926  return SDValue();
2927 
2928  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2929 }
2930 
2931 /// @}
2932 
2933 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2934 /// extension operations.
2935 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
2936  auto isSupportedExtend = [&](SDValue V) {
2937  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2938  return true;
2939 
2940  if (V.getOpcode() == ISD::AND)
2941  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2942  uint64_t Mask = MaskCst->getZExtValue();
2943  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2944  }
2945 
2946  return false;
2947  };
2948 
2949  if (!Op.hasOneUse())
2950  return 0;
2951 
2952  if (isSupportedExtend(Op))
2953  return 1;
2954 
2955  unsigned Opc = Op.getOpcode();
2956  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2957  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2958  uint64_t Shift = ShiftCst->getZExtValue();
2959  if (isSupportedExtend(Op.getOperand(0)))
2960  return (Shift <= 4) ? 2 : 1;
2961  EVT VT = Op.getValueType();
2962  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2963  return 1;
2964  }
2965 
2966  return 0;
2967 }
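// Illustrative scoring (not part of the original source), assuming each
// operand has a single use:
//   (and x, 0xff)             -> 1   // supported extend, folds as uxtb
//   (shl (and x, 0xff), 2)    -> 2   // extend + shift <= 4, folds as uxtb #2
//   (shl x, 3)                -> 1   // plain in-range shift, folds as lsl #3
//   (mul x, y)                -> 0   // cannot be folded into cmp/cmn
// getAArch64Cmp() below compares these scores to decide whether swapping the
// comparison operands lets more work fold into the compare itself.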
2968 
2969 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2970  SDValue &AArch64cc, SelectionDAG &DAG,
2971  const SDLoc &dl) {
2972  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2973  EVT VT = RHS.getValueType();
2974  uint64_t C = RHSC->getZExtValue();
2975  if (!isLegalArithImmed(C)) {
2976  // Constant does not fit, try adjusting it by one?
2977  switch (CC) {
2978  default:
2979  break;
2980  case ISD::SETLT:
2981  case ISD::SETGE:
2982  if ((VT == MVT::i32 && C != 0x80000000 &&
2983  isLegalArithImmed((uint32_t)(C - 1))) ||
2984  (VT == MVT::i64 && C != 0x80000000ULL &&
2985  isLegalArithImmed(C - 1ULL))) {
2986  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2987  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2988  RHS = DAG.getConstant(C, dl, VT);
2989  }
2990  break;
2991  case ISD::SETULT:
2992  case ISD::SETUGE:
2993  if ((VT == MVT::i32 && C != 0 &&
2994  isLegalArithImmed((uint32_t)(C - 1))) ||
2995  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2996  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2997  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2998  RHS = DAG.getConstant(C, dl, VT);
2999  }
3000  break;
3001  case ISD::SETLE:
3002  case ISD::SETGT:
3003  if ((VT == MVT::i32 && C != INT32_MAX &&
3004  isLegalArithImmed((uint32_t)(C + 1))) ||
3005  (VT == MVT::i64 && C != INT64_MAX &&
3006  isLegalArithImmed(C + 1ULL))) {
3007  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3008  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3009  RHS = DAG.getConstant(C, dl, VT);
3010  }
3011  break;
3012  case ISD::SETULE:
3013  case ISD::SETUGT:
3014  if ((VT == MVT::i32 && C != UINT32_MAX &&
3015  isLegalArithImmed((uint32_t)(C + 1))) ||
3016  (VT == MVT::i64 && C != UINT64_MAX &&
3017  isLegalArithImmed(C + 1ULL))) {
3018  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3019  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3020  RHS = DAG.getConstant(C, dl, VT);
3021  }
3022  break;
3023  }
3024  }
3025  }
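  // Illustrative example (not part of the original source): 4097 is not a
  // legal arithmetic immediate, but 4096 (0x1 << 12) is, so a comparison like
  //   x < 4097   (SETLT)
  // is rewritten above as
  //   x <= 4096  (SETLE)
  // allowing a single "cmp w0, #4096" followed by b.le instead of first
  // materializing 4097 in a register.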
3026 
3027  // Comparisons are canonicalized so that the RHS operand is simpler than the
3028  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3029  // can fold some shift+extend operations on the RHS operand, so swap the
3030  // operands if that can be done.
3031  //
3032  // For example:
3033  // lsl w13, w11, #1
3034  // cmp w13, w12
3035  // can be turned into:
3036  // cmp w12, w11, lsl #1
3037  if (!isa<ConstantSDNode>(RHS) ||
3038  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3039  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3040 
3041  if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3042  std::swap(LHS, RHS);
3043  CC = ISD::getSetCCSwappedOperands(CC);
3044  }
3045  }
3046 
3047  SDValue Cmp;
3048  AArch64CC::CondCode AArch64CC;
3049  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3050  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3051 
3052  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3053  // For an i8 operand, the largest zero-extended value is 255, so it can
3054  // always be encoded in the compare instruction. For an i16 operand, however,
3055  // the largest zero-extended value (65535) cannot be encoded in the compare.
3056  // Therefore, use a sign extending load and cmn to avoid materializing the
3057  // -1 constant. For example,
3058  // movz w1, #65535
3059  // ldrh w0, [x0, #0]
3060  // cmp w0, w1
3061  // -->
3062  // ldrsh w0, [x0, #0]
3063  // cmn w0, #1
3064  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3065  // if and only if (sext LHS) == (sext RHS). The checks are in place to
3066  // ensure both the LHS and RHS are truly zero extended and to make sure the
3067  // transformation is profitable.
3068  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3069  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3070  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3071  LHS.getNode()->hasNUsesOfValue(1, 0)) {
3072  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3073  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3074  SDValue SExt =
3075  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3076  DAG.getValueType(MVT::i16));
3077  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3078  RHS.getValueType()),
3079  CC, dl, DAG);
3080  AArch64CC = changeIntCCToAArch64CC(CC);
3081  }
3082  }
3083 
3084  if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3085  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3086  if ((CC == ISD::SETNE) ^ RHSC->isZero())
3087  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3088  }
3089  }
3090  }
3091 
3092  if (!Cmp) {
3093  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3094  AArch64CC = changeIntCCToAArch64CC(CC);
3095  }
3096  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3097  return Cmp;
3098 }
3099 
3100 static std::pair<SDValue, SDValue>
3101 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3102  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3103  "Unsupported value type");
3104  SDValue Value, Overflow;
3105  SDLoc DL(Op);
3106  SDValue LHS = Op.getOperand(0);
3107  SDValue RHS = Op.getOperand(1);
3108  unsigned Opc = 0;
3109  switch (Op.getOpcode()) {
3110  default:
3111  llvm_unreachable("Unknown overflow instruction!");
3112  case ISD::SADDO:
3113  Opc = AArch64ISD::ADDS;
3114  CC = AArch64CC::VS;
3115  break;
3116  case ISD::UADDO:
3117  Opc = AArch64ISD::ADDS;
3118  CC = AArch64CC::HS;
3119  break;
3120  case ISD::SSUBO:
3121  Opc = AArch64ISD::SUBS;
3122  CC = AArch64CC::VS;
3123  break;
3124  case ISD::USUBO:
3125  Opc = AArch64ISD::SUBS;
3126  CC = AArch64CC::LO;
3127  break;
3128  // Multiply needs a little extra work.
3129  case ISD::SMULO:
3130  case ISD::UMULO: {
3131  CC = AArch64CC::NE;
3132  bool IsSigned = Op.getOpcode() == ISD::SMULO;
3133  if (Op.getValueType() == MVT::i32) {
3134  // Extend to 64-bits, then perform a 64-bit multiply.
3135  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3136  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3137  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3138  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3139  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3140 
3141  // Check that the result fits into a 32-bit integer.
3142  SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3143  if (IsSigned) {
3144  // cmp xreg, wreg, sxtw
3145  SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3146  Overflow =
3147  DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3148  } else {
3149  // tst xreg, #0xffffffff00000000
3150  SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3151  Overflow =
3152  DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3153  }
3154  break;
3155  }
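    // Illustrative result for i32 smulo (not part of the original source):
    //   smull x8, w0, w1       // 64-bit product of the sign-extended inputs
    //   cmp   x8, w8, sxtw     // product == sign-extension of its low half?
    //   cset  w9, ne           // overflow if not
    // Registers are placeholders.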
3156  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3157  // For the 64-bit multiply, check for overflow using the upper 64 bits of the product.
3158  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3159  if (IsSigned) {
3160  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3161  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3162  DAG.getConstant(63, DL, MVT::i64));
3163  // It is important that LowerBits is last, otherwise the arithmetic
3164  // shift will not be folded into the compare (SUBS).
3165  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3166  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3167  .getValue(1);
3168  } else {
3169  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3170  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3171  Overflow =
3172  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3173  DAG.getConstant(0, DL, MVT::i64),
3174  UpperBits).getValue(1);
3175  }
3176  break;
3177  }
3178  } // switch (...)
3179 
3180  if (Opc) {
3181  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3182 
3183  // Emit the AArch64 operation with overflow check.
3184  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3185  Overflow = Value.getValue(1);
3186  }
3187  return std::make_pair(Value, Overflow);
3188 }
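// Illustrative use (not part of the original source): for
//   %s = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
// the (Value, Overflow) pair returned here corresponds roughly to
//   subs w8, w0, w1          // Value in w8, NZCV updated
//   cset w9, lo              // Overflow: unsigned borrow, i.e. the LO condition
// where the cset is only emitted once the i1 overflow result is consumed.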
3189 
3190 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3191  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3192  return LowerToScalableOp(Op, DAG);
3193 
3194  SDValue Sel = Op.getOperand(0);
3195  SDValue Other = Op.getOperand(1);
3196  SDLoc dl(Sel);
3197 
3198  // If the operand is an overflow checking operation, invert the condition
3199  // code and kill the Not operation. I.e., transform:
3200  // (xor overflow_op_bool, 1)
3201  // -->
3202  // (csel 1, 0, invert(cc), overflow_op_bool)
3203  // ... which later gets transformed to just a cset instruction with an
3204  // inverted condition code, rather than a cset + eor sequence.
3205  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3206  // Only lower legal XALUO ops.
3207  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3208  return SDValue();
3209 
3210  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3211  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3212  AArch64CC::CondCode CC;
3213  SDValue Value, Overflow;
3214  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3215  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3216  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3217  CCVal, Overflow);
3218  }
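  // Illustrative example (not part of the original source): for IR such as
  //   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  //   %o = extractvalue { i32, i1 } %s, 1
  //   %n = xor i1 %o, true
  // the path above yields a single cset with the inverted condition, roughly
  //   adds w8, w0, w1
  //   cset w9, vc              // "no overflow", instead of cset vs + eor #1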
3219  // If neither operand is a SELECT_CC, give up.
3220  if (Sel.getOpcode() != ISD::SELECT_CC)
3221  std::swap(Sel, Other);
3222  if (Sel.getOpcode() != ISD::SELECT_CC)
3223  return Op;
3224 
3225  // The folding we want to perform is:
3226  // (xor x, (select_cc a, b, cc, 0, -1) )
3227  // -->
3228  // (csel x, (xor x, -1), cc ...)
3229  //
3230  // The latter will get matched to a CSINV instruction.
3231 
3232  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3233  SDValue LHS = Sel.getOperand(0);
3234  SDValue RHS = Sel.getOperand(1);
3235  SDValue TVal = Sel.getOperand(2);
3236  SDValue FVal = Sel.getOperand(3);
3237 
3238  // FIXME: This could be generalized to non-integer comparisons.
3239  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3240  return Op;
3241 
3242  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3243  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3244 
3245  // The values aren't constants, this isn't the pattern we're looking for.
3246  if (!CFVal || !CTVal)
3247  return Op;
3248 
3249  // We can commute the SELECT_CC by inverting the condition. This
3250  // might be needed to make this fit into a CSINV pattern.
3251  if (CTVal->isAllOnes() && CFVal->isZero()) {
3252  std::swap(TVal, FVal);
3253  std::swap(CTVal, CFVal);
3254  CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3255  }
3256 
3257  // If the constants line up, perform the transform!
3258  if (CTVal->isZero() && CFVal->isAllOnes()) {
3259  SDValue CCVal;
3260  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3261 
3262  FVal = Other;
3263  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3264  DAG.getConstant(-1ULL, dl, Other.getValueType()));
3265 
3266  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3267  CCVal, Cmp);
3268  }
3269 
3270  return Op;
3271 }
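// Illustrative example (not part of the original source): for IR such as
//   %m = select i1 %c, i32 0, i32 -1
//   %r = xor i32 %x, %m
// the SELECT_CC fold above produces a conditional invert instead of a select
// followed by an eor, roughly:
//   cmp   w0, w1              // materializes the comparison behind %c
//   csinv w8, w2, w2, <cc>    // %r = %c ? %x : ~%x
// Registers and the condition code are placeholders.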
3272