1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the AArch64TargetLowering class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "AArch64ISelLowering.h"
15 #include "AArch64ExpandImm.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
48 #include "llvm/IR/Attributes.h"
49 #include "llvm/IR/Constants.h"
50 #include "llvm/IR/DataLayout.h"
51 #include "llvm/IR/DebugLoc.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Function.h"
55 #include "llvm/IR/GlobalValue.h"
56 #include "llvm/IR/IRBuilder.h"
57 #include "llvm/IR/Instruction.h"
58 #include "llvm/IR/Instructions.h"
59 #include "llvm/IR/IntrinsicInst.h"
60 #include "llvm/IR/Intrinsics.h"
61 #include "llvm/IR/IntrinsicsAArch64.h"
62 #include "llvm/IR/Module.h"
63 #include "llvm/IR/OperandTraits.h"
64 #include "llvm/IR/PatternMatch.h"
65 #include "llvm/IR/Type.h"
66 #include "llvm/IR/Use.h"
67 #include "llvm/IR/Value.h"
68 #include "llvm/MC/MCRegisterInfo.h"
69 #include "llvm/Support/Casting.h"
70 #include "llvm/Support/CodeGen.h"
72 #include "llvm/Support/Compiler.h"
73 #include "llvm/Support/Debug.h"
75 #include "llvm/Support/KnownBits.h"
81 #include <algorithm>
82 #include <bitset>
83 #include <cassert>
84 #include <cctype>
85 #include <cstdint>
86 #include <cstdlib>
87 #include <iterator>
88 #include <limits>
89 #include <tuple>
90 #include <utility>
91 #include <vector>
92 
93 using namespace llvm;
94 using namespace llvm::PatternMatch;
95 
96 #define DEBUG_TYPE "aarch64-lower"
97 
98 STATISTIC(NumTailCalls, "Number of tail calls");
99 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
100 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
101 
102 // FIXME: The necessary dtprel relocations don't seem to be supported
103 // well in the GNU bfd and gold linkers at the moment. Therefore, by
104 // default, for now, fall back to GeneralDynamic code generation.
106  "aarch64-elf-ldtls-generation", cl::Hidden,
107  cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
108  cl::init(false));
109 
110 static cl::opt<bool>
111 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
112  cl::desc("Enable AArch64 logical imm instruction "
113  "optimization"),
114  cl::init(true));
115 
116 // Temporary option added for the purpose of testing functionality added
117 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
118 // in the future when both implementations are based on MGATHER rather
119 // than the GLD1 nodes added for the SVE gather load intrinsics.
120 static cl::opt<bool>
121 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
122  cl::desc("Combine extends of AArch64 masked "
123  "gather intrinsics"),
124  cl::init(true));
125 
126 /// Value type used for condition codes.
127 static const MVT MVT_CC = MVT::i32;
128 
129 static inline EVT getPackedSVEVectorVT(EVT VT) {
130  switch (VT.getSimpleVT().SimpleTy) {
131  default:
132  llvm_unreachable("unexpected element type for vector");
133  case MVT::i8:
134  return MVT::nxv16i8;
135  case MVT::i16:
136  return MVT::nxv8i16;
137  case MVT::i32:
138  return MVT::nxv4i32;
139  case MVT::i64:
140  return MVT::nxv2i64;
141  case MVT::f16:
142  return MVT::nxv8f16;
143  case MVT::f32:
144  return MVT::nxv4f32;
145  case MVT::f64:
146  return MVT::nxv2f64;
147  case MVT::bf16:
148  return MVT::nxv8bf16;
149  }
150 }
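// For example, getPackedSVEVectorVT(MVT::i16) returns MVT::nxv8i16 and
// getPackedSVEVectorVT(MVT::f64) returns MVT::nxv2f64: in each case the
// element count is chosen so that the lanes fill a full 128-bit SVE granule.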
151 
152 // NOTE: Currently there's only a need to return integer vector types. If this
153 // changes then just add an extra "type" parameter.
155  switch (EC.getKnownMinValue()) {
156  default:
157  llvm_unreachable("unexpected element count for vector");
158  case 16:
159  return MVT::nxv16i8;
160  case 8:
161  return MVT::nxv8i16;
162  case 4:
163  return MVT::nxv4i32;
164  case 2:
165  return MVT::nxv2i64;
166  }
167 }
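// For example, an element count with a known minimum of 8 maps to
// MVT::nxv8i16 in the overload above, i.e. the packed integer container whose
// lanes are 128/8 = 16 bits wide.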
168 
169 static inline EVT getPromotedVTForPredicate(EVT VT) {
171  "Expected scalable predicate vector type!");
172  switch (VT.getVectorMinNumElements()) {
173  default:
174  llvm_unreachable("unexpected element count for vector");
175  case 2:
176  return MVT::nxv2i64;
177  case 4:
178  return MVT::nxv4i32;
179  case 8:
180  return MVT::nxv8i16;
181  case 16:
182  return MVT::nxv16i8;
183  }
184 }
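// For example, getPromotedVTForPredicate maps the predicate type nxv4i1 to
// nxv4i32: the lane count is preserved and each i1 lane is widened to fill
// the packed container.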
185 
186 /// Returns true if VT's elements occupy the lowest bit positions of its
187 /// associated register class without any intervening space.
188 ///
189 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
190 /// same register class, but only nxv8f16 can be treated as a packed vector.
191 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
192  assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
193  "Expected legal vector type!");
194  return VT.isFixedLengthVector() ||
196 }
197 
198 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
199 // predicate and end with a passthru value matching the result type.
200 static bool isMergePassthruOpcode(unsigned Opc) {
201  switch (Opc) {
202  default:
203  return false;
230  return true;
231  }
232 }
233 
235  const AArch64Subtarget &STI)
236  : TargetLowering(TM), Subtarget(&STI) {
237  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
238  // we have to make something up. Arbitrarily, choose ZeroOrOne.
240 // When comparing vectors, the result sets each element in the
241 // vector to all-ones or all-zeros.
243 
244  // Set up the register classes.
245  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
246  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
247 
248  if (Subtarget->hasFPARMv8()) {
249  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
250  addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
251  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
252  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
253  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
254  }
255 
256  if (Subtarget->hasNEON()) {
257  addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
258  addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
259  // Someone set us up the NEON.
260  addDRTypeForNEON(MVT::v2f32);
261  addDRTypeForNEON(MVT::v8i8);
262  addDRTypeForNEON(MVT::v4i16);
263  addDRTypeForNEON(MVT::v2i32);
264  addDRTypeForNEON(MVT::v1i64);
265  addDRTypeForNEON(MVT::v1f64);
266  addDRTypeForNEON(MVT::v4f16);
267  if (Subtarget->hasBF16())
268  addDRTypeForNEON(MVT::v4bf16);
269 
270  addQRTypeForNEON(MVT::v4f32);
271  addQRTypeForNEON(MVT::v2f64);
272  addQRTypeForNEON(MVT::v16i8);
273  addQRTypeForNEON(MVT::v8i16);
274  addQRTypeForNEON(MVT::v4i32);
275  addQRTypeForNEON(MVT::v2i64);
276  addQRTypeForNEON(MVT::v8f16);
277  if (Subtarget->hasBF16())
278  addQRTypeForNEON(MVT::v8bf16);
279  }
280 
281  if (Subtarget->hasSVE()) {
282  // Add legal sve predicate types
283  addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
284  addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
285  addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
286  addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
287 
288  // Add legal sve data types
289  addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
290  addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
291  addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
292  addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
293 
294  addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
295  addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
296  addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
297  addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
298  addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
299  addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
300 
301  if (Subtarget->hasBF16()) {
302  addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
303  addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
304  addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
305  }
306 
307  if (Subtarget->useSVEForFixedLengthVectors()) {
309  if (useSVEForFixedLengthVectorVT(VT))
310  addRegisterClass(VT, &AArch64::ZPRRegClass);
311 
313  if (useSVEForFixedLengthVectorVT(VT))
314  addRegisterClass(VT, &AArch64::ZPRRegClass);
315  }
316 
317  for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
326  }
327 
328  for (auto VT :
332 
333  for (auto VT :
335  MVT::nxv2f64 }) {
347 
359  }
360  }
361 
362  // Compute derived properties from the register classes
364 
365  // Provide all sorts of operation actions
399 
403 
407 
409 
410  // Custom lowering hooks are needed for XOR
411  // to fold it into CSINC/CSINV.
414 
415  // Virtually no operation on f128 is legal, but LLVM can't expand them when
416  // there's a valid register class, so we need custom operations in most cases.
440 
441  // Lowering for many of the conversions is actually specified by the non-f128
442  // type. The LowerXXX function will be trivial when f128 isn't involved.
473 
474  // Variable arguments.
479 
480  // Variable-sized objects.
483 
484  if (Subtarget->isTargetWindows())
486  else
488 
489  // Constant pool entries
491 
492  // BlockAddress
494 
495 // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
504 
505  // AArch64 lacks both left-rotate and popcount instructions.
508  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
511  }
512 
513  // AArch64 doesn't have i32 MULH{S|U}.
516 
517  // AArch64 doesn't have {U|S}MUL_LOHI.
520 
524 
527 
530  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
533  }
540 
541  // Custom lower Add/Sub/Mul with overflow.
554 
563  if (Subtarget->hasFullFP16())
565  else
567 
601 
602  if (!Subtarget->hasFullFP16()) {
626 
627  // promote v4f16 to v4f32 when that is known to be safe.
636 
653 
675  }
676 
677  // AArch64 has implementations of a lot of rounding-like FP operations.
678  for (MVT Ty : {MVT::f32, MVT::f64}) {
694  }
695 
696  if (Subtarget->hasFullFP16()) {
708  }
709 
711 
714 
720 
721  // Generate outline atomics library calls only if LSE was not specified for
722  // subtarget
723  if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
749 #define LCALLNAMES(A, B, N) \
750  setLibcallName(A##N##_RELAX, #B #N "_relax"); \
751  setLibcallName(A##N##_ACQ, #B #N "_acq"); \
752  setLibcallName(A##N##_REL, #B #N "_rel"); \
753  setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
754 #define LCALLNAME4(A, B) \
755  LCALLNAMES(A, B, 1) \
756  LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
757 #define LCALLNAME5(A, B) \
758  LCALLNAMES(A, B, 1) \
759  LCALLNAMES(A, B, 2) \
760  LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
761  LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
762  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
763  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
764  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
765  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
766  LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
767 #undef LCALLNAMES
768 #undef LCALLNAME4
769 #undef LCALLNAME5
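// To illustrate the expansion above: LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD,
// __aarch64_ldadd) registers __aarch64_ldadd1_relax, __aarch64_ldadd1_acq,
// ..., __aarch64_ldadd8_acq_rel, i.e. one library call per access size
// (1/2/4/8 bytes) and memory-ordering suffix.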
770  }
771 
772  // 128-bit loads and stores can be done without expanding
775 
776  // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
777  // custom lowering, as there are no un-paired non-temporal stores and
778  // legalization will break up 256 bit inputs.
786 
787  // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
788  // This requires the Performance Monitors extension.
789  if (Subtarget->hasPerfMon())
791 
792  if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
793  getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
794  // Issue __sincos_stret if available.
797  } else {
800  }
801 
802  if (Subtarget->getTargetTriple().isOSMSVCRT()) {
803  // MSVCRT doesn't have powi; fall back to pow
804  setLibcallName(RTLIB::POWI_F32, nullptr);
805  setLibcallName(RTLIB::POWI_F64, nullptr);
806  }
807 
808  // Make floating-point constants legal for the large code model, so they don't
809  // become loads from the constant pool.
810  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
813  }
814 
815  // AArch64 does not have floating-point extending loads, i1 sign-extending
816  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
817  for (MVT VT : MVT::fp_valuetypes()) {
822  }
823  for (MVT VT : MVT::integer_valuetypes())
825 
833 
837 
838  // Indexed loads and stores are supported.
839  for (unsigned im = (unsigned)ISD::PRE_INC;
840  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
857  }
858 
859  // Trap.
863 
864  // We combine OR nodes for bitfield operations.
866  // Try to create BICs for vector ANDs.
868 
869  // Vector add and sub nodes may conceal a high-half opportunity.
870 // Also, try to fold ADD into CSINC/CSINV.
878 
882 
884 
892  if (Subtarget->supportsAddressTopByteIgnored())
894 
896 
899 
905 
907 
908  // In case of strict alignment, avoid an excessive number of byte wide stores.
912 
917 
919 
923 
925 
927 
928  EnableExtLdPromotion = true;
929 
930  // Set required alignment.
932  // Set preferred alignments.
935 
936  // Only change the limit for entries in a jump table if specified by
937 // the subtarget, but not at the command line.
938  unsigned MaxJT = STI.getMaximumJumpTableSize();
939  if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
941 
942  setHasExtractBitsInsn(true);
943 
945 
946  if (Subtarget->hasNEON()) {
947  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
948  // silliness like this:
975 
981 
983 
984 // AArch64 doesn't have direct vector->f32 conversion instructions for
985  // elements smaller than i32, so promote the input to i32 first.
988  // i8 vector elements also need promotion to i32 for v8i8
991  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
996 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
997  // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
1000 
1001  if (Subtarget->hasFullFP16()) {
1006  } else {
1007  // when AArch64 doesn't have fullfp16 support, promote the input
1008  // to i32 first.
1013  }
1014 
1017 
1018  // AArch64 doesn't have MUL.2d:
1020  // Custom handling for some quad-vector types to detect MULL.
1024 
1025  // Saturates
1026  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1032  }
1033 
1034  // Vector reductions
1035  for (MVT VT : { MVT::v4f16, MVT::v2f32,
1037  if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1040 
1042  }
1043  }
1044  for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1051  }
1053 
1056  // Likewise, narrowing and extending vector loads/stores aren't handled
1057  // directly.
1058  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1060 
1061  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1064  } else {
1067  }
1070 
1073 
1074  for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1075  setTruncStoreAction(VT, InnerVT, Expand);
1076  setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1077  setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1078  setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1079  }
1080  }
1081 
1082  // AArch64 has implementations of a lot of rounding-like FP operations.
1083  for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
1091  }
1092 
1093  if (Subtarget->hasFullFP16()) {
1094  for (MVT Ty : {MVT::v4f16, MVT::v8f16}) {
1102  }
1103  }
1104 
1105  if (Subtarget->hasSVE())
1107 
1109  }
1110 
1111  if (Subtarget->hasSVE()) {
1112  // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
1113 // splat of 0 or undef) once vector selects are supported in SVE codegen. See
1114  // D68877 for more details.
1115  for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1153 
1156  }
1157 
1158  // Illegal unpacked integer vector types.
1159  for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1162  }
1163 
1164  for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
1173 
1174  // There are no legal MVT::nxv16f## based types.
1175  if (VT != MVT::nxv16i1) {
1178  }
1179  }
1180 
1183  for (auto InnerVT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16,
1185  // Avoid marking truncating FP stores as legal to prevent the
1186  // DAGCombiner from creating unsupported truncating stores.
1187  setTruncStoreAction(VT, InnerVT, Expand);
1188  }
1189 
1221  }
1222 
1223  for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1227  }
1228 
1230 
1233 
1234  // NOTE: Currently this has to happen after computeRegisterProperties rather
1235  // than the preferred option of combining it with the addRegisterClass call.
1236  if (Subtarget->useSVEForFixedLengthVectors()) {
1238  if (useSVEForFixedLengthVectorVT(VT))
1239  addTypeForFixedLengthSVE(VT);
1241  if (useSVEForFixedLengthVectorVT(VT))
1242  addTypeForFixedLengthSVE(VT);
1243 
1244 // 64-bit results can mean a bigger-than-NEON input.
1245  for (auto VT : {MVT::v8i8, MVT::v4i16})
1248 
1249 // 128-bit results imply a bigger-than-NEON input.
1250  for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1252  for (auto VT : {MVT::v8f16, MVT::v4f32})
1254 
1255  // These operations are not supported on NEON but SVE can do them.
1294 
1295  // Int operations with no NEON support.
1296  for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1303  }
1304 
1305  // FP operations with no NEON support.
1306  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1309 
1310  // Use SVE for vectors with more than 2 elements.
1311  for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1313  }
1314 
1319  }
1320 
1322 }
1323 
1324 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
1325  assert(VT.isVector() && "VT should be a vector type");
1326 
1327  if (VT.isFloatingPoint()) {
1329  setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1330  setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1331  }
1332 
1333  // Mark vector float intrinsics as expand.
1334  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1343 
1344  // But we do support custom-lowering for FCOPYSIGN.
1346  }
1347 
1359 
1363  for (MVT InnerVT : MVT::all_valuetypes())
1364  setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1365 
1366  // CNT supports only B element sizes, then use UADDLP to widen.
1367  if (VT != MVT::v8i8 && VT != MVT::v16i8)
1369 
1375 
1378 
1379  if (!VT.isFloatingPoint())
1381 
1382  // [SU][MIN|MAX] are available for all NEON types apart from i64.
1383  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1384  for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1385  setOperationAction(Opcode, VT, Legal);
1386 
1387  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types.
1388  if (VT.isFloatingPoint() &&
1389  VT.getVectorElementType() != MVT::bf16 &&
1390  (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1391  for (unsigned Opcode :
1393  setOperationAction(Opcode, VT, Legal);
1394 
1395  if (Subtarget->isLittleEndian()) {
1396  for (unsigned im = (unsigned)ISD::PRE_INC;
1397  im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1400  }
1401  }
1402 }
1403 
1404 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1405  assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1406 
1407  // By default everything must be expanded.
1408  for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1410 
1411  // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1413 
1414  if (VT.isFloatingPoint()) {
1426  }
1427 
1428  // Lower fixed length vector operations to scalable equivalents.
1496 }
1497 
1498 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1499  addRegisterClass(VT, &AArch64::FPR64RegClass);
1500  addTypeForNEON(VT, MVT::v2i32);
1501 }
1502 
1503 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1504  addRegisterClass(VT, &AArch64::FPR128RegClass);
1505  addTypeForNEON(VT, MVT::v4i32);
1506 }
1507 
1509  LLVMContext &C, EVT VT) const {
1510  if (!VT.isVector())
1511  return MVT::i32;
1512  if (VT.isScalableVector())
1515 }
1516 
1517 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1518  const APInt &Demanded,
1520  unsigned NewOpc) {
1521  uint64_t OldImm = Imm, NewImm, Enc;
1522  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1523 
1524  // Return if the immediate is already all zeros, all ones, a bimm32 or a
1525  // bimm64.
1526  if (Imm == 0 || Imm == Mask ||
1528  return false;
1529 
1530  unsigned EltSize = Size;
1531  uint64_t DemandedBits = Demanded.getZExtValue();
1532 
1533  // Clear bits that are not demanded.
1534  Imm &= DemandedBits;
1535 
1536  while (true) {
1537  // The goal here is to set the non-demanded bits in a way that minimizes
1538 // the number of switches between 0 and 1. In order to achieve this goal,
1539  // we set the non-demanded bits to the value of the preceding demanded bits.
1540  // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1541  // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1542  // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1543  // The final result is 0b11000011.
1544  uint64_t NonDemandedBits = ~DemandedBits;
1545  uint64_t InvertedImm = ~Imm & DemandedBits;
1546  uint64_t RotatedImm =
1547  ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1548  NonDemandedBits;
1549  uint64_t Sum = RotatedImm + NonDemandedBits;
1550  bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1551  uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1552  NewImm = (Imm | Ones) & Mask;
1553 
1554  // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1555  // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1556  // we halve the element size and continue the search.
1557  if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1558  break;
1559 
1560  // We cannot shrink the element size any further if it is 2-bits.
1561  if (EltSize == 2)
1562  return false;
1563 
1564  EltSize /= 2;
1565  Mask >>= EltSize;
1566  uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1567 
1568  // Return if there is mismatch in any of the demanded bits of Imm and Hi.
1569  if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1570  return false;
1571 
1572  // Merge the upper and lower halves of Imm and DemandedBits.
1573  Imm |= Hi;
1574  DemandedBits |= DemandedBitsHi;
1575  }
1576 
1577  ++NumOptimizedImms;
1578 
1579  // Replicate the element across the register width.
1580  while (EltSize < Size) {
1581  NewImm |= NewImm << EltSize;
1582  EltSize *= 2;
1583  }
1584 
1585  (void)OldImm;
1586  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1587  "demanded bits should never be altered");
1588  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1589 
1590  // Create the new constant immediate node.
1591  EVT VT = Op.getValueType();
1592  SDLoc DL(Op);
1593  SDValue New;
1594 
1595  // If the new constant immediate is all-zeros or all-ones, let the target
1596  // independent DAG combine optimize this node.
1597  if (NewImm == 0 || NewImm == OrigMask) {
1598  New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1599  TLO.DAG.getConstant(NewImm, DL, VT));
1600  // Otherwise, create a machine node so that target independent DAG combine
1601  // doesn't undo this optimization.
1602  } else {
1604  SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1605  New = SDValue(
1606  TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1607  }
1608 
1609  return TLO.CombineTo(Op, New);
1610 }
1611 
1613  SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1614  TargetLoweringOpt &TLO) const {
1615  // Delay this optimization to as late as possible.
1616  if (!TLO.LegalOps)
1617  return false;
1618 
1620  return false;
1621 
1622  EVT VT = Op.getValueType();
1623  if (VT.isVector())
1624  return false;
1625 
1626  unsigned Size = VT.getSizeInBits();
1627  assert((Size == 32 || Size == 64) &&
1628  "i32 or i64 is expected after legalization.");
1629 
1630  // Exit early if we demand all bits.
1631  if (DemandedBits.countPopulation() == Size)
1632  return false;
1633 
1634  unsigned NewOpc;
1635  switch (Op.getOpcode()) {
1636  default:
1637  return false;
1638  case ISD::AND:
1639  NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1640  break;
1641  case ISD::OR:
1642  NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1643  break;
1644  case ISD::XOR:
1645  NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1646  break;
1647  }
1648  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1649  if (!C)
1650  return false;
1651  uint64_t Imm = C->getZExtValue();
1652  return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1653 }
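// In practice this hook only fires for scalar i32/i64 AND/OR/XOR nodes whose
// right-hand operand is a constant and for which some bits are not demanded;
// vector operations and fully-demanded constants are left untouched.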
1654 
1655 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1656 /// Mask are known to be either zero or one and return them Known.
1658  const SDValue Op, KnownBits &Known,
1659  const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
1660  switch (Op.getOpcode()) {
1661  default:
1662  break;
1663  case AArch64ISD::CSEL: {
1664  KnownBits Known2;
1665  Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1666  Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1667  Known = KnownBits::commonBits(Known, Known2);
1668  break;
1669  }
1670  case AArch64ISD::LOADgot:
1671  case AArch64ISD::ADDlow: {
1672  if (!Subtarget->isTargetILP32())
1673  break;
1674  // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1675  Known.Zero = APInt::getHighBitsSet(64, 32);
1676  break;
1677  }
1678  case ISD::INTRINSIC_W_CHAIN: {
1679  ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1680  Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1681  switch (IntID) {
1682  default: return;
1683  case Intrinsic::aarch64_ldaxr:
1684  case Intrinsic::aarch64_ldxr: {
1685  unsigned BitWidth = Known.getBitWidth();
1686  EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1687  unsigned MemBits = VT.getScalarSizeInBits();
1688  Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1689  return;
1690  }
1691  }
1692  break;
1693  }
1695  case ISD::INTRINSIC_VOID: {
1696  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1697  switch (IntNo) {
1698  default:
1699  break;
1700  case Intrinsic::aarch64_neon_umaxv:
1701  case Intrinsic::aarch64_neon_uminv: {
1702  // Figure out the datatype of the vector operand. The UMINV instruction
1703  // will zero extend the result, so we can mark as known zero all the
1704 // bits larger than the element datatype. 32-bit or larger doesn't need
1705  // this as those are legal types and will be handled by isel directly.
1706  MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1707  unsigned BitWidth = Known.getBitWidth();
1708  if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1709  assert(BitWidth >= 8 && "Unexpected width!");
1711  Known.Zero |= Mask;
1712  } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1713  assert(BitWidth >= 16 && "Unexpected width!");
1715  Known.Zero |= Mask;
1716  }
1717  break;
1718  } break;
1719  }
1720  }
1721  }
1722 }
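// For instance, in the INTRINSIC_W_CHAIN case above, a value produced by
// Intrinsic::aarch64_ldxr that loads an i8 has every bit above bit 7 reported
// as known zero, letting later combines drop redundant masking of the
// exclusive-load result.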
1723 
1725  EVT) const {
1726  return MVT::i64;
1727 }
1728 
1730  EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1731  bool *Fast) const {
1732  if (Subtarget->requiresStrictAlign())
1733  return false;
1734 
1735  if (Fast) {
1736  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1737  *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1738  // See comments in performSTORECombine() for more details about
1739  // these conditions.
1740 
1741  // Code that uses clang vector extensions can mark that it
1742  // wants unaligned accesses to be treated as fast by
1743  // underspecifying alignment to be 1 or 2.
1744  Alignment <= 2 ||
1745 
1746  // Disregard v2i64. Memcpy lowering produces those and splitting
1747  // them regresses performance on micro-benchmarks and olden/bh.
1748  VT == MVT::v2i64;
1749  }
1750  return true;
1751 }
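// Illustrative behaviour of the query above: without the strict-alignment
// subtarget feature, an unaligned i64 store is reported as allowed, and as
// fast because it is not a 128-bit store; with strict alignment required the
// query returns false and callers must fall back to an aligned expansion.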
1752 
1753 // Same as above but handling LLTs instead.
1755  LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1756  bool *Fast) const {
1757  if (Subtarget->requiresStrictAlign())
1758  return false;
1759 
1760  if (Fast) {
1761  // Some CPUs are fine with unaligned stores except for 128-bit ones.
1762  *Fast = !Subtarget->isMisaligned128StoreSlow() ||
1763  Ty.getSizeInBytes() != 16 ||
1764  // See comments in performSTORECombine() for more details about
1765  // these conditions.
1766 
1767  // Code that uses clang vector extensions can mark that it
1768  // wants unaligned accesses to be treated as fast by
1769  // underspecifying alignment to be 1 or 2.
1770  Alignment <= 2 ||
1771 
1772  // Disregard v2i64. Memcpy lowering produces those and splitting
1773  // them regresses performance on micro-benchmarks and olden/bh.
1774  Ty == LLT::vector(2, 64);
1775  }
1776  return true;
1777 }
1778 
1779 FastISel *
1781  const TargetLibraryInfo *libInfo) const {
1782  return AArch64::createFastISel(funcInfo, libInfo);
1783 }
1784 
1785 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
1786 #define MAKE_CASE(V) \
1787  case V: \
1788  return #V;
1789  switch ((AArch64ISD::NodeType)Opcode) {
1791  break;
2068  }
2069 #undef MAKE_CASE
2070  return nullptr;
2071 }
2072 
2075  MachineBasicBlock *MBB) const {
2076  // We materialise the F128CSEL pseudo-instruction as some control flow and a
2077  // phi node:
2078 
2079  // OrigBB:
2080  // [... previous instrs leading to comparison ...]
2081  // b.ne TrueBB
2082  // b EndBB
2083  // TrueBB:
2084  // ; Fallthrough
2085  // EndBB:
2086  // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2087 
2088  MachineFunction *MF = MBB->getParent();
2089  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2090  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2091  DebugLoc DL = MI.getDebugLoc();
2093 
2094  Register DestReg = MI.getOperand(0).getReg();
2095  Register IfTrueReg = MI.getOperand(1).getReg();
2096  Register IfFalseReg = MI.getOperand(2).getReg();
2097  unsigned CondCode = MI.getOperand(3).getImm();
2098  bool NZCVKilled = MI.getOperand(4).isKill();
2099 
2100  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2101  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2102  MF->insert(It, TrueBB);
2103  MF->insert(It, EndBB);
2104 
2105 // Transfer the rest of the current basic block to EndBB.
2106  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2107  MBB->end());
2109 
2110  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2111  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2112  MBB->addSuccessor(TrueBB);
2113  MBB->addSuccessor(EndBB);
2114 
2115  // TrueBB falls through to the end.
2116  TrueBB->addSuccessor(EndBB);
2117 
2118  if (!NZCVKilled) {
2119  TrueBB->addLiveIn(AArch64::NZCV);
2120  EndBB->addLiveIn(AArch64::NZCV);
2121  }
2122 
2123  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2124  .addReg(IfTrueReg)
2125  .addMBB(TrueBB)
2126  .addReg(IfFalseReg)
2127  .addMBB(MBB);
2128 
2129  MI.eraseFromParent();
2130  return EndBB;
2131 }
2132 
2134  MachineInstr &MI, MachineBasicBlock *BB) const {
2136  BB->getParent()->getFunction().getPersonalityFn())) &&
2137  "SEH does not use catchret!");
2138  return BB;
2139 }
2140 
2142  MachineInstr &MI, MachineBasicBlock *BB) const {
2143  switch (MI.getOpcode()) {
2144  default:
2145 #ifndef NDEBUG
2146  MI.dump();
2147 #endif
2148  llvm_unreachable("Unexpected instruction for custom inserter!");
2149 
2150  case AArch64::F128CSEL:
2151  return EmitF128CSEL(MI, BB);
2152 
2153  case TargetOpcode::STACKMAP:
2154  case TargetOpcode::PATCHPOINT:
2155  case TargetOpcode::STATEPOINT:
2156  return emitPatchPoint(MI, BB);
2157 
2158  case AArch64::CATCHRET:
2159  return EmitLoweredCatchRet(MI, BB);
2160  }
2161 }
2162 
2163 //===----------------------------------------------------------------------===//
2164 // AArch64 Lowering private implementation.
2165 //===----------------------------------------------------------------------===//
2166 
2167 //===----------------------------------------------------------------------===//
2168 // Lowering Code
2169 //===----------------------------------------------------------------------===//
2170 
2171 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2172 static bool isZerosVector(const SDNode *N) {
2173  // Look through a bit convert.
2174  while (N->getOpcode() == ISD::BITCAST)
2175  N = N->getOperand(0).getNode();
2176 
2178  return true;
2179 
2180  if (N->getOpcode() != AArch64ISD::DUP)
2181  return false;
2182 
2183  auto Opnd0 = N->getOperand(0);
2184  auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2185  auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2186  return (CINT && CINT->isNullValue()) || (CFP && CFP->isZero());
2187 }
2188 
2189 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
2190 /// CC
2192  switch (CC) {
2193  default:
2194  llvm_unreachable("Unknown condition code!");
2195  case ISD::SETNE:
2196  return AArch64CC::NE;
2197  case ISD::SETEQ:
2198  return AArch64CC::EQ;
2199  case ISD::SETGT:
2200  return AArch64CC::GT;
2201  case ISD::SETGE:
2202  return AArch64CC::GE;
2203  case ISD::SETLT:
2204  return AArch64CC::LT;
2205  case ISD::SETLE:
2206  return AArch64CC::LE;
2207  case ISD::SETUGT:
2208  return AArch64CC::HI;
2209  case ISD::SETUGE:
2210  return AArch64CC::HS;
2211  case ISD::SETULT:
2212  return AArch64CC::LO;
2213  case ISD::SETULE:
2214  return AArch64CC::LS;
2215  }
2216 }
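// For example, the unsigned ISD::SETUGT maps to AArch64CC::HI ("unsigned
// higher"), while the signed ISD::SETGT maps to AArch64CC::GT.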
2217 
2218 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2221  AArch64CC::CondCode &CondCode2) {
2222  CondCode2 = AArch64CC::AL;
2223  switch (CC) {
2224  default:
2225  llvm_unreachable("Unknown FP condition!");
2226  case ISD::SETEQ:
2227  case ISD::SETOEQ:
2229  break;
2230  case ISD::SETGT:
2231  case ISD::SETOGT:
2233  break;
2234  case ISD::SETGE:
2235  case ISD::SETOGE:
2237  break;
2238  case ISD::SETOLT:
2240  break;
2241  case ISD::SETOLE:
2243  break;
2244  case ISD::SETONE:
2246  CondCode2 = AArch64CC::GT;
2247  break;
2248  case ISD::SETO:
2250  break;
2251  case ISD::SETUO:
2253  break;
2254  case ISD::SETUEQ:
2256  CondCode2 = AArch64CC::VS;
2257  break;
2258  case ISD::SETUGT:
2260  break;
2261  case ISD::SETUGE:
2263  break;
2264  case ISD::SETLT:
2265  case ISD::SETULT:
2267  break;
2268  case ISD::SETLE:
2269  case ISD::SETULE:
2271  break;
2272  case ISD::SETNE:
2273  case ISD::SETUNE:
2275  break;
2276  }
2277 }
2278 
2279 /// Convert a DAG fp condition code to an AArch64 CC.
2280 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2281 /// should be AND'ed instead of OR'ed.
2284  AArch64CC::CondCode &CondCode2) {
2285  CondCode2 = AArch64CC::AL;
2286  switch (CC) {
2287  default:
2288  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2289  assert(CondCode2 == AArch64CC::AL);
2290  break;
2291  case ISD::SETONE:
2292  // (a one b)
2293  // == ((a olt b) || (a ogt b))
2294  // == ((a ord b) && (a une b))
2296  CondCode2 = AArch64CC::NE;
2297  break;
2298  case ISD::SETUEQ:
2299  // (a ueq b)
2300  // == ((a uno b) || (a oeq b))
2301  // == ((a ule b) && (a uge b))
2303  CondCode2 = AArch64CC::LE;
2304  break;
2305  }
2306 }
2307 
2308 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2309 /// CC usable with the vector instructions. Fewer operations are available
2310 /// without a real NZCV register, so we have to use less efficient combinations
2311 /// to get the same effect.
2314  AArch64CC::CondCode &CondCode2,
2315  bool &Invert) {
2316  Invert = false;
2317  switch (CC) {
2318  default:
2319  // Mostly the scalar mappings work fine.
2320  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2321  break;
2322  case ISD::SETUO:
2323  Invert = true;
2325  case ISD::SETO:
2327  CondCode2 = AArch64CC::GE;
2328  break;
2329  case ISD::SETUEQ:
2330  case ISD::SETULT:
2331  case ISD::SETULE:
2332  case ISD::SETUGT:
2333  case ISD::SETUGE:
2334  // All of the compare-mask comparisons are ordered, but we can switch
2335  // between the two by a double inversion. E.g. ULE == !OGT.
2336  Invert = true;
2337  changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2338  CondCode, CondCode2);
2339  break;
2340  }
2341 }
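// For example, SETULE has no direct compare-mask encoding; it is handled by
// building the mask for the inverse ordered condition (OGT) and then
// inverting the result, which is what the Invert flag reports to the caller.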
2342 
2343 static bool isLegalArithImmed(uint64_t C) {
2344  // Matches AArch64DAGToDAGISel::SelectArithImmed().
2345  bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
2346  LLVM_DEBUG(dbgs() << "Is imm " << C
2347  << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2348  return IsLegal;
2349 }
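// For example, 0xFFF and 0x123000 are legal (a 12-bit value, optionally
// shifted left by 12), whereas 0x1001 is not and would have to be
// materialized into a register first.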
2350 
2351 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2352 // the grounds that "op1 - (-op2) == op1 + op2" ? Not always, the C and V flags
2353 // can be set differently by this operation. It comes down to whether
2354 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2355 // everything is fine. If not then the optimization is wrong. Thus general
2356 // comparisons are only valid if op2 != 0.
2357 //
2358 // So, finally, the only LLVM-native comparisons that don't mention C and V
2359 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2360 // the absence of information about op2.
2361 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2362  return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2363  (CC == ISD::SETEQ || CC == ISD::SETNE);
2364 }
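// For example, (SETEQ x, (SUB 0, y)) can become "CMN x, y", but for a
// comparison such as SETLT the C and V flags of "ADDS x, y" are not
// guaranteed to match those of comparing against the negated value, so it is
// rejected here.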
2365 
2367  SelectionDAG &DAG, SDValue Chain,
2368  bool IsSignaling) {
2369  EVT VT = LHS.getValueType();
2370  assert(VT != MVT::f128);
2371  assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
2372  unsigned Opcode =
2374  return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2375 }
2376 
2378  const SDLoc &dl, SelectionDAG &DAG) {
2379  EVT VT = LHS.getValueType();
2380  const bool FullFP16 =
2381  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2382 
2383  if (VT.isFloatingPoint()) {
2384  assert(VT != MVT::f128);
2385  if (VT == MVT::f16 && !FullFP16) {
2386  LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2387  RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2388  VT = MVT::f32;
2389  }
2390  return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2391  }
2392 
2393  // The CMP instruction is just an alias for SUBS, and representing it as
2394  // SUBS means that it's possible to get CSE with subtract operations.
2395  // A later phase can perform the optimization of setting the destination
2396  // register to WZR/XZR if it ends up being unused.
2397  unsigned Opcode = AArch64ISD::SUBS;
2398 
2399  if (isCMN(RHS, CC)) {
2400  // Can we combine a (CMP op1, (sub 0, op2) into a CMN instruction ?
2401  Opcode = AArch64ISD::ADDS;
2402  RHS = RHS.getOperand(1);
2403  } else if (isCMN(LHS, CC)) {
2404 // As we are looking for EQ/NE compares, the operands can be commuted; can
2405  // we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
2406  Opcode = AArch64ISD::ADDS;
2407  LHS = LHS.getOperand(1);
2408  } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2409  if (LHS.getOpcode() == ISD::AND) {
2410  // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2411  // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2412  // of the signed comparisons.
2413  const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2414  DAG.getVTList(VT, MVT_CC),
2415  LHS.getOperand(0),
2416  LHS.getOperand(1));
2417  // Replace all users of (and X, Y) with newly generated (ands X, Y)
2418  DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2419  return ANDSNode.getValue(1);
2420  } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2421  // Use result of ANDS
2422  return LHS.getValue(1);
2423  }
2424  }
2425 
2426  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2427  .getValue(1);
2428 }
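// For example, "setcc eq, x, (sub 0, y)" is emitted as ADDS (i.e. CMN x, y),
// and "setcc ne, (and x, y), 0" reuses the flags of an ANDS (TST) instead of
// emitting a separate SUBS against zero; see emitComparison() above.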
2429 
2430 /// \defgroup AArch64CCMP CMP;CCMP matching
2431 ///
2432 /// These functions deal with the formation of CMP;CCMP;... sequences.
2433 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2434 /// a comparison. They set the NZCV flags to a predefined value if their
2435 /// predicate is false. This allows expressing arbitrary conjunctions, for
2436 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2437 /// expressed as:
2438 /// cmp A
2439 /// ccmp B, inv(CB), CA
2440 /// check for CB flags
2441 ///
2442 /// This naturally lets us implement chains of AND operations with SETCC
2443 /// operands. And we can even implement some other situations by transforming
2444 /// them:
2445 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2446 /// negating the flags used in a CCMP/FCCMP operations.
2447 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2448 /// by negating the flags we test for afterwards. i.e.
2449 /// NEG (CMP CCMP CCCMP ...) can be implemented.
2450 /// - Note that we can only ever negate all previously processed results.
2451 /// What we can not implement by flipping the flags to test is a negation
2452 /// of two sub-trees (because the negation affects all sub-trees emitted so
2453 /// far, so the 2nd sub-tree we emit would also affect the first).
2454 /// With those tools we can implement some OR operations:
2455 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2456 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2457 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2458 /// elimination rules from earlier to implement the whole thing as a
2459 /// CCMP/FCCMP chain.
2460 ///
2461 /// As complete example:
2462 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2463 /// (and (setCC (cmp C)) (setCD (cmp D)))"
2464 /// can be reassociated to:
2465 /// or (and (setCC (cmp C)) (setCD (cmp D)))
2466 /// (or (setCA (cmp A)) (setCB (cmp B)))
2467 /// can be transformed to:
2468 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2469 /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
2470 /// which can be implemented as:
2471 /// cmp C
2472 /// ccmp D, inv(CD), CC
2473 /// ccmp A, CA, inv(CD)
2474 /// ccmp B, CB, inv(CA)
2475 /// check for CB flags
2476 ///
2477 /// A counterexample is "or (and A B) (and C D)" which translates to
2478 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))); we
2479 /// can only implement one of the inner (not) operations, but not both!
2480 /// @{
2481 
2482 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2484  ISD::CondCode CC, SDValue CCOp,
2486  AArch64CC::CondCode OutCC,
2487  const SDLoc &DL, SelectionDAG &DAG) {
2488  unsigned Opcode = 0;
2489  const bool FullFP16 =
2490  static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
2491 
2492  if (LHS.getValueType().isFloatingPoint()) {
2493  assert(LHS.getValueType() != MVT::f128);
2494  if (LHS.getValueType() == MVT::f16 && !FullFP16) {
2495  LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
2496  RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
2497  }
2498  Opcode = AArch64ISD::FCCMP;
2499  } else if (RHS.getOpcode() == ISD::SUB) {
2500  SDValue SubOp0 = RHS.getOperand(0);
2501  if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
2502  // See emitComparison() on why we can only do this for SETEQ and SETNE.
2503  Opcode = AArch64ISD::CCMN;
2504  RHS = RHS.getOperand(1);
2505  }
2506  }
2507  if (Opcode == 0)
2508  Opcode = AArch64ISD::CCMP;
2509 
2510  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
2512  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
2513  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
2514  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
2515 }
2516 
2517 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
2518 /// expressed as a conjunction. See \ref AArch64CCMP.
2519 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
2520 /// changing the conditions on the SETCC tests.
2521 /// (this means we can call emitConjunctionRec() with
2522 /// Negate==true on this sub-tree)
2523 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
2524 /// cannot do the negation naturally. We are required to
2525 /// emit the subtree first in this case.
2526 /// \param WillNegate Is true if we are called when the result of this
2527 /// subexpression must be negated. This happens when the
2528 /// outer expression is an OR. We can use this fact to know
2529 /// that we have a double negation (or (or ...) ...) that
2530 /// can be implemented for free.
2531 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
2532  bool &MustBeFirst, bool WillNegate,
2533  unsigned Depth = 0) {
2534  if (!Val.hasOneUse())
2535  return false;
2536  unsigned Opcode = Val->getOpcode();
2537  if (Opcode == ISD::SETCC) {
2538  if (Val->getOperand(0).getValueType() == MVT::f128)
2539  return false;
2540  CanNegate = true;
2541  MustBeFirst = false;
2542  return true;
2543  }
2544  // Protect against exponential runtime and stack overflow.
2545  if (Depth > 6)
2546  return false;
2547  if (Opcode == ISD::AND || Opcode == ISD::OR) {
2548  bool IsOR = Opcode == ISD::OR;
2549  SDValue O0 = Val->getOperand(0);
2550  SDValue O1 = Val->getOperand(1);
2551  bool CanNegateL;
2552  bool MustBeFirstL;
2553  if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
2554  return false;
2555  bool CanNegateR;
2556  bool MustBeFirstR;
2557  if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
2558  return false;
2559 
2560  if (MustBeFirstL && MustBeFirstR)
2561  return false;
2562 
2563  if (IsOR) {
2564  // For an OR expression we need to be able to naturally negate at least
2565  // one side or we cannot do the transformation at all.
2566  if (!CanNegateL && !CanNegateR)
2567  return false;
2568 // If the result of the OR will be negated and we can naturally negate
2569 // the leaves, then this sub-tree as a whole negates naturally.
2570  CanNegate = WillNegate && CanNegateL && CanNegateR;
2571  // If we cannot naturally negate the whole sub-tree, then this must be
2572  // emitted first.
2573  MustBeFirst = !CanNegate;
2574  } else {
2575  assert(Opcode == ISD::AND && "Must be OR or AND");
2576  // We cannot naturally negate an AND operation.
2577  CanNegate = false;
2578  MustBeFirst = MustBeFirstL || MustBeFirstR;
2579  }
2580  return true;
2581  }
2582  return false;
2583 }
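// For example, an OR of two SETCCs reports CanNegate = true only when the OR
// itself will be negated by its parent (a double negation), while an AND
// whose operands are both ORs of SETCCs is rejected, because only one
// sub-tree may require being emitted first.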
2584 
2585 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
2586 /// of CCMP/CFCMP ops. See @ref AArch64CCMP.
2587 /// Tries to transform the given i1 producing node @p Val to a series of compare
2588 /// and conditional compare operations. @returns an NZCV flags producing node
2589 /// and sets @p OutCC to the flags that should be tested or returns SDValue() if
2590 /// transformation was not possible.
2591 /// \p Negate is true if we want this sub-tree being negated just by changing
2592 /// SETCC conditions.
2594  AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
2596  // We're at a tree leaf, produce a conditional comparison operation.
2597  unsigned Opcode = Val->getOpcode();
2598  if (Opcode == ISD::SETCC) {
2599  SDValue LHS = Val->getOperand(0);
2600  SDValue RHS = Val->getOperand(1);
2601  ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
2602  bool isInteger = LHS.getValueType().isInteger();
2603  if (Negate)
2604  CC = getSetCCInverse(CC, LHS.getValueType());
2605  SDLoc DL(Val);
2606  // Determine OutCC and handle FP special case.
2607  if (isInteger) {
2608  OutCC = changeIntCCToAArch64CC(CC);
2609  } else {
2611  AArch64CC::CondCode ExtraCC;
2612  changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
2613  // Some floating point conditions can't be tested with a single condition
2614  // code. Construct an additional comparison in this case.
2615  if (ExtraCC != AArch64CC::AL) {
2616  SDValue ExtraCmp;
2617  if (!CCOp.getNode())
2618  ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
2619  else
2620  ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
2621  ExtraCC, DL, DAG);
2622  CCOp = ExtraCmp;
2623  Predicate = ExtraCC;
2624  }
2625  }
2626 
2627  // Produce a normal comparison if we are first in the chain
2628  if (!CCOp)
2629  return emitComparison(LHS, RHS, CC, DL, DAG);
2630  // Otherwise produce a ccmp.
2631  return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
2632  DAG);
2633  }
2634  assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
2635 
2636  bool IsOR = Opcode == ISD::OR;
2637 
2638  SDValue LHS = Val->getOperand(0);
2639  bool CanNegateL;
2640  bool MustBeFirstL;
2641  bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
2642  assert(ValidL && "Valid conjunction/disjunction tree");
2643  (void)ValidL;
2644 
2645  SDValue RHS = Val->getOperand(1);
2646  bool CanNegateR;
2647  bool MustBeFirstR;
2648  bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
2649  assert(ValidR && "Valid conjunction/disjunction tree");
2650  (void)ValidR;
2651 
2652  // Swap sub-tree that must come first to the right side.
2653  if (MustBeFirstL) {
2654  assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
2655  std::swap(LHS, RHS);
2656  std::swap(CanNegateL, CanNegateR);
2657  std::swap(MustBeFirstL, MustBeFirstR);
2658  }
2659 
2660  bool NegateR;
2661  bool NegateAfterR;
2662  bool NegateL;
2663  bool NegateAfterAll;
2664  if (Opcode == ISD::OR) {
2665  // Swap the sub-tree that we can negate naturally to the left.
2666  if (!CanNegateL) {
2667  assert(CanNegateR && "at least one side must be negatable");
2668  assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
2669  assert(!Negate);
2670  std::swap(LHS, RHS);
2671  NegateR = false;
2672  NegateAfterR = true;
2673  } else {
2674  // Negate the left sub-tree if possible, otherwise negate the result.
2675  NegateR = CanNegateR;
2676  NegateAfterR = !CanNegateR;
2677  }
2678  NegateL = true;
2679  NegateAfterAll = !Negate;
2680  } else {
2681  assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
2682  assert(!Negate && "Valid conjunction/disjunction tree");
2683 
2684  NegateL = false;
2685  NegateR = false;
2686  NegateAfterR = false;
2687  NegateAfterAll = false;
2688  }
2689 
2690  // Emit sub-trees.
2691  AArch64CC::CondCode RHSCC;
2692  SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
2693  if (NegateAfterR)
2694  RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
2695  SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
2696  if (NegateAfterAll)
2697  OutCC = AArch64CC::getInvertedCondCode(OutCC);
2698  return CmpL;
2699 }
2700 
2701 /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
2702 /// In some cases this is even possible with OR operations in the expression.
2703 /// See \ref AArch64CCMP.
2704 /// \see emitConjunctionRec().
2706  AArch64CC::CondCode &OutCC) {
2707  bool DummyCanNegate;
2708  bool DummyMustBeFirst;
2709  if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
2710  return SDValue();
2711 
2712  return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
2713 }
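// For example, "(and (setcc a, b, eq) (setcc c, d, gt))" becomes a CMP of one
// leaf followed by a CCMP of the other (FCMP/FCCMP for floating point), with
// OutCC set to the single condition that finally has to be tested; see the
// worked examples in the \ref AArch64CCMP block above.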
2714 
2715 /// @}
2716 
2717 /// Returns how profitable it is to fold a comparison's operand's shift and/or
2718 /// extension operations.
2720  auto isSupportedExtend = [&](SDValue V) {
2721  if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
2722  return true;
2723 
2724  if (V.getOpcode() == ISD::AND)
2725  if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
2726  uint64_t Mask = MaskCst->getZExtValue();
2727  return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
2728  }
2729 
2730  return false;
2731  };
2732 
2733  if (!Op.hasOneUse())
2734  return 0;
2735 
2736  if (isSupportedExtend(Op))
2737  return 1;
2738 
2739  unsigned Opc = Op.getOpcode();
2740  if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
2741  if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
2742  uint64_t Shift = ShiftCst->getZExtValue();
2743  if (isSupportedExtend(Op.getOperand(0)))
2744  return (Shift <= 4) ? 2 : 1;
2745  EVT VT = Op.getValueType();
2746  if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
2747  return 1;
2748  }
2749 
2750  return 0;
2751 }
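// Scoring example for the heuristic above: (shl (and x, 0xFFFF), 2) is a
// supported zero-extend followed by a small shift and scores 2, so the
// comparison lowering below prefers such an operand on the RHS, where it can
// be folded as an extended-register operand; a plain register scores 0.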
2752 
2754  SDValue &AArch64cc, SelectionDAG &DAG,
2755  const SDLoc &dl) {
2756  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
2757  EVT VT = RHS.getValueType();
2758  uint64_t C = RHSC->getZExtValue();
2759  if (!isLegalArithImmed(C)) {
2760  // Constant does not fit, try adjusting it by one?
2761  switch (CC) {
2762  default:
2763  break;
2764  case ISD::SETLT:
2765  case ISD::SETGE:
2766  if ((VT == MVT::i32 && C != 0x80000000 &&
2767  isLegalArithImmed((uint32_t)(C - 1))) ||
2768  (VT == MVT::i64 && C != 0x80000000ULL &&
2769  isLegalArithImmed(C - 1ULL))) {
2770  CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
2771  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2772  RHS = DAG.getConstant(C, dl, VT);
2773  }
2774  break;
2775  case ISD::SETULT:
2776  case ISD::SETUGE:
2777  if ((VT == MVT::i32 && C != 0 &&
2778  isLegalArithImmed((uint32_t)(C - 1))) ||
2779  (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
2780  CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
2781  C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
2782  RHS = DAG.getConstant(C, dl, VT);
2783  }
2784  break;
2785  case ISD::SETLE:
2786  case ISD::SETGT:
2787  if ((VT == MVT::i32 && C != INT32_MAX &&
2788  isLegalArithImmed((uint32_t)(C + 1))) ||
2789  (VT == MVT::i64 && C != INT64_MAX &&
2790  isLegalArithImmed(C + 1ULL))) {
2791  CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
2792  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2793  RHS = DAG.getConstant(C, dl, VT);
2794  }
2795  break;
2796  case ISD::SETULE:
2797  case ISD::SETUGT:
2798  if ((VT == MVT::i32 && C != UINT32_MAX &&
2799  isLegalArithImmed((uint32_t)(C + 1))) ||
2800  (VT == MVT::i64 && C != UINT64_MAX &&
2801  isLegalArithImmed(C + 1ULL))) {
2802  CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
2803  C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
2804  RHS = DAG.getConstant(C, dl, VT);
2805  }
2806  break;
2807  }
2808  }
2809  }
2810 
2811  // Comparisons are canonicalized so that the RHS operand is simpler than the
2812  // LHS one, the extreme case being when RHS is an immediate. However, AArch64
2813  // can fold some shift+extend operations on the RHS operand, so swap the
2814  // operands if that can be done.
2815  //
2816  // For example:
2817  // lsl w13, w11, #1
2818  // cmp w13, w12
2819  // can be turned into:
2820  // cmp w12, w11, lsl #1
2821  if (!isa<ConstantSDNode>(RHS) ||
2822  !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
2823  SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
2824 
2826  std::swap(LHS, RHS);
2828  }
2829  }
2830 
2831  SDValue Cmp;
2832  AArch64CC::CondCode AArch64CC;
2833  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
2834  const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
2835 
2836  // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
2837  // For the i8 operand, the largest immediate is 255, so this can be easily
2838  // encoded in the compare instruction. For the i16 operand, however, the
2839  // largest immediate cannot be encoded in the compare.
2840  // Therefore, use a sign extending load and cmn to avoid materializing the
2841  // -1 constant. For example,
2842  // movz w1, #65535
2843  // ldrh w0, [x0, #0]
2844  // cmp w0, w1
2845  // >
2846  // ldrsh w0, [x0, #0]
2847  // cmn w0, #1
2848  // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
2849  // if and only if (sext LHS) == (sext RHS). The checks are in place to
2850  // ensure both the LHS and RHS are truly zero extended and to make sure the
2851  // transformation is profitable.
2852  if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
2853  cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
2854  cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
2855  LHS.getNode()->hasNUsesOfValue(1, 0)) {
2856  int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
2857  if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
2858  SDValue SExt =
2859  DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
2860  DAG.getValueType(MVT::i16));
2861  Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
2862  RHS.getValueType()),
2863  CC, dl, DAG);
2864  AArch64CC = changeIntCCToAArch64CC(CC);
2865  }
2866  }
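 // Sketch of the equivalence relied on above (illustrative, not from the
 // original source): for 16-bit quantities L and C,
 //   (zext i16 L) == (zext i16 C)  <=>  (sext i16 L) == (sext i16 C)
 // because both extensions are injective on 16-bit values; comparing a
 // zero-extended halfword load against 65535 therefore becomes a comparison
 // of the sign-extended load against -1, which "cmn w0, #1" can encode.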
2867 
2868  if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
2869  if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
2870  if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
2871  AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
2872  }
2873  }
2874  }
2875 
2876  if (!Cmp) {
2877  Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
2878  AArch64CC = changeIntCCToAArch64CC(CC);
2879  }
2880  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
2881  return Cmp;
2882 }
2883 
2884 static std::pair<SDValue, SDValue>
2885 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
2886  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
2887  "Unsupported value type");
2888  SDValue Value, Overflow;
2889  SDLoc DL(Op);
2890  SDValue LHS = Op.getOperand(0);
2891  SDValue RHS = Op.getOperand(1);
2892  unsigned Opc = 0;
2893  switch (Op.getOpcode()) {
2894  default:
2895  llvm_unreachable("Unknown overflow instruction!");
2896  case ISD::SADDO:
2897  Opc = AArch64ISD::ADDS;
2898  CC = AArch64CC::VS;
2899  break;
2900  case ISD::UADDO:
2901  Opc = AArch64ISD::ADDS;
2902  CC = AArch64CC::HS;
2903  break;
2904  case ISD::SSUBO:
2905  Opc = AArch64ISD::SUBS;
2906  CC = AArch64CC::VS;
2907  break;
2908  case ISD::USUBO:
2909  Opc = AArch64ISD::SUBS;
2910  CC = AArch64CC::LO;
2911  break;
2912  // Multiply needs a little bit of extra work.
2913  case ISD::SMULO:
2914  case ISD::UMULO: {
2915  CC = AArch64CC::NE;
2916  bool IsSigned = Op.getOpcode() == ISD::SMULO;
2917  if (Op.getValueType() == MVT::i32) {
2918  unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2919  // For a 32 bit multiply with overflow check we want the instruction
2920  // selector to generate a widening multiply (SMADDL/UMADDL). For that we
2921  // need to generate the following pattern:
2922  // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
2923  LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
2924  RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
2925  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2926  SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
2927  DAG.getConstant(0, DL, MVT::i64));
2928  // On AArch64 the upper 32 bits are always zero extended for a 32 bit
2929  // operation. We need to clear out the upper 32 bits, because we used a
2930  // widening multiply that wrote all 64 bits. In the end this should be a
2931  // noop.
2932  Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
2933  if (IsSigned) {
2934  // The signed overflow check requires more than just a simple check for
2935  // any bit set in the upper 32 bits of the result. These bits could be
2936  // just the sign bits of a negative number. To perform the overflow
2937  // check we arithmetic-shift the lower 32 bits of the result right by 31,
2938  // replicating the sign bit, and compare that against the upper 32 bits.
2939  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
2940  DAG.getConstant(32, DL, MVT::i64));
2941  UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
2942  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
2943  DAG.getConstant(31, DL, MVT::i64));
2944  // It is important that LowerBits is last, otherwise the arithmetic
2945  // shift will not be folded into the compare (SUBS).
2946  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
2947  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2948  .getValue(1);
2949  } else {
2950  // The overflow check for unsigned multiply is easy. We only need to
2951  // check if any of the upper 32 bits are set. This can be done with a
2952  // CMP (shifted register). For that we need to generate the following
2953  // pattern:
2954  // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
2955  SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
2956  DAG.getConstant(32, DL, MVT::i64));
2957  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2958  Overflow =
2959  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2960  DAG.getConstant(0, DL, MVT::i64),
2961  UpperBits).getValue(1);
2962  }
2963  break;
2964  }
2965  assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
2966  // For the 64 bit multiply
2967  Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
2968  if (IsSigned) {
2969  SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
2970  SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
2971  DAG.getConstant(63, DL, MVT::i64));
2972  // It is important that LowerBits is last, otherwise the arithmetic
2973  // shift will not be folded into the compare (SUBS).
2974  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2975  Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
2976  .getValue(1);
2977  } else {
2978  SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
2979  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
2980  Overflow =
2981  DAG.getNode(AArch64ISD::SUBS, DL, VTs,
2982  DAG.getConstant(0, DL, MVT::i64),
2983  UpperBits).getValue(1);
2984  }
2985  break;
2986  }
2987  } // switch (...)
2988 
2989  if (Opc) {
2990  SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
2991 
2992  // Emit the AArch64 operation with overflow check.
2993  Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
2994  Overflow = Value.getValue(1);
2995  }
2996  return std::make_pair(Value, Overflow);
2997 }
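// Illustrative scalar model (not part of the original file) of the signed
// 32-bit overflow check built above: the product is computed in 64 bits and
// overflows i32 exactly when its upper 32 bits differ from the sign
// extension of bit 31 of the lower half, which is what the SUBS of
// UpperBits against (Value >> 31) tests. The helper name is made up.
static bool modelSMulO32Overflow(int32_t A, int32_t B) {
  int64_t Wide = (int64_t)A * (int64_t)B; // what SMULL/SMADDL computes
  int32_t Lower = (int32_t)Wide;          // the truncated i32 result
  int32_t Upper = (int32_t)(Wide >> 32);  // the upper 32 bits of the product
  return Upper != (Lower >> 31);          // mirrors the SUBS + NE condition
}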
2998 
2999 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3000  if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3001  return LowerToScalableOp(Op, DAG);
3002 
3003  SDValue Sel = Op.getOperand(0);
3004  SDValue Other = Op.getOperand(1);
3005  SDLoc dl(Sel);
3006 
3007  // If the operand is an overflow checking operation, invert the condition
3008  // code and kill the Not operation. I.e., transform:
3009  // (xor (overflow_op_bool, 1))
3010  // -->
3011  // (csel 1, 0, invert(cc), overflow_op_bool)
3012  // ... which later gets transformed to just a cset instruction with an
3013  // inverted condition code, rather than a cset + eor sequence.
3014  if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3015  // Only lower legal XALUO ops.
3016  if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3017  return SDValue();
3018 
3019  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3020  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3021  AArch64CC::CondCode CC;
3022  SDValue Value, Overflow;
3023  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3024  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3025  return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3026  CCVal, Overflow);
3027  }
3028  // If neither operand is a SELECT_CC, give up.
3029  if (Sel.getOpcode() != ISD::SELECT_CC)
3030  std::swap(Sel, Other);
3031  if (Sel.getOpcode() != ISD::SELECT_CC)
3032  return Op;
3033 
3034  // The folding we want to perform is:
3035  // (xor x, (select_cc a, b, cc, 0, -1) )
3036  // -->
3037  // (csel x, (xor x, -1), cc ...)
3038  //
3039  // The latter will get matched to a CSINV instruction.
3040 
3041  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3042  SDValue LHS = Sel.getOperand(0);
3043  SDValue RHS = Sel.getOperand(1);
3044  SDValue TVal = Sel.getOperand(2);
3045  SDValue FVal = Sel.getOperand(3);
3046 
3047  // FIXME: This could be generalized to non-integer comparisons.
3048  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3049  return Op;
3050 
3051  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3052  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3053 
3054  // The values aren't constants, this isn't the pattern we're looking for.
3055  if (!CFVal || !CTVal)
3056  return Op;
3057 
3058  // We can commute the SELECT_CC by inverting the condition. This
3059  // might be needed to make this fit into a CSINV pattern.
3060  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
3061  std::swap(TVal, FVal);
3062  std::swap(CTVal, CFVal);
3063  CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3064  }
3065 
3066  // If the constants line up, perform the transform!
3067  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
3068  SDValue CCVal;
3069  SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3070 
3071  FVal = Other;
3072  TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3073  DAG.getConstant(-1ULL, dl, Other.getValueType()));
3074 
3075  return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3076  CCVal, Cmp);
3077  }
3078 
3079  return Op;
3080 }
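// Scalar model of the fold above (illustrative; the helper is not part of
// the original file): xor'ing X with a 0/-1 select is the same as selecting
// between X and ~X, which is exactly the pattern a single CSINV covers.
static int64_t modelXorSelectFold(bool Cond, int64_t X) {
  // Original form:  X ^ (Cond ? 0 : -1)
  // Folded form:    Cond ? X : ~X      (one CSINV on AArch64)
  return Cond ? X : ~X;
}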
3081 
3082 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
3083  EVT VT = Op.getValueType();
3084 
3085  // Let legalize expand this if it isn't a legal type yet.
3086  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
3087  return SDValue();
3088 
3089  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
3090 
3091  unsigned Opc;
3092  bool ExtraOp = false;
3093  switch (Op.getOpcode()) {
3094  default:
3095  llvm_unreachable("Invalid code");
3096  case ISD::ADDC:
3097  Opc = AArch64ISD::ADDS;
3098  break;
3099  case ISD::SUBC:
3100  Opc = AArch64ISD::SUBS;
3101  break;
3102  case ISD::ADDE:
3103  Opc = AArch64ISD::ADCS;
3104  ExtraOp = true;
3105  break;
3106  case ISD::SUBE:
3107  Opc = AArch64ISD::SBCS;
3108  ExtraOp = true;
3109  break;
3110  }
3111 
3112  if (!ExtraOp)
3113  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
3114  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
3115  Op.getOperand(2));
3116 }
3117 
3118 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3119  // Let legalize expand this if it isn't a legal type yet.
3120  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3121  return SDValue();
3122 
3123  SDLoc dl(Op);
3124  AArch64CC::CondCode CC;
3125  // The actual operation that sets the overflow or carry flag.
3126  SDValue Value, Overflow;
3127  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3128 
3129  // We use 0 and 1 as false and true values.
3130  SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3131  SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3132 
3133  // We use an inverted condition, because the conditional select is inverted
3134  // too. This will allow it to be selected to a single instruction:
3135  // CSINC Wd, WZR, WZR, invert(cond).
3136  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3137  Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3138  CCVal, Overflow);
3139 
3140  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
3141  return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3142 }
3143 
3144 // Prefetch operands are:
3145 // 1: Address to prefetch
3146 // 2: bool isWrite
3147 // 3: int locality (0 = no locality ... 3 = extreme locality)
3148 // 4: bool isDataCache
3149 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3150  SDLoc DL(Op);
3151  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3152  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3153  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3154 
3155  bool IsStream = !Locality;
3156  // When the locality hint is nonzero, map it to a cache level.
3157  if (Locality) {
3158  // The front-end should have filtered out the out-of-range values
3159  assert(Locality <= 3 && "Prefetch locality out-of-range");
3160  // The locality degree is the inverse of the target cache level: the
3161  // highest locality (3) targets the fastest cache (L1), whose encoding
3162  // is 0, so flip the value.
3163  Locality = 3 - Locality;
3164  }
3165 
3166  // Build the operand value encoding the expected prefetch behavior.
3167  unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3168  (!IsData << 3) | // IsDataCache bit
3169  (Locality << 1) | // Cache level bits
3170  (unsigned)IsStream; // Stream bit
3171  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3172  DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3173 }
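// Worked example of the encoding above (illustrative, not from the original
// source): a read prefetch of data with locality 3 gives IsWrite = 0,
// IsData = 1, IsStream = 0 and Locality = 3 - 3 = 0, so PrfOp = 0b00000,
// i.e. PLDL1KEEP. A write prefetch of data with locality 0 is treated as
// streaming: PrfOp = (1 << 4) | 1 = 0b10001, i.e. PSTL1STRM.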
3174 
3175 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3176  SelectionDAG &DAG) const {
3177  if (Op.getValueType().isScalableVector())
3178  return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3179 
3180  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3181  return SDValue();
3182 }
3183 
3184 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3185  SelectionDAG &DAG) const {
3186  if (Op.getValueType().isScalableVector())
3187  return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3188 
3189  bool IsStrict = Op->isStrictFPOpcode();
3190  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3191  EVT SrcVT = SrcVal.getValueType();
3192 
3193  if (SrcVT != MVT::f128) {
3194  // Expand cases where the input is a vector bigger than NEON.
3195  if (useSVEForFixedLengthVectorVT(SrcVT))
3196  return SDValue();
3197 
3198  // It's legal except when f128 is involved
3199  return Op;
3200  }
3201 
3202  return SDValue();
3203 }
3204 
3205 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3206  SelectionDAG &DAG) const {
3207  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3208  // Any additional optimization in this function should be recorded
3209  // in the cost tables.
3210  EVT InVT = Op.getOperand(0).getValueType();
3211  EVT VT = Op.getValueType();
3212 
3213  if (VT.isScalableVector()) {
3214  unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3215  ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3216  : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3217  return LowerToPredicatedOp(Op, DAG, Opcode);
3218  }
3219 
3220  unsigned NumElts = InVT.getVectorNumElements();
3221 
3222  // f16 conversions are promoted to f32 when full fp16 is not supported.
3223  if (InVT.getVectorElementType() == MVT::f16 &&
3224  !Subtarget->hasFullFP16()) {
3225  MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3226  SDLoc dl(Op);
3227  return DAG.getNode(
3228  Op.getOpcode(), dl, Op.getValueType(),
3229  DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3230  }
3231 
3232  uint64_t VTSize = VT.getFixedSizeInBits();
3233  uint64_t InVTSize = InVT.getFixedSizeInBits();
3234  if (VTSize < InVTSize) {
3235  SDLoc dl(Op);
3236  SDValue Cv =
3237  DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3238  Op.getOperand(0));
3239  return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3240  }
3241 
3242  if (VTSize > InVTSize) {
3243  SDLoc dl(Op);
3244  MVT ExtVT =
3245  MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3246  VT.getVectorNumElements());
3247  SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3248  return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3249  }
3250 
3251  // Type changing conversions are illegal.
3252  return Op;
3253 }
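// Illustrative examples of the size-mismatch handling above (not from the
// original source): v2f64 -> v2i32 converts to v2i64 first and then
// truncates, while v2f32 -> v2i64 extends to v2f64 before converting, so
// element counts always match across the conversion itself.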
3254 
3255 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3256  SelectionDAG &DAG) const {
3257  bool IsStrict = Op->isStrictFPOpcode();
3258  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3259 
3260  if (SrcVal.getValueType().isVector())
3261  return LowerVectorFP_TO_INT(Op, DAG);
3262 
3263  // f16 conversions are promoted to f32 when full fp16 is not supported.
3264  if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3265  assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3266  SDLoc dl(Op);
3267  return DAG.getNode(
3268  Op.getOpcode(), dl, Op.getValueType(),
3269  DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3270  }
3271 
3272  if (SrcVal.getValueType() != MVT::f128) {
3273  // It's legal except when f128 is involved
3274  return Op;
3275  }
3276 
3277  return SDValue();
3278 }
3279 
3280 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3281  SelectionDAG &DAG) const {
3282  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3283  // Any additional optimization in this function should be recorded
3284  // in the cost tables.
3285  EVT VT = Op.getValueType();
3286  SDLoc dl(Op);
3287  SDValue In = Op.getOperand(0);
3288  EVT InVT = In.getValueType();
3289  unsigned Opc = Op.getOpcode();
3290  bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3291 
3292  if (VT.isScalableVector()) {
3293  if (InVT.getVectorElementType() == MVT::i1) {
3294  // We can't directly extend an SVE predicate; extend it first.
3295  unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3296  EVT CastVT = getPromotedVTForPredicate(InVT);
3297  In = DAG.getNode(CastOpc, dl, CastVT, In);
3298  return DAG.getNode(Opc, dl, VT, In);
3299  }
3300 
3301  unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3302  : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
3303  return LowerToPredicatedOp(Op, DAG, Opcode);
3304  }
3305 
3306  uint64_t VTSize = VT.getFixedSizeInBits();
3307  uint64_t InVTSize = InVT.getFixedSizeInBits();
3308  if (VTSize < InVTSize) {
3309  MVT CastVT =
3310  MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
3311  InVT.getVectorNumElements());
3312  In = DAG.getNode(Opc, dl, CastVT, In);
3313  return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
3314  }
3315 
3316  if (VTSize > InVTSize) {
3317  unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3318  EVT CastVT = VT.changeVectorElementTypeToInteger();
3319  In = DAG.getNode(CastOpc, dl, CastVT, In);
3320  return DAG.getNode(Opc, dl, VT, In);
3321  }
3322 
3323  return Op;
3324 }
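// Illustrative examples (not from the original source): v2i64 -> v2f32
// converts in the wider type (to v2f64) and then narrows with FP_ROUND,
// while v4i16 -> v4f32 sign- or zero-extends to v4i32 before converting,
// keeping the element counts equal on both sides.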
3325 
3326 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
3327  SelectionDAG &DAG) const {
3328  if (Op.getValueType().isVector())
3329  return LowerVectorINT_TO_FP(Op, DAG);
3330 
3331  bool IsStrict = Op->isStrictFPOpcode();
3332  SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3333 
3334  // f16 conversions are promoted to f32 when full fp16 is not supported.
3335  if (Op.getValueType() == MVT::f16 &&
3336  !Subtarget->hasFullFP16()) {
3337  assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
3338  SDLoc dl(Op);
3339  return DAG.getNode(
3340  ISD::FP_ROUND, dl, MVT::f16,
3341  DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
3342  DAG.getIntPtrConstant(0, dl));
3343  }
3344 
3345  // i128 conversions are libcalls.
3346  if (SrcVal.getValueType() == MVT::i128)
3347  return SDValue();
3348 
3349  // Other conversions are legal, unless it's to the completely software-based
3350  // fp128.
3351  if (Op.getValueType() != MVT::f128)
3352  return Op;
3353  return SDValue();
3354 }
3355 
3356 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
3357  SelectionDAG &DAG) const {
3358  // For iOS, we want to call an alternative entry point: __sincos_stret,
3359  // which returns the values in two S / D registers.
3360  SDLoc dl(Op);
3361  SDValue Arg = Op.getOperand(0);
3362  EVT ArgVT = Arg.getValueType();
3363  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
3364 
3365  ArgListTy Args;
3366  ArgListEntry Entry;
3367 
3368  Entry.Node = Arg;
3369  Entry.Ty = ArgTy;
3370  Entry.IsSExt = false;
3371  Entry.IsZExt = false;
3372  Args.push_back(Entry);
3373 
3374  RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
3375  : RTLIB::SINCOS_STRET_F32;
3376  const char *LibcallName = getLibcallName(LC);
3377  SDValue Callee =
3378  DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
3379 
3380  StructType *RetTy = StructType::get(ArgTy, ArgTy);
3381  TargetLowering::CallLoweringInfo CLI(DAG);
3382  CLI.setDebugLoc(dl)
3383  .setChain(DAG.getEntryNode())
3384  .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
3385 
3386  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3387  return CallResult.first;
3388 }
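// A C-level sketch of the libcall contract assumed above (illustrative; the
// exact prototypes are an assumption about the Darwin runtime, not taken
// from this file):
//   struct SinCosF32 { float Sin, Cos; };
//   SinCosF32 __sincosf_stret(float X);    // RTLIB::SINCOS_STRET_F32
//   struct SinCosF64 { double Sin, Cos; };
//   SinCosF64 __sincos_stret(double X);    // RTLIB::SINCOS_STRET_F64
// The pair comes back in the first two S/D registers, matching the
// StructType::get(ArgTy, ArgTy) return type built for the call.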
3389 
3390 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
3391  EVT OpVT = Op.getValueType();
3392  if (OpVT != MVT::f16 && OpVT != MVT::bf16)
3393  return SDValue();
3394 
3395  assert(Op.getOperand(0).getValueType() == MVT::i16);
3396  SDLoc DL(Op);
3397 
3398  Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
3399  Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
3400  return SDValue(
3401  DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
3402  DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
3403  0);
3404 }
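// Summary of the trick above (descriptive comment, not from the original
// source): there is no direct i16 -> f16/bf16 bitcast instruction, so the
// i16 is any-extended to i32, bitcast to f32 (a GPR -> FPR move), and the
// low 16 bits are then read back through the "hsub" subregister, which
// preserves the original bit pattern.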
3405 
3406 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
3407  if (OrigVT.getSizeInBits() >= 64)
3408  return OrigVT;
3409 
3410  assert(OrigVT.isSimple() && "Expecting a simple value type");
3411 
3412  MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
3413  switch (OrigSimpleTy) {
3414  default: llvm_unreachable("Unexpected Vector Type");
3415  case MVT::v2i8:
3416  case MVT::v2i16:
3417  return MVT::v2i32;
3418  case MVT::v4i8:
3419  return MVT::v4i16;
3420  }
3421 }