ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
145static cl::opt<unsigned>
146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
163static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
164 SelectionDAG &DAG, const SDLoc &DL) {
165 assert(Arg.ArgVT.isScalarInteger());
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
169 DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
170 MVT::i32, Trunc);
171 return Ext;
172}
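// Illustrative sketch of the effect: for a CMSE non-secure call returning an
// i8, the 32-bit value coming back from the callee is truncated to i8 and
// then sign- or zero-extended (per the argument flags) back to i32 on the
// caller side, so the caller never trusts the callee's top 24 bits.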
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
296
297 // No native support for these.
307
308 // Vector reductions
318
319 if (!HasMVEFP) {
324 } else {
327 }
328
329 // Pre and Post inc are supported on loads and stores
330 for (unsigned im = (unsigned)ISD::PRE_INC;
331 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
336 }
337 }
338
339 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
340 for (auto VT : FloatTypes) {
341 addRegisterClass(VT, &ARM::MQPRRegClass);
342 if (!HasMVEFP)
343 setAllExpand(VT);
344
345 // These are legal or custom whether we have MVE.fp or not
358
359 // Pre and Post inc are supported on loads and stores
360 for (unsigned im = (unsigned)ISD::PRE_INC;
361 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
366 }
367
368 if (HasMVEFP) {
381
382 // No native support for these.
397 }
398 }
399
400 // Custom-expand smaller-than-legal vector reductions to prevent false zero
401 // items from being added.
410
411 // We 'support' these types up to bitcast/load/store level, regardless of
412 // MVE integer-only / float support. Only FP data processing on the FP
413 // vector types is inhibited at the integer-only level.
414 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
415 for (auto VT : LongTypes) {
416 addRegisterClass(VT, &ARM::MQPRRegClass);
417 setAllExpand(VT);
423 }
425
426 // We can do bitwise operations on v2i64 vectors
427 setOperationAction(ISD::AND, MVT::v2i64, Legal);
428 setOperationAction(ISD::OR, MVT::v2i64, Legal);
429 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
430
431 // It is legal to extload from v4i8 to v4i16 or v4i32.
432 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
433 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
435
436 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
442
443 // Some truncating stores are legal too.
444 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
445 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
446 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
447
448 // Pre and Post inc on these are legal, given the correct extends
449 for (unsigned im = (unsigned)ISD::PRE_INC;
450 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
451 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
456 }
457 }
458
459 // Predicate types
460 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
461 for (auto VT : pTypes) {
462 addRegisterClass(VT, &ARM::VCCRRegClass);
477
478 if (!HasMVEFP) {
483 }
484 }
488 setOperationAction(ISD::OR, MVT::v2i1, Expand);
494
503}
504
506 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
507}
508
510 const ARMSubtarget &STI)
511 : TargetLowering(TM_, STI), Subtarget(&STI),
512 RegInfo(Subtarget->getRegisterInfo()),
513 Itins(Subtarget->getInstrItineraryData()) {
514 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
515
518
519 const Triple &TT = TM.getTargetTriple();
520
521 if (Subtarget->isThumb1Only())
522 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
523 else
524 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
525
526 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
527 Subtarget->hasFPRegs()) {
528 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
529 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
530
535
536 if (!Subtarget->hasVFP2Base()) {
537 setAllExpand(MVT::f32);
538 } else {
541 setOperationAction(Op, MVT::f32, Legal);
542 }
543 if (!Subtarget->hasFP64()) {
544 setAllExpand(MVT::f64);
545 } else {
548 setOperationAction(Op, MVT::f64, Legal);
549
551 }
552 }
553
554 if (Subtarget->hasFullFP16()) {
557 setOperationAction(Op, MVT::f16, Legal);
558
559 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
562
567 }
568
569 if (Subtarget->hasBF16()) {
570 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
571 setAllExpand(MVT::bf16);
572 if (!Subtarget->hasFullFP16())
574 } else {
579 }
580
582 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
583 setTruncStoreAction(VT, InnerVT, Expand);
584 addAllExtLoads(VT, InnerVT, Expand);
585 }
586
589
591 }
592
593 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
595
596 if (!Subtarget->hasV8_1MMainlineOps())
598
599 if (!Subtarget->isThumb1Only())
601
604
607
608 if (Subtarget->hasMVEIntegerOps())
609 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
610
611 // Combine low-overhead loop intrinsics so that we can lower i1 types.
612 if (Subtarget->hasLOB()) {
614 }
615
616 if (Subtarget->hasNEON()) {
617 addDRTypeForNEON(MVT::v2f32);
618 addDRTypeForNEON(MVT::v8i8);
619 addDRTypeForNEON(MVT::v4i16);
620 addDRTypeForNEON(MVT::v2i32);
621 addDRTypeForNEON(MVT::v1i64);
622
623 addQRTypeForNEON(MVT::v4f32);
624 addQRTypeForNEON(MVT::v2f64);
625 addQRTypeForNEON(MVT::v16i8);
626 addQRTypeForNEON(MVT::v8i16);
627 addQRTypeForNEON(MVT::v4i32);
628 addQRTypeForNEON(MVT::v2i64);
629
630 if (Subtarget->hasFullFP16()) {
631 addQRTypeForNEON(MVT::v8f16);
632 addDRTypeForNEON(MVT::v4f16);
633 }
634
635 if (Subtarget->hasBF16()) {
636 addQRTypeForNEON(MVT::v8bf16);
637 addDRTypeForNEON(MVT::v4bf16);
638 }
639 }
640
641 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
642 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
643 // none of Neon, MVE or VFP supports any arithmetic operations on it.
644 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
645 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
646 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
647 // FIXME: Code duplication: FDIV and FREM are expanded always, see
648 // ARMTargetLowering::addTypeForNEON method for details.
649 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
650 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
651 // FIXME: Create unittest.
652 // In other words, find a way when "copysign" appears in a DAG with vector
653 // operands.
655 // FIXME: Code duplication: SETCC has custom operation action, see
656 // ARMTargetLowering::addTypeForNEON method for details.
658 // FIXME: Create unittest for FNEG and for FABS.
659 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
660 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
662 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
663 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
664 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
665 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
666 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
669 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
678 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
679 }
680
681 if (Subtarget->hasNEON()) {
682 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
683 // natively supported for v4f32.
685 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
686 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
687 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
688 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
689 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
692 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
701
702 // Mark v2f32 intrinsics.
704 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
705 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
706 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
707 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
708 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
711 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
720
723 setOperationAction(Op, MVT::v4f16, Expand);
724 setOperationAction(Op, MVT::v8f16, Expand);
725 }
726
727 // Neon does not support some operations on v1i64 and v2i64 types.
728 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
729 // Custom handling for some quad-vector types to detect VMULL.
730 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
731 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
732 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
733 // Custom handling for some vector types to avoid expensive expansions
734 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
736 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
738 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
739 // a destination type that is wider than the source, nor does
740 // it have a FP_TO_[SU]INT instruction with a narrower destination than
741 // source.
750
753
754 // NEON does not have single instruction CTPOP for vectors with element
755 // types wider than 8-bits. However, custom lowering can leverage the
756 // v8i8/v16i8 vcnt instruction.
763
764 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
765 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
766
767 // NEON does not have single instruction CTTZ for vectors.
769 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
770 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
771 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
772
773 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
774 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
775 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
777
782
787
791 }
792
793 // NEON only has FMA instructions as of VFP4.
794 if (!Subtarget->hasVFP4Base()) {
795 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
796 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
797 }
798
801
802 // It is legal to extload from v4i8 to v4i16 or v4i32.
803 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
804 MVT::v2i32}) {
809 }
810 }
811
812 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
813 MVT::v4i32}) {
818 }
819 }
820
821 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
828 }
829 if (Subtarget->hasMVEIntegerOps()) {
832 ISD::SETCC});
833 }
834 if (Subtarget->hasMVEFloatOps()) {
836 }
837
838 if (!Subtarget->hasFP64()) {
839 // When targeting a floating-point unit with only single-precision
840 // operations, f64 is legal for the few double-precision instructions which
841 // are present. However, no double-precision operations other than moves,
842 // loads and stores are provided by the hardware.
879 }
880
883
884 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
887 if (Subtarget->hasFullFP16()) {
890 }
891 } else {
893 }
894
895 if (!Subtarget->hasFP16()) {
898 } else {
901 }
902
903 computeRegisterProperties(Subtarget->getRegisterInfo());
904
905 // ARM does not have floating-point extending loads.
906 for (MVT VT : MVT::fp_valuetypes()) {
907 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
908 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
910 }
911
912 // ... or truncating stores
913 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
914 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
915 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
916 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
917 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
918
919 // ARM does not have i1 sign extending load.
920 for (MVT VT : MVT::integer_valuetypes())
922
923 // ARM supports all 4 flavors of integer indexed load / store.
924 if (!Subtarget->isThumb1Only()) {
925 for (unsigned im = (unsigned)ISD::PRE_INC;
927 setIndexedLoadAction(im, MVT::i1, Legal);
928 setIndexedLoadAction(im, MVT::i8, Legal);
929 setIndexedLoadAction(im, MVT::i16, Legal);
930 setIndexedLoadAction(im, MVT::i32, Legal);
931 setIndexedStoreAction(im, MVT::i1, Legal);
932 setIndexedStoreAction(im, MVT::i8, Legal);
933 setIndexedStoreAction(im, MVT::i16, Legal);
934 setIndexedStoreAction(im, MVT::i32, Legal);
935 }
936 } else {
937 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
940 }
941
946
949 if (Subtarget->hasDSP()) {
958 }
959 if (Subtarget->hasBaseDSP()) {
962 }
963
964 // i64 operation support.
967 if (Subtarget->isThumb1Only()) {
970 }
971 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
972 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
974
984
985 // MVE lowers 64-bit shifts to lsll and lsrl,
986 // assuming that ISD::SRL and SRA of i64 are already marked custom.
987 if (Subtarget->hasMVEIntegerOps())
989
990 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
991 if (Subtarget->isThumb1Only()) {
995 }
996
997 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
999
1000 // ARM does not have ROTL.
1005 }
1007 // TODO: These two should be set to LibCall, but this currently breaks
1008 // the Linux kernel build. See #101786.
1011 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1014 }
1015
1016 // @llvm.readcyclecounter requires the Performance Monitors extension.
1017 // Default to the 0 expansion on unsupported platforms.
1018 // FIXME: Technically there are older ARM CPUs that have
1019 // implementation-specific ways of obtaining this information.
1020 if (Subtarget->hasPerfMon())
1022
1023 // Only ARMv6 has BSWAP.
1024 if (!Subtarget->hasV6Ops())
1026
1027 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1028 : Subtarget->hasDivideInARMMode();
1029 if (!hasDivide) {
1030 // These are expanded into libcalls if the cpu doesn't have HW divider.
1033 }
1034
1035 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1038
1041 }
1042
1045
1046 // Register based DivRem for AEABI (RTABI 4.2)
1047 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1048 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1051 HasStandaloneRem = false;
1052
1057 } else {
1060 }
1061
1066
1067 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1069
1070 // Use the default implementation.
1072 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1074 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1077
1078 if (TT.isOSWindows())
1080 else
1082
1083 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1084 // the default expansion.
1085 InsertFencesForAtomic = false;
1086 if (Subtarget->hasAnyDataBarrier() &&
1087 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1088 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1089 // to ldrex/strex loops already.
1091 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1093
1094 // On v8, we have particularly efficient implementations of atomic fences
1095 // if they can be combined with nearby atomic loads and stores.
1096 if (!Subtarget->hasAcquireRelease() ||
1097 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1098 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1099 InsertFencesForAtomic = true;
1100 }
1101 } else {
1102 // If there's anything we can use as a barrier, go through custom lowering
1103 // for ATOMIC_FENCE.
1104 // If target has DMB in thumb, Fences can be inserted.
1105 if (Subtarget->hasDataBarrier())
1106 InsertFencesForAtomic = true;
1107
1109 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1110
1111 // Set them all for libcall, which will force libcalls.
1124 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1125 // Unordered/Monotonic case.
1126 if (!InsertFencesForAtomic) {
1129 }
1130 }
1131
1132 // Compute supported atomic widths.
1133 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1134 // For targets where __sync_* routines are reliably available, we use them
1135 // if necessary.
1136 //
1137 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1138 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1139 //
1140 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1141 // such targets should provide __sync_* routines, which use the ARM mode
1142 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1143 // encoding; see ARMISD::MEMBARRIER_MCR.)
1145 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1146 Subtarget->hasForced32BitAtomics()) {
1147 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1149 } else {
1150 // We can't assume anything about other targets; just use libatomic
1151 // routines.
1153 }
1154
1156
1158
1159 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1160 if (!Subtarget->hasV6Ops()) {
1163 }
1165
1166 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1167 !Subtarget->isThumb1Only()) {
1168 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1169 // iff target supports vfp2.
1179 }
1180
1181 // We want to custom lower some of our intrinsics.
1186
1196 if (Subtarget->hasFullFP16()) {
1200 }
1201
1203
1206 if (Subtarget->hasFullFP16())
1210 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1211
1212 // We don't support sin/cos/fmod/copysign/pow
1221 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1222 !Subtarget->isThumb1Only()) {
1225 }
1228
1229 if (!Subtarget->hasVFP4Base()) {
1232 }
1233
1234 // Various VFP goodness
1235 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1236 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1237 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1242 }
1243
1244 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1245 if (!Subtarget->hasFP16()) {
1250 }
1251
1252 // Strict floating-point comparisons need custom lowering.
1259 }
1260
1263
1264 // FP-ARMv8 implements a lot of rounding-like FP operations.
1265 if (Subtarget->hasFPARMv8Base()) {
1266 for (auto Op :
1273 setOperationAction(Op, MVT::f32, Legal);
1274
1275 if (Subtarget->hasFP64())
1276 setOperationAction(Op, MVT::f64, Legal);
1277 }
1278
1279 if (Subtarget->hasNEON()) {
1284 }
1285 }
1286
1287 // FP16 operations often need to be promoted to call library functions.
1288 // clang-format off
1289 if (Subtarget->hasFullFP16()) {
1293
1294 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1308 setOperationAction(Op, MVT::f16, Promote);
1309 }
1310
1311 // Round-to-integer ops need custom lowering for fp16, as Promote doesn't work
1312 // because the result type is integer.
1314 setOperationAction(Op, MVT::f16, Custom);
1315
1321 setOperationAction(Op, MVT::f16, Legal);
1322 }
1323 // clang-format on
1324 }
1325
1326 if (Subtarget->hasNEON()) {
1327 // vmin and vmax aren't available in a scalar form, so we can use
1328 // a NEON instruction with an undef lane instead.
1337
1338 if (Subtarget->hasV8Ops()) {
1339 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1340 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1341 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1342 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1345 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1346 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1347 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1348 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1349 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1350 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1351 }
1352
1353 if (Subtarget->hasFullFP16()) {
1358
1363
1364 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1365 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1366 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1367 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1370 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1371 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1372 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1373 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1374 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1375 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1376 }
1377 }
1378
1379 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1380 // it, but it's just a wrapper around ldexp.
1381 if (TT.isOSWindows()) {
1383 if (isOperationExpand(Op, MVT::f32))
1384 setOperationAction(Op, MVT::f32, Promote);
1385 }
1386
1387 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1388 // isn't legal.
1390 if (isOperationExpand(Op, MVT::f16))
1391 setOperationAction(Op, MVT::f16, Promote);
1392
1393 // We have target-specific dag combine patterns for the following nodes:
1394 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1397
1398 if (Subtarget->hasMVEIntegerOps())
1400
1401 if (Subtarget->hasV6Ops())
1403 if (Subtarget->isThumb1Only())
1405 // Attempt to lower smin/smax to ssat/usat
1406 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1407 Subtarget->isThumb2()) {
1409 }
1410
1412
1413 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1414 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1416 else
1418
1419 //// temporary - rewrite interface to use type
1422 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1424 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1426
1427 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1428 // are at least 4 bytes aligned.
1430
1431 // Prefer likely predicted branches to selects on out-of-order cores.
1432 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1433
1434 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1436 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1437
1438 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1439
1440 IsStrictFPEnabled = true;
1441}
1442
1444 return Subtarget->useSoftFloat();
1445}
1446
1448 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1449}
1450
1451// FIXME: It might make sense to define the representative register class as the
1452// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1453// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1454// SPR's representative would be DPR_VFP2. This should work well if register
1455// pressure tracking were modified such that a register use would increment the
1456// pressure of the register class's representative and all of its super
1457// classes' representatives transitively. We have not implemented this because
1458// of the difficulty prior to coalescing of modeling operand register classes
1459// due to the common occurrence of cross class copies and subregister insertions
1460// and extractions.
1461std::pair<const TargetRegisterClass *, uint8_t>
1463 MVT VT) const {
1464 const TargetRegisterClass *RRC = nullptr;
1465 uint8_t Cost = 1;
1466 switch (VT.SimpleTy) {
1467 default:
1469 // Use DPR as representative register class for all floating point
1470 // and vector types. Since there are 32 SPR registers and 32 DPR registers so
1471 // the cost is 1 for both f32 and f64.
1472 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1473 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1474 RRC = &ARM::DPRRegClass;
1475 // When NEON is used for SP, only half of the register file is available
1476 // because operations that define both SP and DP results will be constrained
1477 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1478 // coalescing by double-counting the SP regs. See the FIXME above.
1479 if (Subtarget->useNEONForSinglePrecisionFP())
1480 Cost = 2;
1481 break;
1482 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1483 case MVT::v4f32: case MVT::v2f64:
1484 RRC = &ARM::DPRRegClass;
1485 Cost = 2;
1486 break;
1487 case MVT::v4i64:
1488 RRC = &ARM::DPRRegClass;
1489 Cost = 4;
1490 break;
1491 case MVT::v8i64:
1492 RRC = &ARM::DPRRegClass;
1493 Cost = 8;
1494 break;
1495 }
1496 return std::make_pair(RRC, Cost);
1497}
1498
1500 EVT VT) const {
1501 if (!VT.isVector())
1502 return getPointerTy(DL);
1503
1504 // MVE has a predicate register.
1505 if ((Subtarget->hasMVEIntegerOps() &&
1506 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1507 VT == MVT::v16i8)) ||
1508 (Subtarget->hasMVEFloatOps() &&
1509 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1510 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1512}
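// For illustration: with MVE integer ops enabled, a SETCC on two v4i32
// vectors produces a v4i1 predicate (held in VCCR), while scalar compares
// still produce the pointer-sized integer type returned above.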
1513
1514/// getRegClassFor - Return the register class that should be used for the
1515/// specified value type.
1516const TargetRegisterClass *
1517ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1518 (void)isDivergent;
1519 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1520 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1521 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1522 // MVE Q registers.
1523 if (Subtarget->hasNEON()) {
1524 if (VT == MVT::v4i64)
1525 return &ARM::QQPRRegClass;
1526 if (VT == MVT::v8i64)
1527 return &ARM::QQQQPRRegClass;
1528 }
1529 if (Subtarget->hasMVEIntegerOps()) {
1530 if (VT == MVT::v4i64)
1531 return &ARM::MQQPRRegClass;
1532 if (VT == MVT::v8i64)
1533 return &ARM::MQQQQPRRegClass;
1534 }
1536}
1537
1538// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1539// source/dest is aligned and the copy size is large enough. We therefore want
1540// to align such objects passed to memory intrinsics.
1542 Align &PrefAlign) const {
1543 if (!isa<MemIntrinsic>(CI))
1544 return false;
1545 MinSize = 8;
1546 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1547 // cycle faster than 4-byte aligned LDM.
1548 PrefAlign =
1549 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1550 return true;
1551}
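// Rough illustration of the intent: a moderately sized memcpy on an ARM11
// class (non-M) core is expected to become LDM/STM sequences, and 8-byte
// aligned LDM is typically a cycle faster, so IR passes are told to prefer
// 8-byte alignment for pointer arguments of memory intrinsics of at least
// MinSize bytes.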
1552
1553// Create a fast isel object.
1554FastISel *
1556 const TargetLibraryInfo *libInfo) const {
1557 return ARM::createFastISel(funcInfo, libInfo);
1558}
1559
1561 unsigned NumVals = N->getNumValues();
1562 if (!NumVals)
1563 return Sched::RegPressure;
1564
1565 for (unsigned i = 0; i != NumVals; ++i) {
1566 EVT VT = N->getValueType(i);
1567 if (VT == MVT::Glue || VT == MVT::Other)
1568 continue;
1569 if (VT.isFloatingPoint() || VT.isVector())
1570 return Sched::ILP;
1571 }
1572
1573 if (!N->isMachineOpcode())
1574 return Sched::RegPressure;
1575
1576 // Loads are scheduled for latency even if the instruction itinerary
1577 // is not available.
1578 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1579 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1580
1581 if (MCID.getNumDefs() == 0)
1582 return Sched::RegPressure;
1583 if (!Itins->isEmpty() &&
1584 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1585 return Sched::ILP;
1586
1587 return Sched::RegPressure;
1588}
1589
1590//===----------------------------------------------------------------------===//
1591// Lowering Code
1592//===----------------------------------------------------------------------===//
1593
1594static bool isSRL16(const SDValue &Op) {
1595 if (Op.getOpcode() != ISD::SRL)
1596 return false;
1597 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1598 return Const->getZExtValue() == 16;
1599 return false;
1600}
1601
1602static bool isSRA16(const SDValue &Op) {
1603 if (Op.getOpcode() != ISD::SRA)
1604 return false;
1605 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1606 return Const->getZExtValue() == 16;
1607 return false;
1608}
1609
1610static bool isSHL16(const SDValue &Op) {
1611 if (Op.getOpcode() != ISD::SHL)
1612 return false;
1613 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1614 return Const->getZExtValue() == 16;
1615 return false;
1616}
1617
1618// Check for a signed 16-bit value. We special-case SRA because it keeps
1619// things simpler when also looking for SRAs that aren't sign-extending a
1620// smaller value. Without the check, we'd need to take extra care with
1621// checking order for some operations.
1622static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1623 if (isSRA16(Op))
1624 return isSHL16(Op.getOperand(0));
1625 return DAG.ComputeNumSignBits(Op) == 17;
1626}
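// For example, (sra (shl X, 16), 16) is accepted via the SRA/SHL special
// case, and any other value for which ComputeNumSignBits reports 17 sign
// bits (i.e. it fits in the low 16 bits as a signed quantity) also counts.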
1627
1628/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1630 switch (CC) {
1631 default: llvm_unreachable("Unknown condition code!");
1632 case ISD::SETNE: return ARMCC::NE;
1633 case ISD::SETEQ: return ARMCC::EQ;
1634 case ISD::SETGT: return ARMCC::GT;
1635 case ISD::SETGE: return ARMCC::GE;
1636 case ISD::SETLT: return ARMCC::LT;
1637 case ISD::SETLE: return ARMCC::LE;
1638 case ISD::SETUGT: return ARMCC::HI;
1639 case ISD::SETUGE: return ARMCC::HS;
1640 case ISD::SETULT: return ARMCC::LO;
1641 case ISD::SETULE: return ARMCC::LS;
1642 }
1643}
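// As a quick sanity check on the table above (assuming the usual ISD and
// ARMCC enumerations):
//
//   IntCCToARMCC(ISD::SETULT) == ARMCC::LO  // unsigned lower (C clear)
//   IntCCToARMCC(ISD::SETGE)  == ARMCC::GE  // signed greater-or-equal
//
// i.e. unsigned comparisons map to the carry-based conditions and signed
// comparisons to the N/V-based ones.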
1644
1645/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1647 ARMCC::CondCodes &CondCode2) {
1648 CondCode2 = ARMCC::AL;
1649 switch (CC) {
1650 default: llvm_unreachable("Unknown FP condition!");
1651 case ISD::SETEQ:
1652 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1653 case ISD::SETGT:
1654 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1655 case ISD::SETGE:
1656 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1657 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1658 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1659 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1660 case ISD::SETO: CondCode = ARMCC::VC; break;
1661 case ISD::SETUO: CondCode = ARMCC::VS; break;
1662 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1663 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1664 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1665 case ISD::SETLT:
1666 case ISD::SETULT: CondCode = ARMCC::LT; break;
1667 case ISD::SETLE:
1668 case ISD::SETULE: CondCode = ARMCC::LE; break;
1669 case ISD::SETNE:
1670 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1671 }
1672}
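// When CondCode2 != ARMCC::AL the caller must test both conditions. For
// example SETONE ("ordered and not equal") comes back as MI with a second
// code of GT, so a branch on that predicate is emitted as two conditional
// branches: take it if MI, otherwise take it if GT. This is how FP
// relations that no single ARM condition code can express get lowered.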
1673
1674//===----------------------------------------------------------------------===//
1675// Calling Convention Implementation
1676//===----------------------------------------------------------------------===//
1677
1678/// getEffectiveCallingConv - Get the effective calling convention, taking into
1679/// account presence of floating point hardware and calling convention
1680/// limitations, such as support for variadic functions.
1682ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1683 bool isVarArg) const {
1684 switch (CC) {
1685 default:
1686 report_fatal_error("Unsupported calling convention");
1689 case CallingConv::GHC:
1691 return CC;
1697 case CallingConv::Swift:
1700 case CallingConv::C:
1701 case CallingConv::Tail:
1702 if (!getTM().isAAPCS_ABI())
1703 return CallingConv::ARM_APCS;
1704 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1705 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1706 !isVarArg)
1708 else
1710 case CallingConv::Fast:
1712 if (!getTM().isAAPCS_ABI()) {
1713 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1714 return CallingConv::Fast;
1715 return CallingConv::ARM_APCS;
1716 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1717 !isVarArg)
1719 else
1721 }
1722}
1723
1725 bool isVarArg) const {
1726 return CCAssignFnForNode(CC, false, isVarArg);
1727}
1728
1730 bool isVarArg) const {
1731 return CCAssignFnForNode(CC, true, isVarArg);
1732}
1733
1734/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1735/// CallingConvention.
1736CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1737 bool Return,
1738 bool isVarArg) const {
1739 switch (getEffectiveCallingConv(CC, isVarArg)) {
1740 default:
1741 report_fatal_error("Unsupported calling convention");
1743 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1745 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1747 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1748 case CallingConv::Fast:
1749 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1750 case CallingConv::GHC:
1751 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1753 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1755 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1757 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1758 }
1759}
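// Putting the two routines above together: a plain C call on a hard-float
// AAPCS target (FP registers present, not Thumb1, not variadic) would be
// expected to resolve to CC_ARM_AAPCS_VFP / RetCC_ARM_AAPCS_VFP, while the
// same call under the soft-float ABI falls back to CC_ARM_AAPCS.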
1760
1761SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1762 MVT LocVT, MVT ValVT, SDValue Val) const {
1763 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1764 Val);
1765 if (Subtarget->hasFullFP16()) {
1766 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1767 } else {
1768 Val = DAG.getNode(ISD::TRUNCATE, dl,
1769 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1770 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1771 }
1772 return Val;
1773}
1774
1775SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1776 MVT LocVT, MVT ValVT,
1777 SDValue Val) const {
1778 if (Subtarget->hasFullFP16()) {
1779 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1780 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1781 } else {
1782 Val = DAG.getNode(ISD::BITCAST, dl,
1783 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1784 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1785 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1786 }
1787 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1788}
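// Taken together, MoveToHPR / MoveFromHPR implement the half-precision
// passing rule used below: an f16 (or bf16) value travels in the low 16
// bits of a 32-bit location (i32 for the soft-float ABI, f32 for the
// hard-float ABI), using VMOVhr/VMOVrh when FullFP16 is available and
// bitcasts plus truncate/zero-extend otherwise.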
1789
1790/// LowerCallResult - Lower the result values of a call into the
1791/// appropriate copies out of appropriate physical registers.
1792SDValue ARMTargetLowering::LowerCallResult(
1793 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1796 SDValue ThisVal, bool isCmseNSCall) const {
1797 // Assign locations to each value returned by this call.
1799 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1800 *DAG.getContext());
1801 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1802
1803 // Copy all of the result registers out of their specified physreg.
1804 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1805 CCValAssign VA = RVLocs[i];
1806
1807 // Pass 'this' value directly from the argument to return value, to avoid
1808 // reg unit interference
1809 if (i == 0 && isThisReturn) {
1810 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1811 "unexpected return calling convention register assignment");
1812 InVals.push_back(ThisVal);
1813 continue;
1814 }
1815
1816 SDValue Val;
1817 if (VA.needsCustom() &&
1818 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1819 // Handle f64 or half of a v2f64.
1820 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1821 InGlue);
1822 Chain = Lo.getValue(1);
1823 InGlue = Lo.getValue(2);
1824 VA = RVLocs[++i]; // skip ahead to next loc
1825 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1826 InGlue);
1827 Chain = Hi.getValue(1);
1828 InGlue = Hi.getValue(2);
1829 if (!Subtarget->isLittle())
1830 std::swap (Lo, Hi);
1831 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1832
1833 if (VA.getLocVT() == MVT::v2f64) {
1834 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1835 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1836 DAG.getConstant(0, dl, MVT::i32));
1837
1838 VA = RVLocs[++i]; // skip ahead to next loc
1839 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1840 Chain = Lo.getValue(1);
1841 InGlue = Lo.getValue(2);
1842 VA = RVLocs[++i]; // skip ahead to next loc
1843 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1844 Chain = Hi.getValue(1);
1845 InGlue = Hi.getValue(2);
1846 if (!Subtarget->isLittle())
1847 std::swap (Lo, Hi);
1848 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1849 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1850 DAG.getConstant(1, dl, MVT::i32));
1851 }
1852 } else {
1853 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1854 InGlue);
1855 Chain = Val.getValue(1);
1856 InGlue = Val.getValue(2);
1857 }
1858
1859 switch (VA.getLocInfo()) {
1860 default: llvm_unreachable("Unknown loc info!");
1861 case CCValAssign::Full: break;
1862 case CCValAssign::BCvt:
1863 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1864 break;
1865 }
1866
1867 // f16 arguments have their size extended to 4 bytes and passed as if they
1868 // had been copied to the LSBs of a 32-bit register.
1869 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
1870 if (VA.needsCustom() &&
1871 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1872 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1873
1874 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1875 // is less than 32 bits must be sign- or zero-extended after the call for
1876 // security reasons. Although the ABI mandates an extension done by the
1877 // callee, the latter cannot be trusted to follow the rules of the ABI.
1878 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1879 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1880 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1881 Val = handleCMSEValue(Val, Arg, DAG, dl);
1882
1883 InVals.push_back(Val);
1884 }
1885
1886 return Chain;
1887}
1888
1889std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1890 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1891 bool IsTailCall, int SPDiff) const {
1892 SDValue DstAddr;
1893 MachinePointerInfo DstInfo;
1894 int32_t Offset = VA.getLocMemOffset();
1895 MachineFunction &MF = DAG.getMachineFunction();
1896
1897 if (IsTailCall) {
1898 Offset += SPDiff;
1899 auto PtrVT = getPointerTy(DAG.getDataLayout());
1900 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1901 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1902 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1903 DstInfo =
1905 } else {
1906 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1907 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1908 StackPtr, PtrOff);
1909 DstInfo =
1911 }
1912
1913 return std::make_pair(DstAddr, DstInfo);
1914}
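// Note the asymmetry above: for ordinary calls the destination address is
// simply SP plus the assigned offset, whereas for tail calls a fixed frame
// object is created at the SPDiff-adjusted offset so the store is known to
// land in the caller's own incoming-argument area.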
1915
1916// Returns the type of copying which is required to set up a byval argument to
1917// a tail-called function. This isn't needed for non-tail calls, because they
1918// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1919// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1920// optimised to zero copies when forwarding an argument from the caller's
1921// caller (NoCopy).
1922ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1923 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1924 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1925 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1926
1927 // Globals are always safe to copy from.
1929 return CopyOnce;
1930
1931 // Can only analyse frame index nodes, conservatively assume we need a
1932 // temporary.
1933 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1934 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1935 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1936 return CopyViaTemp;
1937
1938 int SrcFI = SrcFrameIdxNode->getIndex();
1939 int DstFI = DstFrameIdxNode->getIndex();
1940 assert(MFI.isFixedObjectIndex(DstFI) &&
1941 "byval passed in non-fixed stack slot");
1942
1943 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1944 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1945
1946 // If the source is in the local frame, then the copy to the argument memory
1947 // is always valid.
1948 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1949 if (!FixedSrc ||
1950 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1951 return CopyOnce;
1952
1953 // In the case of byval arguments split between registers and the stack,
1954 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1955 // stack portion, but the Src SDValue will refer to the full value, including
1956 // the local stack memory that the register portion gets stored into. We only
1957 // need to compare them for equality, so normalise on the full value version.
1958 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1959 DstOffset -= RegSize;
1960
1961 // If the value is already in the correct location, then no copying is
1962 // needed. If not, then we need to copy via a temporary.
1963 if (SrcOffset == DstOffset)
1964 return NoCopy;
1965 else
1966 return CopyViaTemp;
1967}
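// A sketch of when CopyViaTemp matters: if a tail call forwards two of the
// caller's incoming byval arguments but at swapped offsets, writing one
// straight into the outgoing argument area could clobber bytes still needed
// to materialise the other, so such arguments are staged through a local
// temporary first.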
1968
1969void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1970 SDValue Chain, SDValue &Arg,
1971 RegsToPassVector &RegsToPass,
1972 CCValAssign &VA, CCValAssign &NextVA,
1973 SDValue &StackPtr,
1974 SmallVectorImpl<SDValue> &MemOpChains,
1975 bool IsTailCall,
1976 int SPDiff) const {
1977 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1978 DAG.getVTList(MVT::i32, MVT::i32), Arg);
1979 unsigned id = Subtarget->isLittle() ? 0 : 1;
1980 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1981
1982 if (NextVA.isRegLoc())
1983 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1984 else {
1985 assert(NextVA.isMemLoc());
1986 if (!StackPtr.getNode())
1987 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1989
1990 SDValue DstAddr;
1991 MachinePointerInfo DstInfo;
1992 std::tie(DstAddr, DstInfo) =
1993 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
1994 MemOpChains.push_back(
1995 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
1996 }
1997}
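// In effect, an f64 argument assigned to core registers is split by
// VMOVRRD into two i32 halves; which half lands in the first register
// depends on endianness (hence the isLittle() check above), and when only
// one register remains the other half is stored to the stack slot computed
// by computeAddrForCallArg.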
1998
1999static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2000 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2002}
2003
2004/// LowerCall - Lowering a call into a callseq_start <-
2005/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2006/// nodes.
2007SDValue
2008ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2009 SmallVectorImpl<SDValue> &InVals) const {
2010 SelectionDAG &DAG = CLI.DAG;
2011 SDLoc &dl = CLI.DL;
2012 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2013 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2014 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2015 SDValue Chain = CLI.Chain;
2016 SDValue Callee = CLI.Callee;
2017 bool &isTailCall = CLI.IsTailCall;
2018 CallingConv::ID CallConv = CLI.CallConv;
2019 bool doesNotRet = CLI.DoesNotReturn;
2020 bool isVarArg = CLI.IsVarArg;
2021 const CallBase *CB = CLI.CB;
2022
2023 MachineFunction &MF = DAG.getMachineFunction();
2024 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2025 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2026 MachineFunction::CallSiteInfo CSInfo;
2027 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2028 bool isThisReturn = false;
2029 bool isCmseNSCall = false;
2030 bool isSibCall = false;
2031 bool PreferIndirect = false;
2032 bool GuardWithBTI = false;
2033
2034 // Analyze operands of the call, assigning locations to each operand.
2036 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2037 *DAG.getContext());
2038 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2039
2040 // Lower 'returns_twice' calls to a pseudo-instruction.
2041 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2042 !Subtarget->noBTIAtReturnTwice())
2043 GuardWithBTI = AFI->branchTargetEnforcement();
2044
2045 // Set type id for call site info.
2046 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2047 CSInfo = MachineFunction::CallSiteInfo(*CB);
2048
2049 // Determine whether this is a non-secure function call.
2050 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2051 isCmseNSCall = true;
2052
2053 // Disable tail calls if they're not supported.
2054 if (!Subtarget->supportsTailCall())
2055 isTailCall = false;
2056
2057 // For both the non-secure calls and the returns from a CMSE entry function,
2058 // the function needs to do some extra work after the call, or before the
2059 // return, respectively, so it cannot end with a tail call.
2060 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2061 isTailCall = false;
2062
2063 if (isa<GlobalAddressSDNode>(Callee)) {
2064 // If we're optimizing for minimum size and the function is called three or
2065 // more times in this block, we can improve codesize by calling indirectly
2066 // as BLXr has a 16-bit encoding.
2067 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2068 if (CLI.CB) {
2069 auto *BB = CLI.CB->getParent();
2070 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2071 count_if(GV->users(), [&BB](const User *U) {
2072 return isa<Instruction>(U) &&
2073 cast<Instruction>(U)->getParent() == BB;
2074 }) > 2;
2075 }
2076 }
2077 if (isTailCall) {
2078 // Check if it's really possible to do a tail call.
2079 isTailCall =
2080 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2081
2082 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2083 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2084 isSibCall = true;
2085
2086 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2087 // detected sibcalls.
2088 if (isTailCall)
2089 ++NumTailCalls;
2090 }
2091
2092 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2093 report_fatal_error("failed to perform tail call elimination on a call "
2094 "site marked musttail");
2095
2096 // Get a count of how many bytes are to be pushed on the stack.
2097 unsigned NumBytes = CCInfo.getStackSize();
2098
2099 // SPDiff is the byte offset of the call's argument area from the callee's.
2100 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2101 // by this amount for a tail call. In a sibling call it must be 0 because the
2102 // caller will deallocate the entire stack and the callee still expects its
2103 // arguments to begin at SP+0. Completely unused for non-tail calls.
2104 int SPDiff = 0;
2105
2106 if (isTailCall && !isSibCall) {
2107 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2108 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2109
2110 // Since callee will pop argument stack as a tail call, we must keep the
2111 // popped size 16-byte aligned.
2112 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2113 assert(StackAlign && "data layout string is missing stack alignment");
2114 NumBytes = alignTo(NumBytes, *StackAlign);
2115
2116 // SPDiff will be negative if this tail call requires more space than we
2117 // would automatically have in our incoming argument space. Positive if we
2118 // can actually shrink the stack.
2119 SPDiff = NumReusableBytes - NumBytes;
2120
2121 // If this call requires more stack than we have available from
2122 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2123 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2124 AFI->setArgRegsSaveSize(-SPDiff);
2125 }
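  // Worked example with illustrative numbers: if the caller's incoming
  // argument area is 16 bytes but this tail call needs 32 bytes of stack
  // arguments, SPDiff is 16 - 32 = -16, and 16 extra bytes are reserved via
  // setArgRegsSaveSize so FrameLowering leaves room for the outgoing stores.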
2126
2127 if (isSibCall) {
2128 // For sibling tail calls, memory operands are available in our caller's stack.
2129 NumBytes = 0;
2130 } else {
2131 // Adjust the stack pointer for the new arguments...
2132 // These operations are automatically eliminated by the prolog/epilog pass
2133 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2134 }
2135
2137 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2138
2139 RegsToPassVector RegsToPass;
2140 SmallVector<SDValue, 8> MemOpChains;
2141
2142 // If we are doing a tail-call, any byval arguments will be written to stack
2143 // space which was used for incoming arguments. If any of the values being used
2144 // are incoming byval arguments to this function, then they might be
2145 // overwritten by the stores of the outgoing arguments. To avoid this, we
2146 // need to make a temporary copy of them in local stack space, then copy back
2147 // to the argument area.
2148 DenseMap<unsigned, SDValue> ByValTemporaries;
2149 SDValue ByValTempChain;
2150 if (isTailCall) {
2151 SmallVector<SDValue, 8> ByValCopyChains;
2152 for (const CCValAssign &VA : ArgLocs) {
2153 unsigned ArgIdx = VA.getValNo();
2154 SDValue Src = OutVals[ArgIdx];
2155 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2156
2157 if (!Flags.isByVal())
2158 continue;
2159
2160 SDValue Dst;
2161 MachinePointerInfo DstInfo;
2162 std::tie(Dst, DstInfo) =
2163 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2164 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2165
2166 if (Copy == NoCopy) {
2167 // If the argument is already at the correct offset on the stack
2168 // (because we are forwarding a byval argument from our caller), we
2169 // don't need any copying.
2170 continue;
2171 } else if (Copy == CopyOnce) {
2172 // If the argument is in our local stack frame, no other argument
2173 // preparation can clobber it, so we can copy it to the final location
2174 // later.
2175 ByValTemporaries[ArgIdx] = Src;
2176 } else {
2177 assert(Copy == CopyViaTemp && "unexpected enum value");
2178 // If we might be copying this argument from the outgoing argument
2179 // stack area, we need to copy via a temporary in the local stack
2180 // frame.
2181 int TempFrameIdx = MFI.CreateStackObject(
2182 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2183 SDValue Temp =
2184 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2185
2186 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2187 SDValue AlignNode =
2188 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2189
2190 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2191 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2192 ByValCopyChains.push_back(
2193 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2194 ByValTemporaries[ArgIdx] = Temp;
2195 }
2196 }
2197 if (!ByValCopyChains.empty())
2198 ByValTempChain =
2199 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2200 }
2201
2202 // During a tail call, stores to the argument area must happen after all of
2203 // the function's incoming arguments have been loaded because they may alias.
2204 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2205 // there's no point in doing so repeatedly so this tracks whether that's
2206 // happened yet.
2207 bool AfterFormalArgLoads = false;
2208
2209 // Walk the register/memloc assignments, inserting copies/loads. In the case
2210 // of tail call optimization, arguments are handled later.
2211 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2212 i != e;
2213 ++i, ++realArgIdx) {
2214 CCValAssign &VA = ArgLocs[i];
2215 SDValue Arg = OutVals[realArgIdx];
2216 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2217 bool isByVal = Flags.isByVal();
2218
2219 // Promote the value if needed.
2220 switch (VA.getLocInfo()) {
2221 default: llvm_unreachable("Unknown loc info!");
2222 case CCValAssign::Full: break;
2223 case CCValAssign::SExt:
2224 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2225 break;
2226 case CCValAssign::ZExt:
2227 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::AExt:
2230 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2231 break;
2232 case CCValAssign::BCvt:
2233 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2234 break;
2235 }
2236
2237 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2238 Chain = DAG.getStackArgumentTokenFactor(Chain);
2239 if (ByValTempChain) {
2240 // In case of large byval copies, re-using the stackframe for tail-calls
2241 // can lead to overwriting incoming arguments on the stack. Force
2242 // loading these stack arguments before the copy to avoid that.
2243 SmallVector<SDValue, 8> IncomingLoad;
2244 for (unsigned I = 0; I < OutVals.size(); ++I) {
2245 if (Outs[I].Flags.isByVal())
2246 continue;
2247
2248 SDValue OutVal = OutVals[I];
2249 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2250 if (!OutLN)
2251 continue;
2252
2253 FrameIndexSDNode *FIN =
2254 dyn_cast_or_null<FrameIndexSDNode>(OutLN->getBasePtr());
2255 if (!FIN)
2256 continue;
2257
2258 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2259 continue;
2260
2261 for (const CCValAssign &VA : ArgLocs) {
2262 if (VA.isMemLoc())
2263 IncomingLoad.push_back(OutVal.getValue(1));
2264 }
2265 }
2266
2267 // Update the chain to force loads for potentially clobbered argument
2268 // loads to happen before the byval copy.
2269 if (!IncomingLoad.empty()) {
2270 IncomingLoad.push_back(Chain);
2271 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2272 }
2273
2274 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2275 ByValTempChain);
2276 }
2277 AfterFormalArgLoads = true;
2278 }
2279
2280 // f16 arguments have their size extended to 4 bytes and passed as if they
2281 // had been copied to the LSBs of a 32-bit register.
2282 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2283 if (VA.needsCustom() &&
2284 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2285 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2286 } else {
2287 // f16 arguments could have been extended prior to argument lowering.
2288 // Mask these arguments if this is a CMSE nonsecure call.
2289 auto ArgVT = Outs[realArgIdx].ArgVT;
2290 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2291 auto LocBits = VA.getLocVT().getSizeInBits();
2292 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2293 SDValue Mask =
2294 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2295 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2296 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2297 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2298 }
2299 }
2300
2301 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2302 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2303 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2304 DAG.getConstant(0, dl, MVT::i32));
2305 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2306 DAG.getConstant(1, dl, MVT::i32));
2307
2308 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2309 StackPtr, MemOpChains, isTailCall, SPDiff);
2310
2311 VA = ArgLocs[++i]; // skip ahead to next loc
2312 if (VA.isRegLoc()) {
2313 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2314 StackPtr, MemOpChains, isTailCall, SPDiff);
2315 } else {
2316 assert(VA.isMemLoc());
2317 SDValue DstAddr;
2318 MachinePointerInfo DstInfo;
2319 std::tie(DstAddr, DstInfo) =
2320 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2321 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2322 }
2323 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2324 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2325 StackPtr, MemOpChains, isTailCall, SPDiff);
2326 } else if (VA.isRegLoc()) {
2327 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2328 Outs[0].VT == MVT::i32) {
2329 assert(VA.getLocVT() == MVT::i32 &&
2330 "unexpected calling convention register assignment");
2331 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2332 "unexpected use of 'returned'");
2333 isThisReturn = true;
2334 }
2335 const TargetOptions &Options = DAG.getTarget().Options;
2336 if (Options.EmitCallSiteInfo)
2337 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2338 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2339 } else if (isByVal) {
2340 assert(VA.isMemLoc());
2341 unsigned offset = 0;
2342
2343 // True if this byval aggregate will be split between registers
2344 // and memory.
2345 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2346 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2347
2348 SDValue ByValSrc;
2349 bool NeedsStackCopy;
2350 if (auto It = ByValTemporaries.find(realArgIdx);
2351 It != ByValTemporaries.end()) {
2352 ByValSrc = It->second;
2353 NeedsStackCopy = true;
2354 } else {
2355 ByValSrc = Arg;
2356 NeedsStackCopy = !isTailCall;
2357 }
2358
2359 // If part of the argument is in registers, load them.
2360 if (CurByValIdx < ByValArgsCount) {
2361 unsigned RegBegin, RegEnd;
2362 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2363
2364 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2365 unsigned int i, j;
2366 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2367 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2368 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2369 SDValue Load =
2370 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2371 DAG.InferPtrAlign(AddArg));
2372 MemOpChains.push_back(Load.getValue(1));
2373 RegsToPass.push_back(std::make_pair(j, Load));
2374 }
2375
2376 // If the parameter size exceeds the register area, the "offset" value
2377 // helps us calculate the stack slot for the remaining part properly.
2378 offset = RegEnd - RegBegin;
2379
2380 CCInfo.nextInRegsParam();
2381 }
2382
2383 // If the memory part of the argument isn't already in the correct place
2384 // (which can happen with tail calls), copy it into the argument area.
2385 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2386 auto PtrVT = getPointerTy(DAG.getDataLayout());
2387 SDValue Dst;
2388 MachinePointerInfo DstInfo;
2389 std::tie(Dst, DstInfo) =
2390 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2391 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2392 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2393 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2394 MVT::i32);
2395 SDValue AlignNode =
2396 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2397
2398 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2399 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2400 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2401 Ops));
2402 }
2403 } else {
2404 assert(VA.isMemLoc());
2405 SDValue DstAddr;
2406 MachinePointerInfo DstInfo;
2407 std::tie(DstAddr, DstInfo) =
2408 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2409
2410 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2411 MemOpChains.push_back(Store);
2412 }
2413 }
2414
2415 if (!MemOpChains.empty())
2416 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2417
2418 // Build a sequence of copy-to-reg nodes chained together with token chain
2419 // and flag operands which copy the outgoing args into the appropriate regs.
2420 SDValue InGlue;
2421 for (const auto &[Reg, N] : RegsToPass) {
2422 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2423 InGlue = Chain.getValue(1);
2424 }
2425
2426 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2427 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2428 // node so that legalize doesn't hack it.
2429 bool isDirect = false;
2430
2431 const TargetMachine &TM = getTargetMachine();
2432 const GlobalValue *GVal = nullptr;
2433 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2434 GVal = G->getGlobal();
2435 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2436
2437 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2438 bool isLocalARMFunc = false;
2439 auto PtrVt = getPointerTy(DAG.getDataLayout());
2440
2441 if (Subtarget->genLongCalls()) {
2442 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2443 "long-calls codegen is not position independent!");
2444 // Handle a global address or an external symbol. If it's not one of
2445 // those, the target's already in a register, so we don't need to do
2446 // anything extra.
2447 if (isa<GlobalAddressSDNode>(Callee)) {
2448 if (Subtarget->genExecuteOnly()) {
2449 if (Subtarget->useMovt())
2450 ++NumMovwMovt;
2451 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2452 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2453 } else {
2454 // Create a constant pool entry for the callee address
2455 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2456 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2457 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2458
2459 // Get the address of the callee into a register
2460 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2461 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2462 Callee = DAG.getLoad(
2463 PtrVt, dl, DAG.getEntryNode(), Addr,
2464 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2465 }
2466 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2467 const char *Sym = S->getSymbol();
2468
2469 if (Subtarget->genExecuteOnly()) {
2470 if (Subtarget->useMovt())
2471 ++NumMovwMovt;
2472 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2473 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2474 } else {
2475 // Create a constant pool entry for the callee address
2476 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2477 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2478 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2479
2480 // Get the address of the callee into a register
2481 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2482 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2483 Callee = DAG.getLoad(
2484 PtrVt, dl, DAG.getEntryNode(), Addr,
2485 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2486 }
2487 }
2488 } else if (isa<GlobalAddressSDNode>(Callee)) {
2489 if (!PreferIndirect) {
2490 isDirect = true;
2491 bool isDef = GVal->isStrongDefinitionForLinker();
2492
2493 // ARM call to a local ARM function is predicable.
2494 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2495 // tBX takes a register source operand.
2496 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2497 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2498 Callee = DAG.getNode(
2499 ARMISD::WrapperPIC, dl, PtrVt,
2500 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2501 Callee = DAG.getLoad(
2502 PtrVt, dl, DAG.getEntryNode(), Callee,
2503 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2504 MachineMemOperand::MODereferenceable |
2505 MachineMemOperand::MOInvariant);
2506 } else if (Subtarget->isTargetCOFF()) {
2507 assert(Subtarget->isTargetWindows() &&
2508 "Windows is the only supported COFF target");
2509 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2510 if (GVal->hasDLLImportStorageClass())
2511 TargetFlags = ARMII::MO_DLLIMPORT;
2512 else if (!TM.shouldAssumeDSOLocal(GVal))
2513 TargetFlags = ARMII::MO_COFFSTUB;
2514 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2515 TargetFlags);
2516 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2517 Callee =
2518 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2519 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2520 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2521 } else {
2522 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2523 }
2524 }
2525 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2526 isDirect = true;
2527 // tBX takes a register source operand.
2528 const char *Sym = S->getSymbol();
2529 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2530 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2531 ARMConstantPoolValue *CPV =
2532 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2533 ARMPCLabelIndex, 4);
2534 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2535 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2536 Callee = DAG.getLoad(
2537 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2538 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2539 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2540 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2541 } else {
2542 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2543 }
2544 }
2545
2546 if (isCmseNSCall) {
2547 assert(!isARMFunc && !isDirect &&
2548 "Cannot handle call to ARM function or direct call");
2549 if (NumBytes > 0) {
2550 DAG.getContext()->diagnose(
2551 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2552 "call to non-secure function would require "
2553 "passing arguments on stack",
2554 dl.getDebugLoc()));
2555 }
2556 if (isStructRet) {
2557 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2558 DAG.getMachineFunction().getFunction(),
2559 "call to non-secure function would return value through pointer",
2560 dl.getDebugLoc()));
2561 }
2562 }
2563
2564 // FIXME: handle tail calls differently.
2565 unsigned CallOpc;
2566 if (Subtarget->isThumb()) {
2567 if (GuardWithBTI)
2568 CallOpc = ARMISD::t2CALL_BTI;
2569 else if (isCmseNSCall)
2570 CallOpc = ARMISD::tSECALL;
2571 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2572 CallOpc = ARMISD::CALL_NOLINK;
2573 else
2574 CallOpc = ARMISD::CALL;
2575 } else {
2576 if (!isDirect && !Subtarget->hasV5TOps())
2577 CallOpc = ARMISD::CALL_NOLINK;
2578 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2579 // Emit regular call when code size is the priority
2580 !Subtarget->hasMinSize())
2581 // "mov lr, pc; b _foo" to avoid confusing the return stack predictor (RSP)
2582 CallOpc = ARMISD::CALL_NOLINK;
2583 else
2584 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2585 }
2586
2587 // We don't usually want to end the call-sequence here because we would tidy
2588 // the frame up *after* the call, however in the ABI-changing tail-call case
2589 // we've carefully laid out the parameters so that when sp is reset they'll be
2590 // in the correct location.
2591 if (isTailCall && !isSibCall) {
2592 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2593 InGlue = Chain.getValue(1);
2594 }
2595
2596 std::vector<SDValue> Ops;
2597 Ops.push_back(Chain);
2598 Ops.push_back(Callee);
2599
2600 if (isTailCall) {
2601 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2602 }
2603
2604 // Add argument registers to the end of the list so that they are known live
2605 // into the call.
2606 for (const auto &[Reg, N] : RegsToPass)
2607 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2608
2609 // Add a register mask operand representing the call-preserved registers.
2610 const uint32_t *Mask;
2611 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2612 if (isThisReturn) {
2613 // For 'this' returns, use the R0-preserving mask if applicable
2614 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2615 if (!Mask) {
2616 // Set isThisReturn to false if the calling convention is not one that
2617 // allows 'returned' to be modeled in this way, so LowerCallResult does
2618 // not try to pass 'this' straight through
2619 isThisReturn = false;
2620 Mask = ARI->getCallPreservedMask(MF, CallConv);
2621 }
2622 } else
2623 Mask = ARI->getCallPreservedMask(MF, CallConv);
2624
2625 assert(Mask && "Missing call preserved mask for calling convention");
2626 Ops.push_back(DAG.getRegisterMask(Mask));
2627
2628 if (InGlue.getNode())
2629 Ops.push_back(InGlue);
2630
2631 if (isTailCall) {
2632 MF.getFrameInfo().setHasTailCall();
2633 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2634 if (CLI.CFIType)
2635 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2636 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2637 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2638 return Ret;
2639 }
2640
2641 // Returns a chain and a flag for retval copy to use.
2642 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2643 if (CLI.CFIType)
2644 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2645 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2646 InGlue = Chain.getValue(1);
2647 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2648
2649 // If we're guaranteeing tail-calls will be honoured, the callee must
2650 // pop its own argument stack on return. But this call is *not* a tail call so
2651 // we need to undo that after it returns to restore the status-quo.
2652 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2653 uint64_t CalleePopBytes =
2654 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2655
2656 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2657 if (!Ins.empty())
2658 InGlue = Chain.getValue(1);
2659
2660 // Handle result values, copying them out of physregs into vregs that we
2661 // return.
2662 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2663 InVals, isThisReturn,
2664 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2665}
2666
2667/// HandleByVal - Every parameter *after* a byval parameter is passed
2668/// on the stack. Remember the next parameter register to allocate,
2669/// and then confiscate the rest of the parameter registers to ensure
2670/// this.
2671void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2672 Align Alignment) const {
2673 // Byval (as with any stack) slots are always at least 4 byte aligned.
2674 Alignment = std::max(Alignment, Align(4));
2675
2676 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2677 if (!Reg)
2678 return;
2679
2680 unsigned AlignInRegs = Alignment.value() / 4;
2681 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2682 for (unsigned i = 0; i < Waste; ++i)
2683 Reg = State->AllocateReg(GPRArgRegs);
2684
2685 if (!Reg)
2686 return;
2687
2688 unsigned Excess = 4 * (ARM::R4 - Reg);
2689
2690 // Special case when NSAA != SP and the parameter size is greater than the
2691 // size of all remaining GPR regs. In that case we can't split the parameter;
2692 // we must send it to the stack. We also must set NCRN to R4, so we waste all
2693 // remaining registers.
2694 const unsigned NSAAOffset = State->getStackSize();
2695 if (NSAAOffset != 0 && Size > Excess) {
2696 while (State->AllocateReg(GPRArgRegs))
2697 ;
2698 return;
2699 }
2700
2701 // The first register for the byval parameter is the first register that
2702 // wasn't allocated before this method call, so it would be "reg".
2703 // If the parameter is small enough to be saved in the range [reg, r4), then
2704 // the end (one past the last) register would be reg + param-size-in-regs;
2705 // otherwise the parameter is split between registers and the stack, and
2706 // the end register is r4 in that case.
2707 unsigned ByValRegBegin = Reg;
2708 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2709 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2710 // Note: the first register was already allocated at the beginning of this
2711 // function, so allocate only the remaining registers we need.
2712 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2713 State->AllocateReg(GPRArgRegs);
2714 // A byval parameter that is split between registers and memory needs its
2715 // size truncated here.
2716 // In the case where the entire structure fits in registers, we set the
2717 // size in memory to zero.
2718 Size = std::max<int>(Size - Excess, 0);
2719}
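// Worked example: an 8-byte byval with 8-byte alignment when r1 is the next
// free GPR. AlignInRegs = 2, so r1 is wasted and the argument starts at r2;
// Excess = 4 * (R4 - r2) = 8, the parameter is assigned r2-r3, and Size is
// truncated to 0 because nothing spills to the stack.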
2720
2721/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2722/// for tail call optimization. Targets which want to do tail call
2723/// optimization should implement this function. Note that this function also
2724/// processes musttail calls, so when this function returns false on a valid
2725/// musttail call, a fatal backend error occurs.
2726bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2727 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2728 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2729 CallingConv::ID CalleeCC = CLI.CallConv;
2730 SDValue Callee = CLI.Callee;
2731 bool isVarArg = CLI.IsVarArg;
2732 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2733 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2734 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2735 const SelectionDAG &DAG = CLI.DAG;
2736 MachineFunction &MF = DAG.getMachineFunction();
2737 const Function &CallerF = MF.getFunction();
2738 CallingConv::ID CallerCC = CallerF.getCallingConv();
2739
2740 assert(Subtarget->supportsTailCall());
2741
2742 // Indirect tail-calls require a register to hold the target address. That
2743 // register must be:
2744 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2745 // * Not callee-saved, so must be one of r0-r3 or r12.
2746 // * Not used to hold an argument to the tail-called function, which might be
2747 // in r0-r3.
2748 // * Not used to hold the return address authentication code, which is in r12
2749 // if enabled.
2750 // Sometimes, no register matches all of these conditions, so we can't do a
2751 // tail-call.
2752 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2753 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2754 ARM::R3};
2755 if (!(Subtarget->isThumb1Only() ||
2756 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2757 AddressRegisters.insert(ARM::R12);
2758 for (const CCValAssign &AL : ArgLocs)
2759 if (AL.isRegLoc())
2760 AddressRegisters.erase(AL.getLocReg());
2761 if (AddressRegisters.empty()) {
2762 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2763 return false;
2764 }
2765 }
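// For example: an indirect tail call in Thumb1 code that passes four integer
// arguments has r0-r3 all holding arguments and cannot use r12, so no
// register is left for the target address and the tail call is rejected.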
2766
2767 // Look for obvious safe cases to perform tail call optimization that do not
2768 // require ABI changes. This is what gcc calls sibcall.
2769
2770 // Exception-handling functions need a special set of instructions to indicate
2771 // a return to the hardware. Tail-calling another function would probably
2772 // break this.
2773 if (CallerF.hasFnAttribute("interrupt")) {
2774 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2775 return false;
2776 }
2777
2778 if (canGuaranteeTCO(CalleeCC,
2779 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2780 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2781 << " (guaranteed tail-call CC)\n");
2782 return CalleeCC == CallerCC;
2783 }
2784
2785 // Also avoid sibcall optimization if either caller or callee uses struct
2786 // return semantics.
2787 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2788 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2789 if (isCalleeStructRet != isCallerStructRet) {
2790 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2791 return false;
2792 }
2793
2794 // Externally-defined functions with weak linkage should not be
2795 // tail-called on ARM when the OS does not support dynamic
2796 // pre-emption of symbols, as the AAELF spec requires normal calls
2797 // to undefined weak functions to be replaced with a NOP or jump to the
2798 // next instruction. The behaviour of branch instructions in this
2799 // situation (as used for tail calls) is implementation-defined, so we
2800 // cannot rely on the linker replacing the tail call with a return.
2801 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2802 const GlobalValue *GV = G->getGlobal();
2803 const Triple &TT = getTargetMachine().getTargetTriple();
2804 if (GV->hasExternalWeakLinkage() &&
2805 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2806 TT.isOSBinFormatMachO())) {
2807 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2808 return false;
2809 }
2810 }
2811
2812 // Check that the call results are passed in the same way.
2813 LLVMContext &C = *DAG.getContext();
2814 if (!CCState::resultsCompatible(
2815 getEffectiveCallingConv(CalleeCC, isVarArg),
2816 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2817 CCAssignFnForReturn(CalleeCC, isVarArg),
2818 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2819 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2820 return false;
2821 }
2822 // The callee has to preserve all registers the caller needs to preserve.
2823 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2824 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2825 if (CalleeCC != CallerCC) {
2826 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2827 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2828 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2829 return false;
2830 }
2831 }
2832
2833 // If Caller's vararg argument has been split between registers and stack, do
2834 // not perform tail call, since part of the argument is in caller's local
2835 // frame.
2836 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2837 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2838 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2839 return false;
2840 }
2841
2842 // If the callee takes no arguments then go on to check the results of the
2843 // call.
2844 const MachineRegisterInfo &MRI = MF.getRegInfo();
2845 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2846 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2847 return false;
2848 }
2849
2850 // If the stack arguments for this call do not fit into our own save area then
2851 // the call cannot be made tail.
2852 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2853 return false;
2854
2855 LLVM_DEBUG(dbgs() << "true\n");
2856 return true;
2857}
2858
2859bool
2860ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2861 MachineFunction &MF, bool isVarArg,
2862 const SmallVectorImpl<ISD::OutputArg> &Outs,
2863 LLVMContext &Context, const Type *RetTy) const {
2864 SmallVector<CCValAssign, 16> RVLocs;
2865 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2866 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2867}
2868
2869static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2870 const SDLoc &DL, SelectionDAG &DAG) {
2871 const MachineFunction &MF = DAG.getMachineFunction();
2872 const Function &F = MF.getFunction();
2873
2874 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2875
2876 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2877 // version of the "preferred return address". These offsets affect the return
2878 // instruction if this is a return from PL1 without hypervisor extensions.
2879 // IRQ/FIQ: +4 "subs pc, lr, #4"
2880 // SWI: 0 "subs pc, lr, #0"
2881 // ABORT: +4 "subs pc, lr, #4"
2882 // UNDEF: +4/+2 "subs pc, lr, #0"
2883 // UNDEF varies depending on where the exception came from ARM or Thumb
2884 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2885
2886 int64_t LROffset;
2887 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2888 IntKind == "ABORT")
2889 LROffset = 4;
2890 else if (IntKind == "SWI" || IntKind == "UNDEF")
2891 LROffset = 0;
2892 else
2893 report_fatal_error("Unsupported interrupt attribute. If present, value "
2894 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2895
2896 RetOps.insert(RetOps.begin() + 1,
2897 DAG.getConstant(LROffset, DL, MVT::i32, false));
2898
2899 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2900}
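// For example: a handler with the "interrupt"="IRQ" attribute returns with
// "subs pc, lr, #4", undoing the +4 adjustment applied to LR on exception
// entry, while an "SWI" handler returns with "subs pc, lr, #0".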
2901
2902SDValue
2903ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2904 bool isVarArg,
2905 const SmallVectorImpl<ISD::OutputArg> &Outs,
2906 const SmallVectorImpl<SDValue> &OutVals,
2907 const SDLoc &dl, SelectionDAG &DAG) const {
2908 // CCValAssign - represent the assignment of the return value to a location.
2909 SmallVector<CCValAssign, 16> RVLocs;
2910
2911 // CCState - Info about the registers and stack slots.
2912 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2913 *DAG.getContext());
2914
2915 // Analyze outgoing return values.
2916 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2917
2918 SDValue Glue;
2919 SmallVector<SDValue, 4> RetOps;
2920 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2921 bool isLittleEndian = Subtarget->isLittle();
2922
2923 MachineFunction &MF = DAG.getMachineFunction();
2924 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2925 AFI->setReturnRegsCount(RVLocs.size());
2926
2927 // Report error if cmse entry function returns structure through first ptr arg.
2928 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2929 // Note: using an empty SDLoc(), as the first line of the function is a
2930 // better place to report than the last line.
2931 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2932 DAG.getMachineFunction().getFunction(),
2933 "secure entry function would return value through pointer",
2934 SDLoc().getDebugLoc()));
2935 }
2936
2937 // Copy the result values into the output registers.
2938 for (unsigned i = 0, realRVLocIdx = 0;
2939 i != RVLocs.size();
2940 ++i, ++realRVLocIdx) {
2941 CCValAssign &VA = RVLocs[i];
2942 assert(VA.isRegLoc() && "Can only return in registers!");
2943
2944 SDValue Arg = OutVals[realRVLocIdx];
2945 bool ReturnF16 = false;
2946
2947 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2948 // Half-precision return values can be returned like this:
2949 //
2950 // t11 f16 = fadd ...
2951 // t12: i16 = bitcast t11
2952 // t13: i32 = zero_extend t12
2953 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2954 //
2955 // to avoid code generation for bitcasts, we simply set Arg to the node
2956 // that produces the f16 value, t11 in this case.
2957 //
2958 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2959 SDValue ZE = Arg.getOperand(0);
2960 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2961 SDValue BC = ZE.getOperand(0);
2962 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2963 Arg = BC.getOperand(0);
2964 ReturnF16 = true;
2965 }
2966 }
2967 }
2968 }
2969
2970 switch (VA.getLocInfo()) {
2971 default: llvm_unreachable("Unknown loc info!");
2972 case CCValAssign::Full: break;
2973 case CCValAssign::BCvt:
2974 if (!ReturnF16)
2975 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2976 break;
2977 }
2978
2979 // Mask f16 arguments if this is a CMSE nonsecure entry.
2980 auto RetVT = Outs[realRVLocIdx].ArgVT;
2981 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2982 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2983 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2984 } else {
2985 auto LocBits = VA.getLocVT().getSizeInBits();
2986 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2987 SDValue Mask =
2988 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2989 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2990 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2991 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2992 }
2993 }
2994
2995 if (VA.needsCustom() &&
2996 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
2997 if (VA.getLocVT() == MVT::v2f64) {
2998 // Extract the first half and return it in two registers.
2999 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3000 DAG.getConstant(0, dl, MVT::i32));
3001 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3002 DAG.getVTList(MVT::i32, MVT::i32), Half);
3003
3004 Chain =
3005 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3006 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3007 Glue = Chain.getValue(1);
3008 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3009 VA = RVLocs[++i]; // skip ahead to next loc
3010 Chain =
3011 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3012 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3013 Glue = Chain.getValue(1);
3014 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3015 VA = RVLocs[++i]; // skip ahead to next loc
3016
3017 // Extract the 2nd half and fall through to handle it as an f64 value.
3018 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3019 DAG.getConstant(1, dl, MVT::i32));
3020 }
3021 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3022 // available.
3023 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3024 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3025 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3026 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3027 Glue = Chain.getValue(1);
3028 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3029 VA = RVLocs[++i]; // skip ahead to next loc
3030 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3031 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3032 } else
3033 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3034
3035 // Guarantee that all emitted copies are
3036 // stuck together, avoiding something bad.
3037 Glue = Chain.getValue(1);
3038 RetOps.push_back(DAG.getRegister(
3039 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3040 }
3041 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3042 const MCPhysReg *I =
3043 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3044 if (I) {
3045 for (; *I; ++I) {
3046 if (ARM::GPRRegClass.contains(*I))
3047 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3048 else if (ARM::DPRRegClass.contains(*I))
3049 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3050 else
3051 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3052 }
3053 }
3054
3055 // Update chain and glue.
3056 RetOps[0] = Chain;
3057 if (Glue.getNode())
3058 RetOps.push_back(Glue);
3059
3060 // CPUs which aren't M-class use a special sequence to return from
3061 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3062 // though we use "subs pc, lr, #N").
3063 //
3064 // M-class CPUs actually use a normal return sequence with a special
3065 // (hardware-provided) value in LR, so the normal code path works.
3066 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3067 !Subtarget->isMClass()) {
3068 if (Subtarget->isThumb1Only())
3069 report_fatal_error("interrupt attribute is not supported in Thumb1");
3070 return LowerInterruptReturn(RetOps, dl, DAG);
3071 }
3072
3073 unsigned RetNode =
3074 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3075 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3076}
3077
3078bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3079 if (N->getNumValues() != 1)
3080 return false;
3081 if (!N->hasNUsesOfValue(1, 0))
3082 return false;
3083
3084 SDValue TCChain = Chain;
3085 SDNode *Copy = *N->user_begin();
3086 if (Copy->getOpcode() == ISD::CopyToReg) {
3087 // If the copy has a glue operand, we conservatively assume it isn't safe to
3088 // perform a tail call.
3089 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3090 return false;
3091 TCChain = Copy->getOperand(0);
3092 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3093 SDNode *VMov = Copy;
3094 // f64 returned in a pair of GPRs.
3095 SmallPtrSet<SDNode*, 2> Copies;
3096 for (SDNode *U : VMov->users()) {
3097 if (U->getOpcode() != ISD::CopyToReg)
3098 return false;
3099 Copies.insert(U);
3100 }
3101 if (Copies.size() > 2)
3102 return false;
3103
3104 for (SDNode *U : VMov->users()) {
3105 SDValue UseChain = U->getOperand(0);
3106 if (Copies.count(UseChain.getNode()))
3107 // Second CopyToReg
3108 Copy = U;
3109 else {
3110 // We are at the top of this chain.
3111 // If the copy has a glue operand, we conservatively assume it
3112 // isn't safe to perform a tail call.
3113 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3114 return false;
3115 // First CopyToReg
3116 TCChain = UseChain;
3117 }
3118 }
3119 } else if (Copy->getOpcode() == ISD::BITCAST) {
3120 // f32 returned in a single GPR.
3121 if (!Copy->hasOneUse())
3122 return false;
3123 Copy = *Copy->user_begin();
3124 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3125 return false;
3126 // If the copy has a glue operand, we conservatively assume it isn't safe to
3127 // perform a tail call.
3128 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3129 return false;
3130 TCChain = Copy->getOperand(0);
3131 } else {
3132 return false;
3133 }
3134
3135 bool HasRet = false;
3136 for (const SDNode *U : Copy->users()) {
3137 if (U->getOpcode() != ARMISD::RET_GLUE &&
3138 U->getOpcode() != ARMISD::INTRET_GLUE)
3139 return false;
3140 HasRet = true;
3141 }
3142
3143 if (!HasRet)
3144 return false;
3145
3146 Chain = TCChain;
3147 return true;
3148}
3149
3150bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3151 if (!Subtarget->supportsTailCall())
3152 return false;
3153
3154 if (!CI->isTailCall())
3155 return false;
3156
3157 return true;
3158}
3159
3160// Trying to write a 64-bit value, so we need to split it into two 32-bit
3161// values first, and pass the low and high parts through.
3162static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3163 SDLoc DL(Op);
3164 SDValue WriteValue = Op->getOperand(2);
3165
3166 // This function is only supposed to be called for i64 type argument.
3167 assert(WriteValue.getValueType() == MVT::i64
3168 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3169
3170 SDValue Lo, Hi;
3171 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3172 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3173 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3174}
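// For example: writing the i64 value 0x0000000100000002 passes Lo = 0x2 and
// Hi = 0x1 as two separate i32 operands to the rebuilt WRITE_REGISTER node.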
3175
3176// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3177// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3178// one of the above mentioned nodes. It has to be wrapped because otherwise
3179// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3180// be used to form addressing mode. These wrapped nodes will be selected
3181// into MOVi.
3182SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3183 SelectionDAG &DAG) const {
3184 EVT PtrVT = Op.getValueType();
3185 // FIXME there is no actual debug info here
3186 SDLoc dl(Op);
3187 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3188 SDValue Res;
3189
3190 // When generating execute-only code Constant Pools must be promoted to the
3191 // global data section. It's a bit ugly that we can't share them across basic
3192 // blocks, but this way we guarantee that execute-only behaves correctly with
3193 // position-independent addressing modes.
3194 if (Subtarget->genExecuteOnly()) {
3195 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3196 auto *T = CP->getType();
3197 auto C = const_cast<Constant*>(CP->getConstVal());
3198 auto M = DAG.getMachineFunction().getFunction().getParent();
3199 auto GV = new GlobalVariable(
3200 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3201 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3202 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3203 Twine(AFI->createPICLabelUId())
3204 );
3205 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3206 dl, PtrVT);
3207 return LowerGlobalAddress(GA, DAG);
3208 }
3209
3210 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3211 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3212 Align CPAlign = CP->getAlign();
3213 if (Subtarget->isThumb1Only())
3214 CPAlign = std::max(CPAlign, Align(4));
3215 if (CP->isMachineConstantPoolEntry())
3216 Res =
3217 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3218 else
3219 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3220 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3221}
3222
3223unsigned ARMTargetLowering::getJumpTableEncoding() const {
3224 // If we don't have a 32-bit pc-relative branch instruction then the jump
3225 // table consists of block addresses. Usually this is inline, but for
3226 // execute-only it must be placed out-of-line.
3227 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3228 return MachineJumpTableInfo::EK_Inline;
3229 return TargetLoweringBase::getJumpTableEncoding();
3230}
3231
3232SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3233 SelectionDAG &DAG) const {
3234 MachineFunction &MF = DAG.getMachineFunction();
3235 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3236 unsigned ARMPCLabelIndex = 0;
3237 SDLoc DL(Op);
3238 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3239 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3240 SDValue CPAddr;
3241 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3242 if (!IsPositionIndependent) {
3243 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3244 } else {
3245 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3246 ARMPCLabelIndex = AFI->createPICLabelUId();
3247 ARMConstantPoolValue *CPV =
3248 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3249 ARMCP::CPBlockAddress, PCAdj);
3250 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3251 }
3252 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3253 SDValue Result = DAG.getLoad(
3254 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3255 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3256 if (!IsPositionIndependent)
3257 return Result;
3258 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3259 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3260}
3261
3262/// Convert a TLS address reference into the correct sequence of loads
3263/// and calls to compute the variable's address for Darwin, and return an
3264/// SDValue containing the final node.
3265
3266/// Darwin only has one TLS scheme which must be capable of dealing with the
3267/// fully general situation, in the worst case. This means:
3268/// + "extern __thread" declaration.
3269/// + Defined in a possibly unknown dynamic library.
3270///
3271/// The general system is that each __thread variable has a [3 x i32] descriptor
3272/// which contains information used by the runtime to calculate the address. The
3273/// only part of this the compiler needs to know about is the first word, which
3274/// contains a function pointer that must be called with the address of the
3275/// entire descriptor in "r0".
3276///
3277/// Since this descriptor may be in a different unit, in general access must
3278/// proceed along the usual ARM rules. A common sequence to produce is:
3279///
3280/// movw rT1, :lower16:_var$non_lazy_ptr
3281/// movt rT1, :upper16:_var$non_lazy_ptr
3282/// ldr r0, [rT1]
3283/// ldr rT2, [r0]
3284/// blx rT2
3285/// [...address now in r0...]
3286SDValue
3287ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3288 SelectionDAG &DAG) const {
3289 assert(Subtarget->isTargetDarwin() &&
3290 "This function expects a Darwin target");
3291 SDLoc DL(Op);
3292
3293 // First step is to get the address of the actual global symbol. This is where
3294 // the TLS descriptor lives.
3295 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3296
3297 // The first entry in the descriptor is a function pointer that we must call
3298 // to obtain the address of the variable.
3299 SDValue Chain = DAG.getEntryNode();
3300 SDValue FuncTLVGet = DAG.getLoad(
3301 MVT::i32, DL, Chain, DescAddr,
3302 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3303 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3304 MachineMemOperand::MOInvariant);
3305 Chain = FuncTLVGet.getValue(1);
3306
3307 MachineFunction &F = DAG.getMachineFunction();
3308 MachineFrameInfo &MFI = F.getFrameInfo();
3309 MFI.setAdjustsStack(true);
3310
3311 // TLS calls preserve all registers except those that absolutely must be
3312 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3313 // silly).
3314 auto TRI =
3315 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3316 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3317 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3318
3319 // Finally, we can make the call. This is just a degenerate version of a
3320 // normal ARM call node: r0 takes the address of the descriptor, and
3321 // returns the address of the variable in this thread.
3322 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3323 Chain =
3324 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3325 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3326 DAG.getRegisterMask(Mask), Chain.getValue(1));
3327 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3328}
3329
3330SDValue
3331ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3332 SelectionDAG &DAG) const {
3333 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3334
3335 SDValue Chain = DAG.getEntryNode();
3336 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3337 SDLoc DL(Op);
3338
3339 // Load the current TEB (thread environment block)
3340 SDValue Ops[] = {Chain,
3341 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3342 DAG.getTargetConstant(15, DL, MVT::i32),
3343 DAG.getTargetConstant(0, DL, MVT::i32),
3344 DAG.getTargetConstant(13, DL, MVT::i32),
3345 DAG.getTargetConstant(0, DL, MVT::i32),
3346 DAG.getTargetConstant(2, DL, MVT::i32)};
3347 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3348 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3349
3350 SDValue TEB = CurrentTEB.getValue(0);
3351 Chain = CurrentTEB.getValue(1);
3352
3353 // Load the ThreadLocalStoragePointer from the TEB
3354 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3355 SDValue TLSArray =
3356 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3357 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3358
3359 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3360 // offset into the TLSArray.
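// In other words: TLS = *(*(TEB + 0x2c) + _tls_index * 4), and the final
// address is TLS plus the variable's SECREL offset loaded from the constant
// pool below.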
3361
3362 // Load the TLS index from the C runtime
3363 SDValue TLSIndex =
3364 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3365 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3366 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3367
3368 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3369 DAG.getConstant(2, DL, MVT::i32));
3370 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3371 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3372 MachinePointerInfo());
3373
3374 // Get the offset of the start of the .tls section (section base)
3375 const auto *GA = cast<GlobalAddressSDNode>(Op);
3376 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3377 SDValue Offset = DAG.getLoad(
3378 PtrVT, DL, Chain,
3379 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3380 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3381 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3382
3383 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3384}
3385
3386// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3387SDValue
3388ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3389 SelectionDAG &DAG) const {
3390 SDLoc dl(GA);
3391 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3392 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3393 MachineFunction &MF = DAG.getMachineFunction();
3394 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3395 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3396 ARMConstantPoolValue *CPV =
3397 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3398 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3399 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3400 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3401 Argument = DAG.getLoad(
3402 PtrVT, dl, DAG.getEntryNode(), Argument,
3403 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3404 SDValue Chain = Argument.getValue(1);
3405
3406 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3407 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3408
3409 // call __tls_get_addr.
3410 ArgListTy Args;
3411 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3412
3413 // FIXME: is there useful debug info available here?
3414 TargetLowering::CallLoweringInfo CLI(DAG);
3415 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3416 CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3417 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3418
3419 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3420 return CallResult.first;
3421}
3422
3423// Lower ISD::GlobalTLSAddress using the "initial exec" or
3424// "local exec" model.
3425SDValue
3426ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3427 SelectionDAG &DAG,
3428 TLSModel::Model model) const {
3429 const GlobalValue *GV = GA->getGlobal();
3430 SDLoc dl(GA);
3431 SDValue Offset;
3432 SDValue Chain = DAG.getEntryNode();
3433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3434 // Get the Thread Pointer
3435 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3436
3437 if (model == TLSModel::InitialExec) {
3438 MachineFunction &MF = DAG.getMachineFunction();
3439 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3440 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3441 // Initial exec model.
3442 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3443 ARMConstantPoolValue *CPV =
3444 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3445 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3446 true);
3447 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3448 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3449 Offset = DAG.getLoad(
3450 PtrVT, dl, Chain, Offset,
3451 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3452 Chain = Offset.getValue(1);
3453
3454 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3455 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3456
3457 Offset = DAG.getLoad(
3458 PtrVT, dl, Chain, Offset,
3459 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3460 } else {
3461 // local exec model
3462 assert(model == TLSModel::LocalExec);
3463 ARMConstantPoolValue *CPV =
3464 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3465 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3466 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3467 Offset = DAG.getLoad(
3468 PtrVT, dl, Chain, Offset,
3469 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3470 }
3471
3472 // The address of the thread local variable is the add of the thread
3473 // pointer with the offset of the variable.
3474 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3475}
3476
3477SDValue
3478ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3479 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3480 if (DAG.getTarget().useEmulatedTLS())
3481 return LowerToTLSEmulatedModel(GA, DAG);
3482
3483 if (Subtarget->isTargetDarwin())
3484 return LowerGlobalTLSAddressDarwin(Op, DAG);
3485
3486 if (Subtarget->isTargetWindows())
3487 return LowerGlobalTLSAddressWindows(Op, DAG);
3488
3489 // TODO: implement the "local dynamic" model
3490 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3491 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3492
3493 switch (model) {
3494 case TLSModel::GeneralDynamic:
3495 case TLSModel::LocalDynamic:
3496 return LowerToTLSGeneralDynamicModel(GA, DAG);
3497 case TLSModel::InitialExec:
3498 case TLSModel::LocalExec:
3499 return LowerToTLSExecModels(GA, DAG, model);
3500 }
3501 llvm_unreachable("bogus TLS model");
3502}
3503
3504/// Return true if all users of V are within function F, looking through
3505/// ConstantExprs.
3506static bool allUsersAreInFunction(const Value *V, const Function *F) {
3507 SmallVector<const User*,4> Worklist(V->users());
3508 while (!Worklist.empty()) {
3509 auto *U = Worklist.pop_back_val();
3510 if (isa<ConstantExpr>(U)) {
3511 append_range(Worklist, U->users());
3512 continue;
3513 }
3514
3515 auto *I = dyn_cast<Instruction>(U);
3516 if (!I || I->getParent()->getParent() != F)
3517 return false;
3518 }
3519 return true;
3520}
3521
3522static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3523 const GlobalValue *GV, SelectionDAG &DAG,
3524 EVT PtrVT, const SDLoc &dl) {
3525 // If we're creating a pool entry for a constant global with unnamed address,
3526 // and the global is small enough, we can emit it inline into the constant pool
3527 // to save ourselves an indirection.
3528 //
3529 // This is a win if the constant is only used in one function (so it doesn't
3530 // need to be duplicated) or duplicating the constant wouldn't increase code
3531 // size (implying the constant is no larger than 4 bytes).
3532 const Function &F = DAG.getMachineFunction().getFunction();
3533
3534 // We rely on this decision to inline being idempotent and unrelated to the
3535 // use-site. We know that if we inline a variable at one use site, we'll
3536 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3537 // doesn't know about this optimization, so bail out if it's enabled else
3538 // we could decide to inline here (and thus never emit the GV) but require
3539 // the GV from fast-isel generated code.
3540 if (!EnableConstpoolPromotion ||
3541 DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3542 return SDValue();
3543
3544 auto *GVar = dyn_cast<GlobalVariable>(GV);
3545 if (!GVar || !GVar->hasInitializer() ||
3546 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3547 !GVar->hasLocalLinkage())
3548 return SDValue();
3549
3550 // If we inline a value that contains relocations, we move the relocations
3551 // from .data to .text. This is not allowed in position-independent code.
3552 auto *Init = GVar->getInitializer();
3553 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3554 Init->needsDynamicRelocation())
3555 return SDValue();
3556
3557 // The constant islands pass can only really deal with alignment requests
3558 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3559 // any type wanting greater alignment requirements than 4 bytes. We also
3560 // can only promote constants that are multiples of 4 bytes in size or
3561 // are paddable to a multiple of 4. Currently we only try and pad constants
3562 // that are strings for simplicity.
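// For example: a 10-byte string initializer has RequiredPadding = 2 and is
// padded with two zero bytes to a 12-byte constant pool entry, while a
// 7-byte non-string initializer cannot be padded and is not promoted.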
3563 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3564 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3565 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3566 unsigned RequiredPadding = 4 - (Size % 4);
3567 bool PaddingPossible =
3568 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3569 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3570 Size == 0)
3571 return SDValue();
3572
3573 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3574 MachineFunction &MF = DAG.getMachineFunction();
3575 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3576
3577 // We can't bloat the constant pool too much, else the ConstantIslands pass
3578 // may fail to converge. If we haven't promoted this global yet (it may have
3579 // multiple uses), and promoting it would increase the constant pool size (Sz
3580 // > 4), ensure we have space to do so up to MaxTotal.
3581 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3582 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3583 ConstpoolPromotionMaxTotal)
3584 return SDValue();
3585
3586 // This is only valid if all users are in a single function; we can't clone
3587 // the constant in general. The LLVM IR unnamed_addr allows merging
3588 // constants, but not cloning them.
3589 //
3590 // We could potentially allow cloning if we could prove all uses of the
3591 // constant in the current function don't care about the address, like
3592 // printf format strings. But that isn't implemented for now.
3593 if (!allUsersAreInFunction(GVar, &F))
3594 return SDValue();
3595
3596 // We're going to inline this global. Pad it out if needed.
3597 if (RequiredPadding != 4) {
3598 StringRef S = CDAInit->getAsString();
3599
3600 SmallVector<uint8_t,16> V(S.size());
3601 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3602 while (RequiredPadding--)
3603 V.push_back(0);
3604 Init = ConstantDataArray::get(*DAG.getContext(), V);
3605 }
3606
3607 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3608 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3609 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3610 AFI->markGlobalAsPromotedToConstantPool(GVar);
3611 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3612 PaddedSize - 4);
3613 }
3614 ++NumConstpoolPromoted;
3615 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3616}
3617
3618static bool isReadOnly(const GlobalValue *GV) {
3619 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3620 if (!(GV = GA->getAliaseeObject()))
3621 return false;
3622 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3623 return V->isConstant();
3624 return isa<Function>(GV);
3625}
3626
3627SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3628 SelectionDAG &DAG) const {
3629 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3630 default: llvm_unreachable("unknown object format");
3631 case Triple::COFF:
3632 return LowerGlobalAddressWindows(Op, DAG);
3633 case Triple::ELF:
3634 return LowerGlobalAddressELF(Op, DAG);
3635 case Triple::MachO:
3636 return LowerGlobalAddressDarwin(Op, DAG);
3637 }
3638}
3639
3640SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3641 SelectionDAG &DAG) const {
3642 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3643 SDLoc dl(Op);
3644 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3645 bool IsRO = isReadOnly(GV);
3646
3647 // promoteToConstantPool only if not generating XO text section
3648 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3649 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3650 return V;
3651
3652 if (isPositionIndependent()) {
3653 SDValue G = DAG.getTargetGlobalAddress(
3654 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3655 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3656 if (!GV->isDSOLocal())
3657 Result =
3658 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3659 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3660 return Result;
3661 } else if (Subtarget->isROPI() && IsRO) {
3662 // PC-relative.
3663 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3664 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3665 return Result;
3666 } else if (Subtarget->isRWPI() && !IsRO) {
3667 // SB-relative.
3668 SDValue RelAddr;
3669 if (Subtarget->useMovt()) {
3670 ++NumMovwMovt;
3671 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3672 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3673 } else { // use literal pool for address constant
3674 ARMConstantPoolValue *CPV =
3675 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3676 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3677 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3678 RelAddr = DAG.getLoad(
3679 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3680 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3681 }
3682 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3683 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3684 return Result;
3685 }
3686
3687 // If we have T2 ops, we can materialize the address directly via movt/movw
3688 // pair. This is always cheaper. If we need to generate Execute Only code,
3689 // and we only have Thumb1 available, we can't use a constant pool and are
3690 // forced to use immediate relocations.
3691 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3692 if (Subtarget->useMovt())
3693 ++NumMovwMovt;
3694 // FIXME: Once remat is capable of dealing with instructions with register
3695 // operands, expand this into two nodes.
3696 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3697 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3698 } else {
3699 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3700 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3701 return DAG.getLoad(
3702 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3703 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3704 }
3705}
3706
3707SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3708 SelectionDAG &DAG) const {
3709 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3710 "ROPI/RWPI not currently supported for Darwin");
3711 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3712 SDLoc dl(Op);
3713 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3714
3715 if (Subtarget->useMovt())
3716 ++NumMovwMovt;
3717
3718 // FIXME: Once remat is capable of dealing with instructions with register
3719 // operands, expand this into multiple nodes
3720 unsigned Wrapper =
3721 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3722
3723 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3724 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3725
3726 if (Subtarget->isGVIndirectSymbol(GV))
3727 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3728 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3729 return Result;
3730}
3731
3732SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3733 SelectionDAG &DAG) const {
3734 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3735 assert(Subtarget->useMovt() &&
3736 "Windows on ARM expects to use movw/movt");
3737 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3738 "ROPI/RWPI not currently supported for Windows");
3739
3740 const TargetMachine &TM = getTargetMachine();
3741 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3742 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3743 if (GV->hasDLLImportStorageClass())
3744 TargetFlags = ARMII::MO_DLLIMPORT;
3745 else if (!TM.shouldAssumeDSOLocal(GV))
3746 TargetFlags = ARMII::MO_COFFSTUB;
3747 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3748 SDValue Result;
3749 SDLoc DL(Op);
3750
3751 ++NumMovwMovt;
3752
3753 // FIXME: Once remat is capable of dealing with instructions with register
3754 // operands, expand this into two nodes.
3755 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3756 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3757 TargetFlags));
3758 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3759 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3760 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3761 return Result;
3762}
3763
3764SDValue
3765ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3766 SDLoc dl(Op);
3767 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3768 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3769 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3770 Op.getOperand(1), Val);
3771}
3772
3773SDValue
3774ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3775 SDLoc dl(Op);
3776 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3777 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3778}
3779
3780SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3781 SelectionDAG &DAG) const {
3782 SDLoc dl(Op);
3783 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3784 Op.getOperand(0));
3785}
3786
3787SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3788 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3789 unsigned IntNo =
3790 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3791 switch (IntNo) {
3792 default:
3793 return SDValue(); // Don't custom lower most intrinsics.
3794 case Intrinsic::arm_gnu_eabi_mcount: {
3795 MachineFunction &MF = DAG.getMachineFunction();
3796 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3797 SDLoc dl(Op);
3798 SDValue Chain = Op.getOperand(0);
3799 // call "\01__gnu_mcount_nc"
3800 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3801 const uint32_t *Mask =
3802 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3803 assert(Mask && "Missing call preserved mask for calling convention");
3804 // Mark LR an implicit live-in.
3805 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3806 SDValue ReturnAddress =
3807 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3808 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3809 SDValue Callee =
3810 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3811 SDValue RegisterMask = DAG.getRegisterMask(Mask);
3812 if (Subtarget->isThumb())
3813 return SDValue(
3814 DAG.getMachineNode(
3815 ARM::tBL_PUSHLR, dl, ResultTys,
3816 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3817 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3818 0);
3819 return SDValue(
3820 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3821 {ReturnAddress, Callee, RegisterMask, Chain}),
3822 0);
3823 }
3824 }
3825}
3826
3827SDValue
3828ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3829 const ARMSubtarget *Subtarget) const {
3830 unsigned IntNo = Op.getConstantOperandVal(0);
3831 SDLoc dl(Op);
3832 switch (IntNo) {
3833 default: return SDValue(); // Don't custom lower most intrinsics.
3834 case Intrinsic::thread_pointer: {
3835 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3836 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3837 }
3838 case Intrinsic::arm_cls: {
3839 const SDValue &Operand = Op.getOperand(1);
3840 const EVT VTy = Op.getValueType();
3841 SDValue SRA =
3842 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
3843 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
3844 SDValue SHL =
3845 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
3846 SDValue OR =
3847 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
3848 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
3849 return Result;
3850 }
3851 case Intrinsic::arm_cls64: {
3852 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
3853 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
3854 const SDValue &Operand = Op.getOperand(1);
3855 const EVT VTy = Op.getValueType();
3856 SDValue Lo, Hi;
3857 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
3858 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
3859 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
3860 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
3861 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
3862 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
3863 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
3864 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
3865 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
3866 SDValue CheckLo =
3867 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
3868 SDValue HiIsZero =
3869 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
3870 SDValue AdjustedLo =
3871 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
3872 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
3873 SDValue Result =
3874 DAG.getSelect(dl, VTy, CheckLo,
3875 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
3876 return Result;
3877 }
3878 case Intrinsic::eh_sjlj_lsda: {
3879 MachineFunction &MF = DAG.getMachineFunction();
3880 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3881 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3882 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3883 SDValue CPAddr;
3884 bool IsPositionIndependent = isPositionIndependent();
3885 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3886 ARMConstantPoolValue *CPV =
3887 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3888 ARMCP::CPLSDA, PCAdj);
3889 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3890 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3891 SDValue Result = DAG.getLoad(
3892 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3893 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3894
3895 if (IsPositionIndependent) {
3896 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3897 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3898 }
3899 return Result;
3900 }
3901 case Intrinsic::arm_neon_vabs:
3902 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3903 Op.getOperand(1));
3904 case Intrinsic::arm_neon_vabds:
3905 if (Op.getValueType().isInteger())
3906 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3907 Op.getOperand(1), Op.getOperand(2));
3908 return SDValue();
3909 case Intrinsic::arm_neon_vabdu:
3910 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3911 Op.getOperand(1), Op.getOperand(2));
3912 case Intrinsic::arm_neon_vmulls:
3913 case Intrinsic::arm_neon_vmullu: {
3914 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3915 ? ARMISD::VMULLs : ARMISD::VMULLu;
3916 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3917 Op.getOperand(1), Op.getOperand(2));
3918 }
3919 case Intrinsic::arm_neon_vminnm:
3920 case Intrinsic::arm_neon_vmaxnm: {
3921 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3922 ? ISD::FMINNUM : ISD::FMAXNUM;
3923 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3924 Op.getOperand(1), Op.getOperand(2));
3925 }
3926 case Intrinsic::arm_neon_vminu:
3927 case Intrinsic::arm_neon_vmaxu: {
3928 if (Op.getValueType().isFloatingPoint())
3929 return SDValue();
3930 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3931 ? ISD::UMIN : ISD::UMAX;
3932 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3933 Op.getOperand(1), Op.getOperand(2));
3934 }
3935 case Intrinsic::arm_neon_vmins:
3936 case Intrinsic::arm_neon_vmaxs: {
3937 // v{min,max}s is overloaded between signed integers and floats.
3938 if (!Op.getValueType().isFloatingPoint()) {
3939 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3940 ? ISD::SMIN : ISD::SMAX;
3941 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3942 Op.getOperand(1), Op.getOperand(2));
3943 }
3944 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3945 ? ISD::FMINIMUM : ISD::FMAXIMUM;
3946 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3947 Op.getOperand(1), Op.getOperand(2));
3948 }
3949 case Intrinsic::arm_neon_vtbl1:
3950 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3951 Op.getOperand(1), Op.getOperand(2));
3952 case Intrinsic::arm_neon_vtbl2:
3953 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3954 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3955 case Intrinsic::arm_mve_pred_i2v:
3956 case Intrinsic::arm_mve_pred_v2i:
3957 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
3958 Op.getOperand(1));
3959 case Intrinsic::arm_mve_vreinterpretq:
3960 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
3961 Op.getOperand(1));
3962 case Intrinsic::arm_mve_lsll:
3963 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
3964 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3965 case Intrinsic::arm_mve_asrl:
3966 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
3967 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3968 }
3969}
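// Illustrative sketch (not part of the original source): the Intrinsic::arm_cls
// expansion above reduces "count leading sign bits" to a single CTLZ of
// ((x ^ (x >> 31)) << 1) | 1. Both sides of that identity in plain C++
// (helper names are hypothetical):
static unsigned clsReference(int X) {
  unsigned UX = (unsigned)X;
  unsigned Sign = UX >> 31;
  unsigned N = 0;
  // Count how many bits below the sign bit are equal to the sign bit.
  while (N < 31 && ((UX >> (30 - N)) & 1) == Sign)
    ++N;
  return N; // 0..31
}
static unsigned clsViaCtlz(int X) {
  unsigned U = X < 0 ? ~(unsigned)X : (unsigned)X; // x ^ (x >> 31)
  unsigned V = (U << 1) | 1; // force a set bit so the leading-zero count is defined
  unsigned LZ = 0;           // portable count-leading-zeros loop
  for (unsigned Bit = 1u << 31; (V & Bit) == 0; Bit >>= 1)
    ++LZ;
  return LZ; // equals clsReference(X) for every 32-bit X
}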
3970
3971static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
3972 const ARMSubtarget *Subtarget) {
3973 SDLoc dl(Op);
3974 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
3975 if (SSID == SyncScope::SingleThread)
3976 return Op;
3977
3978 if (!Subtarget->hasDataBarrier()) {
3979 // Some ARMv6 cpus can support data barriers with an mcr instruction.
3980 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3981 // here.
3982 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
3983 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
3984 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
3985 DAG.getConstant(0, dl, MVT::i32));
3986 }
3987
3988 AtomicOrdering Ord =
3989 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
3990 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
3991 if (Subtarget->isMClass()) {
3992 // Only a full system barrier exists in the M-class architectures.
3993 Domain = ARM_MB::SY;
3994 } else if (Subtarget->preferISHSTBarriers() &&
3995 Ord == AtomicOrdering::Release) {
3996 // Swift happens to implement ISHST barriers in a way that's compatible with
3997 // Release semantics but weaker than ISH so we'd be fools not to use
3998 // it. Beware: other processors probably don't!
3999 Domain = ARM_MB::ISHST;
4000 }
4001
4002 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4003 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4004 DAG.getConstant(Domain, dl, MVT::i32));
4005}
4006
4007static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4008 const ARMSubtarget *Subtarget) {
4009 // ARM pre v5TE and Thumb1 do not have preload instructions.
4010 if (!(Subtarget->isThumb2() ||
4011 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4012 // Just preserve the chain.
4013 return Op.getOperand(0);
4014
4015 SDLoc dl(Op);
4016 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4017 if (!isRead &&
4018 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4019 // ARMv7 with MP extension has PLDW.
4020 return Op.getOperand(0);
4021
4022 unsigned isData = Op.getConstantOperandVal(4);
4023 if (Subtarget->isThumb()) {
4024 // Invert the bits.
4025 isRead = ~isRead & 1;
4026 isData = ~isData & 1;
4027 }
4028
4029 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4030 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4031 DAG.getConstant(isData, dl, MVT::i32));
4032}
4033
4034static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4035 MachineFunction &MF = DAG.getMachineFunction();
4036 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4037
4038 // vastart just stores the address of the VarArgsFrameIndex slot into the
4039 // memory location argument.
4040 SDLoc dl(Op);
4041 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4042 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4043 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4044 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4045 MachinePointerInfo(SV));
4046}
4047
4048SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4049 CCValAssign &NextVA,
4050 SDValue &Root,
4051 SelectionDAG &DAG,
4052 const SDLoc &dl) const {
4053 MachineFunction &MF = DAG.getMachineFunction();
4054 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4055
4056 const TargetRegisterClass *RC;
4057 if (AFI->isThumb1OnlyFunction())
4058 RC = &ARM::tGPRRegClass;
4059 else
4060 RC = &ARM::GPRRegClass;
4061
4062 // Transform the arguments stored in physical registers into virtual ones.
4063 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4064 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4065
4066 SDValue ArgValue2;
4067 if (NextVA.isMemLoc()) {
4068 MachineFrameInfo &MFI = MF.getFrameInfo();
4069 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4070
4071 // Create load node to retrieve arguments from the stack.
4072 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4073 ArgValue2 = DAG.getLoad(
4074 MVT::i32, dl, Root, FIN,
4075 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4076 } else {
4077 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4078 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4079 }
4080 if (!Subtarget->isLittle())
4081 std::swap (ArgValue, ArgValue2);
4082 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4083}
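// Illustrative sketch (not part of the original source): how the two 32-bit
// halves gathered above become one f64. Plain integers stand in for the GPR
// values and for the VMOVDRR-style (low, high) combine.
static unsigned long long f64BitsFromRegPair(unsigned FirstHalf,
                                             unsigned SecondHalf,
                                             bool IsLittleEndian) {
  // On big-endian targets the first register of the pair carries the high
  // word of the double, which is why the code above swaps the pair before
  // forming the (low, high) operands.
  unsigned Lo = IsLittleEndian ? FirstHalf : SecondHalf;
  unsigned Hi = IsLittleEndian ? SecondHalf : FirstHalf;
  return ((unsigned long long)Hi << 32) | Lo;
}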
4084
4085// The remaining GPRs hold either the beginning of variable-argument
4086// data, or the beginning of an aggregate passed by value (usually
4087// byval). Either way, we allocate stack slots adjacent to the data
4088// provided by our caller, and store the unallocated registers there.
4089// If this is a variadic function, the va_list pointer will begin with
4090// these values; otherwise, this reassembles a (byval) structure that
4091// was split between registers and memory.
4092// Return: The frame index registers were stored into.
4093int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4094 const SDLoc &dl, SDValue &Chain,
4095 const Value *OrigArg,
4096 unsigned InRegsParamRecordIdx,
4097 int ArgOffset, unsigned ArgSize) const {
4098 // Currently, two use-cases are possible:
4099 // Case #1. Non-var-args function, and we meet the first byval parameter.
4100 //          Set up the first unallocated register as the first byval register;
4101 //          eat all remaining registers
4102 //          (these two actions are performed by the HandleByVal method).
4103 //          Then, here, we initialize the stack frame with
4104 //          "store-reg" instructions.
4105 // Case #2. Var-args function that doesn't contain byval parameters.
4106 //          The same: eat all remaining unallocated registers,
4107 //          initialize the stack frame.
4108
4109 MachineFunction &MF = DAG.getMachineFunction();
4110 MachineFrameInfo &MFI = MF.getFrameInfo();
4111 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4112 unsigned RBegin, REnd;
4113 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4114 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4115 } else {
4116 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4117 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4118 REnd = ARM::R4;
4119 }
4120
4121 if (REnd != RBegin)
4122 ArgOffset = -4 * (ARM::R4 - RBegin);
4123
4124 auto PtrVT = getPointerTy(DAG.getDataLayout());
4125 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4126 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4127
4128 SmallVector<SDValue, 4> MemOps;
4129 const TargetRegisterClass *RC =
4130 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4131
4132 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4133 Register VReg = MF.addLiveIn(Reg, RC);
4134 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4135 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4136 MachinePointerInfo(OrigArg, 4 * i));
4137 MemOps.push_back(Store);
4138 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4139 }
4140
4141 if (!MemOps.empty())
4142 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4143 return FrameIndex;
4144}
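// Illustrative sketch (not part of the original source): the fixed-object
// offset computed above, with small integers standing in for the consecutive
// ARM::R0..R4 register numbers. The helper name is hypothetical.
static int byvalRegSaveOffset(unsigned RBegin) {
  const unsigned REnd = 4; // one past r3, i.e. ARM::R4 in the code above
  // Every still-unallocated GPR in [RBegin, r3] gets a 4-byte slot directly
  // below the caller-provided stack arguments, hence the negative offset.
  return -4 * (int)(REnd - RBegin);
}
// For example, byvalRegSaveOffset(2) == -8: r2 and r3 are stored in the eight
// bytes just below the incoming stack-argument area.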
4145
4146// Set up the stack frame that the va_list pointer will start from.
4147void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4148 const SDLoc &dl, SDValue &Chain,
4149 unsigned ArgOffset,
4150 unsigned TotalArgRegsSaveSize,
4151 bool ForceMutable) const {
4152 MachineFunction &MF = DAG.getMachineFunction();
4153 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4154
4155 // Try to store any remaining integer argument regs
4156 // to their spots on the stack so that they may be loaded by dereferencing
4157 // the result of va_next.
4158 // If there are no regs to be stored, just point the address after the last
4159 // argument passed via the stack.
4160 int FrameIndex = StoreByValRegs(
4161 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4162 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4163 AFI->setVarArgsFrameIndex(FrameIndex);
4164}
4165
4166bool ARMTargetLowering::splitValueIntoRegisterParts(
4167 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4168 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4169 EVT ValueVT = Val.getValueType();
4170 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4171 unsigned ValueBits = ValueVT.getSizeInBits();
4172 unsigned PartBits = PartVT.getSizeInBits();
4173 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4174 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4175 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4176 Parts[0] = Val;
4177 return true;
4178 }
4179 return false;
4180}
4181
4182SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4183 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4184 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4185 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4186 unsigned ValueBits = ValueVT.getSizeInBits();
4187 unsigned PartBits = PartVT.getSizeInBits();
4188 SDValue Val = Parts[0];
4189
4190 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4191 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4192 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4193 return Val;
4194 }
4195 return SDValue();
4196}
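// Illustrative sketch (not part of the original source): the two hooks above
// move an f16/bf16 value through an f32 register by keeping its bits in the
// low 16 bits of the 32-bit container. On raw bits (assuming the usual 16-bit
// unsigned short) the round trip is simply:
static unsigned packHalfIntoSingleBits(unsigned short HalfBits) {
  return HalfBits;                   // BITCAST + ANY_EXTEND: widen to 32 bits
}
static unsigned short unpackHalfFromSingleBits(unsigned SingleBits) {
  return (unsigned short)SingleBits; // TRUNCATE + BITCAST: low 16 bits back out
}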
4197
4198SDValue ARMTargetLowering::LowerFormalArguments(
4199 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4200 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4201 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4202 MachineFunction &MF = DAG.getMachineFunction();
4203 MachineFrameInfo &MFI = MF.getFrameInfo();
4204
4205 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4206
4207 // Assign locations to all of the incoming arguments.
4208 SmallVector<CCValAssign, 16> ArgLocs;
4209 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4210 *DAG.getContext());
4211 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4212
4213 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4214 unsigned CurArgIdx = 0;
4215
4216 // Initially ArgRegsSaveSize is zero.
4217 // Then we increase this value each time we meet byval parameter.
4218 // We also increase this value in case of varargs function.
4219 AFI->setArgRegsSaveSize(0);
4220
4221 // Calculate the amount of stack space that we need to allocate to store
4222 // byval and variadic arguments that are passed in registers.
4223 // We need to know this before we allocate the first byval or variadic
4224 // argument, as they will be allocated a stack slot below the CFA (Canonical
4225 // Frame Address, the stack pointer at entry to the function).
4226 unsigned ArgRegBegin = ARM::R4;
4227 for (const CCValAssign &VA : ArgLocs) {
4228 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4229 break;
4230
4231 unsigned Index = VA.getValNo();
4232 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4233 if (!Flags.isByVal())
4234 continue;
4235
4236 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4237 unsigned RBegin, REnd;
4238 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4239 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4240
4241 CCInfo.nextInRegsParam();
4242 }
4243 CCInfo.rewindByValRegsInfo();
4244
4245 int lastInsIndex = -1;
4246 if (isVarArg && MFI.hasVAStart()) {
4247 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4248 if (RegIdx != std::size(GPRArgRegs))
4249 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4250 }
4251
4252 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4253 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4254 auto PtrVT = getPointerTy(DAG.getDataLayout());
4255
4256 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4257 CCValAssign &VA = ArgLocs[i];
4258 if (Ins[VA.getValNo()].isOrigArg()) {
4259 std::advance(CurOrigArg,
4260 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4261 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4262 }
4263 // Arguments stored in registers.
4264 if (VA.isRegLoc()) {
4265 EVT RegVT = VA.getLocVT();
4266 SDValue ArgValue;
4267
4268 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4269 // f64 and vector types are split up into multiple registers or
4270 // combinations of registers and stack slots.
4271 SDValue ArgValue1 =
4272 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4273 VA = ArgLocs[++i]; // skip ahead to next loc
4274 SDValue ArgValue2;
4275 if (VA.isMemLoc()) {
4276 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4277 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4278 ArgValue2 = DAG.getLoad(
4279 MVT::f64, dl, Chain, FIN,
4280 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4281 } else {
4282 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4283 }
4284 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4285 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4286 ArgValue1, DAG.getIntPtrConstant(0, dl));
4287 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4288 ArgValue2, DAG.getIntPtrConstant(1, dl));
4289 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4290 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4291 } else {
4292 const TargetRegisterClass *RC;
4293
4294 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4295 RC = &ARM::HPRRegClass;
4296 else if (RegVT == MVT::f32)
4297 RC = &ARM::SPRRegClass;
4298 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4299 RegVT == MVT::v4bf16)
4300 RC = &ARM::DPRRegClass;
4301 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4302 RegVT == MVT::v8bf16)
4303 RC = &ARM::QPRRegClass;
4304 else if (RegVT == MVT::i32)
4305 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4306 : &ARM::GPRRegClass;
4307 else
4308 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4309
4310 // Transform the arguments in physical registers into virtual ones.
4311 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4312 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4313
4314 // If this value is passed in r0 and has the returned attribute (e.g.
4315 // C++ 'structors), record this fact for later use.
4316 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4317 AFI->setPreservesR0();
4318 }
4319 }
4320
4321 // If this is an 8 or 16-bit value, it is really passed promoted
4322 // to 32 bits. Insert an assert[sz]ext to capture this, then
4323 // truncate to the right size.
4324 switch (VA.getLocInfo()) {
4325 default: llvm_unreachable("Unknown loc info!");
4326 case CCValAssign::Full: break;
4327 case CCValAssign::BCvt:
4328 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4329 break;
4330 }
4331
4332 // f16 arguments have their size extended to 4 bytes and passed as if they
4333 // had been copied to the LSBs of a 32-bit register.
4334 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4335 if (VA.needsCustom() &&
4336 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4337 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4338
4339 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4340 // less than 32 bits must be sign- or zero-extended in the callee for
4341 // security reasons. Although the ABI mandates an extension done by the
4342 // caller, the latter cannot be trusted to follow the rules of the ABI.
4343 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4344 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4345 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4346 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4347
4348 InVals.push_back(ArgValue);
4349 } else { // VA.isRegLoc()
4350 // Only arguments passed on the stack should make it here.
4351 assert(VA.isMemLoc());
4352 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4353
4354 int index = VA.getValNo();
4355
4356 // Some Ins[] entries become multiple ArgLoc[] entries.
4357 // Process them only once.
4358 if (index != lastInsIndex)
4359 {
4360 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4361 // FIXME: For now, all byval parameter objects are marked mutable.
4362 // This can be changed with more analysis.
4363 // In case of tail call optimization, mark all arguments mutable, since
4364 // they could be overwritten by the lowering of arguments in case of
4365 // a tail call.
4366 if (Flags.isByVal()) {
4367 assert(Ins[index].isOrigArg() &&
4368 "Byval arguments cannot be implicit");
4369 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4370
4371 int FrameIndex = StoreByValRegs(
4372 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4373 VA.getLocMemOffset(), Flags.getByValSize());
4374 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4375 CCInfo.nextInRegsParam();
4376 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4377 VA.getValVT() == MVT::bf16)) {
4378 // f16 and bf16 values are passed in the least-significant half of
4379 // a 4-byte stack slot. This is done as if the extension was done
4380 // in a 32-bit register, so the actual bytes used for the value
4381 // differ between little and big endian.
4382 assert(VA.getLocVT().getSizeInBits() == 32);
4383 unsigned FIOffset = VA.getLocMemOffset();
4384 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4385 FIOffset, true);
4386
4387 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4388 if (DAG.getDataLayout().isBigEndian())
4389 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4390
4391 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4392 MachinePointerInfo::getFixedStack(
4393 DAG.getMachineFunction(), FI)));
4394
4395 } else {
4396 unsigned FIOffset = VA.getLocMemOffset();
4397 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4398 FIOffset, true);
4399
4400 // Create load nodes to retrieve arguments from the stack.
4401 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4402 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4403 MachinePointerInfo::getFixedStack(
4404 DAG.getMachineFunction(), FI)));
4405 }
4406 lastInsIndex = index;
4407 }
4408 }
4409 }
4410
4411 // varargs
4412 if (isVarArg && MFI.hasVAStart()) {
4413 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4414 TotalArgRegsSaveSize);
4415 if (AFI->isCmseNSEntryFunction()) {
4416 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4417 DAG.getMachineFunction().getFunction(),
4418 "secure entry function must not be variadic", dl.getDebugLoc()));
4419 }
4420 }
4421
4422 unsigned StackArgSize = CCInfo.getStackSize();
4423 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4424 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4425 // The only way to guarantee a tail call is if the callee restores its
4426 // argument area, but it must also keep the stack aligned when doing so.
4427 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4428 assert(StackAlign && "data layout string is missing stack alignment");
4429 StackArgSize = alignTo(StackArgSize, *StackAlign);
4430
4431 AFI->setArgumentStackToRestore(StackArgSize);
4432 }
4433 AFI->setArgumentStackSize(StackArgSize);
4434
4435 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4436 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4437 DAG.getMachineFunction().getFunction(),
4438 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4439 }
4440
4441 return Chain;
4442}
4443
4444/// isFloatingPointZero - Return true if this is +0.0.
4445static bool isFloatingPointZero(SDValue Op) {
4446 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4447 return CFP->getValueAPF().isPosZero();
4448 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4449 // Maybe this has already been legalized into the constant pool?
4450 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4451 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4452 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4453 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4454 return CFP->getValueAPF().isPosZero();
4455 }
4456 } else if (Op->getOpcode() == ISD::BITCAST &&
4457 Op->getValueType(0) == MVT::f64) {
4458 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4459 // created by LowerConstantFP().
4460 SDValue BitcastOp = Op->getOperand(0);
4461 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4462 isNullConstant(BitcastOp->getOperand(0)))
4463 return true;
4464 }
4465 return false;
4466}
4467
4468/// Returns an appropriate ARM CMP (cmp) and corresponding condition code for
4469/// the given operands.
4470SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4471 SDValue &ARMcc, SelectionDAG &DAG,
4472 const SDLoc &dl) const {
4473 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4474 unsigned C = RHSC->getZExtValue();
4475 if (!isLegalICmpImmediate((int32_t)C)) {
4476 // Constant does not fit, try adjusting it by one.
4477 switch (CC) {
4478 default: break;
4479 case ISD::SETLT:
4480 case ISD::SETGE:
4481 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4482 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4483 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4484 }
4485 break;
4486 case ISD::SETULT:
4487 case ISD::SETUGE:
4488 if (C != 0 && isLegalICmpImmediate(C-1)) {
4489 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4490 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4491 }
4492 break;
4493 case ISD::SETLE:
4494 case ISD::SETGT:
4495 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4496 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4497 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4498 }
4499 break;
4500 case ISD::SETULE:
4501 case ISD::SETUGT:
4502 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4503 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4504 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4505 }
4506 break;
4507 }
4508 }
4509 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4510 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4511 // In ARM and Thumb-2, the compare instructions can shift their second
4512 // operand.
4513 CC = ISD::getSetCCSwappedOperands(CC);
4514 std::swap(LHS, RHS);
4515 }
4516
4517 // Thumb1 has very limited immediate modes, so turning an "and" into a
4518 // shift can save multiple instructions.
4519 //
4520 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4521 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4522 // own. If it's the operand to an unsigned comparison with an immediate,
4523 // we can eliminate one of the shifts: we transform
4524 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4525 //
4526 // We avoid transforming cases which aren't profitable due to encoding
4527 // details:
4528 //
4529 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4530 // would not; in that case, we're essentially trading one immediate load for
4531 // another.
4532 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4533 // 3. C2 is zero; we have other code for this special case.
4534 //
4535 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4536 // instruction, since the AND is always one instruction anyway, but we could
4537 // use narrow instructions in some cases.
4538 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4539 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4540 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4541 !isSignedIntSetCC(CC)) {
4542 unsigned Mask = LHS.getConstantOperandVal(1);
4543 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4544 uint64_t RHSV = RHSC->getZExtValue();
4545 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4546 unsigned ShiftBits = llvm::countl_zero(Mask);
4547 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4548 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4549 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4550 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4551 }
4552 }
4553 }
4554
4555 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4556 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4557 // way a cmp would.
4558 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4559 // some tweaks to the heuristics for the previous and->shift transform.
4560 // FIXME: Optimize cases where the LHS isn't a shift.
4561 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4562 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4563 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4564 LHS.getConstantOperandVal(1) < 31) {
4565 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4566 SDValue Shift =
4567 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4568 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4569 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4570 return Shift.getValue(1);
4571 }
4572
4573 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4574
4575 // If the RHS is a constant zero then the V (overflow) flag will never be
4576 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4577 // simpler for other passes (like the peephole optimiser) to deal with.
4578 if (isNullConstant(RHS)) {
4579 switch (CondCode) {
4580 default: break;
4581 case ARMCC::GE:
4582 CondCode = ARMCC::PL;
4583 break;
4584 case ARMCC::LT:
4585 CondCode = ARMCC::MI;
4586 break;
4587 }
4588 }
4589
4590 unsigned CompareType;
4591 switch (CondCode) {
4592 default:
4593 CompareType = ARMISD::CMP;
4594 break;
4595 case ARMCC::EQ:
4596 case ARMCC::NE:
4597 // Uses only Z Flag
4598 CompareType = ARMISD::CMPZ;
4599 break;
4600 }
4601 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4602 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4603}
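// Illustrative sketch (not part of the original source): the Thumb1 and->shift
// rewrite described in getARMCmp above, checked on plain 32-bit values. For a
// low mask C1 with N leading zero bits and a constant C2 with no bits outside
// C1, "(X & C1) == C2" has the same truth value as "(X << N) == (C2 << N)".
static bool maskCompareViaShift(unsigned X, unsigned C1, unsigned C2) {
  unsigned N = 0; // countl_zero(C1) for a low mask
  for (unsigned Bit = 1u << 31; N < 32 && (C1 & Bit) == 0; Bit >>= 1)
    ++N;
  if (N >= 32)                  // degenerate C1 == 0 case, not used by the transform
    return C2 == 0;
  return (X << N) == (C2 << N); // same result as (X & C1) == C2
}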
4604
4605/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4606SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4607 SelectionDAG &DAG, const SDLoc &dl,
4608 bool Signaling) const {
4609 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4610 SDValue Flags;
4611 if (!isFloatingPointZero(RHS))
4612 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4613 LHS, RHS);
4614 else
4615 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4616 FlagsVT, LHS);
4617 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4618}
4619
4620// This function returns three things: the arithmetic computation itself
4621// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4622// comparison and the condition code define the case in which the arithmetic
4623// computation *does not* overflow.
4624std::pair<SDValue, SDValue>
4625ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4626 SDValue &ARMcc) const {
4627 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4628
4629 SDValue Value, OverflowCmp;
4630 SDValue LHS = Op.getOperand(0);
4631 SDValue RHS = Op.getOperand(1);
4632 SDLoc dl(Op);
4633
4634 // FIXME: We are currently always generating CMPs because we don't support
4635 // generating CMN through the backend. This is not as good as the natural
4636 // CMP case because it causes a register dependency and cannot be folded
4637 // later.
4638
4639 switch (Op.getOpcode()) {
4640 default:
4641 llvm_unreachable("Unknown overflow instruction!");
4642 case ISD::SADDO:
4643 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4644 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4645 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4646 break;
4647 case ISD::UADDO:
4648 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4649 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4650 // We do not use it in the USUBO case as Value may not be used.
4651 Value = DAG.getNode(ARMISD::ADDC, dl,
4652 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4653 .getValue(0);
4654 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4655 break;
4656 case ISD::SSUBO:
4657 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4658 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4659 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4660 break;
4661 case ISD::USUBO:
4662 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4663 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4664 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4665 break;
4666 case ISD::UMULO:
4667 // We generate a UMUL_LOHI and then check if the high word is 0.
4668 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4669 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4670 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4671 LHS, RHS);
4672 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4673 DAG.getConstant(0, dl, MVT::i32));
4674 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4675 break;
4676 case ISD::SMULO:
4677 // We generate a SMUL_LOHI and then check if all the bits of the high word
4678 // are the same as the sign bit of the low word.
4679 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4680 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4681 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4682 LHS, RHS);
4683 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4684 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4685 Value.getValue(0),
4686 DAG.getConstant(31, dl, MVT::i32)));
4687 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4688 break;
4689 } // switch (...)
4690
4691 return std::make_pair(Value, OverflowCmp);
4692}
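// Illustrative sketch (not part of the original source): the overflow tests
// that getARMXALUOOp builds for the multiply cases, on plain integers
// (helper names are hypothetical; two's-complement narrowing assumed).
static bool umuloOverflows(unsigned A, unsigned B) {
  unsigned long long Wide = (unsigned long long)A * B;
  return (Wide >> 32) != 0; // UMULO: overflow iff the high word is non-zero
}
static bool smuloOverflows(int A, int B) {
  long long Wide = (long long)A * B;
  int Lo = (int)Wide;         // low word of the product
  int Hi = (int)(Wide >> 32); // high word of the product
  return Hi != (Lo >> 31);    // SMULO: high word must mirror the low word's sign bit
}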
4693
4694SDValue
4695ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4696 // Let legalize expand this if it isn't a legal type yet.
4697 if (!isTypeLegal(Op.getValueType()))
4698 return SDValue();
4699
4700 SDValue Value, OverflowCmp;
4701 SDValue ARMcc;
4702 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4703 SDLoc dl(Op);
4704 // We use 0 and 1 as false and true values.
4705 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4706 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4707 EVT VT = Op.getValueType();
4708
4709 SDValue Overflow =
4710 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4711
4712 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4713 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4714}
4715
4716static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4717 SelectionDAG &DAG) {
4718 SDLoc DL(BoolCarry);
4719 EVT CarryVT = BoolCarry.getValueType();
4720
4721 // This converts the boolean value carry into the carry flag by doing
4722 // ARMISD::SUBC Carry, 1
4723 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4724 DAG.getVTList(CarryVT, MVT::i32),
4725 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4726 return Carry.getValue(1);
4727}
4728
4729static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4730 SelectionDAG &DAG) {
4731 SDLoc DL(Flags);
4732
4733 // Now convert the carry flag into a boolean carry. We do this
4734 // using ARMISD:ADDE 0, 0, Carry
4735 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4736 DAG.getConstant(0, DL, MVT::i32),
4737 DAG.getConstant(0, DL, MVT::i32), Flags);
4738}
4739
4740SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4741 SelectionDAG &DAG) const {
4742 // Let legalize expand this if it isn't a legal type yet.
4743 if (!isTypeLegal(Op.getValueType()))
4744 return SDValue();
4745
4746 SDValue LHS = Op.getOperand(0);
4747 SDValue RHS = Op.getOperand(1);
4748 SDLoc dl(Op);
4749
4750 EVT VT = Op.getValueType();
4751 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4752 SDValue Value;
4753 SDValue Overflow;
4754 switch (Op.getOpcode()) {
4755 default:
4756 llvm_unreachable("Unknown overflow instruction!");
4757 case ISD::UADDO:
4758 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4759 // Convert the carry flag into a boolean value.
4760 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4761 break;
4762 case ISD::USUBO: {
4763 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4764 // Convert the carry flag into a boolean value.
4765 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4766 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4767 // value. So compute 1 - C.
4768 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4769 DAG.getConstant(1, dl, MVT::i32), Overflow);
4770 break;
4771 }
4772 }
4773
4774 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4775}
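// Illustrative sketch (not part of the original source): why the USUBO path
// above computes "1 - C". ARM's carry flag after a subtraction means "no
// borrow occurred", which is the inverse of the overflow bit USUBO must return.
static unsigned usuboOverflowBit(unsigned A, unsigned B) {
  unsigned CarryAfterSubs = (A >= B) ? 1u : 0u; // ARMISD::SUBC carry: no borrow
  return 1u - CarryAfterSubs;                   // USUBO overflow: set exactly on borrow
}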
4776
4777static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4778 const ARMSubtarget *Subtarget) {
4779 EVT VT = Op.getValueType();
4780 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4781 return SDValue();
4782 if (!VT.isSimple())
4783 return SDValue();
4784
4785 unsigned NewOpcode;
4786 switch (VT.getSimpleVT().SimpleTy) {
4787 default:
4788 return SDValue();
4789 case MVT::i8:
4790 switch (Op->getOpcode()) {
4791 case ISD::UADDSAT:
4792 NewOpcode = ARMISD::UQADD8b;
4793 break;
4794 case ISD::SADDSAT:
4795 NewOpcode = ARMISD::QADD8b;
4796 break;
4797 case ISD::USUBSAT:
4798 NewOpcode = ARMISD::UQSUB8b;
4799 break;
4800 case ISD::SSUBSAT:
4801 NewOpcode = ARMISD::QSUB8b;
4802 break;
4803 }
4804 break;
4805 case MVT::i16:
4806 switch (Op->getOpcode()) {
4807 case ISD::UADDSAT:
4808 NewOpcode = ARMISD::UQADD16b;
4809 break;
4810 case ISD::SADDSAT:
4811 NewOpcode = ARMISD::QADD16b;
4812 break;
4813 case ISD::USUBSAT:
4814 NewOpcode = ARMISD::UQSUB16b;
4815 break;
4816 case ISD::SSUBSAT:
4817 NewOpcode = ARMISD::QSUB16b;
4818 break;
4819 }
4820 break;
4821 }
4822
4823 SDLoc dl(Op);
4824 SDValue Add =
4825 DAG.getNode(NewOpcode, dl, MVT::i32,
4826 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4827 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4828 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4829}
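// Illustrative sketch (not part of the original source): the per-lane scalar
// behaviour that the QADD8b-style nodes above provide for a single i8 value:
// add without wrapping, clamping the result to the type's range instead.
static int saturatingAddI8(int A, int B) { // A, B assumed in [-128, 127]
  int Sum = A + B;
  if (Sum > 127)
    return 127;
  if (Sum < -128)
    return -128;
  return Sum;
}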
4830
4831SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4832 SDValue Cond = Op.getOperand(0);
4833 SDValue SelectTrue = Op.getOperand(1);
4834 SDValue SelectFalse = Op.getOperand(2);
4835 SDLoc dl(Op);
4836 unsigned Opc = Cond.getOpcode();
4837
4838 if (Cond.getResNo() == 1 &&
4839 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4840 Opc == ISD::USUBO)) {
4841 if (!isTypeLegal(Cond->getValueType(0)))
4842 return SDValue();
4843
4844 SDValue Value, OverflowCmp;
4845 SDValue ARMcc;
4846 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4847 EVT VT = Op.getValueType();
4848
4849 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4850 }
4851
4852 // Convert:
4853 //
4854 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4855 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4856 //
4857 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4858 const ConstantSDNode *CMOVTrue =
4859 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4860 const ConstantSDNode *CMOVFalse =
4861 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4862
4863 if (CMOVTrue && CMOVFalse) {
4864 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4865 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4866
4867 SDValue True;
4868 SDValue False;
4869 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4870 True = SelectTrue;
4871 False = SelectFalse;
4872 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4873 True = SelectFalse;
4874 False = SelectTrue;
4875 }
4876
4877 if (True.getNode() && False.getNode())
4878 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4879 Cond.getOperand(3), DAG);
4880 }
4881 }
4882
4883 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4884 // undefined bits before doing a full-word comparison with zero.
4885 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4886 DAG.getConstant(1, dl, Cond.getValueType()));
4887
4888 return DAG.getSelectCC(dl, Cond,
4889 DAG.getConstant(0, dl, Cond.getValueType()),
4890 SelectTrue, SelectFalse, ISD::SETNE);
4891}
4892
4893static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4894 bool &swpCmpOps, bool &swpVselOps) {
4895 // Start by selecting the GE condition code for opcodes that return true for
4896 // 'equality'
4897 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4898 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4899 CondCode = ARMCC::GE;
4900
4901 // and GT for opcodes that return false for 'equality'.
4902 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4903 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4904 CondCode = ARMCC::GT;
4905
4906 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4907 // to swap the compare operands.
4908 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4909 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4910 swpCmpOps = true;
4911
4912 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4913 // If we have an unordered opcode, we need to swap the operands to the VSEL
4914 // instruction (effectively negating the condition).
4915 //
4916 // This also has the effect of swapping which one of 'less' or 'greater'
4917 // returns true, so we also swap the compare operands. It also switches
4918 // whether we return true for 'equality', so we compensate by picking the
4919 // opposite condition code to our original choice.
4920 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4921 CC == ISD::SETUGT) {
4922 swpCmpOps = !swpCmpOps;
4923 swpVselOps = !swpVselOps;
4924 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4925 }
4926
4927 // 'ordered' is 'anything but unordered', so use the VS condition code and
4928 // swap the VSEL operands.
4929 if (CC == ISD::SETO) {
4930 CondCode = ARMCC::VS;
4931 swpVselOps = true;
4932 }
4933
4934 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4935 // code and swap the VSEL operands. Also do this if we don't care about the
4936 // unordered case.
4937 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4938 CondCode = ARMCC::EQ;
4939 swpVselOps = true;
4940 }
4941}
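// Worked example (illustrative, not part of the original source): for
// CC == SETULT, i.e. "a <u b ? t : f", the steps above pick GT, then set
// swpCmpOps because the opcode contains 'less', and the unordered rule then
// flips both swaps and turns GT into GE. The net effect is an unswapped
// compare of (a, b) and a VSEL on GE with t and f swapped, which selects t
// exactly when a < b or the operands are unordered, matching the original
// unsigned less-than select.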
4942
4943SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4944 SDValue TrueVal, SDValue ARMcc,
4945 SDValue Flags, SelectionDAG &DAG) const {
4946 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4947 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4948 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
4949 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
4950 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
4951
4952 SDValue TrueLow = TrueVal.getValue(0);
4953 SDValue TrueHigh = TrueVal.getValue(1);
4954 SDValue FalseLow = FalseVal.getValue(0);
4955 SDValue FalseHigh = FalseVal.getValue(1);
4956
4957 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
4958 ARMcc, Flags);
4959 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
4960 ARMcc, Flags);
4961
4962 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
4963 }
4964 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
4965}
4966
4967static bool isGTorGE(ISD::CondCode CC) {
4968 return CC == ISD::SETGT || CC == ISD::SETGE;
4969}
4970
4971static bool isLTorLE(ISD::CondCode CC) {
4972 return CC == ISD::SETLT || CC == ISD::SETLE;
4973}
4974
4975// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
4976// All of these conditions (and their <= and >= counterparts) will do:
4977// x < k ? k : x
4978// x > k ? x : k
4979// k < x ? x : k
4980// k > x ? k : x
4981static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
4982 const SDValue TrueVal, const SDValue FalseVal,
4983 const ISD::CondCode CC, const SDValue K) {
4984 return (isGTorGE(CC) &&
4985 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
4986 (isLTorLE(CC) &&
4987 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
4988}
4989
4990// Check if two chained conditionals could be converted into SSAT or USAT.
4991//
4992// SSAT can replace a set of two conditional selectors that bound a number to an
4993// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
4994//
4995// x < -k ? -k : (x > k ? k : x)
4996// x < -k ? -k : (x < k ? x : k)
4997// x > -k ? (x > k ? k : x) : -k
4998// x < k ? (x < -k ? -k : x) : k
4999// etc.
5000//
5001// LLVM canonicalizes these to either a min(max()) or a max(min())
5002// pattern. This function tries to match one of these and will return a SSAT
5003// node if successful.
5004//
5005// USAT works similarly to SSAT, but bounds the value to the interval [0, k],
5006// where k + 1 is a power of 2.
5007static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5008 EVT VT = Op.getValueType();
5009 SDValue V1 = Op.getOperand(0);
5010 SDValue K1 = Op.getOperand(1);
5011 SDValue TrueVal1 = Op.getOperand(2);
5012 SDValue FalseVal1 = Op.getOperand(3);
5013 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5014
5015 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5016 if (Op2.getOpcode() != ISD::SELECT_CC)
5017 return SDValue();
5018
5019 SDValue V2 = Op2.getOperand(0);
5020 SDValue K2 = Op2.getOperand(1);
5021 SDValue TrueVal2 = Op2.getOperand(2);
5022 SDValue FalseVal2 = Op2.getOperand(3);
5023 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5024
5025 SDValue V1Tmp = V1;
5026 SDValue V2Tmp = V2;
5027
5028 // Check that the registers and the constants match a max(min()) or min(max())
5029 // pattern
5030 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5031 K2 != FalseVal2 ||
5032 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5033 return SDValue();
5034
5035 // Check that the constant in the lower-bound check is
5036 // the opposite of the constant in the upper-bound check
5037 // in 1's complement.
5038 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5039 return SDValue();
5040
5041 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5042 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5043 int64_t PosVal = std::max(Val1, Val2);
5044 int64_t NegVal = std::min(Val1, Val2);
5045
5046 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5047 !isPowerOf2_64(PosVal + 1))
5048 return SDValue();
5049
5050 // Handle the difference between USAT (unsigned) and SSAT (signed)
5051 // saturation
5052 // At this point, PosVal is guaranteed to be positive
5053 uint64_t K = PosVal;
5054 SDLoc dl(Op);
5055 if (Val1 == ~Val2)
5056 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5057 DAG.getConstant(llvm::countr_one(K), dl, VT));
5058 if (NegVal == 0)
5059 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5060 DAG.getConstant(llvm::countr_one(K), dl, VT));
5061
5062 return SDValue();
5063}
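// Illustrative sketch (not part of the original source): the clamps that a
// single SSAT/USAT replaces once the select chain above has been matched,
// for a bound K with K + 1 a power of two.
static int ssatClamp(int X, int K) { // signed saturation to [~K, K]; ~K == -(K + 1)
  if (X > K)
    return K;
  if (X < ~K)
    return ~K;
  return X;
}
static int usatClamp(int X, int K) { // unsigned saturation to [0, K]
  if (X > K)
    return K;
  if (X < 0)
    return 0;
  return X;
}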
5064
5065// Check if a condition of the type x < k ? k : x can be converted into a
5066// bit operation instead of conditional moves.
5067// Currently this is allowed given:
5068// - The conditions and values match up
5069// - k is 0 or -1 (all ones)
5070// This function will not check the last condition; that's up to the caller.
5071// It returns true if the transformation can be made, and in such case
5072// returns x in V, and k in SatK.
5073static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5074 SDValue &SatK)
5075{
5076 SDValue LHS = Op.getOperand(0);
5077 SDValue RHS = Op.getOperand(1);
5078 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5079 SDValue TrueVal = Op.getOperand(2);
5080 SDValue FalseVal = Op.getOperand(3);
5081
5082 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5083 ? &RHS
5084 : nullptr;
5085
5086 // No constant operation in comparison, early out
5087 if (!K)
5088 return false;
5089
5090 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5091 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5092 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5093
5094 // If the constant on the left- and right-hand side, or the variable on the
5095 // left- and right-hand side, does not match, early out.
5096 if (*K != KTmp || V != VTmp)
5097 return false;
5098
5099 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5100 SatK = *K;
5101 return true;
5102 }
5103
5104 return false;
5105}
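// Illustrative sketch (not part of the original source): the branchless forms
// that LowerSELECT_CC below produces for the two constants the caller handles
// (k == 0 and k == -1). An arithmetic right shift is assumed, as on ARM.
static int clampBelowAtZero(int X) {     // x < 0 ? 0 : x
  int Sign = X >> 31;                    // 0 for non-negative X, -1 for negative X
  return X & ~Sign;                      // keep X, or force the result to 0
}
static int clampBelowAtMinusOne(int X) { // x < -1 ? -1 : x
  int Sign = X >> 31;
  return X | Sign;                       // keep X, or force the result to -1
}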
5106
5107bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5108 if (VT == MVT::f32)
5109 return !Subtarget->hasVFP2Base();
5110 if (VT == MVT::f64)
5111 return !Subtarget->hasFP64();
5112 if (VT == MVT::f16)
5113 return !Subtarget->hasFullFP16();
5114 return false;
5115}
5116
5117SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5118 EVT VT = Op.getValueType();
5119 SDLoc dl(Op);
5120
5121 // Try to convert two saturating conditional selects into a single SSAT
5122 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5123 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5124 return SatValue;
5125
5126 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5127 // into more efficient bit operations, which is possible when k is 0 or -1
5128 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5129 // single instructions. On Thumb the shift and the bit operation will be two
5130 // instructions.
5131 // Only allow this transformation on full-width (32-bit) operations
5132 SDValue LowerSatConstant;
5133 SDValue SatValue;
5134 if (VT == MVT::i32 &&
5135 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5136 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5137 DAG.getConstant(31, dl, VT));
5138 if (isNullConstant(LowerSatConstant)) {
5139 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5140 DAG.getAllOnesConstant(dl, VT));
5141 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5142 } else if (isAllOnesConstant(LowerSatConstant))
5143 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5144 }
5145
5146 SDValue LHS = Op.getOperand(0);
5147 SDValue RHS = Op.getOperand(1);
5148 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5149 SDValue TrueVal = Op.getOperand(2);
5150 SDValue FalseVal = Op.getOperand(3);
5151 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5152 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5153 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5154 if (Op.getValueType().isInteger()) {
5155
5156 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5157 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5158 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5159 // Both require less instructions than compare and conditional select.
5160 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5161 RHSC->isZero() && CFVal && CFVal->isZero() &&
5162 LHS.getValueType() == RHS.getValueType()) {
5163 EVT VT = LHS.getValueType();
5164 SDValue Shift =
5165 DAG.getNode(ISD::SRA, dl, VT, LHS,
5166 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5167
5168 if (CC == ISD::SETGT)
5169 Shift = DAG.getNOT(dl, Shift, VT);
5170
5171 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5172 }
5173 }
5174
5175 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5176 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5177 unsigned TVal = CTVal->getZExtValue();
5178 unsigned FVal = CFVal->getZExtValue();
5179 unsigned Opcode = 0;
5180
5181 if (TVal == ~FVal) {
5182 Opcode = ARMISD::CSINV;
5183 } else if (TVal == ~FVal + 1) {
5184 Opcode = ARMISD::CSNEG;
5185 } else if (TVal + 1 == FVal) {
5186 Opcode = ARMISD::CSINC;
5187 } else if (TVal == FVal + 1) {
5188 Opcode = ARMISD::CSINC;
5189 std::swap(TrueVal, FalseVal);
5190 std::swap(TVal, FVal);
5191 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5192 }
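 // For example, select(cc, 5, -6) can use CSINV since -6 == ~5, and
 // select(cc, 7, 8) can use CSINC since 8 == 7 + 1; only TrueVal needs to be
 // materialised and the instruction reconstructs the other value.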
5193
5194 if (Opcode) {
5195 // If one of the constants is cheaper than another, materialise the
5196 // cheaper one and let the csel generate the other.
5197 if (Opcode != ARMISD::CSINC &&
5198 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5199 std::swap(TrueVal, FalseVal);
5200 std::swap(TVal, FVal);
5201 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5202 }
5203
5204 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
5205 // condition to get there. CSINC is not invertible like the other two
5206 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
5207 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5208 std::swap(TrueVal, FalseVal);
5209 std::swap(TVal, FVal);
5210 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5211 }
5212
5213 // Drops F's value because we can get it by inverting/negating TVal.
5214 FalseVal = TrueVal;
5215
5216 SDValue ARMcc;
5217 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5218 EVT VT = TrueVal.getValueType();
5219 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5220 }
5221 }
5222
5223 if (isUnsupportedFloatingType(LHS.getValueType())) {
5224 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5225
5226 // If softenSetCCOperands only returned one value, we should compare it to
5227 // zero.
5228 if (!RHS.getNode()) {
5229 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5230 CC = ISD::SETNE;
5231 }
5232 }
5233
5234 if (LHS.getValueType() == MVT::i32) {
5235 // Try to generate VSEL on ARMv8.
5236 // The VSEL instruction can't use all the usual ARM condition
5237 // codes: it only has two bits to select the condition code, so it's
5238 // constrained to use only GE, GT, VS and EQ.
5239 //
5240 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5241 // swap the operands of the previous compare instruction (effectively
5242 // inverting the compare condition, swapping 'less' and 'greater') and
5243 // sometimes need to swap the operands to the VSEL (which inverts the
5244 // condition in the sense of firing whenever the previous condition didn't)
5245 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5246 TrueVal.getValueType() == MVT::f32 ||
5247 TrueVal.getValueType() == MVT::f64)) {
5248 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5249 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5250 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5251 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5252 std::swap(TrueVal, FalseVal);
5253 }
5254 }
5255
5256 SDValue ARMcc;
5257 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5258 // Choose GE over PL, which vsel does not support.
5259 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5260 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5261 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5262 }
5263
5264 ARMCC::CondCodes CondCode, CondCode2;
5265 FPCCToARMCC(CC, CondCode, CondCode2);
5266
5267 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5268 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5269 // must use VSEL (limited condition codes), due to not having conditional f16
5270 // moves.
5271 if (Subtarget->hasFPARMv8Base() &&
5272 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5273 (TrueVal.getValueType() == MVT::f16 ||
5274 TrueVal.getValueType() == MVT::f32 ||
5275 TrueVal.getValueType() == MVT::f64)) {
5276 bool swpCmpOps = false;
5277 bool swpVselOps = false;
5278 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5279
5280 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5281 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5282 if (swpCmpOps)
5283 std::swap(LHS, RHS);
5284 if (swpVselOps)
5285 std::swap(TrueVal, FalseVal);
5286 }
5287 }
5288
5289 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5290 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5291 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5292 if (CondCode2 != ARMCC::AL) {
5293 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5294 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5295 }
5296 return Result;
5297}
5298
5299/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5300/// to morph to an integer compare sequence.
5301static bool canChangeToInt(SDValue Op, bool &SeenZero,
5302 const ARMSubtarget *Subtarget) {
5303 SDNode *N = Op.getNode();
5304 if (!N->hasOneUse())
5305 // Otherwise it requires moving the value from fp to integer registers.
5306 return false;
5307 if (!N->getNumValues())
5308 return false;
5309 EVT VT = Op.getValueType();
5310 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5311 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5312 // vmrs are very slow, e.g. cortex-a8.
5313 return false;
5314
5315 if (isFloatingPointZero(Op)) {
5316 SeenZero = true;
5317 return true;
5318 }
5319 return ISD::isNormalLoad(N);
5320}
5321
5322static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5323 if (isFloatingPointZero(Op))
5324 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5325
5326 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5327 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5328 Ld->getPointerInfo(), Ld->getAlign(),
5329 Ld->getMemOperand()->getFlags());
5330
5331 llvm_unreachable("Unknown VFP cmp argument!");
5332}
5333
5334static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5335 SDValue &RetVal1, SDValue &RetVal2) {
5336 SDLoc dl(Op);
5337
5338 if (isFloatingPointZero(Op)) {
5339 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5340 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5341 return;
5342 }
5343
5344 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5345 SDValue Ptr = Ld->getBasePtr();
5346 RetVal1 =
5347 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5348 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5349
5350 EVT PtrType = Ptr.getValueType();
5351 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5352 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5353 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5354 Ld->getPointerInfo().getWithOffset(4),
5355 commonAlignment(Ld->getAlign(), 4),
5356 Ld->getMemOperand()->getFlags());
5357 return;
5358 }
5359
5360 llvm_unreachable("Unknown VFP cmp argument!");
5361}
5362
5363/// OptimizeVFPBrcond - With nnan and without denormals-are-zero (daz), it is
5364/// legal to optimize some f32 and even f64 comparisons to integer ones.
5365SDValue
5366ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5367 SDValue Chain = Op.getOperand(0);
5368 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5369 SDValue LHS = Op.getOperand(2);
5370 SDValue RHS = Op.getOperand(3);
5371 SDValue Dest = Op.getOperand(4);
5372 SDLoc dl(Op);
5373
5374 bool LHSSeenZero = false;
5375 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5376 bool RHSSeenZero = false;
5377 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5378 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5379 // If the comparison is known not to involve NaNs, there are no other uses
5380 // of the CMP operands, and the condition code is EQ or NE, we can optimize
5381 // it to an integer comparison.
5382 if (CC == ISD::SETOEQ)
5383 CC = ISD::SETEQ;
5384 else if (CC == ISD::SETUNE)
5385 CC = ISD::SETNE;
5386
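 // Clearing the sign bit makes the integer equality test below agree with the
 // FP one when a zero operand is involved: +0.0 (0x00000000) and -0.0
 // (0x80000000) both become 0 after the AND with 0x7fffffff.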
5387 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5388 SDValue ARMcc;
5389 if (LHS.getValueType() == MVT::f32) {
5390 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5391 bitcastf32Toi32(LHS, DAG), Mask);
5392 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5393 bitcastf32Toi32(RHS, DAG), Mask);
5394 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5395 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5396 Cmp);
5397 }
5398
5399 SDValue LHS1, LHS2;
5400 SDValue RHS1, RHS2;
5401 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5402 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5403 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5404 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5405 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5406 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5407 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5408 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5409 }
5410
5411 return SDValue();
5412}
5413
5414// Generate CMP + CMOV for integer abs.
5415SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5416 SDLoc DL(Op);
5417
5418 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5419
5420 // Generate CMP & CMOV.
5421 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5422 DAG.getConstant(0, DL, MVT::i32));
5423 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5424 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5425}
5426
5427SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5428 SDValue Chain = Op.getOperand(0);
5429 SDValue Cond = Op.getOperand(1);
5430 SDValue Dest = Op.getOperand(2);
5431 SDLoc dl(Op);
5432
5433 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5434 // instruction.
5435 unsigned Opc = Cond.getOpcode();
5436 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5437 !Subtarget->isThumb1Only();
5438 if (Cond.getResNo() == 1 &&
5439 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5440 Opc == ISD::USUBO || OptimizeMul)) {
5441 // Only lower legal XALUO ops.
5442 if (!isTypeLegal(Cond->getValueType(0)))
5443 return SDValue();
5444
5445 // The actual operation with overflow check.
5446 SDValue Value, OverflowCmp;
5447 SDValue ARMcc;
5448 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5449
5450 // Reverse the condition code.
5451 ARMCC::CondCodes CondCode =
5452 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5453 CondCode = ARMCC::getOppositeCondition(CondCode);
5454 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5455
5456 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5457 OverflowCmp);
5458 }
5459
5460 return SDValue();
5461}
5462
5463SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5464 SDValue Chain = Op.getOperand(0);
5465 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5466 SDValue LHS = Op.getOperand(2);
5467 SDValue RHS = Op.getOperand(3);
5468 SDValue Dest = Op.getOperand(4);
5469 SDLoc dl(Op);
5470
5471 if (isUnsupportedFloatingType(LHS.getValueType())) {
5472 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5473
5474 // If softenSetCCOperands only returned one value, we should compare it to
5475 // zero.
5476 if (!RHS.getNode()) {
5477 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5478 CC = ISD::SETNE;
5479 }
5480 }
5481
5482 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5483 // instruction.
5484 unsigned Opc = LHS.getOpcode();
5485 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5486 !Subtarget->isThumb1Only();
5487 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5488 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5489 Opc == ISD::USUBO || OptimizeMul) &&
5490 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5491 // Only lower legal XALUO ops.
5492 if (!isTypeLegal(LHS->getValueType(0)))
5493 return SDValue();
5494
5495 // The actual operation with overflow check.
5496 SDValue Value, OverflowCmp;
5497 SDValue ARMcc;
5498 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5499
5500 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5501 // Reverse the condition code.
5502 ARMCC::CondCodes CondCode =
5503 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5504 CondCode = ARMCC::getOppositeCondition(CondCode);
5505 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5506 }
5507
5508 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5509 OverflowCmp);
5510 }
5511
5512 if (LHS.getValueType() == MVT::i32) {
5513 SDValue ARMcc;
5514 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5515 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5516 }
5517
5518 SDNodeFlags Flags = Op->getFlags();
5519 if (Flags.hasNoNaNs() &&
5520 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5521 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5522 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5523 CC == ISD::SETUNE)) {
5524 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5525 return Result;
5526 }
5527
5528 ARMCC::CondCodes CondCode, CondCode2;
5529 FPCCToARMCC(CC, CondCode, CondCode2);
5530
5531 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5532 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5533 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5534 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5535 if (CondCode2 != ARMCC::AL) {
5536 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5537 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5538 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5539 }
5540 return Res;
5541}
5542
5543SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5544 SDValue Chain = Op.getOperand(0);
5545 SDValue Table = Op.getOperand(1);
5546 SDValue Index = Op.getOperand(2);
5547 SDLoc dl(Op);
5548
5549 EVT PTy = getPointerTy(DAG.getDataLayout());
5550 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5551 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5552 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5553 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5554 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5555 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5556 // Thumb2 and ARMv8-M use a two-level jump: the first branch jumps into the
5557 // jump table, which does another jump to the destination. This also makes
5558 // it easier to translate it to TBB / TBH later (Thumb2 only).
5559 // FIXME: This might not work if the function is extremely large.
5560 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5561 Addr, Op.getOperand(2), JTI);
5562 }
5563 if (isPositionIndependent() || Subtarget->isROPI()) {
5564 Addr =
5565 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5566 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5567 Chain = Addr.getValue(1);
5568 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5569 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5570 } else {
5571 Addr =
5572 DAG.getLoad(PTy, dl, Chain, Addr,
5573 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5574 Chain = Addr.getValue(1);
5575 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5576 }
5577}
5578
5579static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5580 EVT VT = Op.getValueType();
5581 SDLoc dl(Op);
5582
5583 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5584 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5585 return Op;
5586 return DAG.UnrollVectorOp(Op.getNode());
5587 }
5588
5589 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5590
5591 EVT NewTy;
5592 const EVT OpTy = Op.getOperand(0).getValueType();
5593 if (OpTy == MVT::v4f32)
5594 NewTy = MVT::v4i32;
5595 else if (OpTy == MVT::v4f16 && HasFullFP16)
5596 NewTy = MVT::v4i16;
5597 else if (OpTy == MVT::v8f16 && HasFullFP16)
5598 NewTy = MVT::v8i16;
5599 else
5600 llvm_unreachable("Invalid type for custom lowering!");
5601
5602 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5603 return DAG.UnrollVectorOp(Op.getNode());
5604
5605 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5606 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5607}
5608
5609SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5610 EVT VT = Op.getValueType();
5611 if (VT.isVector())
5612 return LowerVectorFP_TO_INT(Op, DAG);
5613
5614 bool IsStrict = Op->isStrictFPOpcode();
5615 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5616
5617 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5618 RTLIB::Libcall LC;
5619 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5620 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5621 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5622 Op.getValueType());
5623 else
5624 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5625 Op.getValueType());
5626 SDLoc Loc(Op);
5627 MakeLibCallOptions CallOptions;
5628 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5629 SDValue Result;
5630 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5631 CallOptions, Loc, Chain);
5632 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5633 }
5634
5635 // FIXME: Remove this when we have strict fp instruction selection patterns
5636 if (IsStrict) {
5637 SDLoc Loc(Op);
5638 SDValue Result =
5639 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5640 : ISD::FP_TO_UINT,
5641 Loc, Op.getValueType(), SrcVal);
5642 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5643 }
5644
5645 return Op;
5646}
5647
5648static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5649 const ARMSubtarget *Subtarget) {
5650 EVT VT = Op.getValueType();
5651 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5652 EVT FromVT = Op.getOperand(0).getValueType();
5653
5654 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5655 return Op;
5656 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5657 Subtarget->hasFP64())
5658 return Op;
5659 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5660 Subtarget->hasFullFP16())
5661 return Op;
5662 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5663 Subtarget->hasMVEFloatOps())
5664 return Op;
5665 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5666 Subtarget->hasMVEFloatOps())
5667 return Op;
5668
5669 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5670 return SDValue();
5671
5672 SDLoc DL(Op);
5673 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5674 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5675 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5676 DAG.getValueType(VT.getScalarType()));
5677 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5678 DAG.getConstant((1 << BW) - 1, DL, VT));
5679 if (IsSigned)
5680 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5681 DAG.getSignedConstant(-(1 << BW), DL, VT));
5682 return Max;
5683}
5684
5685static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5686 EVT VT = Op.getValueType();
5687 SDLoc dl(Op);
5688
5689 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5690 if (VT.getVectorElementType() == MVT::f32)
5691 return Op;
5692 return DAG.UnrollVectorOp(Op.getNode());
5693 }
5694
5695 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5696 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5697 "Invalid type for custom lowering!");
5698
5699 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5700
5701 EVT DestVecType;
5702 if (VT == MVT::v4f32)
5703 DestVecType = MVT::v4i32;
5704 else if (VT == MVT::v4f16 && HasFullFP16)
5705 DestVecType = MVT::v4i16;
5706 else if (VT == MVT::v8f16 && HasFullFP16)
5707 DestVecType = MVT::v8i16;
5708 else
5709 return DAG.UnrollVectorOp(Op.getNode());
5710
5711 unsigned CastOpc;
5712 unsigned Opc;
5713 switch (Op.getOpcode()) {
5714 default: llvm_unreachable("Invalid opcode!");
5715 case ISD::SINT_TO_FP:
5716 CastOpc = ISD::SIGN_EXTEND;
5717 Opc = ISD::SINT_TO_FP;
5718 break;
5719 case ISD::UINT_TO_FP:
5720 CastOpc = ISD::ZERO_EXTEND;
5721 Opc = ISD::UINT_TO_FP;
5722 break;
5723 }
5724
5725 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5726 return DAG.getNode(Opc, dl, VT, Op);
5727}
5728
5729SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5730 EVT VT = Op.getValueType();
5731 if (VT.isVector())
5732 return LowerVectorINT_TO_FP(Op, DAG);
5733 if (isUnsupportedFloatingType(VT)) {
5734 RTLIB::Libcall LC;
5735 if (Op.getOpcode() == ISD::SINT_TO_FP)
5736 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5737 Op.getValueType());
5738 else
5739 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5740 Op.getValueType());
5741 MakeLibCallOptions CallOptions;
5742 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5743 CallOptions, SDLoc(Op)).first;
5744 }
5745
5746 return Op;
5747}
5748
5749SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5750 // Implement fcopysign with a fabs and a conditional fneg.
5751 SDValue Tmp0 = Op.getOperand(0);
5752 SDValue Tmp1 = Op.getOperand(1);
5753 SDLoc dl(Op);
5754 EVT VT = Op.getValueType();
5755 EVT SrcVT = Tmp1.getValueType();
5756 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5757 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5758 bool UseNEON = !InGPR && Subtarget->hasNEON();
5759
5760 if (UseNEON) {
5761 // Use VBSL to copy the sign bit.
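 // The modified immediate (op=0x6, value=0x80) used below corresponds to a
 // v2i32 splat of 0x80000000, i.e. just the f32 sign bit in each lane; for
 // f64 the mask is shifted up to bit 63 further down.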
5762 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5763 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5764 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5765 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5766 if (VT == MVT::f64)
5767 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5768 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5769 DAG.getConstant(32, dl, MVT::i32));
5770 else /*if (VT == MVT::f32)*/
5771 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5772 if (SrcVT == MVT::f32) {
5773 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5774 if (VT == MVT::f64)
5775 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5776 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5777 DAG.getConstant(32, dl, MVT::i32));
5778 } else if (VT == MVT::f32)
5779 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5780 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5781 DAG.getConstant(32, dl, MVT::i32));
5782 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5783 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5784
5785 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5786 dl, MVT::i32);
5787 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5788 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5789 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5790
5791 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5792 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5793 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5794 if (VT == MVT::f32) {
5795 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5796 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5797 DAG.getConstant(0, dl, MVT::i32));
5798 } else {
5799 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5800 }
5801
5802 return Res;
5803 }
5804
5805 // Bitcast operand 1 to i32.
5806 if (SrcVT == MVT::f64)
5807 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5808 Tmp1).getValue(1);
5809 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5810
5811 // Or in the signbit with integer operations.
5812 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5813 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5814 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5815 if (VT == MVT::f32) {
5816 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5817 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5818 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5819 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5820 }
5821
5822 // f64: Or the high part with signbit and then combine two parts.
5823 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5824 Tmp0);
5825 SDValue Lo = Tmp0.getValue(0);
5826 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5827 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5828 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5829}
5830
5831SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5832 MachineFunction &MF = DAG.getMachineFunction();
5833 MachineFrameInfo &MFI = MF.getFrameInfo();
5834 MFI.setReturnAddressIsTaken(true);
5835
5836 EVT VT = Op.getValueType();
5837 SDLoc dl(Op);
5838 unsigned Depth = Op.getConstantOperandVal(0);
5839 if (Depth) {
5840 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5841 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5842 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5843 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5844 MachinePointerInfo());
5845 }
5846
5847 // Return LR, which contains the return address. Mark it an implicit live-in.
5848 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5849 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5850}
5851
5852SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5853 const ARMBaseRegisterInfo &ARI =
5854 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5855 MachineFunction &MF = DAG.getMachineFunction();
5856 MachineFrameInfo &MFI = MF.getFrameInfo();
5857 MFI.setFrameAddressIsTaken(true);
5858
5859 EVT VT = Op.getValueType();
5860 SDLoc dl(Op); // FIXME probably not meaningful
5861 unsigned Depth = Op.getConstantOperandVal(0);
5862 Register FrameReg = ARI.getFrameRegister(MF);
5863 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5864 while (Depth--)
5865 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5866 MachinePointerInfo());
5867 return FrameAddr;
5868}
5869
5870// FIXME? Maybe this could be a TableGen attribute on some registers and
5871// this table could be generated automatically from RegInfo.
5872Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5873 const MachineFunction &MF) const {
5874 return StringSwitch<Register>(RegName)
5875 .Case("sp", ARM::SP)
5876 .Default(Register());
5877}
5878
5879// The result is a 64-bit value, so split it into two 32-bit values and
5880// return them as a pair of values.
5881static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5882 SelectionDAG &DAG) {
5883 SDLoc DL(N);
5884
5885 // This function is only supposed to be called for i64 type destination.
5886 assert(N->getValueType(0) == MVT::i64
5887 && "ExpandREAD_REGISTER called for non-i64 type result.");
5888
5889 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5890 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5891 N->getOperand(0),
5892 N->getOperand(1));
5893
5894 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5895 Read.getValue(1)));
5896 Results.push_back(Read.getValue(2)); // Chain
5897}
5898
5899/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5900/// When \p DstVT, the destination type of \p BC, is on the vector
5901/// register bank and the source of the bitcast, \p Op, operates on the same
5902/// bank, it might be possible to combine them, such that everything stays on
5903/// the vector register bank.
5904/// \returns The node that would replace \p BC, if the combine
5905/// is possible.
5906static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5907 SelectionDAG &DAG) {
5908 SDValue Op = BC->getOperand(0);
5909 EVT DstVT = BC->getValueType(0);
5910
5911 // The only vector instruction that can produce a scalar (remember,
5912 // since the bitcast was about to be turned into VMOVDRR, the source
5913 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5914 // Moreover, we can do this combine only if there is one use.
5915 // Finally, if the destination type is not a vector, there is not
5916 // much point in forcing everything onto the vector bank.
5917 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5918 !Op.hasOneUse())
5919 return SDValue();
5920
5921 // If the index is not constant, we will introduce an additional
5922 // multiply that will stick.
5923 // Give up in that case.
5924 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5925 if (!Index)
5926 return SDValue();
5927 unsigned DstNumElt = DstVT.getVectorNumElements();
5928
5929 // Compute the new index.
5930 const APInt &APIntIndex = Index->getAPIntValue();
5931 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5932 NewIndex *= APIntIndex;
5933 // Check if the new constant index fits into i32.
5934 if (NewIndex.getBitWidth() > 32)
5935 return SDValue();
5936
5937 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5938 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
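 // For example, a v2f32 bitcast of (i64 extractelt v2i64 src, 1) becomes
 // (v2f32 extract_subvector (v4f32 bitcast src), 2).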
5939 SDLoc dl(Op);
5940 SDValue ExtractSrc = Op.getOperand(0);
5941 EVT VecVT = EVT::getVectorVT(
5942 *DAG.getContext(), DstVT.getScalarType(),
5943 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
5944 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
5945 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
5946 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
5947}
5948
5949/// ExpandBITCAST - If the target supports VFP, this function is called to
5950/// expand a bit convert where either the source or destination type is i64 to
5951/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
5952/// operand type is illegal (e.g., v2f32 for a target that doesn't support
5953/// vectors), since the legalizer won't know what to do with that.
5954SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
5955 const ARMSubtarget *Subtarget) const {
5956 SDLoc dl(N);
5957 SDValue Op = N->getOperand(0);
5958
5959 // This function is only supposed to be called for i16 and i64 types, either
5960 // as the source or destination of the bit convert.
5961 EVT SrcVT = Op.getValueType();
5962 EVT DstVT = N->getValueType(0);
5963
5964 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
5965 (DstVT == MVT::f16 || DstVT == MVT::bf16))
5966 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
5967 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
5968
5969 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
5970 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
5971 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
5972 Op = DAG.getBitcast(MVT::f16, Op);
5973 return DAG.getNode(
5974 ISD::TRUNCATE, SDLoc(N), DstVT,
5975 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
5976 }
5977
5978 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
5979 return SDValue();
5980
5981 // Turn i64->f64 into VMOVDRR.
5982 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
5983 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
5984 // if we can combine the bitcast with its source.
5985 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
5986 return Val;
5987 SDValue Lo, Hi;
5988 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
5989 return DAG.getNode(ISD::BITCAST, dl, DstVT,
5990 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
5991 }
5992
5993 // Turn f64->i64 into VMOVRRD.
5994 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
5995 SDValue Cvt;
5996 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
5997 SrcVT.getVectorNumElements() > 1)
5998 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
5999 DAG.getVTList(MVT::i32, MVT::i32),
6000 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6001 else
6002 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6003 DAG.getVTList(MVT::i32, MVT::i32), Op);
6004 // Merge the pieces into a single i64 value.
6005 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6006 }
6007
6008 return SDValue();
6009}
6010
6011/// getZeroVector - Returns a vector of specified type with all zero elements.
6012/// Zero vectors are used to represent vector negation and in those cases
6013/// will be implemented with the NEON VNEG instruction. However, VNEG does
6014/// not support i64 elements, so sometimes the zero vectors will need to be
6015/// explicitly constructed. Regardless, use a canonical VMOV to create the
6016/// zero vector.
6017static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6018 assert(VT.isVector() && "Expected a vector type");
6019 // The canonical modified immediate encoding of a zero vector is....0!
6020 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6021 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6022 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6023 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6024}
6025
6026/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
6027/// i32 values and take a 2 x i32 value to shift plus a shift amount.
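/// For a shift amount n < 32 the result is
///   Lo = (Lo >> n) | (Hi << (32 - n)) and Hi = Hi >> n (arithmetic or
///   logical); for n >= 32 it is Lo = Hi >> (n - 32) and Hi = sign-fill
///   (SRA) or 0 (SRL). The CMOVs below select between the two cases.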
6028SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6029 SelectionDAG &DAG) const {
6030 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6031 EVT VT = Op.getValueType();
6032 unsigned VTBits = VT.getSizeInBits();
6033 SDLoc dl(Op);
6034 SDValue ShOpLo = Op.getOperand(0);
6035 SDValue ShOpHi = Op.getOperand(1);
6036 SDValue ShAmt = Op.getOperand(2);
6037 SDValue ARMcc;
6038 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6039
6040 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6041
6042 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6043 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6044 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6045 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6046 DAG.getConstant(VTBits, dl, MVT::i32));
6047 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6048 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6049 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6050 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6051 ISD::SETGE, ARMcc, DAG, dl);
6052 SDValue Lo =
6053 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6054
6055 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6056 SDValue HiBigShift = Opc == ISD::SRA
6057 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6058 DAG.getConstant(VTBits - 1, dl, VT))
6059 : DAG.getConstant(0, dl, VT);
6060 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6061 ISD::SETGE, ARMcc, DAG, dl);
6062 SDValue Hi =
6063 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6064
6065 SDValue Ops[2] = { Lo, Hi };
6066 return DAG.getMergeValues(Ops, dl);
6067}
6068
6069/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6070/// i32 values and takes a 2 x i32 value to shift plus a shift amount.
6071SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6072 SelectionDAG &DAG) const {
6073 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6074 EVT VT = Op.getValueType();
6075 unsigned VTBits = VT.getSizeInBits();
6076 SDLoc dl(Op);
6077 SDValue ShOpLo = Op.getOperand(0);
6078 SDValue ShOpHi = Op.getOperand(1);
6079 SDValue ShAmt = Op.getOperand(2);
6080 SDValue ARMcc;
6081
6082 assert(Op.getOpcode() == ISD::SHL_PARTS);
6083 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6084 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6085 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6086 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6087 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6088
6089 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6090 DAG.getConstant(VTBits, dl, MVT::i32));
6091 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6092 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6093 ISD::SETGE, ARMcc, DAG, dl);
6094 SDValue Hi =
6095 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6096
6097 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6098 ISD::SETGE, ARMcc, DAG, dl);
6099 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6100 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6101 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6102
6103 SDValue Ops[2] = { Lo, Hi };
6104 return DAG.getMergeValues(Ops, dl);
6105}
6106
6107SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6108 SelectionDAG &DAG) const {
6109 // The rounding mode is in bits 23:22 of the FPSCR.
6110 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6111 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
6112 // so that the shift + and get folded into a bitfield extract.
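 // For example, FPSCR rounding bits 0b10 (round toward minus infinity) map to
 // ((2 + 1) & 3) == 3, the FLT_ROUNDS value for downward rounding, and 0b11
 // (round toward zero) maps to ((3 + 1) & 3) == 0.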
6113 SDLoc dl(Op);
6114 SDValue Chain = Op.getOperand(0);
6115 SDValue Ops[] = {Chain,
6116 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6117
6118 SDValue FPSCR =
6119 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6120 Chain = FPSCR.getValue(1);
6121 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6122 DAG.getConstant(1U << 22, dl, MVT::i32));
6123 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6124 DAG.getConstant(22, dl, MVT::i32));
6125 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6126 DAG.getConstant(3, dl, MVT::i32));
6127 return DAG.getMergeValues({And, Chain}, dl);
6128}
6129
6130SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6131 SelectionDAG &DAG) const {
6132 SDLoc DL(Op);
6133 SDValue Chain = Op->getOperand(0);
6134 SDValue RMValue = Op->getOperand(1);
6135
6136 // The rounding mode is in bits 23:22 of the FPSCR.
6137 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6138 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6139 // ((arg - 1) & 3) << 22.
6140 //
6141 // It is expected that the argument of llvm.set.rounding is within the
6142 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6143 // responsibility of the code that generates llvm.set.rounding to ensure
6144 // this condition.
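 // For example, llvm.set.rounding(2) (round toward plus infinity) yields
 // ((2 - 1) & 3) == 1, the ARM RP encoding, which is then shifted into
 // bits 23:22 below.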
6145
6146 // Calculate new value of FPSCR[23:22].
6147 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6148 DAG.getConstant(1, DL, MVT::i32));
6149 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6150 DAG.getConstant(0x3, DL, MVT::i32));
6151 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6152 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6153
6154 // Get current value of FPSCR.
6155 SDValue Ops[] = {Chain,
6156 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6157 SDValue FPSCR =
6158 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6159 Chain = FPSCR.getValue(1);
6160 FPSCR = FPSCR.getValue(0);
6161
6162 // Put new rounding mode into FPSCR[23:22].
6163 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6164 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6165 DAG.getConstant(RMMask, DL, MVT::i32));
6166 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6167 SDValue Ops2[] = {
6168 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6169 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6170}
6171
6172SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6173 SelectionDAG &DAG) const {
6174 SDLoc DL(Op);
6175 SDValue Chain = Op->getOperand(0);
6176 SDValue Mode = Op->getOperand(1);
6177
6178 // Generate nodes to build:
6179 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6180 SDValue Ops[] = {Chain,
6181 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6182 SDValue FPSCR =
6183 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6184 Chain = FPSCR.getValue(1);
6185 FPSCR = FPSCR.getValue(0);
6186
6187 SDValue FPSCRMasked =
6188 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6189 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6190 SDValue InputMasked =
6191 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6192 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6193 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6194
6195 SDValue Ops2[] = {
6196 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6197 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6198}
6199
6200SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6201 SelectionDAG &DAG) const {
6202 SDLoc DL(Op);
6203 SDValue Chain = Op->getOperand(0);
6204
6205 // To get the default FP mode all control bits are cleared:
6206 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6207 SDValue Ops[] = {Chain,
6208 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6209 SDValue FPSCR =
6210 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6211 Chain = FPSCR.getValue(1);
6212 FPSCR = FPSCR.getValue(0);
6213
6214 SDValue FPSCRMasked = DAG.getNode(
6215 ISD::AND, DL, MVT::i32, FPSCR,
6216 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6217 SDValue Ops2[] = {Chain,
6218 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6219 FPSCRMasked};
6220 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6221}
6222
6223static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6224 const ARMSubtarget *ST) {
6225 SDLoc dl(N);
6226 EVT VT = N->getValueType(0);
6227 if (VT.isVector() && ST->hasNEON()) {
6228
6229 // Compute the least significant set bit: LSB = X & -X
6230 SDValue X = N->getOperand(0);
6231 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6232 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
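 // For example, for an i8 lane x = 0b01101000: -x = 0b10011000, so
 // LSB = x & -x = 0b00001000 and ctpop(LSB - 1) = ctpop(0b00000111) = 3,
 // which is cttz(x).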
6233
6234 EVT ElemTy = VT.getVectorElementType();
6235
6236 if (ElemTy == MVT::i8) {
6237 // Compute with: cttz(x) = ctpop(lsb - 1)
6238 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6239 DAG.getTargetConstant(1, dl, ElemTy));
6240 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6241 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6242 }
6243
6244 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6245 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6246 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6247 unsigned NumBits = ElemTy.getSizeInBits();
6248 SDValue WidthMinus1 =
6249 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6250 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6251 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6252 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6253 }
6254
6255 // Compute with: cttz(x) = ctpop(lsb - 1)
6256
6257 // Compute LSB - 1.
6258 SDValue Bits;
6259 if (ElemTy == MVT::i64) {
6260 // Load constant 0xffff'ffff'ffff'ffff to register.
6261 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6262 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6263 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6264 } else {
6265 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6266 DAG.getTargetConstant(1, dl, ElemTy));
6267 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6268 }
6269 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6270 }
6271
6272 if (!ST->hasV6T2Ops())
6273 return SDValue();
6274
6275 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6276 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6277}
6278
6279static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6280 const ARMSubtarget *ST) {
6281 EVT VT = N->getValueType(0);
6282 SDLoc DL(N);
6283
6284 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6285 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6286 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6287 "Unexpected type for custom ctpop lowering");
6288
6289 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6290 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6291 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6292 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6293
6294 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
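 // For example, for VT == v4i32 this goes v16i8 -> vpaddlu -> v8i16 ->
 // vpaddlu -> v4i32.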
6295 unsigned EltSize = 8;
6296 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6297 while (EltSize != VT.getScalarSizeInBits()) {
6298 SmallVector<SDValue, 8> Ops;
6299 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6300 TLI.getPointerTy(DAG.getDataLayout())));
6301 Ops.push_back(Res);
6302
6303 EltSize *= 2;
6304 NumElts /= 2;
6305 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6306 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6307 }
6308
6309 return Res;
6310}
6311
6312/// getVShiftImm - Check if this is a valid build_vector for the immediate
6313/// operand of a vector shift operation, where all the elements of the
6314/// build_vector must have the same constant integer value.
6315static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6316 // Ignore bit_converts.
6317 while (Op.getOpcode() == ISD::BITCAST)
6318 Op = Op.getOperand(0);
6319 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6320 APInt SplatBits, SplatUndef;
6321 unsigned SplatBitSize;
6322 bool HasAnyUndefs;
6323 if (!BVN ||
6324 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6325 ElementBits) ||
6326 SplatBitSize > ElementBits)
6327 return false;
6328 Cnt = SplatBits.getSExtValue();
6329 return true;
6330}
6331
6332/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6333/// operand of a vector shift left operation. That value must be in the range:
6334/// 0 <= Value < ElementBits for a left shift; or
6335/// 0 <= Value <= ElementBits for a long left shift.
6336static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6337 assert(VT.isVector() && "vector shift count is not a vector type");
6338 int64_t ElementBits = VT.getScalarSizeInBits();
6339 if (!getVShiftImm(Op, ElementBits, Cnt))
6340 return false;
6341 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6342}
6343
6344/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6345/// operand of a vector shift right operation. For a shift opcode, the value
6346/// is positive, but for an intrinsic the value must be negative. The
6347/// absolute value must be in the range:
6348/// 1 <= |Value| <= ElementBits for a right shift; or
6349/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6350static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6351 int64_t &Cnt) {
6352 assert(VT.isVector() && "vector shift count is not a vector type");
6353 int64_t ElementBits = VT.getScalarSizeInBits();
6354 if (!getVShiftImm(Op, ElementBits, Cnt))
6355 return false;
6356 if (!isIntrinsic)
6357 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6358 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6359 Cnt = -Cnt;
6360 return true;
6361 }
6362 return false;
6363}
6364
6365static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6366 const ARMSubtarget *ST) {
6367 EVT VT = N->getValueType(0);
6368 SDLoc dl(N);
6369 int64_t Cnt;
6370
6371 if (!VT.isVector())
6372 return SDValue();
6373
6374 // We essentially have two forms here. Shift by an immediate and shift by a
6375 // vector register (there are also shifts by a GPR, but those are just handled
6376 // with a tablegen pattern). We cannot easily match shift by an immediate in
6377 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6378 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6379 // signed or unsigned, and a negative shift indicates a shift right).
6380 if (N->getOpcode() == ISD::SHL) {
6381 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6382 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6383 DAG.getConstant(Cnt, dl, MVT::i32));
6384 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6385 N->getOperand(1));
6386 }
6387
6388 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6389 "unexpected vector shift opcode");
6390
6391 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6392 unsigned VShiftOpc =
6393 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6394 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6395 DAG.getConstant(Cnt, dl, MVT::i32));
6396 }
6397
6398 // Other right shifts we don't have operations for (we use a shift left by a
6399 // negative number).
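 // For example, an SRL by a vector amount n becomes VSHLu with -n, and an SRA
 // becomes VSHLs with -n.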
6400 EVT ShiftVT = N->getOperand(1).getValueType();
6401 SDValue NegatedCount = DAG.getNode(
6402 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6403 unsigned VShiftOpc =
6404 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6405 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6406}
6407
6408static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6409 const ARMSubtarget *ST) {
6410 EVT VT = N->getValueType(0);
6411 SDLoc dl(N);
6412
6413 // We can get here for a node like i32 = ISD::SHL i32, i64
6414 if (VT != MVT::i64)
6415 return SDValue();
6416
6417 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6418 N->getOpcode() == ISD::SHL) &&
6419 "Unknown shift to lower!");
6420
6421 unsigned ShOpc = N->getOpcode();
6422 if (ST->hasMVEIntegerOps()) {
6423 SDValue ShAmt = N->getOperand(1);
6424 unsigned ShPartsOpc = ARMISD::LSLL;
6425 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6426
6427 // If the shift amount is a constant that is zero or at least 32, or a
6428 // non-constant amount wider than 64 bits, then do the default optimisation.
6429 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6430 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6431 return SDValue();
6432
6433 // Extract the lower 32 bits of the shift amount if it's not an i32
6434 if (ShAmt->getValueType(0) != MVT::i32)
6435 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6436
6437 if (ShOpc == ISD::SRL) {
6438 if (!Con)
6439 // There is no t2LSRLr instruction so negate and perform an lsll if the
6440 // shift amount is in a register, emulating a right shift.
6441 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6442 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6443 else
6444 // Else generate an lsrl on the immediate shift amount
6445 ShPartsOpc = ARMISD::LSRL;
6446 } else if (ShOpc == ISD::SRA)
6447 ShPartsOpc = ARMISD::ASRL;
6448
6449 // Split Lower/Upper 32 bits of the destination/source
6450 SDValue Lo, Hi;
6451 std::tie(Lo, Hi) =
6452 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6453 // Generate the shift operation as computed above
6454 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6455 ShAmt);
6456 // The upper 32 bits come from the second return value of lsll
6457 Hi = SDValue(Lo.getNode(), 1);
6458 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6459 }
6460
6461 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6462 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6463 return SDValue();
6464
6465 // If we are in thumb mode, we don't have RRX.
6466 if (ST->isThumb1Only())
6467 return SDValue();
6468
6469 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6470 SDValue Lo, Hi;
6471 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6472
6473 // First, build an LSRS1/ASRS1 op, which shifts the top part by one and
6474 // captures the shifted-out bit in the carry flag.
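 // For example, for an i64 logical shift right by one, LSRS1 shifts the high
 // word right by one and leaves its old bit 0 in the carry; RRX then shifts
 // the low word right by one, inserting that carry as the new bit 31.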
6475 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6476 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6477
6478 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6479 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6480
6481 // Merge the pieces into a single i64 value.
6482 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6483}
6484
6485static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6486 const ARMSubtarget *ST) {
6487 bool Invert = false;
6488 bool Swap = false;
6489 unsigned Opc = ARMCC::AL;
6490
6491 SDValue Op0 = Op.getOperand(0);
6492 SDValue Op1 = Op.getOperand(1);
6493 SDValue CC = Op.getOperand(2);
6494 EVT VT = Op.getValueType();
6495 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6496 SDLoc dl(Op);
6497
6498 EVT CmpVT;
6499 if (ST->hasNEON())
6500 CmpVT = VT.changeVectorElementTypeToInteger();
6501 else {
6502 assert(ST->hasMVEIntegerOps() &&
6503 "No hardware support for integer vector comparison!");
6504
6505 if (Op.getValueType().getVectorElementType() != MVT::i1)
6506 return SDValue();
6507
6508 // Make sure we expand floating point setcc to scalar if we do not have
6509 // mve.fp, so that we can handle them from there.
6510 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6511 return SDValue();
6512
6513 CmpVT = VT;
6514 }
6515
6516 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6517 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6518 // Special-case integer 64-bit equality comparisons. They aren't legal,
6519 // but they can be lowered with a few vector instructions.
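 // Each i64 lane is equal iff both of its i32 halves are equal; the VREV64
 // and AND below combine the per-half i32 results so a lane is all-ones only
 // when both halves matched.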
6520 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6521 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6522 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6523 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6524 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6525 DAG.getCondCode(ISD::SETEQ));
6526 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6527 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6528 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6529 if (SetCCOpcode == ISD::SETNE)
6530 Merged = DAG.getNOT(dl, Merged, CmpVT);
6531 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6532 return Merged;
6533 }
6534
6535 if (CmpVT.getVectorElementType() == MVT::i64)
6536 // 64-bit comparisons are not legal in general.
6537 return SDValue();
6538
6539 if (Op1.getValueType().isFloatingPoint()) {
6540 switch (SetCCOpcode) {
6541 default: llvm_unreachable("Illegal FP comparison");
6542 case ISD::SETUNE:
6543 case ISD::SETNE:
6544 if (ST->hasMVEFloatOps()) {
6545 Opc = ARMCC::NE; break;
6546 } else {
6547 Invert = true; [[fallthrough]];
6548 }
6549 case ISD::SETOEQ:
6550 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6551 case ISD::SETOLT:
6552 case ISD::SETLT: Swap = true; [[fallthrough]];
6553 case ISD::SETOGT:
6554 case ISD::SETGT: Opc = ARMCC::GT; break;
6555 case ISD::SETOLE:
6556 case ISD::SETLE: Swap = true; [[fallthrough]];
6557 case ISD::SETOGE:
6558 case ISD::SETGE: Opc = ARMCC::GE; break;
6559 case ISD::SETUGE: Swap = true; [[fallthrough]];
6560 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6561 case ISD::SETUGT: Swap = true; [[fallthrough]];
6562 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6563 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6564 case ISD::SETONE: {
6565 // Expand this to (OLT | OGT).
6566 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6567 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6568 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6569 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6570 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6571 if (Invert)
6572 Result = DAG.getNOT(dl, Result, VT);
6573 return Result;
6574 }
6575 case ISD::SETUO: Invert = true; [[fallthrough]];
6576 case ISD::SETO: {
6577 // Expand this to (OLT | OGE).
6578 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6579 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6580 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6581 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6582 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6583 if (Invert)
6584 Result = DAG.getNOT(dl, Result, VT);
6585 return Result;
6586 }
6587 }
6588 } else {
6589 // Integer comparisons.
6590 switch (SetCCOpcode) {
6591 default: llvm_unreachable("Illegal integer comparison");
6592 case ISD::SETNE:
6593 if (ST->hasMVEIntegerOps()) {
6594 Opc = ARMCC::NE; break;
6595 } else {
6596 Invert = true; [[fallthrough]];
6597 }
6598 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6599 case ISD::SETLT: Swap = true; [[fallthrough]];
6600 case ISD::SETGT: Opc = ARMCC::GT; break;
6601 case ISD::SETLE: Swap = true; [[fallthrough]];
6602 case ISD::SETGE: Opc = ARMCC::GE; break;
6603 case ISD::SETULT: Swap = true; [[fallthrough]];
6604 case ISD::SETUGT: Opc = ARMCC::HI; break;
6605 case ISD::SETULE: Swap = true; [[fallthrough]];
6606 case ISD::SETUGE: Opc = ARMCC::HS; break;
6607 }
6608
6609 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6610 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6611 SDValue AndOp;
6612 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6613 AndOp = Op0;
6614 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6615 AndOp = Op1;
6616
6617 // Ignore bitconvert.
6618 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6619 AndOp = AndOp.getOperand(0);
6620
6621 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6622 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6623 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6624 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6625 if (!Invert)
6626 Result = DAG.getNOT(dl, Result, VT);
6627 return Result;
6628 }
6629 }
6630 }
6631
6632 if (Swap)
6633 std::swap(Op0, Op1);
6634
6635 // If one of the operands is a constant vector zero, attempt to fold the
6636 // comparison to a specialized compare-against-zero form.
6637 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6638 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6639 Opc == ARMCC::NE)) {
6640 if (Opc == ARMCC::GE)
6641 Opc = ARMCC::LE;
6642 else if (Opc == ARMCC::GT)
6643 Opc = ARMCC::LT;
6644 std::swap(Op0, Op1);
6645 }
6646
6647 SDValue Result;
6648 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6649 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6650 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6651 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6652 DAG.getConstant(Opc, dl, MVT::i32));
6653 else
6654 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6655 DAG.getConstant(Opc, dl, MVT::i32));
6656
6657 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6658
6659 if (Invert)
6660 Result = DAG.getNOT(dl, Result, VT);
6661
6662 return Result;
6663}
6664
6665 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6666 SDValue LHS = Op.getOperand(0);
6667 SDValue RHS = Op.getOperand(1);
6668 SDValue Carry = Op.getOperand(2);
6669 SDValue Cond = Op.getOperand(3);
6670 SDLoc DL(Op);
6671
6672 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6673
6674 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6675 // have to invert the carry first.
6676 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6677 DAG.getConstant(1, DL, MVT::i32), Carry);
6678 // This converts the boolean value carry into the carry flag.
6679 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6680
6681 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6682 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6683
6684 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6685 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6686 SDValue ARMcc = DAG.getConstant(
6687 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6688 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6689 Cmp.getValue(1));
6690}
6691
6692/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6693/// valid vector constant for a NEON or MVE instruction with a "modified
6694/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6695static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6696 unsigned SplatBitSize, SelectionDAG &DAG,
6697 const SDLoc &dl, EVT &VT, EVT VectorVT,
6698 VMOVModImmType type) {
6699 unsigned OpCmode, Imm;
6700 bool is128Bits = VectorVT.is128BitVector();
6701
6702 // SplatBitSize is set to the smallest size that splats the vector, so a
6703 // zero vector will always have SplatBitSize == 8. However, NEON modified
6704 // immediate instructions other than VMOV do not support the 8-bit encoding
6705 // of a zero vector, and the default encoding of zero is supposed to be the
6706 // 32-bit version.
6707 if (SplatBits == 0)
6708 SplatBitSize = 32;
6709
6710 switch (SplatBitSize) {
6711 case 8:
6712 if (type != VMOVModImm)
6713 return SDValue();
6714 // Any 1-byte value is OK. Op=0, Cmode=1110.
6715 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6716 OpCmode = 0xe;
6717 Imm = SplatBits;
6718 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6719 break;
6720
6721 case 16:
6722 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6723 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6724 if ((SplatBits & ~0xff) == 0) {
6725 // Value = 0x00nn: Op=x, Cmode=100x.
6726 OpCmode = 0x8;
6727 Imm = SplatBits;
6728 break;
6729 }
6730 if ((SplatBits & ~0xff00) == 0) {
6731 // Value = 0xnn00: Op=x, Cmode=101x.
6732 OpCmode = 0xa;
6733 Imm = SplatBits >> 8;
6734 break;
6735 }
6736 return SDValue();
6737
6738 case 32:
6739 // NEON's 32-bit VMOV supports splat values where:
6740 // * only one byte is nonzero, or
6741 // * the least significant byte is 0xff and the second byte is nonzero, or
6742 // * the least significant 2 bytes are 0xff and the third is nonzero.
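// For example, a 32-bit splat of 0x0000ab00 (only byte 1 nonzero) is encoded
// below as Op=x, Cmode=001x with Imm=0xab.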
6743 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6744 if ((SplatBits & ~0xff) == 0) {
6745 // Value = 0x000000nn: Op=x, Cmode=000x.
6746 OpCmode = 0;
6747 Imm = SplatBits;
6748 break;
6749 }
6750 if ((SplatBits & ~0xff00) == 0) {
6751 // Value = 0x0000nn00: Op=x, Cmode=001x.
6752 OpCmode = 0x2;
6753 Imm = SplatBits >> 8;
6754 break;
6755 }
6756 if ((SplatBits & ~0xff0000) == 0) {
6757 // Value = 0x00nn0000: Op=x, Cmode=010x.
6758 OpCmode = 0x4;
6759 Imm = SplatBits >> 16;
6760 break;
6761 }
6762 if ((SplatBits & ~0xff000000) == 0) {
6763 // Value = 0xnn000000: Op=x, Cmode=011x.
6764 OpCmode = 0x6;
6765 Imm = SplatBits >> 24;
6766 break;
6767 }
6768
6769 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6770 if (type == OtherModImm) return SDValue();
6771
6772 if ((SplatBits & ~0xffff) == 0 &&
6773 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6774 // Value = 0x0000nnff: Op=x, Cmode=1100.
6775 OpCmode = 0xc;
6776 Imm = SplatBits >> 8;
6777 break;
6778 }
6779
6780 // cmode == 0b1101 is not supported for MVE VMVN
6781 if (type == MVEVMVNModImm)
6782 return SDValue();
6783
6784 if ((SplatBits & ~0xffffff) == 0 &&
6785 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6786 // Value = 0x00nnffff: Op=x, Cmode=1101.
6787 OpCmode = 0xd;
6788 Imm = SplatBits >> 16;
6789 break;
6790 }
6791
6792 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6793 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6794 // VMOV.I32. A (very) minor optimization would be to replicate the value
6795 // and fall through here to test for a valid 64-bit splat. But, then the
6796 // caller would also need to check and handle the change in size.
6797 return SDValue();
6798
6799 case 64: {
6800 if (type != VMOVModImm)
6801 return SDValue();
6802 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
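// For example, the splat value 0x00ff00ff00ff00ff yields Imm = 0b01010101:
// one bit per byte, least significant byte first, set when that byte is 0xff.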
6803 uint64_t BitMask = 0xff;
6804 unsigned ImmMask = 1;
6805 Imm = 0;
6806 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6807 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6808 Imm |= ImmMask;
6809 } else if ((SplatBits & BitMask) != 0) {
6810 return SDValue();
6811 }
6812 BitMask <<= 8;
6813 ImmMask <<= 1;
6814 }
6815
6816 // Op=1, Cmode=1110.
6817 OpCmode = 0x1e;
6818 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6819 break;
6820 }
6821
6822 default:
6823 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6824 }
6825
6826 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6827 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6828}
6829
6830SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6831 const ARMSubtarget *ST) const {
6832 EVT VT = Op.getValueType();
6833 bool IsDouble = (VT == MVT::f64);
6834 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6835 const APFloat &FPVal = CFP->getValueAPF();
6836
6837 // Prevent floating-point constants from using literal loads
6838 // when execute-only is enabled.
6839 if (ST->genExecuteOnly()) {
6840 // We shouldn't trigger this for v6m execute-only
6841 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6842 "Unexpected architecture");
6843
6844 // If we can represent the constant as an immediate, don't lower it
6845 if (isFPImmLegal(FPVal, VT))
6846 return Op;
6847 // Otherwise, construct as integer, and move to float register
6848 APInt INTVal = FPVal.bitcastToAPInt();
6849 SDLoc DL(CFP);
6850 switch (VT.getSimpleVT().SimpleTy) {
6851 default:
6852 llvm_unreachable("Unknown floating point type!");
6853 break;
6854 case MVT::f64: {
6855 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6856 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6857 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6858 }
6859 case MVT::f32:
6860 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6861 DAG.getConstant(INTVal, DL, MVT::i32));
6862 }
6863 }
6864
6865 if (!ST->hasVFP3Base())
6866 return SDValue();
6867
6868 // Use the default (constant pool) lowering for double constants when we have
6869 // an SP-only FPU
6870 if (IsDouble && !Subtarget->hasFP64())
6871 return SDValue();
6872
6873 // Try splatting with a VMOV.f32...
6874 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6875
6876 if (ImmVal != -1) {
6877 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6878 // We have code in place to select a valid ConstantFP already, no need to
6879 // do any mangling.
6880 return Op;
6881 }
6882
6883 // It's a float and we are trying to use NEON operations where
6884 // possible. Lower it to a splat followed by an extract.
6885 SDLoc DL(Op);
6886 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6887 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6888 NewVal);
6889 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6890 DAG.getConstant(0, DL, MVT::i32));
6891 }
6892
6893 // The rest of our options are NEON only; make sure that's allowed before
6894 // proceeding.
6895 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6896 return SDValue();
6897
6898 EVT VMovVT;
6899 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6900
6901 // It wouldn't really be worth bothering for doubles except for one very
6902 // important value, which does happen to match: 0.0. So make sure we don't do
6903 // anything stupid.
6904 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6905 return SDValue();
6906
6907 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6908 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6909 VMovVT, VT, VMOVModImm);
6910 if (NewVal != SDValue()) {
6911 SDLoc DL(Op);
6912 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6913 NewVal);
6914 if (IsDouble)
6915 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6916
6917 // It's a float: cast and extract a vector element.
6918 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6919 VecConstant);
6920 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6921 DAG.getConstant(0, DL, MVT::i32));
6922 }
6923
6924 // Finally, try a VMVN.i32
6925 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6926 VT, VMVNModImm);
6927 if (NewVal != SDValue()) {
6928 SDLoc DL(Op);
6929 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6930
6931 if (IsDouble)
6932 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6933
6934 // It's a float: cast and extract a vector element.
6935 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6936 VecConstant);
6937 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6938 DAG.getConstant(0, DL, MVT::i32));
6939 }
6940
6941 return SDValue();
6942}
6943
6944 // Check if a VEXT instruction can handle the shuffle mask when the
6945// vector sources of the shuffle are the same.
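// For example, a v4 mask of <2, 3, 0, 1> is handled with Imm = 2; the
// expected index simply wraps from 3 back around to 0.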
6946static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
6947 unsigned NumElts = VT.getVectorNumElements();
6948
6949 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6950 if (M[0] < 0)
6951 return false;
6952
6953 Imm = M[0];
6954
6955 // If this is a VEXT shuffle, the immediate value is the index of the first
6956 // element. The other shuffle indices must be the successive elements after
6957 // the first one.
6958 unsigned ExpectedElt = Imm;
6959 for (unsigned i = 1; i < NumElts; ++i) {
6960 // Increment the expected index. If it wraps around, just follow it
6961 // back to index zero and keep going.
6962 ++ExpectedElt;
6963 if (ExpectedElt == NumElts)
6964 ExpectedElt = 0;
6965
6966 if (M[i] < 0) continue; // ignore UNDEF indices
6967 if (ExpectedElt != static_cast<unsigned>(M[i]))
6968 return false;
6969 }
6970
6971 return true;
6972}
6973
6974static bool isVEXTMask(ArrayRef<int> M, EVT VT,
6975 bool &ReverseVEXT, unsigned &Imm) {
6976 unsigned NumElts = VT.getVectorNumElements();
6977 ReverseVEXT = false;
6978
6979 // Assume that the first shuffle index is not UNDEF. Fail if it is.
6980 if (M[0] < 0)
6981 return false;
6982
6983 Imm = M[0];
6984
6985 // If this is a VEXT shuffle, the immediate value is the index of the first
6986 // element. The other shuffle indices must be the successive elements after
6987 // the first one.
6988 unsigned ExpectedElt = Imm;
6989 for (unsigned i = 1; i < NumElts; ++i) {
6990 // Increment the expected index. If it wraps around, it may still be
6991 // a VEXT but the source vectors must be swapped.
6992 ExpectedElt += 1;
6993 if (ExpectedElt == NumElts * 2) {
6994 ExpectedElt = 0;
6995 ReverseVEXT = true;
6996 }
6997
6998 if (M[i] < 0) continue; // ignore UNDEF indices
6999 if (ExpectedElt != static_cast<unsigned>(M[i]))
7000 return false;
7001 }
7002
7003 // Adjust the index value if the source operands will be swapped.
7004 if (ReverseVEXT)
7005 Imm -= NumElts;
7006
7007 return true;
7008}
7009
7010static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7011 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7012 // range, then 0 is placed into the resulting vector. So pretty much any mask
7013 // of 8 elements can work here.
7014 return VT == MVT::v8i8 && M.size() == 8;
7015}
7016
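// Helper for the two-result shuffle mask checks below: decides whether the
// mask half starting at Index describes the first (0) or second (1) result.
// For a double-length mask this follows from the position; for a single-length
// mask it is inferred from whether the element at Index is 0.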
7017static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7018 unsigned Index) {
7019 if (Mask.size() == Elements * 2)
7020 return Index / Elements;
7021 return Mask[Index] == 0 ? 0 : 1;
7022}
7023
7024// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7025// checking that pairs of elements in the shuffle mask represent the same index
7026// in each vector, incrementing the expected index by 2 at each step.
7027// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7028// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7029// v2={e,f,g,h}
7030// WhichResult gives the offset for each element in the mask based on which
7031// of the two results it belongs to.
7032//
7033// The transpose can be represented either as:
7034// result1 = shufflevector v1, v2, result1_shuffle_mask
7035// result2 = shufflevector v1, v2, result2_shuffle_mask
7036// where v1/v2 and the shuffle masks have the same number of elements
7037// (here WhichResult (see below) indicates which result is being checked)
7038//
7039// or as:
7040// results = shufflevector v1, v2, shuffle_mask
7041// where both results are returned in one vector and the shuffle mask has twice
7042 // as many elements as v1/v2 (here WhichResult will always be 0 if the mask
7043 // matches); in this case we check the low half and the high half of the
7044 // shuffle mask as if each were a separate mask of the first form.
7045static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7046 unsigned EltSz = VT.getScalarSizeInBits();
7047 if (EltSz == 64)
7048 return false;
7049
7050 unsigned NumElts = VT.getVectorNumElements();
7051 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7052 return false;
7053
7054 // If the mask is twice as long as the input vector then we need to check the
7055 // upper and lower parts of the mask with a matching value for WhichResult
7056 // FIXME: A mask with only even values will be rejected in case the first
7057 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7058 // M[0] is used to determine WhichResult
7059 for (unsigned i = 0; i < M.size(); i += NumElts) {
7060 WhichResult = SelectPairHalf(NumElts, M, i);
7061 for (unsigned j = 0; j < NumElts; j += 2) {
7062 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7063 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7064 return false;
7065 }
7066 }
7067
7068 if (M.size() == NumElts*2)
7069 WhichResult = 0;
7070
7071 return true;
7072}
7073
7074/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7075/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7076/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7077static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7078 unsigned EltSz = VT.getScalarSizeInBits();
7079 if (EltSz == 64)
7080 return false;
7081
7082 unsigned NumElts = VT.getVectorNumElements();
7083 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7084 return false;
7085
7086 for (unsigned i = 0; i < M.size(); i += NumElts) {
7087 WhichResult = SelectPairHalf(NumElts, M, i);
7088 for (unsigned j = 0; j < NumElts; j += 2) {
7089 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7090 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7091 return false;
7092 }
7093 }
7094
7095 if (M.size() == NumElts*2)
7096 WhichResult = 0;
7097
7098 return true;
7099}
7100
7101// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7102// that the mask elements are either all even and in steps of size 2 or all odd
7103// and in steps of size 2.
7104// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7105// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7106// v2={e,f,g,h}
7107// Requires similar checks to that of isVTRNMask with
7108 // respect to how the results are returned.
7109static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7110 unsigned EltSz = VT.getScalarSizeInBits();
7111 if (EltSz == 64)
7112 return false;
7113
7114 unsigned NumElts = VT.getVectorNumElements();
7115 if (M.size() != NumElts && M.size() != NumElts*2)
7116 return false;
7117
7118 for (unsigned i = 0; i < M.size(); i += NumElts) {
7119 WhichResult = SelectPairHalf(NumElts, M, i);
7120 for (unsigned j = 0; j < NumElts; ++j) {
7121 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7122 return false;
7123 }
7124 }
7125
7126 if (M.size() == NumElts*2)
7127 WhichResult = 0;
7128
7129 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7130 if (VT.is64BitVector() && EltSz == 32)
7131 return false;
7132
7133 return true;
7134}
7135
7136/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7137/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7138 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7139static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7140 unsigned EltSz = VT.getScalarSizeInBits();
7141 if (EltSz == 64)
7142 return false;
7143
7144 unsigned NumElts = VT.getVectorNumElements();
7145 if (M.size() != NumElts && M.size() != NumElts*2)
7146 return false;
7147
7148 unsigned Half = NumElts / 2;
7149 for (unsigned i = 0; i < M.size(); i += NumElts) {
7150 WhichResult = SelectPairHalf(NumElts, M, i);
7151 for (unsigned j = 0; j < NumElts; j += Half) {
7152 unsigned Idx = WhichResult;
7153 for (unsigned k = 0; k < Half; ++k) {
7154 int MIdx = M[i + j + k];
7155 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7156 return false;
7157 Idx += 2;
7158 }
7159 }
7160 }
7161
7162 if (M.size() == NumElts*2)
7163 WhichResult = 0;
7164
7165 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7166 if (VT.is64BitVector() && EltSz == 32)
7167 return false;
7168
7169 return true;
7170}
7171
7172// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7173// that pairs of elements of the shufflemask represent the same index in each
7174// vector incrementing sequentially through the vectors.
7175// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7176// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7177// v2={e,f,g,h}
7178 // Requires similar checks to that of isVTRNMask with respect to how results
7179// are returned.
7180static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7181 unsigned EltSz = VT.getScalarSizeInBits();
7182 if (EltSz == 64)
7183 return false;
7184
7185 unsigned NumElts = VT.getVectorNumElements();
7186 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7187 return false;
7188
7189 for (unsigned i = 0; i < M.size(); i += NumElts) {
7190 WhichResult = SelectPairHalf(NumElts, M, i);
7191 unsigned Idx = WhichResult * NumElts / 2;
7192 for (unsigned j = 0; j < NumElts; j += 2) {
7193 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7194 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7195 return false;
7196 Idx += 1;
7197 }
7198 }
7199
7200 if (M.size() == NumElts*2)
7201 WhichResult = 0;
7202
7203 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7204 if (VT.is64BitVector() && EltSz == 32)
7205 return false;
7206
7207 return true;
7208}
7209
7210/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7211/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7212/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7213static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7214 unsigned EltSz = VT.getScalarSizeInBits();
7215 if (EltSz == 64)
7216 return false;
7217
7218 unsigned NumElts = VT.getVectorNumElements();
7219 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7220 return false;
7221
7222 for (unsigned i = 0; i < M.size(); i += NumElts) {
7223 WhichResult = SelectPairHalf(NumElts, M, i);
7224 unsigned Idx = WhichResult * NumElts / 2;
7225 for (unsigned j = 0; j < NumElts; j += 2) {
7226 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7227 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7228 return false;
7229 Idx += 1;
7230 }
7231 }
7232
7233 if (M.size() == NumElts*2)
7234 WhichResult = 0;
7235
7236 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7237 if (VT.is64BitVector() && EltSz == 32)
7238 return false;
7239
7240 return true;
7241}
7242
7243/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7244/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7245static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7246 unsigned &WhichResult,
7247 bool &isV_UNDEF) {
7248 isV_UNDEF = false;
7249 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7250 return ARMISD::VTRN;
7251 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7252 return ARMISD::VUZP;
7253 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7254 return ARMISD::VZIP;
7255
7256 isV_UNDEF = true;
7257 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7258 return ARMISD::VTRN;
7259 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7260 return ARMISD::VUZP;
7261 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7262 return ARMISD::VZIP;
7263
7264 return 0;
7265}
7266
7267 /// \return true if this is a reverse operation on a vector.
7268static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7269 unsigned NumElts = VT.getVectorNumElements();
7270 // Make sure the mask has the right size.
7271 if (NumElts != M.size())
7272 return false;
7273
7274 // Look for <15, ..., 3, -1, 1, 0>.
7275 for (unsigned i = 0; i != NumElts; ++i)
7276 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7277 return false;
7278
7279 return true;
7280}
7281
7282static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7283 unsigned NumElts = VT.getVectorNumElements();
7284 // Make sure the mask has the right size.
7285 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7286 return false;
7287
7288 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7289 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7290 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7291 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7292 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7293 int Ofs = Top ? 1 : 0;
7294 int Upper = SingleSource ? 0 : NumElts;
7295 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7296 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7297 return false;
7298 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7299 return false;
7300 }
7301 return true;
7302}
7303
7304static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7305 unsigned NumElts = VT.getVectorNumElements();
7306 // Make sure the mask has the right size.
7307 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7308 return false;
7309
7310 // If Top
7311 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7312 // This inserts Input2 into Input1
7313 // else if not Top
7314 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7315 // This inserts Input1 into Input2
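// For example, with Top set and two v8i16 inputs, the mask
// <0, 8, 2, 10, 4, 12, 6, 14> interleaves the even lanes of Input1 (kept in
// place) with the even lanes of Input2.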
7316 unsigned Offset = Top ? 0 : 1;
7317 unsigned N = SingleSource ? 0 : NumElts;
7318 for (unsigned i = 0; i < NumElts; i += 2) {
7319 if (M[i] >= 0 && M[i] != (int)i)
7320 return false;
7321 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7322 return false;
7323 }
7324
7325 return true;
7326}
7327
7328static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7329 unsigned NumElts = ToVT.getVectorNumElements();
7330 if (NumElts != M.size())
7331 return false;
7332
7333 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7334 // looking for patterns of:
7335 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7336 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7337
7338 unsigned Off0 = rev ? NumElts / 2 : 0;
7339 unsigned Off1 = rev ? 0 : NumElts / 2;
7340 for (unsigned i = 0; i < NumElts; i += 2) {
7341 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7342 return false;
7343 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7344 return false;
7345 }
7346
7347 return true;
7348}
7349
7350// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7351// from a pair of inputs. For example:
7352// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7353// FP_ROUND(EXTRACT_ELT(Y, 0),
7354// FP_ROUND(EXTRACT_ELT(X, 1),
7355// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7356 static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7357 const ARMSubtarget *ST) {
7358 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7359 if (!ST->hasMVEFloatOps())
7360 return SDValue();
7361
7362 SDLoc dl(BV);
7363 EVT VT = BV.getValueType();
7364 if (VT != MVT::v8f16)
7365 return SDValue();
7366
7367 // We are looking for a buildvector of fptrunc elements, where all the
7368 // elements are extracted alternately from two sources. Check that the first two
7369 // items are valid enough and extract some info from them (they are checked
7370 // properly in the loop below).
7371 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7372 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7373 !isa<ConstantSDNode>(BV.getOperand(0).getOperand(0).getOperand(1)))
7374 return SDValue();
7375 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7376 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7377 !isa<ConstantSDNode>(BV.getOperand(1).getOperand(0).getOperand(1)))
7378 return SDValue();
7379 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7380 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7381 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7382 return SDValue();
7383
7384 // Check all the values in the BuildVector line up with our expectations.
7385 for (unsigned i = 1; i < 4; i++) {
7386 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7387 return Trunc.getOpcode() == ISD::FP_ROUND &&
7388 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7389 Trunc.getOperand(0).getOperand(0) == Op &&
7390 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7391 };
7392 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7393 return SDValue();
7394 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7395 return SDValue();
7396 }
7397
7398 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7399 DAG.getConstant(0, dl, MVT::i32));
7400 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7401 DAG.getConstant(1, dl, MVT::i32));
7402}
7403
7404// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7405// from a single input on alternating lanes. For example:
7406// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7407// FP_ROUND(EXTRACT_ELT(X, 2),
7408// FP_ROUND(EXTRACT_ELT(X, 4), ...)
7409 static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7410 const ARMSubtarget *ST) {
7411 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7412 if (!ST->hasMVEFloatOps())
7413 return SDValue();
7414
7415 SDLoc dl(BV);
7416 EVT VT = BV.getValueType();
7417 if (VT != MVT::v4f32)
7418 return SDValue();
7419
7420 // We are looking for a buildvector of fpext elements, where all the
7421 // elements are alternating lanes from a single source. For example <0,2,4,6>
7422 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7423 // info from them (they are checked properly in the loop below).
7424 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7425 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7426 return SDValue();
7427 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7428 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7429 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7430 return SDValue();
7431
7432 // Check all the values in the BuildVector line up with our expectations.
7433 for (unsigned i = 1; i < 4; i++) {
7434 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7435 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7436 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7437 Trunc.getOperand(0).getOperand(0) == Op &&
7438 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7439 };
7440 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7441 return SDValue();
7442 }
7443
7444 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7445 DAG.getConstant(Offset, dl, MVT::i32));
7446}
7447
7448// If N is an integer constant that can be moved into a register in one
7449// instruction, return an SDValue of such a constant (will become a MOV
7450// instruction). Otherwise return null.
7451 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7452 const ARMSubtarget *ST, const SDLoc &dl) {
7453 uint64_t Val;
7454 if (!isa<ConstantSDNode>(N))
7455 return SDValue();
7456 Val = N->getAsZExtVal();
7457
7458 if (ST->isThumb1Only()) {
7459 if (Val <= 255 || ~Val <= 255)
7460 return DAG.getConstant(Val, dl, MVT::i32);
7461 } else {
7462 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7463 return DAG.getConstant(Val, dl, MVT::i32);
7464 }
7465 return SDValue();
7466}
7467
7468 static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7469 const ARMSubtarget *ST) {
7470 SDLoc dl(Op);
7471 EVT VT = Op.getValueType();
7472
7473 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7474
7475 unsigned NumElts = VT.getVectorNumElements();
7476 unsigned BoolMask;
7477 unsigned BitsPerBool;
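// An MVE predicate register is 16 bits wide, so a vector of NumElts i1
// elements uses 16 / NumElts bits per lane; each known boolean below is
// replicated across its lane's whole bit group via BoolMask.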
7478 if (NumElts == 2) {
7479 BitsPerBool = 8;
7480 BoolMask = 0xff;
7481 } else if (NumElts == 4) {
7482 BitsPerBool = 4;
7483 BoolMask = 0xf;
7484 } else if (NumElts == 8) {
7485 BitsPerBool = 2;
7486 BoolMask = 0x3;
7487 } else if (NumElts == 16) {
7488 BitsPerBool = 1;
7489 BoolMask = 0x1;
7490 } else
7491 return SDValue();
7492
7493 // If this is a single value copied into all lanes (a splat), we can just sign
7494 // extend that single value
7495 SDValue FirstOp = Op.getOperand(0);
7496 if (!isa<ConstantSDNode>(FirstOp) &&
7497 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7498 return U.get().isUndef() || U.get() == FirstOp;
7499 })) {
7500 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7501 DAG.getValueType(MVT::i1));
7502 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7503 }
7504
7505 // First create base with bits set where known
7506 unsigned Bits32 = 0;
7507 for (unsigned i = 0; i < NumElts; ++i) {
7508 SDValue V = Op.getOperand(i);
7509 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7510 continue;
7511 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7512 if (BitSet)
7513 Bits32 |= BoolMask << (i * BitsPerBool);
7514 }
7515
7516 // Add in unknown nodes
7517 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7518 DAG.getConstant(Bits32, dl, MVT::i32));
7519 for (unsigned i = 0; i < NumElts; ++i) {
7520 SDValue V = Op.getOperand(i);
7521 if (isa<ConstantSDNode>(V) || V.isUndef())
7522 continue;
7523 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7524 DAG.getConstant(i, dl, MVT::i32));
7525 }
7526
7527 return Base;
7528}
7529
7530 static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7531 const ARMSubtarget *ST) {
7532 if (!ST->hasMVEIntegerOps())
7533 return SDValue();
7534
7535 // We are looking for a buildvector where each element is Op[0] + i*N
7536 EVT VT = Op.getValueType();
7537 SDValue Op0 = Op.getOperand(0);
7538 unsigned NumElts = VT.getVectorNumElements();
7539
7540 // Get the increment value from operand 1
7541 SDValue Op1 = Op.getOperand(1);
7542 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7543 !isa<ConstantSDNode>(Op1.getOperand(1)))
7544 return SDValue();
7545 unsigned N = Op1.getConstantOperandVal(1);
7546 if (N != 1 && N != 2 && N != 4 && N != 8)
7547 return SDValue();
7548
7549 // Check that each other operand matches
7550 for (unsigned I = 2; I < NumElts; I++) {
7551 SDValue OpI = Op.getOperand(I);
7552 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7553 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7554 OpI.getConstantOperandVal(1) != I * N)
7555 return SDValue();
7556 }
7557
7558 SDLoc DL(Op);
7559 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7560 DAG.getConstant(N, DL, MVT::i32));
7561}
7562
7563// Returns true if the operation N can be treated as qr instruction variant at
7564// operand Op.
7565static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7566 switch (N->getOpcode()) {
7567 case ISD::ADD:
7568 case ISD::MUL:
7569 case ISD::SADDSAT:
7570 case ISD::UADDSAT:
7571 case ISD::AVGFLOORS:
7572 case ISD::AVGFLOORU:
7573 return true;
7574 case ISD::SUB:
7575 case ISD::SSUBSAT:
7576 case ISD::USUBSAT:
7577 return N->getOperand(1).getNode() == Op;
7578 case ISD::INTRINSIC_WO_CHAIN:
7579 switch (N->getConstantOperandVal(0)) {
7580 case Intrinsic::arm_mve_add_predicated:
7581 case Intrinsic::arm_mve_mul_predicated:
7582 case Intrinsic::arm_mve_qadd_predicated:
7583 case Intrinsic::arm_mve_vhadd:
7584 case Intrinsic::arm_mve_hadd_predicated:
7585 case Intrinsic::arm_mve_vqdmulh:
7586 case Intrinsic::arm_mve_qdmulh_predicated:
7587 case Intrinsic::arm_mve_vqrdmulh:
7588 case Intrinsic::arm_mve_qrdmulh_predicated:
7589 case Intrinsic::arm_mve_vqdmull:
7590 case Intrinsic::arm_mve_vqdmull_predicated:
7591 return true;
7592 case Intrinsic::arm_mve_sub_predicated:
7593 case Intrinsic::arm_mve_qsub_predicated:
7594 case Intrinsic::arm_mve_vhsub:
7595 case Intrinsic::arm_mve_hsub_predicated:
7596 return N->getOperand(2).getNode() == Op;
7597 default:
7598 return false;
7599 }
7600 default:
7601 return false;
7602 }
7603}
7604
7605// If this is a case we can't handle, return null and let the default
7606// expansion code take care of it.
7607SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7608 const ARMSubtarget *ST) const {
7609 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7610 SDLoc dl(Op);
7611 EVT VT = Op.getValueType();
7612
7613 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7614 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7615
7616 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7617 return R;
7618
7619 APInt SplatBits, SplatUndef;
7620 unsigned SplatBitSize;
7621 bool HasAnyUndefs;
7622 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7623 if (SplatUndef.isAllOnes())
7624 return DAG.getUNDEF(VT);
7625
7626 // If all the users of this constant splat are qr instruction variants,
7627 // generate a vdup of the constant.
7628 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7629 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7630 all_of(BVN->users(),
7631 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7632 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7633 : SplatBitSize == 16 ? MVT::v8i16
7634 : MVT::v16i8;
7635 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7636 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7637 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7638 }
7639
7640 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7641 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7642 // Check if an immediate VMOV works.
7643 EVT VmovVT;
7644 SDValue Val =
7645 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7646 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7647
7648 if (Val.getNode()) {
7649 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7650 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7651 }
7652
7653 // Try an immediate VMVN.
7654 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7655 Val = isVMOVModifiedImm(
7656 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7657 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7658 if (Val.getNode()) {
7659 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7660 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7661 }
7662
7663 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7664 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7665 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7666 if (ImmVal != -1) {
7667 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7668 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7669 }
7670 }
7671
7672 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7673 // type.
7674 if (ST->hasMVEIntegerOps() &&
7675 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7676 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7677 : SplatBitSize == 16 ? MVT::v8i16
7678 : MVT::v16i8;
7679 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7680 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7681 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7682 }
7683 }
7684 }
7685
7686 // Scan through the operands to see if only one value is used.
7687 //
7688 // As an optimisation, even if more than one value is used it may be more
7689 // profitable to splat with one value then change some lanes.
7690 //
7691 // Heuristically we decide to do this if the vector has a "dominant" value,
7692 // defined as splatted to more than half of the lanes.
7693 unsigned NumElts = VT.getVectorNumElements();
7694 bool isOnlyLowElement = true;
7695 bool usesOnlyOneValue = true;
7696 bool hasDominantValue = false;
7697 bool isConstant = true;
7698
7699 // Map of the number of times a particular SDValue appears in the
7700 // element list.
7701 DenseMap<SDValue, unsigned> ValueCounts;
7702 SDValue Value;
7703 for (unsigned i = 0; i < NumElts; ++i) {
7704 SDValue V = Op.getOperand(i);
7705 if (V.isUndef())
7706 continue;
7707 if (i > 0)
7708 isOnlyLowElement = false;
7709 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7710 isConstant = false;
7711
7712 unsigned &Count = ValueCounts[V];
7713
7714 // Is this value dominant? (takes up more than half of the lanes)
7715 if (++Count > (NumElts / 2)) {
7716 hasDominantValue = true;
7717 Value = V;
7718 }
7719 }
7720 if (ValueCounts.size() != 1)
7721 usesOnlyOneValue = false;
7722 if (!Value.getNode() && !ValueCounts.empty())
7723 Value = ValueCounts.begin()->first;
7724
7725 if (ValueCounts.empty())
7726 return DAG.getUNDEF(VT);
7727
7728 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7729 // Keep going if we are hitting this case.
7730 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7731 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7732
7733 unsigned EltSize = VT.getScalarSizeInBits();
7734
7735 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7736 // i32 and try again.
7737 if (hasDominantValue && EltSize <= 32) {
7738 if (!isConstant) {
7739 SDValue N;
7740
7741 // If we are VDUPing a value that comes directly from a vector, that will
7742 // cause an unnecessary move to and from a GPR, where instead we could
7743 // just use VDUPLANE. We can only do this if the lane being extracted
7744 // is at a constant index, as the VDUP from lane instructions only have
7745 // constant-index forms.
7746 ConstantSDNode *constIndex;
7747 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7748 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7749 // We need to create a new undef vector to use for the VDUPLANE if the
7750 // size of the vector from which we get the value is different than the
7751 // size of the vector that we need to create. We will insert the element
7752 // such that the register coalescer will remove unnecessary copies.
7753 if (VT != Value->getOperand(0).getValueType()) {
7754 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7755 VT.getVectorNumElements();
7756 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7757 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7758 Value, DAG.getConstant(index, dl, MVT::i32)),
7759 DAG.getConstant(index, dl, MVT::i32));
7760 } else
7761 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7762 Value->getOperand(0), Value->getOperand(1));
7763 } else
7764 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7765
7766 if (!usesOnlyOneValue) {
7767 // The dominant value was splatted as 'N', but we now have to insert
7768 // all differing elements.
7769 for (unsigned I = 0; I < NumElts; ++I) {
7770 if (Op.getOperand(I) == Value)
7771 continue;
7772 SmallVector<SDValue, 3> Ops;
7773 Ops.push_back(N);
7774 Ops.push_back(Op.getOperand(I));
7775 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7776 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7777 }
7778 }
7779 return N;
7780 }
7781 if (VT.getVectorElementType().isFloatingPoint()) {
7782 SmallVector<SDValue, 8> Ops;
7783 MVT FVT = VT.getVectorElementType().getSimpleVT();
7784 assert(FVT == MVT::f32 || FVT == MVT::f16);
7785 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7786 for (unsigned i = 0; i < NumElts; ++i)
7787 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7788 Op.getOperand(i)));
7789 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7790 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7791 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7792 if (Val.getNode())
7793 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7794 }
7795 if (usesOnlyOneValue) {
7796 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7797 if (isConstant && Val.getNode())
7798 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7799 }
7800 }
7801
7802 // If all elements are constants and the case above didn't get hit, fall back
7803 // to the default expansion, which will generate a load from the constant
7804 // pool.
7805 if (isConstant)
7806 return SDValue();
7807
7808 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7809 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7810 // length <= 2.
7811 if (NumElts >= 4)
7812 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7813 return shuffle;
7814
7815 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7816 // VCVT's
7817 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7818 return VCVT;
7819 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7820 return VCVT;
7821
7822 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7823 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7824 // into two 64-bit vectors; we might discover a better way to lower it.
7825 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7826 EVT ExtVT = VT.getVectorElementType();
7827 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7828 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7829 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7830 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7831 SDValue Upper =
7832 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7833 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7834 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7835 if (Lower && Upper)
7836 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7837 }
7838
7839 // Vectors with 32- or 64-bit elements can be built by directly assigning
7840 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7841 // will be legalized.
7842 if (EltSize >= 32) {
7843 // Do the expansion with floating-point types, since that is what the VFP
7844 // registers are defined to use, and since i64 is not legal.
7845 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7846 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7847 SmallVector<SDValue, 8> Ops;
7848 for (unsigned i = 0; i < NumElts; ++i)
7849 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7850 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7851 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7852 }
7853
7854 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7855 // know the default expansion would otherwise fall back on something even
7856 // worse. For a vector with one or two non-undef values, that's
7857 // scalar_to_vector for the elements followed by a shuffle (provided the
7858 // shuffle is valid for the target) and materialization element by element
7859 // on the stack followed by a load for everything else.
7860 if (!isConstant && !usesOnlyOneValue) {
7861 SDValue Vec = DAG.getUNDEF(VT);
7862 for (unsigned i = 0 ; i < NumElts; ++i) {
7863 SDValue V = Op.getOperand(i);
7864 if (V.isUndef())
7865 continue;
7866 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7867 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7868 }
7869 return Vec;
7870 }
7871
7872 return SDValue();
7873}
7874
7875// Gather data to see if the operation can be modelled as a
7876// shuffle in combination with VEXTs.
7877SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7878 SelectionDAG &DAG) const {
7879 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7880 SDLoc dl(Op);
7881 EVT VT = Op.getValueType();
7882 unsigned NumElts = VT.getVectorNumElements();
7883
7884 struct ShuffleSourceInfo {
7885 SDValue Vec;
7886 unsigned MinElt = std::numeric_limits<unsigned>::max();
7887 unsigned MaxElt = 0;
7888
7889 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7890 // be compatible with the shuffle we intend to construct. As a result
7891 // ShuffleVec will be some sliding window into the original Vec.
7892 SDValue ShuffleVec;
7893
7894 // Code should guarantee that element i in Vec starts at element "WindowBase
7895 // + i * WindowScale in ShuffleVec".
7896 int WindowBase = 0;
7897 int WindowScale = 1;
7898
7899 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7900
7901 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7902 };
7903
7904 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7905 // node.
7906 SmallVector<ShuffleSourceInfo, 2> Sources;
7907 for (unsigned i = 0; i < NumElts; ++i) {
7908 SDValue V = Op.getOperand(i);
7909 if (V.isUndef())
7910 continue;
7911 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7912 // A shuffle can only come from building a vector from various
7913 // elements of other vectors.
7914 return SDValue();
7915 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7916 // Furthermore, shuffles require a constant mask, whereas extractelts
7917 // accept variable indices.
7918 return SDValue();
7919 }
7920
7921 // Add this element source to the list if it's not already there.
7922 SDValue SourceVec = V.getOperand(0);
7923 auto Source = llvm::find(Sources, SourceVec);
7924 if (Source == Sources.end())
7925 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7926
7927 // Update the minimum and maximum lane number seen.
7928 unsigned EltNo = V.getConstantOperandVal(1);
7929 Source->MinElt = std::min(Source->MinElt, EltNo);
7930 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7931 }
7932
7933 // Currently only do something sane when at most two source vectors
7934 // are involved.
7935 if (Sources.size() > 2)
7936 return SDValue();
7937
7938 // Find out the smallest element size among result and two sources, and use
7939 // it as element size to build the shuffle_vector.
7940 EVT SmallestEltTy = VT.getVectorElementType();
7941 for (auto &Source : Sources) {
7942 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
7943 if (SrcEltTy.bitsLT(SmallestEltTy))
7944 SmallestEltTy = SrcEltTy;
7945 }
7946 unsigned ResMultiplier =
7947 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
7948 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
7949 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
7950
7951 // If the source vector is too wide or too narrow, we may nevertheless be able
7952 // to construct a compatible shuffle either by concatenating it with UNDEF or
7953 // extracting a suitable range of elements.
7954 for (auto &Src : Sources) {
7955 EVT SrcVT = Src.ShuffleVec.getValueType();
7956
7957 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
7958 uint64_t VTSize = VT.getFixedSizeInBits();
7959 if (SrcVTSize == VTSize)
7960 continue;
7961
7962 // This stage of the search produces a source with the same element type as
7963 // the original, but with a total width matching the BUILD_VECTOR output.
7964 EVT EltVT = SrcVT.getVectorElementType();
7965 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
7966 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
7967
7968 if (SrcVTSize < VTSize) {
7969 if (2 * SrcVTSize != VTSize)
7970 return SDValue();
7971 // We can pad out the smaller vector with UNDEF elements for free, so do
7972 // that and use the widened vector in the shuffle.
7973 Src.ShuffleVec =
7974 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
7975 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
7976 continue;
7977 }
7978
7979 if (SrcVTSize != 2 * VTSize)
7980 return SDValue();
7981
7982 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
7983 // Span too large for a VEXT to cope
7984 return SDValue();
7985 }
7986
7987 if (Src.MinElt >= NumSrcElts) {
7988 // The extraction can just take the second half
7989 Src.ShuffleVec =
7990 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7991 DAG.getConstant(NumSrcElts, dl, MVT::i32));
7992 Src.WindowBase = -NumSrcElts;
7993 } else if (Src.MaxElt < NumSrcElts) {
7994 // The extraction can just take the first half
7995 Src.ShuffleVec =
7996 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
7997 DAG.getConstant(0, dl, MVT::i32));
7998 } else {
7999 // An actual VEXT is needed
8000 SDValue VEXTSrc1 =
8001 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8002 DAG.getConstant(0, dl, MVT::i32));
8003 SDValue VEXTSrc2 =
8004 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8005 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8006
8007 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8008 VEXTSrc2,
8009 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8010 Src.WindowBase = -Src.MinElt;
8011 }
8012 }
8013
8014 // Another possible incompatibility occurs from the vector element types. We
8015 // can fix this by bitcasting the source vectors to the same type we intend
8016 // for the shuffle.
8017 for (auto &Src : Sources) {
8018 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8019 if (SrcEltTy == SmallestEltTy)
8020 continue;
8021 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8022 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8023 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8024 Src.WindowBase *= Src.WindowScale;
8025 }
8026
8027 // Final check before we try to actually produce a shuffle.
8028 LLVM_DEBUG({
8029 for (auto Src : Sources)
8030 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8031 });
8032
8033 // The stars all align; our next step is to produce the mask for the shuffle.
8034 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8035 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8036 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8037 SDValue Entry = Op.getOperand(i);
8038 if (Entry.isUndef())
8039 continue;
8040
8041 auto Src = llvm::find(Sources, Entry.getOperand(0));
8042 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8043
8044 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8045 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8046 // segment.
8047 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8048 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8049 VT.getScalarSizeInBits());
8050 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8051
8052 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8053 // starting at the appropriate offset.
8054 int *LaneMask = &Mask[i * ResMultiplier];
8055
8056 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8057 ExtractBase += NumElts * (Src - Sources.begin());
8058 for (int j = 0; j < LanesDefined; ++j)
8059 LaneMask[j] = ExtractBase + j;
8060 }
8061
8062
8063 // We can't handle more than two sources. This should have already
8064 // been checked before this point.
8065 assert(Sources.size() <= 2 && "Too many sources!");
8066
8067 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8068 for (unsigned i = 0; i < Sources.size(); ++i)
8069 ShuffleOps[i] = Sources[i].ShuffleVec;
8070
8071 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8072 ShuffleOps[1], Mask, DAG);
8073 if (!Shuffle)
8074 return SDValue();
8075 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8076}
8077
8078 enum ShuffleOpCodes {
8079 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8080 OP_VREV,
8081 OP_VDUP0,
8082 OP_VDUP1,
8083 OP_VDUP2,
8084 OP_VDUP3,
8085 OP_VEXT1,
8086 OP_VEXT2,
8087 OP_VEXT3,
8088 OP_VUZPL, // VUZP, left result
8089 OP_VUZPR, // VUZP, right result
8090 OP_VZIPL, // VZIP, left result
8091 OP_VZIPR, // VZIP, right result
8092 OP_VTRNL, // VTRN, left result
8093 OP_VTRNR // VTRN, right result
8094};
8095
8096static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8097 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8098 switch (OpNum) {
8099 case OP_COPY:
8100 case OP_VREV:
8101 case OP_VDUP0:
8102 case OP_VDUP1:
8103 case OP_VDUP2:
8104 case OP_VDUP3:
8105 return true;
8106 }
8107 return false;
8108}
8109
8110/// isShuffleMaskLegal - Targets can use this to indicate that they only
8111/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8112/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8113/// are assumed to be legal.
8114 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8115 if (VT.getVectorNumElements() == 4 &&
8116 (VT.is128BitVector() || VT.is64BitVector())) {
8117 unsigned PFIndexes[4];
8118 for (unsigned i = 0; i != 4; ++i) {
8119 if (M[i] < 0)
8120 PFIndexes[i] = 8;
8121 else
8122 PFIndexes[i] = M[i];
8123 }
8124
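// Each PFIndexes value is a base-9 digit: lane indices 0-7, with 8 standing
// for an undef mask element.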
8125 // Compute the index in the perfect shuffle table.
8126 unsigned PFTableIndex =
8127 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8128 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8129 unsigned Cost = (PFEntry >> 30);
8130
8131 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8132 return true;
8133 }
8134
8135 bool ReverseVEXT, isV_UNDEF;
8136 unsigned Imm, WhichResult;
8137
8138 unsigned EltSize = VT.getScalarSizeInBits();
8139 if (EltSize >= 32 ||
8141 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8142 isVREVMask(M, VT, 64) ||
8143 isVREVMask(M, VT, 32) ||
8144 isVREVMask(M, VT, 16))
8145 return true;
8146 else if (Subtarget->hasNEON() &&
8147 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8148 isVTBLMask(M, VT) ||
8149 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8150 return true;
8151 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8152 isReverseMask(M, VT))
8153 return true;
8154 else if (Subtarget->hasMVEIntegerOps() &&
8155 (isVMOVNMask(M, VT, true, false) ||
8156 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8157 return true;
8158 else if (Subtarget->hasMVEIntegerOps() &&
8159 (isTruncMask(M, VT, false, false) ||
8160 isTruncMask(M, VT, false, true) ||
8161 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8162 return true;
8163 else
8164 return false;
8165}
8166
8167/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8168/// the specified operations to build the shuffle.
8169 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8170 SDValue RHS, SelectionDAG &DAG,
8171 const SDLoc &dl) {
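// A perfect-shuffle table entry packs: bits [31:30] cost, bits [29:26] the
// shuffle opcode, and two 13-bit operand ids in bits [25:13] and [12:0] that
// are expanded recursively through the table below.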
8172 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8173 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8174 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8175
8176 if (OpNum == OP_COPY) {
8177 if (LHSID == (1*9+2)*9+3) return LHS;
8178 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8179 return RHS;
8180 }
8181
8182 SDValue OpLHS, OpRHS;
8183 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8184 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8185 EVT VT = OpLHS.getValueType();
8186
8187 switch (OpNum) {
8188 default: llvm_unreachable("Unknown shuffle opcode!");
8189 case OP_VREV:
8190 // VREV divides the vector in half and swaps within the half.
8191 if (VT.getScalarSizeInBits() == 32)
8192 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8193 // vrev <4 x i16> -> VREV32
8194 if (VT.getScalarSizeInBits() == 16)
8195 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8196 // vrev <4 x i8> -> VREV16
8197 assert(VT.getScalarSizeInBits() == 8);
8198 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8199 case OP_VDUP0:
8200 case OP_VDUP1:
8201 case OP_VDUP2:
8202 case OP_VDUP3:
8203 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8204 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8205 case OP_VEXT1:
8206 case OP_VEXT2:
8207 case OP_VEXT3:
8208 return DAG.getNode(ARMISD::VEXT, dl, VT,
8209 OpLHS, OpRHS,
8210 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8211 case OP_VUZPL:
8212 case OP_VUZPR:
8213 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8214 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8215 case OP_VZIPL:
8216 case OP_VZIPR:
8217 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8218 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8219 case OP_VTRNL:
8220 case OP_VTRNR:
8221 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8222 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8223 }
8224}
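// Note on the recursion above: it bottoms out at OP_COPY entries whose operand
// IDs are themselves base-9 encoded masks. (1*9+2)*9+3 == 102 encodes the
// identity mask <0,1,2,3> (take LHS as-is) and ((4*9+5)*9+6)*9+7 == 3382
// encodes <4,5,6,7> (take RHS as-is), which is why those two IDs return LHS
// and RHS directly.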
8225
8227 ArrayRef<int> ShuffleMask,
8228 SelectionDAG &DAG) {
8229 // Check to see if we can use the VTBL instruction.
8230 SDValue V1 = Op.getOperand(0);
8231 SDValue V2 = Op.getOperand(1);
8232 SDLoc DL(Op);
8233
8234 SmallVector<SDValue, 8> VTBLMask;
8235 for (int I : ShuffleMask)
8236 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8237
8238 if (V2.getNode()->isUndef())
8239 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8240 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8241
8242 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8243 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8244}
8245
8247 SDLoc DL(Op);
8248 EVT VT = Op.getValueType();
8249
8250 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8251 "Expect an v8i16/v16i8 type");
8252 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8253 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8254 // extract the first 8 bytes into the top double word and the last 8 bytes
8255 // into the bottom double word, through a new vector shuffle that will be
8256 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8257 std::vector<int> NewMask;
8258 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8259 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8260 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8261 NewMask.push_back(i);
8262 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8263}
8264
8266 switch (VT.getSimpleVT().SimpleTy) {
8267 case MVT::v2i1:
8268 return MVT::v2f64;
8269 case MVT::v4i1:
8270 return MVT::v4i32;
8271 case MVT::v8i1:
8272 return MVT::v8i16;
8273 case MVT::v16i1:
8274 return MVT::v16i8;
8275 default:
8276 llvm_unreachable("Unexpected vector predicate type");
8277 }
8278}
8279
8281 SelectionDAG &DAG) {
8282 // Converting from boolean predicates to integers involves creating a vector
8283 // of all ones or all zeroes and selecting the lanes based upon the real
8284 // predicate.
8286 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8287 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8288
8289 SDValue AllZeroes =
8290 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8291 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8292
8293 // Get full vector type from predicate type
8295
8296 SDValue RecastV1;
8297 // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
8298 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8299 // sizes are not the same. We have to use an MVE-specific PREDICATE_CAST node,
8300 // since we know in hardware the sizes are really the same.
8301 if (VT != MVT::v16i1)
8302 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8303 else
8304 RecastV1 = Pred;
8305
8306 // Select either all ones or zeroes depending upon the real predicate bits.
8307 SDValue PredAsVector =
8308 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8309
8310 // Recast our new predicate-as-integer v16i8 vector into something
8311 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8312 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8313}
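// Rough example of the promotion above: a v4i1 predicate with lanes <1,0,1,1>
// becomes, after the vselect and bitcast, the v4i32 vector
// <0xffffffff, 0x0, 0xffffffff, 0xffffffff>, so each boolean lane turns into a
// full integer lane of all-ones or all-zeroes that an ordinary vector shuffle
// can operate on.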
8314
8316 const ARMSubtarget *ST) {
8317 EVT VT = Op.getValueType();
8319 ArrayRef<int> ShuffleMask = SVN->getMask();
8320
8321 assert(ST->hasMVEIntegerOps() &&
8322 "No support for vector shuffle of boolean predicates");
8323
8324 SDValue V1 = Op.getOperand(0);
8325 SDValue V2 = Op.getOperand(1);
8326 SDLoc dl(Op);
8327 if (isReverseMask(ShuffleMask, VT)) {
8328 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8329 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8330 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8331 DAG.getConstant(16, dl, MVT::i32));
8332 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8333 }
8334
8335 // Until we can come up with optimised cases for every single vector
8336 // shuffle in existence we have chosen the least painful strategy. This is
8337 // to essentially promote the boolean predicate to an 8-bit integer, where
8338 // each predicate represents a byte. Then we fall back on a normal integer
8339 // vector shuffle and convert the result back into a predicate vector. In
8340 // many cases the generated code might be even better than scalar code
8341 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8342 // fields in a register into 8 other arbitrary 2-bit fields!
8343 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8344 EVT NewVT = PredAsVector1.getValueType();
8345 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8346 : PromoteMVEPredVector(dl, V2, VT, DAG);
8347 assert(PredAsVector2.getValueType() == NewVT &&
8348 "Expected identical vector type in expanded i1 shuffle!");
8349
8350 // Do the shuffle!
8351 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8352 PredAsVector2, ShuffleMask);
8353
8354 // Now return the result of comparing the shuffled vector with zero,
8355 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8356 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8357 if (VT == MVT::v2i1) {
8358 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8359 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8360 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8361 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8362 }
8363 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8364 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8365}
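// The reverse-mask special case above relies on the fact that an MVE predicate
// lives in the bottom 16 bits of a 32-bit value: reversing its lanes can be
// done with an i32 BITREVERSE followed by a logical shift right by 16, which
// avoids the promoted byte shuffle used by the general path.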
8366
8368 ArrayRef<int> ShuffleMask,
8369 SelectionDAG &DAG) {
8370 // Attempt to lower the vector shuffle using as many whole register movs as
8371 // possible. This is useful for types smaller than 32 bits, which would
8372 // otherwise often become a series of GPR movs.
8373 SDLoc dl(Op);
8374 EVT VT = Op.getValueType();
8375 if (VT.getScalarSizeInBits() >= 32)
8376 return SDValue();
8377
8378 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8379 "Unexpected vector type");
8380 int NumElts = VT.getVectorNumElements();
8381 int QuarterSize = NumElts / 4;
8382 // The four final parts of the vector, as i32's
8383 SDValue Parts[4];
8384
8385 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc. (but not
8386 // <u,u,u,u>), returning the vmov lane index.
8387 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8388 // Detect which mov lane this would be from the first non-undef element.
8389 int MovIdx = -1;
8390 for (int i = 0; i < Length; i++) {
8391 if (ShuffleMask[Start + i] >= 0) {
8392 if (ShuffleMask[Start + i] % Length != i)
8393 return -1;
8394 MovIdx = ShuffleMask[Start + i] / Length;
8395 break;
8396 }
8397 }
8398 // If all items are undef, leave this for other combines
8399 if (MovIdx == -1)
8400 return -1;
8401 // Check the remaining values are the correct part of the same mov
8402 for (int i = 1; i < Length; i++) {
8403 if (ShuffleMask[Start + i] >= 0 &&
8404 (ShuffleMask[Start + i] / Length != MovIdx ||
8405 ShuffleMask[Start + i] % Length != i))
8406 return -1;
8407 }
8408 return MovIdx;
8409 };
8410
8411 for (int Part = 0; Part < 4; ++Part) {
8412 // Does this part look like a mov
8413 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8414 if (Elt != -1) {
8415 SDValue Input = Op->getOperand(0);
8416 if (Elt >= 4) {
8417 Input = Op->getOperand(1);
8418 Elt -= 4;
8419 }
8420 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8421 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8422 DAG.getConstant(Elt, dl, MVT::i32));
8423 }
8424 }
8425
8426 // Nothing interesting found, just return
8427 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8428 return SDValue();
8429
8430 // The other parts need to be built with the old shuffle vector, cast to a
8431 // v4i32 and extract_vector_elts
8432 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8433 SmallVector<int, 16> NewShuffleMask;
8434 for (int Part = 0; Part < 4; ++Part)
8435 for (int i = 0; i < QuarterSize; i++)
8436 NewShuffleMask.push_back(
8437 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8438 SDValue NewShuffle = DAG.getVectorShuffle(
8439 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8440 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8441
8442 for (int Part = 0; Part < 4; ++Part)
8443 if (!Parts[Part])
8444 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8445 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8446 }
8447 // Build a vector out of the various parts and bitcast it back to the original
8448 // type.
8449 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8450 return DAG.getBitcast(VT, NewVec);
8451}
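// As a sketch of what the lowering above produces: for a v8i16 shuffle with
// mask <2,3, 8,9, 0,1, 6,7> every quarter of the mask is a whole 32-bit lane
// move, so the result becomes a v4f32 ARMISD::BUILD_VECTOR of f32 lane 1 of
// V1, lane 0 of V2, lane 0 of V1 and lane 3 of V1, bitcast back to v8i16.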
8452
8454 ArrayRef<int> ShuffleMask,
8455 SelectionDAG &DAG) {
8456 SDValue V1 = Op.getOperand(0);
8457 SDValue V2 = Op.getOperand(1);
8458 EVT VT = Op.getValueType();
8459 unsigned NumElts = VT.getVectorNumElements();
8460
8461 // A one-off identity mask is one that is mostly an identity mask from a
8462 // single source but contains a single element out-of-place, either from a
8463 // different vector or from another position in the same vector. Instead of
8464 // lowering this via an ARMISD::BUILD_VECTOR we can generate an extract/insert
8465 // pair directly.
8466 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8467 int &OffElement) {
8468 OffElement = -1;
8469 int NonUndef = 0;
8470 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8471 if (Mask[i] == -1)
8472 continue;
8473 NonUndef++;
8474 if (Mask[i] != i + BaseOffset) {
8475 if (OffElement == -1)
8476 OffElement = i;
8477 else
8478 return false;
8479 }
8480 }
8481 return NonUndef > 2 && OffElement != -1;
8482 };
8483 int OffElement;
8484 SDValue VInput;
8485 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8486 VInput = V1;
8487 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8488 VInput = V2;
8489 else
8490 return SDValue();
8491
8492 SDLoc dl(Op);
8493 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8494 ? MVT::i32
8495 : VT.getScalarType();
8496 SDValue Elt = DAG.getNode(
8497 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8498 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8499 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8500 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8501 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8502}
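// For example, a v4i32 shuffle of V1, V2 with mask <0,1,2,7> is an identity of
// V1 with only element 3 out of place, so the code above lowers it to roughly
//   insert_vector_elt(V1, extract_vector_elt(V2, 3), 3)
// rather than a longer build-vector style expansion.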
8503
8505 const ARMSubtarget *ST) {
8506 SDValue V1 = Op.getOperand(0);
8507 SDValue V2 = Op.getOperand(1);
8508 SDLoc dl(Op);
8509 EVT VT = Op.getValueType();
8511 unsigned EltSize = VT.getScalarSizeInBits();
8512
8513 if (ST->hasMVEIntegerOps() && EltSize == 1)
8514 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8515
8516 // Convert shuffles that are directly supported on NEON to target-specific
8517 // DAG nodes, instead of keeping them as shuffles and matching them again
8518 // during code selection. This is more efficient and avoids the possibility
8519 // of inconsistencies between legalization and selection.
8520 // FIXME: floating-point vectors should be canonicalized to integer vectors
8521 // of the same size so that they get CSEd properly.
8522 ArrayRef<int> ShuffleMask = SVN->getMask();
8523
8524 if (EltSize <= 32) {
8525 if (SVN->isSplat()) {
8526 int Lane = SVN->getSplatIndex();
8527 // If this is an undef splat, generate it via "just" vdup, if possible.
8528 if (Lane == -1) Lane = 0;
8529
8530 // Test if V1 is a SCALAR_TO_VECTOR.
8531 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8532 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8533 }
8534 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8535 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8536 // reaches it).
8537 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8539 bool IsScalarToVector = true;
8540 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8541 if (!V1.getOperand(i).isUndef()) {
8542 IsScalarToVector = false;
8543 break;
8544 }
8545 if (IsScalarToVector)
8546 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8547 }
8548 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8549 DAG.getConstant(Lane, dl, MVT::i32));
8550 }
8551
8552 bool ReverseVEXT = false;
8553 unsigned Imm = 0;
8554 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8555 if (ReverseVEXT)
8556 std::swap(V1, V2);
8557 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8558 DAG.getConstant(Imm, dl, MVT::i32));
8559 }
8560
8561 if (isVREVMask(ShuffleMask, VT, 64))
8562 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8563 if (isVREVMask(ShuffleMask, VT, 32))
8564 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8565 if (isVREVMask(ShuffleMask, VT, 16))
8566 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8567
8568 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8569 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8570 DAG.getConstant(Imm, dl, MVT::i32));
8571 }
8572
8573 // Check for Neon shuffles that modify both input vectors in place.
8574 // If both results are used, i.e., if there are two shuffles with the same
8575 // source operands and with masks corresponding to both results of one of
8576 // these operations, DAG memoization will ensure that a single node is
8577 // used for both shuffles.
8578 unsigned WhichResult = 0;
8579 bool isV_UNDEF = false;
8580 if (ST->hasNEON()) {
8581 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8582 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8583 if (isV_UNDEF)
8584 V2 = V1;
8585 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8586 .getValue(WhichResult);
8587 }
8588 }
8589 if (ST->hasMVEIntegerOps()) {
8590 if (isVMOVNMask(ShuffleMask, VT, false, false))
8591 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8592 DAG.getConstant(0, dl, MVT::i32));
8593 if (isVMOVNMask(ShuffleMask, VT, true, false))
8594 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8595 DAG.getConstant(1, dl, MVT::i32));
8596 if (isVMOVNMask(ShuffleMask, VT, true, true))
8597 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8598 DAG.getConstant(1, dl, MVT::i32));
8599 }
8600
8601 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8602 // shuffles that produce a result larger than their operands with:
8603 // shuffle(concat(v1, undef), concat(v2, undef))
8604 // ->
8605 // shuffle(concat(v1, v2), undef)
8606 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8607 //
8608 // This is useful in the general case, but there are special cases where
8609 // native shuffles produce larger results: the two-result ops.
8610 //
8611 // Look through the concat when lowering them:
8612 // shuffle(concat(v1, v2), undef)
8613 // ->
8614 // concat(VZIP(v1, v2):0, :1)
8615 //
8616 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8617 SDValue SubV1 = V1->getOperand(0);
8618 SDValue SubV2 = V1->getOperand(1);
8619 EVT SubVT = SubV1.getValueType();
8620
8621 // We expect these to have been canonicalized to -1.
8622 assert(llvm::all_of(ShuffleMask, [&](int i) {
8623 return i < (int)VT.getVectorNumElements();
8624 }) && "Unexpected shuffle index into UNDEF operand!");
8625
8626 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8627 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8628 if (isV_UNDEF)
8629 SubV2 = SubV1;
8630 assert((WhichResult == 0) &&
8631 "In-place shuffle of concat can only have one result!");
8632 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8633 SubV1, SubV2);
8634 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8635 Res.getValue(1));
8636 }
8637 }
8638 }
8639
8640 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8641 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8642 return V;
8643
8644 for (bool Top : {false, true}) {
8645 for (bool SingleSource : {false, true}) {
8646 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8647 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8648 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8649 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8650 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8651 SingleSource ? V1 : V2);
8652 if (Top) {
8653 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8654 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8655 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8656 }
8657 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8658 }
8659 }
8660 }
8661 }
8662
8663 // If the shuffle is not directly supported and it has 4 elements, use
8664 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8665 unsigned NumElts = VT.getVectorNumElements();
8666 if (NumElts == 4) {
8667 unsigned PFIndexes[4];
8668 for (unsigned i = 0; i != 4; ++i) {
8669 if (ShuffleMask[i] < 0)
8670 PFIndexes[i] = 8;
8671 else
8672 PFIndexes[i] = ShuffleMask[i];
8673 }
8674
8675 // Compute the index in the perfect shuffle table.
8676 unsigned PFTableIndex =
8677 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8678 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8679 unsigned Cost = (PFEntry >> 30);
8680
8681 if (Cost <= 4) {
8682 if (ST->hasNEON())
8683 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8684 else if (isLegalMVEShuffleOp(PFEntry)) {
8685 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8686 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8687 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8688 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8689 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8690 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8691 }
8692 }
8693 }
8694
8695 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8696 if (EltSize >= 32) {
8697 // Do the expansion with floating-point types, since that is what the VFP
8698 // registers are defined to use, and since i64 is not legal.
8699 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8700 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8701 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8702 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8704 for (unsigned i = 0; i < NumElts; ++i) {
8705 if (ShuffleMask[i] < 0)
8706 Ops.push_back(DAG.getUNDEF(EltVT));
8707 else
8708 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8709 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8710 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8711 dl, MVT::i32)));
8712 }
8713 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8714 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8715 }
8716
8717 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8718 isReverseMask(ShuffleMask, VT))
8719 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8720
8721 if (ST->hasNEON() && VT == MVT::v8i8)
8722 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8723 return NewOp;
8724
8725 if (ST->hasMVEIntegerOps())
8726 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8727 return NewOp;
8728
8729 return SDValue();
8730}
8731
8733 const ARMSubtarget *ST) {
8734 EVT VecVT = Op.getOperand(0).getValueType();
8735 SDLoc dl(Op);
8736
8737 assert(ST->hasMVEIntegerOps() &&
8738 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8739
8740 SDValue Conv =
8741 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8742 unsigned Lane = Op.getConstantOperandVal(2);
8743 unsigned LaneWidth =
8745 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8746 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8747 Op.getOperand(1), DAG.getValueType(MVT::i1));
8748 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8749 DAG.getConstant(~Mask, dl, MVT::i32));
8750 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8751}
8752
8753SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8754 SelectionDAG &DAG) const {
8755 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8756 SDValue Lane = Op.getOperand(2);
8757 if (!isa<ConstantSDNode>(Lane))
8758 return SDValue();
8759
8760 SDValue Elt = Op.getOperand(1);
8761 EVT EltVT = Elt.getValueType();
8762
8763 if (Subtarget->hasMVEIntegerOps() &&
8764 Op.getValueType().getScalarSizeInBits() == 1)
8765 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8766
8767 if (getTypeAction(*DAG.getContext(), EltVT) ==
8769 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8770 // but the type system will try to do that if we don't intervene.
8771 // Reinterpret any such vector-element insertion as one with the
8772 // corresponding integer types.
8773
8774 SDLoc dl(Op);
8775
8776 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8777 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8779
8780 SDValue VecIn = Op.getOperand(0);
8781 EVT VecVT = VecIn.getValueType();
8782 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8783 VecVT.getVectorNumElements());
8784
8785 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8786 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8787 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8788 IVecIn, IElt, Lane);
8789 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8790 }
8791
8792 return Op;
8793}
8794
8796 const ARMSubtarget *ST) {
8797 EVT VecVT = Op.getOperand(0).getValueType();
8798 SDLoc dl(Op);
8799
8800 assert(ST->hasMVEIntegerOps() &&
8801 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8802
8803 SDValue Conv =
8804 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8805 unsigned Lane = Op.getConstantOperandVal(1);
8806 unsigned LaneWidth =
8808 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8809 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8810 return Shift;
8811}
8812
8814 const ARMSubtarget *ST) {
8815 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8816 SDValue Lane = Op.getOperand(1);
8817 if (!isa<ConstantSDNode>(Lane))
8818 return SDValue();
8819
8820 SDValue Vec = Op.getOperand(0);
8821 EVT VT = Vec.getValueType();
8822
8823 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8824 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8825
8826 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8827 SDLoc dl(Op);
8828 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8829 }
8830
8831 return Op;
8832}
8833
8835 const ARMSubtarget *ST) {
8836 SDLoc dl(Op);
8837 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8838 "Unexpected custom CONCAT_VECTORS lowering");
8839 assert(isPowerOf2_32(Op.getNumOperands()) &&
8840 "Unexpected custom CONCAT_VECTORS lowering");
8841 assert(ST->hasMVEIntegerOps() &&
8842 "CONCAT_VECTORS lowering only supported for MVE");
8843
8844 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8845 EVT Op1VT = V1.getValueType();
8846 EVT Op2VT = V2.getValueType();
8847 assert(Op1VT == Op2VT && "Operand types don't match!");
8848 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8849 "Unexpected i1 concat operations!");
8850 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8851
8852 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8853 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8854
8855 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8856 // promoted to v8i16, etc.
8857 MVT ElType =
8859 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8860
8861 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8862 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8863 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8864 // ConcatVT.
8865 SDValue ConVec =
8866 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8867 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8868 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8869 }
8870
8871 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8872 // to be the right size for the destination. For example, if Op1 is v4i1
8873 // then the promoted vector is v4i32. The result of concatenation gives a
8874 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8875 // needs truncating to i16 and inserting in the result.
8876 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8877 EVT NewVT = NewV.getValueType();
8878 EVT ConcatVT = ConVec.getValueType();
8879 unsigned ExtScale = 1;
8880 if (NewVT == MVT::v2f64) {
8881 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8882 ExtScale = 2;
8883 }
8884 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8885 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8886 DAG.getIntPtrConstant(i * ExtScale, dl));
8887 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8888 DAG.getConstant(j, dl, MVT::i32));
8889 }
8890 return ConVec;
8891 };
8892 unsigned j = 0;
8893 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8894 ConVec = ExtractInto(NewV1, ConVec, j);
8895 ConVec = ExtractInto(NewV2, ConVec, j);
8896
8897 // Now return the result of comparing the subvector with zero, which will
8898 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8899 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8900 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8901 };
8902
8903 // Concat each pair of subvectors and pack into the lower half of the array.
8904 SmallVector<SDValue> ConcatOps(Op->ops());
8905 while (ConcatOps.size() > 1) {
8906 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8907 SDValue V1 = ConcatOps[I];
8908 SDValue V2 = ConcatOps[I + 1];
8909 ConcatOps[I / 2] = ConcatPair(V1, V2);
8910 }
8911 ConcatOps.resize(ConcatOps.size() / 2);
8912 }
8913 return ConcatOps[0];
8914}
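// So, for example, concatenating two v4i1 predicates proceeds as: promote each
// operand to v4i32, MVETRUNC the pair into a single v8i16, and VCMPZ that
// vector against zero to produce the resulting v8i1 predicate.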
8915
8917 const ARMSubtarget *ST) {
8918 EVT VT = Op->getValueType(0);
8919 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8920 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8921
8922 // The only time a CONCAT_VECTORS operation can have legal types is when
8923 // two 64-bit vectors are concatenated to a 128-bit vector.
8924 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8925 "unexpected CONCAT_VECTORS");
8926 SDLoc dl(Op);
8927 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8928 SDValue Op0 = Op.getOperand(0);
8929 SDValue Op1 = Op.getOperand(1);
8930 if (!Op0.isUndef())
8931 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8932 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8933 DAG.getIntPtrConstant(0, dl));
8934 if (!Op1.isUndef())
8935 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8936 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8937 DAG.getIntPtrConstant(1, dl));
8938 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8939}
8940
8942 const ARMSubtarget *ST) {
8943 SDValue V1 = Op.getOperand(0);
8944 SDValue V2 = Op.getOperand(1);
8945 SDLoc dl(Op);
8946 EVT VT = Op.getValueType();
8947 EVT Op1VT = V1.getValueType();
8948 unsigned NumElts = VT.getVectorNumElements();
8949 unsigned Index = V2->getAsZExtVal();
8950
8951 assert(VT.getScalarSizeInBits() == 1 &&
8952 "Unexpected custom EXTRACT_SUBVECTOR lowering");
8953 assert(ST->hasMVEIntegerOps() &&
8954 "EXTRACT_SUBVECTOR lowering only supported for MVE");
8955
8956 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8957
8958 // We now have Op1 promoted to a vector of integers, where v8i1 gets
8959 // promoted to v8i16, etc.
8960
8962
8963 if (NumElts == 2) {
8964 EVT SubVT = MVT::v4i32;
8965 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8966 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
8967 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8968 DAG.getIntPtrConstant(i, dl));
8969 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8970 DAG.getConstant(j, dl, MVT::i32));
8971 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8972 DAG.getConstant(j + 1, dl, MVT::i32));
8973 }
8974 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
8975 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8976 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8977 }
8978
8979 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
8980 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
8981 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
8982 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
8983 DAG.getIntPtrConstant(i, dl));
8984 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
8985 DAG.getConstant(j, dl, MVT::i32));
8986 }
8987
8988 // Now return the result of comparing the subvector with zero,
8989 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8990 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
8991 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8992}
8993
8994// Turn a truncate to a predicate (an i1 vector) into icmp ne (and(x, 1), 0).
8996 const ARMSubtarget *ST) {
8997 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
8998 EVT VT = N->getValueType(0);
8999 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9000 "Expected a vector i1 type!");
9001 SDValue Op = N->getOperand(0);
9002 EVT FromVT = Op.getValueType();
9003 SDLoc DL(N);
9004
9005 SDValue And =
9006 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9007 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9008 DAG.getCondCode(ISD::SETNE));
9009}
9010
9012 const ARMSubtarget *Subtarget) {
9013 if (!Subtarget->hasMVEIntegerOps())
9014 return SDValue();
9015
9016 EVT ToVT = N->getValueType(0);
9017 if (ToVT.getScalarType() == MVT::i1)
9018 return LowerTruncatei1(N, DAG, Subtarget);
9019
9020 // MVE does not have a single instruction to perform the truncation of a v4i32
9021 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9022 // Most of the instructions in MVE follow the 'Beats' system, where moving
9023 // values from different lanes is usually something that the instructions
9024 // avoid.
9025 //
9026 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9027 // which take the top/bottom half of a larger lane and extend it (or do the
9028 // opposite, truncating into the top/bottom lane from a larger lane). Note
9029 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9030 // bottom 16 bits from each vector lane. This works really well with T/B
9031 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9032 // to change order.
9033 //
9034 // But truncates and sext/zext are always going to be fairly common from llvm.
9035 // We have several options for how to deal with them:
9036 // - Wherever possible combine them into an instruction that makes them
9037 // "free". This includes loads/stores, which can perform the trunc as part
9038 // of the memory operation. Or certain shuffles that can be turned into
9039 // VMOVN/VMOVL.
9040 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9041 // trunc(mul(sext(a), sext(b))) may become
9042 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9043 // this case can use VMULL). This is performed in the
9044 // MVELaneInterleavingPass.
9045 // - Otherwise we have an option. By default we would expand the
9046 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9047 // registers. One for each vector lane in the vector. This can obviously be
9048 // very expensive.
9049 // - The other option is to use the fact that loads/stores can extend/truncate
9050 // to turn a trunc into two truncating stack stores and a stack reload. This
9051 // becomes 3 back-to-back memory operations, but at least that is less than
9052 // all the insert/extracts.
9053 //
9054 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9055 // are either optimized where they can be, or eventually lowered into stack
9056 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9057 // too early, where other instructions would be better, and stops us from
9058 // having to reconstruct multiple buildvector shuffles into loads/stores.
9059 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9060 return SDValue();
9061 EVT FromVT = N->getOperand(0).getValueType();
9062 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9063 return SDValue();
9064
9065 SDValue Lo, Hi;
9066 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9067 SDLoc DL(N);
9068 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9069}
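// Concretely, with MVE a trunc of v8i32 to v8i16 is split here into its two
// v4i32 halves and emitted as MVETRUNC(Lo, Hi), which is later either folded
// into cheaper forms (for example VMOVNT/VMOVNB style moves where a profitable
// pattern exists) or lowered into the stack store/reload sequence described
// above.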
9070
9072 const ARMSubtarget *Subtarget) {
9073 if (!Subtarget->hasMVEIntegerOps())
9074 return SDValue();
9075
9076 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9077
9078 EVT ToVT = N->getValueType(0);
9079 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9080 return SDValue();
9081 SDValue Op = N->getOperand(0);
9082 EVT FromVT = Op.getValueType();
9083 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9084 return SDValue();
9085
9086 SDLoc DL(N);
9087 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9088 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9089 ExtVT = MVT::v8i16;
9090
9091 unsigned Opcode =
9093 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9094 SDValue Ext1 = Ext.getValue(1);
9095
9096 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9097 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9098 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9099 }
9100
9101 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9102}
9103
9104/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9105/// element has been zero/sign-extended, depending on the isSigned parameter,
9106/// from an integer type half its size.
9108 bool isSigned) {
9109 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9110 EVT VT = N->getValueType(0);
9111 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9112 SDNode *BVN = N->getOperand(0).getNode();
9113 if (BVN->getValueType(0) != MVT::v4i32 ||
9114 BVN->getOpcode() != ISD::BUILD_VECTOR)
9115 return false;
9116 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9117 unsigned HiElt = 1 - LoElt;
9122 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9123 return false;
9124 if (isSigned) {
9125 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9126 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9127 return true;
9128 } else {
9129 if (Hi0->isZero() && Hi1->isZero())
9130 return true;
9131 }
9132 return false;
9133 }
9134
9135 if (N->getOpcode() != ISD::BUILD_VECTOR)
9136 return false;
9137
9138 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9139 SDNode *Elt = N->getOperand(i).getNode();
9141 unsigned EltSize = VT.getScalarSizeInBits();
9142 unsigned HalfSize = EltSize / 2;
9143 if (isSigned) {
9144 if (!isIntN(HalfSize, C->getSExtValue()))
9145 return false;
9146 } else {
9147 if (!isUIntN(HalfSize, C->getZExtValue()))
9148 return false;
9149 }
9150 continue;
9151 }
9152 return false;
9153 }
9154
9155 return true;
9156}
9157
9158/// isSignExtended - Check if a node is a vector value that is sign-extended
9159/// or a constant BUILD_VECTOR with sign-extended elements.
9161 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9162 return true;
9163 if (isExtendedBUILD_VECTOR(N, DAG, true))
9164 return true;
9165 return false;
9166}
9167
9168/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9169/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9171 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9173 return true;
9174 if (isExtendedBUILD_VECTOR(N, DAG, false))
9175 return true;
9176 return false;
9177}
9178
9179static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9180 if (OrigVT.getSizeInBits() >= 64)
9181 return OrigVT;
9182
9183 assert(OrigVT.isSimple() && "Expecting a simple value type");
9184
9185 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9186 switch (OrigSimpleTy) {
9187 default: llvm_unreachable("Unexpected Vector Type");
9188 case MVT::v2i8:
9189 case MVT::v2i16:
9190 return MVT::v2i32;
9191 case MVT::v4i8:
9192 return MVT::v4i16;
9193 }
9194}
9195
9196/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9197/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9198/// We insert the required extension here to get the vector to fill a D register.
9200 const EVT &OrigTy,
9201 const EVT &ExtTy,
9202 unsigned ExtOpcode) {
9203 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9204 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9205 // 64-bits we need to insert a new extension so that it will be 64-bits.
9206 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9207 if (OrigTy.getSizeInBits() >= 64)
9208 return N;
9209
9210 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9211 EVT NewVT = getExtensionTo64Bits(OrigTy);
9212
9213 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9214}
9215
9216/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9217/// does not do any sign/zero extension. If the original vector is less
9218/// than 64 bits, an appropriate extension will be added after the load to
9219/// reach a total size of 64 bits. We have to add the extension separately
9220/// because ARM does not have a sign/zero extending load for vectors.
9222 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9223
9224 // The load already has the right type.
9225 if (ExtendedTy == LD->getMemoryVT())
9226 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9227 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9228 LD->getMemOperand()->getFlags());
9229
9230 // We need to create a zextload/sextload. We cannot just create a load
9231 // followed by a zext/sext node because LowerMUL is also run during normal
9232 // operation legalization where we can't create illegal types.
9233 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9234 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9235 LD->getMemoryVT(), LD->getAlign(),
9236 LD->getMemOperand()->getFlags());
9237}
9238
9239/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9240/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9241/// the unextended value. The unextended vector should be 64 bits so that it can
9242/// be used as an operand to a VMULL instruction. If the original vector size
9243 /// before extension is less than 64 bits we add an extension to resize
9244/// the vector to 64 bits.
9246 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9247 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9248 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9249 N->getOperand(0)->getValueType(0),
9250 N->getValueType(0),
9251 N->getOpcode());
9252
9253 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9254 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9255 "Expected extending load");
9256
9257 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9258 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9259 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9260 SDValue extLoad =
9261 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9262 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9263
9264 return newLoad;
9265 }
9266
9267 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9268 // have been legalized as a BITCAST from v4i32.
9269 if (N->getOpcode() == ISD::BITCAST) {
9270 SDNode *BVN = N->getOperand(0).getNode();
9272 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9273 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9274 return DAG.getBuildVector(
9275 MVT::v2i32, SDLoc(N),
9276 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9277 }
9278 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9279 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9280 EVT VT = N->getValueType(0);
9281 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9282 unsigned NumElts = VT.getVectorNumElements();
9283 MVT TruncVT = MVT::getIntegerVT(EltSize);
9285 SDLoc dl(N);
9286 for (unsigned i = 0; i != NumElts; ++i) {
9287 const APInt &CInt = N->getConstantOperandAPInt(i);
9288 // Element types smaller than 32 bits are not legal, so use i32 elements.
9289 // The values are implicitly truncated so sext vs. zext doesn't matter.
9290 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9291 }
9292 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9293}
9294
9295static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9296 unsigned Opcode = N->getOpcode();
9297 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9298 SDNode *N0 = N->getOperand(0).getNode();
9299 SDNode *N1 = N->getOperand(1).getNode();
9300 return N0->hasOneUse() && N1->hasOneUse() &&
9301 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9302 }
9303 return false;
9304}
9305
9306static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9307 unsigned Opcode = N->getOpcode();
9308 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9309 SDNode *N0 = N->getOperand(0).getNode();
9310 SDNode *N1 = N->getOperand(1).getNode();
9311 return N0->hasOneUse() && N1->hasOneUse() &&
9312 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9313 }
9314 return false;
9315}
9316
9318 // Multiplications are only custom-lowered for 128-bit vectors so that
9319 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9320 EVT VT = Op.getValueType();
9321 assert(VT.is128BitVector() && VT.isInteger() &&
9322 "unexpected type for custom-lowering ISD::MUL");
9323 SDNode *N0 = Op.getOperand(0).getNode();
9324 SDNode *N1 = Op.getOperand(1).getNode();
9325 unsigned NewOpc = 0;
9326 bool isMLA = false;
9327 bool isN0SExt = isSignExtended(N0, DAG);
9328 bool isN1SExt = isSignExtended(N1, DAG);
9329 if (isN0SExt && isN1SExt)
9330 NewOpc = ARMISD::VMULLs;
9331 else {
9332 bool isN0ZExt = isZeroExtended(N0, DAG);
9333 bool isN1ZExt = isZeroExtended(N1, DAG);
9334 if (isN0ZExt && isN1ZExt)
9335 NewOpc = ARMISD::VMULLu;
9336 else if (isN1SExt || isN1ZExt) {
9337 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9338 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9339 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9340 NewOpc = ARMISD::VMULLs;
9341 isMLA = true;
9342 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9343 NewOpc = ARMISD::VMULLu;
9344 isMLA = true;
9345 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9346 std::swap(N0, N1);
9347 NewOpc = ARMISD::VMULLu;
9348 isMLA = true;
9349 }
9350 }
9351
9352 if (!NewOpc) {
9353 if (VT == MVT::v2i64)
9354 // Fall through to expand this. It is not legal.
9355 return SDValue();
9356 else
9357 // Other vector multiplications are legal.
9358 return Op;
9359 }
9360 }
9361
9362 // Legalize to a VMULL instruction.
9363 SDLoc DL(Op);
9364 SDValue Op0;
9365 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9366 if (!isMLA) {
9367 Op0 = SkipExtensionForVMULL(N0, DAG);
9369 Op1.getValueType().is64BitVector() &&
9370 "unexpected types for extended operands to VMULL");
9371 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9372 }
9373
9374 // Optimize (zext A + zext B) * C to (VMULL A, C) + (VMULL B, C) during
9375 // isel lowering to take advantage of no-stall back-to-back vmul + vmla.
9376 // vmull q0, d4, d6
9377 // vmlal q0, d5, d6
9378 // is faster than
9379 // vaddl q0, d4, d5
9380 // vmovl q1, d6
9381 // vmul q0, q0, q1
9382 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9383 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9384 EVT Op1VT = Op1.getValueType();
9385 return DAG.getNode(N0->getOpcode(), DL, VT,
9386 DAG.getNode(NewOpc, DL, VT,
9387 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9388 DAG.getNode(NewOpc, DL, VT,
9389 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9390}
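// A simple instance of the above: a v8i16 multiply whose operands are both
// sign-extended from v8i8 is matched as ARMISD::VMULLs on the two original
// v8i8 values, giving a single vmull.s8 rather than two vmovl extensions
// followed by a full-width vmul.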
9391
9393 SelectionDAG &DAG) {
9394 // TODO: Should this propagate fast-math-flags?
9395
9396 // Convert to float
9397 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9398 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9399 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9400 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9401 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9402 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9403 // Get reciprocal estimate.
9404 // float4 recip = vrecpeq_f32(yf);
9405 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9406 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9407 Y);
9408 // Because char has a smaller range than uchar, we can actually get away
9409 // without any newton steps. This requires that we use a weird bias
9410 // of 0xb000, however (again, this has been exhaustively tested).
9411 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9412 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9413 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9414 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9415 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9416 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9417 // Convert back to short.
9418 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9419 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9420 return X;
9421}
9422
9424 SelectionDAG &DAG) {
9425 // TODO: Should this propagate fast-math-flags?
9426
9427 SDValue N2;
9428 // Convert to float.
9429 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9430 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9431 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9432 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9433 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9434 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9435
9436 // Use reciprocal estimate and one refinement step.
9437 // float4 recip = vrecpeq_f32(yf);
9438 // recip *= vrecpsq_f32(yf, recip);
9439 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9440 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9441 N1);
9442 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9443 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9444 N1, N2);
9445 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9446 // Because short has a smaller range than ushort, we can actually get away
9447 // with only a single newton step. This requires that we use a weird bias
9448 // of 89, however (again, this has been exhaustively tested).
9449 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9450 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9451 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9452 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9453 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9454 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9455 // Convert back to integer and return.
9456 // return vmovn_s32(vcvt_s32_f32(result));
9457 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9458 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9459 return N0;
9460}
9461
9463 const ARMSubtarget *ST) {
9464 EVT VT = Op.getValueType();
9465 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9466 "unexpected type for custom-lowering ISD::SDIV");
9467
9468 SDLoc dl(Op);
9469 SDValue N0 = Op.getOperand(0);
9470 SDValue N1 = Op.getOperand(1);
9471 SDValue N2, N3;
9472
9473 if (VT == MVT::v8i8) {
9474 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9475 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9476
9477 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9478 DAG.getIntPtrConstant(4, dl));
9479 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9480 DAG.getIntPtrConstant(4, dl));
9481 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9482 DAG.getIntPtrConstant(0, dl));
9483 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9484 DAG.getIntPtrConstant(0, dl));
9485
9486 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9487 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9488
9489 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9490 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9491
9492 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9493 return N0;
9494 }
9495 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9496}
9497
9499 const ARMSubtarget *ST) {
9500 // TODO: Should this propagate fast-math-flags?
9501 EVT VT = Op.getValueType();
9502 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9503 "unexpected type for custom-lowering ISD::UDIV");
9504
9505 SDLoc dl(Op);
9506 SDValue N0 = Op.getOperand(0);
9507 SDValue N1 = Op.getOperand(1);
9508 SDValue N2, N3;
9509
9510 if (VT == MVT::v8i8) {
9511 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9512 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9513
9514 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9515 DAG.getIntPtrConstant(4, dl));
9516 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9517 DAG.getIntPtrConstant(4, dl));
9518 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9519 DAG.getIntPtrConstant(0, dl));
9520 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9521 DAG.getIntPtrConstant(0, dl));
9522
9523 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9524 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9525
9526 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9527 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9528
9529 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9530 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9531 MVT::i32),
9532 N0);
9533 return N0;
9534 }
9535
9536 // v4i16 udiv ... Convert to float.
9537 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9538 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9539 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9540 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9541 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9542 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9543
9544 // Use reciprocal estimate and two refinement steps.
9545 // float4 recip = vrecpeq_f32(yf);
9546 // recip *= vrecpsq_f32(yf, recip);
9547 // recip *= vrecpsq_f32(yf, recip);
9548 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9549 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9550 BN1);
9551 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9552 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9553 BN1, N2);
9554 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9555 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9556 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9557 BN1, N2);
9558 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9559 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9560 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9561 // and that it will never cause us to return an answer too large).
9562 // float4 result = as_float4(as_int4(xf*recip) + 2);
9563 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9564 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9565 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9566 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9567 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9568 // Convert back to integer and return.
9569 // return vmovn_u32(vcvt_s32_f32(result));
9570 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9571 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9572 return N0;
9573}
9574
9576 SDNode *N = Op.getNode();
9577 EVT VT = N->getValueType(0);
9578 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9579
9580 SDValue Carry = Op.getOperand(2);
9581
9582 SDLoc DL(Op);
9583
9584 SDValue Result;
9585 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9586 // This converts the boolean value carry into the carry flag.
9587 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9588
9589 // Do the addition proper using the carry flag we wanted.
9590 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9591 Op.getOperand(1), Carry);
9592
9593 // Now convert the carry flag into a boolean value.
9594 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9595 } else {
9596 // ARMISD::SUBE expects a carry, not a borrow like ISD::USUBO_CARRY, so we
9597 // have to invert the carry first.
9598 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9599 DAG.getConstant(1, DL, MVT::i32), Carry);
9600 // This converts the boolean value carry into the carry flag.
9601 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9602
9603 // Do the subtraction proper using the carry flag we wanted.
9604 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9605 Op.getOperand(1), Carry);
9606
9607 // Now convert the carry flag into a boolean value.
9608 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9609 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9610 // by ISD::USUBO_CARRY, so compute 1 - C.
9611 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9612 DAG.getConstant(1, DL, MVT::i32), Carry);
9613 }
9614
9615 // Return both values.
9616 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9617}
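// In other words, ARM's flag-based ADDE/SUBE consume and produce a carry,
// while ISD::USUBO_CARRY works with a borrow, and borrow == 1 - carry; that is
// why the subtraction path above flips the incoming value before ARMISD::SUBE
// and flips the produced carry again afterwards.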
9618
9619SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9620 bool Signed,
9621 SDValue &Chain) const {
9622 EVT VT = Op.getValueType();
9623 assert((VT == MVT::i32 || VT == MVT::i64) &&
9624 "unexpected type for custom lowering DIV");
9625 SDLoc dl(Op);
9626
9627 const auto &DL = DAG.getDataLayout();
9628 RTLIB::Libcall LC;
9629 if (Signed)
9630 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9631 else
9632 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9633
9634 RTLIB::LibcallImpl LCImpl = getLibcallImpl(LC);
9635 SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));
9636
9638
9639 for (auto AI : {1, 0}) {
9640 SDValue Operand = Op.getOperand(AI);
9641 Args.emplace_back(Operand,
9642 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9643 }
9644
9645 CallLoweringInfo CLI(DAG);
9646 CLI.setDebugLoc(dl).setChain(Chain).setCallee(
9648 ES, std::move(Args));
9649
9650 return LowerCallTo(CLI).first;
9651}
9652
9653// This is a code size optimisation: return the original SDIV node to
9654// DAGCombiner when we don't want to expand SDIV into a sequence of
9655// instructions, and an empty node otherwise which will cause the
9656// SDIV to be expanded in DAGCombine.
9657SDValue
9658ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9659 SelectionDAG &DAG,
9660 SmallVectorImpl<SDNode *> &Created) const {
9661 // TODO: Support SREM
9662 if (N->getOpcode() != ISD::SDIV)
9663 return SDValue();
9664
9665 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9666 const bool MinSize = ST.hasMinSize();
9667 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9668 : ST.hasDivideInARMMode();
9669
9670 // Don't touch vector types; rewriting this may lead to scalarizing
9671 // the int divs.
9672 if (N->getOperand(0).getValueType().isVector())
9673 return SDValue();
9674
9676 // Bail if MinSize is not set; for both ARM and Thumb mode we also need
9676 // hwdiv support for this to be really profitable.
9677 if (!(MinSize && HasDivide))
9678 return SDValue();
9679
9680 // ARM mode is a bit simpler than Thumb: we can handle large power
9681 // of 2 immediates with 1 mov instruction; no further checks required,
9682 // just return the sdiv node.
9683 if (!ST.isThumb())
9684 return SDValue(N, 0);
9685
9686 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV, and thus
9687 // lose the code size benefit of a MOVS that requires only 2 bytes.
9688 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here, but
9689 // as that hook performs exactly this check, it's not worth the trouble to get TTI.
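// For example, at minsize on a Thumb target with hardware divide, a divide by
// 8 keeps the SDIV here (the divisor fits a 2-byte MOVS), whereas a divide by
// 256 returns an empty SDValue below and is left to the generic expansion.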
9690 if (Divisor.sgt(128))
9691 return SDValue();
9692
9693 return SDValue(N, 0);
9694}
9695
9696SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9697 bool Signed) const {
9698 assert(Op.getValueType() == MVT::i32 &&
9699 "unexpected type for custom lowering DIV");
9700 SDLoc dl(Op);
9701
9702 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9703 DAG.getEntryNode(), Op.getOperand(1));
9704
9705 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9706}
9707
9708static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9709 SDLoc DL(N);
9710 SDValue Op = N->getOperand(1);
9711 if (N->getValueType(0) == MVT::i32)
9712 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9713 SDValue Lo, Hi;
9714 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9715 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9716 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9717}
9718
9719void ARMTargetLowering::ExpandDIV_Windows(
9720 SDValue Op, SelectionDAG &DAG, bool Signed,
9721 SmallVectorImpl<SDValue> &Results) const {
9722 const auto &DL = DAG.getDataLayout();
9723
9724 assert(Op.getValueType() == MVT::i64 &&
9725 "unexpected type for custom lowering DIV");
9726 SDLoc dl(Op);
9727
9728 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9729
9730 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9731
9732 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9733 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9734 DAG.getConstant(32, dl, getPointerTy(DL)));
9735 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9736
9737 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9738}
9739
9740static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9741 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9742 EVT MemVT = LD->getMemoryVT();
9743 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9744 MemVT == MVT::v16i1) &&
9745 "Expected a predicate type!");
9746 assert(MemVT == Op.getValueType());
9747 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9748 "Expected a non-extending load");
9749 assert(LD->isUnindexed() && "Expected an unindexed load");
9750
9751 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
9752 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9753 // need to make sure that 8/4/2 bits are actually loaded into the correct
9754 // place, which means loading the value and then shuffling the values into
9755 // the bottom bits of the predicate.
9756 // Equally, a VLDR for a v16i1 will actually load 32 bits (so would be
9757 // incorrect for BE).
9758 // As for BE, the rest of llvm apparently assumes the reverse order to a
9759 // natural VMSR(load), so the loaded value needs to be reversed.
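// For example, for a v4i1 load on a big-endian target, the i32 extending load
// below is bit-reversed and shifted right by 32 - 4 = 28 so that the 4
// predicate bits land in the least significant bits before the PREDICATE_CAST
// and the v4i1 EXTRACT_SUBVECTOR.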
9760
9761 SDLoc dl(Op);
9762 SDValue Load = DAG.getExtLoad(
9763 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9764 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9765 LD->getMemOperand());
9766 SDValue Val = Load;
9767 if (DAG.getDataLayout().isBigEndian())
9768 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9769 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9770 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9771 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9772 if (MemVT != MVT::v16i1)
9773 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9774 DAG.getConstant(0, dl, MVT::i32));
9775 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9776}
9777
9778void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9779 SelectionDAG &DAG) const {
9780 LoadSDNode *LD = cast<LoadSDNode>(N);
9781 EVT MemVT = LD->getMemoryVT();
9782 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9783
9784 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9785 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9786 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9787 SDLoc dl(N);
9788 SDValue Result = DAG.getMemIntrinsicNode(
9789 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9790 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9791 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9792 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9793 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9794 Results.append({Pair, Result.getValue(2)});
9795 }
9796}
9797
9798static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9799 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9800 EVT MemVT = ST->getMemoryVT();
9801 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9802 MemVT == MVT::v16i1) &&
9803 "Expected a predicate type!");
9804 assert(MemVT == ST->getValue().getValueType());
9805 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9806 assert(ST->isUnindexed() && "Expected an unindexed store");
9807
9808 // Only store the v2i1, v4i1 or v8i1 worth of bits, via a buildvector with
9809 // the top lanes undef and a scalar truncating store.
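// For example, storing a v4i1 extracts its 4 lanes into the bottom of a v16i1
// (in reversed order on big-endian), moves that predicate into a GPR with
// PREDICATE_CAST, and then issues a truncating scalar store of just those bits.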
9810 SDLoc dl(Op);
9811 SDValue Build = ST->getValue();
9812 if (MemVT != MVT::v16i1) {
9813 SmallVector<SDValue, 16> Ops;
9814 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9815 unsigned Elt = DAG.getDataLayout().isBigEndian()
9816 ? MemVT.getVectorNumElements() - I - 1
9817 : I;
9818 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9819 DAG.getConstant(Elt, dl, MVT::i32)));
9820 }
9821 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9822 Ops.push_back(DAG.getUNDEF(MVT::i32));
9823 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9824 }
9825 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9826 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9827 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9828 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9829 DAG.getConstant(16, dl, MVT::i32));
9830 return DAG.getTruncStore(
9831 ST->getChain(), dl, GRP, ST->getBasePtr(),
9832 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9833 ST->getMemOperand());
9834}
9835
9836static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
9837 const ARMSubtarget *Subtarget) {
9838 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9839 EVT MemVT = ST->getMemoryVT();
9840 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9841
9842 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9843 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9844 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9845 SDNode *N = Op.getNode();
9846 SDLoc dl(N);
9847
9848 SDValue Lo = DAG.getNode(
9849 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9850 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9851 MVT::i32));
9852 SDValue Hi = DAG.getNode(
9853 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9854 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9855 MVT::i32));
9856
9857 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9858 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9859 MemVT, ST->getMemOperand());
9860 } else if (Subtarget->hasMVEIntegerOps() &&
9861 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9862 MemVT == MVT::v16i1))) {
9863 return LowerPredicateStore(Op, DAG);
9864 }
9865
9866 return SDValue();
9867}
9868
9869static bool isZeroVector(SDValue N) {
9870 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9871 (N->getOpcode() == ARMISD::VMOVIMM &&
9872 isNullConstant(N->getOperand(0))));
9873}
9874
9875static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
9876 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
9877 MVT VT = Op.getSimpleValueType();
9878 SDValue Mask = N->getMask();
9879 SDValue PassThru = N->getPassThru();
9880 SDLoc dl(Op);
9881
9882 if (isZeroVector(PassThru))
9883 return Op;
9884
9885 // MVE Masked loads use zero as the passthru value. Here we convert undef to
9886 // zero too, and other values are lowered to a select.
9887 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9888 DAG.getTargetConstant(0, dl, MVT::i32));
9889 SDValue NewLoad = DAG.getMaskedLoad(
9890 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9891 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9892 N->getExtensionType(), N->isExpandingLoad());
9893 SDValue Combo = NewLoad;
9894 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9895 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9896 isZeroVector(PassThru->getOperand(0));
9897 if (!PassThru.isUndef() && !PassThruIsCastZero)
9898 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9899 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9900}
9901
9902static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
9903 const ARMSubtarget *ST) {
9904 if (!ST->hasMVEIntegerOps())
9905 return SDValue();
9906
9907 SDLoc dl(Op);
9908 unsigned BaseOpcode = 0;
9909 switch (Op->getOpcode()) {
9910 default: llvm_unreachable("Expected VECREDUCE opcode");
9911 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9912 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9913 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
9914 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
9915 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
9916 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
9917 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9918 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9919 }
9920
9921 SDValue Op0 = Op->getOperand(0);
9922 EVT VT = Op0.getValueType();
9923 EVT EltVT = VT.getVectorElementType();
9924 unsigned NumElts = VT.getVectorNumElements();
9925 unsigned NumActiveLanes = NumElts;
9926
9927 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9928 NumActiveLanes == 2) &&
9929 "Only expected a power 2 vector size");
9930
9931 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
9932 // allows us to easily extract vector elements from the lanes.
9933 while (NumActiveLanes > 4) {
9934 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
9935 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
9936 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
9937 NumActiveLanes /= 2;
9938 }
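// For example, reducing a v8i16 multiply: VREV32 swaps each pair of adjacent
// 16-bit lanes, so after one mul(X, VREV32(X)) step lanes 0, 2, 4 and 6 each
// hold the product of an adjacent pair, and those four lanes are what the
// extracts below combine.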
9939
9940 SDValue Res;
9941 if (NumActiveLanes == 4) {
9942 // The remaining 4 elements are combined sequentially using BaseOpcode
9943 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9944 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
9945 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9946 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
9947 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9948 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
9949 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9950 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
9951 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9952 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
9953 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
9954 } else {
9955 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9956 DAG.getConstant(0, dl, MVT::i32));
9957 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
9958 DAG.getConstant(1, dl, MVT::i32));
9959 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
9960 }
9961
9962 // Result type may be wider than element type.
9963 if (EltVT != Op->getValueType(0))
9964 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
9965 return Res;
9966}
9967
9968static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
9969 const ARMSubtarget *ST) {
9970 if (!ST->hasMVEFloatOps())
9971 return SDValue();
9972 return LowerVecReduce(Op, DAG, ST);
9973}
9974
9975static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
9976 const ARMSubtarget *ST) {
9977 if (!ST->hasNEON())
9978 return SDValue();
9979
9980 SDLoc dl(Op);
9981 SDValue Op0 = Op->getOperand(0);
9982 EVT VT = Op0.getValueType();
9983 EVT EltVT = VT.getVectorElementType();
9984
9985 unsigned PairwiseIntrinsic = 0;
9986 switch (Op->getOpcode()) {
9987 default:
9988 llvm_unreachable("Expected VECREDUCE opcode");
9989 case ISD::VECREDUCE_UMIN:
9990 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
9991 break;
9992 case ISD::VECREDUCE_UMAX:
9993 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
9994 break;
9995 case ISD::VECREDUCE_SMIN:
9996 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
9997 break;
9998 case ISD::VECREDUCE_SMAX:
9999 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10000 break;
10001 }
10002 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10003
10004 unsigned NumElts = VT.getVectorNumElements();
10005 unsigned NumActiveLanes = NumElts;
10006
10007 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10008 NumActiveLanes == 2) &&
10009 "Only expected a power 2 vector size");
10010
10011 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10012 if (VT.is128BitVector()) {
10013 SDValue Lo, Hi;
10014 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10015 VT = Lo.getValueType();
10016 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10017 NumActiveLanes /= 2;
10018 }
10019
10020 // Use pairwise reductions until one lane remains
10021 while (NumActiveLanes > 1) {
10022 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10023 NumActiveLanes /= 2;
10024 }
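// For example, a v4i32 unsigned-min reduction first splits into two v2i32
// halves combined with one pairwise vpminu call, then a second vpminu of that
// result with itself leaves the overall minimum in lane 0 for the extract
// below.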
10025
10026 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10027 DAG.getConstant(0, dl, MVT::i32));
10028
10029 // Result type may be wider than element type.
10030 if (EltVT != Op.getValueType()) {
10031 unsigned Extend = 0;
10032 switch (Op->getOpcode()) {
10033 default:
10034 llvm_unreachable("Expected VECREDUCE opcode");
10035 case ISD::VECREDUCE_UMIN:
10036 case ISD::VECREDUCE_UMAX:
10037 Extend = ISD::ZERO_EXTEND;
10038 break;
10039 case ISD::VECREDUCE_SMIN:
10040 case ISD::VECREDUCE_SMAX:
10041 Extend = ISD::SIGN_EXTEND;
10042 break;
10043 }
10044 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10045 }
10046 return Res;
10047}
10048
10049static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10050 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10051 // Acquire/Release load/store is not legal for targets without a dmb or
10052 // equivalent available.
10053 return SDValue();
10054
10055 // Monotonic load/store is legal for all targets.
10056 return Op;
10057}
10058
10059static void ReplaceREADCYCLECOUNTER(SDNode *N,
10060 SmallVectorImpl<SDValue> &Results,
10061 SelectionDAG &DAG,
10062 const ARMSubtarget *Subtarget) {
10063 SDLoc DL(N);
10064 // Under Power Management extensions, the cycle-count is:
10065 // mrc p15, #0, <Rt>, c9, c13, #0
10066 SDValue Ops[] = { N->getOperand(0), // Chain
10067 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10068 DAG.getTargetConstant(15, DL, MVT::i32),
10069 DAG.getTargetConstant(0, DL, MVT::i32),
10070 DAG.getTargetConstant(9, DL, MVT::i32),
10071 DAG.getTargetConstant(13, DL, MVT::i32),
10072 DAG.getTargetConstant(0, DL, MVT::i32)
10073 };
10074
10075 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10076 DAG.getVTList(MVT::i32, MVT::Other), Ops);
10077 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10078 DAG.getConstant(0, DL, MVT::i32)));
10079 Results.push_back(Cycles32.getValue(1));
10080}
10081
10082static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10083 SDValue V1) {
10084 SDLoc dl(V0.getNode());
10085 SDValue RegClass =
10086 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10087 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10088 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10089 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10090 return SDValue(
10091 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10092}
10093
10094static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10095 SDLoc dl(V.getNode());
10096 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10097 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10098 if (isBigEndian)
10099 std::swap(VLo, VHi);
10100 return createGPRPairNode2xi32(DAG, VLo, VHi);
10101}
10102
10103static void ReplaceCMP_SWAP_64Results(SDNode *N,
10104 SmallVectorImpl<SDValue> &Results,
10105 SelectionDAG &DAG) {
10106 assert(N->getValueType(0) == MVT::i64 &&
10107 "AtomicCmpSwap on types less than 64 should be legal");
10108 SDValue Ops[] = {
10109 createGPRPairNode2xi32(DAG, N->getOperand(1),
10110 DAG.getUNDEF(MVT::i32)), // pointer, temp
10111 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10112 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10113 N->getOperand(0), // chain in
10114 };
10115 SDNode *CmpSwap = DAG.getMachineNode(
10116 ARM::CMP_SWAP_64, SDLoc(N),
10117 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10118
10119 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10120 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10121
10122 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10123
10124 SDValue Lo =
10125 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10126 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10127 SDValue Hi =
10128 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10129 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10130 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10131 Results.push_back(SDValue(CmpSwap, 2));
10132}
10133
10134SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10135 SDLoc dl(Op);
10136 EVT VT = Op.getValueType();
10137 SDValue Chain = Op.getOperand(0);
10138 SDValue LHS = Op.getOperand(1);
10139 SDValue RHS = Op.getOperand(2);
10140 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10141 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10142
10143 // If we don't have instructions of this float type then soften to a libcall
10144 // and use SETCC instead.
10145 if (isUnsupportedFloatingType(LHS.getValueType())) {
10146 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10147 Chain, IsSignaling);
10148 if (!RHS.getNode()) {
10149 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10150 CC = ISD::SETNE;
10151 }
10152 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10153 DAG.getCondCode(CC));
10154 return DAG.getMergeValues({Result, Chain}, dl);
10155 }
10156
10157 ARMCC::CondCodes CondCode, CondCode2;
10158 FPCCToARMCC(CC, CondCode, CondCode2);
10159
10160 SDValue True = DAG.getConstant(1, dl, VT);
10161 SDValue False = DAG.getConstant(0, dl, VT);
10162 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10163 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10164 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10165 if (CondCode2 != ARMCC::AL) {
10166 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10167 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10168 }
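// For example, an unordered-or-equal compare (SETUEQ) maps to the ARM
// condition codes EQ plus VS, so the second CMOV above also selects 1 when
// the VFP compare reports the operands as unordered.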
10169 return DAG.getMergeValues({Result, Chain}, dl);
10170}
10171
10172SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10173 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10174
10175 EVT VT = getPointerTy(DAG.getDataLayout());
10176 int FI = MFI.CreateFixedObject(4, 0, false);
10177 return DAG.getFrameIndex(FI, VT);
10178}
10179
10180SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10181 SelectionDAG &DAG) const {
10182 SDLoc DL(Op);
10183 MakeLibCallOptions CallOptions;
10184 MVT SVT = Op.getOperand(0).getSimpleValueType();
10185 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10186 SDValue Res =
10187 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10188 return DAG.getBitcast(MVT::i32, Res);
10189}
10190
10191SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10192 SDLoc dl(Op);
10193 SDValue LHS = Op.getOperand(0);
10194 SDValue RHS = Op.getOperand(1);
10195
10196 // Determine if this is signed or unsigned comparison
10197 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10198
10199 // Special case for Thumb1 UCMP only
10200 if (!IsSigned && Subtarget->isThumb1Only()) {
10201 // For Thumb unsigned comparison, use this sequence:
10202 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10203 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10204 // cmp r1, r0 ; compare RHS with LHS
10205 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10206 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
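    // For example, ucmp(3, 5): the subs borrows (carry clear), so the first
    // sbc produces -1; cmp 5, 3 does not borrow (carry set), so the second
    // sbc produces 0; the final subs yields -1 - 0 = -1. Symmetrically,
    // ucmp(5, 3) gives 0 - (-1) = 1, and equal operands give 0 - 0 = 0.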
10207
10208 // First subtraction: LHS - RHS
10209 SDValue Sub1WithFlags = DAG.getNode(
10210 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10211 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10212 SDValue Flags1 = Sub1WithFlags.getValue(1);
10213
10214 // SUBE: Sub1Result - Sub1Result - !carry
10215 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10216 SDValue Sbc1 =
10217 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10218 Sub1Result, Sub1Result, Flags1);
10219 SDValue Sbc1Result = Sbc1.getValue(0);
10220
10221 // Second comparison: RHS vs LHS (reverse comparison)
10222 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10223
10224 // SUBE: RHS - RHS - !carry
10225 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10226 SDValue Sbc2 = DAG.getNode(
10227 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10228 SDValue Sbc2Result = Sbc2.getValue(0);
10229
10230 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10231 SDValue Result =
10232 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10233 if (Op.getValueType() != MVT::i32)
10234 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10235
10236 return Result;
10237 }
10238
10239 // For the ARM assembly pattern:
10240 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10241 // movgt r0, #1 ; if LHS > RHS, set result to 1
10242 // ; (GT for signed, HI for unsigned)
10243 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10244 // ; if LHS == RHS, result remains 0 from the subs
10245
10246 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10247 unsigned Opcode = ARMISD::SUBC;
10248
10249 // Check if RHS is a subtraction against 0: (0 - X)
10250 if (RHS.getOpcode() == ISD::SUB) {
10251 SDValue SubLHS = RHS.getOperand(0);
10252 SDValue SubRHS = RHS.getOperand(1);
10253
10254 // Check if it's 0 - X
10255 if (isNullConstant(SubLHS)) {
10256 bool CanUseAdd = false;
10257 if (IsSigned) {
10258 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10259 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10260 .getSignedMinValue()
10261 .isMinSignedValue()) {
10262 CanUseAdd = true;
10263 }
10264 } else {
10265 // For UCMP: only if X is known to never be zero
10266 if (DAG.isKnownNeverZero(SubRHS)) {
10267 CanUseAdd = true;
10268 }
10269 }
10270
10271 if (CanUseAdd) {
10272 Opcode = ARMISD::ADDC;
10273 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10274 // LHS - (0 - X)
10275 }
10276 }
10277 }
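  // For example, ucmp(a, 0 - b) with b known to be non-zero is rewritten to
  // set the flags with ADDC (computing a + b) rather than subtracting the
  // negated value; the conditional moves below are unchanged.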
10278
10279 // Generate the operation with flags
10280 SDValue OpWithFlags =
10281 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10282
10283 SDValue OpResult = OpWithFlags.getValue(0);
10284 SDValue Flags = OpWithFlags.getValue(1);
10285
10286 // Constants for conditional moves
10287 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10288 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10289
10290 // Select condition codes based on signed vs unsigned
10291 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10292 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10293
10294 // First conditional move: if greater than, set to 1
10295 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10296 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10297 GTCondValue, Flags);
10298
10299 // Second conditional move: if less than, set to -1
10300 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10301 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10302 LTCondValue, Flags);
10303
10304 if (Op.getValueType() != MVT::i32)
10305 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10306
10307 return Result2;
10308}
10309
10310SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10311 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10312 switch (Op.getOpcode()) {
10313 default: llvm_unreachable("Don't know how to custom lower this!");
10314 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10315 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10316 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10317 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10318 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10319 case ISD::SELECT: return LowerSELECT(Op, DAG);
10320 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10321 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10322 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10323 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10324 case ISD::VASTART: return LowerVASTART(Op, DAG);
10325 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10326 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10327 case ISD::SINT_TO_FP:
10328 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10329 case ISD::STRICT_FP_TO_SINT:
10330 case ISD::STRICT_FP_TO_UINT:
10331 case ISD::FP_TO_SINT:
10332 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10333 case ISD::FP_TO_SINT_SAT:
10334 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10335 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10336 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10337 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10338 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10339 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10340 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10341 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10342 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10343 Subtarget);
10344 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10345 case ISD::SHL:
10346 case ISD::SRL:
10347 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10348 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10349 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10350 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10351 case ISD::SRL_PARTS:
10352 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10353 case ISD::CTTZ:
10354 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10355 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10356 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10357 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10358 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10359 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10360 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10361 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10362 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10363 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10364 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10365 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10366 case ISD::SIGN_EXTEND:
10367 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10368 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10369 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10370 case ISD::SET_FPMODE:
10371 return LowerSET_FPMODE(Op, DAG);
10372 case ISD::RESET_FPMODE:
10373 return LowerRESET_FPMODE(Op, DAG);
10374 case ISD::MUL: return LowerMUL(Op, DAG);
10375 case ISD::SDIV:
10376 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10377 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10378 return LowerSDIV(Op, DAG, Subtarget);
10379 case ISD::UDIV:
10380 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10381 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10382 return LowerUDIV(Op, DAG, Subtarget);
10383 case ISD::UADDO_CARRY:
10384 case ISD::USUBO_CARRY:
10385 return LowerUADDSUBO_CARRY(Op, DAG);
10386 case ISD::SADDO:
10387 case ISD::SSUBO:
10388 return LowerSignedALUO(Op, DAG);
10389 case ISD::UADDO:
10390 case ISD::USUBO:
10391 return LowerUnsignedALUO(Op, DAG);
10392 case ISD::SADDSAT:
10393 case ISD::SSUBSAT:
10394 case ISD::UADDSAT:
10395 case ISD::USUBSAT:
10396 return LowerADDSUBSAT(Op, DAG, Subtarget);
10397 case ISD::LOAD:
10398 return LowerPredicateLoad(Op, DAG);
10399 case ISD::STORE:
10400 return LowerSTORE(Op, DAG, Subtarget);
10401 case ISD::MLOAD:
10402 return LowerMLOAD(Op, DAG);
10403 case ISD::VECREDUCE_MUL:
10404 case ISD::VECREDUCE_AND:
10405 case ISD::VECREDUCE_OR:
10406 case ISD::VECREDUCE_XOR:
10407 return LowerVecReduce(Op, DAG, Subtarget);
10408 case ISD::VECREDUCE_FADD:
10409 case ISD::VECREDUCE_FMUL:
10410 case ISD::VECREDUCE_FMIN:
10411 case ISD::VECREDUCE_FMAX:
10412 return LowerVecReduceF(Op, DAG, Subtarget);
10413 case ISD::VECREDUCE_UMIN:
10414 case ISD::VECREDUCE_UMAX:
10415 case ISD::VECREDUCE_SMIN:
10416 case ISD::VECREDUCE_SMAX:
10417 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10418 case ISD::ATOMIC_LOAD:
10419 case ISD::ATOMIC_STORE:
10420 return LowerAtomicLoadStore(Op, DAG);
10421 case ISD::SDIVREM:
10422 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10423 case ISD::DYNAMIC_STACKALLOC:
10424 if (Subtarget->isTargetWindows())
10425 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10426 llvm_unreachable("Don't know how to custom lower this!");
10427 case ISD::STRICT_FP_ROUND:
10428 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10429 case ISD::STRICT_FP_EXTEND:
10430 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10431 case ISD::STRICT_FSETCC:
10432 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10433 case ISD::SPONENTRY:
10434 return LowerSPONENTRY(Op, DAG);
10435 case ISD::FP_TO_BF16:
10436 return LowerFP_TO_BF16(Op, DAG);
10437 case ARMISD::WIN__DBZCHK: return SDValue();
10438 case ISD::UCMP:
10439 case ISD::SCMP:
10440 return LowerCMP(Op, DAG);
10441 case ISD::ABS:
10442 return LowerABS(Op, DAG);
10443 case ISD::STRICT_LROUND:
10444 case ISD::STRICT_LLROUND:
10445 case ISD::STRICT_LRINT:
10446 case ISD::STRICT_LLRINT: {
10447 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10448 Op.getOperand(1).getValueType() == MVT::bf16) &&
10449 "Expected custom lowering of rounding operations only for f16");
10450 SDLoc DL(Op);
10451 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10452 {Op.getOperand(0), Op.getOperand(1)});
10453 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10454 {Ext.getValue(1), Ext.getValue(0)});
10455 }
10456 }
10457}
10458
10460 SelectionDAG &DAG) {
10461 unsigned IntNo = N->getConstantOperandVal(0);
10462 unsigned Opc = 0;
10463 if (IntNo == Intrinsic::arm_smlald)
10464 Opc = ARMISD::SMLALD;
10465 else if (IntNo == Intrinsic::arm_smlaldx)
10466 Opc = ARMISD::SMLALDX;
10467 else if (IntNo == Intrinsic::arm_smlsld)
10468 Opc = ARMISD::SMLSLD;
10469 else if (IntNo == Intrinsic::arm_smlsldx)
10470 Opc = ARMISD::SMLSLDX;
10471 else
10472 return;
10473
10474 SDLoc dl(N);
10475 SDValue Lo, Hi;
10476 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10477
10478 SDValue LongMul = DAG.getNode(Opc, dl,
10479 DAG.getVTList(MVT::i32, MVT::i32),
10480 N->getOperand(1), N->getOperand(2),
10481 Lo, Hi);
10482 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10483 LongMul.getValue(0), LongMul.getValue(1)));
10484}
10485
10486/// ReplaceNodeResults - Replace the results of a node with an illegal result
10487/// type with new values built out of custom code.
10488void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10489 SmallVectorImpl<SDValue> &Results,
10490 SelectionDAG &DAG) const {
10491 SDValue Res;
10492 switch (N->getOpcode()) {
10493 default:
10494 llvm_unreachable("Don't know how to custom expand this!");
10495 case ISD::READ_REGISTER:
10496 Res = ExpandREAD_REGISTER(N, DAG);
10497 break;
10498 case ISD::BITCAST:
10499 Res = ExpandBITCAST(N, DAG, Subtarget);
10500 break;
10501 case ISD::SRL:
10502 case ISD::SRA:
10503 case ISD::SHL:
10504 Res = Expand64BitShift(N, DAG, Subtarget);
10505 break;
10506 case ISD::SREM:
10507 case ISD::UREM:
10508 Res = LowerREM(N, DAG);
10509 break;
10510 case ISD::SDIVREM:
10511 case ISD::UDIVREM:
10512 Res = LowerDivRem(SDValue(N, 0), DAG);
10513 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10514 Results.push_back(Res.getValue(0));
10515 Results.push_back(Res.getValue(1));
10516 return;
10517 case ISD::SADDSAT:
10518 case ISD::SSUBSAT:
10519 case ISD::UADDSAT:
10520 case ISD::USUBSAT:
10521 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10522 break;
10523 case ISD::READCYCLECOUNTER:
10524 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10525 return;
10526 case ISD::UDIV:
10527 case ISD::SDIV:
10528 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10529 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10530 Results);
10531 case ISD::ATOMIC_CMP_SWAP:
10532 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10533 return;
10534 case ISD::INTRINSIC_WO_CHAIN:
10535 return ReplaceLongIntrinsic(N, Results, DAG);
10536 case ISD::LOAD:
10537 LowerLOAD(N, Results, DAG);
10538 break;
10539 case ISD::TRUNCATE:
10540 Res = LowerTruncate(N, DAG, Subtarget);
10541 break;
10542 case ISD::SIGN_EXTEND:
10543 case ISD::ZERO_EXTEND:
10544 Res = LowerVectorExtend(N, DAG, Subtarget);
10545 break;
10546 case ISD::FP_TO_SINT_SAT:
10547 case ISD::FP_TO_UINT_SAT:
10548 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10549 break;
10550 }
10551 if (Res.getNode())
10552 Results.push_back(Res);
10553}
10554
10555//===----------------------------------------------------------------------===//
10556// ARM Scheduler Hooks
10557//===----------------------------------------------------------------------===//
10558
10559/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10560/// registers the function context.
10561void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10562 MachineBasicBlock *MBB,
10563 MachineBasicBlock *DispatchBB,
10564 int FI) const {
10565 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10566 "ROPI/RWPI not currently supported with SjLj");
10567 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10568 DebugLoc dl = MI.getDebugLoc();
10569 MachineFunction *MF = MBB->getParent();
10570 MachineRegisterInfo *MRI = &MF->getRegInfo();
10571 MachineConstantPool *MCP = MF->getConstantPool();
10572 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10573 const Function &F = MF->getFunction();
10574
10575 bool isThumb = Subtarget->isThumb();
10576 bool isThumb2 = Subtarget->isThumb2();
10577
10578 unsigned PCLabelId = AFI->createPICLabelUId();
10579 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10580 ARMConstantPoolValue *CPV =
10581 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10582 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10583
10584 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10585 : &ARM::GPRRegClass;
10586
10587 // Grab constant pool and fixed stack memory operands.
10588 MachineMemOperand *CPMMO =
10591
10592 MachineMemOperand *FIMMOSt =
10595
10596 // Load the address of the dispatch MBB into the jump buffer.
10597 if (isThumb2) {
10598 // Incoming value: jbuf
10599 // ldr.n r5, LCPI1_1
10600 // orr r5, r5, #1
10601 // add r5, pc
10602 // str r5, [$jbuf, #+4] ; &jbuf[1]
10603 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10604 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10606 .addMemOperand(CPMMO)
10608 // Set the low bit because of thumb mode.
10609 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10610 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10611 .addReg(NewVReg1, RegState::Kill)
10612 .addImm(0x01)
10614 .add(condCodeOp());
10615 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10616 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10617 .addReg(NewVReg2, RegState::Kill)
10618 .addImm(PCLabelId);
10619 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10620 .addReg(NewVReg3, RegState::Kill)
10621 .addFrameIndex(FI)
10622 .addImm(36) // &jbuf[1] :: pc
10623 .addMemOperand(FIMMOSt)
10625 } else if (isThumb) {
10626 // Incoming value: jbuf
10627 // ldr.n r1, LCPI1_4
10628 // add r1, pc
10629 // mov r2, #1
10630 // orrs r1, r2
10631 // add r2, $jbuf, #+4 ; &jbuf[1]
10632 // str r1, [r2]
10633 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10634 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10636 .addMemOperand(CPMMO)
10638 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10639 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10640 .addReg(NewVReg1, RegState::Kill)
10641 .addImm(PCLabelId);
10642 // Set the low bit because of thumb mode.
10643 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10644 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10645 .addReg(ARM::CPSR, RegState::Define)
10646 .addImm(1)
10648 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10649 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10650 .addReg(ARM::CPSR, RegState::Define)
10651 .addReg(NewVReg2, RegState::Kill)
10652 .addReg(NewVReg3, RegState::Kill)
10654 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10655 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10656 .addFrameIndex(FI)
10657 .addImm(36); // &jbuf[1] :: pc
10658 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10659 .addReg(NewVReg4, RegState::Kill)
10660 .addReg(NewVReg5, RegState::Kill)
10661 .addImm(0)
10662 .addMemOperand(FIMMOSt)
10664 } else {
10665 // Incoming value: jbuf
10666 // ldr r1, LCPI1_1
10667 // add r1, pc, r1
10668 // str r1, [$jbuf, #+4] ; &jbuf[1]
10669 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10670 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10672 .addImm(0)
10673 .addMemOperand(CPMMO)
10675 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10676 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10677 .addReg(NewVReg1, RegState::Kill)
10678 .addImm(PCLabelId)
10680 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10681 .addReg(NewVReg2, RegState::Kill)
10682 .addFrameIndex(FI)
10683 .addImm(36) // &jbuf[1] :: pc
10684 .addMemOperand(FIMMOSt)
10686 }
10687}
10688
10689void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10690 MachineBasicBlock *MBB) const {
10691 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10692 DebugLoc dl = MI.getDebugLoc();
10693 MachineFunction *MF = MBB->getParent();
10694 MachineRegisterInfo *MRI = &MF->getRegInfo();
10695 MachineFrameInfo &MFI = MF->getFrameInfo();
10696 int FI = MFI.getFunctionContextIndex();
10697
10698 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10699 : &ARM::GPRnopcRegClass;
10700
10701 // Get a mapping of the call site numbers to all of the landing pads they're
10702 // associated with.
10703 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10704 unsigned MaxCSNum = 0;
10705 for (MachineBasicBlock &BB : *MF) {
10706 if (!BB.isEHPad())
10707 continue;
10708
10709 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10710 // pad.
10711 for (MachineInstr &II : BB) {
10712 if (!II.isEHLabel())
10713 continue;
10714
10715 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10716 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10717
10718 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10719 for (unsigned Idx : CallSiteIdxs) {
10720 CallSiteNumToLPad[Idx].push_back(&BB);
10721 MaxCSNum = std::max(MaxCSNum, Idx);
10722 }
10723 break;
10724 }
10725 }
10726
10727 // Get an ordered list of the machine basic blocks for the jump table.
10728 std::vector<MachineBasicBlock*> LPadList;
10729 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10730 LPadList.reserve(CallSiteNumToLPad.size());
10731 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10732 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10733 for (MachineBasicBlock *MBB : MBBList) {
10734 LPadList.push_back(MBB);
10735 InvokeBBs.insert_range(MBB->predecessors());
10736 }
10737 }
10738
10739 assert(!LPadList.empty() &&
10740 "No landing pad destinations for the dispatch jump table!");
10741
10742 // Create the jump table and associated information.
10743 MachineJumpTableInfo *JTI =
10744 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10745 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10746
10747 // Create the MBBs for the dispatch code.
10748
10749 // Shove the dispatch's address into the return slot in the function context.
10750 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10751 DispatchBB->setIsEHPad();
10752
10753 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10754
10755 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10756 DispatchBB->addSuccessor(TrapBB);
10757
10758 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10759 DispatchBB->addSuccessor(DispContBB);
10760
10761 // Insert the MBBs.
10762 MF->insert(MF->end(), DispatchBB);
10763 MF->insert(MF->end(), DispContBB);
10764 MF->insert(MF->end(), TrapBB);
10765
10766 // Insert code into the entry block that creates and registers the function
10767 // context.
10768 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10769
10770 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10773
10774 MachineInstrBuilder MIB;
10775 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10776
10777 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10778 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10779
10780 // Add a register mask with no preserved registers. This results in all
10781 // registers being marked as clobbered. This can't work if the dispatch block
10782 // is in a Thumb1 function and is linked with ARM code which uses the FP
10783 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10785
10786 bool IsPositionIndependent = isPositionIndependent();
10787 unsigned NumLPads = LPadList.size();
10788 if (Subtarget->isThumb2()) {
10789 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10790 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10791 .addFrameIndex(FI)
10792 .addImm(4)
10793 .addMemOperand(FIMMOLd)
10795
10796 if (NumLPads < 256) {
10797 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10798 .addReg(NewVReg1)
10799 .addImm(LPadList.size())
10801 } else {
10802 Register VReg1 = MRI->createVirtualRegister(TRC);
10803 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10804 .addImm(NumLPads & 0xFFFF)
10806
10807 unsigned VReg2 = VReg1;
10808 if ((NumLPads & 0xFFFF0000) != 0) {
10809 VReg2 = MRI->createVirtualRegister(TRC);
10810 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10811 .addReg(VReg1)
10812 .addImm(NumLPads >> 16)
10814 }
10815
10816 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10817 .addReg(NewVReg1)
10818 .addReg(VReg2)
10820 }
10821
10822 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10823 .addMBB(TrapBB)
10825 .addReg(ARM::CPSR);
10826
10827 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10828 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10829 .addJumpTableIndex(MJTI)
10831
10832 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10833 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10834 .addReg(NewVReg3, RegState::Kill)
10835 .addReg(NewVReg1)
10838 .add(condCodeOp());
10839
10840 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10841 .addReg(NewVReg4, RegState::Kill)
10842 .addReg(NewVReg1)
10843 .addJumpTableIndex(MJTI);
10844 } else if (Subtarget->isThumb()) {
10845 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10846 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10847 .addFrameIndex(FI)
10848 .addImm(1)
10849 .addMemOperand(FIMMOLd)
10851
10852 if (NumLPads < 256) {
10853 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10854 .addReg(NewVReg1)
10855 .addImm(NumLPads)
10857 } else {
10858 MachineConstantPool *ConstantPool = MF->getConstantPool();
10859 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10860 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10861
10862 // MachineConstantPool wants an explicit alignment.
10863 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10864 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10865
10866 Register VReg1 = MRI->createVirtualRegister(TRC);
10867 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10868 .addReg(VReg1, RegState::Define)
10871 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10872 .addReg(NewVReg1)
10873 .addReg(VReg1)
10875 }
10876
10877 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10878 .addMBB(TrapBB)
10880 .addReg(ARM::CPSR);
10881
10882 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10883 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10884 .addReg(ARM::CPSR, RegState::Define)
10885 .addReg(NewVReg1)
10886 .addImm(2)
10888
10889 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10890 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10891 .addJumpTableIndex(MJTI)
10893
10894 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10895 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10896 .addReg(ARM::CPSR, RegState::Define)
10897 .addReg(NewVReg2, RegState::Kill)
10898 .addReg(NewVReg3)
10900
10901 MachineMemOperand *JTMMOLd =
10902 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10904
10905 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10906 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10907 .addReg(NewVReg4, RegState::Kill)
10908 .addImm(0)
10909 .addMemOperand(JTMMOLd)
10911
10912 unsigned NewVReg6 = NewVReg5;
10913 if (IsPositionIndependent) {
10914 NewVReg6 = MRI->createVirtualRegister(TRC);
10915 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10916 .addReg(ARM::CPSR, RegState::Define)
10917 .addReg(NewVReg5, RegState::Kill)
10918 .addReg(NewVReg3)
10920 }
10921
10922 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10923 .addReg(NewVReg6, RegState::Kill)
10924 .addJumpTableIndex(MJTI);
10925 } else {
10926 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10927 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10928 .addFrameIndex(FI)
10929 .addImm(4)
10930 .addMemOperand(FIMMOLd)
10932
10933 if (NumLPads < 256) {
10934 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10935 .addReg(NewVReg1)
10936 .addImm(NumLPads)
10938 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
10939 Register VReg1 = MRI->createVirtualRegister(TRC);
10940 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
10941 .addImm(NumLPads & 0xFFFF)
10943
10944 unsigned VReg2 = VReg1;
10945 if ((NumLPads & 0xFFFF0000) != 0) {
10946 VReg2 = MRI->createVirtualRegister(TRC);
10947 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
10948 .addReg(VReg1)
10949 .addImm(NumLPads >> 16)
10951 }
10952
10953 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10954 .addReg(NewVReg1)
10955 .addReg(VReg2)
10957 } else {
10958 MachineConstantPool *ConstantPool = MF->getConstantPool();
10959 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10960 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10961
10962 // MachineConstantPool wants an explicit alignment.
10963 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10964 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10965
10966 Register VReg1 = MRI->createVirtualRegister(TRC);
10967 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
10968 .addReg(VReg1, RegState::Define)
10970 .addImm(0)
10972 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
10973 .addReg(NewVReg1)
10974 .addReg(VReg1, RegState::Kill)
10976 }
10977
10978 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
10979 .addMBB(TrapBB)
10981 .addReg(ARM::CPSR);
10982
10983 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10984 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
10985 .addReg(NewVReg1)
10988 .add(condCodeOp());
10989 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10990 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
10991 .addJumpTableIndex(MJTI)
10993
10994 MachineMemOperand *JTMMOLd =
10995 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10997 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10998 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
10999 .addReg(NewVReg3, RegState::Kill)
11000 .addReg(NewVReg4)
11001 .addImm(0)
11002 .addMemOperand(JTMMOLd)
11004
11005 if (IsPositionIndependent) {
11006 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11007 .addReg(NewVReg5, RegState::Kill)
11008 .addReg(NewVReg4)
11009 .addJumpTableIndex(MJTI);
11010 } else {
11011 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11012 .addReg(NewVReg5, RegState::Kill)
11013 .addJumpTableIndex(MJTI);
11014 }
11015 }
11016
11017 // Add the jump table entries as successors to the MBB.
11018 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11019 for (MachineBasicBlock *CurMBB : LPadList) {
11020 if (SeenMBBs.insert(CurMBB).second)
11021 DispContBB->addSuccessor(CurMBB);
11022 }
11023
11024 // N.B. the order the invoke BBs are processed in doesn't matter here.
11025 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11026 SmallVector<MachineBasicBlock*, 64> MBBLPads;
11027 for (MachineBasicBlock *BB : InvokeBBs) {
11028
11029 // Remove the landing pad successor from the invoke block and replace it
11030 // with the new dispatch block.
11031 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11032 while (!Successors.empty()) {
11033 MachineBasicBlock *SMBB = Successors.pop_back_val();
11034 if (SMBB->isEHPad()) {
11035 BB->removeSuccessor(SMBB);
11036 MBBLPads.push_back(SMBB);
11037 }
11038 }
11039
11040 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11041 BB->normalizeSuccProbs();
11042
11043 // Find the invoke call and mark all of the callee-saved registers as
11044 // 'implicit defined' so that they're spilled. This prevents code from
11045 // moving instructions to before the EH block, where they will never be
11046 // executed.
11047 for (MachineBasicBlock::reverse_iterator
11048 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11049 if (!II->isCall()) continue;
11050
11051 DenseSet<unsigned> DefRegs;
11052 for (MachineInstr::mop_iterator
11053 OI = II->operands_begin(), OE = II->operands_end();
11054 OI != OE; ++OI) {
11055 if (!OI->isReg()) continue;
11056 DefRegs.insert(OI->getReg());
11057 }
11058
11059 MachineInstrBuilder MIB(*MF, &*II);
11060
11061 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11062 unsigned Reg = SavedRegs[i];
11063 if (Subtarget->isThumb2() &&
11064 !ARM::tGPRRegClass.contains(Reg) &&
11065 !ARM::hGPRRegClass.contains(Reg))
11066 continue;
11067 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11068 continue;
11069 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11070 continue;
11071 if (!DefRegs.contains(Reg))
11072 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
11073 }
11074
11075 break;
11076 }
11077 }
11078
11079 // Mark all former landing pads as non-landing pads. The dispatch is the only
11080 // landing pad now.
11081 for (MachineBasicBlock *MBBLPad : MBBLPads)
11082 MBBLPad->setIsEHPad(false);
11083
11084 // The instruction is gone now.
11085 MI.eraseFromParent();
11086}
11087
11088static
11089MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11090 for (MachineBasicBlock *S : MBB->successors())
11091 if (S != Succ)
11092 return S;
11093 llvm_unreachable("Expecting a BB with two successors!");
11094}
11095
11096/// Return the load opcode for a given load size. If the load size is >= 8, a
11097/// NEON opcode will be returned.
11098static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11099 if (LdSize >= 8)
11100 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11101 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11102 if (IsThumb1)
11103 return LdSize == 4 ? ARM::tLDRi
11104 : LdSize == 2 ? ARM::tLDRHi
11105 : LdSize == 1 ? ARM::tLDRBi : 0;
11106 if (IsThumb2)
11107 return LdSize == 4 ? ARM::t2LDR_POST
11108 : LdSize == 2 ? ARM::t2LDRH_POST
11109 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11110 return LdSize == 4 ? ARM::LDR_POST_IMM
11111 : LdSize == 2 ? ARM::LDRH_POST
11112 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11113}
11114
11115/// Return the store opcode for a given store size. If the store size is >= 8,
11116/// a NEON opcode will be returned.
11117static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11118 if (StSize >= 8)
11119 return StSize == 16 ? ARM::VST1q32wb_fixed
11120 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11121 if (IsThumb1)
11122 return StSize == 4 ? ARM::tSTRi
11123 : StSize == 2 ? ARM::tSTRHi
11124 : StSize == 1 ? ARM::tSTRBi : 0;
11125 if (IsThumb2)
11126 return StSize == 4 ? ARM::t2STR_POST
11127 : StSize == 2 ? ARM::t2STRH_POST
11128 : StSize == 1 ? ARM::t2STRB_POST : 0;
11129 return StSize == 4 ? ARM::STR_POST_IMM
11130 : StSize == 2 ? ARM::STRH_POST
11131 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11132}
11133
11134/// Emit a post-increment load operation with given size. The instructions
11135/// will be added to BB at Pos.
11136static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11137 const TargetInstrInfo *TII, const DebugLoc &dl,
11138 unsigned LdSize, unsigned Data, unsigned AddrIn,
11139 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11140 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11141 assert(LdOpc != 0 && "Should have a load opcode");
11142 if (LdSize >= 8) {
11143 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11144 .addReg(AddrOut, RegState::Define)
11145 .addReg(AddrIn)
11146 .addImm(0)
11148 } else if (IsThumb1) {
11149 // load + update AddrIn
11150 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11151 .addReg(AddrIn)
11152 .addImm(0)
11154 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11155 .add(t1CondCodeOp())
11156 .addReg(AddrIn)
11157 .addImm(LdSize)
11159 } else if (IsThumb2) {
11160 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11161 .addReg(AddrOut, RegState::Define)
11162 .addReg(AddrIn)
11163 .addImm(LdSize)
11165 } else { // arm
11166 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11167 .addReg(AddrOut, RegState::Define)
11168 .addReg(AddrIn)
11169 .addReg(0)
11170 .addImm(LdSize)
11172 }
11173}
11174
11175/// Emit a post-increment store operation with given size. The instructions
11176/// will be added to BB at Pos.
11177static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11178 const TargetInstrInfo *TII, const DebugLoc &dl,
11179 unsigned StSize, unsigned Data, unsigned AddrIn,
11180 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11181 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11182 assert(StOpc != 0 && "Should have a store opcode");
11183 if (StSize >= 8) {
11184 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11185 .addReg(AddrIn)
11186 .addImm(0)
11187 .addReg(Data)
11189 } else if (IsThumb1) {
11190 // store + update AddrIn
11191 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11192 .addReg(Data)
11193 .addReg(AddrIn)
11194 .addImm(0)
11196 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11197 .add(t1CondCodeOp())
11198 .addReg(AddrIn)
11199 .addImm(StSize)
11201 } else if (IsThumb2) {
11202 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11203 .addReg(Data)
11204 .addReg(AddrIn)
11205 .addImm(StSize)
11207 } else { // arm
11208 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11209 .addReg(Data)
11210 .addReg(AddrIn)
11211 .addReg(0)
11212 .addImm(StSize)
11214 }
11215}
11216
11217MachineBasicBlock *
11218ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11219 MachineBasicBlock *BB) const {
11220 // This pseudo instruction has 4 operands: dst, src, size, alignment.
11221 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold();
11222 // otherwise, we generate unrolled scalar copies.
11223 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11224 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11225 MachineFunction::iterator It = ++BB->getIterator();
11226
11227 Register dest = MI.getOperand(0).getReg();
11228 Register src = MI.getOperand(1).getReg();
11229 unsigned SizeVal = MI.getOperand(2).getImm();
11230 unsigned Alignment = MI.getOperand(3).getImm();
11231 DebugLoc dl = MI.getDebugLoc();
11232
11233 MachineFunction *MF = BB->getParent();
11234 MachineRegisterInfo &MRI = MF->getRegInfo();
11235 unsigned UnitSize = 0;
11236 const TargetRegisterClass *TRC = nullptr;
11237 const TargetRegisterClass *VecTRC = nullptr;
11238
11239 bool IsThumb1 = Subtarget->isThumb1Only();
11240 bool IsThumb2 = Subtarget->isThumb2();
11241 bool IsThumb = Subtarget->isThumb();
11242
11243 if (Alignment & 1) {
11244 UnitSize = 1;
11245 } else if (Alignment & 2) {
11246 UnitSize = 2;
11247 } else {
11248 // Check whether we can use NEON instructions.
11249 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11250 Subtarget->hasNEON()) {
11251 if ((Alignment % 16 == 0) && SizeVal >= 16)
11252 UnitSize = 16;
11253 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11254 UnitSize = 8;
11255 }
11256 // Can't use NEON instructions.
11257 if (UnitSize == 0)
11258 UnitSize = 4;
11259 }
11260
11261 // Select the correct opcode and register class for unit size load/store
11262 bool IsNeon = UnitSize >= 8;
11263 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11264 if (IsNeon)
11265 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11266 : UnitSize == 8 ? &ARM::DPRRegClass
11267 : nullptr;
11268
11269 unsigned BytesLeft = SizeVal % UnitSize;
11270 unsigned LoopSize = SizeVal - BytesLeft;
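  // For example, a 10-byte byval copy with 4-byte alignment (too small/unaligned
  // for the NEON path) uses UnitSize 4, giving LoopSize 8 and BytesLeft 2: two
  // word-sized load/store pairs followed by two byte copies for the tail.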
11271
11272 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11273 // Use LDR and STR to copy.
11274 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11275 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11276 unsigned srcIn = src;
11277 unsigned destIn = dest;
11278 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11279 Register srcOut = MRI.createVirtualRegister(TRC);
11280 Register destOut = MRI.createVirtualRegister(TRC);
11281 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11282 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11283 IsThumb1, IsThumb2);
11284 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11285 IsThumb1, IsThumb2);
11286 srcIn = srcOut;
11287 destIn = destOut;
11288 }
11289
11290 // Handle the leftover bytes with LDRB and STRB.
11291 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11292 // [destOut] = STRB_POST(scratch, destIn, 1)
11293 for (unsigned i = 0; i < BytesLeft; i++) {
11294 Register srcOut = MRI.createVirtualRegister(TRC);
11295 Register destOut = MRI.createVirtualRegister(TRC);
11296 Register scratch = MRI.createVirtualRegister(TRC);
11297 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11298 IsThumb1, IsThumb2);
11299 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11300 IsThumb1, IsThumb2);
11301 srcIn = srcOut;
11302 destIn = destOut;
11303 }
11304 MI.eraseFromParent(); // The instruction is gone now.
11305 return BB;
11306 }
11307
11308 // Expand the pseudo op to a loop.
11309 // thisMBB:
11310 // ...
11311 // movw varEnd, # --> with thumb2
11312 // movt varEnd, #
11313 // ldrcp varEnd, idx --> without thumb2
11314 // fallthrough --> loopMBB
11315 // loopMBB:
11316 // PHI varPhi, varEnd, varLoop
11317 // PHI srcPhi, src, srcLoop
11318 // PHI destPhi, dst, destLoop
11319 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11320 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11321 // subs varLoop, varPhi, #UnitSize
11322 // bne loopMBB
11323 // fallthrough --> exitMBB
11324 // exitMBB:
11325 // epilogue to handle left-over bytes
11326 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11327 // [destOut] = STRB_POST(scratch, destLoop, 1)
11328 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11329 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11330 MF->insert(It, loopMBB);
11331 MF->insert(It, exitMBB);
11332
11333 // Set the call frame size on entry to the new basic blocks.
11334 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11335 loopMBB->setCallFrameSize(CallFrameSize);
11336 exitMBB->setCallFrameSize(CallFrameSize);
11337
11338 // Transfer the remainder of BB and its successor edges to exitMBB.
11339 exitMBB->splice(exitMBB->begin(), BB,
11340 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11342
11343 // Load an immediate to varEnd.
11344 Register varEnd = MRI.createVirtualRegister(TRC);
11345 if (Subtarget->useMovt()) {
11346 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11347 varEnd)
11348 .addImm(LoopSize);
11349 } else if (Subtarget->genExecuteOnly()) {
11350 assert(IsThumb && "Non-thumb expected to have used movt");
11351 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11352 } else {
11353 MachineConstantPool *ConstantPool = MF->getConstantPool();
11355 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11356
11357 // MachineConstantPool wants an explicit alignment.
11358 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11359 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11360 MachineMemOperand *CPMMO =
11363
11364 if (IsThumb)
11365 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11366 .addReg(varEnd, RegState::Define)
11369 .addMemOperand(CPMMO);
11370 else
11371 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11372 .addReg(varEnd, RegState::Define)
11374 .addImm(0)
11376 .addMemOperand(CPMMO);
11377 }
11378 BB->addSuccessor(loopMBB);
11379
11380 // Generate the loop body:
11381 // varPhi = PHI(varLoop, varEnd)
11382 // srcPhi = PHI(srcLoop, src)
11383 // destPhi = PHI(destLoop, dst)
11384 MachineBasicBlock *entryBB = BB;
11385 BB = loopMBB;
11386 Register varLoop = MRI.createVirtualRegister(TRC);
11387 Register varPhi = MRI.createVirtualRegister(TRC);
11388 Register srcLoop = MRI.createVirtualRegister(TRC);
11389 Register srcPhi = MRI.createVirtualRegister(TRC);
11390 Register destLoop = MRI.createVirtualRegister(TRC);
11391 Register destPhi = MRI.createVirtualRegister(TRC);
11392
11393 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11394 .addReg(varLoop).addMBB(loopMBB)
11395 .addReg(varEnd).addMBB(entryBB);
11396 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11397 .addReg(srcLoop).addMBB(loopMBB)
11398 .addReg(src).addMBB(entryBB);
11399 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11400 .addReg(destLoop).addMBB(loopMBB)
11401 .addReg(dest).addMBB(entryBB);
11402
11403 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
 11404 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11405 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11406 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11407 IsThumb1, IsThumb2);
11408 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11409 IsThumb1, IsThumb2);
11410
11411 // Decrement loop variable by UnitSize.
11412 if (IsThumb1) {
11413 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11414 .add(t1CondCodeOp())
11415 .addReg(varPhi)
11416 .addImm(UnitSize)
11418 } else {
11419 MachineInstrBuilder MIB =
11420 BuildMI(*BB, BB->end(), dl,
11421 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11422 MIB.addReg(varPhi)
11423 .addImm(UnitSize)
11425 .add(condCodeOp());
11426 MIB->getOperand(5).setReg(ARM::CPSR);
11427 MIB->getOperand(5).setIsDef(true);
11428 }
11429 BuildMI(*BB, BB->end(), dl,
11430 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11431 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11432
11433 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11434 BB->addSuccessor(loopMBB);
11435 BB->addSuccessor(exitMBB);
11436
11437 // Add epilogue to handle BytesLeft.
11438 BB = exitMBB;
11439 auto StartOfExit = exitMBB->begin();
11440
11441 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11442 // [destOut] = STRB_POST(scratch, destLoop, 1)
11443 unsigned srcIn = srcLoop;
11444 unsigned destIn = destLoop;
11445 for (unsigned i = 0; i < BytesLeft; i++) {
11446 Register srcOut = MRI.createVirtualRegister(TRC);
11447 Register destOut = MRI.createVirtualRegister(TRC);
11448 Register scratch = MRI.createVirtualRegister(TRC);
11449 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11450 IsThumb1, IsThumb2);
11451 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11452 IsThumb1, IsThumb2);
11453 srcIn = srcOut;
11454 destIn = destOut;
11455 }
11456
11457 MI.eraseFromParent(); // The instruction is gone now.
11458 return BB;
11459}
11460
11462ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11463 MachineBasicBlock *MBB) const {
11464 const TargetMachine &TM = getTargetMachine();
11465 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11466 DebugLoc DL = MI.getDebugLoc();
11467
11468 assert(Subtarget->isTargetWindows() &&
11469 "__chkstk is only supported on Windows");
11470 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11471
11472 // __chkstk takes the number of words to allocate on the stack in R4, and
 11473 // returns the stack adjustment, in bytes, in R4. This will not
 11474 // clobber any other registers (other than the obvious lr).
11475 //
11476 // Although, technically, IP should be considered a register which may be
11477 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11478 // thumb-2 environment, so there is no interworking required. As a result, we
11479 // do not expect a veneer to be emitted by the linker, clobbering IP.
11480 //
11481 // Each module receives its own copy of __chkstk, so no import thunk is
11482 // required, again, ensuring that IP is not clobbered.
11483 //
11484 // Finally, although some linkers may theoretically provide a trampoline for
11485 // out of range calls (which is quite common due to a 32M range limitation of
11486 // branches for Thumb), we can generate the long-call version via
11487 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11488 // IP.
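 //
 // The resulting call sequence is therefore roughly, for the small, medium and
 // kernel code models:
 //     bl    __chkstk
 //     sub.w sp, sp, r4
 // while the large code model loads the address of __chkstk into a register
 // with t2MOVi32imm (movw/movt) and calls through that register instead.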
11489
11490 switch (TM.getCodeModel()) {
11491 case CodeModel::Tiny:
11492 llvm_unreachable("Tiny code model not available on ARM.");
11493 case CodeModel::Small:
11494 case CodeModel::Medium:
11495 case CodeModel::Kernel:
11496 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11498 .addExternalSymbol("__chkstk")
11501 .addReg(ARM::R12,
11503 .addReg(ARM::CPSR,
11505 break;
11506 case CodeModel::Large: {
11507 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11508 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11509
11510 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11511 .addExternalSymbol("__chkstk");
11517 .addReg(ARM::R12,
11519 .addReg(ARM::CPSR,
11521 break;
11522 }
11523 }
11524
11525 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11526 .addReg(ARM::SP, RegState::Kill)
11527 .addReg(ARM::R4, RegState::Kill)
11530 .add(condCodeOp());
11531
11532 MI.eraseFromParent();
11533 return MBB;
11534}
11535
11537ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11538 MachineBasicBlock *MBB) const {
11539 DebugLoc DL = MI.getDebugLoc();
11540 MachineFunction *MF = MBB->getParent();
11541 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11542
11543 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11544 MF->insert(++MBB->getIterator(), ContBB);
11545 ContBB->splice(ContBB->begin(), MBB,
11546 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11548 MBB->addSuccessor(ContBB);
11549
11550 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11551 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11552 MF->push_back(TrapBB);
11553 MBB->addSuccessor(TrapBB);
11554
11555 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11556 .addReg(MI.getOperand(0).getReg())
11557 .addImm(0)
11559 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11560 .addMBB(TrapBB)
11562 .addReg(ARM::CPSR);
11563
11564 MI.eraseFromParent();
11565 return ContBB;
11566}
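// For WIN__DBZCHK the emitted check is essentially:
//     cmp <divisor>, #0
//     beq <TrapBB>        ; TrapBB holds the Windows __brkdiv0 trap
// with execution otherwise continuing in ContBB, which now holds the rest of
// the original block.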
11567
11568// The CPSR operand of SelectItr might be missing a kill marker
11569// because there were multiple uses of CPSR, and ISel didn't know
11570// which to mark. Figure out whether SelectItr should have had a
11571// kill marker, and set it if it should. Returns the correct kill
11572// marker value.
11575 const TargetRegisterInfo* TRI) {
11576 // Scan forward through BB for a use/def of CPSR.
11577 MachineBasicBlock::iterator miI(std::next(SelectItr));
11578 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11579 const MachineInstr& mi = *miI;
11580 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11581 return false;
11582 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11583 break; // Should have kill-flag - update below.
11584 }
11585
11586 // If we hit the end of the block, check whether CPSR is live into a
11587 // successor.
11588 if (miI == BB->end()) {
11589 for (MachineBasicBlock *Succ : BB->successors())
11590 if (Succ->isLiveIn(ARM::CPSR))
11591 return false;
11592 }
11593
11594 // We found a def, or hit the end of the basic block and CPSR wasn't live
11595 // out. SelectMI should have a kill flag on CPSR.
11596 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11597 return true;
11598}
11599
11600/// Adds logic in loop entry MBB to calculate loop iteration count and adds
 11601 /// t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11603 MachineBasicBlock *TpLoopBody,
11604 MachineBasicBlock *TpExit, Register OpSizeReg,
11605 const TargetInstrInfo *TII, DebugLoc Dl,
11607 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
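 // For example, an element count of 100 yields (100 + 15) >> 4 = 7 iterations;
 // the last iteration is tail-predicated to cover only the remaining 4 bytes.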
11608 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11609 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11610 .addUse(OpSizeReg)
11611 .addImm(15)
11613 .addReg(0);
11614
11615 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11616 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11617 .addUse(AddDestReg, RegState::Kill)
11618 .addImm(4)
11620 .addReg(0);
11621
11622 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11623 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11624 .addUse(LsrDestReg, RegState::Kill);
11625
11626 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11627 .addUse(TotalIterationsReg)
11628 .addMBB(TpExit);
11629
11630 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11631 .addMBB(TpLoopBody)
11633
11634 return TotalIterationsReg;
11635}
11636
11637/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11638/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11639/// loops.
11640static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11641 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11642 const TargetInstrInfo *TII, DebugLoc Dl,
11643 MachineRegisterInfo &MRI, Register OpSrcReg,
11644 Register OpDestReg, Register ElementCountReg,
11645 Register TotalIterationsReg, bool IsMemcpy) {
11646 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11647 // array, loop iteration counter, predication counter.
11648
11649 Register SrcPhiReg, CurrSrcReg;
11650 if (IsMemcpy) {
11651 // Current position in the src array
11652 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11653 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11654 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11655 .addUse(OpSrcReg)
11656 .addMBB(TpEntry)
11657 .addUse(CurrSrcReg)
11658 .addMBB(TpLoopBody);
11659 }
11660
11661 // Current position in the dest array
11662 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11663 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11664 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11665 .addUse(OpDestReg)
11666 .addMBB(TpEntry)
11667 .addUse(CurrDestReg)
11668 .addMBB(TpLoopBody);
11669
11670 // Current loop counter
11671 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11672 Register RemainingLoopIterationsReg =
11673 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11674 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11675 .addUse(TotalIterationsReg)
11676 .addMBB(TpEntry)
11677 .addUse(RemainingLoopIterationsReg)
11678 .addMBB(TpLoopBody);
11679
11680 // Predication counter
11681 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11682 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11683 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11684 .addUse(ElementCountReg)
11685 .addMBB(TpEntry)
11686 .addUse(RemainingElementsReg)
11687 .addMBB(TpLoopBody);
11688
11689 // Pass predication counter to VCTP
11690 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11691 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11692 .addUse(PredCounterPhiReg)
11694 .addReg(0)
11695 .addReg(0);
11696
11697 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11698 .addUse(PredCounterPhiReg)
11699 .addImm(16)
11701 .addReg(0);
11702
11703 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11704 Register SrcValueReg;
11705 if (IsMemcpy) {
11706 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11707 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11708 .addDef(CurrSrcReg)
11709 .addDef(SrcValueReg)
11710 .addReg(SrcPhiReg)
11711 .addImm(16)
11713 .addUse(VccrReg)
11714 .addReg(0);
11715 } else
11716 SrcValueReg = OpSrcReg;
11717
11718 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11719 .addDef(CurrDestReg)
11720 .addUse(SrcValueReg)
11721 .addReg(DestPhiReg)
11722 .addImm(16)
11724 .addUse(VccrReg)
11725 .addReg(0);
11726
11727 // Add the pseudoInstrs for decrementing the loop counter and marking the
11728 // end:t2DoLoopDec and t2DoLoopEnd
11729 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11730 .addUse(LoopCounterPhiReg)
11731 .addImm(1);
11732
11733 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11734 .addUse(RemainingLoopIterationsReg)
11735 .addMBB(TpLoopBody);
11736
11737 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11738 .addMBB(TpExit)
11740}
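// Taken together, genTPEntry and genTPLoopBody produce a loop body that, for a
// memcpy, looks roughly like this (virtual register names are illustrative):
//   %vpr             = MVE_VCTP8 %elemsPhi           ; lane predicate for the tail
//   %elemsNext       = t2SUBri %elemsPhi, 16
//   %srcNext, %vals  = MVE_VLDRBU8_post %srcPhi, 16  ; predicated on %vpr
//   %dstNext         = MVE_VSTRBU8_post %vals, %dstPhi, 16 ; predicated on %vpr
//   %countNext       = t2LoopDec %countPhi, 1
//                      t2LoopEnd %countNext, <loop body>
//                      t2B <exit>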
11741
11743 // KCFI is supported in all ARM/Thumb modes
11744 return true;
11745}
11746
11750 const TargetInstrInfo *TII) const {
11751 assert(MBBI->isCall() && MBBI->getCFIType() &&
11752 "Invalid call instruction for a KCFI check");
11753
11754 MachineOperand *TargetOp = nullptr;
11755 switch (MBBI->getOpcode()) {
11756 // ARM mode opcodes
11757 case ARM::BLX:
11758 case ARM::BLX_pred:
11759 case ARM::BLX_noip:
11760 case ARM::BLX_pred_noip:
11761 case ARM::BX_CALL:
11762 TargetOp = &MBBI->getOperand(0);
11763 break;
11764 case ARM::TCRETURNri:
11765 case ARM::TCRETURNrinotr12:
11766 case ARM::TAILJMPr:
11767 case ARM::TAILJMPr4:
11768 TargetOp = &MBBI->getOperand(0);
11769 break;
11770 // Thumb mode opcodes (Thumb1 and Thumb2)
11771 // Note: Most Thumb call instructions have predicate operands before the
 11772 // target register. Format: tBLXr pred, predreg, target_register, ...
11773 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11774 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11775 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11776 TargetOp = &MBBI->getOperand(2);
11777 break;
 11778 // Tail call instructions don't have predicates; the target is operand 0.
11779 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11780 TargetOp = &MBBI->getOperand(0);
11781 break;
11782 default:
11783 llvm_unreachable("Unexpected CFI call opcode");
11784 }
11785
11786 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
11787 TargetOp->setIsRenamable(false);
11788
11789 // Select the appropriate KCFI_CHECK variant based on the instruction set
11790 unsigned KCFICheckOpcode;
11791 if (Subtarget->isThumb()) {
11792 if (Subtarget->isThumb2()) {
11793 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11794 } else {
11795 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11796 }
11797 } else {
11798 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11799 }
11800
11801 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11802 .addReg(TargetOp->getReg())
11803 .addImm(MBBI->getCFIType())
11804 .getInstr();
11805}
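// The KCFI_CHECK pseudo built above only records the call-target register and
// the expected type hash (the call's CFI type); the actual compare-and-trap
// sequence is materialized later in the backend when the pseudo is expanded.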
11806
11809 MachineBasicBlock *BB) const {
11810 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11811 DebugLoc dl = MI.getDebugLoc();
11812 bool isThumb2 = Subtarget->isThumb2();
11813 switch (MI.getOpcode()) {
11814 default: {
11815 MI.print(errs());
11816 llvm_unreachable("Unexpected instr type to insert");
11817 }
11818
11819 // Thumb1 post-indexed loads are really just single-register LDMs.
11820 case ARM::tLDR_postidx: {
11821 MachineOperand Def(MI.getOperand(1));
11822 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11823 .add(Def) // Rn_wb
11824 .add(MI.getOperand(2)) // Rn
11825 .add(MI.getOperand(3)) // PredImm
11826 .add(MI.getOperand(4)) // PredReg
11827 .add(MI.getOperand(0)) // Rt
11828 .cloneMemRefs(MI);
11829 MI.eraseFromParent();
11830 return BB;
11831 }
11832
11833 case ARM::MVE_MEMCPYLOOPINST:
11834 case ARM::MVE_MEMSETLOOPINST: {
11835
11836 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11837 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
 11838 // the iteration count = ceil(size_in_bytes / 16) in the TP entry block and
11839 // adds the relevant instructions in the TP loop Body for generation of a
11840 // WLSTP loop.
11841
11842 // Below is relevant portion of the CFG after the transformation.
11843 // The Machine Basic Blocks are shown along with branch conditions (in
11844 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11845 // portion of the CFG and may not necessarily be the entry/exit of the
11846 // function.
11847
11848 // (Relevant) CFG after transformation:
 11849 //              TP entry MBB
 11850 //                   |
 11851 //          |-----------------|
 11852 //       (n <= 0)         (n > 0)
 11853 //          |                 |
 11854 //          |       TP loop Body MBB<--|
 11855 //          |                 |        |
 11856 //           \                |________|
 11857 //            \              /
 11858 //              TP exit MBB
11859
11860 MachineFunction *MF = BB->getParent();
11861 MachineFunctionProperties &Properties = MF->getProperties();
11863
11864 Register OpDestReg = MI.getOperand(0).getReg();
11865 Register OpSrcReg = MI.getOperand(1).getReg();
11866 Register OpSizeReg = MI.getOperand(2).getReg();
11867
11868 // Allocate the required MBBs and add to parent function.
11869 MachineBasicBlock *TpEntry = BB;
11870 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11871 MachineBasicBlock *TpExit;
11872
11873 MF->push_back(TpLoopBody);
11874
11875 // If any instructions are present in the current block after
11876 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11877 // move the instructions into the newly created exit block. If there are no
 11878 // instructions, add an explicit branch to the FallThrough block and then
11879 // split.
11880 //
11881 // The split is required for two reasons:
11882 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
11883 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11884 // need to be updated. splitAt() already handles this.
11885 TpExit = BB->splitAt(MI, false);
11886 if (TpExit == BB) {
11887 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11888 "block containing memcpy/memset Pseudo");
11889 TpExit = BB->getFallThrough();
11890 BuildMI(BB, dl, TII->get(ARM::t2B))
11891 .addMBB(TpExit)
11893 TpExit = BB->splitAt(MI, false);
11894 }
11895
11896 // Add logic for iteration count
11897 Register TotalIterationsReg =
11898 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11899
11900 // Add the vectorized (and predicated) loads/store instructions
11901 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11902 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11903 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11904
11905 // Required to avoid conflict with the MachineVerifier during testing.
11906 Properties.resetNoPHIs();
11907
11908 // Connect the blocks
11909 TpEntry->addSuccessor(TpLoopBody);
11910 TpLoopBody->addSuccessor(TpLoopBody);
11911 TpLoopBody->addSuccessor(TpExit);
11912
11913 // Reorder for a more natural layout
11914 TpLoopBody->moveAfter(TpEntry);
11915 TpExit->moveAfter(TpLoopBody);
11916
11917 // Finally, remove the memcpy Pseudo Instruction
11918 MI.eraseFromParent();
11919
11920 // Return the exit block as it may contain other instructions requiring a
11921 // custom inserter
11922 return TpExit;
11923 }
11924
 11925 // The Thumb2 pre-indexed stores have the same MI operands; they are just
 11926 // defined differently in the .td files than in the isel patterns, so
 11927 // they need pseudos.
11928 case ARM::t2STR_preidx:
11929 MI.setDesc(TII->get(ARM::t2STR_PRE));
11930 return BB;
11931 case ARM::t2STRB_preidx:
11932 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11933 return BB;
11934 case ARM::t2STRH_preidx:
11935 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11936 return BB;
11937
11938 case ARM::STRi_preidx:
11939 case ARM::STRBi_preidx: {
11940 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
11941 : ARM::STRB_PRE_IMM;
11942 // Decode the offset.
11943 unsigned Offset = MI.getOperand(4).getImm();
11944 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
11946 if (isSub)
11947 Offset = -Offset;
11948
11949 MachineMemOperand *MMO = *MI.memoperands_begin();
11950 BuildMI(*BB, MI, dl, TII->get(NewOpc))
11951 .add(MI.getOperand(0)) // Rn_wb
11952 .add(MI.getOperand(1)) // Rt
11953 .add(MI.getOperand(2)) // Rn
11954 .addImm(Offset) // offset (skip GPR==zero_reg)
11955 .add(MI.getOperand(5)) // pred
11956 .add(MI.getOperand(6))
11957 .addMemOperand(MMO);
11958 MI.eraseFromParent();
11959 return BB;
11960 }
11961 case ARM::STRr_preidx:
11962 case ARM::STRBr_preidx:
11963 case ARM::STRH_preidx: {
11964 unsigned NewOpc;
11965 switch (MI.getOpcode()) {
11966 default: llvm_unreachable("unexpected opcode!");
11967 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
11968 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
11969 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
11970 }
11971 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
11972 for (const MachineOperand &MO : MI.operands())
11973 MIB.add(MO);
11974 MI.eraseFromParent();
11975 return BB;
11976 }
11977
11978 case ARM::tMOVCCr_pseudo: {
11979 // To "insert" a SELECT_CC instruction, we actually have to insert the
11980 // diamond control-flow pattern. The incoming instruction knows the
11981 // destination vreg to set, the condition code register to branch on, the
11982 // true/false values to select between, and a branch opcode to use.
11983 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11985
11986 // thisMBB:
11987 // ...
11988 // TrueVal = ...
11989 // cmpTY ccX, r1, r2
11990 // bCC copy1MBB
11991 // fallthrough --> copy0MBB
11992 MachineBasicBlock *thisMBB = BB;
11993 MachineFunction *F = BB->getParent();
11994 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
11995 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
11996 F->insert(It, copy0MBB);
11997 F->insert(It, sinkMBB);
11998
11999 // Set the call frame size on entry to the new basic blocks.
12000 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12001 copy0MBB->setCallFrameSize(CallFrameSize);
12002 sinkMBB->setCallFrameSize(CallFrameSize);
12003
12004 // Check whether CPSR is live past the tMOVCCr_pseudo.
12005 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12006 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12007 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12008 copy0MBB->addLiveIn(ARM::CPSR);
12009 sinkMBB->addLiveIn(ARM::CPSR);
12010 }
12011
12012 // Transfer the remainder of BB and its successor edges to sinkMBB.
12013 sinkMBB->splice(sinkMBB->begin(), BB,
12014 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12016
12017 BB->addSuccessor(copy0MBB);
12018 BB->addSuccessor(sinkMBB);
12019
12020 BuildMI(BB, dl, TII->get(ARM::tBcc))
12021 .addMBB(sinkMBB)
12022 .addImm(MI.getOperand(3).getImm())
12023 .addReg(MI.getOperand(4).getReg());
12024
12025 // copy0MBB:
12026 // %FalseValue = ...
12027 // # fallthrough to sinkMBB
12028 BB = copy0MBB;
12029
12030 // Update machine-CFG edges
12031 BB->addSuccessor(sinkMBB);
12032
12033 // sinkMBB:
12034 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12035 // ...
12036 BB = sinkMBB;
12037 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12038 .addReg(MI.getOperand(1).getReg())
12039 .addMBB(copy0MBB)
12040 .addReg(MI.getOperand(2).getReg())
12041 .addMBB(thisMBB);
12042
12043 MI.eraseFromParent(); // The pseudo instruction is gone now.
12044 return BB;
12045 }
12046
12047 case ARM::BCCi64:
12048 case ARM::BCCZi64: {
12049 // If there is an unconditional branch to the other successor, remove it.
12050 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12051
12052 // Compare both parts that make up the double comparison separately for
12053 // equality.
12054 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12055
12056 Register LHS1 = MI.getOperand(1).getReg();
12057 Register LHS2 = MI.getOperand(2).getReg();
12058 if (RHSisZero) {
12059 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12060 .addReg(LHS1)
12061 .addImm(0)
12063 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12064 .addReg(LHS2).addImm(0)
12065 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12066 } else {
12067 Register RHS1 = MI.getOperand(3).getReg();
12068 Register RHS2 = MI.getOperand(4).getReg();
12069 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12070 .addReg(LHS1)
12071 .addReg(RHS1)
12073 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12074 .addReg(LHS2).addReg(RHS2)
12075 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12076 }
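 // Note that the second compare is predicated on EQ, so after this pair
 // CPSR reads EQ only if both 32-bit halves compared equal.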
12077
12078 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12079 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12080 if (MI.getOperand(0).getImm() == ARMCC::NE)
12081 std::swap(destMBB, exitMBB);
12082
12083 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12084 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12085 if (isThumb2)
12086 BuildMI(BB, dl, TII->get(ARM::t2B))
12087 .addMBB(exitMBB)
12089 else
12090 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12091
12092 MI.eraseFromParent(); // The pseudo instruction is gone now.
12093 return BB;
12094 }
12095
12096 case ARM::Int_eh_sjlj_setjmp:
12097 case ARM::Int_eh_sjlj_setjmp_nofp:
12098 case ARM::tInt_eh_sjlj_setjmp:
12099 case ARM::t2Int_eh_sjlj_setjmp:
12100 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12101 return BB;
12102
12103 case ARM::Int_eh_sjlj_setup_dispatch:
12104 EmitSjLjDispatchBlock(MI, BB);
12105 return BB;
12106 case ARM::COPY_STRUCT_BYVAL_I32:
12107 ++NumLoopByVals;
12108 return EmitStructByval(MI, BB);
12109 case ARM::WIN__CHKSTK:
12110 return EmitLowered__chkstk(MI, BB);
12111 case ARM::WIN__DBZCHK:
12112 return EmitLowered__dbzchk(MI, BB);
12113 }
12114}
12115
12116/// Attaches vregs to MEMCPY that it will use as scratch registers
12117/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12118/// instead of as a custom inserter because we need the use list from the SDNode.
12119static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12120 MachineInstr &MI, const SDNode *Node) {
12121 bool isThumb1 = Subtarget->isThumb1Only();
12122
12123 MachineFunction *MF = MI.getParent()->getParent();
12125 MachineInstrBuilder MIB(*MF, MI);
12126
12127 // If the new dst/src is unused mark it as dead.
12128 if (!Node->hasAnyUseOfValue(0)) {
12129 MI.getOperand(0).setIsDead(true);
12130 }
12131 if (!Node->hasAnyUseOfValue(1)) {
12132 MI.getOperand(1).setIsDead(true);
12133 }
12134
12135 // The MEMCPY both defines and kills the scratch registers.
12136 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12137 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12138 : &ARM::GPRRegClass);
12140 }
12141}
12142
12144 SDNode *Node) const {
12145 if (MI.getOpcode() == ARM::MEMCPY) {
12146 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12147 return;
12148 }
12149
12150 const MCInstrDesc *MCID = &MI.getDesc();
12151 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12152 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12153 // operand is still set to noreg. If needed, set the optional operand's
12154 // register to CPSR, and remove the redundant implicit def.
12155 //
12156 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12157
12158 // Rename pseudo opcodes.
12159 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12160 unsigned ccOutIdx;
12161 if (NewOpc) {
12162 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12163 MCID = &TII->get(NewOpc);
12164
12165 assert(MCID->getNumOperands() ==
12166 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12167 && "converted opcode should be the same except for cc_out"
12168 " (and, on Thumb1, pred)");
12169
12170 MI.setDesc(*MCID);
12171
12172 // Add the optional cc_out operand
12173 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12174
12175 // On Thumb1, move all input operands to the end, then add the predicate
12176 if (Subtarget->isThumb1Only()) {
12177 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12178 MI.addOperand(MI.getOperand(1));
12179 MI.removeOperand(1);
12180 }
12181
12182 // Restore the ties
12183 for (unsigned i = MI.getNumOperands(); i--;) {
12184 const MachineOperand& op = MI.getOperand(i);
12185 if (op.isReg() && op.isUse()) {
12186 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12187 if (DefIdx != -1)
12188 MI.tieOperands(DefIdx, i);
12189 }
12190 }
12191
12193 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12194 ccOutIdx = 1;
12195 } else
12196 ccOutIdx = MCID->getNumOperands() - 1;
12197 } else
12198 ccOutIdx = MCID->getNumOperands() - 1;
12199
12200 // Any ARM instruction that sets the 's' bit should specify an optional
12201 // "cc_out" operand in the last operand position.
12202 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12203 assert(!NewOpc && "Optional cc_out operand required");
12204 return;
12205 }
12206 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12207 // since we already have an optional CPSR def.
12208 bool definesCPSR = false;
12209 bool deadCPSR = false;
12210 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12211 ++i) {
12212 const MachineOperand &MO = MI.getOperand(i);
12213 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12214 definesCPSR = true;
12215 if (MO.isDead())
12216 deadCPSR = true;
12217 MI.removeOperand(i);
12218 break;
12219 }
12220 }
12221 if (!definesCPSR) {
12222 assert(!NewOpc && "Optional cc_out operand required");
12223 return;
12224 }
12225 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12226 if (deadCPSR) {
12227 assert(!MI.getOperand(ccOutIdx).getReg() &&
12228 "expect uninitialized optional cc_out operand");
12229 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12230 if (!Subtarget->isThumb1Only())
12231 return;
12232 }
12233
12234 // If this instruction was defined with an optional CPSR def and its dag node
12235 // had a live implicit CPSR def, then activate the optional CPSR def.
12236 MachineOperand &MO = MI.getOperand(ccOutIdx);
12237 MO.setReg(ARM::CPSR);
12238 MO.setIsDef(true);
12239}
12240
12241//===----------------------------------------------------------------------===//
12242// ARM Optimization Hooks
12243//===----------------------------------------------------------------------===//
12244
12245// Helper function that checks if N is a null or all ones constant.
12246static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
 12247 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
 12248}
12249
12250// Return true if N is conditionally 0 or all ones.
12251// Detects these expressions where cc is an i1 value:
12252//
12253// (select cc 0, y) [AllOnes=0]
12254// (select cc y, 0) [AllOnes=0]
12255// (zext cc) [AllOnes=0]
12256// (sext cc) [AllOnes=0/1]
12257// (select cc -1, y) [AllOnes=1]
12258// (select cc y, -1) [AllOnes=1]
12259//
 12260 // Invert is set when N is the null/all-ones constant for the CC == false case.
12261// OtherOp is set to the alternative value of N.
12263 SDValue &CC, bool &Invert,
12264 SDValue &OtherOp,
12265 SelectionDAG &DAG) {
12266 switch (N->getOpcode()) {
12267 default: return false;
12268 case ISD::SELECT: {
12269 CC = N->getOperand(0);
12270 SDValue N1 = N->getOperand(1);
12271 SDValue N2 = N->getOperand(2);
12272 if (isZeroOrAllOnes(N1, AllOnes)) {
12273 Invert = false;
12274 OtherOp = N2;
12275 return true;
12276 }
12277 if (isZeroOrAllOnes(N2, AllOnes)) {
12278 Invert = true;
12279 OtherOp = N1;
12280 return true;
12281 }
12282 return false;
12283 }
12284 case ISD::ZERO_EXTEND:
12285 // (zext cc) can never be the all ones value.
12286 if (AllOnes)
12287 return false;
12288 [[fallthrough]];
12289 case ISD::SIGN_EXTEND: {
12290 SDLoc dl(N);
12291 EVT VT = N->getValueType(0);
12292 CC = N->getOperand(0);
12293 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12294 return false;
12295 Invert = !AllOnes;
12296 if (AllOnes)
12297 // When looking for an AllOnes constant, N is an sext, and the 'other'
12298 // value is 0.
12299 OtherOp = DAG.getConstant(0, dl, VT);
12300 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12301 // When looking for a 0 constant, N can be zext or sext.
12302 OtherOp = DAG.getConstant(1, dl, VT);
12303 else
12304 OtherOp = DAG.getAllOnesConstant(dl, VT);
12305 return true;
12306 }
12307 }
12308}
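// For example, with AllOnes == false, N = (select cc, 0, c) is matched with
// CC = cc, Invert = false and OtherOp = c, while N = (select cc, c, 0) is
// matched with Invert = true and OtherOp = c.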
12309
12310// Combine a constant select operand into its use:
12311//
12312// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12313// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12314// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12315// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12316// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12317//
12318// The transform is rejected if the select doesn't have a constant operand that
12319// is null, or all ones when AllOnes is set.
12320//
12321// Also recognize sext/zext from i1:
12322//
12323// (add (zext cc), x) -> (select cc (add x, 1), x)
12324// (add (sext cc), x) -> (select cc (add x, -1), x)
12325//
12326// These transformations eventually create predicated instructions.
12327//
12328// @param N The node to transform.
12329// @param Slct The N operand that is a select.
12330// @param OtherOp The other N operand (x above).
12331// @param DCI Context.
12332// @param AllOnes Require the select constant to be all ones instead of null.
12333// @returns The new node, or SDValue() on failure.
12334static
12337 bool AllOnes = false) {
12338 SelectionDAG &DAG = DCI.DAG;
12339 EVT VT = N->getValueType(0);
12340 SDValue NonConstantVal;
12341 SDValue CCOp;
12342 bool SwapSelectOps;
12343 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12344 NonConstantVal, DAG))
12345 return SDValue();
12346
 12347 // Slct is now known to be the desired identity constant when CC is true.
12348 SDValue TrueVal = OtherOp;
12349 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12350 OtherOp, NonConstantVal);
12351 // Unless SwapSelectOps says CC should be false.
12352 if (SwapSelectOps)
12353 std::swap(TrueVal, FalseVal);
12354
12355 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12356 CCOp, TrueVal, FalseVal);
12357}
12358
12359// Attempt combineSelectAndUse on each operand of a commutative operator N.
12360static
12363 SDValue N0 = N->getOperand(0);
12364 SDValue N1 = N->getOperand(1);
12365 if (N0.getNode()->hasOneUse())
12366 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12367 return Result;
12368 if (N1.getNode()->hasOneUse())
12369 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12370 return Result;
12371 return SDValue();
12372}
12373
12375 // VUZP shuffle node.
12376 if (N->getOpcode() == ARMISD::VUZP)
12377 return true;
12378
12379 // "VUZP" on i32 is an alias for VTRN.
12380 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12381 return true;
12382
12383 return false;
12384}
12385
12388 const ARMSubtarget *Subtarget) {
12389 // Look for ADD(VUZP.0, VUZP.1).
12390 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12391 N0 == N1)
12392 return SDValue();
12393
12394 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12395 if (!N->getValueType(0).is64BitVector())
12396 return SDValue();
12397
12398 // Generate vpadd.
12399 SelectionDAG &DAG = DCI.DAG;
12400 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12401 SDLoc dl(N);
12402 SDNode *Unzip = N0.getNode();
12403 EVT VT = N->getValueType(0);
12404
12406 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12407 TLI.getPointerTy(DAG.getDataLayout())));
12408 Ops.push_back(Unzip->getOperand(0));
12409 Ops.push_back(Unzip->getOperand(1));
12410
12411 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12412}
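// AddCombineToVPADD thus rewrites add(vuzp(a, b).0, vuzp(a, b).1), which sums
// each even/odd lane pair, into a single arm_neon_vpadd intrinsic call on
// (a, b), i.e. one VPADD instruction.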
12413
12416 const ARMSubtarget *Subtarget) {
12417 // Check for two extended operands.
12418 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12419 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12420 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12421 N1.getOpcode() == ISD::ZERO_EXTEND))
12422 return SDValue();
12423
12424 SDValue N00 = N0.getOperand(0);
12425 SDValue N10 = N1.getOperand(0);
12426
12427 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12428 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12429 N00 == N10)
12430 return SDValue();
12431
12432 // We only recognize Q register paddl here; this can't be reached until
12433 // after type legalization.
12434 if (!N00.getValueType().is64BitVector() ||
12436 return SDValue();
12437
12438 // Generate vpaddl.
12439 SelectionDAG &DAG = DCI.DAG;
12440 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12441 SDLoc dl(N);
12442 EVT VT = N->getValueType(0);
12443
12445 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12446 unsigned Opcode;
12447 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12448 Opcode = Intrinsic::arm_neon_vpaddls;
12449 else
12450 Opcode = Intrinsic::arm_neon_vpaddlu;
12451 Ops.push_back(DAG.getConstant(Opcode, dl,
12452 TLI.getPointerTy(DAG.getDataLayout())));
12453 EVT ElemTy = N00.getValueType().getVectorElementType();
12454 unsigned NumElts = VT.getVectorNumElements();
12455 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12456 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12457 N00.getOperand(0), N00.getOperand(1));
12458 Ops.push_back(Concat);
12459
12460 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12461}
12462
12463// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12464// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12465// much easier to match.
12466static SDValue
12469 const ARMSubtarget *Subtarget) {
 12470 // Only perform this optimization after legalization and if NEON is available.
 12471 // We also expect both operands to be BUILD_VECTORs.
12472 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12473 || N0.getOpcode() != ISD::BUILD_VECTOR
12474 || N1.getOpcode() != ISD::BUILD_VECTOR)
12475 return SDValue();
12476
12477 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12478 EVT VT = N->getValueType(0);
12479 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12480 return SDValue();
12481
12482 // Check that the vector operands are of the right form.
12483 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12484 // operands, where N is the size of the formed vector.
12485 // Each EXTRACT_VECTOR should have the same input vector and odd or even
 12486 // index such that we have a pairwise add pattern.
12487
12488 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12490 return SDValue();
12491 SDValue Vec = N0->getOperand(0)->getOperand(0);
12492 SDNode *V = Vec.getNode();
12493 unsigned nextIndex = 0;
12494
12495 // For each operands to the ADD which are BUILD_VECTORs,
12496 // check to see if each of their operands are an EXTRACT_VECTOR with
12497 // the same vector and appropriate index.
12498 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12501
12502 SDValue ExtVec0 = N0->getOperand(i);
12503 SDValue ExtVec1 = N1->getOperand(i);
12504
 12505 // First operand is the vector; verify it's the same.
12506 if (V != ExtVec0->getOperand(0).getNode() ||
12507 V != ExtVec1->getOperand(0).getNode())
12508 return SDValue();
12509
 12510 // Second is the constant; verify it's correct.
12513
12514 // For the constant, we want to see all the even or all the odd.
12515 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12516 || C1->getZExtValue() != nextIndex+1)
12517 return SDValue();
12518
12519 // Increment index.
12520 nextIndex+=2;
12521 } else
12522 return SDValue();
12523 }
12524
12525 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12526 // we're using the entire input vector, otherwise there's a size/legality
12527 // mismatch somewhere.
12528 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12530 return SDValue();
12531
12532 // Create VPADDL node.
12533 SelectionDAG &DAG = DCI.DAG;
12534 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12535
12536 SDLoc dl(N);
12537
12538 // Build operand list.
12540 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12541 TLI.getPointerTy(DAG.getDataLayout())));
12542
12543 // Input is the vector.
12544 Ops.push_back(Vec);
12545
12546 // Get widened type and narrowed type.
12547 MVT widenType;
12548 unsigned numElem = VT.getVectorNumElements();
12549
12550 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12551 switch (inputLaneType.getSimpleVT().SimpleTy) {
12552 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12553 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12554 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12555 default:
12556 llvm_unreachable("Invalid vector element type for padd optimization.");
12557 }
12558
12559 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12560 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12561 return DAG.getNode(ExtOp, dl, VT, tmp);
12562}
12563
12565 if (V->getOpcode() == ISD::UMUL_LOHI ||
12566 V->getOpcode() == ISD::SMUL_LOHI)
12567 return V;
12568 return SDValue();
12569}
12570
12571static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12573 const ARMSubtarget *Subtarget) {
12574 if (!Subtarget->hasBaseDSP())
12575 return SDValue();
12576
12577 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
 12578 // accumulate the product into a 64-bit value. The 16-bit values will
12579 // be sign extended somehow or SRA'd into 32-bit values
12580 // (addc (adde (mul 16bit, 16bit), lo), hi)
12581 SDValue Mul = AddcNode->getOperand(0);
12582 SDValue Lo = AddcNode->getOperand(1);
12583 if (Mul.getOpcode() != ISD::MUL) {
12584 Lo = AddcNode->getOperand(0);
12585 Mul = AddcNode->getOperand(1);
12586 if (Mul.getOpcode() != ISD::MUL)
12587 return SDValue();
12588 }
12589
12590 SDValue SRA = AddeNode->getOperand(0);
12591 SDValue Hi = AddeNode->getOperand(1);
12592 if (SRA.getOpcode() != ISD::SRA) {
12593 SRA = AddeNode->getOperand(1);
12594 Hi = AddeNode->getOperand(0);
12595 if (SRA.getOpcode() != ISD::SRA)
12596 return SDValue();
12597 }
12598 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12599 if (Const->getZExtValue() != 31)
12600 return SDValue();
12601 } else
12602 return SDValue();
12603
12604 if (SRA.getOperand(0) != Mul)
12605 return SDValue();
12606
12607 SelectionDAG &DAG = DCI.DAG;
12608 SDLoc dl(AddcNode);
12609 unsigned Opcode = 0;
12610 SDValue Op0;
12611 SDValue Op1;
12612
12613 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12614 Opcode = ARMISD::SMLALBB;
12615 Op0 = Mul.getOperand(0);
12616 Op1 = Mul.getOperand(1);
12617 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12618 Opcode = ARMISD::SMLALBT;
12619 Op0 = Mul.getOperand(0);
12620 Op1 = Mul.getOperand(1).getOperand(0);
12621 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12622 Opcode = ARMISD::SMLALTB;
12623 Op0 = Mul.getOperand(0).getOperand(0);
12624 Op1 = Mul.getOperand(1);
12625 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12626 Opcode = ARMISD::SMLALTT;
12627 Op0 = Mul->getOperand(0).getOperand(0);
12628 Op1 = Mul->getOperand(1).getOperand(0);
12629 }
12630
12631 if (!Op0 || !Op1)
12632 return SDValue();
12633
12634 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12635 Op0, Op1, Lo, Hi);
12636 // Replace the ADDs' nodes uses by the MLA node's values.
12637 SDValue HiMLALResult(SMLAL.getNode(), 1);
12638 SDValue LoMLALResult(SMLAL.getNode(), 0);
12639
12640 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12641 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12642
12643 // Return original node to notify the driver to stop replacing.
12644 SDValue resNode(AddcNode, 0);
12645 return resNode;
12646}
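// To summarize AddCombineTo64BitSMLAL16: a pattern such as
//   (adde (sra (mul (sext16 a), (sext16 b)), 31), hi) glued to
//   (addc (mul (sext16 a), (sext16 b)), lo)
// becomes SMLALBB a, b, lo, hi; the BT/TB/TT variants are chosen when one or
// both multiplicands are taken from the top halfword via a 16-bit sra.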
12647
12650 const ARMSubtarget *Subtarget) {
12651 // Look for multiply add opportunities.
 12652 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
 12653 // each add node consumes a value from ISD::UMUL_LOHI and there is
12654 // a glue link from the first add to the second add.
12655 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12656 // a S/UMLAL instruction.
 12657 //                  UMUL_LOHI
 12658 //                 / :lo    \ :hi
 12659 //                V          \          [no multiline comment]
 12660 //  loAdd ->  ADDC            |
 12661 //                \ :carry   /
 12662 //                 V        V
 12663 //                   ADDE <- hiAdd
12664 //
12665 // In the special case where only the higher part of a signed result is used
12666 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12667 // a constant with the exact value of 0x80000000, we recognize we are dealing
12668 // with a "rounded multiply and add" (or subtract) and transform it into
12669 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12670
12671 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12672 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12673 "Expect an ADDE or SUBE");
12674
12675 assert(AddeSubeNode->getNumOperands() == 3 &&
12676 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12677 "ADDE node has the wrong inputs");
12678
12679 // Check that we are chained to the right ADDC or SUBC node.
12680 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12681 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12682 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12683 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12684 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12685 return SDValue();
12686
12687 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12688 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12689
12690 // Check if the two operands are from the same mul_lohi node.
12691 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12692 return SDValue();
12693
12694 assert(AddcSubcNode->getNumValues() == 2 &&
12695 AddcSubcNode->getValueType(0) == MVT::i32 &&
12696 "Expect ADDC with two result values. First: i32");
12697
12698 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
 12699 // may be an SMLAL which multiplies two 16-bit values.
12700 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12701 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12702 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12703 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12704 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12705 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12706
12707 // Check for the triangle shape.
12708 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12709 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12710
12711 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12712 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12713 return SDValue();
12714
12715 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12716 bool IsLeftOperandMUL = false;
12717 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12718 if (MULOp == SDValue())
12719 MULOp = findMUL_LOHI(AddeSubeOp1);
12720 else
12721 IsLeftOperandMUL = true;
12722 if (MULOp == SDValue())
12723 return SDValue();
12724
12725 // Figure out the right opcode.
12726 unsigned Opc = MULOp->getOpcode();
12727 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12728
12729 // Figure out the high and low input values to the MLAL node.
12730 SDValue *HiAddSub = nullptr;
12731 SDValue *LoMul = nullptr;
12732 SDValue *LowAddSub = nullptr;
12733
12734 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12735 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12736 return SDValue();
12737
12738 if (IsLeftOperandMUL)
12739 HiAddSub = &AddeSubeOp1;
12740 else
12741 HiAddSub = &AddeSubeOp0;
12742
 12743 // Ensure that LoMul and LowAddSub are taken from the correct ISD::SMUL_LOHI node
12744 // whose low result is fed to the ADDC/SUBC we are checking.
12745
12746 if (AddcSubcOp0 == MULOp.getValue(0)) {
12747 LoMul = &AddcSubcOp0;
12748 LowAddSub = &AddcSubcOp1;
12749 }
12750 if (AddcSubcOp1 == MULOp.getValue(0)) {
12751 LoMul = &AddcSubcOp1;
12752 LowAddSub = &AddcSubcOp0;
12753 }
12754
12755 if (!LoMul)
12756 return SDValue();
12757
12758 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12759 // the replacement below will create a cycle.
12760 if (AddcSubcNode == HiAddSub->getNode() ||
12761 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12762 return SDValue();
12763
12764 // Create the merged node.
12765 SelectionDAG &DAG = DCI.DAG;
12766
12767 // Start building operand list.
12769 Ops.push_back(LoMul->getOperand(0));
12770 Ops.push_back(LoMul->getOperand(1));
12771
12772 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12773 // the case, we must be doing signed multiplication and only use the higher
 12774 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
 12775 // addition or subtraction with the value of 0x80000000.
12776 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12777 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12778 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12779 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12780 0x80000000) {
12781 Ops.push_back(*HiAddSub);
12782 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12783 FinalOpc = ARMISD::SMMLSR;
12784 } else {
12785 FinalOpc = ARMISD::SMMLAR;
12786 }
12787 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12788 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12789
12790 return SDValue(AddeSubeNode, 0);
12791 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12792 // SMMLS is generated during instruction selection and the rest of this
 12793 // function cannot handle the case where AddcSubcNode is a SUBC.
12794 return SDValue();
12795
12796 // Finish building the operand list for {U/S}MLAL
12797 Ops.push_back(*LowAddSub);
12798 Ops.push_back(*HiAddSub);
12799
12800 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12801 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12802
12803 // Replace the ADDs' nodes uses by the MLA node's values.
12804 SDValue HiMLALResult(MLALNode.getNode(), 1);
12805 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12806
12807 SDValue LoMLALResult(MLALNode.getNode(), 0);
12808 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12809
12810 // Return original node to notify the driver to stop replacing.
12811 return SDValue(AddeSubeNode, 0);
12812}
12813
12816 const ARMSubtarget *Subtarget) {
12817 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12818 // While trying to combine for the other MLAL nodes, first search for the
12819 // chance to use UMAAL. Check if Addc uses a node which has already
12820 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12821 // as the addend, and it's handled in PerformUMLALCombine.
12822
12823 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12824 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12825
12826 // Check that we have a glued ADDC node.
12827 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12828 if (AddcNode->getOpcode() != ARMISD::ADDC)
12829 return SDValue();
12830
12831 // Find the converted UMAAL or quit if it doesn't exist.
12832 SDNode *UmlalNode = nullptr;
12833 SDValue AddHi;
12834 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12835 UmlalNode = AddcNode->getOperand(0).getNode();
12836 AddHi = AddcNode->getOperand(1);
12837 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12838 UmlalNode = AddcNode->getOperand(1).getNode();
12839 AddHi = AddcNode->getOperand(0);
12840 } else {
12841 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12842 }
12843
12844 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12845 // the ADDC as well as Zero.
12846 if (!isNullConstant(UmlalNode->getOperand(3)))
12847 return SDValue();
12848
12849 if ((isNullConstant(AddeNode->getOperand(0)) &&
12850 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12851 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12852 isNullConstant(AddeNode->getOperand(1)))) {
12853 SelectionDAG &DAG = DCI.DAG;
12854 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12855 UmlalNode->getOperand(2), AddHi };
12856 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12857 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12858
12859 // Replace the ADDs' nodes uses by the UMAAL node's values.
12860 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12861 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12862
12863 // Return original node to notify the driver to stop replacing.
12864 return SDValue(AddeNode, 0);
12865 }
12866 return SDValue();
12867}
12868
12870 const ARMSubtarget *Subtarget) {
12871 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12872 return SDValue();
12873
12874 // Check that we have a pair of ADDC and ADDE as operands.
12875 // Both addends of the ADDE must be zero.
12876 SDNode* AddcNode = N->getOperand(2).getNode();
12877 SDNode* AddeNode = N->getOperand(3).getNode();
12878 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12879 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12880 isNullConstant(AddeNode->getOperand(0)) &&
12881 isNullConstant(AddeNode->getOperand(1)) &&
12882 (AddeNode->getOperand(2).getNode() == AddcNode))
12883 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12884 DAG.getVTList(MVT::i32, MVT::i32),
12885 {N->getOperand(0), N->getOperand(1),
12886 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12887 else
12888 return SDValue();
12889}
12890
12891 static SDValue PerformAddcSubcCombine(SDNode *N,
12892 TargetLowering::DAGCombinerInfo &DCI,
12893 const ARMSubtarget *Subtarget) {
12894 SelectionDAG &DAG(DCI.DAG);
12895
12896 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
12897 // (SUBC (ADDE 0, 0, C), 1) -> C
12898 SDValue LHS = N->getOperand(0);
12899 SDValue RHS = N->getOperand(1);
12900 if (LHS->getOpcode() == ARMISD::ADDE &&
12901 isNullConstant(LHS->getOperand(0)) &&
12902 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12903 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12904 }
12905 }
12906
12907 if (Subtarget->isThumb1Only()) {
12908 SDValue RHS = N->getOperand(1);
12909 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12910 int32_t imm = C->getSExtValue();
12911 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12912 SDLoc DL(N);
12913 RHS = DAG.getConstant(-imm, DL, MVT::i32);
12914 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12915 : ARMISD::ADDC;
12916 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12917 }
12918 }
12919 }
12920
12921 return SDValue();
12922}
12923
12924 static SDValue PerformAddeSubeCombine(SDNode *N,
12925 TargetLowering::DAGCombinerInfo &DCI,
12926 const ARMSubtarget *Subtarget) {
12927 if (Subtarget->isThumb1Only()) {
12928 SelectionDAG &DAG = DCI.DAG;
12929 SDValue RHS = N->getOperand(1);
12930 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12931 int64_t imm = C->getSExtValue();
12932 if (imm < 0) {
12933 SDLoc DL(N);
12934
12935 // The with-carry-in form matches bitwise not instead of the negation.
12936 // Effectively, the inverse interpretation of the carry flag already
12937 // accounts for part of the negation.
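// For example, (ARMISD::ADDE x, -5, carry) becomes (ARMISD::SUBE x, 4, carry),
// since ~(-5) == 4.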
12938 RHS = DAG.getConstant(~imm, DL, MVT::i32);
12939
12940 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
12941 : ARMISD::ADDE;
12942 return DAG.getNode(Opcode, DL, N->getVTList(),
12943 N->getOperand(0), RHS, N->getOperand(2));
12944 }
12945 }
12946 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
12947 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
12948 }
12949 return SDValue();
12950}
12951
12952 static SDValue PerformSELECTCombine(SDNode *N,
12953 TargetLowering::DAGCombinerInfo &DCI,
12954 const ARMSubtarget *Subtarget) {
12955 if (!Subtarget->hasMVEIntegerOps())
12956 return SDValue();
12957
12958 SDLoc dl(N);
12959 SDValue SetCC;
12960 SDValue LHS;
12961 SDValue RHS;
12962 ISD::CondCode CC;
12963 SDValue TrueVal;
12964 SDValue FalseVal;
12965
12966 if (N->getOpcode() == ISD::SELECT &&
12967 N->getOperand(0)->getOpcode() == ISD::SETCC) {
12968 SetCC = N->getOperand(0);
12969 LHS = SetCC->getOperand(0);
12970 RHS = SetCC->getOperand(1);
12971 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
12972 TrueVal = N->getOperand(1);
12973 FalseVal = N->getOperand(2);
12974 } else if (N->getOpcode() == ISD::SELECT_CC) {
12975 LHS = N->getOperand(0);
12976 RHS = N->getOperand(1);
12977 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
12978 TrueVal = N->getOperand(2);
12979 FalseVal = N->getOperand(3);
12980 } else {
12981 return SDValue();
12982 }
12983
12984 unsigned int Opcode = 0;
12985 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
12986 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
12987 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
12988 Opcode = ARMISD::VMINVu;
12989 if (CC == ISD::SETUGT)
12990 std::swap(TrueVal, FalseVal);
12991 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
12992 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
12993 (CC == ISD::SETLT || CC == ISD::SETGT)) {
12994 Opcode = ARMISD::VMINVs;
12995 if (CC == ISD::SETGT)
12996 std::swap(TrueVal, FalseVal);
12997 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
12998 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
12999 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13000 Opcode = ARMISD::VMAXVu;
13001 if (CC == ISD::SETULT)
13002 std::swap(TrueVal, FalseVal);
13003 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13004 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13005 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13006 Opcode = ARMISD::VMAXVs;
13007 if (CC == ISD::SETLT)
13008 std::swap(TrueVal, FalseVal);
13009 } else
13010 return SDValue();
13011
13012 // Normalise to the right hand side being the vector reduction
13013 switch (TrueVal->getOpcode()) {
13014 case ISD::VECREDUCE_UMIN:
13015 case ISD::VECREDUCE_SMIN:
13016 case ISD::VECREDUCE_UMAX:
13017 case ISD::VECREDUCE_SMAX:
13018 std::swap(LHS, RHS);
13019 std::swap(TrueVal, FalseVal);
13020 break;
13021 }
13022
13023 EVT VectorType = FalseVal->getOperand(0).getValueType();
13024
13025 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13026 VectorType != MVT::v4i32)
13027 return SDValue();
13028
13029 EVT VectorScalarType = VectorType.getVectorElementType();
13030
13031 // The values being selected must also be the ones being compared
13032 if (TrueVal != LHS || FalseVal != RHS)
13033 return SDValue();
13034
13035 EVT LeftType = LHS->getValueType(0);
13036 EVT RightType = RHS->getValueType(0);
13037
13038 // The types must match the reduced type too
13039 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13040 return SDValue();
13041
13042 // Legalise the scalar to an i32
13043 if (VectorScalarType != MVT::i32)
13044 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13045
13046 // Generate the reduction as an i32 for legalisation purposes
13047 auto Reduction =
13048 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13049
13050 // The result isn't actually an i32 so truncate it back to its original type
13051 if (VectorScalarType != MVT::i32)
13052 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13053
13054 return Reduction;
13055}
13056
13057 // A special combine for the vqdmulh family of instructions. This is one of the
13058 // potential set of patterns that could match this instruction. The base pattern
13059 // you would expect is min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13060 // This matches the variant min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13061 // which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13062 // the max is unnecessary.
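// For example, with i16 elements the clamp constant is 32767 and the shift is 15:
// smin(sra(mul(sext(x), sext(y)), 15), splat(32767)) becomes an ARMISD::VQDMULH
// of x and y.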
13063 static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13064 EVT VT = N->getValueType(0);
13065 SDValue Shft;
13066 ConstantSDNode *Clamp;
13067
13068 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13069 return SDValue();
13070
13071 if (N->getOpcode() == ISD::SMIN) {
13072 Shft = N->getOperand(0);
13073 Clamp = isConstOrConstSplat(N->getOperand(1));
13074 } else if (N->getOpcode() == ISD::VSELECT) {
13075 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13076 SDValue Cmp = N->getOperand(0);
13077 if (Cmp.getOpcode() != ISD::SETCC ||
13078 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13079 Cmp.getOperand(0) != N->getOperand(1) ||
13080 Cmp.getOperand(1) != N->getOperand(2))
13081 return SDValue();
13082 Shft = N->getOperand(1);
13083 Clamp = isConstOrConstSplat(N->getOperand(2));
13084 } else
13085 return SDValue();
13086
13087 if (!Clamp)
13088 return SDValue();
13089
13090 MVT ScalarType;
13091 int ShftAmt = 0;
13092 switch (Clamp->getSExtValue()) {
13093 case (1 << 7) - 1:
13094 ScalarType = MVT::i8;
13095 ShftAmt = 7;
13096 break;
13097 case (1 << 15) - 1:
13098 ScalarType = MVT::i16;
13099 ShftAmt = 15;
13100 break;
13101 case (1ULL << 31) - 1:
13102 ScalarType = MVT::i32;
13103 ShftAmt = 31;
13104 break;
13105 default:
13106 return SDValue();
13107 }
13108
13109 if (Shft.getOpcode() != ISD::SRA)
13110 return SDValue();
13111 ConstantSDNode *N1 = isConstOrConstSplat(Shft.getOperand(1));
13112 if (!N1 || N1->getSExtValue() != ShftAmt)
13113 return SDValue();
13114
13115 SDValue Mul = Shft.getOperand(0);
13116 if (Mul.getOpcode() != ISD::MUL)
13117 return SDValue();
13118
13119 SDValue Ext0 = Mul.getOperand(0);
13120 SDValue Ext1 = Mul.getOperand(1);
13121 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13122 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13123 return SDValue();
13124 EVT VecVT = Ext0.getOperand(0).getValueType();
13125 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13126 return SDValue();
13127 if (Ext1.getOperand(0).getValueType() != VecVT ||
13128 VecVT.getScalarType() != ScalarType ||
13129 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13130 return SDValue();
13131
13132 SDLoc DL(Mul);
13133 unsigned LegalLanes = 128 / (ShftAmt + 1);
13134 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13135 // For types smaller than legal vectors extend to be legal and only use needed
13136 // lanes.
13137 if (VecVT.getSizeInBits() < 128) {
13138 EVT ExtVecVT =
13139 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13140 VecVT.getVectorNumElements());
13141 SDValue Inp0 =
13142 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13143 SDValue Inp1 =
13144 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13145 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13146 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13147 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13148 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13149 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13150 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13151 }
13152
13153 // For larger types, split into legal sized chunks.
13154 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13155 unsigned NumParts = VecVT.getSizeInBits() / 128;
13156 SmallVector<SDValue> Parts;
13157 for (unsigned I = 0; I < NumParts; ++I) {
13158 SDValue Inp0 =
13159 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13160 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13161 SDValue Inp1 =
13162 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13163 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13164 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13165 Parts.push_back(VQDMULH);
13166 }
13167 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13168 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13169}
13170
13171 static SDValue PerformVSELECTCombine(SDNode *N,
13172 TargetLowering::DAGCombinerInfo &DCI,
13173 const ARMSubtarget *Subtarget) {
13174 if (!Subtarget->hasMVEIntegerOps())
13175 return SDValue();
13176
13177 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13178 return V;
13179
13180 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13181 //
13182 // We need to re-implement this optimization here as the implementation in the
13183 // Target-Independent DAGCombiner does not handle the kind of constant we make
13184 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13185 // good reason, allowing truncation there would break other targets).
13186 //
13187 // Currently, this is only done for MVE, as it's the only target that benefits
13188 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13189 if (N->getOperand(0).getOpcode() != ISD::XOR)
13190 return SDValue();
13191 SDValue XOR = N->getOperand(0);
13192
13193 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13194 // It is important to check with truncation allowed as the BUILD_VECTORs we
13195 // generate in those situations will truncate their operands.
13196 ConstantSDNode *Const =
13197 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13198 /*AllowTruncation*/ true);
13199 if (!Const || !Const->isOne())
13200 return SDValue();
13201
13202 // Rewrite into vselect(cond, rhs, lhs).
13203 SDValue Cond = XOR->getOperand(0);
13204 SDValue LHS = N->getOperand(1);
13205 SDValue RHS = N->getOperand(2);
13206 EVT Type = N->getValueType(0);
13207 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13208}
13209
13210// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
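// For example, setcc(build_vector(0,1,2,3), splat(n), ult) producing v4i1 becomes
// @llvm.arm.mve.vctp32(n), i.e. a predicate with the first n lanes set.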
13211 static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13212 TargetLowering::DAGCombinerInfo &DCI,
13213 const ARMSubtarget *Subtarget) {
13214 SDValue Op0 = N->getOperand(0);
13215 SDValue Op1 = N->getOperand(1);
13216 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13217 EVT VT = N->getValueType(0);
13218
13219 if (!Subtarget->hasMVEIntegerOps() ||
13220 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13221 return SDValue();
13222
13223 if (CC == ISD::SETUGE) {
13224 std::swap(Op0, Op1);
13225 CC = ISD::SETULT;
13226 }
13227
13228 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13229 Op0.getOpcode() != ISD::BUILD_VECTOR)
13230 return SDValue();
13231
13232 // Check first operand is BuildVector of 0,1,2,...
13233 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13234 if (!Op0.getOperand(I).isUndef() &&
13235 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13236 Op0.getConstantOperandVal(I) == I))
13237 return SDValue();
13238 }
13239
13240 // The second is a Splat of Op1S
13241 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13242 if (!Op1S)
13243 return SDValue();
13244
13245 unsigned Opc;
13246 switch (VT.getVectorNumElements()) {
13247 case 2:
13248 Opc = Intrinsic::arm_mve_vctp64;
13249 break;
13250 case 4:
13251 Opc = Intrinsic::arm_mve_vctp32;
13252 break;
13253 case 8:
13254 Opc = Intrinsic::arm_mve_vctp16;
13255 break;
13256 case 16:
13257 Opc = Intrinsic::arm_mve_vctp8;
13258 break;
13259 default:
13260 return SDValue();
13261 }
13262
13263 SDLoc DL(N);
13264 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13265 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13266 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13267}
13268
13269/// PerformADDECombine - Target-specific dag combine transform from
13270/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13271/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13272 static SDValue PerformADDECombine(SDNode *N,
13273 TargetLowering::DAGCombinerInfo &DCI,
13274 const ARMSubtarget *Subtarget) {
13275 // Only ARM and Thumb2 support UMLAL/SMLAL.
13276 if (Subtarget->isThumb1Only())
13277 return PerformAddeSubeCombine(N, DCI, Subtarget);
13278
13279 // Only perform the checks after legalize when the pattern is available.
13280 if (DCI.isBeforeLegalize()) return SDValue();
13281
13282 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13283}
13284
13285/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13286/// operands N0 and N1. This is a helper for PerformADDCombine that is
13287/// called with the default operands, and if that fails, with commuted
13288/// operands.
13289 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13290 TargetLowering::DAGCombinerInfo &DCI,
13291 const ARMSubtarget *Subtarget){
13292 // Attempt to create vpadd for this add.
13293 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13294 return Result;
13295
13296 // Attempt to create vpaddl for this add.
13297 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13298 return Result;
13299 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13300 Subtarget))
13301 return Result;
13302
13303 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13304 if (N0.getNode()->hasOneUse())
13305 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13306 return Result;
13307 return SDValue();
13308}
13309
13310 static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13311 EVT VT = N->getValueType(0);
13312 SDValue N0 = N->getOperand(0);
13313 SDValue N1 = N->getOperand(1);
13314 SDLoc dl(N);
13315
13316 auto IsVecReduce = [](SDValue Op) {
13317 switch (Op.getOpcode()) {
13318 case ISD::VECREDUCE_ADD:
13319 case ARMISD::VADDVs:
13320 case ARMISD::VADDVu:
13321 case ARMISD::VMLAVs:
13322 case ARMISD::VMLAVu:
13323 return true;
13324 }
13325 return false;
13326 };
13327
13328 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13329 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13330 // add(add(X, vecreduce(Y)), vecreduce(Z))
13331 // to make better use of vaddva style instructions.
13332 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13333 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13334 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13335 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13336 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13337 }
13338 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13339 // add(add(add(A, C), reduce(B)), reduce(D))
13340 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13341 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13342 unsigned N0RedOp = 0;
13343 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13344 N0RedOp = 1;
13345 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13346 return SDValue();
13347 }
13348
13349 unsigned N1RedOp = 0;
13350 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13351 N1RedOp = 1;
13352 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13353 return SDValue();
13354
13355 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13356 N1.getOperand(1 - N1RedOp));
13357 SDValue Add1 =
13358 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13359 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13360 }
13361 return SDValue();
13362 };
13363 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13364 return R;
13365 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13366 return R;
13367
13368 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13369 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13370 // by ascending load offsets. This can help cores prefetch if the order of
13371 // loads is more predictable.
13372 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13373 // Check if two reductions are known to load data where one is before/after
13374 // another. Return negative if N0 loads data before N1, positive if N1 is
13375 // before N0 and 0 otherwise if nothing is known.
13376 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13377 // Look through to the first operand of a MUL, for the VMLA case.
13378 // Currently only looks at the first operand, in the hope they are equal.
13379 if (N0.getOpcode() == ISD::MUL)
13380 N0 = N0.getOperand(0);
13381 if (N1.getOpcode() == ISD::MUL)
13382 N1 = N1.getOperand(0);
13383
13384 // Return true if the two operands are loads to the same object and the
13385 // offset of the first is known to be less than the offset of the second.
13386 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13387 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13388 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13389 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13390 Load1->isIndexed())
13391 return 0;
13392
13393 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13394 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13395
13396 if (!BaseLocDecomp0.getBase() ||
13397 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13398 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13399 return 0;
13400 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13401 return -1;
13402 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13403 return 1;
13404 return 0;
13405 };
13406
13407 SDValue X;
13408 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13409 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13410 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13411 N0.getOperand(1).getOperand(0));
13412 if (IsBefore < 0) {
13413 X = N0.getOperand(0);
13414 N0 = N0.getOperand(1);
13415 } else if (IsBefore > 0) {
13416 X = N0.getOperand(1);
13417 N0 = N0.getOperand(0);
13418 } else
13419 return SDValue();
13420 } else if (IsVecReduce(N0.getOperand(0))) {
13421 X = N0.getOperand(1);
13422 N0 = N0.getOperand(0);
13423 } else if (IsVecReduce(N0.getOperand(1))) {
13424 X = N0.getOperand(0);
13425 N0 = N0.getOperand(1);
13426 } else
13427 return SDValue();
13428 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13429 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13430 // Note this is backward to how you would expect. We create
13431 // add(reduce(load + 16), reduce(load + 0)) so that the
13432 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13433 // the X as VADDV(load + 0)
13434 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13435 } else
13436 return SDValue();
13437
13438 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13439 return SDValue();
13440
13441 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13442 return SDValue();
13443
13444 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13445 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13446 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13447 };
13448 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13449 return R;
13450 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13451 return R;
13452 return SDValue();
13453}
13454
13455 static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13456 const ARMSubtarget *Subtarget) {
13457 if (!Subtarget->hasMVEIntegerOps())
13458 return SDValue();
13459
13460 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13461 return R;
13462
13463 EVT VT = N->getValueType(0);
13464 SDValue N0 = N->getOperand(0);
13465 SDValue N1 = N->getOperand(1);
13466 SDLoc dl(N);
13467
13468 if (VT != MVT::i64)
13469 return SDValue();
13470
13471 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13472 // will look like:
13473 // t1: i32,i32 = ARMISD::VADDLVs x
13474 // t2: i64 = build_pair t1, t1:1
13475 // t3: i64 = add t2, y
13476 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13477 // the add to be simplified separately.
13478 // We also need to check for sext / zext and commutative adds.
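// For example, add(build_pair(VADDLVs(x)), y) is rebuilt as
// build_pair(VADDLVAs(lo(y), hi(y), x)), folding the scalar addend into the
// accumulating form of the reduction.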
13479 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13480 SDValue NB) {
13481 if (NB->getOpcode() != ISD::BUILD_PAIR)
13482 return SDValue();
13483 SDValue VecRed = NB->getOperand(0);
13484 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13485 VecRed.getResNo() != 0 ||
13486 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13487 return SDValue();
13488
13489 if (VecRed->getOpcode() == OpcodeA) {
13490 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13491 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13492 VecRed.getOperand(0), VecRed.getOperand(1));
13493 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13494 }
13495
13496 SmallVector<SDValue, 4> Ops(2);
13497 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13498
13499 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13500 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13501 Ops.push_back(VecRed->getOperand(I));
13502 SDValue Red =
13503 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13504 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13505 SDValue(Red.getNode(), 1));
13506 };
13507
13508 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13509 return M;
13510 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13511 return M;
13512 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13513 return M;
13514 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13515 return M;
13516 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13517 return M;
13518 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13519 return M;
13520 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13521 return M;
13522 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13523 return M;
13524 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13525 return M;
13526 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13527 return M;
13528 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13529 return M;
13530 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13531 return M;
13532 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13533 return M;
13534 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13535 return M;
13536 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13537 return M;
13538 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13539 return M;
13540 return SDValue();
13541}
13542
13543bool
13544 ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13545 CombineLevel Level) const {
13546 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13547 N->getOpcode() == ISD::SRL) &&
13548 "Expected shift op");
13549
13550 SDValue ShiftLHS = N->getOperand(0);
13551 if (!ShiftLHS->hasOneUse())
13552 return false;
13553
13554 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13555 !ShiftLHS.getOperand(0)->hasOneUse())
13556 return false;
13557
13558 if (Level == BeforeLegalizeTypes)
13559 return true;
13560
13561 if (N->getOpcode() != ISD::SHL)
13562 return true;
13563
13564 if (Subtarget->isThumb1Only()) {
13565 // Avoid making expensive immediates by commuting shifts. (This logic
13566 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13567 // for free.)
13568 if (N->getOpcode() != ISD::SHL)
13569 return true;
13570 SDValue N1 = N->getOperand(0);
13571 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13572 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13573 return true;
13574 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13575 if (Const->getAPIntValue().ult(256))
13576 return false;
13577 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13578 Const->getAPIntValue().sgt(-256))
13579 return false;
13580 }
13581 return true;
13582 }
13583
13584 // Turn off commute-with-shift transform after legalization, so it doesn't
13585 // conflict with PerformSHLSimplify. (We could try to detect when
13586 // PerformSHLSimplify would trigger more precisely, but it isn't
13587 // really necessary.)
13588 return false;
13589}
13590
13591 bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13592 const SDNode *N) const {
13593 assert(N->getOpcode() == ISD::XOR &&
13594 (N->getOperand(0).getOpcode() == ISD::SHL ||
13595 N->getOperand(0).getOpcode() == ISD::SRL) &&
13596 "Expected XOR(SHIFT) pattern");
13597
13598 // Only commute if the entire NOT mask is a hidden shifted mask.
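// For example, for (xor (shl x, 8), 0xFFFFFF00) on i32, MaskIdx == 8 == ShiftAmt
// and MaskLen == 24 == BitWidth - ShiftAmt, so the commute is allowed.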
13599 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13600 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13601 if (XorC && ShiftC) {
13602 unsigned MaskIdx, MaskLen;
13603 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13604 unsigned ShiftAmt = ShiftC->getZExtValue();
13605 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13606 if (N->getOperand(0).getOpcode() == ISD::SHL)
13607 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13608 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13609 }
13610 }
13611
13612 return false;
13613}
13614
13615 bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13616 const SDNode *N) const {
13617 assert(((N->getOpcode() == ISD::SHL &&
13618 N->getOperand(0).getOpcode() == ISD::SRL) ||
13619 (N->getOpcode() == ISD::SRL &&
13620 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13621 "Expected shift-shift mask");
13622
13623 if (!Subtarget->isThumb1Only())
13624 return true;
13625
13626 EVT VT = N->getValueType(0);
13627 if (VT.getScalarSizeInBits() > 32)
13628 return true;
13629
13630 return false;
13631}
13632
13633 bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13634 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13635 SDValue Y) const {
13636 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13637 SelectOpcode == ISD::VSELECT;
13638}
13639
13640 bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13641 if (!Subtarget->hasNEON()) {
13642 if (Subtarget->isThumb1Only())
13643 return VT.getScalarSizeInBits() <= 32;
13644 return true;
13645 }
13646 return VT.isScalarInteger();
13647}
13648
13649 bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13650 EVT VT) const {
13651 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13652 return false;
13653
13654 switch (FPVT.getSimpleVT().SimpleTy) {
13655 case MVT::f16:
13656 return Subtarget->hasVFP2Base();
13657 case MVT::f32:
13658 return Subtarget->hasVFP2Base();
13659 case MVT::f64:
13660 return Subtarget->hasFP64();
13661 case MVT::v4f32:
13662 case MVT::v8f16:
13663 return Subtarget->hasMVEFloatOps();
13664 default:
13665 return false;
13666 }
13667}
13668
13669 static SDValue PerformSHLSimplify(SDNode *N,
13670 TargetLowering::DAGCombinerInfo &DCI,
13671 const ARMSubtarget *ST) {
13672 // Allow the generic combiner to identify potential bswaps.
13673 if (DCI.isBeforeLegalize())
13674 return SDValue();
13675
13676 // DAG combiner will fold:
13677 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13678 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2
13679 // Other code patterns that can also be modified have the following form:
13680 // b + ((a << 1) | 510)
13681 // b + ((a << 1) & 510)
13682 // b + ((a << 1) ^ 510)
13683 // b + ((a << 1) + 510)
13684
13685 // Many instructions can perform the shift for free, but it requires both
13686 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13687 // instruction will be needed. So, unfold back to the original pattern if:
13688 // - c1 and c2 are small enough that they don't require mov imms.
13689 // - the user(s) of the node can perform an shl
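// For example, (or (shl a, 1), 510) is unfolded back to (shl (or a, 255), 1), so
// that both constants stay within the 8-bit range the rotated-immediate
// encodings can represent.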
13690
13691 // No shifted operands for 16-bit instructions.
13692 if (ST->isThumb() && ST->isThumb1Only())
13693 return SDValue();
13694
13695 // Check that all the users could perform the shl themselves.
13696 for (auto *U : N->users()) {
13697 switch(U->getOpcode()) {
13698 default:
13699 return SDValue();
13700 case ISD::SUB:
13701 case ISD::ADD:
13702 case ISD::AND:
13703 case ISD::OR:
13704 case ISD::XOR:
13705 case ISD::SETCC:
13706 case ARMISD::CMP:
13707 // Check that the user isn't already using a constant because there
13708 // aren't any instructions that support an immediate operand and a
13709 // shifted operand.
13710 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13711 isa<ConstantSDNode>(U->getOperand(1)))
13712 return SDValue();
13713
13714 // Check that it's not already using a shift.
13715 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13716 U->getOperand(1).getOpcode() == ISD::SHL)
13717 return SDValue();
13718 break;
13719 }
13720 }
13721
13722 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13723 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13724 return SDValue();
13725
13726 if (N->getOperand(0).getOpcode() != ISD::SHL)
13727 return SDValue();
13728
13729 SDValue SHL = N->getOperand(0);
13730
13731 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13732 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13733 if (!C1ShlC2 || !C2)
13734 return SDValue();
13735
13736 APInt C2Int = C2->getAPIntValue();
13737 APInt C1Int = C1ShlC2->getAPIntValue();
13738 unsigned C2Width = C2Int.getBitWidth();
13739 if (C2Int.uge(C2Width))
13740 return SDValue();
13741 uint64_t C2Value = C2Int.getZExtValue();
13742
13743 // Check that performing a lshr will not lose any information.
13744 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13745 if ((C1Int & Mask) != C1Int)
13746 return SDValue();
13747
13748 // Shift the first constant.
13749 C1Int.lshrInPlace(C2Int);
13750
13751 // The immediates are encoded as an 8-bit value that can be rotated.
13752 auto LargeImm = [](const APInt &Imm) {
13753 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13754 return Imm.getBitWidth() - Zeros > 8;
13755 };
13756
13757 if (LargeImm(C1Int) || LargeImm(C2Int))
13758 return SDValue();
13759
13760 SelectionDAG &DAG = DCI.DAG;
13761 SDLoc dl(N);
13762 SDValue X = SHL.getOperand(0);
13763 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13764 DAG.getConstant(C1Int, dl, MVT::i32));
13765 // Shift left to compensate for the lshr of C1Int.
13766 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13767
13768 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13769 SHL.dump(); N->dump());
13770 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13771 return Res;
13772}
13773
13774
13775/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13776///
13777 static SDValue PerformADDCombine(SDNode *N,
13778 TargetLowering::DAGCombinerInfo &DCI,
13779 const ARMSubtarget *Subtarget) {
13780 SDValue N0 = N->getOperand(0);
13781 SDValue N1 = N->getOperand(1);
13782
13783 // Only works one way, because it needs an immediate operand.
13784 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13785 return Result;
13786
13787 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13788 return Result;
13789
13790 // First try with the default operand order.
13791 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13792 return Result;
13793
13794 // If that didn't work, try again with the operands commuted.
13795 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13796}
13797
13798// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13799// providing -X is as cheap as X (currently, just a constant).
13800 static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
13801 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13802 return SDValue();
13803 SDValue CSINC = N->getOperand(1);
13804 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
13805 return SDValue();
13806
13807 ConstantSDNode *X = dyn_cast<ConstantSDNode>(CSINC.getOperand(0));
13808 if (!X)
13809 return SDValue();
13810
13811 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
13812 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
13813 CSINC.getOperand(0)),
13814 CSINC.getOperand(1), CSINC.getOperand(2),
13815 CSINC.getOperand(3));
13816}
13817
13818 static bool isNegatedInteger(SDValue Op) {
13819 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
13820}
13821
13822// Try to fold
13823//
13824// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13825//
13826// The folding helps cmov to be matched with csneg without generating
13827// redundant neg instruction.
13828 static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
13829 if (!isNegatedInteger(SDValue(N, 0)))
13830 return SDValue();
13831
13832 SDValue CMov = N->getOperand(1);
13833 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
13834 return SDValue();
13835
13836 SDValue N0 = CMov.getOperand(0);
13837 SDValue N1 = CMov.getOperand(1);
13838
13839 // If neither of them are negations, it's not worth the folding as it
13840 // introduces two additional negations while reducing one negation.
13841 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
13842 return SDValue();
13843
13844 SDLoc DL(N);
13845 EVT VT = CMov.getValueType();
13846
13847 SDValue N0N = DAG.getNegative(N0, DL, VT);
13848 SDValue N1N = DAG.getNegative(N1, DL, VT);
13849 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
13850 CMov.getOperand(3));
13851}
13852
13853/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13854///
13855 static SDValue PerformSUBCombine(SDNode *N,
13856 TargetLowering::DAGCombinerInfo &DCI,
13857 const ARMSubtarget *Subtarget) {
13858 SDValue N0 = N->getOperand(0);
13859 SDValue N1 = N->getOperand(1);
13860
13861 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
13862 if (N1.getNode()->hasOneUse())
13863 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
13864 return Result;
13865
13866 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
13867 return R;
13868
13869 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
13870 return Val;
13871
13872 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
13873 return SDValue();
13874
13875 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
13876 // so that we can readily pattern match more mve instructions which can use
13877 // a scalar operand.
13878 SDValue VDup = N->getOperand(1);
13879 if (VDup->getOpcode() != ARMISD::VDUP)
13880 return SDValue();
13881
13882 SDValue VMov = N->getOperand(0);
13883 if (VMov->getOpcode() == ISD::BITCAST)
13884 VMov = VMov->getOperand(0);
13885
13886 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
13887 return SDValue();
13888
13889 SDLoc dl(N);
13890 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
13891 DCI.DAG.getConstant(0, dl, MVT::i32),
13892 VDup->getOperand(0));
13893 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
13894}
13895
13896/// PerformVMULCombine
13897/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
13898/// special multiplier accumulator forwarding.
13899/// vmul d3, d0, d2
13900/// vmla d3, d1, d2
13901/// is faster than
13902/// vadd d3, d0, d1
13903/// vmul d3, d3, d2
13904// However, for (A + B) * (A + B),
13905// vadd d2, d0, d1
13906// vmul d3, d0, d2
13907// vmla d3, d1, d2
13908// is slower than
13909// vadd d2, d0, d1
13910// vmul d3, d2, d2
13911 static SDValue PerformVMULCombine(SDNode *N,
13912 TargetLowering::DAGCombinerInfo &DCI,
13913 const ARMSubtarget *Subtarget) {
13914 if (!Subtarget->hasVMLxForwarding())
13915 return SDValue();
13916
13917 SelectionDAG &DAG = DCI.DAG;
13918 SDValue N0 = N->getOperand(0);
13919 SDValue N1 = N->getOperand(1);
13920 unsigned Opcode = N0.getOpcode();
13921 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13922 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
13923 Opcode = N1.getOpcode();
13924 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13925 Opcode != ISD::FADD && Opcode != ISD::FSUB)
13926 return SDValue();
13927 std::swap(N0, N1);
13928 }
13929
13930 if (N0 == N1)
13931 return SDValue();
13932
13933 EVT VT = N->getValueType(0);
13934 SDLoc DL(N);
13935 SDValue N00 = N0->getOperand(0);
13936 SDValue N01 = N0->getOperand(1);
13937 return DAG.getNode(Opcode, DL, VT,
13938 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
13939 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
13940}
13941
13942 static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
13943 const ARMSubtarget *Subtarget) {
13944 EVT VT = N->getValueType(0);
13945 if (VT != MVT::v2i64)
13946 return SDValue();
13947
13948 SDValue N0 = N->getOperand(0);
13949 SDValue N1 = N->getOperand(1);
13950
13951 auto IsSignExt = [&](SDValue Op) {
13952 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
13953 return SDValue();
13954 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
13955 if (VT.getScalarSizeInBits() == 32)
13956 return Op->getOperand(0);
13957 return SDValue();
13958 };
13959 auto IsZeroExt = [&](SDValue Op) {
13960 // Zero extends are a little more awkward. At the point we are matching
13961 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
13962 // That might be before or after a bitcast depending on how the and is
13963 // placed. Because this has to look through bitcasts, it is currently only
13964 // supported on LE.
13965 if (!Subtarget->isLittle())
13966 return SDValue();
13967
13968 SDValue And = Op;
13969 if (And->getOpcode() == ISD::BITCAST)
13970 And = And->getOperand(0);
13971 if (And->getOpcode() != ISD::AND)
13972 return SDValue();
13973 SDValue Mask = And->getOperand(1);
13974 if (Mask->getOpcode() == ISD::BITCAST)
13975 Mask = Mask->getOperand(0);
13976
13977 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
13978 Mask.getValueType() != MVT::v4i32)
13979 return SDValue();
13980 if (isAllOnesConstant(Mask->getOperand(0)) &&
13981 isNullConstant(Mask->getOperand(1)) &&
13982 isAllOnesConstant(Mask->getOperand(2)) &&
13983 isNullConstant(Mask->getOperand(3)))
13984 return And->getOperand(0);
13985 return SDValue();
13986 };
13987
13988 SDLoc dl(N);
13989 if (SDValue Op0 = IsSignExt(N0)) {
13990 if (SDValue Op1 = IsSignExt(N1)) {
13991 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
13992 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
13993 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
13994 }
13995 }
13996 if (SDValue Op0 = IsZeroExt(N0)) {
13997 if (SDValue Op1 = IsZeroExt(N1)) {
13998 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
13999 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14000 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14001 }
14002 }
14003
14004 return SDValue();
14005}
14006
14007 static SDValue PerformMULCombine(SDNode *N,
14008 TargetLowering::DAGCombinerInfo &DCI,
14009 const ARMSubtarget *Subtarget) {
14010 SelectionDAG &DAG = DCI.DAG;
14011
14012 EVT VT = N->getValueType(0);
14013 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14014 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14015
14016 if (Subtarget->isThumb1Only())
14017 return SDValue();
14018
14019 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14020 return SDValue();
14021
14022 if (VT.is64BitVector() || VT.is128BitVector())
14023 return PerformVMULCombine(N, DCI, Subtarget);
14024 if (VT != MVT::i32)
14025 return SDValue();
14026
14027 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14028 if (!C)
14029 return SDValue();
14030
14031 int64_t MulAmt = C->getSExtValue();
14032 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14033
14034 ShiftAmt = ShiftAmt & (32 - 1);
14035 SDValue V = N->getOperand(0);
14036 SDLoc DL(N);
14037
14038 SDValue Res;
14039 MulAmt >>= ShiftAmt;
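// For example, MulAmt == 20: the two trailing zero bits give ShiftAmt == 2, the
// remaining 5 == 2^2 + 1 becomes (add (shl x, 2), x), and the final shl by 2
// restores x*20.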
14040
14041 if (MulAmt >= 0) {
14042 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14043 // (mul x, 2^N + 1) => (add (shl x, N), x)
14044 Res = DAG.getNode(ISD::ADD, DL, VT,
14045 V,
14046 DAG.getNode(ISD::SHL, DL, VT,
14047 V,
14048 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14049 MVT::i32)));
14050 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14051 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14052 Res = DAG.getNode(ISD::SUB, DL, VT,
14053 DAG.getNode(ISD::SHL, DL, VT,
14054 V,
14055 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14056 MVT::i32)),
14057 V);
14058 } else
14059 return SDValue();
14060 } else {
14061 uint64_t MulAmtAbs = -MulAmt;
14062 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14063 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14064 Res = DAG.getNode(ISD::SUB, DL, VT,
14065 V,
14066 DAG.getNode(ISD::SHL, DL, VT,
14067 V,
14068 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14069 MVT::i32)));
14070 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14071 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14072 Res = DAG.getNode(ISD::ADD, DL, VT,
14073 V,
14074 DAG.getNode(ISD::SHL, DL, VT,
14075 V,
14076 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14077 MVT::i32)));
14078 Res = DAG.getNode(ISD::SUB, DL, VT,
14079 DAG.getConstant(0, DL, MVT::i32), Res);
14080 } else
14081 return SDValue();
14082 }
14083
14084 if (ShiftAmt != 0)
14085 Res = DAG.getNode(ISD::SHL, DL, VT,
14086 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14087
14088 // Do not add new nodes to DAG combiner worklist.
14089 DCI.CombineTo(N, Res, false);
14090 return SDValue();
14091}
14092
14093 static SDValue CombineANDShift(SDNode *N,
14094 TargetLowering::DAGCombinerInfo &DCI,
14095 const ARMSubtarget *Subtarget) {
14096 // Allow DAGCombine to pattern-match before we touch the canonical form.
14097 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14098 return SDValue();
14099
14100 if (N->getValueType(0) != MVT::i32)
14101 return SDValue();
14102
14103 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14104 if (!N1C)
14105 return SDValue();
14106
14107 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14108 // Don't transform uxtb/uxth.
14109 if (C1 == 255 || C1 == 65535)
14110 return SDValue();
14111
14112 SDNode *N0 = N->getOperand(0).getNode();
14113 if (!N0->hasOneUse())
14114 return SDValue();
14115
14116 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14117 return SDValue();
14118
14119 bool LeftShift = N0->getOpcode() == ISD::SHL;
14120
14121 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14122 if (!N01C)
14123 return SDValue();
14124
14125 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14126 if (!C2 || C2 >= 32)
14127 return SDValue();
14128
14129 // Clear irrelevant bits in the mask.
14130 if (LeftShift)
14131 C1 &= (-1U << C2);
14132 else
14133 C1 &= (-1U >> C2);
14134
14135 SelectionDAG &DAG = DCI.DAG;
14136 SDLoc DL(N);
14137
14138 // We have a pattern of the form "(and (shl x, c2) c1)" or
14139 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14140 // transform to a pair of shifts, to save materializing c1.
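// For example, (and (srl x, 3), 0x1F) becomes (srl (shl x, 24), 27), which
// extracts bits [7:3] without materializing the mask constant.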
14141
14142 // First pattern: right shift, then mask off leading bits.
14143 // FIXME: Use demanded bits?
14144 if (!LeftShift && isMask_32(C1)) {
14145 uint32_t C3 = llvm::countl_zero(C1);
14146 if (C2 < C3) {
14147 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14148 DAG.getConstant(C3 - C2, DL, MVT::i32));
14149 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14150 DAG.getConstant(C3, DL, MVT::i32));
14151 }
14152 }
14153
14154 // First pattern, reversed: left shift, then mask off trailing bits.
14155 if (LeftShift && isMask_32(~C1)) {
14156 uint32_t C3 = llvm::countr_zero(C1);
14157 if (C2 < C3) {
14158 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14159 DAG.getConstant(C3 - C2, DL, MVT::i32));
14160 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14161 DAG.getConstant(C3, DL, MVT::i32));
14162 }
14163 }
14164
14165 // Second pattern: left shift, then mask off leading bits.
14166 // FIXME: Use demanded bits?
14167 if (LeftShift && isShiftedMask_32(C1)) {
14168 uint32_t Trailing = llvm::countr_zero(C1);
14169 uint32_t C3 = llvm::countl_zero(C1);
14170 if (Trailing == C2 && C2 + C3 < 32) {
14171 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14172 DAG.getConstant(C2 + C3, DL, MVT::i32));
14173 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14174 DAG.getConstant(C3, DL, MVT::i32));
14175 }
14176 }
14177
14178 // Second pattern, reversed: right shift, then mask off trailing bits.
14179 // FIXME: Handle other patterns of known/demanded bits.
14180 if (!LeftShift && isShiftedMask_32(C1)) {
14181 uint32_t Leading = llvm::countl_zero(C1);
14182 uint32_t C3 = llvm::countr_zero(C1);
14183 if (Leading == C2 && C2 + C3 < 32) {
14184 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14185 DAG.getConstant(C2 + C3, DL, MVT::i32));
14186 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14187 DAG.getConstant(C3, DL, MVT::i32));
14188 }
14189 }
14190
14191 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14192 // if "c1 >> c2" is a cheaper immediate than "c1"
14193 if (LeftShift &&
14194 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14195
14196 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14197 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14198 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14199 DAG.getConstant(C2, DL, MVT::i32));
14200 }
14201
14202 return SDValue();
14203}
14204
14205 static SDValue PerformANDCombine(SDNode *N,
14206 TargetLowering::DAGCombinerInfo &DCI,
14207 const ARMSubtarget *Subtarget) {
14208 // Attempt to use immediate-form VBIC
14209 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14210 SDLoc dl(N);
14211 EVT VT = N->getValueType(0);
14212 SelectionDAG &DAG = DCI.DAG;
14213
14214 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14215 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14216 return SDValue();
14217
14218 APInt SplatBits, SplatUndef;
14219 unsigned SplatBitSize;
14220 bool HasAnyUndefs;
14221 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14222 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14223 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14224 SplatBitSize == 64) {
14225 EVT VbicVT;
14226 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14227 SplatUndef.getZExtValue(), SplatBitSize,
14228 DAG, dl, VbicVT, VT, OtherModImm);
14229 if (Val.getNode()) {
14230 SDValue Input =
14231 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14232 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14233 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14234 }
14235 }
14236 }
14237
14238 if (!Subtarget->isThumb1Only()) {
14239 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14240 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14241 return Result;
14242
14243 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14244 return Result;
14245 }
14246
14247 if (Subtarget->isThumb1Only())
14248 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14249 return Result;
14250
14251 return SDValue();
14252}
14253
14254// Try combining OR nodes to SMULWB, SMULWT.
14255 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14256 TargetLowering::DAGCombinerInfo &DCI,
14257 const ARMSubtarget *Subtarget) {
14258 if (!Subtarget->hasV6Ops() ||
14259 (Subtarget->isThumb() &&
14260 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14261 return SDValue();
14262
14263 SDValue SRL = OR->getOperand(0);
14264 SDValue SHL = OR->getOperand(1);
14265
14266 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14267 SRL = OR->getOperand(1);
14268 SHL = OR->getOperand(0);
14269 }
14270 if (!isSRL16(SRL) || !isSHL16(SHL))
14271 return SDValue();
14272
14273 // The first operands to the shifts need to be the two results from the
14274 // same smul_lohi node.
14275 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14276 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14277 return SDValue();
14278
14279 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14280 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14281 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14282 return SDValue();
14283
14284 // Now we have:
14285 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14286 // For SMULW[B|T], smul_lohi takes a 32-bit and a 16-bit argument.
14287 // For SMULWB the 16-bit value will be sign extended somehow.
14288 // For SMULWT only the SRA is required.
14289 // Check both sides of SMUL_LOHI
14290 SDValue OpS16 = SMULLOHI->getOperand(0);
14291 SDValue OpS32 = SMULLOHI->getOperand(1);
14292
14293 SelectionDAG &DAG = DCI.DAG;
14294 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14295 OpS16 = OpS32;
14296 OpS32 = SMULLOHI->getOperand(0);
14297 }
14298
14299 SDLoc dl(OR);
14300 unsigned Opcode = 0;
14301 if (isS16(OpS16, DAG))
14302 Opcode = ARMISD::SMULWB;
14303 else if (isSRA16(OpS16)) {
14304 Opcode = ARMISD::SMULWT;
14305 OpS16 = OpS16->getOperand(0);
14306 }
14307 else
14308 return SDValue();
14309
14310 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14311 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14312 return SDValue(OR, 0);
14313}
14314
14315 static SDValue PerformORCombineToBFI(SDNode *N,
14316 TargetLowering::DAGCombinerInfo &DCI,
14317 const ARMSubtarget *Subtarget) {
14318 // BFI is only available on V6T2+
14319 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14320 return SDValue();
14321
14322 EVT VT = N->getValueType(0);
14323 SDValue N0 = N->getOperand(0);
14324 SDValue N1 = N->getOperand(1);
14325 SelectionDAG &DAG = DCI.DAG;
14326 SDLoc DL(N);
14327 // 1) or (and A, mask), val => ARMbfi A, val, mask
14328 // iff (val & mask) == val
14329 //
14330 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14331 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14332 // && mask == ~mask2
14333 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14334 // && ~mask == mask2
14335 // (i.e., copy a bitfield value into another bitfield of the same width)
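// For example, case (1): (or (and A, 0xFFFF00FF), 0x4200) becomes
// (ARMbfi A, 0x42, 0xFFFF00FF), inserting 0x42 into bits [15:8] of A.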
14336
14337 if (VT != MVT::i32)
14338 return SDValue();
14339
14340 SDValue N00 = N0.getOperand(0);
14341
14342 // The value and the mask need to be constants so we can verify this is
14343 // actually a bitfield set. If the mask is 0xffff, we can do better
14344 // via a movt instruction, so don't use BFI in that case.
14345 SDValue MaskOp = N0.getOperand(1);
14346 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14347 if (!MaskC)
14348 return SDValue();
14349 unsigned Mask = MaskC->getZExtValue();
14350 if (Mask == 0xffff)
14351 return SDValue();
14352 SDValue Res;
14353 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14354 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14355 if (N1C) {
14356 unsigned Val = N1C->getZExtValue();
14357 if ((Val & ~Mask) != Val)
14358 return SDValue();
14359
14360 if (ARM::isBitFieldInvertedMask(Mask)) {
14361 Val >>= llvm::countr_zero(~Mask);
14362
14363 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14364 DAG.getConstant(Val, DL, MVT::i32),
14365 DAG.getConstant(Mask, DL, MVT::i32));
14366
14367 DCI.CombineTo(N, Res, false);
14368 // Return value from the original node to inform the combiner that N is
14369 // now dead.
14370 return SDValue(N, 0);
14371 }
14372 } else if (N1.getOpcode() == ISD::AND) {
14373 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14374 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14375 if (!N11C)
14376 return SDValue();
14377 unsigned Mask2 = N11C->getZExtValue();
14378
14379 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14380 // as is to match.
14381 if (ARM::isBitFieldInvertedMask(Mask) &&
14382 (Mask == ~Mask2)) {
14383 // The pack halfword instruction works better for masks that fit it,
14384 // so use that when it's available.
14385 if (Subtarget->hasDSP() &&
14386 (Mask == 0xffff || Mask == 0xffff0000))
14387 return SDValue();
14388 // 2a
14389 unsigned amt = llvm::countr_zero(Mask2);
14390 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14391 DAG.getConstant(amt, DL, MVT::i32));
14392 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14393 DAG.getConstant(Mask, DL, MVT::i32));
14394 DCI.CombineTo(N, Res, false);
14395 // Return value from the original node to inform the combiner that N is
14396 // now dead.
14397 return SDValue(N, 0);
14398 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14399 (~Mask == Mask2)) {
14400 // The pack halfword instruction works better for masks that fit it,
14401 // so use that when it's available.
14402 if (Subtarget->hasDSP() &&
14403 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14404 return SDValue();
14405 // 2b
14406 unsigned lsb = llvm::countr_zero(Mask);
14407 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14408 DAG.getConstant(lsb, DL, MVT::i32));
14409 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14410 DAG.getConstant(Mask2, DL, MVT::i32));
14411 DCI.CombineTo(N, Res, false);
14412 // Return value from the original node to inform the combiner that N is
14413 // now dead.
14414 return SDValue(N, 0);
14415 }
14416 }
14417
14418 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14419 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14420 ARM::isBitFieldInvertedMask(~Mask)) {
14421 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14422 // where lsb(mask) == #shamt and masked bits of B are known zero.
14423 SDValue ShAmt = N00.getOperand(1);
14424 unsigned ShAmtC = ShAmt->getAsZExtVal();
14425 unsigned LSB = llvm::countr_zero(Mask);
14426 if (ShAmtC != LSB)
14427 return SDValue();
14428
14429 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14430 DAG.getConstant(~Mask, DL, MVT::i32));
14431
14432 DCI.CombineTo(N, Res, false);
14433 // Return value from the original node to inform the combiner that N is
14434 // now dead.
14435 return SDValue(N, 0);
14436 }
14437
14438 return SDValue();
14439}
14440
14441static bool isValidMVECond(unsigned CC, bool IsFloat) {
14442 switch (CC) {
14443 case ARMCC::EQ:
14444 case ARMCC::NE:
14445 case ARMCC::LE:
14446 case ARMCC::GT:
14447 case ARMCC::GE:
14448 case ARMCC::LT:
14449 return true;
14450 case ARMCC::HS:
14451 case ARMCC::HI:
14452 return !IsFloat;
14453 default:
14454 return false;
14455 };
14456}
14457
14458 static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14459 if (N->getOpcode() == ARMISD::VCMP)
14460 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14461 else if (N->getOpcode() == ARMISD::VCMPZ)
14462 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14463 else
14464 llvm_unreachable("Not a VCMP/VCMPZ!");
14465}
14466
14467 static bool CanInvertMVEVCMP(SDValue N) {
14468 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14469 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14470}
14471
14472 static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14473 const ARMSubtarget *Subtarget) {
14474 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14475 // together with predicates
14476 EVT VT = N->getValueType(0);
14477 SDLoc DL(N);
14478 SDValue N0 = N->getOperand(0);
14479 SDValue N1 = N->getOperand(1);
14480
14481 auto IsFreelyInvertable = [&](SDValue V) {
14482 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14483 return CanInvertMVEVCMP(V);
14484 return false;
14485 };
14486
14487 // At least one operand must be freely invertable.
14488 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14489 return SDValue();
14490
14491 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14492 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14493 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14494 return DAG.getLogicalNOT(DL, And, VT);
14495}
14496
14497/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14498 static SDValue PerformORCombine(SDNode *N,
14499 TargetLowering::DAGCombinerInfo &DCI,
14500 const ARMSubtarget *Subtarget) {
14501 // Attempt to use immediate-form VORR
14502 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14503 SDLoc dl(N);
14504 EVT VT = N->getValueType(0);
14505 SelectionDAG &DAG = DCI.DAG;
14506
14507 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14508 return SDValue();
14509
14510 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14511 VT == MVT::v8i1 || VT == MVT::v16i1))
14512 return PerformORCombine_i1(N, DAG, Subtarget);
14513
14514 APInt SplatBits, SplatUndef;
14515 unsigned SplatBitSize;
14516 bool HasAnyUndefs;
14517 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14518 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14519 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14520 SplatBitSize == 64) {
14521 EVT VorrVT;
14522 SDValue Val =
14523 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14524 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14525 if (Val.getNode()) {
14526 SDValue Input =
14527 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14528 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14529 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14530 }
14531 }
14532 }
14533
14534 if (!Subtarget->isThumb1Only()) {
14535 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14536 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14537 return Result;
14538 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14539 return Result;
14540 }
14541
14542 SDValue N0 = N->getOperand(0);
14543 SDValue N1 = N->getOperand(1);
14544
14545 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
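// For example, with a splatted constant A = 0xFF00FF00, the resulting VBSP
// takes the bytes of B where A's bits are set (bytes 1 and 3 of each 32-bit
// lane) and the remaining bytes from C.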
14546 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14547 DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
14548
14549 // The code below optimizes (or (and X, Y), Z).
14550 // The AND operand needs to have a single user to make these optimizations
14551 // profitable.
14552 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14553 return SDValue();
14554
14555 APInt SplatUndef;
14556 unsigned SplatBitSize;
14557 bool HasAnyUndefs;
14558
14559 APInt SplatBits0, SplatBits1;
14560 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14561 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14562 // Ensure that the second operand of both ands are constants
14563 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14564 HasAnyUndefs) && !HasAnyUndefs) {
14565 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14566 HasAnyUndefs) && !HasAnyUndefs) {
14567 // Ensure that the bit width of the constants are the same and that
14568 // the splat arguments are logical inverses as per the pattern we
14569 // are trying to simplify.
14570 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14571 SplatBits0 == ~SplatBits1) {
14572 // Canonicalize the vector type to make instruction selection
14573 // simpler.
14574 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14575 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14576 N0->getOperand(1),
14577 N0->getOperand(0),
14578 N1->getOperand(0));
14579 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14580 }
14581 }
14582 }
14583 }
14584
14585 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14586 // reasonable.
14587 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14588 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14589 return Res;
14590 }
14591
14592 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14593 return Result;
14594
14595 return SDValue();
14596}
14597
14598 static SDValue PerformXORCombine(SDNode *N,
14599 TargetLowering::DAGCombinerInfo &DCI,
14600 const ARMSubtarget *Subtarget) {
14601 EVT VT = N->getValueType(0);
14602 SelectionDAG &DAG = DCI.DAG;
14603
14604 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14605 return SDValue();
14606
14607 if (!Subtarget->isThumb1Only()) {
14608 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14609 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14610 return Result;
14611
14612 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14613 return Result;
14614 }
14615
14616 if (Subtarget->hasMVEIntegerOps()) {
14617 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
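// e.g. (xor (VCMP x, y, eq), (all-ones predicate)) becomes (VCMP x, y, ne).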
14618 SDValue N0 = N->getOperand(0);
14619 SDValue N1 = N->getOperand(1);
14620 const TargetLowering *TLI = Subtarget->getTargetLowering();
14621 if (TLI->isConstTrueVal(N1) &&
14622 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14623 if (CanInvertMVEVCMP(N0)) {
14624 SDLoc DL(N0);
14625 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14626
14627 SmallVector<SDValue, 4> Ops;
14628 Ops.push_back(N0->getOperand(0));
14629 if (N0->getOpcode() == ARMISD::VCMP)
14630 Ops.push_back(N0->getOperand(1));
14631 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14632 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14633 }
14634 }
14635 }
14636
14637 return SDValue();
14638}
14639
14640// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14641// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14642// their position in "to" (Rd).
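// For example, for (BFI Rd, Rn, 0xffff00ff) the inserted field is bits 8-15,
// so ToMask = 0x0000ff00 and FromMask = 0x000000ff; if Rn was (srl X, 8),
// FromMask is shifted up to 0x0000ff00 and X is returned instead of Rn.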
14643static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14644 assert(N->getOpcode() == ARMISD::BFI);
14645
14646 SDValue From = N->getOperand(1);
14647 ToMask = ~N->getConstantOperandAPInt(2);
14648 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14649
14650 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14651 // #C in the base of the SHR.
14652 if (From->getOpcode() == ISD::SRL &&
14653 isa<ConstantSDNode>(From->getOperand(1))) {
14654 APInt Shift = From->getConstantOperandAPInt(1);
14655 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14656 FromMask <<= Shift.getLimitedValue(31);
14657 From = From->getOperand(0);
14658 }
14659
14660 return From;
14661}
14662
14663// If A and B contain one contiguous set of bits, does A | B == A . B?
14664//
14665 // Neither A nor B may be zero.
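// For example, A = 0x000000f0 (bits 4-7) and B = 0x0000000f (bits 0-3)
// concatenate properly: the lowest set bit of A (4) sits directly above the
// highest set bit of B (3).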
14666static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14667 unsigned LastActiveBitInA = A.countr_zero();
14668 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14669 return LastActiveBitInA - 1 == FirstActiveBitInB;
14670}
14671
14672 static SDValue FindBFIToCombineWith(SDNode *N) {
14673 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14674 APInt ToMask, FromMask;
14675 SDValue From = ParseBFI(N, ToMask, FromMask);
14676 SDValue To = N->getOperand(0);
14677
14678 SDValue V = To;
14679 if (V.getOpcode() != ARMISD::BFI)
14680 return SDValue();
14681
14682 APInt NewToMask, NewFromMask;
14683 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14684 if (NewFrom != From)
14685 return SDValue();
14686
14687 // Do the written bits conflict with any we've seen so far?
14688 if ((NewToMask & ToMask).getBoolValue())
14689 // Conflicting bits.
14690 return SDValue();
14691
14692 // Are the new bits contiguous when combined with the old bits?
14693 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14694 BitsProperlyConcatenate(FromMask, NewFromMask))
14695 return V;
14696 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14697 BitsProperlyConcatenate(NewFromMask, FromMask))
14698 return V;
14699
14700 return SDValue();
14701}
14702
14703 static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14704 SDValue N0 = N->getOperand(0);
14705 SDValue N1 = N->getOperand(1);
14706
14707 if (N1.getOpcode() == ISD::AND) {
14708 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14709 // the bits being cleared by the AND are not demanded by the BFI.
14710 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14711 if (!N11C)
14712 return SDValue();
14713 unsigned InvMask = N->getConstantOperandVal(2);
14714 unsigned LSB = llvm::countr_zero(~InvMask);
14715 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14716 assert(Width <
14717 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14718 "undefined behavior");
14719 unsigned Mask = (1u << Width) - 1;
14720 unsigned Mask2 = N11C->getZExtValue();
14721 if ((Mask & (~Mask2)) == 0)
14722 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14723 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14724 return SDValue();
14725 }
14726
14727 // Look for another BFI to combine with.
14728 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14729 // We've found a BFI.
14730 APInt ToMask1, FromMask1;
14731 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14732
14733 APInt ToMask2, FromMask2;
14734 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14735 assert(From1 == From2);
14736 (void)From2;
14737
14738 // Create a new BFI, combining the two together.
14739 APInt NewFromMask = FromMask1 | FromMask2;
14740 APInt NewToMask = ToMask1 | ToMask2;
14741
14742 EVT VT = N->getValueType(0);
14743 SDLoc dl(N);
14744
14745 if (NewFromMask[0] == 0)
14746 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14747 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14748 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14749 DAG.getConstant(~NewToMask, dl, VT));
14750 }
14751
14752 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14753 // that lower bit insertions are performed first, provided that M1 and M2
14754 // do not overlap. This can allow multiple BFI instructions to be combined
14755 // together by the other folds above.
14756 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14757 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14758 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14759
14760 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14761 ToMask1.countl_zero() < ToMask2.countl_zero())
14762 return SDValue();
14763
14764 EVT VT = N->getValueType(0);
14765 SDLoc dl(N);
14766 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14767 N->getOperand(1), N->getOperand(2));
14768 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14769 N0.getOperand(2));
14770 }
14771
14772 return SDValue();
14773}
14774
14775// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14776// or CMPZ(CMOV(1, 0, CC, X))
14777// return X if valid.
14778 static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14779 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14780 return SDValue();
14781 SDValue CSInc = Cmp->getOperand(0);
14782
14783 // Ignore any `And 1` nodes that may not yet have been removed. We are
14784 // looking for a value that produces 1/0, so these have no effect on the
14785 // code.
14786 while (CSInc.getOpcode() == ISD::AND &&
14787 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14788 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14789 CSInc = CSInc.getOperand(0);
14790
14791 if (CSInc.getOpcode() == ARMISD::CSINC &&
14792 isNullConstant(CSInc.getOperand(0)) &&
14793 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14794 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14795 return CSInc.getOperand(3);
14796 }
14797 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
14798 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14799 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14800 return CSInc.getOperand(3);
14801 }
14802 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
14803 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
14804 CC = ARMCC::getOppositeCondition(
14805 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
14806 return CSInc.getOperand(3);
14807 }
14808 return SDValue();
14809}
14810
14811 static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
14812 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
14813 // t92: flags = ARMISD::CMPZ t74, 0
14814 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
14815 // t96: flags = ARMISD::CMPZ t93, 0
14816 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
14817 ARMCC::CondCodes Cond;
14818 if (SDValue C = IsCMPZCSINC(N, Cond))
14819 if (Cond == ARMCC::EQ)
14820 return C;
14821 return SDValue();
14822}
14823
14824 static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
14825 // Fold away an unnecessary CMPZ/CSINC
14826 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
14827 // if C1==EQ -> CSXYZ A, B, C2, D
14828 // if C1==NE -> CSXYZ A, B, NOT(C2), D
14829 ARMCC::CondCodes Cond;
14830 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
14831 if (N->getConstantOperandVal(2) == ARMCC::EQ)
14832 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14833 N->getOperand(1),
14834 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
14835 if (N->getConstantOperandVal(2) == ARMCC::NE)
14836 return DAG.getNode(
14837 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14838 N->getOperand(1),
14839 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
14840 }
14841 return SDValue();
14842}
14843
14844/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
14845/// ARMISD::VMOVRRD.
14846 static SDValue PerformVMOVRRDCombine(SDNode *N,
14847 TargetLowering::DAGCombinerInfo &DCI,
14848 const ARMSubtarget *Subtarget) {
14849 // vmovrrd(vmovdrr x, y) -> x,y
14850 SDValue InDouble = N->getOperand(0);
14851 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
14852 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
14853
14854 // vmovrrd(load f64) -> (load i32), (load i32)
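// i.e. a single f64 load from a frame slot becomes two i32 loads at offsets
// 0 and +4 from the same base; on big-endian targets the two results are
// swapped so the GPR halves still come out in the expected order.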
14855 SDNode *InNode = InDouble.getNode();
14856 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
14857 InNode->getValueType(0) == MVT::f64 &&
14858 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
14859 !cast<LoadSDNode>(InNode)->isVolatile()) {
14860 // TODO: Should this be done for non-FrameIndex operands?
14861 LoadSDNode *LD = cast<LoadSDNode>(InNode);
14862
14863 SelectionDAG &DAG = DCI.DAG;
14864 SDLoc DL(LD);
14865 SDValue BasePtr = LD->getBasePtr();
14866 SDValue NewLD1 =
14867 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
14868 LD->getAlign(), LD->getMemOperand()->getFlags());
14869
14870 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14871 DAG.getConstant(4, DL, MVT::i32));
14872
14873 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
14874 LD->getPointerInfo().getWithOffset(4),
14875 commonAlignment(LD->getAlign(), 4),
14876 LD->getMemOperand()->getFlags());
14877
14878 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
14879 if (DCI.DAG.getDataLayout().isBigEndian())
14880 std::swap (NewLD1, NewLD2);
14881 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
14882 return Result;
14883 }
14884
14885 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
14886 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
14887 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14888 isa<ConstantSDNode>(InDouble.getOperand(1))) {
14889 SDValue BV = InDouble.getOperand(0);
14890 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
14891 // change lane order under big endian.
14892 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
14893 while (
14894 (BV.getOpcode() == ISD::BITCAST ||
14895 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
14896 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
14897 BVSwap = BV.getOpcode() == ISD::BITCAST;
14898 BV = BV.getOperand(0);
14899 }
14900 if (BV.getValueType() != MVT::v4i32)
14901 return SDValue();
14902
14903 // Handle buildvectors, pulling out the correct lane depending on
14904 // endianness.
14905 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
14906 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14907 SDValue Op0 = BV.getOperand(Offset);
14908 SDValue Op1 = BV.getOperand(Offset + 1);
14909 if (!Subtarget->isLittle() && BVSwap)
14910 std::swap(Op0, Op1);
14911
14912 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14913 }
14914
14915 // A chain of insert_vectors, grabbing the correct value of the chain of
14916 // inserts.
14917 SDValue Op0, Op1;
14918 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14919 if (isa<ConstantSDNode>(BV.getOperand(2))) {
14920 if (BV.getConstantOperandVal(2) == Offset && !Op0)
14921 Op0 = BV.getOperand(1);
14922 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
14923 Op1 = BV.getOperand(1);
14924 }
14925 BV = BV.getOperand(0);
14926 }
14927 if (!Subtarget->isLittle() && BVSwap)
14928 std::swap(Op0, Op1);
14929 if (Op0 && Op1)
14930 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14931 }
14932
14933 return SDValue();
14934}
14935
14936/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
14937/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
14938 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
14939 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
14940 SDValue Op0 = N->getOperand(0);
14941 SDValue Op1 = N->getOperand(1);
14942 if (Op0.getOpcode() == ISD::BITCAST)
14943 Op0 = Op0.getOperand(0);
14944 if (Op1.getOpcode() == ISD::BITCAST)
14945 Op1 = Op1.getOperand(0);
14946 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
14947 Op0.getNode() == Op1.getNode() &&
14948 Op0.getResNo() == 0 && Op1.getResNo() == 1)
14949 return DAG.getNode(ISD::BITCAST, SDLoc(N),
14950 N->getValueType(0), Op0.getOperand(0));
14951 return SDValue();
14952}
14953
14954 static SDValue PerformVMOVhrCombine(SDNode *N,
14955 TargetLowering::DAGCombinerInfo &DCI) {
14956 SDValue Op0 = N->getOperand(0);
14957
14958 // VMOVhr (VMOVrh (X)) -> X
14959 if (Op0->getOpcode() == ARMISD::VMOVrh)
14960 return Op0->getOperand(0);
14961
14962 // FullFP16: half values are passed in S-registers, and we don't
14963 // need any of the bitcast and moves:
14964 //
14965 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
14966 // t5: i32 = bitcast t2
14967 // t18: f16 = ARMISD::VMOVhr t5
14968 // =>
14969 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
14970 if (Op0->getOpcode() == ISD::BITCAST) {
14971 SDValue Copy = Op0->getOperand(0);
14972 if (Copy.getValueType() == MVT::f32 &&
14973 Copy->getOpcode() == ISD::CopyFromReg) {
14974 bool HasGlue = Copy->getNumOperands() == 3;
14975 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
14976 HasGlue ? Copy->getOperand(2) : SDValue()};
14977 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
14978 SDValue NewCopy =
14979 DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(Copy),
14980 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
14981 ArrayRef(Ops, HasGlue ? 3 : 2));
14982
14983 // Update Users, Chains, and Potential Glue.
14984 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
14985 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
14986 if (HasGlue)
14987 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
14988 NewCopy.getValue(2));
14989
14990 return NewCopy;
14991 }
14992 }
14993
14994 // fold (VMOVhr (load x)) -> (load (f16*)x)
14995 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
14996 if (LN0->hasOneUse() && LN0->isUnindexed() &&
14997 LN0->getMemoryVT() == MVT::i16) {
14998 SDValue Load =
14999 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15000 LN0->getBasePtr(), LN0->getMemOperand());
15001 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15002 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15003 return Load;
15004 }
15005 }
15006
15007 // Only the bottom 16 bits of the source register are used.
15008 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15009 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15010 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15011 return SDValue(N, 0);
15012
15013 return SDValue();
15014}
15015
15016 static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15017 SDValue N0 = N->getOperand(0);
15018 EVT VT = N->getValueType(0);
15019
15020 // fold (VMOVrh (fpconst x)) -> const x
15021 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15022 APFloat V = C->getValueAPF();
15023 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15024 }
15025
15026 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15027 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15028 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15029
15030 SDValue Load =
15031 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15032 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15033 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15034 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15035 return Load;
15036 }
15037
15038 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15039 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15040 isa<ConstantSDNode>(N0->getOperand(1)))
15041 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15042 N0->getOperand(1));
15043
15044 return SDValue();
15045}
15046
15047/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15048/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15049/// i64 vector to have f64 elements, since the value can then be loaded
15050/// directly into a VFP register.
15051 static bool hasNormalLoadOperand(SDNode *N) {
15052 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15053 for (unsigned i = 0; i < NumElts; ++i) {
15054 SDNode *Elt = N->getOperand(i).getNode();
15055 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15056 return true;
15057 }
15058 return false;
15059}
15060
15061/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15062/// ISD::BUILD_VECTOR.
15063 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15064 TargetLowering::DAGCombinerInfo &DCI,
15065 const ARMSubtarget *Subtarget) {
15066 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15067 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15068 // into a pair of GPRs, which is fine when the value is used as a scalar,
15069 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15070 SelectionDAG &DAG = DCI.DAG;
15071 if (N->getNumOperands() == 2)
15072 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15073 return RV;
15074
15075 // Load i64 elements as f64 values so that type legalization does not split
15076 // them up into i32 values.
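// e.g. (v2i64 build_vector (i64 load p), (i64 load q)) is rebuilt as
// (bitcast (v2f64 build_vector (f64 bitcast ...), (f64 bitcast ...))), and
// the DAGCombiner can then fold each bitcast-of-load into an f64 load.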
15077 EVT VT = N->getValueType(0);
15078 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15079 return SDValue();
15080 SDLoc dl(N);
15081 SmallVector<SDValue, 8> Ops;
15082 unsigned NumElts = VT.getVectorNumElements();
15083 for (unsigned i = 0; i < NumElts; ++i) {
15084 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15085 Ops.push_back(V);
15086 // Make the DAGCombiner fold the bitcast.
15087 DCI.AddToWorklist(V.getNode());
15088 }
15089 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15090 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15091 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15092}
15093
15094/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15095static SDValue
15096 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15097 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15098 // At that time, we may have inserted bitcasts from integer to float.
15099 // If these bitcasts have survived DAGCombine, change the lowering of this
15100 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15101 // force to use floating point types.
15102
15103 // Make sure we can change the type of the vector.
15104 // This is possible iff:
15105 // 1. The vector is only used in a bitcast to a integer type. I.e.,
15106 // 1.1. Vector is used only once.
15107 // 1.2. Use is a bit convert to an integer type.
15108 // 2. The size of its operands are 32-bits (64-bits are not legal).
15109 EVT VT = N->getValueType(0);
15110 EVT EltVT = VT.getVectorElementType();
15111
15112 // Check 1.1. and 2.
15113 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15114 return SDValue();
15115
15116 // By construction, the input type must be float.
15117 assert(EltVT == MVT::f32 && "Unexpected type!");
15118
15119 // Check 1.2.
15120 SDNode *Use = *N->user_begin();
15121 if (Use->getOpcode() != ISD::BITCAST ||
15122 Use->getValueType(0).isFloatingPoint())
15123 return SDValue();
15124
15125 // Check profitability.
15126 // Model is, if more than half of the relevant operands are bitcast from
15127 // i32, turn the build_vector into a sequence of insert_vector_elt.
15128 // Relevant operands are everything that is not statically
15129 // (i.e., at compile time) bitcasted.
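// For example, in a 4-element build_vector with one constant lane and three
// lanes bitcast from i32, NumOfRelevantElts is 3 and NumOfBitCastedElts is 3,
// so the transformation is applied.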
15130 unsigned NumOfBitCastedElts = 0;
15131 unsigned NumElts = VT.getVectorNumElements();
15132 unsigned NumOfRelevantElts = NumElts;
15133 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15134 SDValue Elt = N->getOperand(Idx);
15135 if (Elt->getOpcode() == ISD::BITCAST) {
15136 // Assume only bit cast to i32 will go away.
15137 if (Elt->getOperand(0).getValueType() == MVT::i32)
15138 ++NumOfBitCastedElts;
15139 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15140 // Constants are statically casted, thus do not count them as
15141 // relevant operands.
15142 --NumOfRelevantElts;
15143 }
15144
15145 // Check if more than half of the elements require a non-free bitcast.
15146 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15147 return SDValue();
15148
15149 SelectionDAG &DAG = DCI.DAG;
15150 // Create the new vector type.
15151 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15152 // Check if the type is legal.
15153 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15154 if (!TLI.isTypeLegal(VecVT))
15155 return SDValue();
15156
15157 // Combine:
15158 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15159 // => BITCAST INSERT_VECTOR_ELT
15160 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15161 // (BITCAST EN), N.
15162 SDValue Vec = DAG.getUNDEF(VecVT);
15163 SDLoc dl(N);
15164 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15165 SDValue V = N->getOperand(Idx);
15166 if (V.isUndef())
15167 continue;
15168 if (V.getOpcode() == ISD::BITCAST &&
15169 V->getOperand(0).getValueType() == MVT::i32)
15170 // Fold obvious case.
15171 V = V.getOperand(0);
15172 else {
15173 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15174 // Make the DAGCombiner fold the bitcasts.
15175 DCI.AddToWorklist(V.getNode());
15176 }
15177 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15178 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15179 }
15180 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15181 // Make the DAGCombiner fold the bitcasts.
15182 DCI.AddToWorklist(Vec.getNode());
15183 return Vec;
15184}
15185
15186static SDValue
15187 PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15188 EVT VT = N->getValueType(0);
15189 SDValue Op = N->getOperand(0);
15190 SDLoc dl(N);
15191
15192 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15193 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15194 // If the valuetypes are the same, we can remove the cast entirely.
15195 if (Op->getOperand(0).getValueType() == VT)
15196 return Op->getOperand(0);
15197 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15198 }
15199
15200 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15201 // more VPNOT which might get folded as else predicates.
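// The predicate register only carries 16 meaningful bits, so the all-ones
// value on the i32 side is materialised as 65535 (0xffff) before the cast.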
15202 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15203 SDValue X =
15204 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15205 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15206 DCI.DAG.getConstant(65535, dl, MVT::i32));
15207 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15208 }
15209
15210 // Only the bottom 16 bits of the source register are used.
15211 if (Op.getValueType() == MVT::i32) {
15212 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15213 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15214 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15215 return SDValue(N, 0);
15216 }
15217 return SDValue();
15218}
15219
15220 static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15221 const ARMSubtarget *ST) {
15222 EVT VT = N->getValueType(0);
15223 SDValue Op = N->getOperand(0);
15224 SDLoc dl(N);
15225
15226 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15227 if (ST->isLittle())
15228 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15229
15230 // VT VECTOR_REG_CAST (VT Op) -> Op
15231 if (Op.getValueType() == VT)
15232 return Op;
15233 // VECTOR_REG_CAST undef -> undef
15234 if (Op.isUndef())
15235 return DAG.getUNDEF(VT);
15236
15237 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15238 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15239 // If the valuetypes are the same, we can remove the cast entirely.
15240 if (Op->getOperand(0).getValueType() == VT)
15241 return Op->getOperand(0);
15242 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15243 }
15244
15245 return SDValue();
15246}
15247
15248 static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15249 const ARMSubtarget *Subtarget) {
15250 if (!Subtarget->hasMVEIntegerOps())
15251 return SDValue();
15252
15253 EVT VT = N->getValueType(0);
15254 SDValue Op0 = N->getOperand(0);
15255 SDValue Op1 = N->getOperand(1);
15256 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15257 SDLoc dl(N);
15258
15259 // vcmp X, 0, cc -> vcmpz X, cc
15260 if (isZeroVector(Op1))
15261 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15262
15263 unsigned SwappedCond = getSwappedCondition(Cond);
15264 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15265 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15266 if (isZeroVector(Op0))
15267 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15268 DAG.getConstant(SwappedCond, dl, MVT::i32));
15269 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15270 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15271 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15272 DAG.getConstant(SwappedCond, dl, MVT::i32));
15273 }
15274
15275 return SDValue();
15276}
15277
15278/// PerformInsertEltCombine - Target-specific dag combine xforms for
15279/// ISD::INSERT_VECTOR_ELT.
15280 static SDValue PerformInsertEltCombine(SDNode *N,
15281 TargetLowering::DAGCombinerInfo &DCI) {
15282 // Bitcast an i64 load inserted into a vector to f64.
15283 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15284 EVT VT = N->getValueType(0);
15285 SDNode *Elt = N->getOperand(1).getNode();
15286 if (VT.getVectorElementType() != MVT::i64 ||
15287 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15288 return SDValue();
15289
15290 SelectionDAG &DAG = DCI.DAG;
15291 SDLoc dl(N);
15292 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15293 VT.getVectorNumElements());
15294 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15295 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15296 // Make the DAGCombiner fold the bitcasts.
15297 DCI.AddToWorklist(Vec.getNode());
15298 DCI.AddToWorklist(V.getNode());
15299 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15300 Vec, V, N->getOperand(2));
15301 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15302}
15303
15304// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15305// directly or bitcast to an integer if the original is a float vector.
15306// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15307// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
15308static SDValue
15309 PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15310 EVT VT = N->getValueType(0);
15311 SDLoc dl(N);
15312
15313 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15314 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15315 return SDValue();
15316
15317 SDValue Ext = SDValue(N, 0);
15318 if (Ext.getOpcode() == ISD::BITCAST &&
15319 Ext.getOperand(0).getValueType() == MVT::f32)
15320 Ext = Ext.getOperand(0);
15321 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15322 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15323 Ext.getConstantOperandVal(1) % 2 != 0)
15324 return SDValue();
15325 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15326 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15327 return SDValue();
15328
15329 SDValue Op0 = Ext.getOperand(0);
15330 EVT VecVT = Op0.getValueType();
15331 unsigned ResNo = Op0.getResNo();
15332 unsigned Lane = Ext.getConstantOperandVal(1);
15333 if (VecVT.getVectorNumElements() != 4)
15334 return SDValue();
15335
15336 // Find another extract, of Lane + 1
15337 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15338 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15339 isa<ConstantSDNode>(V->getOperand(1)) &&
15340 V->getConstantOperandVal(1) == Lane + 1 &&
15341 V->getOperand(0).getResNo() == ResNo;
15342 });
15343 if (OtherIt == Op0->users().end())
15344 return SDValue();
15345
15346 // For float extracts, we need to be converting to a i32 for both vector
15347 // lanes.
15348 SDValue OtherExt(*OtherIt, 0);
15349 if (OtherExt.getValueType() != MVT::i32) {
15350 if (!OtherExt->hasOneUse() ||
15351 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15352 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15353 return SDValue();
15354 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15355 }
15356
15357 // Convert the type to a f64 and extract with a VMOVRRD.
15358 SDValue F64 = DCI.DAG.getNode(
15359 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15360 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15361 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15362 SDValue VMOVRRD =
15363 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15364
15365 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15366 return VMOVRRD;
15367}
15368
15369 static SDValue PerformExtractEltCombine(SDNode *N,
15370 TargetLowering::DAGCombinerInfo &DCI,
15371 const ARMSubtarget *ST) {
15372 SDValue Op0 = N->getOperand(0);
15373 EVT VT = N->getValueType(0);
15374 SDLoc dl(N);
15375
15376 // extract (vdup x) -> x
15377 if (Op0->getOpcode() == ARMISD::VDUP) {
15378 SDValue X = Op0->getOperand(0);
15379 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15380 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15381 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15382 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15383 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15384 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15385
15386 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15387 X = X->getOperand(0);
15388 if (X.getValueType() == VT)
15389 return X;
15390 }
15391
15392 // extract ARM_BUILD_VECTOR -> x
15393 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15394 isa<ConstantSDNode>(N->getOperand(1)) &&
15395 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15396 return Op0.getOperand(N->getConstantOperandVal(1));
15397 }
15398
15399 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15400 if (Op0.getValueType() == MVT::v4i32 &&
15401 isa<ConstantSDNode>(N->getOperand(1)) &&
15402 Op0.getOpcode() == ISD::BITCAST &&
15403 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15404 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15405 SDValue BV = Op0.getOperand(0);
15406 unsigned Offset = N->getConstantOperandVal(1);
15407 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15408 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15409 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15410 }
15411
15412 // extract x, n; extract x, n+1 -> VMOVRRD x
15413 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15414 return R;
15415
15416 // extract (MVETrunc(x)) -> extract x
15417 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15418 unsigned Idx = N->getConstantOperandVal(1);
15419 unsigned Vec =
15420 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15421 unsigned SubIdx =
15422 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15423 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15424 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15425 }
15426
15427 return SDValue();
15428}
15429
15430 static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15431 SDValue Op = N->getOperand(0);
15432 EVT VT = N->getValueType(0);
15433
15434 // sext_inreg(VGETLANEu) -> VGETLANEs
15435 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15436 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15437 Op.getOperand(0).getValueType().getScalarType())
15438 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15439 Op.getOperand(1));
15440
15441 return SDValue();
15442}
15443
15444static SDValue
15445 PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15446 SDValue Vec = N->getOperand(0);
15447 SDValue SubVec = N->getOperand(1);
15448 uint64_t IdxVal = N->getConstantOperandVal(2);
15449 EVT VecVT = Vec.getValueType();
15450 EVT SubVT = SubVec.getValueType();
15451
15452 // Only do this for legal fixed vector types.
15453 if (!VecVT.isFixedLengthVector() ||
15454 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15455 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15456 return SDValue();
15457
15458 // Ignore widening patterns.
15459 if (IdxVal == 0 && Vec.isUndef())
15460 return SDValue();
15461
15462 // Subvector must be half the width and an "aligned" insertion.
15463 unsigned NumSubElts = SubVT.getVectorNumElements();
15464 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15465 (IdxVal != 0 && IdxVal != NumSubElts))
15466 return SDValue();
15467
15468 // Fold insert_subvector -> concat_vectors
15469 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15470 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15471 SDLoc DL(N);
15472 SDValue Lo, Hi;
15473 if (IdxVal == 0) {
15474 Lo = SubVec;
15475 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15476 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15477 } else {
15478 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15479 DCI.DAG.getVectorIdxConstant(0, DL));
15480 Hi = SubVec;
15481 }
15482 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15483}
15484
15485// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15486 static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15487 SelectionDAG &DAG) {
15488 SDValue Trunc = N->getOperand(0);
15489 EVT VT = Trunc.getValueType();
15490 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15491 return SDValue();
15492
15493 SDLoc DL(Trunc);
15494 if (isVMOVNTruncMask(N->getMask(), VT, false))
15495 return DAG.getNode(
15496 ARMISD::VMOVN, DL, VT,
15497 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15498 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15499 DAG.getConstant(1, DL, MVT::i32));
15500 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15501 return DAG.getNode(
15502 ARMISD::VMOVN, DL, VT,
15503 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15504 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15505 DAG.getConstant(1, DL, MVT::i32));
15506 return SDValue();
15507}
15508
15509/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15510/// ISD::VECTOR_SHUFFLE.
15511 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15512 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15513 return R;
15514
15515 // The LLVM shufflevector instruction does not require the shuffle mask
15516 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15517 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15518 // operands do not match the mask length, they are extended by concatenating
15519 // them with undef vectors. That is probably the right thing for other
15520 // targets, but for NEON it is better to concatenate two double-register
15521 // size vector operands into a single quad-register size vector. Do that
15522 // transformation here:
15523 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15524 // shuffle(concat(v1, v2), undef)
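// For example, for v8i16 a mask of <0,8,1,9,2,10,3,11> over the two concats
// becomes <0,4,1,5,2,6,3,7> over the single concat(v1, v2).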
15525 SDValue Op0 = N->getOperand(0);
15526 SDValue Op1 = N->getOperand(1);
15527 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15528 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15529 Op0.getNumOperands() != 2 ||
15530 Op1.getNumOperands() != 2)
15531 return SDValue();
15532 SDValue Concat0Op1 = Op0.getOperand(1);
15533 SDValue Concat1Op1 = Op1.getOperand(1);
15534 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15535 return SDValue();
15536 // Skip the transformation if any of the types are illegal.
15537 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15538 EVT VT = N->getValueType(0);
15539 if (!TLI.isTypeLegal(VT) ||
15540 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15541 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15542 return SDValue();
15543
15544 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15545 Op0.getOperand(0), Op1.getOperand(0));
15546 // Translate the shuffle mask.
15547 SmallVector<int, 16> NewMask;
15548 unsigned NumElts = VT.getVectorNumElements();
15549 unsigned HalfElts = NumElts/2;
15550 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15551 for (unsigned n = 0; n < NumElts; ++n) {
15552 int MaskElt = SVN->getMaskElt(n);
15553 int NewElt = -1;
15554 if (MaskElt < (int)HalfElts)
15555 NewElt = MaskElt;
15556 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15557 NewElt = HalfElts + MaskElt - NumElts;
15558 NewMask.push_back(NewElt);
15559 }
15560 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15561 DAG.getUNDEF(VT), NewMask);
15562}
15563
15564/// Load/store instruction that can be merged with a base address
15565/// update
15566 struct BaseUpdateTarget {
15567 SDNode *N;
15568 bool isIntrinsic;
15569 bool isStore;
15570 unsigned AddrOpIdx;
15571};
15572
15573 struct BaseUpdateUser {
15574 /// Instruction that updates a pointer
15575 SDNode *N;
15576 /// Pointer increment operand
15577 SDValue Inc;
15578 /// Pointer increment value if it is a constant, or 0 otherwise
15579 unsigned ConstInc;
15580};
15581
15582 static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15583 // Check that the add is independent of the load/store.
15584 // Otherwise, folding it would create a cycle. Search through Addr
15585 // as well, since the User may not be a direct user of Addr and
15586 // only share a base pointer.
15587 SmallPtrSet<const SDNode *, 32> Visited;
15588 SmallVector<const SDNode *, 16> Worklist;
15589 Worklist.push_back(N);
15590 Worklist.push_back(User);
15591 const unsigned MaxSteps = 1024;
15592 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15593 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15594 return false;
15595 return true;
15596}
15597
15598 static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15599 struct BaseUpdateUser &User,
15600 bool SimpleConstIncOnly,
15601 TargetLowering::DAGCombinerInfo &DCI) {
15602 SelectionDAG &DAG = DCI.DAG;
15603 SDNode *N = Target.N;
15604 MemSDNode *MemN = cast<MemSDNode>(N);
15605 SDLoc dl(N);
15606
15607 // Find the new opcode for the updating load/store.
15608 bool isLoadOp = true;
15609 bool isLaneOp = false;
15610 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15611 // as an operand.
15612 bool hasAlignment = true;
15613 unsigned NewOpc = 0;
15614 unsigned NumVecs = 0;
15615 if (Target.isIntrinsic) {
15616 unsigned IntNo = N->getConstantOperandVal(1);
15617 switch (IntNo) {
15618 default:
15619 llvm_unreachable("unexpected intrinsic for Neon base update");
15620 case Intrinsic::arm_neon_vld1:
15621 NewOpc = ARMISD::VLD1_UPD;
15622 NumVecs = 1;
15623 break;
15624 case Intrinsic::arm_neon_vld2:
15625 NewOpc = ARMISD::VLD2_UPD;
15626 NumVecs = 2;
15627 break;
15628 case Intrinsic::arm_neon_vld3:
15629 NewOpc = ARMISD::VLD3_UPD;
15630 NumVecs = 3;
15631 break;
15632 case Intrinsic::arm_neon_vld4:
15633 NewOpc = ARMISD::VLD4_UPD;
15634 NumVecs = 4;
15635 break;
15636 case Intrinsic::arm_neon_vld1x2:
15637 NewOpc = ARMISD::VLD1x2_UPD;
15638 NumVecs = 2;
15639 hasAlignment = false;
15640 break;
15641 case Intrinsic::arm_neon_vld1x3:
15642 NewOpc = ARMISD::VLD1x3_UPD;
15643 NumVecs = 3;
15644 hasAlignment = false;
15645 break;
15646 case Intrinsic::arm_neon_vld1x4:
15647 NewOpc = ARMISD::VLD1x4_UPD;
15648 NumVecs = 4;
15649 hasAlignment = false;
15650 break;
15651 case Intrinsic::arm_neon_vld2dup:
15652 NewOpc = ARMISD::VLD2DUP_UPD;
15653 NumVecs = 2;
15654 break;
15655 case Intrinsic::arm_neon_vld3dup:
15656 NewOpc = ARMISD::VLD3DUP_UPD;
15657 NumVecs = 3;
15658 break;
15659 case Intrinsic::arm_neon_vld4dup:
15660 NewOpc = ARMISD::VLD4DUP_UPD;
15661 NumVecs = 4;
15662 break;
15663 case Intrinsic::arm_neon_vld2lane:
15664 NewOpc = ARMISD::VLD2LN_UPD;
15665 NumVecs = 2;
15666 isLaneOp = true;
15667 break;
15668 case Intrinsic::arm_neon_vld3lane:
15669 NewOpc = ARMISD::VLD3LN_UPD;
15670 NumVecs = 3;
15671 isLaneOp = true;
15672 break;
15673 case Intrinsic::arm_neon_vld4lane:
15674 NewOpc = ARMISD::VLD4LN_UPD;
15675 NumVecs = 4;
15676 isLaneOp = true;
15677 break;
15678 case Intrinsic::arm_neon_vst1:
15679 NewOpc = ARMISD::VST1_UPD;
15680 NumVecs = 1;
15681 isLoadOp = false;
15682 break;
15683 case Intrinsic::arm_neon_vst2:
15684 NewOpc = ARMISD::VST2_UPD;
15685 NumVecs = 2;
15686 isLoadOp = false;
15687 break;
15688 case Intrinsic::arm_neon_vst3:
15689 NewOpc = ARMISD::VST3_UPD;
15690 NumVecs = 3;
15691 isLoadOp = false;
15692 break;
15693 case Intrinsic::arm_neon_vst4:
15694 NewOpc = ARMISD::VST4_UPD;
15695 NumVecs = 4;
15696 isLoadOp = false;
15697 break;
15698 case Intrinsic::arm_neon_vst2lane:
15699 NewOpc = ARMISD::VST2LN_UPD;
15700 NumVecs = 2;
15701 isLoadOp = false;
15702 isLaneOp = true;
15703 break;
15704 case Intrinsic::arm_neon_vst3lane:
15705 NewOpc = ARMISD::VST3LN_UPD;
15706 NumVecs = 3;
15707 isLoadOp = false;
15708 isLaneOp = true;
15709 break;
15710 case Intrinsic::arm_neon_vst4lane:
15711 NewOpc = ARMISD::VST4LN_UPD;
15712 NumVecs = 4;
15713 isLoadOp = false;
15714 isLaneOp = true;
15715 break;
15716 case Intrinsic::arm_neon_vst1x2:
15717 NewOpc = ARMISD::VST1x2_UPD;
15718 NumVecs = 2;
15719 isLoadOp = false;
15720 hasAlignment = false;
15721 break;
15722 case Intrinsic::arm_neon_vst1x3:
15723 NewOpc = ARMISD::VST1x3_UPD;
15724 NumVecs = 3;
15725 isLoadOp = false;
15726 hasAlignment = false;
15727 break;
15728 case Intrinsic::arm_neon_vst1x4:
15729 NewOpc = ARMISD::VST1x4_UPD;
15730 NumVecs = 4;
15731 isLoadOp = false;
15732 hasAlignment = false;
15733 break;
15734 }
15735 } else {
15736 isLaneOp = true;
15737 switch (N->getOpcode()) {
15738 default:
15739 llvm_unreachable("unexpected opcode for Neon base update");
15740 case ARMISD::VLD1DUP:
15741 NewOpc = ARMISD::VLD1DUP_UPD;
15742 NumVecs = 1;
15743 break;
15744 case ARMISD::VLD2DUP:
15745 NewOpc = ARMISD::VLD2DUP_UPD;
15746 NumVecs = 2;
15747 break;
15748 case ARMISD::VLD3DUP:
15749 NewOpc = ARMISD::VLD3DUP_UPD;
15750 NumVecs = 3;
15751 break;
15752 case ARMISD::VLD4DUP:
15753 NewOpc = ARMISD::VLD4DUP_UPD;
15754 NumVecs = 4;
15755 break;
15756 case ISD::LOAD:
15757 NewOpc = ARMISD::VLD1_UPD;
15758 NumVecs = 1;
15759 isLaneOp = false;
15760 break;
15761 case ISD::STORE:
15762 NewOpc = ARMISD::VST1_UPD;
15763 NumVecs = 1;
15764 isLaneOp = false;
15765 isLoadOp = false;
15766 break;
15767 }
15768 }
15769
15770 // Find the size of memory referenced by the load/store.
15771 EVT VecTy;
15772 if (isLoadOp) {
15773 VecTy = N->getValueType(0);
15774 } else if (Target.isIntrinsic) {
15775 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15776 } else {
15777 assert(Target.isStore &&
15778 "Node has to be a load, a store, or an intrinsic!");
15779 VecTy = N->getOperand(1).getValueType();
15780 }
15781
15782 bool isVLDDUPOp =
15783 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15784 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15785
15786 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15787 if (isLaneOp || isVLDDUPOp)
15788 NumBytes /= VecTy.getVectorNumElements();
15789
15790 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15791 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15792 // separate instructions that make it harder to use a non-constant update.
15793 return false;
15794 }
15795
15796 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15797 return false;
15798
15799 if (!isValidBaseUpdate(N, User.N))
15800 return false;
15801
15802 // OK, we found an ADD we can fold into the base update.
15803 // Now, create a _UPD node, taking care of not breaking alignment.
15804
15805 EVT AlignedVecTy = VecTy;
15806 Align Alignment = MemN->getAlign();
15807
15808 // If this is a less-than-standard-aligned load/store, change the type to
15809 // match the standard alignment.
15810 // The alignment is overlooked when selecting _UPD variants; and it's
15811 // easier to introduce bitcasts here than fix that.
15812 // There are 3 ways to get to this base-update combine:
15813 // - intrinsics: they are assumed to be properly aligned (to the standard
15814 // alignment of the memory type), so we don't need to do anything.
15815 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15816 // intrinsics, so, likewise, there's nothing to do.
15817 // - generic load/store instructions: the alignment is specified as an
15818 // explicit operand, rather than implicitly as the standard alignment
15819 // of the memory type (like the intrinsics). We need to change the
15820 // memory type to match the explicit alignment. That way, we don't
15821 // generate non-standard-aligned ARMISD::VLDx nodes.
15822 if (isa<LSBaseSDNode>(N)) {
15823 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
15824 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
15825 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15826 assert(!isLaneOp && "Unexpected generic load/store lane.");
15827 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15828 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
15829 }
15830 // Don't set an explicit alignment on regular load/stores that we want
15831 // to transform to VLD/VST 1_UPD nodes.
15832 // This matches the behavior of regular load/stores, which only get an
15833 // explicit alignment if the MMO alignment is larger than the standard
15834 // alignment of the memory type.
15835 // Intrinsics, however, always get an explicit alignment, set to the
15836 // alignment of the MMO.
15837 Alignment = Align(1);
15838 }
15839
15840 // Create the new updating load/store node.
15841 // First, create an SDVTList for the new updating node's results.
15842 EVT Tys[6];
15843 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15844 unsigned n;
15845 for (n = 0; n < NumResultVecs; ++n)
15846 Tys[n] = AlignedVecTy;
15847 Tys[n++] = MVT::i32;
15848 Tys[n] = MVT::Other;
15849 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
15850
15851 // Then, gather the new node's operands.
15852 SmallVector<SDValue, 8> Ops;
15853 Ops.push_back(N->getOperand(0)); // incoming chain
15854 Ops.push_back(N->getOperand(Target.AddrOpIdx));
15855 Ops.push_back(User.Inc);
15856
15857 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
15858 // Try to match the intrinsic's signature
15859 Ops.push_back(StN->getValue());
15860 } else {
15861 // Loads (and of course intrinsics) match the intrinsics' signature,
15862 // so just add all but the alignment operand.
15863 unsigned LastOperand =
15864 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
15865 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
15866 Ops.push_back(N->getOperand(i));
15867 }
15868
15869 // For all node types, the alignment operand is always the last one.
15870 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
15871
15872 // If this is a non-standard-aligned STORE, the penultimate operand is the
15873 // stored value. Bitcast it to the aligned type.
15874 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
15875 SDValue &StVal = Ops[Ops.size() - 2];
15876 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
15877 }
15878
15879 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
15880 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
15881 MemN->getMemOperand());
15882
15883 // Update the uses.
15884 SmallVector<SDValue, 5> NewResults;
15885 for (unsigned i = 0; i < NumResultVecs; ++i)
15886 NewResults.push_back(SDValue(UpdN.getNode(), i));
15887
15888 // If this is a non-standard-aligned LOAD, the first result is the loaded
15889 // value. Bitcast it to the expected result type.
15890 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
15891 SDValue &LdVal = NewResults[0];
15892 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
15893 }
15894
15895 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
15896 DCI.CombineTo(N, NewResults);
15897 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
15898
15899 return true;
15900}
15901
15902 // If (opcode ptr inc) is an ADD-like instruction, return the
15903 // increment value. Otherwise return 0.
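// For example, (add ptr, 16) yields 16, and (or ptr, 16) also yields 16 when
// the known bits of ptr and 16 do not overlap (e.g. ptr is 16-byte aligned).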
15904static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
15905 SDValue Inc, const SelectionDAG &DAG) {
15906 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
15907 if (!CInc)
15908 return 0;
15909
15910 switch (Opcode) {
15911 case ARMISD::VLD1_UPD:
15912 case ISD::ADD:
15913 return CInc->getZExtValue();
15914 case ISD::OR: {
15915 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
15916 // (OR ptr inc) is the same as (ADD ptr inc)
15917 return CInc->getZExtValue();
15918 }
15919 return 0;
15920 }
15921 default:
15922 return 0;
15923 }
15924}
15925
15926 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
15927 switch (N->getOpcode()) {
15928 case ISD::ADD:
15929 case ISD::OR: {
15930 if (isa<ConstantSDNode>(N->getOperand(1))) {
15931 *Ptr = N->getOperand(0);
15932 *CInc = N->getOperand(1);
15933 return true;
15934 }
15935 return false;
15936 }
15937 case ARMISD::VLD1_UPD: {
15938 if (isa<ConstantSDNode>(N->getOperand(2))) {
15939 *Ptr = N->getOperand(1);
15940 *CInc = N->getOperand(2);
15941 return true;
15942 }
15943 return false;
15944 }
15945 default:
15946 return false;
15947 }
15948}
15949
15950/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
15951/// NEON load/store intrinsics, and generic vector load/stores, to merge
15952/// base address updates.
15953/// For generic load/stores, the memory type is assumed to be a vector.
15954/// The caller is assumed to have checked legality.
15955 static SDValue CombineBaseUpdate(SDNode *N,
15956 TargetLowering::DAGCombinerInfo &DCI) {
15957 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
15958 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
15959 const bool isStore = N->getOpcode() == ISD::STORE;
15960 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
15961 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
15962
15963 // Limit the number of possible base-updates we look at to prevent degenerate
15964 // cases.
15965 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
15966
15967 SDValue Addr = N->getOperand(AddrOpIdx);
15968
15969 SmallVector<BaseUpdateUser, 8> BaseUpdates;
15970
15971 // Search for a use of the address operand that is an increment.
15972 for (SDUse &Use : Addr->uses()) {
15973 SDNode *User = Use.getUser();
15974 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
15975 continue;
15976
15977 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
15978 unsigned ConstInc =
15979 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
15980
15981 if (ConstInc || User->getOpcode() == ISD::ADD) {
15982 BaseUpdates.push_back({User, Inc, ConstInc});
15983 if (BaseUpdates.size() >= MaxBaseUpdates)
15984 break;
15985 }
15986 }
15987
15988 // If the address is a constant pointer increment itself, find
15989 // another constant increment that has the same base operand
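// For example, if Addr is (add Base, 8) and another user computes
// (add Base, 24), the access at Addr can still be post-incremented, using a
// new increment of 24 - 8 = 16.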
15990 SDValue Base;
15991 SDValue CInc;
15992 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
15993 unsigned Offset =
15994 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
15995 for (SDUse &Use : Base->uses()) {
15996
15997 SDNode *User = Use.getUser();
15998 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
15999 User->getNumOperands() != 2)
16000 continue;
16001
16002 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16003 unsigned UserOffset =
16004 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16005
16006 if (!UserOffset || UserOffset <= Offset)
16007 continue;
16008
16009 unsigned NewConstInc = UserOffset - Offset;
16010 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16011 BaseUpdates.push_back({User, NewInc, NewConstInc});
16012 if (BaseUpdates.size() >= MaxBaseUpdates)
16013 break;
16014 }
16015 }
16016
16017 // Try to fold the load/store with an update that matches memory
16018 // access size. This should work well for sequential loads.
16019 unsigned NumValidUpd = BaseUpdates.size();
16020 for (unsigned I = 0; I < NumValidUpd; I++) {
16021 BaseUpdateUser &User = BaseUpdates[I];
16022 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16023 return SDValue();
16024 }
16025
16026 // Try to fold with other users. Non-constant updates are considered
16027 // first, and constant updates are sorted to not break a sequence of
16028 // strided accesses (if there is any).
16029 llvm::stable_sort(BaseUpdates,
16030 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16031 return LHS.ConstInc < RHS.ConstInc;
16032 });
16033 for (BaseUpdateUser &User : BaseUpdates) {
16034 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16035 return SDValue();
16036 }
16037 return SDValue();
16038}
16039
16040 static SDValue PerformVLDCombine(SDNode *N,
16041 TargetLowering::DAGCombinerInfo &DCI) {
16042 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16043 return SDValue();
16044
16045 return CombineBaseUpdate(N, DCI);
16046}
16047
16048 static SDValue PerformMVEVLDCombine(SDNode *N,
16049 TargetLowering::DAGCombinerInfo &DCI) {
16050 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16051 return SDValue();
16052
16053 SelectionDAG &DAG = DCI.DAG;
16054 SDValue Addr = N->getOperand(2);
16055 MemSDNode *MemN = cast<MemSDNode>(N);
16056 SDLoc dl(N);
16057
16058 // For stores where there are multiple intrinsics, we only actually want
16059 // to post-inc the last of them.
16060 unsigned IntNo = N->getConstantOperandVal(1);
16061 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16062 return SDValue();
16063 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16064 return SDValue();
16065
16066 // Search for a use of the address operand that is an increment.
16067 for (SDUse &Use : Addr->uses()) {
16068 SDNode *User = Use.getUser();
16069 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16070 continue;
16071
16072 // Check that the add is independent of the load/store. Otherwise, folding
16073 // it would create a cycle. We can avoid searching through Addr as it's a
16074 // predecessor to both.
16075 SmallPtrSet<const SDNode *, 32> Visited;
16076 SmallVector<const SDNode *, 16> Worklist;
16077 Visited.insert(Addr.getNode());
16078 Worklist.push_back(N);
16079 Worklist.push_back(User);
16080 const unsigned MaxSteps = 1024;
16081 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16082 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16083 continue;
16084
16085 // Find the new opcode for the updating load/store.
16086 bool isLoadOp = true;
16087 unsigned NewOpc = 0;
16088 unsigned NumVecs = 0;
16089 switch (IntNo) {
16090 default:
16091 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16092 case Intrinsic::arm_mve_vld2q:
16093 NewOpc = ARMISD::VLD2_UPD;
16094 NumVecs = 2;
16095 break;
16096 case Intrinsic::arm_mve_vld4q:
16097 NewOpc = ARMISD::VLD4_UPD;
16098 NumVecs = 4;
16099 break;
16100 case Intrinsic::arm_mve_vst2q:
16101 NewOpc = ARMISD::VST2_UPD;
16102 NumVecs = 2;
16103 isLoadOp = false;
16104 break;
16105 case Intrinsic::arm_mve_vst4q:
16106 NewOpc = ARMISD::VST4_UPD;
16107 NumVecs = 4;
16108 isLoadOp = false;
16109 break;
16110 }
16111
16112 // Find the size of memory referenced by the load/store.
16113 EVT VecTy;
16114 if (isLoadOp) {
16115 VecTy = N->getValueType(0);
16116 } else {
16117 VecTy = N->getOperand(3).getValueType();
16118 }
16119
16120 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16121
16122 // If the increment is a constant, it must match the memory ref size.
16123 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16124 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16125 if (!CInc || CInc->getZExtValue() != NumBytes)
16126 continue;
16127
16128 // Create the new updating load/store node.
16129 // First, create an SDVTList for the new updating node's results.
16130 EVT Tys[6];
16131 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16132 unsigned n;
16133 for (n = 0; n < NumResultVecs; ++n)
16134 Tys[n] = VecTy;
16135 Tys[n++] = MVT::i32;
16136 Tys[n] = MVT::Other;
16137 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16138
16139 // Then, gather the new node's operands.
16141 Ops.push_back(N->getOperand(0)); // incoming chain
16142 Ops.push_back(N->getOperand(2)); // ptr
16143 Ops.push_back(Inc);
16144
16145 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16146 Ops.push_back(N->getOperand(i));
16147
16148 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16149 MemN->getMemOperand());
16150
16151 // Update the uses.
16152 SmallVector<SDValue, 5> NewResults;
16153 for (unsigned i = 0; i < NumResultVecs; ++i)
16154 NewResults.push_back(SDValue(UpdN.getNode(), i));
16155
16156 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16157 DCI.CombineTo(N, NewResults);
16158 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16159
16160 break;
16161 }
16162
16163 return SDValue();
16164}
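// A sketch of the MVE case above (names illustrative): an arm_mve_vld2q whose
// address is also incremented by the 32 bytes it reads, roughly
//   {q0, q1} = vld2q(r0);  r0 += 32;
// is rewritten to the updating node ARMISD::VLD2_UPD, so the selected vld2
// sequence writes the incremented address back and the separate ADD is no
// longer needed.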
16165
16166/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16167/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16168/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16169/// return true.
16171 SelectionDAG &DAG = DCI.DAG;
16172 EVT VT = N->getValueType(0);
16173 // vldN-dup instructions only support 64-bit vectors for N > 1.
16174 if (!VT.is64BitVector())
16175 return false;
16176
16177 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16178 SDNode *VLD = N->getOperand(0).getNode();
16179 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16180 return false;
16181 unsigned NumVecs = 0;
16182 unsigned NewOpc = 0;
16183 unsigned IntNo = VLD->getConstantOperandVal(1);
16184 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16185 NumVecs = 2;
16186 NewOpc = ARMISD::VLD2DUP;
16187 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16188 NumVecs = 3;
16189 NewOpc = ARMISD::VLD3DUP;
16190 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16191 NumVecs = 4;
16192 NewOpc = ARMISD::VLD4DUP;
16193 } else {
16194 return false;
16195 }
16196
16197 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16198 // numbers match the load.
16199 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16200 for (SDUse &Use : VLD->uses()) {
16201 // Ignore uses of the chain result.
16202 if (Use.getResNo() == NumVecs)
16203 continue;
16204 SDNode *User = Use.getUser();
16205 if (User->getOpcode() != ARMISD::VDUPLANE ||
16206 VLDLaneNo != User->getConstantOperandVal(1))
16207 return false;
16208 }
16209
16210 // Create the vldN-dup node.
16211 EVT Tys[5];
16212 unsigned n;
16213 for (n = 0; n < NumVecs; ++n)
16214 Tys[n] = VT;
16215 Tys[n] = MVT::Other;
16216 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16217 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16219 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16220 Ops, VLDMemInt->getMemoryVT(),
16221 VLDMemInt->getMemOperand());
16222
16223 // Update the uses.
16224 for (SDUse &Use : VLD->uses()) {
16225 unsigned ResNo = Use.getResNo();
16226 // Ignore uses of the chain result.
16227 if (ResNo == NumVecs)
16228 continue;
16229 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16230 }
16231
16232 // Now the vldN-lane intrinsic is dead except for its chain result.
16233 // Update uses of the chain.
16234 std::vector<SDValue> VLDDupResults;
16235 for (unsigned n = 0; n < NumVecs; ++n)
16236 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16237 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16238 DCI.CombineTo(VLD, VLDDupResults);
16239
16240 return true;
16241}
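// For example (registers and lane numbers illustrative), when every result of
// a vld2lane intrinsic is only consumed by VDUPLANEs of that same lane:
//   vld2.32 {d16[1], d17[1]}, [r0]
//   vdup.32 d18, d16[1]
//   vdup.32 d19, d17[1]
// the group can become a single all-lanes load:
//   vld2.32 {d18[], d19[]}, [r0]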
16242
16243/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16244/// ARMISD::VDUPLANE.
16247 const ARMSubtarget *Subtarget) {
16248 SDValue Op = N->getOperand(0);
16249 EVT VT = N->getValueType(0);
16250
16251 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16252 if (Subtarget->hasMVEIntegerOps()) {
16253 EVT ExtractVT = VT.getVectorElementType();
16254 // We need to ensure we are creating a legal type.
16255 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16256 ExtractVT = MVT::i32;
16257 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16258 N->getOperand(0), N->getOperand(1));
16259 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16260 }
16261
16262 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16263 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16264 if (CombineVLDDUP(N, DCI))
16265 return SDValue(N, 0);
16266
16267 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16268 // redundant. Ignore bit_converts for now; element sizes are checked below.
16269 while (Op.getOpcode() == ISD::BITCAST)
16270 Op = Op.getOperand(0);
16271 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16272 return SDValue();
16273
16274 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16275 unsigned EltSize = Op.getScalarValueSizeInBits();
16276 // The canonical VMOV for a zero vector uses a 32-bit element size.
16277 unsigned Imm = Op.getConstantOperandVal(0);
16278 unsigned EltBits;
16279 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16280 EltSize = 8;
16281 if (EltSize > VT.getScalarSizeInBits())
16282 return SDValue();
16283
16284 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16285}
16286
16287/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16289 const ARMSubtarget *Subtarget) {
16290 SDValue Op = N->getOperand(0);
16291 SDLoc dl(N);
16292
16293 if (Subtarget->hasMVEIntegerOps()) {
16294 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16295 // need to come from a GPR.
16296 if (Op.getValueType() == MVT::f32)
16297 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16298 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16299 else if (Op.getValueType() == MVT::f16)
16300 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16301 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16302 }
16303
16304 if (!Subtarget->hasNEON())
16305 return SDValue();
16306
16307 // Match VDUP(LOAD) -> VLD1DUP.
16308 // We match this pattern here rather than waiting for isel because the
16309 // transform is only legal for unindexed loads.
16310 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16311 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16312 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16313 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16314 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16315 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16316 SDValue VLDDup =
16318 LD->getMemoryVT(), LD->getMemOperand());
16319 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16320 return VLDDup;
16321 }
16322
16323 return SDValue();
16324}
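// As a concrete case of the VDUP(LOAD) pattern above (types illustrative), a
// v2f32 ARMISD::VDUP fed by a plain unindexed f32 load becomes an
// ARMISD::VLD1DUP, which selects to a single load-and-duplicate such as
//   vld1.32 {d16[]}, [r0]
// instead of a separate load followed by a vdup.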
16325
16328 const ARMSubtarget *Subtarget) {
16329 EVT VT = N->getValueType(0);
16330
16331 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16332 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16334 return CombineBaseUpdate(N, DCI);
16335
16336 return SDValue();
16337}
16338
16339// Optimize a truncating store (of multiple scalars) into a shuffle and store. First,
16340// pack all of the elements in one place. Next, store to memory in fewer
16341// chunks.
16343 SelectionDAG &DAG) {
16344 SDValue StVal = St->getValue();
16345 EVT VT = StVal.getValueType();
16346 if (!St->isTruncatingStore() || !VT.isVector())
16347 return SDValue();
16348 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16349 EVT StVT = St->getMemoryVT();
16350 unsigned NumElems = VT.getVectorNumElements();
16351 assert(StVT != VT && "Cannot truncate to the same type");
16352 unsigned FromEltSz = VT.getScalarSizeInBits();
16353 unsigned ToEltSz = StVT.getScalarSizeInBits();
16354
16355 // The From and To element sizes and the element count must be powers of two.
16356 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16357 return SDValue();
16358
16359 // We are going to use the original vector elt for storing.
16360 // Accumulated smaller vector elements must be a multiple of the store size.
16361 if (0 != (NumElems * FromEltSz) % ToEltSz)
16362 return SDValue();
16363
16364 unsigned SizeRatio = FromEltSz / ToEltSz;
16365 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16366
16367 // Create a type on which we perform the shuffle.
16368 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16369 NumElems * SizeRatio);
16370 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16371
16372 SDLoc DL(St);
16373 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16374 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16375 for (unsigned i = 0; i < NumElems; ++i)
16376 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16377 : i * SizeRatio;
16378
16379 // Can't shuffle using an illegal type.
16380 if (!TLI.isTypeLegal(WideVecVT))
16381 return SDValue();
16382
16383 SDValue Shuff = DAG.getVectorShuffle(
16384 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16385 // At this point all of the data is stored at the bottom of the
16386 // register. We now need to save it to mem.
16387
16388 // Find the largest store unit
16389 MVT StoreType = MVT::i8;
16390 for (MVT Tp : MVT::integer_valuetypes()) {
16391 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16392 StoreType = Tp;
16393 }
16394 // Didn't find a legal store type.
16395 if (!TLI.isTypeLegal(StoreType))
16396 return SDValue();
16397
16398 // Bitcast the original vector into a vector of store-size units
16399 EVT StoreVecVT =
16400 EVT::getVectorVT(*DAG.getContext(), StoreType,
16401 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16402 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16403 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16405 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16406 TLI.getPointerTy(DAG.getDataLayout()));
16407 SDValue BasePtr = St->getBasePtr();
16408
16409 // Perform one or more big stores into memory.
16410 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16411 for (unsigned I = 0; I < E; I++) {
16412 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16413 ShuffWide, DAG.getIntPtrConstant(I, DL));
16414 SDValue Ch =
16415 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16416 St->getAlign(), St->getMemOperand()->getFlags());
16417 BasePtr =
16418 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16419 Chains.push_back(Ch);
16420 }
16421 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16422}
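// A little-endian example of the rewrite above (types chosen for
// illustration): a truncating store of <4 x i32> to <4 x i8> becomes roughly
//   %w = shufflevector (bitcast to <16 x i8>), undef, <0, 4, 8, 12, undef...>
//   store the low 32 bits of %w as a single i32
// so the four bytes are packed with one shuffle and written with one store.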
16423
16424// Try taking a single vector store from an fpround (which would otherwise turn
16425// into an expensive buildvector) and splitting it into a series of narrowing
16426// stores.
16428 SelectionDAG &DAG) {
16429 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16430 return SDValue();
16431 SDValue Trunc = St->getValue();
16432 if (Trunc->getOpcode() != ISD::FP_ROUND)
16433 return SDValue();
16434 EVT FromVT = Trunc->getOperand(0).getValueType();
16435 EVT ToVT = Trunc.getValueType();
16436 if (!ToVT.isVector())
16437 return SDValue();
16439 EVT ToEltVT = ToVT.getVectorElementType();
16440 EVT FromEltVT = FromVT.getVectorElementType();
16441
16442 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16443 return SDValue();
16444
16445 unsigned NumElements = 4;
16446 if (FromVT.getVectorNumElements() % NumElements != 0)
16447 return SDValue();
16448
16449 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16450 // use the VMOVN over splitting the store. We are looking for patterns of:
16451 // !rev: 0 N 1 N+1 2 N+2 ...
16452 // rev: N 0 N+1 1 N+2 2 ...
16453 // The shuffle may either be a single source (in which case N = NumElts/2) or
16454 // two inputs extended with concat to the same size (in which case N =
16455 // NumElts).
16456 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16457 ArrayRef<int> M = SVN->getMask();
16458 unsigned NumElts = ToVT.getVectorNumElements();
16459 if (SVN->getOperand(1).isUndef())
16460 NumElts /= 2;
16461
16462 unsigned Off0 = Rev ? NumElts : 0;
16463 unsigned Off1 = Rev ? 0 : NumElts;
16464
16465 for (unsigned I = 0; I < NumElts; I += 2) {
16466 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16467 return false;
16468 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16469 return false;
16470 }
16471
16472 return true;
16473 };
16474
16475 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16476 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16477 return SDValue();
16478
16479 LLVMContext &C = *DAG.getContext();
16480 SDLoc DL(St);
16481 // Details about the old store
16482 SDValue Ch = St->getChain();
16483 SDValue BasePtr = St->getBasePtr();
16484 Align Alignment = St->getBaseAlign();
16486 AAMDNodes AAInfo = St->getAAInfo();
16487
16488 // We split the store into slices of NumElements. fp16 trunc stores are
16489 // converted with a VCVT and then emitted as truncating integer stores.
16490 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16491 EVT NewToVT = EVT::getVectorVT(
16492 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16493
16495 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16496 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16497 SDValue NewPtr =
16498 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16499
16500 SDValue Extract =
16501 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16502 DAG.getConstant(i * NumElements, DL, MVT::i32));
16503
16504 SDValue FPTrunc =
16505 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16506 Extract, DAG.getConstant(0, DL, MVT::i32));
16507 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16508
16509 SDValue Store = DAG.getTruncStore(
16510 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16511 NewToVT, Alignment, MMOFlags, AAInfo);
16512 Stores.push_back(Store);
16513 }
16514 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16515}
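// For instance (a sketch, assuming a v8f32 -> v8f16 fptrunc feeding the store
// and no VMOVN-friendly shuffle): the store is split into two slices of four
// lanes, each narrowed with a VCVT into the bottom half of a v8f16 and written
// out as a v4i16 truncating store at byte offsets 0 and 8.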
16516
16517// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16518// into an expensive buildvector) and splitting it into a series of narrowing
16519// stores.
16521 SelectionDAG &DAG) {
16522 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16523 return SDValue();
16524 SDValue Trunc = St->getValue();
16525 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16526 return SDValue();
16527 EVT FromVT = Trunc->getOperand(0).getValueType();
16528 EVT ToVT = Trunc.getValueType();
16529
16530 LLVMContext &C = *DAG.getContext();
16531 SDLoc DL(St);
16532 // Details about the old store
16533 SDValue Ch = St->getChain();
16534 SDValue BasePtr = St->getBasePtr();
16535 Align Alignment = St->getBaseAlign();
16537 AAMDNodes AAInfo = St->getAAInfo();
16538
16539 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16540 FromVT.getVectorNumElements());
16541
16543 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16544 unsigned NewOffset =
16545 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16546 SDValue NewPtr =
16547 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16548
16549 SDValue Extract = Trunc.getOperand(i);
16550 SDValue Store = DAG.getTruncStore(
16551 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16552 NewToVT, Alignment, MMOFlags, AAInfo);
16553 Stores.push_back(Store);
16554 }
16555 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16556}
16557
16558// Given a floating point store from an extracted vector, with an integer
16559 // VGETLANE that already exists, store the existing VGETLANEu directly. This
16560 // helps reduce fp register pressure, avoids the fp extract and allows the use
16561 // of integer post-inc store forms that are not available with vstr.
16563 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16564 return SDValue();
16565 SDValue Extract = St->getValue();
16566 EVT VT = Extract.getValueType();
16567 // For now this only handles f16. It may be useful for f32 too, but that will
16568 // be bitcast(extract), not the VGETLANEu we currently check here.
16569 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16570 return SDValue();
16571
16572 SDNode *GetLane =
16573 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16574 {Extract.getOperand(0), Extract.getOperand(1)});
16575 if (!GetLane)
16576 return SDValue();
16577
16578 LLVMContext &C = *DAG.getContext();
16579 SDLoc DL(St);
16580 // Create a new integer store to replace the existing floating point version.
16581 SDValue Ch = St->getChain();
16582 SDValue BasePtr = St->getBasePtr();
16583 Align Alignment = St->getBaseAlign();
16585 AAMDNodes AAInfo = St->getAAInfo();
16586 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16587 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16588 St->getPointerInfo(), NewToVT, Alignment,
16589 MMOFlags, AAInfo);
16590
16591 return Store;
16592}
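// Sketch of the intent above (MVE, f16 element, names illustrative): if an
// integer VGETLANEu of the same vector and lane already exists, e.g.
//   %i = VGETLANEu %q0, lane 2        (already in the DAG)
//   store (extractelement <8 x half> %q0, 2), ptr %p
// the f16 store is replaced by a 16-bit truncating integer store of %i,
// avoiding the fp extract and enabling integer post-inc store forms.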
16593
16594/// PerformSTORECombine - Target-specific dag combine xforms for
16595/// ISD::STORE.
16598 const ARMSubtarget *Subtarget) {
16600 if (St->isVolatile())
16601 return SDValue();
16602 SDValue StVal = St->getValue();
16603 EVT VT = StVal.getValueType();
16604
16605 if (Subtarget->hasNEON())
16606 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16607 return Store;
16608
16609 if (Subtarget->hasMVEFloatOps())
16610 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16611 return NewToken;
16612
16613 if (Subtarget->hasMVEIntegerOps()) {
16614 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16615 return NewChain;
16616 if (SDValue NewToken =
16618 return NewToken;
16619 }
16620
16621 if (!ISD::isNormalStore(St))
16622 return SDValue();
16623
16624 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16625 // ARM stores of arguments in the same cache line.
16626 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16627 StVal.getNode()->hasOneUse()) {
16628 SelectionDAG &DAG = DCI.DAG;
16629 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16630 SDLoc DL(St);
16631 SDValue BasePtr = St->getBasePtr();
16632 SDValue NewST1 = DAG.getStore(
16633 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16634 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16635 St->getMemOperand()->getFlags());
16636
16637 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16638 DAG.getConstant(4, DL, MVT::i32));
16639 return DAG.getStore(NewST1.getValue(0), DL,
16640 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16641 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16642 St->getBaseAlign(), St->getMemOperand()->getFlags());
16643 }
16644
16645 if (StVal.getValueType() == MVT::i64 &&
16647
16648 // Bitcast an i64 store extracted from a vector to f64.
16649 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16650 SelectionDAG &DAG = DCI.DAG;
16651 SDLoc dl(StVal);
16652 SDValue IntVec = StVal.getOperand(0);
16653 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16655 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16656 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16657 Vec, StVal.getOperand(1));
16658 dl = SDLoc(N);
16659 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16660 // Make the DAGCombiner fold the bitcasts.
16661 DCI.AddToWorklist(Vec.getNode());
16662 DCI.AddToWorklist(ExtElt.getNode());
16663 DCI.AddToWorklist(V.getNode());
16664 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16665 St->getPointerInfo(), St->getAlign(),
16666 St->getMemOperand()->getFlags(), St->getAAInfo());
16667 }
16668
16669 // If this is a legal vector store, try to combine it into a VST1_UPD.
16670 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16672 return CombineBaseUpdate(N, DCI);
16673
16674 return SDValue();
16675}
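// The VMOVDRR split above as a concrete little-endian example (registers
// illustrative): storing a 64-bit value built from two core registers,
//   vmov  d16, r2, r3
//   vstr  d16, [r0]
// becomes two plain integer stores that never touch the NEON register file:
//   str   r2, [r0]
//   str   r3, [r0, #4]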
16676
16677/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16678/// can replace combinations of VMUL and VCVT (floating-point to integer)
16679/// when the VMUL has a constant operand that is a power of 2.
16680///
16681/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16682/// vmul.f32 d16, d17, d16
16683/// vcvt.s32.f32 d16, d16
16684/// becomes:
16685/// vcvt.s32.f32 d16, d16, #3
16687 const ARMSubtarget *Subtarget) {
16688 if (!Subtarget->hasNEON())
16689 return SDValue();
16690
16691 SDValue Op = N->getOperand(0);
16692 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16693 Op.getOpcode() != ISD::FMUL)
16694 return SDValue();
16695
16696 SDValue ConstVec = Op->getOperand(1);
16697 if (!isa<BuildVectorSDNode>(ConstVec))
16698 return SDValue();
16699
16700 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16701 uint32_t FloatBits = FloatTy.getSizeInBits();
16702 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16703 uint32_t IntBits = IntTy.getSizeInBits();
16704 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16705 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16706 // These instructions only exist converting from f32 to i32. We can handle
16707 // smaller integers by generating an extra truncate, but larger ones would
16708 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16709 // these instructions only support v2i32/v4i32 types.
16710 return SDValue();
16711 }
16712
16713 BitVector UndefElements;
16715 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16716 if (C == -1 || C == 0 || C > 32)
16717 return SDValue();
16718
16719 SDLoc dl(N);
16720 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16721 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16722 Intrinsic::arm_neon_vcvtfp2fxu;
16723 SDValue FixConv = DAG.getNode(
16724 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16725 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16726 DAG.getConstant(C, dl, MVT::i32));
16727
16728 if (IntBits < FloatBits)
16729 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16730
16731 return FixConv;
16732}
16733
16735 const ARMSubtarget *Subtarget) {
16736 if (!Subtarget->hasMVEFloatOps())
16737 return SDValue();
16738
16739 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16740 // The second form can be more easily turned into a predicated vadd, and
16741 // possibly combined into a fma to become a predicated vfma.
16742 SDValue Op0 = N->getOperand(0);
16743 SDValue Op1 = N->getOperand(1);
16744 EVT VT = N->getValueType(0);
16745 SDLoc DL(N);
16746
16747 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16748 // which these VMOV's represent.
16749 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16750 if (Op.getOpcode() != ISD::BITCAST ||
16751 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16752 return false;
16753 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16754 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16755 return true;
16756 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16757 return true;
16758 return false;
16759 };
16760
16761 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16762 std::swap(Op0, Op1);
16763
16764 if (Op1.getOpcode() != ISD::VSELECT)
16765 return SDValue();
16766
16767 SDNodeFlags FaddFlags = N->getFlags();
16768 bool NSZ = FaddFlags.hasNoSignedZeros();
16769 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16770 return SDValue();
16771
16772 SDValue FAdd =
16773 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16774 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16775}
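// A small example of the identity reasoning above: with -0.0 splatted into the
// unselected lanes,
//   fadd x, (vselect c, y, <-0.0, -0.0, -0.0, -0.0>)
// computes x + y where c is true and x + (-0.0) == x elsewhere, so it is
// equivalent to
//   vselect c, (fadd x, y), x
// which maps directly onto a predicated vector add.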
16776
16778 SDValue LHS = N->getOperand(0);
16779 SDValue RHS = N->getOperand(1);
16780 EVT VT = N->getValueType(0);
16781 SDLoc DL(N);
16782
16783 if (!N->getFlags().hasAllowReassociation())
16784 return SDValue();
16785
16786 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16787 auto ReassocComplex = [&](SDValue A, SDValue B) {
16788 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16789 return SDValue();
16790 unsigned Opc = A.getConstantOperandVal(0);
16791 if (Opc != Intrinsic::arm_mve_vcmlaq)
16792 return SDValue();
16793 SDValue VCMLA = DAG.getNode(
16794 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16795 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16796 A.getOperand(3), A.getOperand(4));
16797 VCMLA->setFlags(A->getFlags());
16798 return VCMLA;
16799 };
16800 if (SDValue R = ReassocComplex(LHS, RHS))
16801 return R;
16802 if (SDValue R = ReassocComplex(RHS, LHS))
16803 return R;
16804
16805 return SDValue();
16806}
16807
16809 const ARMSubtarget *Subtarget) {
16810 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
16811 return S;
16812 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
16813 return S;
16814 return SDValue();
16815}
16816
16817/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
16818/// can replace combinations of VCVT (integer to floating-point) and VMUL
16819/// when the VMUL has a constant operand that is a power of 2.
16820///
16821/// Example (assume d17 = <float 0.125, float 0.125>):
16822/// vcvt.f32.s32 d16, d16
16823/// vmul.f32 d16, d16, d17
16824/// becomes:
16825/// vcvt.f32.s32 d16, d16, #3
16827 const ARMSubtarget *Subtarget) {
16828 if (!Subtarget->hasNEON())
16829 return SDValue();
16830
16831 SDValue Op = N->getOperand(0);
16832 unsigned OpOpcode = Op.getNode()->getOpcode();
16833 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
16834 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
16835 return SDValue();
16836
16837 SDValue ConstVec = N->getOperand(1);
16838 if (!isa<BuildVectorSDNode>(ConstVec))
16839 return SDValue();
16840
16841 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
16842 uint32_t FloatBits = FloatTy.getSizeInBits();
16843 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
16844 uint32_t IntBits = IntTy.getSizeInBits();
16845 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16846 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16847 // These instructions only exist converting from i32 to f32. We can handle
16848 // smaller integers by generating an extra extend, but larger ones would
16849 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16850 // these instructions only support v2i32/v4i32 types.
16851 return SDValue();
16852 }
16853
16854 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
16855 APFloat Recip(0.0f);
16856 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
16857 return SDValue();
16858
16859 bool IsExact;
16860 APSInt IntVal(33);
16861 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
16862 APFloat::opOK ||
16863 !IsExact)
16864 return SDValue();
16865
16866 int32_t C = IntVal.exactLogBase2();
16867 if (C == -1 || C == 0 || C > 32)
16868 return SDValue();
16869
16870 SDLoc DL(N);
16871 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
16872 SDValue ConvInput = Op.getOperand(0);
16873 if (IntBits < FloatBits)
16875 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
16876
16877 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
16878 : Intrinsic::arm_neon_vcvtfxu2fp;
16879 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
16880 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
16881 DAG.getConstant(C, DL, MVT::i32));
16882}
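// Worked numbers for the checks above (constant chosen for illustration): a
// multiply by a splat of 0.125 has exact inverse 8.0, which converts to the
// integer 8 with no remainder, and exactLogBase2() returns C = 3, inside the
// accepted range; the node is then rewritten to the fixed-point conversion
// intrinsic with #3, matching the example in the comment.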
16883
16885 const ARMSubtarget *ST) {
16886 if (!ST->hasMVEIntegerOps())
16887 return SDValue();
16888
16889 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
16890 EVT ResVT = N->getValueType(0);
16891 SDValue N0 = N->getOperand(0);
16892 SDLoc dl(N);
16893
16894 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
16895 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
16896 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
16897 N0.getValueType() == MVT::v16i8)) {
16898 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
16899 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
16900 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
16901 }
16902
16903 // We are looking for something that will have illegal types if left alone,
16904 // but that we can convert to a single instruction under MVE. For example
16905 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
16906 // or
16907 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
16908
16909 // The legal cases are:
16910 // VADDV u/s 8/16/32
16911 // VMLAV u/s 8/16/32
16912 // VADDLV u/s 32
16913 // VMLALV u/s 16/32
16914
16915 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
16916 // extend it and use v4i32 instead.
16917 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
16918 EVT AVT = A.getValueType();
16919 return any_of(ExtTypes, [&](MVT Ty) {
16920 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
16921 AVT.bitsLE(Ty);
16922 });
16923 };
16924 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
16925 EVT AVT = A.getValueType();
16926 if (!AVT.is128BitVector())
16927 A = DAG.getNode(ExtendCode, dl,
16929 128 / AVT.getVectorMinNumElements())),
16930 A);
16931 return A;
16932 };
16933 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
16934 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
16935 return SDValue();
16936 SDValue A = N0->getOperand(0);
16937 if (ExtTypeMatches(A, ExtTypes))
16938 return ExtendIfNeeded(A, ExtendCode);
16939 return SDValue();
16940 };
16941 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
16942 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
16943 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16945 return SDValue();
16946 Mask = N0->getOperand(0);
16947 SDValue Ext = N0->getOperand(1);
16948 if (Ext->getOpcode() != ExtendCode)
16949 return SDValue();
16950 SDValue A = Ext->getOperand(0);
16951 if (ExtTypeMatches(A, ExtTypes))
16952 return ExtendIfNeeded(A, ExtendCode);
16953 return SDValue();
16954 };
16955 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16956 SDValue &A, SDValue &B) {
16957 // For a vmla we are trying to match a larger pattern:
16958 // ExtA = sext/zext A
16959 // ExtB = sext/zext B
16960 // Mul = mul ExtA, ExtB
16961 // vecreduce.add Mul
16962 // There might also be an extra extend between the mul and the addreduce, so
16963 // long as the bitwidth is high enough to make them equivalent (for example
16964 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
16965 if (ResVT != RetTy)
16966 return false;
16967 SDValue Mul = N0;
16968 if (Mul->getOpcode() == ExtendCode &&
16969 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
16970 ResVT.getScalarSizeInBits())
16971 Mul = Mul->getOperand(0);
16972 if (Mul->getOpcode() != ISD::MUL)
16973 return false;
16974 SDValue ExtA = Mul->getOperand(0);
16975 SDValue ExtB = Mul->getOperand(1);
16976 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
16977 return false;
16978 A = ExtA->getOperand(0);
16979 B = ExtB->getOperand(0);
16980 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
16981 A = ExtendIfNeeded(A, ExtendCode);
16982 B = ExtendIfNeeded(B, ExtendCode);
16983 return true;
16984 }
16985 return false;
16986 };
16987 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
16988 SDValue &A, SDValue &B, SDValue &Mask) {
16989 // Same as the pattern above with a select for the zero predicated lanes
16990 // ExtA = sext/zext A
16991 // ExtB = sext/zext B
16992 // Mul = mul ExtA, ExtB
16993 // N0 = select Mask, Mul, 0
16994 // vecreduce.add N0
16995 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
16997 return false;
16998 Mask = N0->getOperand(0);
16999 SDValue Mul = N0->getOperand(1);
17000 if (Mul->getOpcode() == ExtendCode &&
17001 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17002 ResVT.getScalarSizeInBits())
17003 Mul = Mul->getOperand(0);
17004 if (Mul->getOpcode() != ISD::MUL)
17005 return false;
17006 SDValue ExtA = Mul->getOperand(0);
17007 SDValue ExtB = Mul->getOperand(1);
17008 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17009 return false;
17010 A = ExtA->getOperand(0);
17011 B = ExtB->getOperand(0);
17012 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17013 A = ExtendIfNeeded(A, ExtendCode);
17014 B = ExtendIfNeeded(B, ExtendCode);
17015 return true;
17016 }
17017 return false;
17018 };
17019 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17020 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17021 // reductions. The operands are extended with MVEEXT, but as they are
17022 // reductions the lane orders do not matter. MVEEXT may be combined with
17023 // loads to produce two extending loads, or else they will be expanded to
17024 // VREV/VMOVL.
17025 EVT VT = Ops[0].getValueType();
17026 if (VT == MVT::v16i8) {
17027 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17028 "Unexpected illegal long reduction opcode");
17029 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17030
17031 SDValue Ext0 =
17032 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17033 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17034 SDValue Ext1 =
17035 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17036 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17037
17038 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17039 Ext0, Ext1);
17040 SDValue MLA1 =
17041 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17042 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17043 Ext0.getValue(1), Ext1.getValue(1));
17044 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17045 }
17046 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17047 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17048 SDValue(Node.getNode(), 1));
17049 };
17050
17051 SDValue A, B;
17052 SDValue Mask;
17053 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17054 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17055 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17056 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17057 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17058 A, B))
17059 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17060 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17061 A, B))
17062 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17063 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17064 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17065 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17066 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17067 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17068 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17069
17070 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17071 Mask))
17072 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17073 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17074 Mask))
17075 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17076 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17077 Mask))
17078 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17079 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17080 Mask))
17081 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17082 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17083 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17084 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17085 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17086 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17087 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17088
17089 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17090 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17091 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17092 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17093 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17094 return Create64bitNode(ARMISD::VADDLVs, {A});
17095 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17096 return Create64bitNode(ARMISD::VADDLVu, {A});
17097 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17098 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17099 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17100 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17101 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17102 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17103
17104 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17105 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17106 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17107 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17108 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17109 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17110 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17111 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17112 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17113 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17114 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17115 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17116 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17117 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17118
17119 // One complication: if the two inputs of the mul are the same, the sext of the
17120 // mul's result will have been helpfully converted to a zext by an earlier
17121 // combine. Turn it back.
17122 SDValue Op = N0;
17123 if (Op->getOpcode() == ISD::VSELECT)
17124 Op = Op->getOperand(1);
17125 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17126 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17127 SDValue Mul = Op->getOperand(0);
17128 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17129 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17130 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17131 if (Op != N0)
17132 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17133 N0->getOperand(0), Ext, N0->getOperand(2));
17134 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17135 }
17136 }
17137
17138 return SDValue();
17139}
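// One concrete case of the patterns matched above (shown schematically):
//   %ea = zext <16 x i8> %a to <16 x i32>
//   %eb = zext <16 x i8> %b to <16 x i32>
//   %r  = vecreduce.add (mul %ea, %eb)        ; i32 result
// becomes a single ARMISD::VMLAVu of %a and %b (an MVE VMLADAV.u8), instead of
// legalising the illegal <16 x i32> intermediate.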
17140
17141// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle in which
17142// all the lanes are used exactly once. Because the reduction is commutative,
17143// the shuffle can be removed.
17145 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17146 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17147 if (!Shuf || !Shuf->getOperand(1).isUndef())
17148 return SDValue();
17149
17150 // Check all elements are used once in the mask.
17151 ArrayRef<int> Mask = Shuf->getMask();
17152 APInt SetElts(Mask.size(), 0);
17153 for (int E : Mask) {
17154 if (E < 0 || E >= (int)Mask.size())
17155 return SDValue();
17156 SetElts.setBit(E);
17157 }
17158 if (!SetElts.isAllOnes())
17159 return SDValue();
17160
17161 if (N->getNumOperands() != VecOp + 1) {
17162 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17163 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17164 return SDValue();
17165 }
17166
17168 for (SDValue Op : N->ops()) {
17169 if (Op.getValueType().isVector())
17170 Ops.push_back(Op.getOperand(0));
17171 else
17172 Ops.push_back(Op);
17173 }
17174 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17175}
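// For example (mask chosen for illustration), because the add reduction does
// not care about lane order,
//   vaddv(shuffle %x, undef, <3, 2, 1, 0>)
// is replaced by vaddv(%x), provided every lane index appears exactly once in
// the mask, as checked above.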
17176
17179 SDValue Op0 = N->getOperand(0);
17180 SDValue Op1 = N->getOperand(1);
17181 unsigned IsTop = N->getConstantOperandVal(2);
17182
17183 // VMOVNT a undef -> a
17184 // VMOVNB a undef -> a
17185 // VMOVNB undef a -> a
17186 if (Op1->isUndef())
17187 return Op0;
17188 if (Op0->isUndef() && !IsTop)
17189 return Op1;
17190
17191 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17192 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17193 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17194 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17195 Op1->getConstantOperandVal(2) == 0)
17196 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17197 Op0, Op1->getOperand(1), N->getOperand(2));
17198
17199 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17200 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17201 // into the top or bottom lanes.
17202 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17203 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17204 APInt Op0DemandedElts =
17205 IsTop ? Op1DemandedElts
17206 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17207
17208 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17209 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17210 return SDValue(N, 0);
17211 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17212 return SDValue(N, 0);
17213
17214 return SDValue();
17215}
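// Worked instance of the demanded-lane masks above for a v8i16 VMOVN: NumElts
// is 8, so Op1DemandedElts is 0b01010101 (lane 0 in the least significant bit,
// i.e. only the even lanes of Qm are read), and Op0DemandedElts is either that
// same mask or 0b10101010 depending on whether the insert targets the top or
// bottom lanes.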
17216
17219 SDValue Op0 = N->getOperand(0);
17220 unsigned IsTop = N->getConstantOperandVal(2);
17221
17222 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17223 APInt Op0DemandedElts =
17224 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17225 : APInt::getHighBitsSet(2, 1));
17226
17227 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17228 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17229 return SDValue(N, 0);
17230 return SDValue();
17231}
17232
17235 EVT VT = N->getValueType(0);
17236 SDValue LHS = N->getOperand(0);
17237 SDValue RHS = N->getOperand(1);
17238
17239 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17240 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17241 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17242 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17243 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17244 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17245 SDLoc DL(N);
17246 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17247 LHS.getOperand(0), RHS.getOperand(0));
17248 SDValue UndefV = LHS.getOperand(1);
17249 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17250 }
17251 return SDValue();
17252}
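// For example (operands illustrative), when both inputs come through the same
// single-source shuffle:
//   VQDMULH(shuffle(a, undef, M), shuffle(b, undef, M))
// is rebuilt as
//   shuffle(VQDMULH(a, b), undef, M)
// so the multiply operates on the unshuffled vectors and the remaining shuffle
// can often fold away.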
17253
17255 SDLoc DL(N);
17256 SDValue Op0 = N->getOperand(0);
17257 SDValue Op1 = N->getOperand(1);
17258
17259 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17260 // uses of the intrinsics.
17261 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17262 int ShiftAmt = C->getSExtValue();
17263 if (ShiftAmt == 0) {
17264 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17265 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17266 return SDValue();
17267 }
17268
17269 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17270 unsigned NewOpcode =
17271 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17272 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17273 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17274 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17275 return NewShift;
17276 }
17277 }
17278
17279 return SDValue();
17280}
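// For instance (constants illustrative), a long shift left by a negative
// amount is just a long shift right:
//   LSLL lo, hi, -4  ==>  LSRL lo, hi, 4
// and a shift amount of 0 simply forwards both halves unchanged.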
17281
17282/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17284 DAGCombinerInfo &DCI) const {
17285 SelectionDAG &DAG = DCI.DAG;
17286 unsigned IntNo = N->getConstantOperandVal(0);
17287 switch (IntNo) {
17288 default:
17289 // Don't do anything for most intrinsics.
17290 break;
17291
17292 // Vector shifts: check for immediate versions and lower them.
17293 // Note: This is done during DAG combining instead of DAG legalizing because
17294 // the build_vectors for 64-bit vector element shift counts are generally
17295 // not legal, and it is hard to see their values after they get legalized to
17296 // loads from a constant pool.
17297 case Intrinsic::arm_neon_vshifts:
17298 case Intrinsic::arm_neon_vshiftu:
17299 case Intrinsic::arm_neon_vrshifts:
17300 case Intrinsic::arm_neon_vrshiftu:
17301 case Intrinsic::arm_neon_vrshiftn:
17302 case Intrinsic::arm_neon_vqshifts:
17303 case Intrinsic::arm_neon_vqshiftu:
17304 case Intrinsic::arm_neon_vqshiftsu:
17305 case Intrinsic::arm_neon_vqshiftns:
17306 case Intrinsic::arm_neon_vqshiftnu:
17307 case Intrinsic::arm_neon_vqshiftnsu:
17308 case Intrinsic::arm_neon_vqrshiftns:
17309 case Intrinsic::arm_neon_vqrshiftnu:
17310 case Intrinsic::arm_neon_vqrshiftnsu: {
17311 EVT VT = N->getOperand(1).getValueType();
17312 int64_t Cnt;
17313 unsigned VShiftOpc = 0;
17314
17315 switch (IntNo) {
17316 case Intrinsic::arm_neon_vshifts:
17317 case Intrinsic::arm_neon_vshiftu:
17318 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17319 VShiftOpc = ARMISD::VSHLIMM;
17320 break;
17321 }
17322 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17323 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17324 : ARMISD::VSHRuIMM);
17325 break;
17326 }
17327 return SDValue();
17328
17329 case Intrinsic::arm_neon_vrshifts:
17330 case Intrinsic::arm_neon_vrshiftu:
17331 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17332 break;
17333 return SDValue();
17334
17335 case Intrinsic::arm_neon_vqshifts:
17336 case Intrinsic::arm_neon_vqshiftu:
17337 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17338 break;
17339 return SDValue();
17340
17341 case Intrinsic::arm_neon_vqshiftsu:
17342 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17343 break;
17344 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17345
17346 case Intrinsic::arm_neon_vrshiftn:
17347 case Intrinsic::arm_neon_vqshiftns:
17348 case Intrinsic::arm_neon_vqshiftnu:
17349 case Intrinsic::arm_neon_vqshiftnsu:
17350 case Intrinsic::arm_neon_vqrshiftns:
17351 case Intrinsic::arm_neon_vqrshiftnu:
17352 case Intrinsic::arm_neon_vqrshiftnsu:
17353 // Narrowing shifts require an immediate right shift.
17354 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17355 break;
17356 llvm_unreachable("invalid shift count for narrowing vector shift "
17357 "intrinsic");
17358
17359 default:
17360 llvm_unreachable("unhandled vector shift");
17361 }
17362
17363 switch (IntNo) {
17364 case Intrinsic::arm_neon_vshifts:
17365 case Intrinsic::arm_neon_vshiftu:
17366 // Opcode already set above.
17367 break;
17368 case Intrinsic::arm_neon_vrshifts:
17369 VShiftOpc = ARMISD::VRSHRsIMM;
17370 break;
17371 case Intrinsic::arm_neon_vrshiftu:
17372 VShiftOpc = ARMISD::VRSHRuIMM;
17373 break;
17374 case Intrinsic::arm_neon_vrshiftn:
17375 VShiftOpc = ARMISD::VRSHRNIMM;
17376 break;
17377 case Intrinsic::arm_neon_vqshifts:
17378 VShiftOpc = ARMISD::VQSHLsIMM;
17379 break;
17380 case Intrinsic::arm_neon_vqshiftu:
17381 VShiftOpc = ARMISD::VQSHLuIMM;
17382 break;
17383 case Intrinsic::arm_neon_vqshiftsu:
17384 VShiftOpc = ARMISD::VQSHLsuIMM;
17385 break;
17386 case Intrinsic::arm_neon_vqshiftns:
17387 VShiftOpc = ARMISD::VQSHRNsIMM;
17388 break;
17389 case Intrinsic::arm_neon_vqshiftnu:
17390 VShiftOpc = ARMISD::VQSHRNuIMM;
17391 break;
17392 case Intrinsic::arm_neon_vqshiftnsu:
17393 VShiftOpc = ARMISD::VQSHRNsuIMM;
17394 break;
17395 case Intrinsic::arm_neon_vqrshiftns:
17396 VShiftOpc = ARMISD::VQRSHRNsIMM;
17397 break;
17398 case Intrinsic::arm_neon_vqrshiftnu:
17399 VShiftOpc = ARMISD::VQRSHRNuIMM;
17400 break;
17401 case Intrinsic::arm_neon_vqrshiftnsu:
17402 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17403 break;
17404 }
17405
17406 SDLoc dl(N);
17407 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17408 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17409 }
17410
17411 case Intrinsic::arm_neon_vshiftins: {
17412 EVT VT = N->getOperand(1).getValueType();
17413 int64_t Cnt;
17414 unsigned VShiftOpc = 0;
17415
17416 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17417 VShiftOpc = ARMISD::VSLIIMM;
17418 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17419 VShiftOpc = ARMISD::VSRIIMM;
17420 else {
17421 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17422 }
17423
17424 SDLoc dl(N);
17425 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17426 N->getOperand(1), N->getOperand(2),
17427 DAG.getConstant(Cnt, dl, MVT::i32));
17428 }
17429
17430 case Intrinsic::arm_neon_vqrshifts:
17431 case Intrinsic::arm_neon_vqrshiftu:
17432 // No immediate versions of these to check for.
17433 break;
17434
17435 case Intrinsic::arm_neon_vbsl: {
17436 SDLoc dl(N);
17437 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17438 N->getOperand(2), N->getOperand(3));
17439 }
17440 case Intrinsic::arm_mve_vqdmlah:
17441 case Intrinsic::arm_mve_vqdmlash:
17442 case Intrinsic::arm_mve_vqrdmlah:
17443 case Intrinsic::arm_mve_vqrdmlash:
17444 case Intrinsic::arm_mve_vmla_n_predicated:
17445 case Intrinsic::arm_mve_vmlas_n_predicated:
17446 case Intrinsic::arm_mve_vqdmlah_predicated:
17447 case Intrinsic::arm_mve_vqdmlash_predicated:
17448 case Intrinsic::arm_mve_vqrdmlah_predicated:
17449 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17450 // These intrinsics all take an i32 scalar operand which is narrowed to the
17451 // size of a single lane of the vector type they return. So we don't need
17452 // any bits of that operand above that point, which allows us to eliminate
17453 // uxth/sxth.
17454 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17455 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17456 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17457 return SDValue();
17458 break;
17459 }
17460
17461 case Intrinsic::arm_mve_minv:
17462 case Intrinsic::arm_mve_maxv:
17463 case Intrinsic::arm_mve_minav:
17464 case Intrinsic::arm_mve_maxav:
17465 case Intrinsic::arm_mve_minv_predicated:
17466 case Intrinsic::arm_mve_maxv_predicated:
17467 case Intrinsic::arm_mve_minav_predicated:
17468 case Intrinsic::arm_mve_maxav_predicated: {
17469 // These intrinsics all take an i32 scalar operand which is narrowed to the
17470 // size of a single lane of the vector type they take as the other input.
17471 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17472 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17473 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17474 return SDValue();
17475 break;
17476 }
17477
17478 case Intrinsic::arm_mve_addv: {
17479 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17480 // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17481 bool Unsigned = N->getConstantOperandVal(2);
17482 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17483 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17484 }
17485
17486 case Intrinsic::arm_mve_addlv:
17487 case Intrinsic::arm_mve_addlv_predicated: {
17488 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17489 // that recombines the two i32 outputs into an i64.
17490 bool Unsigned = N->getConstantOperandVal(2);
17491 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17492 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17493 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17494
17496 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17497 if (i != 2) // skip the unsigned flag
17498 Ops.push_back(N->getOperand(i));
17499
17500 SDLoc dl(N);
17501 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17502 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17503 val.getValue(1));
17504 }
17505 }
17506
17507 return SDValue();
17508}
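// One representative case of the shift handling above (constants chosen for
// illustration): a NEON shift intrinsic with a splatted constant amount, e.g.
//   arm.neon.vshifts(%a, <3, 3, 3, 3>)
// is recognised by isVShiftLImm and lowered straight to ARMISD::VSHLIMM %a, #3
// rather than waiting for the constant build_vector to be legalised into a
// constant-pool load.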
17509
17510/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17511/// lowers them. As with the vector shift intrinsics, this is done during DAG
17512/// combining instead of DAG legalizing because the build_vectors for 64-bit
17513/// vector element shift counts are generally not legal, and it is hard to see
17514/// their values after they get legalized to loads from a constant pool.
17517 const ARMSubtarget *ST) {
17518 SelectionDAG &DAG = DCI.DAG;
17519 EVT VT = N->getValueType(0);
17520
17521 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17522 N->getOperand(0)->getOpcode() == ISD::AND &&
17523 N->getOperand(0)->hasOneUse()) {
17524 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17525 return SDValue();
17526 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17527 // usually show up because instcombine prefers to canonicalize it to
17528 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17529 // out of GEP lowering in some cases.
17530 SDValue N0 = N->getOperand(0);
17531 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17532 if (!ShiftAmtNode)
17533 return SDValue();
17534 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17535 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17536 if (!AndMaskNode)
17537 return SDValue();
17538 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17539 // Don't transform uxtb/uxth.
17540 if (AndMask == 255 || AndMask == 65535)
17541 return SDValue();
17542 if (isMask_32(AndMask)) {
17543 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17544 if (MaskedBits > ShiftAmt) {
17545 SDLoc DL(N);
17546 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17547 DAG.getConstant(MaskedBits, DL, MVT::i32));
17548 return DAG.getNode(
17549 ISD::SRL, DL, MVT::i32, SHL,
17550 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17551 }
17552 }
17553 }
17554
17555 // Nothing to be done for scalar shifts.
17556 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17557 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17558 return SDValue();
17559 if (ST->hasMVEIntegerOps())
17560 return SDValue();
17561
17562 int64_t Cnt;
17563
17564 switch (N->getOpcode()) {
17565 default: llvm_unreachable("unexpected shift opcode");
17566
17567 case ISD::SHL:
17568 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17569 SDLoc dl(N);
17570 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17571 DAG.getConstant(Cnt, dl, MVT::i32));
17572 }
17573 break;
17574
17575 case ISD::SRA:
17576 case ISD::SRL:
17577 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17578 unsigned VShiftOpc =
17579 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17580 SDLoc dl(N);
17581 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17582 DAG.getConstant(Cnt, dl, MVT::i32));
17583 }
17584 }
17585 return SDValue();
17586}
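// A worked instance of the Thumb1 (shl (and x, Mask), ShiftAmt) rewrite above,
// with Mask = 0x3ff and ShiftAmt = 2: countl_zero(0x3ff) is 22, so the pair
// becomes (srl (shl x, 22), 20), two shifts that keep only the ten masked bits
// and leave them shifted left by two, without materialising the mask constant.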
17587
17588// Look for a sign, zero or fp extend of a larger-than-legal load. This can be
17589// split into multiple extending loads, which are simpler to deal with than an
17590// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17591// to convert the type to an f32.
17593 SDValue N0 = N->getOperand(0);
17594 if (N0.getOpcode() != ISD::LOAD)
17595 return SDValue();
17597 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17598 LD->getExtensionType() != ISD::NON_EXTLOAD)
17599 return SDValue();
17600 EVT FromVT = LD->getValueType(0);
17601 EVT ToVT = N->getValueType(0);
17602 if (!ToVT.isVector())
17603 return SDValue();
17605 EVT ToEltVT = ToVT.getVectorElementType();
17606 EVT FromEltVT = FromVT.getVectorElementType();
17607
17608 unsigned NumElements = 0;
17609 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17610 NumElements = 4;
17611 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17612 NumElements = 4;
17613 if (NumElements == 0 ||
17614 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17615 FromVT.getVectorNumElements() % NumElements != 0 ||
17616 !isPowerOf2_32(NumElements))
17617 return SDValue();
17618
17619 LLVMContext &C = *DAG.getContext();
17620 SDLoc DL(LD);
17621 // Details about the old load
17622 SDValue Ch = LD->getChain();
17623 SDValue BasePtr = LD->getBasePtr();
17624 Align Alignment = LD->getBaseAlign();
17625 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17626 AAMDNodes AAInfo = LD->getAAInfo();
17627
17628 ISD::LoadExtType NewExtType =
17629 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17630 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17631 EVT NewFromVT = EVT::getVectorVT(
17632 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17633 EVT NewToVT = EVT::getVectorVT(
17634 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17635
17636 SmallVector<SDValue, 4> Loads;
17637 SmallVector<SDValue, 4> Chains;
17638 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17639 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17640 SDValue NewPtr =
17641 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17642
17643 SDValue NewLoad =
17644 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17645 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17646 Alignment, MMOFlags, AAInfo);
17647 Loads.push_back(NewLoad);
17648 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17649 }
17650
17651 // Float truncs need to be extended with VCVTBs into their floating point types.
17652 if (FromEltVT == MVT::f16) {
17653 SmallVector<SDValue, 4> Extends;
17654
17655 for (unsigned i = 0; i < Loads.size(); i++) {
17656 SDValue LoadBC =
17657 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17658 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17659 DAG.getConstant(0, DL, MVT::i32));
17660 Extends.push_back(FPExt);
17661 }
17662
17663 Loads = Extends;
17664 }
17665
17666 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17667 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17668 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17669}
17670
17671/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17672/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17673static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17674 const ARMSubtarget *ST) {
17675 SDValue N0 = N->getOperand(0);
17676
17677 // Check for sign- and zero-extensions of vector extract operations of 8- and
17678 // 16-bit vector elements. NEON and MVE support these directly. They are
17679 // handled during DAG combining because type legalization will promote them
17680 // to 32-bit types and it is messy to recognize the operations after that.
17681 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17682 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17683 SDValue Vec = N0.getOperand(0);
17684 SDValue Lane = N0.getOperand(1);
17685 EVT VT = N->getValueType(0);
17686 EVT EltVT = N0.getValueType();
17687 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17688
17689 if (VT == MVT::i32 &&
17690 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17691 TLI.isTypeLegal(Vec.getValueType()) &&
17692 isa<ConstantSDNode>(Lane)) {
17693
17694 unsigned Opc = 0;
17695 switch (N->getOpcode()) {
17696 default: llvm_unreachable("unexpected opcode");
17697 case ISD::SIGN_EXTEND:
17698 Opc = ARMISD::VGETLANEs;
17699 break;
17700 case ISD::ZERO_EXTEND:
17701 case ISD::ANY_EXTEND:
17702 Opc = ARMISD::VGETLANEu;
17703 break;
17704 }
17705 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17706 }
17707 }
17708
17709 if (ST->hasMVEIntegerOps())
17710 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17711 return NewLoad;
17712
17713 return SDValue();
17714}
17715
17716static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17717 const ARMSubtarget *ST) {
17718 if (ST->hasMVEFloatOps())
17719 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17720 return NewLoad;
17721
17722 return SDValue();
17723}
17724
17725// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17726// constant bounds.
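// Worked examples (illustrative, not from the original source):
//   smin(smax(x, -128), 127) has MinC == ~MaxC and becomes an ARMISD::SSAT;
//   smin(smax(x, 0), 255) has MaxC == 0 and becomes an ARMISD::USAT.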
17727static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17728 const ARMSubtarget *Subtarget) {
17729 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17730 !Subtarget->isThumb2())
17731 return SDValue();
17732
17733 EVT VT = Op.getValueType();
17734 SDValue Op0 = Op.getOperand(0);
17735
17736 if (VT != MVT::i32 ||
17737 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17738 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17739 !isa<ConstantSDNode>(Op0.getOperand(1)))
17740 return SDValue();
17741
17742 SDValue Min = Op;
17743 SDValue Max = Op0;
17744 SDValue Input = Op0.getOperand(0);
17745 if (Min.getOpcode() == ISD::SMAX)
17746 std::swap(Min, Max);
17747
17748 APInt MinC = Min.getConstantOperandAPInt(1);
17749 APInt MaxC = Max.getConstantOperandAPInt(1);
17750
17751 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17752 !(MinC + 1).isPowerOf2())
17753 return SDValue();
17754
17755 SDLoc DL(Op);
17756 if (MinC == ~MaxC)
17757 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17758 DAG.getConstant(MinC.countr_one(), DL, VT));
17759 if (MaxC == 0)
17760 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17761 DAG.getConstant(MinC.countr_one(), DL, VT));
17762
17763 return SDValue();
17764}
17765
17766/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17767/// saturates.
17768static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
17769 const ARMSubtarget *ST) {
17770 EVT VT = N->getValueType(0);
17771 SDValue N0 = N->getOperand(0);
17772
17773 if (VT == MVT::i32)
17774 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17775
17776 if (!ST->hasMVEIntegerOps())
17777 return SDValue();
17778
17779 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17780 return V;
17781
17782 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17783 return SDValue();
17784
17785 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17786 // Check one is a smin and the other is a smax
17787 if (Min->getOpcode() != ISD::SMIN)
17788 std::swap(Min, Max);
17789 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17790 return false;
17791
17792 APInt SaturateC;
17793 if (VT == MVT::v4i32)
17794 SaturateC = APInt(32, (1 << 15) - 1, true);
17795 else //if (VT == MVT::v8i16)
17796 SaturateC = APInt(16, (1 << 7) - 1, true);
17797
17798 APInt MinC, MaxC;
17799 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17800 MinC != SaturateC)
17801 return false;
17802 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
17803 MaxC != ~SaturateC)
17804 return false;
17805 return true;
17806 };
17807
17808 if (IsSignedSaturate(N, N0.getNode())) {
17809 SDLoc DL(N);
17810 MVT ExtVT, HalfVT;
17811 if (VT == MVT::v4i32) {
17812 HalfVT = MVT::v8i16;
17813 ExtVT = MVT::v4i16;
17814 } else { // if (VT == MVT::v8i16)
17815 HalfVT = MVT::v16i8;
17816 ExtVT = MVT::v8i8;
17817 }
17818
17819 // Create a VQMOVNB with undef top lanes, then sign extend into the top
17820 // half. That extend will hopefully be removed if only the bottom bits are
17821 // demanded (through a truncating store, for example).
17822 SDValue VQMOVN =
17823 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
17824 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
17825 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17826 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
17827 DAG.getValueType(ExtVT));
17828 }
17829
17830 auto IsUnsignedSaturate = [&](SDNode *Min) {
17831 // For unsigned, we just need to check for <= 0xffff
17832 if (Min->getOpcode() != ISD::UMIN)
17833 return false;
17834
17835 APInt SaturateC;
17836 if (VT == MVT::v4i32)
17837 SaturateC = APInt(32, (1 << 16) - 1, true);
17838 else //if (VT == MVT::v8i16)
17839 SaturateC = APInt(16, (1 << 8) - 1, true);
17840
17841 APInt MinC;
17842 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17843 MinC != SaturateC)
17844 return false;
17845 return true;
17846 };
17847
17848 if (IsUnsignedSaturate(N)) {
17849 SDLoc DL(N);
17850 MVT HalfVT;
17851 unsigned ExtConst;
17852 if (VT == MVT::v4i32) {
17853 HalfVT = MVT::v8i16;
17854 ExtConst = 0x0000FFFF;
17855 } else { //if (VT == MVT::v8i16)
17856 HalfVT = MVT::v16i8;
17857 ExtConst = 0x00FF;
17858 }
17859
17860 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
17861 // an AND. That extend will hopefully be removed if only the bottom bits are
17862 // demanded (through a truncating store, for example).
17863 SDValue VQMOVN =
17864 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
17865 DAG.getConstant(0, DL, MVT::i32));
17866 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17867 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
17868 DAG.getConstant(ExtConst, DL, VT));
17869 }
17870
17871 return SDValue();
17872}
17873
17874static const APInt *isPowerOf2Constant(SDValue V) {
17875 const auto *C = dyn_cast<ConstantSDNode>(V);
17876 if (!C)
17877 return nullptr;
17878 const APInt *CV = &C->getAPIntValue();
17879 return CV->isPowerOf2() ? CV : nullptr;
17880}
17881
17882SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
17883 // If we have a CMOV, OR and AND combination such as:
17884 // if (x & CN)
17885 // y |= CM;
17886 //
17887 // And:
17888 // * CN is a single bit;
17889 // * All bits covered by CM are known zero in y
17890 //
17891 // Then we can convert this into a sequence of BFI instructions. This will
17892 // always be a win if CM is a single bit, will always be no worse than the
17893 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
17894 // three bits (due to the extra IT instruction).
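  // Worked example (illustrative, not from the original source): for
  //   if (x & 4) y |= 0x30;
  // CN is 4 (bit 2 of x) and CM is 0x30, so if bits 4 and 5 of y are known to
  // be zero this becomes an SRL of x by 2 followed by two BFIs into y.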
17895
17896 SDValue Op0 = CMOV->getOperand(0);
17897 SDValue Op1 = CMOV->getOperand(1);
17898 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
17899 SDValue CmpZ = CMOV->getOperand(3);
17900
17901 // The compare must be against zero.
17902 if (!isNullConstant(CmpZ->getOperand(1)))
17903 return SDValue();
17904
17905 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
17906 SDValue And = CmpZ->getOperand(0);
17907 if (And->getOpcode() != ISD::AND)
17908 return SDValue();
17909 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
17910 if (!AndC)
17911 return SDValue();
17912 SDValue X = And->getOperand(0);
17913
17914 if (CC == ARMCC::EQ) {
17915 // We're performing an "equal to zero" compare. Swap the operands so we
17916 // canonicalize on a "not equal to zero" compare.
17917 std::swap(Op0, Op1);
17918 } else {
17919 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
17920 }
17921
17922 if (Op1->getOpcode() != ISD::OR)
17923 return SDValue();
17924
17925 ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
17926 if (!OrC)
17927 return SDValue();
17928 SDValue Y = Op1->getOperand(0);
17929
17930 if (Op0 != Y)
17931 return SDValue();
17932
17933 // Now, is it profitable to continue?
17934 APInt OrCI = OrC->getAPIntValue();
17935 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
17936 if (OrCI.popcount() > Heuristic)
17937 return SDValue();
17938
17939 // Lastly, can we determine that the bits defined by OrCI
17940 // are zero in Y?
17941 KnownBits Known = DAG.computeKnownBits(Y);
17942 if ((OrCI & Known.Zero) != OrCI)
17943 return SDValue();
17944
17945 // OK, we can do the combine.
17946 SDValue V = Y;
17947 SDLoc dl(X);
17948 EVT VT = X.getValueType();
17949 unsigned BitInX = AndC->logBase2();
17950
17951 if (BitInX != 0) {
17952 // We must shift X first.
17953 X = DAG.getNode(ISD::SRL, dl, VT, X,
17954 DAG.getConstant(BitInX, dl, VT));
17955 }
17956
17957 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
17958 BitInY < NumActiveBits; ++BitInY) {
17959 if (OrCI[BitInY] == 0)
17960 continue;
17961 APInt Mask(VT.getSizeInBits(), 0);
17962 Mask.setBit(BitInY);
17963 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
17964 // Confusingly, the operand is an *inverted* mask.
17965 DAG.getConstant(~Mask, dl, VT));
17966 }
17967
17968 return V;
17969}
17970
17971// Given N, the value controlling the conditional branch, search for the loop
17972// intrinsic, returning it, along with how the value is used. We need to handle
17973// patterns such as the following:
17974// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
17975// (brcond (setcc (loop.decrement), 0, eq), exit)
17976// (brcond (setcc (loop.decrement), 0, ne), header)
17977static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
17978 bool &Negate) {
17979 switch (N->getOpcode()) {
17980 default:
17981 break;
17982 case ISD::XOR: {
17983 if (!isa<ConstantSDNode>(N.getOperand(1)))
17984 return SDValue();
17985 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
17986 return SDValue();
17987 Negate = !Negate;
17988 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
17989 }
17990 case ISD::SETCC: {
17991 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
17992 if (!Const)
17993 return SDValue();
17994 if (Const->isZero())
17995 Imm = 0;
17996 else if (Const->isOne())
17997 Imm = 1;
17998 else
17999 return SDValue();
18000 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18001 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18002 }
18003 case ISD::INTRINSIC_W_CHAIN: {
18004 unsigned IntOp = N.getConstantOperandVal(1);
18005 if (IntOp != Intrinsic::test_start_loop_iterations &&
18006 IntOp != Intrinsic::loop_decrement_reg)
18007 return SDValue();
18008 return N;
18009 }
18010 }
18011 return SDValue();
18012}
18013
18014static SDValue PerformHWLoopCombine(SDNode *N,
18015 TargetLowering::DAGCombinerInfo &DCI,
18016 const ARMSubtarget *ST) {
18017
18018 // The hwloop intrinsics that we're interested in are used for control flow,
18019 // either for entering or exiting the loop:
18020 // - test.start.loop.iterations will test whether its operand is zero. If it
18021 // is zero, the following branch should not enter the loop.
18022 // - loop.decrement.reg also tests whether its operand is zero. If it is
18023 // zero, the following branch should not branch back to the beginning of
18024 // the loop.
18025 // So here, we need to check how the brcond is using the result of each
18026 // of the intrinsics to ensure that we're branching to the right place at the
18027 // right time.
18028
18029 ISD::CondCode CC;
18030 SDValue Cond;
18031 int Imm = 1;
18032 bool Negate = false;
18033 SDValue Chain = N->getOperand(0);
18034 SDValue Dest;
18035
18036 if (N->getOpcode() == ISD::BRCOND) {
18037 CC = ISD::SETEQ;
18038 Cond = N->getOperand(1);
18039 Dest = N->getOperand(2);
18040 } else {
18041 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18042 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18043 Cond = N->getOperand(2);
18044 Dest = N->getOperand(4);
18045 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18046 if (!Const->isOne() && !Const->isZero())
18047 return SDValue();
18048 Imm = Const->getZExtValue();
18049 } else
18050 return SDValue();
18051 }
18052
18053 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18054 if (!Int)
18055 return SDValue();
18056
18057 if (Negate)
18058 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18059
18060 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18061 return (CC == ISD::SETEQ && Imm == 0) ||
18062 (CC == ISD::SETNE && Imm == 1) ||
18063 (CC == ISD::SETLT && Imm == 1) ||
18064 (CC == ISD::SETULT && Imm == 1);
18065 };
18066
18067 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18068 return (CC == ISD::SETEQ && Imm == 1) ||
18069 (CC == ISD::SETNE && Imm == 0) ||
18070 (CC == ISD::SETGT && Imm == 0) ||
18071 (CC == ISD::SETUGT && Imm == 0) ||
18072 (CC == ISD::SETGE && Imm == 1) ||
18073 (CC == ISD::SETUGE && Imm == 1);
18074 };
18075
18076 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18077 "unsupported condition");
18078
18079 SDLoc dl(Int);
18080 SelectionDAG &DAG = DCI.DAG;
18081 SDValue Elements = Int.getOperand(2);
18082 unsigned IntOp = Int->getConstantOperandVal(1);
18083 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18084 "expected single br user");
18085 SDNode *Br = *N->user_begin();
18086 SDValue OtherTarget = Br->getOperand(1);
18087
18088 // Update the unconditional branch to branch to the given Dest.
18089 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18090 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18091 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18092 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18093 };
18094
18095 if (IntOp == Intrinsic::test_start_loop_iterations) {
18096 SDValue Res;
18097 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18098 // We expect this 'instruction' to branch when the counter is zero.
18099 if (IsTrueIfZero(CC, Imm)) {
18100 SDValue Ops[] = {Chain, Setup, Dest};
18101 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18102 } else {
18103 // The logic is the reverse of what we need for WLS, so find the other
18104 // basic block target: the target of the following br.
18105 UpdateUncondBr(Br, Dest, DAG);
18106
18107 SDValue Ops[] = {Chain, Setup, OtherTarget};
18108 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18109 }
18110 // Update LR count to the new value
18111 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18112 // Update chain
18113 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18114 return Res;
18115 } else {
18116 SDValue Size =
18117 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18118 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18119 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18120 DAG.getVTList(MVT::i32, MVT::Other), Args);
18121 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18122
18123 // We expect this instruction to branch when the count is not zero.
18124 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18125
18126 // Update the unconditional branch to target the loop preheader if we've
18127 // found the condition has been reversed.
18128 if (Target == OtherTarget)
18129 UpdateUncondBr(Br, Dest, DAG);
18130
18131 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18132 SDValue(LoopDec.getNode(), 1), Chain);
18133
18134 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18135 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18136 }
18137 return SDValue();
18138}
18139
18140/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18141SDValue
18142ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18143 SDValue Cmp = N->getOperand(3);
18144 if (Cmp.getOpcode() != ARMISD::CMPZ)
18145 // Only looking at NE cases.
18146 return SDValue();
18147
18148 SDLoc dl(N);
18149 SDValue LHS = Cmp.getOperand(0);
18150 SDValue RHS = Cmp.getOperand(1);
18151 SDValue Chain = N->getOperand(0);
18152 SDValue BB = N->getOperand(1);
18153 SDValue ARMcc = N->getOperand(2);
18155
18156 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18157 // -> (brcond Chain BB CC Flags)
18158 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18159 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18160 LHS->getOperand(0)->hasOneUse() &&
18161 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18162 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18163 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18164 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18165 LHS->getOperand(0)->getOperand(2),
18166 LHS->getOperand(0)->getOperand(3));
18167 }
18168
18169 return SDValue();
18170}
18171
18172/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18173SDValue
18174ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18175 SDValue Cmp = N->getOperand(3);
18176 if (Cmp.getOpcode() != ARMISD::CMPZ)
18177 // Only looking at EQ and NE cases.
18178 return SDValue();
18179
18180 EVT VT = N->getValueType(0);
18181 SDLoc dl(N);
18182 SDValue LHS = Cmp.getOperand(0);
18183 SDValue RHS = Cmp.getOperand(1);
18184 SDValue FalseVal = N->getOperand(0);
18185 SDValue TrueVal = N->getOperand(1);
18186 SDValue ARMcc = N->getOperand(2);
18188
18189 // BFI is only available on V6T2+.
18190 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18191 SDValue R = PerformCMOVToBFICombine(N, DAG);
18192 if (R)
18193 return R;
18194 }
18195
18196 // Simplify
18197 // mov r1, r0
18198 // cmp r1, x
18199 // mov r0, y
18200 // moveq r0, x
18201 // to
18202 // cmp r0, x
18203 // movne r0, y
18204 //
18205 // mov r1, r0
18206 // cmp r1, x
18207 // mov r0, x
18208 // movne r0, y
18209 // to
18210 // cmp r0, x
18211 // movne r0, y
18212 /// FIXME: Turn this into a target neutral optimization?
18213 SDValue Res;
18214 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18215 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18216 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18217 SDValue ARMcc;
18218 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18219 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18220 }
18221
18222 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18223 // -> (cmov F T CC Flags)
18224 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18225 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18226 isNullConstant(RHS)) {
18227 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18228 LHS->getOperand(2), LHS->getOperand(3));
18229 }
18230
18231 if (!VT.isInteger())
18232 return SDValue();
18233
18234 // Fold away an unnecessary CMPZ/CMOV
18235 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18236 // if C1==EQ -> CMOV A, B, C2, D
18237 // if C1==NE -> CMOV A, B, NOT(C2), D
18238 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18239 N->getConstantOperandVal(2) == ARMCC::NE) {
18240 ARMCC::CondCodes Cond;
18241 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18242 if (N->getConstantOperandVal(2) == ARMCC::NE)
18243 Cond = ARMCC::getOppositeCondition(Cond);
18244 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18245 N->getOperand(1),
18246 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18247 }
18248 }
18249
18250 // Materialize a boolean comparison for integers so we can avoid branching.
18251 if (isNullConstant(FalseVal)) {
18252 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18253 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18254 // If x == y then x - y == 0 and ARM's CLZ will return 32; shifting that
18255 // right by 5 bits turns the 32 into 1, otherwise the result is 0.
18256 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18257 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18258 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18259 DAG.getConstant(5, dl, MVT::i32));
18260 } else {
18261 // CMOV 0, 1, ==, (CMPZ x, y) ->
18262 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18263 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18264 //
18265 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18266 // x != y. In other words, a carry C == 1 when x == y, C == 0
18267 // otherwise.
18268 // The final UADDO_CARRY computes
18269 // x - y + (0 - (x - y)) + C == C
18270 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18271 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18272 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18273 // ISD::USUBO_CARRY returns a borrow, but we actually want the carry
18274 // here.
18275 SDValue Carry =
18276 DAG.getNode(ISD::SUB, dl, MVT::i32,
18277 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18278 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18279 }
18280 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18281 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18282 // This seems pointless but will allow us to combine it further below.
18283 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18284 SDValue Sub =
18285 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18286 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18287 Sub.getValue(1));
18288 FalseVal = Sub;
18289 }
18290 } else if (isNullConstant(TrueVal)) {
18291 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18292 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18293 // This seems pointless but will allow us to combine it further below
18294 // Note that we change == for != as this is the dual for the case above.
18295 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18296 SDValue Sub =
18297 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18298 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18299 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18300 Sub.getValue(1));
18301 FalseVal = Sub;
18302 }
18303 }
18304
18305 // On Thumb1, the DAG above may be further combined if z is a power of 2
18306 // (z == 2 ^ K).
18307 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18308 // t1 = (USUBO (SUB x, y), 1)
18309 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18310 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18311 //
18312 // This also handles the special case of comparing against zero; it's
18313 // essentially, the same pattern, except there's no SUBC:
18314 // CMOV x, z, !=, (CMPZ x, 0) ->
18315 // t1 = (USUBO x, 1)
18316 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18317 // Result = if K != 0 then (SHL t2:0, K) else t2:0
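  // Worked example (illustrative, not from the original source): with z == 4
  // (K == 2), the CMOV becomes the USUBO / USUBO_CARRY pair above followed by
  // a left shift of the carry result by 2.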
18318 const APInt *TrueConst;
18319 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18320 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18321 FalseVal.getOperand(1) == RHS) ||
18322 (FalseVal == LHS && isNullConstant(RHS))) &&
18323 (TrueConst = isPowerOf2Constant(TrueVal))) {
18324 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18325 unsigned ShiftAmount = TrueConst->logBase2();
18326 if (ShiftAmount)
18327 TrueVal = DAG.getConstant(1, dl, VT);
18328 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18329 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18330 Subc.getValue(1));
18331
18332 if (ShiftAmount)
18333 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18334 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18335 }
18336
18337 if (Res.getNode()) {
18338 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18339 // Capture demanded bits information that would otherwise be lost.
18340 if (Known.Zero == 0xfffffffe)
18341 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18342 DAG.getValueType(MVT::i1));
18343 else if (Known.Zero == 0xffffff00)
18344 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18345 DAG.getValueType(MVT::i8));
18346 else if (Known.Zero == 0xffff0000)
18347 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18348 DAG.getValueType(MVT::i16));
18349 }
18350
18351 return Res;
18352}
18353
18354static SDValue PerformBITCASTCombine(SDNode *N,
18355 TargetLowering::DAGCombinerInfo &DCI,
18356 const ARMSubtarget *ST) {
18357 SelectionDAG &DAG = DCI.DAG;
18358 SDValue Src = N->getOperand(0);
18359 EVT DstVT = N->getValueType(0);
18360
18361 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18362 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18363 EVT SrcVT = Src.getValueType();
18364 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18365 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18366 }
18367
18368 // We may have a bitcast of something that has already had this bitcast
18369 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18370 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18371 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18372 Src.getValueType().getScalarSizeInBits())
18373 Src = Src.getOperand(0);
18374
18375 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18376 // would be generated is at least the width of the element type.
18377 EVT SrcVT = Src.getValueType();
18378 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18379 Src.getOpcode() == ARMISD::VMVNIMM ||
18380 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18381 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18382 DAG.getDataLayout().isBigEndian())
18383 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18384
18385 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18386 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18387 return R;
18388
18389 return SDValue();
18390}
18391
18392// Some combines for the MVETrunc truncation legalizer helper. Also lowers the
18393// node into stack operations after legalizeOps.
18394static SDValue PerformMVETruncCombine(SDNode *N,
18395 TargetLowering::DAGCombinerInfo &DCI) {
18396 SelectionDAG &DAG = DCI.DAG;
18397 EVT VT = N->getValueType(0);
18398 SDLoc DL(N);
18399
18400 // MVETrunc(Undef, Undef) -> Undef
18401 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18402 return DAG.getUNDEF(VT);
18403
18404 // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
18405 if (N->getNumOperands() == 2 &&
18406 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18407 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18408 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18409 N->getOperand(0).getOperand(1),
18410 N->getOperand(1).getOperand(0),
18411 N->getOperand(1).getOperand(1));
18412
18413 // MVETrunc(shuffle, shuffle) -> VMOVN
18414 if (N->getNumOperands() == 2 &&
18415 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18416 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18417 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18418 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18419
18420 if (S0->getOperand(0) == S1->getOperand(0) &&
18421 S0->getOperand(1) == S1->getOperand(1)) {
18422 // Construct complete shuffle mask
18423 SmallVector<int, 8> Mask(S0->getMask());
18424 Mask.append(S1->getMask().begin(), S1->getMask().end());
18425
18426 if (isVMOVNTruncMask(Mask, VT, false))
18427 return DAG.getNode(
18428 ARMISD::VMOVN, DL, VT,
18429 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18430 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18431 DAG.getConstant(1, DL, MVT::i32));
18432 if (isVMOVNTruncMask(Mask, VT, true))
18433 return DAG.getNode(
18434 ARMISD::VMOVN, DL, VT,
18435 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18436 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18437 DAG.getConstant(1, DL, MVT::i32));
18438 }
18439 }
18440
18441 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18442 // truncate to a buildvector to allow the generic optimisations to kick in.
18443 if (all_of(N->ops(), [](SDValue Op) {
18444 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18445 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18446 (Op.getOpcode() == ISD::BITCAST &&
18447 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18448 })) {
18449 SmallVector<SDValue, 8> Extracts;
18450 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18451 SDValue O = N->getOperand(Op);
18452 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18453 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18454 DAG.getConstant(i, DL, MVT::i32));
18455 Extracts.push_back(Ext);
18456 }
18457 }
18458 return DAG.getBuildVector(VT, DL, Extracts);
18459 }
18460
18461 // If we are late in the legalization process and nothing has optimised
18462 // the trunc to anything better, lower it to a stack store and reload,
18463 // performing the truncation whilst keeping the lanes in the correct order:
18464 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
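  // Worked example (illustrative, not from the original source): an MVETRUNC
  // of two v4i32 operands to v8i16 truncating-stores each operand as v4i16 at
  // stack offsets 0 and 8, then reloads the whole slot as a single v8i16.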
18465 if (!DCI.isAfterLegalizeDAG())
18466 return SDValue();
18467
18468 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18469 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18470 int NumIns = N->getNumOperands();
18471 assert((NumIns == 2 || NumIns == 4) &&
18472 "Expected 2 or 4 inputs to an MVETrunc");
18473 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18474 if (N->getNumOperands() == 4)
18475 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18476
18477 SmallVector<SDValue> Chains;
18478 for (int I = 0; I < NumIns; I++) {
18479 SDValue Ptr = DAG.getNode(
18480 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18481 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18482 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18483 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18484 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18485 Ptr, MPI, StoreVT, Align(4));
18486 Chains.push_back(Ch);
18487 }
18488
18489 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18490 MachinePointerInfo MPI =
18491 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18492 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18493}
18494
18495// Take an MVEEXT(load x) and split that into (extload x, extload x+8)
18496static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18497 SelectionDAG &DAG) {
18498 SDValue N0 = N->getOperand(0);
18499 LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18500 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18501 return SDValue();
18502
18503 EVT FromVT = LD->getMemoryVT();
18504 EVT ToVT = N->getValueType(0);
18505 if (!ToVT.isVector())
18506 return SDValue();
18507 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18508 EVT ToEltVT = ToVT.getVectorElementType();
18509 EVT FromEltVT = FromVT.getVectorElementType();
18510
18511 unsigned NumElements = 0;
18512 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18513 NumElements = 4;
18514 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18515 NumElements = 8;
18516 assert(NumElements != 0);
18517
18518 ISD::LoadExtType NewExtType =
18519 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18520 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18521 LD->getExtensionType() != ISD::EXTLOAD &&
18522 LD->getExtensionType() != NewExtType)
18523 return SDValue();
18524
18525 LLVMContext &C = *DAG.getContext();
18526 SDLoc DL(LD);
18527 // Details about the old load
18528 SDValue Ch = LD->getChain();
18529 SDValue BasePtr = LD->getBasePtr();
18530 Align Alignment = LD->getBaseAlign();
18531 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18532 AAMDNodes AAInfo = LD->getAAInfo();
18533
18534 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18535 EVT NewFromVT = EVT::getVectorVT(
18536 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18537 EVT NewToVT = EVT::getVectorVT(
18538 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18539
18540 SmallVector<SDValue, 4> Loads;
18541 SmallVector<SDValue, 4> Chains;
18542 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18543 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18544 SDValue NewPtr =
18545 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18546
18547 SDValue NewLoad =
18548 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18549 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18550 Alignment, MMOFlags, AAInfo);
18551 Loads.push_back(NewLoad);
18552 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18553 }
18554
18555 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18556 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18557 return DAG.getMergeValues(Loads, DL);
18558}
18559
18560// Perform combines for MVEEXT. If it has not been optimized to anything better
18561// before lowering, it gets converted to stack store and extloads performing the
18562// extend whilst still keeping the same lane ordering.
18563static SDValue PerformMVEExtCombine(SDNode *N,
18564 TargetLowering::DAGCombinerInfo &DCI) {
18565 SelectionDAG &DAG = DCI.DAG;
18566 EVT VT = N->getValueType(0);
18567 SDLoc DL(N);
18568 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18569 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18570
18571 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18572 *DAG.getContext());
18573 auto Extend = [&](SDValue V) {
18574 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18575 return N->getOpcode() == ARMISD::MVESEXT
18576 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18577 DAG.getValueType(ExtVT))
18578 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18579 };
18580
18581 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18582 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18583 SDValue Ext = Extend(N->getOperand(0));
18584 return DAG.getMergeValues({Ext, Ext}, DL);
18585 }
18586
18587 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18588 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18589 ArrayRef<int> Mask = SVN->getMask();
18590 assert(Mask.size() == 2 * VT.getVectorNumElements());
18591 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18592 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18593 SDValue Op0 = SVN->getOperand(0);
18594 SDValue Op1 = SVN->getOperand(1);
18595
18596 auto CheckInregMask = [&](int Start, int Offset) {
18597 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18598 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18599 return false;
18600 return true;
18601 };
18602 SDValue V0 = SDValue(N, 0);
18603 SDValue V1 = SDValue(N, 1);
18604 if (CheckInregMask(0, 0))
18605 V0 = Extend(Op0);
18606 else if (CheckInregMask(0, 1))
18607 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18608 else if (CheckInregMask(0, Mask.size()))
18609 V0 = Extend(Op1);
18610 else if (CheckInregMask(0, Mask.size() + 1))
18611 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18612
18613 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18614 V1 = Extend(Op1);
18615 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18616 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18617 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18618 V1 = Extend(Op0);
18619 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18620 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18621
18622 if (V0.getNode() != N || V1.getNode() != N)
18623 return DAG.getMergeValues({V0, V1}, DL);
18624 }
18625
18626 // MVEEXT(load) -> extload, extload
18627 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18628 if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18629 return L;
18630
18631 if (!DCI.isAfterLegalizeDAG())
18632 return SDValue();
18633
18634 // Lower to a stack store and reload:
18635 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
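  // Worked example (illustrative, not from the original source): an MVESEXT of
  // one v8i16 operand to two v4i32 results stores the v8i16 to the stack, then
  // performs two v4i16 -> v4i32 sextloads at offsets 0 and 8.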
18636 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18637 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18638 int NumOuts = N->getNumValues();
18639 assert((NumOuts == 2 || NumOuts == 4) &&
18640 "Expected 2 or 4 outputs to an MVEEXT");
18641 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18642 *DAG.getContext());
18643 if (N->getNumOperands() == 4)
18644 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18645
18646 MachinePointerInfo MPI =
18647 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18648 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18649 StackPtr, MPI, Align(4));
18650
18651 SmallVector<SDValue> Loads;
18652 for (int I = 0; I < NumOuts; I++) {
18653 SDValue Ptr = DAG.getNode(
18654 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18655 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18656 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18657 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18658 SDValue Load = DAG.getExtLoad(
18659 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18660 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18661 Loads.push_back(Load);
18662 }
18663
18664 return DAG.getMergeValues(Loads, DL);
18665}
18666
18667SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18668 DAGCombinerInfo &DCI) const {
18669 switch (N->getOpcode()) {
18670 default: break;
18671 case ISD::SELECT_CC:
18672 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18673 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18674 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18675 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18676 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18677 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18678 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18679 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18680 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18681 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18682 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18683 case ISD::BRCOND:
18684 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18685 case ARMISD::ADDC:
18686 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18687 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18688 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18689 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18690 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18691 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18692 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18693 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18694 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18697 return PerformExtractEltCombine(N, DCI, Subtarget);
18701 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18702 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18703 case ISD::FP_TO_SINT:
18704 case ISD::FP_TO_UINT:
18705 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18706 case ISD::FADD:
18707 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18708 case ISD::FMUL:
18709 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18710 case ISD::INTRINSIC_WO_CHAIN:
18711 return PerformIntrinsicCombine(N, DCI);
18712 case ISD::SHL:
18713 case ISD::SRA:
18714 case ISD::SRL:
18715 return PerformShiftCombine(N, DCI, Subtarget);
18716 case ISD::SIGN_EXTEND:
18717 case ISD::ZERO_EXTEND:
18718 case ISD::ANY_EXTEND:
18719 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18720 case ISD::FP_EXTEND:
18721 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18722 case ISD::SMIN:
18723 case ISD::UMIN:
18724 case ISD::SMAX:
18725 case ISD::UMAX:
18726 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18727 case ARMISD::CMOV:
18728 return PerformCMOVCombine(N, DCI.DAG);
18729 case ARMISD::BRCOND:
18730 return PerformBRCONDCombine(N, DCI.DAG);
18731 case ARMISD::CMPZ:
18732 return PerformCMPZCombine(N, DCI.DAG);
18733 case ARMISD::CSINC:
18734 case ARMISD::CSINV:
18735 case ARMISD::CSNEG:
18736 return PerformCSETCombine(N, DCI.DAG);
18737 case ISD::LOAD:
18738 return PerformLOADCombine(N, DCI, Subtarget);
18739 case ARMISD::VLD1DUP:
18740 case ARMISD::VLD2DUP:
18741 case ARMISD::VLD3DUP:
18742 case ARMISD::VLD4DUP:
18743 return PerformVLDCombine(N, DCI);
18744 case ARMISD::BUILD_VECTOR:
18745 return PerformARMBUILD_VECTORCombine(N, DCI);
18746 case ISD::BITCAST:
18747 return PerformBITCASTCombine(N, DCI, Subtarget);
18748 case ARMISD::PREDICATE_CAST:
18749 return PerformPREDICATE_CASTCombine(N, DCI);
18750 case ARMISD::VECTOR_REG_CAST:
18751 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18752 case ARMISD::MVETRUNC:
18753 return PerformMVETruncCombine(N, DCI);
18754 case ARMISD::MVESEXT:
18755 case ARMISD::MVEZEXT:
18756 return PerformMVEExtCombine(N, DCI);
18757 case ARMISD::VCMP:
18758 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18759 case ISD::VECREDUCE_ADD:
18760 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18761 case ARMISD::VADDVs:
18762 case ARMISD::VADDVu:
18763 case ARMISD::VADDLVs:
18764 case ARMISD::VADDLVu:
18765 case ARMISD::VADDLVAs:
18766 case ARMISD::VADDLVAu:
18767 case ARMISD::VMLAVs:
18768 case ARMISD::VMLAVu:
18769 case ARMISD::VMLALVs:
18770 case ARMISD::VMLALVu:
18771 case ARMISD::VMLALVAs:
18772 case ARMISD::VMLALVAu:
18773 return PerformReduceShuffleCombine(N, DCI.DAG);
18774 case ARMISD::VMOVN:
18775 return PerformVMOVNCombine(N, DCI);
18776 case ARMISD::VQMOVNs:
18777 case ARMISD::VQMOVNu:
18778 return PerformVQMOVNCombine(N, DCI);
18779 case ARMISD::VQDMULH:
18780 return PerformVQDMULHCombine(N, DCI);
18781 case ARMISD::ASRL:
18782 case ARMISD::LSRL:
18783 case ARMISD::LSLL:
18784 return PerformLongShiftCombine(N, DCI.DAG);
18785 case ARMISD::SMULWB: {
18786 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18787 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18788 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18789 return SDValue();
18790 break;
18791 }
18792 case ARMISD::SMULWT: {
18793 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18794 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18795 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18796 return SDValue();
18797 break;
18798 }
18799 case ARMISD::SMLALBB:
18800 case ARMISD::QADD16b:
18801 case ARMISD::QSUB16b:
18802 case ARMISD::UQADD16b:
18803 case ARMISD::UQSUB16b: {
18804 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18805 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18806 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18807 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18808 return SDValue();
18809 break;
18810 }
18811 case ARMISD::SMLALBT: {
18812 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
18813 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18814 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
18815 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18816 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
18817 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
18818 return SDValue();
18819 break;
18820 }
18821 case ARMISD::SMLALTB: {
18822 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
18823 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18824 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
18825 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18826 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
18827 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
18828 return SDValue();
18829 break;
18830 }
18831 case ARMISD::SMLALTT: {
18832 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18833 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18834 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18835 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18836 return SDValue();
18837 break;
18838 }
18839 case ARMISD::QADD8b:
18840 case ARMISD::QSUB8b:
18841 case ARMISD::UQADD8b:
18842 case ARMISD::UQSUB8b: {
18843 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18844 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
18845 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18846 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18847 return SDValue();
18848 break;
18849 }
18850 case ARMISD::VBSP:
18851 if (N->getOperand(1) == N->getOperand(2))
18852 return N->getOperand(1);
18853 return SDValue();
18854 case ISD::INTRINSIC_VOID:
18855 case ISD::INTRINSIC_W_CHAIN:
18856 switch (N->getConstantOperandVal(1)) {
18857 case Intrinsic::arm_neon_vld1:
18858 case Intrinsic::arm_neon_vld1x2:
18859 case Intrinsic::arm_neon_vld1x3:
18860 case Intrinsic::arm_neon_vld1x4:
18861 case Intrinsic::arm_neon_vld2:
18862 case Intrinsic::arm_neon_vld3:
18863 case Intrinsic::arm_neon_vld4:
18864 case Intrinsic::arm_neon_vld2lane:
18865 case Intrinsic::arm_neon_vld3lane:
18866 case Intrinsic::arm_neon_vld4lane:
18867 case Intrinsic::arm_neon_vld2dup:
18868 case Intrinsic::arm_neon_vld3dup:
18869 case Intrinsic::arm_neon_vld4dup:
18870 case Intrinsic::arm_neon_vst1:
18871 case Intrinsic::arm_neon_vst1x2:
18872 case Intrinsic::arm_neon_vst1x3:
18873 case Intrinsic::arm_neon_vst1x4:
18874 case Intrinsic::arm_neon_vst2:
18875 case Intrinsic::arm_neon_vst3:
18876 case Intrinsic::arm_neon_vst4:
18877 case Intrinsic::arm_neon_vst2lane:
18878 case Intrinsic::arm_neon_vst3lane:
18879 case Intrinsic::arm_neon_vst4lane:
18880 return PerformVLDCombine(N, DCI);
18881 case Intrinsic::arm_mve_vld2q:
18882 case Intrinsic::arm_mve_vld4q:
18883 case Intrinsic::arm_mve_vst2q:
18884 case Intrinsic::arm_mve_vst4q:
18885 return PerformMVEVLDCombine(N, DCI);
18886 default: break;
18887 }
18888 break;
18889 }
18890 return SDValue();
18891}
18892
18893bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
18894 EVT VT) const {
18895 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
18896}
18897
18898bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
18899 Align Alignment,
18900 MachineMemOperand::Flags,
18901 unsigned *Fast) const {
18902 // Depends what it gets converted into if the type is weird.
18903 if (!VT.isSimple())
18904 return false;
18905
18906 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
18907 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
18908 auto Ty = VT.getSimpleVT().SimpleTy;
18909
18910 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
18911 // Unaligned accesses can use (for example) LDRB, LDRH, LDR
18912 if (AllowsUnaligned) {
18913 if (Fast)
18914 *Fast = Subtarget->hasV7Ops();
18915 return true;
18916 }
18917 }
18918
18919 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
18920 // For any little-endian targets with neon, we can support unaligned ld/st
18921 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
18922 // A big-endian target may also explicitly support unaligned accesses
18923 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
18924 if (Fast)
18925 *Fast = 1;
18926 return true;
18927 }
18928 }
18929
18930 if (!Subtarget->hasMVEIntegerOps())
18931 return false;
18932
18933 // These are for predicates
18934 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
18935 Ty == MVT::v2i1)) {
18936 if (Fast)
18937 *Fast = 1;
18938 return true;
18939 }
18940
18941 // These are for truncated stores/narrowing loads. They are fine so long as
18942 // the alignment is at least the size of the item being loaded
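  // Worked example (illustrative, not from the original source): a v4i16
  // narrowing load or truncating store is accepted here with an alignment of
  // at least 2 bytes, the width of its i16 elements.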
18943 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
18944 Alignment >= VT.getScalarSizeInBits() / 8) {
18945 if (Fast)
18946 *Fast = true;
18947 return true;
18948 }
18949
18950 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
18951 // VSTRW.U32 all store the vector register in exactly the same format, and
18952 // differ only in the range of their immediate offset field and the required
18953 // alignment. So there is always a store that can be used, regardless of
18954 // actual type.
18955 //
18956 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
18957 // VREV64.8) pair and get the same effect. This will likely be better than
18958 // aligning the vector through the stack.
18959 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
18960 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
18961 Ty == MVT::v2f64) {
18962 if (Fast)
18963 *Fast = 1;
18964 return true;
18965 }
18966
18967 return false;
18968}
18969
18970EVT ARMTargetLowering::getOptimalMemOpType(
18971 LLVMContext &Context, const MemOp &Op,
18972 const AttributeList &FuncAttributes) const {
18973 // See if we can use NEON instructions for this...
18974 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
18975 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
18976 unsigned Fast;
18977 if (Op.size() >= 16 &&
18978 (Op.isAligned(Align(16)) ||
18979 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
18980 MachineMemOperand::MONone, &Fast) &&
18981 Fast))) {
18982 return MVT::v2f64;
18983 } else if (Op.size() >= 8 &&
18984 (Op.isAligned(Align(8)) ||
18985 (allowsMisalignedMemoryAccesses(
18986 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
18987 Fast))) {
18988 return MVT::f64;
18989 }
18990 }
18991
18992 // Let the target-independent logic figure it out.
18993 return MVT::Other;
18994}
18995
18996// 64-bit integers are split into their high and low parts and held in two
18997// different registers, so the trunc is free since the low register can just
18998// be used.
18999bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19000 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19001 return false;
19002 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19003 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19004 return (SrcBits == 64 && DestBits == 32);
19005}
19006
19007bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19008 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19009 !DstVT.isInteger())
19010 return false;
19011 unsigned SrcBits = SrcVT.getSizeInBits();
19012 unsigned DestBits = DstVT.getSizeInBits();
19013 return (SrcBits == 64 && DestBits == 32);
19014}
19015
19016bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19017 if (Val.getOpcode() != ISD::LOAD)
19018 return false;
19019
19020 EVT VT1 = Val.getValueType();
19021 if (!VT1.isSimple() || !VT1.isInteger() ||
19022 !VT2.isSimple() || !VT2.isInteger())
19023 return false;
19024
19025 switch (VT1.getSimpleVT().SimpleTy) {
19026 default: break;
19027 case MVT::i1:
19028 case MVT::i8:
19029 case MVT::i16:
19030 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19031 return true;
19032 }
19033
19034 return false;
19035}
19036
19037bool ARMTargetLowering::isFNegFree(EVT VT) const {
19038 if (!VT.isSimple())
19039 return false;
19040
19041 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19042 // negate values directly (fneg is free). So, we don't want to let the DAG
19043 // combiner rewrite fneg into xors and some other instructions. For f16 and
19044 // FullFP16 argument passing, some bitcast nodes may be introduced,
19045 // triggering this DAG combine rewrite, so we are avoiding that with this.
19046 switch (VT.getSimpleVT().SimpleTy) {
19047 default: break;
19048 case MVT::f16:
19049 return Subtarget->hasFullFP16();
19050 }
19051
19052 return false;
19053}
19054
19055Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19056 if (!Subtarget->hasMVEIntegerOps())
19057 return nullptr;
19058 Type *SVIType = SVI->getType();
19059 Type *ScalarType = SVIType->getScalarType();
19060
19061 if (ScalarType->isFloatTy())
19062 return Type::getInt32Ty(SVIType->getContext());
19063 if (ScalarType->isHalfTy())
19064 return Type::getInt16Ty(SVIType->getContext());
19065 return nullptr;
19066}
19067
19068bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19069 EVT VT = ExtVal.getValueType();
19070
19071 if (!isTypeLegal(VT))
19072 return false;
19073
19074 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19075 if (Ld->isExpandingLoad())
19076 return false;
19077 }
19078
19079 if (Subtarget->hasMVEIntegerOps())
19080 return true;
19081
19082 // Don't create a loadext if we can fold the extension into a wide/long
19083 // instruction.
19084 // If there's more than one user instruction, the loadext is desirable no
19085 // matter what. There can be two uses by the same instruction.
19086 if (ExtVal->use_empty() ||
19087 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19088 return true;
19089
19090 SDNode *U = *ExtVal->user_begin();
19091 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19092 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19093 return false;
19094
19095 return true;
19096}
19097
19098bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19099 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19100 return false;
19101
19102 if (!isTypeLegal(EVT::getEVT(Ty1)))
19103 return false;
19104
19105 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19106
19107 // Assuming the caller doesn't have a zeroext or signext return parameter,
19108 // truncation all the way down to i1 is valid.
19109 return true;
19110}
19111
19112/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19113/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19114/// expanded to FMAs when this method returns true, otherwise fmuladd is
19115/// expanded to fmul + fadd.
19116///
19117/// ARM supports both fused and unfused multiply-add operations; we already
19118/// lower a pair of fmul and fadd to the latter so it's not clear that there
19119/// would be a gain or that the gain would be worthwhile enough to risk
19120/// correctness bugs.
19121///
19122/// For MVE, we set this to true as it helps simplify the need for some
19123/// patterns (and we don't have the non-fused floating point instruction).
19124bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19125 EVT VT) const {
19126 if (Subtarget->useSoftFloat())
19127 return false;
19128
19129 if (!VT.isSimple())
19130 return false;
19131
19132 switch (VT.getSimpleVT().SimpleTy) {
19133 case MVT::v4f32:
19134 case MVT::v8f16:
19135 return Subtarget->hasMVEFloatOps();
19136 case MVT::f16:
19137 return Subtarget->useFPVFMx16();
19138 case MVT::f32:
19139 return Subtarget->useFPVFMx();
19140 case MVT::f64:
19141 return Subtarget->useFPVFMx64();
19142 default:
19143 break;
19144 }
19145
19146 return false;
19147}
19148
19149static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19150 if (V < 0)
19151 return false;
19152
19153 unsigned Scale = 1;
19154 switch (VT.getSimpleVT().SimpleTy) {
19155 case MVT::i1:
19156 case MVT::i8:
19157 // Scale == 1;
19158 break;
19159 case MVT::i16:
19160 // Scale == 2;
19161 Scale = 2;
19162 break;
19163 default:
19164 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19165 // Scale == 4;
19166 Scale = 4;
19167 break;
19168 }
19169
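  // Worked example (illustrative, not from the original source): an i32 Thumb1
  // load/store offset must therefore be a multiple of 4 in the range [0, 124]
  // (an unsigned 5-bit immediate scaled by 4).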
19170 if ((V & (Scale - 1)) != 0)
19171 return false;
19172 return isUInt<5>(V / Scale);
19173}
19174
19175static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19176 const ARMSubtarget *Subtarget) {
19177 if (!VT.isInteger() && !VT.isFloatingPoint())
19178 return false;
19179 if (VT.isVector() && Subtarget->hasNEON())
19180 return false;
19181 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19182 !Subtarget->hasMVEFloatOps())
19183 return false;
19184
19185 bool IsNeg = false;
19186 if (V < 0) {
19187 IsNeg = true;
19188 V = -V;
19189 }
19190
19191 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19192
19193 // MVE: size * imm7
19194 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19195 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19196 case MVT::i32:
19197 case MVT::f32:
19198 return isShiftedUInt<7,2>(V);
19199 case MVT::i16:
19200 case MVT::f16:
19201 return isShiftedUInt<7,1>(V);
19202 case MVT::i8:
19203 return isUInt<7>(V);
19204 default:
19205 return false;
19206 }
19207 }
19208
19209 // half VLDR: 2 * imm8
19210 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19211 return isShiftedUInt<8, 1>(V);
19212 // VLDR and LDRD: 4 * imm8
19213 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19214 return isShiftedUInt<8, 2>(V);
19215
19216 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19217 // + imm12 or - imm8
19218 if (IsNeg)
19219 return isUInt<8>(V);
19220 return isUInt<12>(V);
19221 }
19222
19223 return false;
19224}
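// Illustrative example (editor's sketch, not part of the upstream source):
// for MVE vectors the offset is a 7-bit immediate scaled by the element size,
// so a v4i32/v4f32 access allows multiples of 4 up to 508, e.g.
//   vldrw.u32 q0, [r1, #508]
// while scalar Thumb2 accesses accept +imm12 (ldr r0, [r1, #4095]) or
// -imm8 (ldr r0, [r1, #-255]), matching the checks above.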
19225
19226/// isLegalAddressImmediate - Return true if the integer value can be used
19227/// as the offset of the target addressing mode for load / store of the
19228/// given type.
19229static bool isLegalAddressImmediate(int64_t V, EVT VT,
19230 const ARMSubtarget *Subtarget) {
19231 if (V == 0)
19232 return true;
19233
19234 if (!VT.isSimple())
19235 return false;
19236
19237 if (Subtarget->isThumb1Only())
19238 return isLegalT1AddressImmediate(V, VT);
19239 else if (Subtarget->isThumb2())
19240 return isLegalT2AddressImmediate(V, VT, Subtarget);
19241
19242 // ARM mode.
19243 if (V < 0)
19244 V = - V;
19245 switch (VT.getSimpleVT().SimpleTy) {
19246 default: return false;
19247 case MVT::i1:
19248 case MVT::i8:
19249 case MVT::i32:
19250 // +- imm12
19251 return isUInt<12>(V);
19252 case MVT::i16:
19253 // +- imm8
19254 return isUInt<8>(V);
19255 case MVT::f32:
19256 case MVT::f64:
19257 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19258 return false;
19259 return isShiftedUInt<8, 2>(V);
19260 }
19261}
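// Illustrative example (editor's sketch, not part of the upstream source):
// in ARM mode the ranges checked above correspond to
//   ldr  r0, [r1, #4095]   ; i32/i8 : +/- 12-bit immediate
//   ldrh r0, [r1, #255]    ; i16    : +/- 8-bit immediate
//   vldr s0, [r1, #1020]   ; f32/f64: 8-bit immediate scaled by 4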
19262
19263 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19264 EVT VT) const {
19265 int Scale = AM.Scale;
19266 if (Scale < 0)
19267 return false;
19268
19269 switch (VT.getSimpleVT().SimpleTy) {
19270 default: return false;
19271 case MVT::i1:
19272 case MVT::i8:
19273 case MVT::i16:
19274 case MVT::i32:
19275 if (Scale == 1)
19276 return true;
19277 // r + r << imm
19278 Scale = Scale & ~1;
19279 return Scale == 2 || Scale == 4 || Scale == 8;
19280 case MVT::i64:
19281 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19282 // version in Thumb mode.
19283 // r + r
19284 if (Scale == 1)
19285 return true;
19286 // r * 2 (this can be lowered to r + r).
19287 if (!AM.HasBaseReg && Scale == 2)
19288 return true;
19289 return false;
19290 case MVT::isVoid:
19291 // Note, we allow "void" uses (basically, uses that aren't loads or
19292 // stores), because arm allows folding a scale into many arithmetic
19293 // operations. This should be made more precise and revisited later.
19294
19295 // Allow r << imm, but the imm has to be a multiple of two.
19296 if (Scale & 1) return false;
19297 return isPowerOf2_32(Scale);
19298 }
19299}
19300
19301 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19302 EVT VT) const {
19303 const int Scale = AM.Scale;
19304
19305 // Negative scales are not supported in Thumb1.
19306 if (Scale < 0)
19307 return false;
19308
19309 // Thumb1 addressing modes do not support register scaling excepting the
19310 // following cases:
19311 // 1. Scale == 1 means no scaling.
19312 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19313 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19314}
19315
19316/// isLegalAddressingMode - Return true if the addressing mode represented
19317/// by AM is legal for this target, for a load/store of the specified type.
19318 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19319 const AddrMode &AM, Type *Ty,
19320 unsigned AS, Instruction *I) const {
19321 EVT VT = getValueType(DL, Ty, true);
19322 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19323 return false;
19324
19325 // Can never fold addr of global into load/store.
19326 if (AM.BaseGV)
19327 return false;
19328
19329 switch (AM.Scale) {
19330 case 0: // no scale reg, must be "r+i" or "r", or "i".
19331 break;
19332 default:
19333 // ARM doesn't support any R+R*scale+imm addr modes.
19334 if (AM.BaseOffs)
19335 return false;
19336
19337 if (!VT.isSimple())
19338 return false;
19339
19340 if (Subtarget->isThumb1Only())
19341 return isLegalT1ScaledAddressingMode(AM, VT);
19342
19343 if (Subtarget->isThumb2())
19344 return isLegalT2ScaledAddressingMode(AM, VT);
19345
19346 int Scale = AM.Scale;
19347 switch (VT.getSimpleVT().SimpleTy) {
19348 default: return false;
19349 case MVT::i1:
19350 case MVT::i8:
19351 case MVT::i32:
19352 if (Scale < 0) Scale = -Scale;
19353 if (Scale == 1)
19354 return true;
19355 // r + r << imm
19356 return isPowerOf2_32(Scale & ~1);
19357 case MVT::i16:
19358 case MVT::i64:
19359 // r +/- r
19360 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19361 return true;
19362 // r * 2 (this can be lowered to r + r).
19363 if (!AM.HasBaseReg && Scale == 2)
19364 return true;
19365 return false;
19366
19367 case MVT::isVoid:
19368 // Note, we allow "void" uses (basically, uses that aren't loads or
19369 // stores), because arm allows folding a scale into many arithmetic
19370 // operations. This should be made more precise and revisited later.
19371
19372 // Allow r << imm, but the imm has to be a multiple of two.
19373 if (Scale & 1) return false;
19374 return isPowerOf2_32(Scale);
19375 }
19376 }
19377 return true;
19378}
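// Illustrative example (editor's sketch, not part of the upstream source):
// a scaled-register addressing mode accepted above, such as Scale == 4 for an
// i32 load, corresponds to
//   ldr r0, [r1, r2, lsl #2]
// i.e. base + (index << 2), which is why an i32 array access p[i] can have
// its address computation folded into the load on ARM and Thumb2.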
19379
19380/// isLegalICmpImmediate - Return true if the specified immediate is legal
19381/// icmp immediate, that is the target has icmp instructions which can compare
19382/// a register against the immediate without having to materialize the
19383/// immediate into a register.
19384 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19385 // Thumb2 and ARM modes can use cmn for negative immediates.
19386 if (!Subtarget->isThumb())
19387 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19388 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19389 if (Subtarget->isThumb2())
19390 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19391 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19392 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19393 return Imm >= 0 && Imm <= 255;
19394}
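// Illustrative example (editor's sketch, not part of the upstream source):
// on ARM/Thumb2 a compare against -1 is still legal here because it can be
// emitted as CMN (compare negative), e.g.
//   cmp r0, #255    ; immediate encodable directly
//   cmn r0, #1      ; used for an icmp against -1
// while Thumb1 falls back to plain CMP with an 8-bit immediate (0..255).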
19395
19396/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19397/// *or sub* immediate, that is the target has add or sub instructions which can
19398/// add a register with the immediate without having to materialize the
19399/// immediate into a register.
19400 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19401 // Same encoding for add/sub, just flip the sign.
19402 uint64_t AbsImm = AbsoluteValue(Imm);
19403 if (!Subtarget->isThumb())
19404 return ARM_AM::getSOImmVal(AbsImm) != -1;
19405 if (Subtarget->isThumb2())
19406 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19407 // Thumb1 only has 8-bit unsigned immediate.
19408 return AbsImm <= 255;
19409}
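// Illustrative example (editor's sketch, not part of the upstream source):
// add and sub share the immediate encoding, so an IR add of -4 is accepted
// here because it can be selected as
//   sub r0, r0, #4
// On Thumb1 the limit is the 8-bit unsigned immediate, e.g. adds r0, #255.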
19410
19411// Return false to prevent folding
19412// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19413// if the folding leads to worse code.
19414 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19415 SDValue ConstNode) const {
19416 // Let the DAGCombiner decide for vector types and large types.
19417 const EVT VT = AddNode.getValueType();
19418 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19419 return true;
19420
19421 // It is worse if c0 is legal add immediate, while c1*c0 is not
19422 // and has to be composed by at least two instructions.
19423 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19424 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19425 const int64_t C0 = C0Node->getSExtValue();
19426 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19427 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19428 return true;
19429 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19430 return false;
19431
19432 // Default to true and let the DAGCombiner decide.
19433 return true;
19434}
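// Illustrative example (editor's sketch, not part of the upstream source):
// for (x + 8) * 3, c0 = 8 and c0*c1 = 24 are both legal add immediates, so
// the fold to (x * 3) + 24 is allowed. If c0*c1 were a constant with no
// cheap encoding (needing two or more instructions to materialize), the hook
// returns false and the add is kept inside the multiply.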
19435
19436 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19437 bool isSEXTLoad, SDValue &Base,
19438 SDValue &Offset, bool &isInc,
19439 SelectionDAG &DAG) {
19440 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19441 return false;
19442
19443 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19444 // AddressingMode 3
19445 Base = Ptr->getOperand(0);
19446 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19447 int RHSC = (int)RHS->getZExtValue();
19448 if (RHSC < 0 && RHSC > -256) {
19449 assert(Ptr->getOpcode() == ISD::ADD);
19450 isInc = false;
19451 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19452 return true;
19453 }
19454 }
19455 isInc = (Ptr->getOpcode() == ISD::ADD);
19456 Offset = Ptr->getOperand(1);
19457 return true;
19458 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19459 // AddressingMode 2
19460 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19461 int RHSC = (int)RHS->getZExtValue();
19462 if (RHSC < 0 && RHSC > -0x1000) {
19463 assert(Ptr->getOpcode() == ISD::ADD);
19464 isInc = false;
19465 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19466 Base = Ptr->getOperand(0);
19467 return true;
19468 }
19469 }
19470
19471 if (Ptr->getOpcode() == ISD::ADD) {
19472 isInc = true;
19473 ARM_AM::ShiftOpc ShOpcVal=
19474 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19475 if (ShOpcVal != ARM_AM::no_shift) {
19476 Base = Ptr->getOperand(1);
19477 Offset = Ptr->getOperand(0);
19478 } else {
19479 Base = Ptr->getOperand(0);
19480 Offset = Ptr->getOperand(1);
19481 }
19482 return true;
19483 }
19484
19485 isInc = (Ptr->getOpcode() == ISD::ADD);
19486 Base = Ptr->getOperand(0);
19487 Offset = Ptr->getOperand(1);
19488 return true;
19489 }
19490
19491 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19492 return false;
19493}
19494
19495 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19496 bool isSEXTLoad, SDValue &Base,
19497 SDValue &Offset, bool &isInc,
19498 SelectionDAG &DAG) {
19499 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19500 return false;
19501
19502 Base = Ptr->getOperand(0);
19503 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19504 int RHSC = (int)RHS->getZExtValue();
19505 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19506 assert(Ptr->getOpcode() == ISD::ADD);
19507 isInc = false;
19508 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19509 return true;
19510 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19511 isInc = Ptr->getOpcode() == ISD::ADD;
19512 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19513 return true;
19514 }
19515 }
19516
19517 return false;
19518}
19519
19520static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19521 bool isSEXTLoad, bool IsMasked, bool isLE,
19522 SDValue &Base, SDValue &Offset,
19523 bool &isInc, SelectionDAG &DAG) {
19524 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19525 return false;
19526 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19527 return false;
19528
19529 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19530 // as opposed to a vldrw.32). This can allow extra addressing modes or
19531 // alignments for what is otherwise an equivalent instruction.
19532 bool CanChangeType = isLE && !IsMasked;
19533
19534 ConstantSDNode *RHS = cast<ConstantSDNode>(Ptr->getOperand(1));
19535 int RHSC = (int)RHS->getZExtValue();
19536
19537 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19538 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19539 assert(Ptr->getOpcode() == ISD::ADD);
19540 isInc = false;
19541 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19542 return true;
19543 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19544 isInc = Ptr->getOpcode() == ISD::ADD;
19545 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19546 return true;
19547 }
19548 return false;
19549 };
19550
19551 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19552 // (in BE/masked) type.
19553 Base = Ptr->getOperand(0);
19554 if (VT == MVT::v4i16) {
19555 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19556 return true;
19557 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19558 if (IsInRange(RHSC, 0x80, 1))
19559 return true;
19560 } else if (Alignment >= 4 &&
19561 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19562 IsInRange(RHSC, 0x80, 4))
19563 return true;
19564 else if (Alignment >= 2 &&
19565 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19566 IsInRange(RHSC, 0x80, 2))
19567 return true;
19568 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19569 return true;
19570 return false;
19571}
19572
19573/// getPreIndexedAddressParts - returns true by value, base pointer and
19574/// offset pointer and addressing mode by reference if the node's address
19575/// can be legally represented as pre-indexed load / store address.
19576bool
19577 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19578 SDValue &Offset,
19579 ISD::MemIndexedMode &AM,
19580 SelectionDAG &DAG) const {
19581 if (Subtarget->isThumb1Only())
19582 return false;
19583
19584 EVT VT;
19585 SDValue Ptr;
19586 Align Alignment;
19587 bool isSEXTLoad = false;
19588 bool IsMasked = false;
19589 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19590 Ptr = LD->getBasePtr();
19591 VT = LD->getMemoryVT();
19592 Alignment = LD->getAlign();
19593 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19594 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19595 Ptr = ST->getBasePtr();
19596 VT = ST->getMemoryVT();
19597 Alignment = ST->getAlign();
19598 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19599 Ptr = LD->getBasePtr();
19600 VT = LD->getMemoryVT();
19601 Alignment = LD->getAlign();
19602 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19603 IsMasked = true;
19604 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19605 Ptr = ST->getBasePtr();
19606 VT = ST->getMemoryVT();
19607 Alignment = ST->getAlign();
19608 IsMasked = true;
19609 } else
19610 return false;
19611
19612 bool isInc;
19613 bool isLegal = false;
19614 if (VT.isVector())
19615 isLegal = Subtarget->hasMVEIntegerOps() &&
19616 getMVEIndexedAddressParts(
19617 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19618 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19619 else {
19620 if (Subtarget->isThumb2())
19621 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19622 Offset, isInc, DAG);
19623 else
19624 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19625 Offset, isInc, DAG);
19626 }
19627 if (!isLegal)
19628 return false;
19629
19630 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19631 return true;
19632}
19633
19634/// getPostIndexedAddressParts - returns true by value, base pointer and
19635/// offset pointer and addressing mode by reference if this node can be
19636/// combined with a load / store to form a post-indexed load / store.
19637 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19638 SDValue &Base,
19639 SDValue &Offset,
19640 ISD::MemIndexedMode &AM,
19641 SelectionDAG &DAG) const {
19642 EVT VT;
19643 SDValue Ptr;
19644 Align Alignment;
19645 bool isSEXTLoad = false, isNonExt;
19646 bool IsMasked = false;
19647 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19648 VT = LD->getMemoryVT();
19649 Ptr = LD->getBasePtr();
19650 Alignment = LD->getAlign();
19651 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19652 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19653 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19654 VT = ST->getMemoryVT();
19655 Ptr = ST->getBasePtr();
19656 Alignment = ST->getAlign();
19657 isNonExt = !ST->isTruncatingStore();
19658 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19659 VT = LD->getMemoryVT();
19660 Ptr = LD->getBasePtr();
19661 Alignment = LD->getAlign();
19662 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19663 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19664 IsMasked = true;
19665 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19666 VT = ST->getMemoryVT();
19667 Ptr = ST->getBasePtr();
19668 Alignment = ST->getAlign();
19669 isNonExt = !ST->isTruncatingStore();
19670 IsMasked = true;
19671 } else
19672 return false;
19673
19674 if (Subtarget->isThumb1Only()) {
19675 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19676 // must be non-extending/truncating, i32, with an offset of 4.
19677 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19678 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19679 return false;
19680 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19681 if (!RHS || RHS->getZExtValue() != 4)
19682 return false;
19683 if (Alignment < Align(4))
19684 return false;
19685
19686 Offset = Op->getOperand(1);
19687 Base = Op->getOperand(0);
19688 AM = ISD::POST_INC;
19689 return true;
19690 }
19691
19692 bool isInc;
19693 bool isLegal = false;
19694 if (VT.isVector())
19695 isLegal = Subtarget->hasMVEIntegerOps() &&
19696 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19697 Subtarget->isLittle(), Base, Offset,
19698 isInc, DAG);
19699 else {
19700 if (Subtarget->isThumb2())
19701 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19702 isInc, DAG);
19703 else
19704 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19705 isInc, DAG);
19706 }
19707 if (!isLegal)
19708 return false;
19709
19710 if (Ptr != Base) {
19711 // Swap base ptr and offset to catch more post-index load / store when
19712 // it's legal. In Thumb2 mode, offset must be an immediate.
19713 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19714 !Subtarget->isThumb2())
19715 std::swap(Base, Offset);
19716
19717 // Post-indexed load / store update the base pointer.
19718 if (Ptr != Base)
19719 return false;
19720 }
19721
19722 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19723 return true;
19724}
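// Illustrative example (editor's sketch, not part of the upstream source):
// when these hooks succeed, the combiner folds the pointer update into the
// memory access, e.g.
//   ldr r0, [r1, #4]!   ; pre-indexed : load from r1+4, write r1+4 back
//   ldr r0, [r1], #4    ; post-indexed: load from r1, then r1 += 4
// For MVE vectors the equivalent is a write-back form such as
//   vldrw.u32 q0, [r1], #16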
19725
19726 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19727 KnownBits &Known,
19728 const APInt &DemandedElts,
19729 const SelectionDAG &DAG,
19730 unsigned Depth) const {
19731 unsigned BitWidth = Known.getBitWidth();
19732 Known.resetAll();
19733 switch (Op.getOpcode()) {
19734 default: break;
19735 case ARMISD::ADDC:
19736 case ARMISD::ADDE:
19737 case ARMISD::SUBC:
19738 case ARMISD::SUBE:
19739 // Special cases when we convert a carry to a boolean.
19740 if (Op.getResNo() == 0) {
19741 SDValue LHS = Op.getOperand(0);
19742 SDValue RHS = Op.getOperand(1);
19743 // (ADDE 0, 0, C) will give us a single bit.
19744 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19745 isNullConstant(RHS)) {
19746 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19747 return;
19748 }
19749 }
19750 break;
19751 case ARMISD::CMOV: {
19752 // Bits are known zero/one if known on the LHS and RHS.
19753 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19754 if (Known.isUnknown())
19755 return;
19756
19757 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19758 Known = Known.intersectWith(KnownRHS);
19759 return;
19760 }
19761 case ISD::INTRINSIC_W_CHAIN: {
19762 Intrinsic::ID IntID =
19763 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19764 switch (IntID) {
19765 default: return;
19766 case Intrinsic::arm_ldaex:
19767 case Intrinsic::arm_ldrex: {
19768 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19769 unsigned MemBits = VT.getScalarSizeInBits();
19770 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19771 return;
19772 }
19773 }
19774 }
19775 case ARMISD::BFI: {
19776 // Conservatively, we can recurse down the first operand
19777 // and just mask out all affected bits.
19778 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19779
19780 // The operand to BFI is already a mask suitable for removing the bits it
19781 // sets.
19782 const APInt &Mask = Op.getConstantOperandAPInt(2);
19783 Known.Zero &= Mask;
19784 Known.One &= Mask;
19785 return;
19786 }
19787 case ARMISD::VGETLANEs:
19788 case ARMISD::VGETLANEu: {
19789 const SDValue &SrcSV = Op.getOperand(0);
19790 EVT VecVT = SrcSV.getValueType();
19791 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19792 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19793 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19794 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19795 "VGETLANE index out of bounds");
19796 unsigned Idx = Pos->getZExtValue();
19797 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
19798 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
19799
19800 EVT VT = Op.getValueType();
19801 const unsigned DstSz = VT.getScalarSizeInBits();
19802 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
19803 (void)SrcSz;
19804 assert(SrcSz == Known.getBitWidth());
19805 assert(DstSz > SrcSz);
19806 if (Op.getOpcode() == ARMISD::VGETLANEs)
19807 Known = Known.sext(DstSz);
19808 else {
19809 Known = Known.zext(DstSz);
19810 }
19811 assert(DstSz == Known.getBitWidth());
19812 break;
19813 }
19814 case ARMISD::VMOVrh: {
19815 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19816 assert(KnownOp.getBitWidth() == 16);
19817 Known = KnownOp.zext(32);
19818 break;
19819 }
19820 case ARMISD::CSINC:
19821 case ARMISD::CSINV:
19822 case ARMISD::CSNEG: {
19823 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19824 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
19825
19826 // The result is either:
19827 // CSINC: KnownOp0 or KnownOp1 + 1
19828 // CSINV: KnownOp0 or ~KnownOp1
19829 // CSNEG: KnownOp0 or KnownOp1 * -1
19830 if (Op.getOpcode() == ARMISD::CSINC)
19831 KnownOp1 =
19832 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
19833 else if (Op.getOpcode() == ARMISD::CSINV)
19834 std::swap(KnownOp1.Zero, KnownOp1.One);
19835 else if (Op.getOpcode() == ARMISD::CSNEG)
19836 KnownOp1 = KnownBits::mul(KnownOp1,
19837 KnownBits::makeConstant(APInt::getAllOnes(32)));
19838
19839 Known = KnownOp0.intersectWith(KnownOp1);
19840 break;
19841 }
19842 case ARMISD::VORRIMM:
19843 case ARMISD::VBICIMM: {
19844 unsigned Encoded = Op.getConstantOperandVal(1);
19845 unsigned DecEltBits = 0;
19846 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
19847
19848 unsigned EltBits = Op.getScalarValueSizeInBits();
19849 if (EltBits != DecEltBits) {
19850 // Be conservative: only update Known when EltBits == DecEltBits.
19851 // This is believed to always be true for VORRIMM/VBICIMM today, but if
19852 // that changes in the future, doing nothing here is safer than risking
19853 // subtle bugs.
19854 break;
19855 }
19856
19857 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19858 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
19859 APInt Imm(DecEltBits, DecodedVal);
19860
19861 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
19862 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
19863 break;
19864 }
19865 }
19866}
19867
19868 bool ARMTargetLowering::targetShrinkDemandedConstant(
19869 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
19870 TargetLoweringOpt &TLO) const {
19871 // Delay optimization, so we don't have to deal with illegal types, or block
19872 // optimizations.
19873 if (!TLO.LegalOps)
19874 return false;
19875
19876 // Only optimize AND for now.
19877 if (Op.getOpcode() != ISD::AND)
19878 return false;
19879
19880 EVT VT = Op.getValueType();
19881
19882 // Ignore vectors.
19883 if (VT.isVector())
19884 return false;
19885
19886 assert(VT == MVT::i32 && "Unexpected integer type");
19887
19888 // Make sure the RHS really is a constant.
19889 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19890 if (!C)
19891 return false;
19892
19893 unsigned Mask = C->getZExtValue();
19894
19895 unsigned Demanded = DemandedBits.getZExtValue();
19896 unsigned ShrunkMask = Mask & Demanded;
19897 unsigned ExpandedMask = Mask | ~Demanded;
19898
19899 // If the mask is all zeros, let the target-independent code replace the
19900 // result with zero.
19901 if (ShrunkMask == 0)
19902 return false;
19903
19904 // If the mask is all ones, erase the AND. (Currently, the target-independent
19905 // code won't do this, so we have to do it explicitly to avoid an infinite
19906 // loop in obscure cases.)
19907 if (ExpandedMask == ~0U)
19908 return TLO.CombineTo(Op, Op.getOperand(0));
19909
19910 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
19911 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
19912 };
19913 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
19914 if (NewMask == Mask)
19915 return true;
19916 SDLoc DL(Op);
19917 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
19918 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
19919 return TLO.CombineTo(Op, NewOp);
19920 };
19921
19922 // Prefer uxtb mask.
19923 if (IsLegalMask(0xFF))
19924 return UseMask(0xFF);
19925
19926 // Prefer uxth mask.
19927 if (IsLegalMask(0xFFFF))
19928 return UseMask(0xFFFF);
19929
19930 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
19931 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19932 if (ShrunkMask < 256)
19933 return UseMask(ShrunkMask);
19934
19935 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
19936 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19937 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
19938 return UseMask(ExpandedMask);
19939
19940 // Potential improvements:
19941 //
19942 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
19943 // We could try to prefer Thumb1 immediates which can be lowered to a
19944 // two-instruction sequence.
19945 // We could try to recognize more legal ARM/Thumb2 immediates here.
19946
19947 return false;
19948}
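// Illustrative example (editor's sketch, not part of the upstream source):
// if the demanded bits prove that only the low byte of the result matters and
// the existing mask already keeps that whole byte (e.g. an AND with
// 0x100000FF where only the low 24 bits are demanded), the mask is
// canonicalized to 0xFF so the AND can be selected as
//   uxtb r0, r0
// rather than materializing the original constant into a register first.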
19949
19950 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
19951 SDValue Op, const APInt &OriginalDemandedBits,
19952 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
19953 unsigned Depth) const {
19954 unsigned Opc = Op.getOpcode();
19955
19956 switch (Opc) {
19957 case ARMISD::ASRL:
19958 case ARMISD::LSRL: {
19959 // If this is result 0 and the other result is unused, see if the demand
19960 // bits allow us to shrink this long shift into a standard small shift in
19961 // the opposite direction.
19962 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
19963 isa<ConstantSDNode>(Op->getOperand(2))) {
19964 unsigned ShAmt = Op->getConstantOperandVal(2);
19965 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
19966 << (32 - ShAmt)))
19967 return TLO.CombineTo(
19968 Op, TLO.DAG.getNode(
19969 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
19970 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
19971 }
19972 break;
19973 }
19974 case ARMISD::VBICIMM: {
19975 SDValue Op0 = Op.getOperand(0);
19976 unsigned ModImm = Op.getConstantOperandVal(1);
19977 unsigned EltBits = 0;
19978 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
19979 if ((OriginalDemandedBits & Mask) == 0)
19980 return TLO.CombineTo(Op, Op0);
19981 }
19982 }
19983
19984 return TargetLowering::SimplifyDemandedBitsForTargetNode(
19985 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
19986}
19987
19988//===----------------------------------------------------------------------===//
19989// ARM Inline Assembly Support
19990//===----------------------------------------------------------------------===//
19991
19992const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
19993 // At this point, we have to lower this constraint to something else, so we
19994 // lower it to an "r" or "w". However, by doing this we will force the result
19995 // to be in register, while the X constraint is much more permissive.
19996 //
19997 // Although we are correct (we are free to emit anything, without
19998 // constraints), we might break use cases that would expect us to be more
19999 // efficient and emit something else.
20000 if (!Subtarget->hasVFP2Base())
20001 return "r";
20002 if (ConstraintVT.isFloatingPoint())
20003 return "w";
20004 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20005 (ConstraintVT.getSizeInBits() == 64 ||
20006 ConstraintVT.getSizeInBits() == 128))
20007 return "w";
20008
20009 return "r";
20010}
20011
20012/// getConstraintType - Given a constraint letter, return the type of
20013/// constraint it is for this target.
20014 ARMTargetLowering::ConstraintType
20015 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20016 unsigned S = Constraint.size();
20017 if (S == 1) {
20018 switch (Constraint[0]) {
20019 default: break;
20020 case 'l': return C_RegisterClass;
20021 case 'w': return C_RegisterClass;
20022 case 'h': return C_RegisterClass;
20023 case 'x': return C_RegisterClass;
20024 case 't': return C_RegisterClass;
20025 case 'j': return C_Immediate; // Constant for movw.
20026 // An address with a single base register. Due to the way we
20027 // currently handle addresses it is the same as an 'r' memory constraint.
20028 case 'Q': return C_Memory;
20029 }
20030 } else if (S == 2) {
20031 switch (Constraint[0]) {
20032 default: break;
20033 case 'T': return C_RegisterClass;
20034 // All 'U+' constraints are addresses.
20035 case 'U': return C_Memory;
20036 }
20037 }
20038 return TargetLowering::getConstraintType(Constraint);
20039}
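// Illustrative example (editor's sketch, not part of the upstream source;
// a, b, x, y are placeholder locals): these letters surface in GCC-style
// inline assembly operands, e.g.
//   asm("adds %0, %1, %2" : "=l"(r) : "l"(a), "l"(b));      // 'l': low regs r0-r7 (Thumb)
//   asm("vadd.f32 %0, %1, %2" : "=w"(d) : "w"(x), "w"(y));  // 'w': VFP/NEON regs
// while 'j' accepts a constant suitable for MOVW and 'Q' is a memory operand
// addressed through a single base register.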
20040
20041/// Examine constraint type and operand type and determine a weight value.
20042/// This object must already have been set up with the operand type
20043/// and the current alternative constraint selected.
20044 TargetLowering::ConstraintWeight
20045 ARMTargetLowering::getSingleConstraintMatchWeight(
20046 AsmOperandInfo &info, const char *constraint) const {
20047 ConstraintWeight weight = CW_Invalid;
20048 Value *CallOperandVal = info.CallOperandVal;
20049 // If we don't have a value, we can't do a match,
20050 // but allow it at the lowest weight.
20051 if (!CallOperandVal)
20052 return CW_Default;
20053 Type *type = CallOperandVal->getType();
20054 // Look at the constraint type.
20055 switch (*constraint) {
20056 default:
20057 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20058 break;
20059 case 'l':
20060 if (type->isIntegerTy()) {
20061 if (Subtarget->isThumb())
20062 weight = CW_SpecificReg;
20063 else
20064 weight = CW_Register;
20065 }
20066 break;
20067 case 'w':
20068 if (type->isFloatingPointTy())
20069 weight = CW_Register;
20070 break;
20071 }
20072 return weight;
20073}
20074
20075static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20076 if (PR == 0 || VT == MVT::Other)
20077 return false;
20078 if (ARM::SPRRegClass.contains(PR))
20079 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20080 if (ARM::DPRRegClass.contains(PR))
20081 return VT != MVT::f64 && !VT.is64BitVector();
20082 return false;
20083}
20084
20085using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20086
20087 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
20088 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20089 switch (Constraint.size()) {
20090 case 1:
20091 // GCC ARM Constraint Letters
20092 switch (Constraint[0]) {
20093 case 'l': // Low regs or general regs.
20094 if (Subtarget->isThumb())
20095 return RCPair(0U, &ARM::tGPRRegClass);
20096 return RCPair(0U, &ARM::GPRRegClass);
20097 case 'h': // High regs or no regs.
20098 if (Subtarget->isThumb())
20099 return RCPair(0U, &ARM::hGPRRegClass);
20100 break;
20101 case 'r':
20102 if (Subtarget->isThumb1Only())
20103 return RCPair(0U, &ARM::tGPRRegClass);
20104 return RCPair(0U, &ARM::GPRRegClass);
20105 case 'w':
20106 if (VT == MVT::Other)
20107 break;
20108 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20109 return RCPair(0U, &ARM::SPRRegClass);
20110 if (VT.getSizeInBits() == 64)
20111 return RCPair(0U, &ARM::DPRRegClass);
20112 if (VT.getSizeInBits() == 128)
20113 return RCPair(0U, &ARM::QPRRegClass);
20114 break;
20115 case 'x':
20116 if (VT == MVT::Other)
20117 break;
20118 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20119 return RCPair(0U, &ARM::SPR_8RegClass);
20120 if (VT.getSizeInBits() == 64)
20121 return RCPair(0U, &ARM::DPR_8RegClass);
20122 if (VT.getSizeInBits() == 128)
20123 return RCPair(0U, &ARM::QPR_8RegClass);
20124 break;
20125 case 't':
20126 if (VT == MVT::Other)
20127 break;
20128 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20129 return RCPair(0U, &ARM::SPRRegClass);
20130 if (VT.getSizeInBits() == 64)
20131 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20132 if (VT.getSizeInBits() == 128)
20133 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20134 break;
20135 }
20136 break;
20137
20138 case 2:
20139 if (Constraint[0] == 'T') {
20140 switch (Constraint[1]) {
20141 default:
20142 break;
20143 case 'e':
20144 return RCPair(0U, &ARM::tGPREvenRegClass);
20145 case 'o':
20146 return RCPair(0U, &ARM::tGPROddRegClass);
20147 }
20148 }
20149 break;
20150
20151 default:
20152 break;
20153 }
20154
20155 if (StringRef("{cc}").equals_insensitive(Constraint))
20156 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20157
20158 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20159 if (isIncompatibleReg(RCP.first, VT))
20160 return {0, nullptr};
20161 return RCP;
20162}
20163
20164/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20165/// vector. If it is invalid, don't add anything to Ops.
20166 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20167 StringRef Constraint,
20168 std::vector<SDValue> &Ops,
20169 SelectionDAG &DAG) const {
20170 SDValue Result;
20171
20172 // Currently only support length 1 constraints.
20173 if (Constraint.size() != 1)
20174 return;
20175
20176 char ConstraintLetter = Constraint[0];
20177 switch (ConstraintLetter) {
20178 default: break;
20179 case 'j':
20180 case 'I': case 'J': case 'K': case 'L':
20181 case 'M': case 'N': case 'O':
20182 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20183 if (!C)
20184 return;
20185
20186 int64_t CVal64 = C->getSExtValue();
20187 int CVal = (int) CVal64;
20188 // None of these constraints allow values larger than 32 bits. Check
20189 // that the value fits in an int.
20190 if (CVal != CVal64)
20191 return;
20192
20193 switch (ConstraintLetter) {
20194 case 'j':
20195 // Constant suitable for movw, must be between 0 and
20196 // 65535.
20197 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20198 if (CVal >= 0 && CVal <= 65535)
20199 break;
20200 return;
20201 case 'I':
20202 if (Subtarget->isThumb1Only()) {
20203 // This must be a constant between 0 and 255, for ADD
20204 // immediates.
20205 if (CVal >= 0 && CVal <= 255)
20206 break;
20207 } else if (Subtarget->isThumb2()) {
20208 // A constant that can be used as an immediate value in a
20209 // data-processing instruction.
20210 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20211 break;
20212 } else {
20213 // A constant that can be used as an immediate value in a
20214 // data-processing instruction.
20215 if (ARM_AM::getSOImmVal(CVal) != -1)
20216 break;
20217 }
20218 return;
20219
20220 case 'J':
20221 if (Subtarget->isThumb1Only()) {
20222 // This must be a constant between -255 and -1, for negated ADD
20223 // immediates. This can be used in GCC with an "n" modifier that
20224 // prints the negated value, for use with SUB instructions. It is
20225 // not useful otherwise but is implemented for compatibility.
20226 if (CVal >= -255 && CVal <= -1)
20227 break;
20228 } else {
20229 // This must be a constant between -4095 and 4095. This is suitable
20230 // for use as the immediate offset field in LDR and STR instructions
20231 // such as LDR r0,[r1,#offset].
20232 if (CVal >= -4095 && CVal <= 4095)
20233 break;
20234 }
20235 return;
20236
20237 case 'K':
20238 if (Subtarget->isThumb1Only()) {
20239 // A 32-bit value where only one byte has a nonzero value. Exclude
20240 // zero to match GCC. This constraint is used by GCC internally for
20241 // constants that can be loaded with a move/shift combination.
20242 // It is not useful otherwise but is implemented for compatibility.
20243 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20244 break;
20245 } else if (Subtarget->isThumb2()) {
20246 // A constant whose bitwise inverse can be used as an immediate
20247 // value in a data-processing instruction. This can be used in GCC
20248 // with a "B" modifier that prints the inverted value, for use with
20249 // BIC and MVN instructions. It is not useful otherwise but is
20250 // implemented for compatibility.
20251 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20252 break;
20253 } else {
20254 // A constant whose bitwise inverse can be used as an immediate
20255 // value in a data-processing instruction. This can be used in GCC
20256 // with a "B" modifier that prints the inverted value, for use with
20257 // BIC and MVN instructions. It is not useful otherwise but is
20258 // implemented for compatibility.
20259 if (ARM_AM::getSOImmVal(~CVal) != -1)
20260 break;
20261 }
20262 return;
20263
20264 case 'L':
20265 if (Subtarget->isThumb1Only()) {
20266 // This must be a constant between -7 and 7,
20267 // for 3-operand ADD/SUB immediate instructions.
20268 if (CVal >= -7 && CVal < 7)
20269 break;
20270 } else if (Subtarget->isThumb2()) {
20271 // A constant whose negation can be used as an immediate value in a
20272 // data-processing instruction. This can be used in GCC with an "n"
20273 // modifier that prints the negated value, for use with SUB
20274 // instructions. It is not useful otherwise but is implemented for
20275 // compatibility.
20276 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20277 break;
20278 } else {
20279 // A constant whose negation can be used as an immediate value in a
20280 // data-processing instruction. This can be used in GCC with an "n"
20281 // modifier that prints the negated value, for use with SUB
20282 // instructions. It is not useful otherwise but is implemented for
20283 // compatibility.
20284 if (ARM_AM::getSOImmVal(-CVal) != -1)
20285 break;
20286 }
20287 return;
20288
20289 case 'M':
20290 if (Subtarget->isThumb1Only()) {
20291 // This must be a multiple of 4 between 0 and 1020, for
20292 // ADD sp + immediate.
20293 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20294 break;
20295 } else {
20296 // A power of two or a constant between 0 and 32. This is used in
20297 // GCC for the shift amount on shifted register operands, but it is
20298 // useful in general for any shift amounts.
20299 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20300 break;
20301 }
20302 return;
20303
20304 case 'N':
20305 if (Subtarget->isThumb1Only()) {
20306 // This must be a constant between 0 and 31, for shift amounts.
20307 if (CVal >= 0 && CVal <= 31)
20308 break;
20309 }
20310 return;
20311
20312 case 'O':
20313 if (Subtarget->isThumb1Only()) {
20314 // This must be a multiple of 4 between -508 and 508, for
20315 // ADD/SUB sp = sp + immediate.
20316 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20317 break;
20318 }
20319 return;
20320 }
20321 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20322 break;
20323 }
20324
20325 if (Result.getNode()) {
20326 Ops.push_back(Result);
20327 return;
20328 }
20329 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20330}
20331
20332static RTLIB::Libcall getDivRemLibcall(
20333 const SDNode *N, MVT::SimpleValueType SVT) {
20334 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20335 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20336 "Unhandled Opcode in getDivRemLibcall");
20337 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20338 N->getOpcode() == ISD::SREM;
20339 RTLIB::Libcall LC;
20340 switch (SVT) {
20341 default: llvm_unreachable("Unexpected request for libcall!");
20342 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20343 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20344 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20345 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20346 }
20347 return LC;
20348}
20349
20350 static TargetLowering::ArgListTy getDivRemArgList(
20351 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20352 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20353 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20354 "Unhandled Opcode in getDivRemArgList");
20355 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20356 N->getOpcode() == ISD::SREM;
20357 TargetLowering::ArgListTy Args;
20358 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20359 EVT ArgVT = N->getOperand(i).getValueType();
20360 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20361 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20362 Entry.IsSExt = isSigned;
20363 Entry.IsZExt = !isSigned;
20364 Args.push_back(Entry);
20365 }
20366 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20367 std::swap(Args[0], Args[1]);
20368 return Args;
20369}
20370
20371SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20372 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20373 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20374 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20375 "Register-based DivRem lowering only");
20376 unsigned Opcode = Op->getOpcode();
20377 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20378 "Invalid opcode for Div/Rem lowering");
20379 bool isSigned = (Opcode == ISD::SDIVREM);
20380 EVT VT = Op->getValueType(0);
20381 SDLoc dl(Op);
20382
20383 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20384 SmallVector<SDValue> Result;
20385 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20386 SDValue Res0 =
20387 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20388 SDValue Res1 =
20389 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20390 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20391 {Res0, Res1});
20392 }
20393 }
20394
20395 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20396
20397 // If the target has hardware divide, use divide + multiply + subtract:
20398 // div = a / b
20399 // rem = a - b * div
20400 // return {div, rem}
20401 // This should be lowered into UDIV/SDIV + MLS later on.
20402 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20403 : Subtarget->hasDivideInARMMode();
20404 if (hasDivide && Op->getValueType(0).isSimple() &&
20405 Op->getSimpleValueType(0) == MVT::i32) {
20406 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20407 const SDValue Dividend = Op->getOperand(0);
20408 const SDValue Divisor = Op->getOperand(1);
20409 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20410 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20411 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20412
20413 SDValue Values[2] = {Div, Rem};
20414 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20415 }
20416
20417 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20418 VT.getSimpleVT().SimpleTy);
20419 SDValue InChain = DAG.getEntryNode();
20420
20421 TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(),
20422 DAG.getContext(),
20423 Subtarget);
20424
20425 RTLIB::LibcallImpl LCImpl = getLibcallImpl(LC);
20426 SDValue Callee =
20427 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20428
20429 Type *RetTy = StructType::get(Ty, Ty);
20430
20431 if (Subtarget->isTargetWindows())
20432 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20433
20434 TargetLowering::CallLoweringInfo CLI(DAG);
20435 CLI.setDebugLoc(dl)
20436 .setChain(InChain)
20437 .setCallee(getLibcallImplCallingConv(LCImpl), RetTy, Callee,
20438 std::move(Args))
20439 .setInRegister()
20440 .setSExtResult(isSigned)
20441 .setZExtResult(!isSigned);
20442
20443 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20444 return CallInfo.first;
20445}
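// Illustrative example (editor's sketch, not part of the upstream source):
// with hardware divide available, an i32 sdivrem is expanded as above and
// later selected to something like
//   sdiv r2, r0, r1       ; div = a / b
//   mls  r3, r2, r1, r0   ; rem = a - div * b
// Otherwise a call to the RTABI helper (e.g. __aeabi_idivmod) returns the
// quotient/remainder pair in r0/r1.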
20446
20447// Lowers REM using divmod helpers
20448// see RTABI section 4.2/4.3
20449SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20450 EVT VT = N->getValueType(0);
20451
20452 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20453 SmallVector<SDValue> Result;
20454 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20455 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20456 Result[0], Result[1]);
20457 }
20458
20459 // Build return types (div and rem)
20460 std::vector<Type*> RetTyParams;
20461 Type *RetTyElement;
20462
20463 switch (VT.getSimpleVT().SimpleTy) {
20464 default: llvm_unreachable("Unexpected request for libcall!");
20465 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20466 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20467 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20468 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20469 }
20470
20471 RetTyParams.push_back(RetTyElement);
20472 RetTyParams.push_back(RetTyElement);
20473 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20474 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20475
20476 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20477 SimpleTy);
20478 SDValue InChain = DAG.getEntryNode();
20479 TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(),
20480 Subtarget);
20481 bool isSigned = N->getOpcode() == ISD::SREM;
20482
20483 RTLIB::LibcallImpl LCImpl = getLibcallImpl(LC);
20484 SDValue Callee =
20485 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20486
20487 if (Subtarget->isTargetWindows())
20488 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20489
20490 // Lower call
20491 CallLoweringInfo CLI(DAG);
20492 CLI.setChain(InChain)
20493 .setCallee(getLibcallImplCallingConv(LCImpl), RetTy, Callee,
20494 std::move(Args))
20495 .setSExtResult(isSigned)
20496 .setZExtResult(!isSigned)
20497 .setDebugLoc(SDLoc(N));
20498 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20499
20500 // Return second (rem) result operand (first contains div)
20501 SDNode *ResNode = CallResult.first.getNode();
20502 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20503 return ResNode->getOperand(1);
20504}
20505
20506SDValue
20507ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20508 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20509 SDLoc DL(Op);
20510
20511 // Get the inputs.
20512 SDValue Chain = Op.getOperand(0);
20513 SDValue Size = Op.getOperand(1);
20514
20515 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
20516 "no-stack-arg-probe")) {
20517 MaybeAlign Align =
20518 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20519 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20520 Chain = SP.getValue(1);
20521 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20522 if (Align)
20523 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20524 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20525 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20526 SDValue Ops[2] = { SP, Chain };
20527 return DAG.getMergeValues(Ops, DL);
20528 }
20529
20530 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20531 DAG.getConstant(2, DL, MVT::i32));
20532
20533 SDValue Glue;
20534 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20535 Glue = Chain.getValue(1);
20536
20537 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20538 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20539
20540 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20541 Chain = NewSP.getValue(1);
20542
20543 SDValue Ops[2] = { NewSP, Chain };
20544 return DAG.getMergeValues(Ops, DL);
20545}
20546
20547SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20548 bool IsStrict = Op->isStrictFPOpcode();
20549 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20550 const unsigned DstSz = Op.getValueType().getSizeInBits();
20551 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20552 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20553 "Unexpected type for custom-lowering FP_EXTEND");
20554
20555 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20556 "With both FP DP and 16, any FP conversion is legal!");
20557
20558 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20559 "With FP16, 16 to 32 conversion is legal!");
20560
20561 // Converting from 32 -> 64 is valid if we have FP64.
20562 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20563 // FIXME: Remove this when we have strict fp instruction selection patterns
20564 if (IsStrict) {
20565 SDLoc Loc(Op);
20566 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20567 Loc, Op.getValueType(), SrcVal);
20568 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20569 }
20570 return Op;
20571 }
20572
20573 // Either we are converting from 16 -> 64, without FP16 and/or
20574 // FP.double-precision or without Armv8-fp. So we must do it in two
20575 // steps.
20576 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
20577 // without FP16. So we must do a function call.
20578 SDLoc Loc(Op);
20579 RTLIB::Libcall LC;
20580 MakeLibCallOptions CallOptions;
20581 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20582 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20583 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20584 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20585 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20586 if (Supported) {
20587 if (IsStrict) {
20588 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20589 {DstVT, MVT::Other}, {Chain, SrcVal});
20590 Chain = SrcVal.getValue(1);
20591 } else {
20592 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20593 }
20594 } else {
20595 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20596 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20597 "Unexpected type for custom-lowering FP_EXTEND");
20598 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20599 Loc, Chain);
20600 }
20601 }
20602
20603 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20604}
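// Illustrative example (editor's sketch, not part of the upstream source):
// extending f16 to f64 on a core with FP16 conversions but no double
// precision follows the two steps above: an f16 -> f32 VCVTB (legal thanks to
// FP16), then a libcall chosen by RTLIB::getFPEXT for f32 -> f64, which on
// AEABI targets is typically __aeabi_f2d.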
20605
20606SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20607 bool IsStrict = Op->isStrictFPOpcode();
20608
20609 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20610 EVT SrcVT = SrcVal.getValueType();
20611 EVT DstVT = Op.getValueType();
20612 const unsigned DstSz = Op.getValueType().getSizeInBits();
20613 const unsigned SrcSz = SrcVT.getSizeInBits();
20614 (void)DstSz;
20615 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20616 "Unexpected type for custom-lowering FP_ROUND");
20617
20618 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20619 "With both FP DP and 16, any FP conversion is legal!");
20620
20621 SDLoc Loc(Op);
20622
20623 // Instruction from 32 -> 16 if hasFP16 is valid
20624 if (SrcSz == 32 && Subtarget->hasFP16())
20625 return Op;
20626
20627 // Lib call from 32 -> 16 / 64 -> [32, 16]
20628 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20629 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20630 "Unexpected type for custom-lowering FP_ROUND");
20631 MakeLibCallOptions CallOptions;
20632 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20633 SDValue Result;
20634 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20635 Loc, Chain);
20636 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20637}
20638
20639bool
20640 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
20641 // The ARM target isn't yet aware of offsets.
20642 return false;
20643}
20644
20645 bool ARM::isBitFieldInvertedMask(unsigned v) {
20646 if (v == 0xffffffff)
20647 return false;
20648
20649 // there can be 1's on either or both "outsides", all the "inside"
20650 // bits must be 0's
20651 return isShiftedMask_32(~v);
20652}
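// Illustrative example (editor's sketch, not part of the upstream source):
// 0xffff00ff is an "inverted bit-field" mask because its complement
// 0x0000ff00 is a single contiguous run of ones, so an AND with it can be
// selected as BFC (bit-field clear). A value like 0xff00ff00 is rejected
// since its complement has two separate runs.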
20653
20654/// isFPImmLegal - Returns true if the target can instruction select the
20655/// specified FP immediate natively. If false, the legalizer will
20656/// materialize the FP immediate as a load from a constant pool.
20657 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
20658 bool ForCodeSize) const {
20659 if (!Subtarget->hasVFP3Base())
20660 return false;
20661 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20662 return ARM_AM::getFP16Imm(Imm) != -1;
20663 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20664 ARM_AM::getFP32FP16Imm(Imm) != -1)
20665 return true;
20666 if (VT == MVT::f32)
20667 return ARM_AM::getFP32Imm(Imm) != -1;
20668 if (VT == MVT::f64 && Subtarget->hasFP64())
20669 return ARM_AM::getFP64Imm(Imm) != -1;
20670 return false;
20671}
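// Illustrative example (editor's sketch, not part of the upstream source):
// VMOV (immediate) encodes values of the form +/- (1 + m/16) * 2^e with
// m in [0, 15] and e in [-3, 4], so constants such as 1.0f, -0.5f or 31.0f
// can be materialized directly (vmov.f32 s0, #1.0), while 0.0f or 0.1f fall
// back to a constant-pool load (or an integer move) instead.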
20672
20673/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20674/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20675/// specified in the intrinsic calls.
20676 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
20677 const CallBase &I,
20678 MachineFunction &MF,
20679 unsigned Intrinsic) const {
20680 switch (Intrinsic) {
20681 case Intrinsic::arm_neon_vld1:
20682 case Intrinsic::arm_neon_vld2:
20683 case Intrinsic::arm_neon_vld3:
20684 case Intrinsic::arm_neon_vld4:
20685 case Intrinsic::arm_neon_vld2lane:
20686 case Intrinsic::arm_neon_vld3lane:
20687 case Intrinsic::arm_neon_vld4lane:
20688 case Intrinsic::arm_neon_vld2dup:
20689 case Intrinsic::arm_neon_vld3dup:
20690 case Intrinsic::arm_neon_vld4dup: {
20691 Info.opc = ISD::INTRINSIC_W_CHAIN;
20692 // Conservatively set memVT to the entire set of vectors loaded.
20693 auto &DL = I.getDataLayout();
20694 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20695 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20696 Info.ptrVal = I.getArgOperand(0);
20697 Info.offset = 0;
20698 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20699 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20700 // volatile loads with NEON intrinsics not supported
20701 Info.flags = MachineMemOperand::MOLoad;
20702 return true;
20703 }
20704 case Intrinsic::arm_neon_vld1x2:
20705 case Intrinsic::arm_neon_vld1x3:
20706 case Intrinsic::arm_neon_vld1x4: {
20707 Info.opc = ISD::INTRINSIC_W_CHAIN;
20708 // Conservatively set memVT to the entire set of vectors loaded.
20709 auto &DL = I.getDataLayout();
20710 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20711 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20712 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20713 Info.offset = 0;
20714 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20715 // volatile loads with NEON intrinsics not supported
20716 Info.flags = MachineMemOperand::MOLoad;
20717 return true;
20718 }
20719 case Intrinsic::arm_neon_vst1:
20720 case Intrinsic::arm_neon_vst2:
20721 case Intrinsic::arm_neon_vst3:
20722 case Intrinsic::arm_neon_vst4:
20723 case Intrinsic::arm_neon_vst2lane:
20724 case Intrinsic::arm_neon_vst3lane:
20725 case Intrinsic::arm_neon_vst4lane: {
20726 Info.opc = ISD::INTRINSIC_VOID;
20727 // Conservatively set memVT to the entire set of vectors stored.
20728 auto &DL = I.getDataLayout();
20729 unsigned NumElts = 0;
20730 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20731 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20732 if (!ArgTy->isVectorTy())
20733 break;
20734 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20735 }
20736 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20737 Info.ptrVal = I.getArgOperand(0);
20738 Info.offset = 0;
20739 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20740 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20741 // volatile stores with NEON intrinsics not supported
20742 Info.flags = MachineMemOperand::MOStore;
20743 return true;
20744 }
20745 case Intrinsic::arm_neon_vst1x2:
20746 case Intrinsic::arm_neon_vst1x3:
20747 case Intrinsic::arm_neon_vst1x4: {
20748 Info.opc = ISD::INTRINSIC_VOID;
20749 // Conservatively set memVT to the entire set of vectors stored.
20750 auto &DL = I.getDataLayout();
20751 unsigned NumElts = 0;
20752 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20753 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20754 if (!ArgTy->isVectorTy())
20755 break;
20756 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20757 }
20758 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20759 Info.ptrVal = I.getArgOperand(0);
20760 Info.offset = 0;
20761 Info.align = I.getParamAlign(0).valueOrOne();
20762 // volatile stores with NEON intrinsics not supported
20763 Info.flags = MachineMemOperand::MOStore;
20764 return true;
20765 }
20766 case Intrinsic::arm_mve_vld2q:
20767 case Intrinsic::arm_mve_vld4q: {
20768 Info.opc = ISD::INTRINSIC_W_CHAIN;
20769 // Conservatively set memVT to the entire set of vectors loaded.
20770 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20771 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20772 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20773 Info.ptrVal = I.getArgOperand(0);
20774 Info.offset = 0;
20775 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20776 // volatile loads with MVE intrinsics not supported
20777 Info.flags = MachineMemOperand::MOLoad;
20778 return true;
20779 }
20780 case Intrinsic::arm_mve_vst2q:
20781 case Intrinsic::arm_mve_vst4q: {
20782 Info.opc = ISD::INTRINSIC_VOID;
20783 // Conservatively set memVT to the entire set of vectors stored.
20784 Type *VecTy = I.getArgOperand(1)->getType();
20785 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20786 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20787 Info.ptrVal = I.getArgOperand(0);
20788 Info.offset = 0;
20789 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20790 // volatile stores with MVE intrinsics not supported
20791 Info.flags = MachineMemOperand::MOStore;
20792 return true;
20793 }
20794 case Intrinsic::arm_mve_vldr_gather_base:
20795 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20796 Info.opc = ISD::INTRINSIC_W_CHAIN;
20797 Info.ptrVal = nullptr;
20798 Info.memVT = MVT::getVT(I.getType());
20799 Info.align = Align(1);
20800 Info.flags |= MachineMemOperand::MOLoad;
20801 return true;
20802 }
20803 case Intrinsic::arm_mve_vldr_gather_base_wb:
20804 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20805 Info.opc = ISD::INTRINSIC_W_CHAIN;
20806 Info.ptrVal = nullptr;
20807 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
20808 Info.align = Align(1);
20809 Info.flags |= MachineMemOperand::MOLoad;
20810 return true;
20811 }
20812 case Intrinsic::arm_mve_vldr_gather_offset:
20813 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
20814 Info.opc = ISD::INTRINSIC_W_CHAIN;
20815 Info.ptrVal = nullptr;
20816 MVT DataVT = MVT::getVT(I.getType());
20817 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
20818 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20819 DataVT.getVectorNumElements());
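// For example, a gather producing <4 x i32> data with a 16-bit memory element
// size gets memVT v4i16; the loaded elements are extended to the data type.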
20820 Info.align = Align(1);
20821 Info.flags |= MachineMemOperand::MOLoad;
20822 return true;
20823 }
20824 case Intrinsic::arm_mve_vstr_scatter_base:
20825 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
20826 Info.opc = ISD::INTRINSIC_VOID;
20827 Info.ptrVal = nullptr;
20828 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20829 Info.align = Align(1);
20830 Info.flags |= MachineMemOperand::MOStore;
20831 return true;
20832 }
20833 case Intrinsic::arm_mve_vstr_scatter_base_wb:
20834 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
20835 Info.opc = ISD::INTRINSIC_W_CHAIN;
20836 Info.ptrVal = nullptr;
20837 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20838 Info.align = Align(1);
20839 Info.flags |= MachineMemOperand::MOStore;
20840 return true;
20841 }
20842 case Intrinsic::arm_mve_vstr_scatter_offset:
20843 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
20844 Info.opc = ISD::INTRINSIC_VOID;
20845 Info.ptrVal = nullptr;
20846 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
20847 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
20848 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20849 DataVT.getVectorNumElements());
20850 Info.align = Align(1);
20851 Info.flags |= MachineMemOperand::MOStore;
20852 return true;
20853 }
20854 case Intrinsic::arm_ldaex:
20855 case Intrinsic::arm_ldrex: {
20856 auto &DL = I.getDataLayout();
20857 Type *ValTy = I.getParamElementType(0);
20858 Info.opc = ISD::INTRINSIC_W_CHAIN;
20859 Info.memVT = MVT::getVT(ValTy);
20860 Info.ptrVal = I.getArgOperand(0);
20861 Info.offset = 0;
20862 Info.align = DL.getABITypeAlign(ValTy);
20863 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20864 return true;
20865 }
20866 case Intrinsic::arm_stlex:
20867 case Intrinsic::arm_strex: {
20868 auto &DL = I.getDataLayout();
20869 Type *ValTy = I.getParamElementType(1);
20870 Info.opc = ISD::INTRINSIC_W_CHAIN;
20871 Info.memVT = MVT::getVT(ValTy);
20872 Info.ptrVal = I.getArgOperand(1);
20873 Info.offset = 0;
20874 Info.align = DL.getABITypeAlign(ValTy);
20875 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20876 return true;
20877 }
20878 case Intrinsic::arm_stlexd:
20879 case Intrinsic::arm_strexd:
20880 Info.opc = ISD::INTRINSIC_W_CHAIN;
20881 Info.memVT = MVT::i64;
20882 Info.ptrVal = I.getArgOperand(2);
20883 Info.offset = 0;
20884 Info.align = Align(8);
20885 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
20886 return true;
20887
20888 case Intrinsic::arm_ldaexd:
20889 case Intrinsic::arm_ldrexd:
20890 Info.opc = ISD::INTRINSIC_W_CHAIN;
20891 Info.memVT = MVT::i64;
20892 Info.ptrVal = I.getArgOperand(0);
20893 Info.offset = 0;
20894 Info.align = Align(8);
20895 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
20896 return true;
20897
20898 default:
20899 break;
20900 }
20901
20902 return false;
20903}
20904
20905/// Returns true if it is beneficial to convert a load of a constant
20906/// to just the constant itself.
20908 Type *Ty) const {
20909 assert(Ty->isIntegerTy());
20910
20911 unsigned Bits = Ty->getPrimitiveSizeInBits();
20912 if (Bits == 0 || Bits > 32)
20913 return false;
20914 return true;
20915}
20916
20918 unsigned Index) const {
20920 return false;
20921
20922 return (Index == 0 || Index == ResVT.getVectorNumElements());
20923}
20924
20926 ARM_MB::MemBOpt Domain) const {
20927 // First, if the target has no DMB, see what fallback we can use.
20928 if (!Subtarget->hasDataBarrier()) {
20929 // Some ARMv6 CPUs can support data barriers with an mcr instruction.
20930 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
20931 // here.
20932 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
20933 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
20934 Builder.getInt32(0), Builder.getInt32(7),
20935 Builder.getInt32(10), Builder.getInt32(5)};
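// These operands encode "mcr p15, 0, <Rt>, c7, c10, 5", the CP15 data
// memory barrier operation available on ARMv6.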
20936 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
20937 } else {
20938 // Instead of using barriers, atomic accesses on these subtargets use
20939 // libcalls.
20940 llvm_unreachable("makeDMB on a target so old that it has no barriers");
20941 }
20942 } else {
20943 // Only a full system barrier exists in the M-class architectures.
20944 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
20945 Constant *CDomain = Builder.getInt32(Domain);
20946 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
20947 }
20948}
20949
20950// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
20952 Instruction *Inst,
20953 AtomicOrdering Ord) const {
20954 switch (Ord) {
20957 llvm_unreachable("Invalid fence: unordered/non-atomic");
20960 return nullptr; // Nothing to do
20962 if (!Inst->hasAtomicStore())
20963 return nullptr; // Nothing to do
20964 [[fallthrough]];
20967 if (Subtarget->preferISHSTBarriers())
20968 return makeDMB(Builder, ARM_MB::ISHST);
20969 // FIXME: add a comment with a link to documentation justifying this.
20970 else
20971 return makeDMB(Builder, ARM_MB::ISH);
20972 }
20973 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
20974}
20975
20977 Instruction *Inst,
20978 AtomicOrdering Ord) const {
20979 switch (Ord) {
20982 llvm_unreachable("Invalid fence: unordered/not-atomic");
20985 return nullptr; // Nothing to do
20989 return makeDMB(Builder, ARM_MB::ISH);
20990 }
20991 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
20992}
20993
20994// Loads and stores less than 64-bits are already atomic; ones above that
20995// are doomed anyway, so defer to the default libcall and blame the OS when
20996// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
20997// anything for those.
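// For example, a 64-bit atomic store on Cortex-M is not expanded here and is
// typically lowered to an __atomic_* libcall instead.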
21000 bool has64BitAtomicStore;
21001 if (Subtarget->isMClass())
21002 has64BitAtomicStore = false;
21003 else if (Subtarget->isThumb())
21004 has64BitAtomicStore = Subtarget->hasV7Ops();
21005 else
21006 has64BitAtomicStore = Subtarget->hasV6Ops();
21007
21008 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21009 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21011}
21012
21013// Loads and stores less than 64-bits are already atomic; ones above that
21014// are doomed anyway, so defer to the default libcall and blame the OS when
21015// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21016// anything for those.
21017// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21018// guarantee, see DDI0406C ARM architecture reference manual,
21019// sections A8.8.72-74 LDRD)
21022 bool has64BitAtomicLoad;
21023 if (Subtarget->isMClass())
21024 has64BitAtomicLoad = false;
21025 else if (Subtarget->isThumb())
21026 has64BitAtomicLoad = Subtarget->hasV7Ops();
21027 else
21028 has64BitAtomicLoad = Subtarget->hasV6Ops();
21029
21030 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21031 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21033}
21034
21035// For the real atomic operations, we have ldrex/strex up to 32 bits,
21036// and up to 64 bits on the non-M profiles
21039 if (AI->isFloatingPointOperation())
21041
21042 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21043 bool hasAtomicRMW;
21044 if (Subtarget->isMClass())
21045 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21046 else if (Subtarget->isThumb())
21047 hasAtomicRMW = Subtarget->hasV7Ops();
21048 else
21049 hasAtomicRMW = Subtarget->hasV6Ops();
21050 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21051 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21052 // implement atomicrmw without spilling. If the target address is also on
21053 // the stack and close enough to the spill slot, this can lead to a
21054 // situation where the monitor always gets cleared and the atomic operation
21055 // can never succeed. So at -O0 lower this operation to a CAS loop.
21056 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21059 }
21061}
21062
21063// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21064// bits, and up to 64 bits on the non-M profiles.
21067 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21068 // implement cmpxchg without spilling. If the address being exchanged is also
21069 // on the stack and close enough to the spill slot, this can lead to a
21070 // situation where the monitor always gets cleared and the atomic operation
21071 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21072 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21073 bool HasAtomicCmpXchg;
21074 if (Subtarget->isMClass())
21075 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21076 else if (Subtarget->isThumb())
21077 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21078 else
21079 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21080 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21081 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21084}
21085
21087 const Instruction *I) const {
21088 return InsertFencesForAtomic;
21089}
21090
21092 // ROPI/RWPI are not supported currently.
21093 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21094}
21095
21097 // The MSVC CRT provides functionality for stack protection.
21098 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21099 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21100
21101 RTLIB::LibcallImpl SecurityCookieVar =
21102 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21103 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21104 SecurityCookieVar != RTLIB::Unsupported) {
21105 // MSVC CRT has a global variable holding security cookie.
21106 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21107 PointerType::getUnqual(M.getContext()));
21108
21109 // MSVC CRT has a function to validate security cookie.
21110 FunctionCallee SecurityCheckCookie =
21111 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21112 Type::getVoidTy(M.getContext()),
21113 PointerType::getUnqual(M.getContext()));
21114 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21115 F->addParamAttr(0, Attribute::AttrKind::InReg);
21116 }
21117
21119}
21120
21122 unsigned &Cost) const {
21123 // If we do not have NEON, vector types are not natively supported.
21124 if (!Subtarget->hasNEON())
21125 return false;
21126
21127 // Floating point values and vector values map to the same register file.
21128 // Therefore, although we could do a store extract of a vector type, it is
21129 // better to leave it as a float, as we have more freedom in the addressing
21130 // mode for those.
21131 if (VectorTy->isFPOrFPVectorTy())
21132 return false;
21133
21134 // If the index is unknown at compile time, this is very expensive to lower
21135 // and it is not possible to combine the store with the extract.
21136 if (!isa<ConstantInt>(Idx))
21137 return false;
21138
21139 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21140 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21141 // We can do a store + vector extract on any vector that fits perfectly in a D
21142 // or Q register.
21143 if (BitWidth == 64 || BitWidth == 128) {
21144 Cost = 0;
21145 return true;
21146 }
21147 return false;
21148}
21149
21151 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21152 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21153 unsigned Opcode = Op.getOpcode();
21154 switch (Opcode) {
21155 case ARMISD::VORRIMM:
21156 case ARMISD::VBICIMM:
21157 return false;
21158 }
21160 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21161}
21162
21164 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21165}
21166
21168 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21169}
21170
21172 const Instruction &AndI) const {
21173 if (!Subtarget->hasV7Ops())
21174 return false;
21175
21176 // Sink the `and` instruction only if the mask would fit into a modified
21177 // immediate operand.
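// For example, a mask of 0xff00 can be encoded as a modified immediate on both
// ARM and Thumb-2, so sinking the 'and' is considered beneficial.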
21179 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21180 return false;
21181 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21182 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21183 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21184}
21185
21188 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21189 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21192 ExpansionFactor);
21193}
21194
21196 Value *Addr,
21197 AtomicOrdering Ord) const {
21198 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21199 bool IsAcquire = isAcquireOrStronger(Ord);
21200
21201 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21202 // intrinsic must return {i32, i32} and we have to recombine them into a
21203 // single i64 here.
21204 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21206 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21207
21208 Value *LoHi =
21209 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21210
21211 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21212 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21213 if (!Subtarget->isLittle())
21214 std::swap(Lo, Hi);
21215 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21216 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21217 return Builder.CreateOr(
21218 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21219 }
21220
21221 Type *Tys[] = { Addr->getType() };
21222 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21223 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21224
21225 CI->addParamAttr(
21226 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21227 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21228}
21229
21231 IRBuilderBase &Builder) const {
21232 if (!Subtarget->hasV7Ops())
21233 return;
21234 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21235}
21236
21238 Value *Val, Value *Addr,
21239 AtomicOrdering Ord) const {
21240 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21241 bool IsRelease = isReleaseOrStronger(Ord);
21242
21243 // Since the intrinsics must have legal type, the i64 intrinsics take two
21244 // parameters: "i32, i32". We must marshal Val into the appropriate form
21245 // before the call.
21246 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21248 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21249 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21250
21251 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21252 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21253 if (!Subtarget->isLittle())
21254 std::swap(Lo, Hi);
21255 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21256 }
21257
21258 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21259 Type *Tys[] = { Addr->getType() };
21261
21262 CallInst *CI = Builder.CreateCall(
21263 Strex, {Builder.CreateZExtOrBitCast(
21264 Val, Strex->getFunctionType()->getParamType(0)),
21265 Addr});
21266 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21267 Val->getType()));
21268 return CI;
21269}
21270
21271
21273 return Subtarget->isMClass();
21274}
21275
21276/// A helper function for determining the number of interleaved accesses we
21277/// will generate when lowering accesses of the given type.
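/// For example, a 512-bit vector type is lowered using four 128-bit accesses.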
21278unsigned
21280 const DataLayout &DL) const {
21281 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21282}
21283
21285 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21286 const DataLayout &DL) const {
21287
21288 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21289 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21290
21291 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21292 return false;
21293
21294 // Ensure the vector doesn't have f16 elements. Even though we could do an
21295 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21296 // f32.
21297 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21298 return false;
21299 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21300 return false;
21301
21302 // Ensure the number of vector elements is greater than 1.
21303 if (VecTy->getNumElements() < 2)
21304 return false;
21305
21306 // Ensure the element type is legal.
21307 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21308 return false;
21309 // And that the alignment is high enough under MVE.
21310 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21311 return false;
21312
21313 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21314 // 128 will be split into multiple interleaved accesses.
21315 if (Subtarget->hasNEON() && VecSize == 64)
21316 return true;
21317 return VecSize % 128 == 0;
21318}
21319
21321 if (Subtarget->hasNEON())
21322 return 4;
21323 if (Subtarget->hasMVEIntegerOps())
21326}
21327
21328/// Lower an interleaved load into a vldN intrinsic.
21329///
21330/// E.g. Lower an interleaved load (Factor = 2):
21331/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21332/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21333/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21334///
21335/// Into:
21336/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21337/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21338/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
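/// On MVE targets the equivalent llvm.arm.mve.vld2q/vld4q intrinsics are used
/// instead; an interleave factor of 3 is not supported there.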
21340 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21341 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21342 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21343 "Invalid interleave factor");
21344 assert(!Shuffles.empty() && "Empty shufflevector input");
21345 assert(Shuffles.size() == Indices.size() &&
21346 "Unmatched number of shufflevectors and indices");
21347
21348 auto *LI = dyn_cast<LoadInst>(Load);
21349 if (!LI)
21350 return false;
21351 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21352
21353 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21354 Type *EltTy = VecTy->getElementType();
21355
21356 const DataLayout &DL = LI->getDataLayout();
21357 Align Alignment = LI->getAlign();
21358
21359 // Skip if we do not have NEON and skip illegal vector types. We can
21360 // "legalize" wide vector types into multiple interleaved accesses as long as
21361 // the vector types are divisible by 128.
21362 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21363 return false;
21364
21365 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21366
21367 // A pointer vector cannot be the return type of the ldN intrinsics. Need to
21368 // load integer vectors first and then convert to pointer vectors.
21369 if (EltTy->isPointerTy())
21370 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21371
21372 IRBuilder<> Builder(LI);
21373
21374 // The base address of the load.
21375 Value *BaseAddr = LI->getPointerOperand();
21376
21377 if (NumLoads > 1) {
21378 // If we're going to generate more than one load, reset the sub-vector type
21379 // to something legal.
21380 VecTy = FixedVectorType::get(VecTy->getElementType(),
21381 VecTy->getNumElements() / NumLoads);
21382 }
21383
21384 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21385
21386 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21387 if (Subtarget->hasNEON()) {
21388 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21389 Type *Tys[] = {VecTy, PtrTy};
21390 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21391 Intrinsic::arm_neon_vld3,
21392 Intrinsic::arm_neon_vld4};
21393
21395 Ops.push_back(BaseAddr);
21396 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21397
21398 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21399 /*FMFSource=*/nullptr, "vldN");
21400 } else {
21401 assert((Factor == 2 || Factor == 4) &&
21402 "expected interleave factor of 2 or 4 for MVE");
21403 Intrinsic::ID LoadInts =
21404 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21405 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21406 Type *Tys[] = {VecTy, PtrTy};
21407
21409 Ops.push_back(BaseAddr);
21410 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21411 "vldN");
21412 }
21413 };
21414
21415 // Holds sub-vectors extracted from the load intrinsic return values. The
21416 // sub-vectors are associated with the shufflevector instructions they will
21417 // replace.
21419
21420 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21421 // If we're generating more than one load, compute the base address of
21422 // subsequent loads as an offset from the previous.
21423 if (LoadCount > 0)
21424 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21425 VecTy->getNumElements() * Factor);
21426
21427 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21428
21429 // Replace uses of each shufflevector with the corresponding vector loaded
21430 // by ldN.
21431 for (unsigned i = 0; i < Shuffles.size(); i++) {
21432 ShuffleVectorInst *SV = Shuffles[i];
21433 unsigned Index = Indices[i];
21434
21435 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21436
21437 // Convert the integer vector to pointer vector if the element is pointer.
21438 if (EltTy->isPointerTy())
21439 SubVec = Builder.CreateIntToPtr(
21440 SubVec,
21442
21443 SubVecs[SV].push_back(SubVec);
21444 }
21445 }
21446
21447 // Replace uses of the shufflevector instructions with the sub-vectors
21448 // returned by the load intrinsic. If a shufflevector instruction is
21449 // associated with more than one sub-vector, those sub-vectors will be
21450 // concatenated into a single wide vector.
21451 for (ShuffleVectorInst *SVI : Shuffles) {
21452 auto &SubVec = SubVecs[SVI];
21453 auto *WideVec =
21454 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21455 SVI->replaceAllUsesWith(WideVec);
21456 }
21457
21458 return true;
21459}
21460
21461/// Lower an interleaved store into a vstN intrinsic.
21462///
21463/// E.g. Lower an interleaved store (Factor = 3):
21464/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21465/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21466/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21467///
21468/// Into:
21469/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21470/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21471/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21472/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21473///
21474/// Note that the new shufflevectors will be removed and we'll only generate one
21475/// vst3 instruction in CodeGen.
21476///
21477/// Example for a more general valid mask (Factor 3). Lower:
21478/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21479/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21480/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21481///
21482/// Into:
21483/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21484/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21485/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21486/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21488 Value *LaneMask,
21489 ShuffleVectorInst *SVI,
21490 unsigned Factor,
21491 const APInt &GapMask) const {
21492 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21493 "Invalid interleave factor");
21494 auto *SI = dyn_cast<StoreInst>(Store);
21495 if (!SI)
21496 return false;
21497 assert(!LaneMask && GapMask.popcount() == Factor &&
21498 "Unexpected mask on store");
21499
21500 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21501 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21502
21503 unsigned LaneLen = VecTy->getNumElements() / Factor;
21504 Type *EltTy = VecTy->getElementType();
21505 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21506
21507 const DataLayout &DL = SI->getDataLayout();
21508 Align Alignment = SI->getAlign();
21509
21510 // Skip if we do not have NEON and skip illegal vector types. We can
21511 // "legalize" wide vector types into multiple interleaved accesses as long as
21512 // the vector types are divisible by 128.
21513 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21514 return false;
21515
21516 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21517
21518 Value *Op0 = SVI->getOperand(0);
21519 Value *Op1 = SVI->getOperand(1);
21520 IRBuilder<> Builder(SI);
21521
21522 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21523 // vectors to integer vectors.
21524 if (EltTy->isPointerTy()) {
21525 Type *IntTy = DL.getIntPtrType(EltTy);
21526
21527 // Convert to the corresponding integer vector.
21528 auto *IntVecTy =
21530 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21531 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21532
21533 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21534 }
21535
21536 // The base address of the store.
21537 Value *BaseAddr = SI->getPointerOperand();
21538
21539 if (NumStores > 1) {
21540 // If we're going to generate more than one store, reset the lane length
21541 // and sub-vector type to something legal.
21542 LaneLen /= NumStores;
21543 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21544 }
21545
21546 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21547
21548 auto Mask = SVI->getShuffleMask();
21549
21550 auto createStoreIntrinsic = [&](Value *BaseAddr,
21551 SmallVectorImpl<Value *> &Shuffles) {
21552 if (Subtarget->hasNEON()) {
21553 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21554 Intrinsic::arm_neon_vst3,
21555 Intrinsic::arm_neon_vst4};
21556 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21557 Type *Tys[] = {PtrTy, SubVecTy};
21558
21560 Ops.push_back(BaseAddr);
21561 append_range(Ops, Shuffles);
21562 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21563 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21564 } else {
21565 assert((Factor == 2 || Factor == 4) &&
21566 "expected interleave factor of 2 or 4 for MVE");
21567 Intrinsic::ID StoreInts =
21568 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21569 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21570 Type *Tys[] = {PtrTy, SubVecTy};
21571
21573 Ops.push_back(BaseAddr);
21574 append_range(Ops, Shuffles);
21575 for (unsigned F = 0; F < Factor; F++) {
21576 Ops.push_back(Builder.getInt32(F));
21577 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21578 Ops.pop_back();
21579 }
21580 }
21581 };
21582
21583 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21584 // If we're generating more than one store, compute the base address of
21585 // subsequent stores as an offset from the previous.
21586 if (StoreCount > 0)
21587 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21588 BaseAddr, LaneLen * Factor);
21589
21590 SmallVector<Value *, 4> Shuffles;
21591
21592 // Split the shufflevector operands into sub vectors for the new vstN call.
21593 for (unsigned i = 0; i < Factor; i++) {
21594 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21595 if (Mask[IdxI] >= 0) {
21596 Shuffles.push_back(Builder.CreateShuffleVector(
21597 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21598 } else {
21599 unsigned StartMask = 0;
21600 for (unsigned j = 1; j < LaneLen; j++) {
21601 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21602 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21603 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21604 break;
21605 }
21606 }
21607 // Note: If all elements in a chunk are undefs, StartMask=0!
21608 // Note: Filling undef gaps with random elements is ok, since
21609 // those elements were being written anyway (with undefs).
21610 // In the case of all undefs we're defaulting to using elems from 0
21611 // Note: StartMask cannot be negative, it's checked in
21612 // isReInterleaveMask
21613 Shuffles.push_back(Builder.CreateShuffleVector(
21614 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21615 }
21616 }
21617
21618 createStoreIntrinsic(BaseAddr, Shuffles);
21619 }
21620 return true;
21621}
21622
21623 enum HABaseType {
21624 HA_UNKNOWN = 0,
21625 HA_FLOAT,
21626 HA_DOUBLE,
21627 HA_VECT64,
21628 HA_VECT128
21629 };
21630
21631 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
21632 uint64_t &Members) {
21633 if (auto *ST = dyn_cast<StructType>(Ty)) {
21634 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21635 uint64_t SubMembers = 0;
21636 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21637 return false;
21638 Members += SubMembers;
21639 }
21640 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21641 uint64_t SubMembers = 0;
21642 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21643 return false;
21644 Members += SubMembers * AT->getNumElements();
21645 } else if (Ty->isFloatTy()) {
21646 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21647 return false;
21648 Members = 1;
21649 Base = HA_FLOAT;
21650 } else if (Ty->isDoubleTy()) {
21651 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21652 return false;
21653 Members = 1;
21654 Base = HA_DOUBLE;
21655 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21656 Members = 1;
21657 switch (Base) {
21658 case HA_FLOAT:
21659 case HA_DOUBLE:
21660 return false;
21661 case HA_VECT64:
21662 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21663 case HA_VECT128:
21664 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21665 case HA_UNKNOWN:
21666 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21667 case 64:
21668 Base = HA_VECT64;
21669 return true;
21670 case 128:
21671 Base = HA_VECT128;
21672 return true;
21673 default:
21674 return false;
21675 }
21676 }
21677 }
21678
21679 return (Members > 0 && Members <= 4);
21680}
21681
21682/// Return the correct alignment for the current calling convention.
21684 Type *ArgTy, const DataLayout &DL) const {
21685 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21686 if (!ArgTy->isVectorTy())
21687 return ABITypeAlign;
21688
21689 // Avoid over-aligning vector parameters. It would require realigning the
21690 // stack and waste space for no real benefit.
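// For example, a 128-bit vector with a 16-byte ABI alignment is passed with
// 8-byte alignment when the data layout's stack alignment is 8, as under AAPCS.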
21691 MaybeAlign StackAlign = DL.getStackAlignment();
21692 assert(StackAlign && "data layout string is missing stack alignment");
21693 return std::min(ABITypeAlign, *StackAlign);
21694}
21695
21696/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21697/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21698/// passing according to AAPCS rules.
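/// For example, struct { float x; float y[2]; } is a homogeneous aggregate of
/// three floats and can be passed in VFP registers under AAPCS-VFP.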
21700 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21701 const DataLayout &DL) const {
21702 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21704 return false;
21705
21707 uint64_t Members = 0;
21708 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21709 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21710
21711 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21712 return IsHA || IsIntArray;
21713}
21714
21716 const Constant *PersonalityFn) const {
21717 // Platforms which do not use SjLj EH may return values in these registers
21718 // via the personality function.
21720 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21721}
21722
21724 const Constant *PersonalityFn) const {
21725 // Platforms which do not use SjLj EH may return values in these registers
21726 // via the personality function.
21728 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21729}
21730
21731void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21732 // Update IsSplitCSR in ARMFunctionInfo.
21733 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21734 AFI->setIsSplitCSR(true);
21735}
21736
21737void ARMTargetLowering::insertCopiesSplitCSR(
21738 MachineBasicBlock *Entry,
21739 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21740 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21741 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21742 if (!IStart)
21743 return;
21744
21745 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21746 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21747 MachineBasicBlock::iterator MBBI = Entry->begin();
21748 for (const MCPhysReg *I = IStart; *I; ++I) {
21749 const TargetRegisterClass *RC = nullptr;
21750 if (ARM::GPRRegClass.contains(*I))
21751 RC = &ARM::GPRRegClass;
21752 else if (ARM::DPRRegClass.contains(*I))
21753 RC = &ARM::DPRRegClass;
21754 else
21755 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21756
21757 Register NewVR = MRI->createVirtualRegister(RC);
21758 // Create copy from CSR to a virtual register.
21759 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21760 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21761 // nounwind. If we want to generalize this later, we may need to emit
21762 // CFI pseudo-instructions.
21763 assert(Entry->getParent()->getFunction().hasFnAttribute(
21764 Attribute::NoUnwind) &&
21765 "Function should be nounwind in insertCopiesSplitCSR!");
21766 Entry->addLiveIn(*I);
21767 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21768 .addReg(*I);
21769
21770 // Insert the copy-back instructions right before the terminator.
21771 for (auto *Exit : Exits)
21772 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21773 TII->get(TargetOpcode::COPY), *I)
21774 .addReg(NewVR);
21775 }
21776}
21777
21782
21784 return Subtarget->hasMVEIntegerOps();
21785}
21786
21789 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21790 if (!VTy)
21791 return false;
21792
21793 auto *ScalarTy = VTy->getScalarType();
21794 unsigned NumElements = VTy->getNumElements();
21795
21796 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21797 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
21798 return false;
21799
21800 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
21801 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
21802 return Subtarget->hasMVEFloatOps();
21803
21805 return false;
21806
21807 return Subtarget->hasMVEIntegerOps() &&
21808 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
21809 ScalarTy->isIntegerTy(32));
21810}
21811
21813 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
21814 return RCRegs;
21815}
21816
21819 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
21820 Value *Accumulator) const {
21821
21823
21824 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
21825
21826 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
21827
21828 if (TyWidth > 128) {
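// Types wider than 128 bits are split in half: each half is lowered to a
// 128-bit MVE operation and the two results are concatenated back together.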
21829 int Stride = Ty->getNumElements() / 2;
21830 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
21831 auto SplitSeqVec = llvm::to_vector(SplitSeq);
21832 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
21833 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
21834
21835 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
21836 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
21837 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
21838 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
21839 Value *LowerSplitAcc = nullptr;
21840 Value *UpperSplitAcc = nullptr;
21841
21842 if (Accumulator) {
21843 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
21844 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
21845 }
21846
21847 auto *LowerSplitInt = createComplexDeinterleavingIR(
21848 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
21849 auto *UpperSplitInt = createComplexDeinterleavingIR(
21850 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
21851
21852 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
21853 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
21854 }
21855
21856 auto *IntTy = Type::getInt32Ty(B.getContext());
21857
21858 ConstantInt *ConstRotation = nullptr;
21859 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
21860 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
21861
21862 if (Accumulator)
21863 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
21864 {ConstRotation, Accumulator, InputB, InputA});
21865 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
21866 {ConstRotation, InputB, InputA});
21867 }
21868
21869 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
21870 // 1 means the value is not halved.
21871 auto *ConstHalving = ConstantInt::get(IntTy, 1);
21872
21873 if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
21874 ConstRotation = ConstantInt::get(IntTy, 0);
21875 else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
21876 ConstRotation = ConstantInt::get(IntTy, 1);
21877
21878 if (!ConstRotation)
21879 return nullptr; // Invalid rotation for arm_mve_vcaddq
21880
21881 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
21882 {ConstHalving, ConstRotation, InputA, InputB});
21883 }
21884
21885 return nullptr;
21886}
unsigned const MachineRegisterInfo * MRI
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static const MCPhysReg GPRArgRegs[]
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool isNegatedInteger(SDValue Op)
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG)
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
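A rough sketch of that check, assuming the users of interest are either Instructions or ConstantExprs; the helper name below is illustrative.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static bool allUsersInFunctionSketch(const Value *V, const Function *F) {
  SmallVector<const User *, 8> Worklist(V->user_begin(), V->user_end());
  while (!Worklist.empty()) {
    const User *U = Worklist.pop_back_val();
    if (isa<ConstantExpr>(U)) {
      // Look through the constant expression to its own users.
      Worklist.append(U->user_begin(), U->user_end());
      continue;
    }
    const auto *I = dyn_cast<Instruction>(U);
    if (!I || I->getFunction() != F)
      return false;
  }
  return true;
}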
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
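A minimal sketch of the common case, an explicit FP constant; the real helper may also recognize +0.0 reached through a load from the constant pool.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isFPPlusZeroSketch(SDValue Op) {
  if (const auto *CFP = dyn_cast<ConstantFPSDNode>(Op))
    return CFP->getValueAPF().isPosZero();
  return false;
}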
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, Value *Offset, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, Value *Offset, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
static bool isSigned(unsigned int Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:480
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a yaml document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5995
APInt bitcastToAPInt() const
Definition APFloat.h:1335
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1314
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1202
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1599
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1762
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:859
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:852
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1657
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
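A short usage sketch tying together several of the APInt helpers listed above; the values are arbitrary and chosen only to show the bit-pattern constructors and queries.
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void apintHelpersExample() {
  APInt Hi = APInt::getHighBitsSet(32, 8);           // 0xFF000000
  APInt Lo = APInt::getLowBitsSet(32, 8);            // 0x000000FF
  assert(Hi.popcount() == 8 && Lo.countr_one() == 8);

  APInt Splat = APInt::getSplat(32, APInt(8, 0x80)); // 0x80808080
  assert(Splat.isSubsetOf(APInt::getAllOnes(32)));

  APInt Bit = APInt::getOneBitSet(32, 12);           // only bit 12 set
  assert(Bit.isPowerOf2() && Bit.logBase2() == 12);
}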
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) and add (add x, 1), y. The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return true if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:904
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
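A hedged usage sketch of isConstantSplat above, assuming BVN was obtained elsewhere (for example via dyn_cast from a BUILD_VECTOR node); the predicate it implements is purely illustrative.
#include "llvm/ADT/APInt.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool isSplatOfAllOnesByte(const BuildVectorSDNode *BVN) {
  APInt SplatValue, SplatUndef;
  unsigned SplatBitSize;
  bool HasAnyUndefs;
  if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
    return false;
  // On success, SplatValue holds the splatted element, SplatBitSize bits wide.
  return SplatBitSize == 8 && SplatValue.isAllOnes() && !HasAnyUndefs;
}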
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
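A hedged sketch of the typical CCState flow around AnalyzeFormalArguments above; the wrapper and its parameters are illustrative, with the Ins array and the assignment function supplied by the surrounding lowering code.
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

static void analyzeFormalsSketch(CallingConv::ID CC, bool IsVarArg,
                                 MachineFunction &MF, LLVMContext &Ctx,
                                 const SmallVectorImpl<ISD::InputArg> &Ins,
                                 CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
  // Each CCValAssign in ArgLocs now records either a register (getLocReg())
  // or a stack slot offset (getLocMemOffset()) for the corresponding argument.
}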
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:282
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:214
bool isBigEndian() const
Definition DataLayout.h:215
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:244
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:302
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitInst.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
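A hedged sketch of the getInfo<> accessor listed above, using the ARMFunctionInfo accessors documented earlier on this page; it assumes the usual in-tree header name and only compiles inside the ARM target directory.
#include "ARMMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
using namespace llvm;

// getInfo<T>() hands back the target-specific per-function info object; here
// it is used to read the size of the saved argument-register area.
static unsigned argRegsSaveSizeSketch(MachineFunction &MF) {
  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
  return AFI->getArgRegsSaveSize();
}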
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
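A hedged, generic sketch of how the builder methods above chain together when emitting a machine instruction; Opcode, DestReg and SrcReg are placeholders, and the operand list depends entirely on the chosen opcode.
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static void emitTwoOperandSketch(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &DL,
                                 const TargetInstrInfo *TII, unsigned Opcode,
                                 Register DestReg, Register SrcReg) {
  BuildMI(MBB, InsertPt, DL, TII->get(Opcode), DestReg)
      .addReg(SrcReg)
      .addImm(0); // an immediate operand, assuming the opcode expects one
}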
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
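A hedged sketch of how hasPredecessorHelper (listed above) is typically queried: seed the worklist with the node of interest and ask whether N is among its transitive predecessors. The wrapper name and the step limit are illustrative.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Returns true if N is a (transitive) predecessor of User, i.e. N feeds User.
static bool nodeFeedsUser(const SDNode *N, const SDNode *User,
                          unsigned MaxSteps = 8192) {
  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(User);
  return SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps);
}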
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, which starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
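Most of the SelectionDAG builder methods listed here share one shape: take an SDLoc and a result EVT, return an SDValue, and CSE identical nodes automatically. A hedged sketch, assuming a SelectionDAG &DAG and an SDLoc DL are in scope (the function name is illustrative):

// Build ~(X + 1) as i32 nodes; getNOT expands to (XOR Add, -1).
static SDValue buildNotOfIncrement(SDValue X, SelectionDAG &DAG,
                                   const SDLoc &DL) {
  EVT VT = MVT::i32;
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, DAG.getConstant(1, DL, VT));
  return DAG.getNOT(DL, Add, VT);
}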
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
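A sketch of how a shuffle node's mask is usually inspected with the accessors above (SVN is assumed to be a ShuffleVectorSDNode *; -1 mask entries denote undef lanes):

// Illustrative: does SVN splat element 0 of its first input vector?
static bool isSplatOfLaneZero(const ShuffleVectorSDNode *SVN) {
  ArrayRef<int> Mask = SVN->getMask();
  return ShuffleVectorSDNode::isSplatMask(Mask) && SVN->getMaskElt(0) == 0;
}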
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
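These small containers are the types the DAG-walking helpers above expect (for example SDNode::hasPredecessorHelper takes a SmallPtrSetImpl and a SmallVectorImpl). A sketch of the usual visited-set/worklist pairing, not code from this file:

// Collect every node reachable through operand edges from Root.
static void collectReachable(const SDNode *Root,
                             SmallPtrSetImpl<const SDNode *> &Visited) {
  SmallVector<const SDNode *, 16> Worklist;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    const SDNode *N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // already visited
    for (SDValue Op : N->op_values())
      Worklist.push_back(Op.getNode());
  }
}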
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall implementation.
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
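The TargetLoweringBase hooks above are almost all invoked from a target's TargetLowering constructor to describe what is legal. A hedged sketch of that pattern; XYZTargetLowering, XYZ::GPRRegClass and Subtarget are placeholders, and the opcode/type choices are illustrative rather than the actual ARM configuration:

// Inside a hypothetical XYZTargetLowering constructor:
addRegisterClass(MVT::i32, &XYZ::GPRRegClass);       // i32 lives in GPRs
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);  // no combined div/rem node
setTruncStoreAction(MVT::f64, MVT::f32, Expand);     // no f64->f32 trunc store
setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i1, Promote);
setSchedulingPreference(Sched::RegPressure);
setMinFunctionAlignment(Align(4));
computeRegisterProperties(Subtarget.getRegisterInfo());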
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using a n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
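SimplifyDemandedBits is normally driven from a target DAG combine through a TargetLoweringOpt object so that any rewrite can be committed back into the combiner. A sketch of that idiom, assuming a DAGCombinerInfo DCI, the combined node N, and (purely for illustration) that only the low 16 bits of operand 0 matter:

const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                      !DCI.isBeforeLegalizeOps());
APInt Demanded = APInt::getLowBitsSet(32, 16); // bits we assume are demanded
KnownBits Known;
if (TLI.SimplifyDemandedBits(N->getOperand(0), Demanded, Known, TLO)) {
  DCI.CommitTargetLoweringOpt(TLO); // fold the simplified operand back in
  return SDValue(N, 0);
}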
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:439
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
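These IR-level Type queries come up when deciding how an IR type should map onto DAG value types. A small illustrative helper (the name is not from this file):

// True if Ty is (or is a vector of) a floating-point type of at most 32 bits.
static bool isSmallFPScalarOrVector(const Type *Ty) {
  Type *Scalar = Ty->getScalarType();
  return Scalar->isFloatingPointTy() && Scalar->getScalarSizeInBits() <= 32;
}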
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
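getSOImmVal and getT2SOImmVal both return -1 when the value has no shifter-operand encoding, so an "is this immediate cheap?" test is a one-liner. A sketch (the helper name is illustrative; ARM_AM lives in MCTargetDesc/ARMAddressingModes.h):

// True if Imm can be encoded directly as a data-processing immediate, so no
// extra materialization (constant pool or movw/movt) is required.
static bool hasShifterOperandEncoding(unsigned Imm, bool IsThumb2) {
  return (IsThumb2 ? ARM_AM::getT2SOImmVal(Imm)
                   : ARM_AM::getSOImmVal(Imm)) != -1;
}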
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
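Per the descriptions above, only a couple of these conventions actually guarantee that tail calls are honoured; a trivial illustrative predicate (not a helper defined in this file):

static bool guaranteesTailCall(CallingConv::ID CC) {
  return CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
}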
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:991
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:981
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:963
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:958
@ STRICT_FP_TO_FP16
Definition ISDOpcodes.h:994
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
Definition ISDOpcodes.h:993
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:966
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:732
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
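The ISD load predicates are typically applied in this order when a combine needs to know how conservative it must be about a load. A sketch; the LoadKind enum and classifyLoad are illustrative, not part of this file:

enum class LoadKind { Normal, SignExt, ZeroExt, Other };

static LoadKind classifyLoad(const SDNode *N) {
  if (ISD::isNormalLoad(N))
    return LoadKind::Normal;  // non-extending, unindexed: safest to transform
  if (ISD::isSEXTLoad(N))
    return LoadKind::SignExt;
  if (ISD::isZEXTLoad(N))
    return LoadKind::ZeroExt;
  return LoadKind::Other;     // EXTLOAD, indexed forms, etc.
}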
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
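Each RTLIB helper above maps a (source type, result type) pair to a libcall enumerator, returning UNKNOWN_LIBCALL when no runtime routine exists. A hedged fragment for a soft-float f64 -> i32 conversion (types chosen for illustration):

RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f64, MVT::i32);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "no libcall for this conversion");
// LC would then be resolved to a concrete routine (see getLibcallImpl above)
// and emitted with TargetLowering::makeLibCall.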
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2070
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1763
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
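All of the CC_ARM_* and RetCC_ARM_* routines in this list share the CCAssignFn signature, so calling-convention selection reduces to picking a function pointer. A hypothetical, simplified mapping, not the selection logic used by this file:

static CCAssignFn *pickArgAssignFn(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::ARM_APCS:      return CC_ARM_APCS;
  case CallingConv::ARM_AAPCS_VFP: return CC_ARM_AAPCS_VFP;
  default:                         return CC_ARM_AAPCS; // EABI default
  }
}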
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1528
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:236
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
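The bit-manipulation helpers above (isMask_32, isShiftedMask_32, countr_zero, Log2_32, isPowerOf2_32, ...) are what the immediate- and mask-recognition logic leans on. A small standalone demo with illustrative values:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdio>

int main() {
  std::printf("%d\n", llvm::isMask_32(0x0000FFFFu));        // 1: ones from bit 0
  std::printf("%d\n", llvm::isShiftedMask_32(0x00FF0000u)); // 1: shifted run of ones
  std::printf("%d\n", llvm::countr_zero(0x00FF0000u));      // 16 trailing zeros
  std::printf("%u\n", llvm::Log2_32(64));                   // 6
  std::printf("%d\n", llvm::isPowerOf2_32(96u));            // 0: not a power of two
  return 0;
}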
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1973
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if an signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if a unsigned integer is an N bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analysis infrastructure.
Definition Metadata.h:761
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted to an integer type of the same bit width.
Definition ValueTypes.h:94
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type, which is changed to EltVT.
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
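A brief sketch of how several of the EVT queries and factories above fit together; Ctx stands for an LLVMContext assumed to be available in the surrounding code:
  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;
  // Ctx is an assumed LLVMContext & supplied by the surrounding lowering code.
  EVT VecVT = EVT::getVectorVT(Ctx, MVT::i32, 4);     // v4i32
  unsigned NumElts = VecVT.getVectorNumElements();    // 4
  EVT EltVT = VecVT.getVectorElementType();           // i32
  bool Is128 = VecVT.is128BitVector();                // true: 4 x 32 bits
  EVT HalfVT = VecVT.getHalfNumVectorElementsVT(Ctx); // v2i32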
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
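A small sketch of the KnownBits helpers above, starting from an 8-bit constant; the values are illustrative only:
  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;
  KnownBits K = KnownBits::makeConstant(APInt(8, 0x0F)); // every bit of 0x0F is known
  KnownBits Z = K.zext(16);                  // widened; the new top bits are known zero
  KnownBits SumKB = KnownBits::add(Z, Z);    // known bits of the 16-bit sum
  KnownBits Common = Z.intersectWith(SumKB); // keeps only bits known identically in both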
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands, relating them back to the original IR operation.
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
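A sketch of typical MachinePointerInfo construction; MF and FI stand for a MachineFunction and a frame index assumed to come from the surrounding code:
  #include "llvm/CodeGen/MachineMemOperand.h"
  using namespace llvm;
  // MF is an assumed MachineFunction &, FI an assumed frame index (int).
  MachinePointerInfo CPInfo = MachinePointerInfo::getConstantPool(MF);
  MachinePointerInfo SlotInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachinePointerInfo SlotPlus4 = SlotInfo.getWithOffset(4); // same slot, 4 bytes further in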
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
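A hedged sketch of the usual chained-setter pattern for these CallLoweringInfo members; DAG, dl, Chain, Callee, RetTy, Args, and IsSigned are assumptions supplied by the surrounding lowering code, not part of the entries above:
  // All captured variables below are assumed to exist in the enclosing function.
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
     .setChain(Chain)
     .setLibCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
     .setSExtResult(IsSigned)
     .setZExtResult(!IsSigned);
  // Within a TargetLowering subclass, the populated CLI is then handed to LowerCallTo.
  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);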
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetLowering to the caller.