1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
122STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
123STATISTIC(NumConstpoolPromoted,
124 "Number of constants with their storage promoted into constant pools");
125
126static cl::opt<bool>
127ARMInterworking("arm-interworking", cl::Hidden,
128 cl::desc("Enable / disable ARM interworking (for debugging only)"),
129 cl::init(true));
130
131static cl::opt<bool> EnableConstpoolPromotion(
 132 "arm-promote-constant", cl::Hidden,
133 cl::desc("Enable / disable promotion of unnamed_addr constants into "
134 "constant pools"),
135 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
136static cl::opt<unsigned> ConstpoolPromotionMaxSize(
 137 "arm-promote-constant-max-size", cl::Hidden,
138 cl::desc("Maximum size of constant to promote into a constant pool"),
139 cl::init(64));
140static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
 141 "arm-promote-constant-max-total", cl::Hidden,
142 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
143 cl::init(128));
144
145static cl::opt<unsigned>
 146MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
147 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
148 cl::init(2));
149
151 "arm-max-base-updates-to-check", cl::Hidden,
152 cl::desc("Maximum number of base-updates to check generating postindex."),
153 cl::init(64));
154
155/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
156constexpr MVT FlagsVT = MVT::i32;
157
158// The APCS parameter registers.
159static const MCPhysReg GPRArgRegs[] = {
160 ARM::R0, ARM::R1, ARM::R2, ARM::R3
161};
162
163static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg,
 164 SelectionDAG &DAG, const SDLoc &DL) {
166 assert(Arg.ArgVT.bitsLT(MVT::i32));
167 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
168 SDValue Ext =
 169 DAG.getNode(Arg.Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
 170 MVT::i32, Trunc);
171 return Ext;
172}
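
// handleCMSEValue exists because a CMSE non-secure callee returning an i8 or
// i16 cannot be trusted to have performed the required extension itself (see
// the note in LowerCallResult below): the caller re-truncates the returned
// value to Arg.ArgVT and then sign- or zero-extends it back to i32 according
// to the argument's extension flags.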
173
174void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
175 if (VT != PromotedLdStVT) {
176 setOperationAction(ISD::LOAD, VT, Promote);
177 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
178
179 setOperationAction(ISD::STORE, VT, Promote);
180 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
181 }
182
183 MVT ElemTy = VT.getVectorElementType();
184 if (ElemTy != MVT::f64)
188 if (ElemTy == MVT::i32) {
193 } else {
198 }
207 if (VT.isInteger()) {
211 }
212
213 // Neon does not support vector divide/remainder operations.
222
223 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
224 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
226 setOperationAction(Opcode, VT, Legal);
227 if (!VT.isFloatingPoint())
228 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
229 setOperationAction(Opcode, VT, Legal);
230}
231
232void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
233 addRegisterClass(VT, &ARM::DPRRegClass);
234 addTypeForNEON(VT, MVT::f64);
235}
236
237void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
238 addRegisterClass(VT, &ARM::DPairRegClass);
239 addTypeForNEON(VT, MVT::v2f64);
240}
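
// Net effect of the two helpers above: 64-bit NEON vectors (v8i8, v4i16,
// v2i32, ...) live in D registers with their loads/stores promoted to f64,
// while 128-bit vectors live in D-register pairs and are loaded/stored as
// v2f64; the per-operation legality is then filled in by addTypeForNEON.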
241
242void ARMTargetLowering::setAllExpand(MVT VT) {
243 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
245
246 // We support these really simple operations even on types where all
247 // the actual arithmetic has to be broken down into simpler
248 // operations or turned into library calls.
249 setOperationAction(ISD::BITCAST, VT, Legal);
250 setOperationAction(ISD::LOAD, VT, Legal);
251 setOperationAction(ISD::STORE, VT, Legal);
253}
254
255void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
256 LegalizeAction Action) {
257 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
258 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
260}
261
262void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
263 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
264
265 for (auto VT : IntTypes) {
266 addRegisterClass(VT, &ARM::MQPRRegClass);
280 setOperationAction(ISD::MLOAD, VT, Custom);
281 setOperationAction(ISD::MSTORE, VT, Legal);
296
297 // No native support for these.
307
308 // Vector reductions
309 setOperationAction(ISD::VECREDUCE_ADD, VT, Legal);
310 setOperationAction(ISD::VECREDUCE_SMAX, VT, Legal);
311 setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
312 setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
313 setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
314 setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
315 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
316 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
317 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
318
319 if (!HasMVEFP) {
324 } else {
327 }
328
329 // Pre and Post inc are supported on loads and stores
330 for (unsigned im = (unsigned)ISD::PRE_INC;
331 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
336 }
337 }
338
339 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
340 for (auto VT : FloatTypes) {
341 addRegisterClass(VT, &ARM::MQPRRegClass);
342 if (!HasMVEFP)
343 setAllExpand(VT);
344
345 // These are legal or custom whether we have MVE.fp or not
354 setOperationAction(ISD::MLOAD, VT, Custom);
355 setOperationAction(ISD::MSTORE, VT, Legal);
358
359 // Pre and Post inc are supported on loads and stores
360 for (unsigned im = (unsigned)ISD::PRE_INC;
361 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
366 }
367
368 if (HasMVEFP) {
369 setOperationAction(ISD::FMINNUM, VT, Legal);
370 setOperationAction(ISD::FMAXNUM, VT, Legal);
371 setOperationAction(ISD::FROUND, VT, Legal);
372 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
373 setOperationAction(ISD::FRINT, VT, Legal);
374 setOperationAction(ISD::FTRUNC, VT, Legal);
375 setOperationAction(ISD::FFLOOR, VT, Legal);
376 setOperationAction(ISD::FCEIL, VT, Legal);
377 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
378 setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
379 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
380 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
381
382 // No native support for these.
385 setOperationAction(ISD::FSQRT, VT, Expand);
386 setOperationAction(ISD::FSIN, VT, Expand);
387 setOperationAction(ISD::FCOS, VT, Expand);
388 setOperationAction(ISD::FTAN, VT, Expand);
389 setOperationAction(ISD::FPOW, VT, Expand);
390 setOperationAction(ISD::FLOG, VT, Expand);
391 setOperationAction(ISD::FLOG2, VT, Expand);
392 setOperationAction(ISD::FLOG10, VT, Expand);
393 setOperationAction(ISD::FEXP, VT, Expand);
394 setOperationAction(ISD::FEXP2, VT, Expand);
395 setOperationAction(ISD::FEXP10, VT, Expand);
396 setOperationAction(ISD::FNEARBYINT, VT, Expand);
397 }
398 }
399
400 // Custom-expand reductions of vectors smaller than a legal vector type, to
 401 // prevent false zero items being added.
402 setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
403 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
404 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
405 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
406 setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
407 setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
408 setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
409 setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
410
411 // We 'support' these types up to bitcast/load/store level, regardless of
412 // MVE integer-only / float support. Only FP data processing on the FP vector
 413 // types is inhibited when the target is integer-only.
414 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
415 for (auto VT : LongTypes) {
416 addRegisterClass(VT, &ARM::MQPRRegClass);
417 setAllExpand(VT);
423 }
425
426 // We can do bitwise operations on v2i64 vectors
427 setOperationAction(ISD::AND, MVT::v2i64, Legal);
428 setOperationAction(ISD::OR, MVT::v2i64, Legal);
429 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
430
431 // It is legal to extload from v4i8 to v4i16 or v4i32.
432 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
433 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
434 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
435
436 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
442
443 // Some truncating stores are legal too.
444 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
445 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
446 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
447
448 // Pre and Post inc on these are legal, given the correct extends
449 for (unsigned im = (unsigned)ISD::PRE_INC;
450 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
451 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
456 }
457 }
458
459 // Predicate types
460 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
461 for (auto VT : pTypes) {
462 addRegisterClass(VT, &ARM::VCCRRegClass);
471 setOperationAction(ISD::LOAD, VT, Custom);
472 setOperationAction(ISD::STORE, VT, Custom);
477
478 if (!HasMVEFP) {
483 }
484 }
488 setOperationAction(ISD::OR, MVT::v2i1, Expand);
494
503}
504
506 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
507}
508
510 const ARMSubtarget &STI)
511 : TargetLowering(TM_, STI), Subtarget(&STI),
512 RegInfo(Subtarget->getRegisterInfo()),
513 Itins(Subtarget->getInstrItineraryData()) {
514 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
515
518
519 const Triple &TT = TM.getTargetTriple();
520
521 if (TT.isOSBinFormatMachO()) {
522 // Uses VFP for Thumb libfuncs if available.
523 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
524 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
525 // clang-format off
526 static const struct {
527 const RTLIB::Libcall Op;
528 const RTLIB::LibcallImpl Impl;
529 } LibraryCalls[] = {
530 // Single-precision floating-point arithmetic.
531 { RTLIB::ADD_F32, RTLIB::impl___addsf3vfp },
532 { RTLIB::SUB_F32, RTLIB::impl___subsf3vfp },
533 { RTLIB::MUL_F32, RTLIB::impl___mulsf3vfp },
534 { RTLIB::DIV_F32, RTLIB::impl___divsf3vfp },
535
536 // Double-precision floating-point arithmetic.
537 { RTLIB::ADD_F64, RTLIB::impl___adddf3vfp },
538 { RTLIB::SUB_F64, RTLIB::impl___subdf3vfp },
539 { RTLIB::MUL_F64, RTLIB::impl___muldf3vfp },
540 { RTLIB::DIV_F64, RTLIB::impl___divdf3vfp },
541
542 // Single-precision comparisons.
543 { RTLIB::OEQ_F32, RTLIB::impl___eqsf2vfp },
544 { RTLIB::UNE_F32, RTLIB::impl___nesf2vfp },
545 { RTLIB::OLT_F32, RTLIB::impl___ltsf2vfp },
546 { RTLIB::OLE_F32, RTLIB::impl___lesf2vfp },
547 { RTLIB::OGE_F32, RTLIB::impl___gesf2vfp },
548 { RTLIB::OGT_F32, RTLIB::impl___gtsf2vfp },
549 { RTLIB::UO_F32, RTLIB::impl___unordsf2vfp },
550
551 // Double-precision comparisons.
552 { RTLIB::OEQ_F64, RTLIB::impl___eqdf2vfp },
553 { RTLIB::UNE_F64, RTLIB::impl___nedf2vfp },
554 { RTLIB::OLT_F64, RTLIB::impl___ltdf2vfp },
555 { RTLIB::OLE_F64, RTLIB::impl___ledf2vfp },
556 { RTLIB::OGE_F64, RTLIB::impl___gedf2vfp },
557 { RTLIB::OGT_F64, RTLIB::impl___gtdf2vfp },
558 { RTLIB::UO_F64, RTLIB::impl___unorddf2vfp },
559
560 // Floating-point to integer conversions.
561 // i64 conversions are done via library routines even when generating VFP
562 // instructions, so use the same ones.
563 { RTLIB::FPTOSINT_F64_I32, RTLIB::impl___fixdfsivfp },
564 { RTLIB::FPTOUINT_F64_I32, RTLIB::impl___fixunsdfsivfp },
565 { RTLIB::FPTOSINT_F32_I32, RTLIB::impl___fixsfsivfp },
566 { RTLIB::FPTOUINT_F32_I32, RTLIB::impl___fixunssfsivfp },
567
568 // Conversions between floating types.
569 { RTLIB::FPROUND_F64_F32, RTLIB::impl___truncdfsf2vfp },
570 { RTLIB::FPEXT_F32_F64, RTLIB::impl___extendsfdf2vfp },
571
572 // Integer to floating-point conversions.
573 // i64 conversions are done via library routines even when generating VFP
574 // instructions, so use the same ones.
575 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
576 // e.g., __floatunsidf vs. __floatunssidfvfp.
577 { RTLIB::SINTTOFP_I32_F64, RTLIB::impl___floatsidfvfp },
578 { RTLIB::UINTTOFP_I32_F64, RTLIB::impl___floatunssidfvfp },
579 { RTLIB::SINTTOFP_I32_F32, RTLIB::impl___floatsisfvfp },
580 { RTLIB::UINTTOFP_I32_F32, RTLIB::impl___floatunssisfvfp },
581 };
582 // clang-format on
583
584 for (const auto &LC : LibraryCalls)
585 setLibcallImpl(LC.Op, LC.Impl);
586 }
587 }
588
589 if (Subtarget->isThumb1Only())
590 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
591 else
592 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
593
594 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
595 Subtarget->hasFPRegs()) {
596 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
597 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
598
603
604 if (!Subtarget->hasVFP2Base()) {
605 setAllExpand(MVT::f32);
606 } else {
609 setOperationAction(Op, MVT::f32, Legal);
610 }
611 if (!Subtarget->hasFP64()) {
612 setAllExpand(MVT::f64);
613 } else {
616 setOperationAction(Op, MVT::f64, Legal);
617 }
618 }
619
620 if (Subtarget->hasFullFP16()) {
621 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
622 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
623 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
624
625 setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
626 setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
627 }
628
629 if (Subtarget->hasBF16()) {
630 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
631 setAllExpand(MVT::bf16);
632 if (!Subtarget->hasFullFP16())
633 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
634 } else {
635 setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
636 setOperationAction(ISD::BF16_TO_FP, MVT::f64, Expand);
637 setOperationAction(ISD::FP_TO_BF16, MVT::f32, Custom);
638 setOperationAction(ISD::FP_TO_BF16, MVT::f64, Custom);
639 }
640
642 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
643 setTruncStoreAction(VT, InnerVT, Expand);
644 addAllExtLoads(VT, InnerVT, Expand);
645 }
646
649
651 }
652
653 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
655
656 if (!Subtarget->hasV8_1MMainlineOps())
658
659 if (!Subtarget->isThumb1Only())
661
664
667
668 if (Subtarget->hasMVEIntegerOps())
669 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
670
671 // Combine low-overhead loop intrinsics so that we can lower i1 types.
672 if (Subtarget->hasLOB()) {
673 setTargetDAGCombine({ISD::BRCOND, ISD::BR_CC});
674 }
675
676 if (Subtarget->hasNEON()) {
677 addDRTypeForNEON(MVT::v2f32);
678 addDRTypeForNEON(MVT::v8i8);
679 addDRTypeForNEON(MVT::v4i16);
680 addDRTypeForNEON(MVT::v2i32);
681 addDRTypeForNEON(MVT::v1i64);
682
683 addQRTypeForNEON(MVT::v4f32);
684 addQRTypeForNEON(MVT::v2f64);
685 addQRTypeForNEON(MVT::v16i8);
686 addQRTypeForNEON(MVT::v8i16);
687 addQRTypeForNEON(MVT::v4i32);
688 addQRTypeForNEON(MVT::v2i64);
689
690 if (Subtarget->hasFullFP16()) {
691 addQRTypeForNEON(MVT::v8f16);
692 addDRTypeForNEON(MVT::v4f16);
693 }
694
695 if (Subtarget->hasBF16()) {
696 addQRTypeForNEON(MVT::v8bf16);
697 addDRTypeForNEON(MVT::v4bf16);
698 }
699 }
700
701 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
702 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
703 // none of Neon, MVE or VFP supports any arithmetic operations on it.
704 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
705 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
706 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
707 // FIXME: Code duplication: FDIV and FREM are expanded always, see
708 // ARMTargetLowering::addTypeForNEON method for details.
709 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
710 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
711 // FIXME: Create unittest.
712 // In other words, find a case where "copysign" appears in the DAG with
 713 // vector operands.
715 // FIXME: Code duplication: SETCC has custom operation action, see
716 // ARMTargetLowering::addTypeForNEON method for details.
718 // FIXME: Create unittest for FNEG and for FABS.
719 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
720 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
721 setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
722 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
723 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
724 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
725 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
726 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
727 setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
728 setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
729 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
730 setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
731 setOperationAction(ISD::FEXP10, MVT::v2f64, Expand);
732 setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
733 setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
734 setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
735 setOperationAction(ISD::FROUNDEVEN, MVT::v2f64, Expand);
736 setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
737 setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
738 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
739 }
740
741 if (Subtarget->hasNEON()) {
742 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
 743 // natively supported for v4f32.
744 setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
745 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
746 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
747 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
748 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
749 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
750 setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
751 setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
752 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
753 setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
754 setOperationAction(ISD::FEXP10, MVT::v4f32, Expand);
755 setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
756 setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
757 setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
758 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Expand);
759 setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
760 setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
761
762 // Mark v2f32 intrinsics.
763 setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
764 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
765 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
766 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
767 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
768 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
769 setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
770 setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
771 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
772 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
773 setOperationAction(ISD::FEXP10, MVT::v2f32, Expand);
774 setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
775 setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
776 setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
777 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Expand);
778 setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
779 setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
780
781 for (ISD::NodeType Op : {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
782 ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN}) {
783 setOperationAction(Op, MVT::v4f16, Expand);
784 setOperationAction(Op, MVT::v8f16, Expand);
785 }
786
787 // Neon does not support some operations on v1i64 and v2i64 types.
788 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
789 // Custom handling for some quad-vector types to detect VMULL.
790 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
791 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
792 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
793 // Custom handling for some vector types to avoid expensive expansions
794 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
796 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
798 // Neon does not have a single-instruction SINT_TO_FP or UINT_TO_FP with
 799 // a destination type that is wider than the source, nor does it have an
 800 // FP_TO_[SU]INT instruction with a destination narrower than its source.
810
812 setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
813
814 // NEON does not have a single-instruction CTPOP for vectors with element
 815 // types wider than 8 bits. However, custom lowering can leverage the
816 // v8i8/v16i8 vcnt instruction.
823
824 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
825 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
826
827 // NEON does not have a single-instruction CTTZ for vectors.
829 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
830 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
831 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
832
833 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
834 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
835 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
836 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
837
842
847
851 }
852
853 // NEON only has FMA instructions as of VFP4.
854 if (!Subtarget->hasVFP4Base()) {
855 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
856 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
857 }
858
860 ISD::FP_TO_UINT, ISD::FMUL, ISD::LOAD});
861
862 // It is legal to extload from v4i8 to v4i16 or v4i32.
863 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
864 MVT::v2i32}) {
869 }
870 }
871
872 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
873 MVT::v4i32}) {
874 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
875 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
876 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
877 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
878 }
879 }
880
881 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
887 ISD::INTRINSIC_VOID, ISD::VECREDUCE_ADD, ISD::ADD, ISD::BITCAST});
888 }
889 if (Subtarget->hasMVEIntegerOps()) {
891 ISD::FP_EXTEND, ISD::SELECT, ISD::SELECT_CC,
892 ISD::SETCC});
893 }
894 if (Subtarget->hasMVEFloatOps()) {
896 }
897
898 if (!Subtarget->hasFP64()) {
899 // When targeting a floating-point unit with only single-precision
900 // operations, f64 is legal for the few double-precision instructions which
901 // are present. However, no double-precision operations other than moves,
902 // loads and stores are provided by the hardware.
911 setOperationAction(ISD::FNEG, MVT::f64, Expand);
912 setOperationAction(ISD::FABS, MVT::f64, Expand);
913 setOperationAction(ISD::FSQRT, MVT::f64, Expand);
914 setOperationAction(ISD::FSIN, MVT::f64, Expand);
915 setOperationAction(ISD::FCOS, MVT::f64, Expand);
916 setOperationAction(ISD::FPOW, MVT::f64, Expand);
917 setOperationAction(ISD::FLOG, MVT::f64, Expand);
918 setOperationAction(ISD::FLOG2, MVT::f64, Expand);
919 setOperationAction(ISD::FLOG10, MVT::f64, Expand);
920 setOperationAction(ISD::FEXP, MVT::f64, Expand);
921 setOperationAction(ISD::FEXP2, MVT::f64, Expand);
922 setOperationAction(ISD::FEXP10, MVT::f64, Expand);
923 setOperationAction(ISD::FCEIL, MVT::f64, Expand);
924 setOperationAction(ISD::FTRUNC, MVT::f64, Expand);
925 setOperationAction(ISD::FRINT, MVT::f64, Expand);
926 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Expand);
927 setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
928 setOperationAction(ISD::FFLOOR, MVT::f64, Expand);
941 }
942
943 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
944 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
946 if (Subtarget->hasFullFP16()) {
949 }
950 }
951
952 if (!Subtarget->hasFP16()) {
953 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
955 }
956
957 computeRegisterProperties(Subtarget->getRegisterInfo());
958
959 // ARM does not have floating-point extending loads.
960 for (MVT VT : MVT::fp_valuetypes()) {
961 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
962 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
963 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
964 }
965
966 // ... or truncating stores
967 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
968 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
969 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
970 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
971 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
972
973 // ARM does not have i1 sign-extending loads.
974 for (MVT VT : MVT::integer_valuetypes())
976
977 // ARM supports all 4 flavors of integer indexed load / store.
978 if (!Subtarget->isThumb1Only()) {
979 for (unsigned im = (unsigned)ISD::PRE_INC;
981 setIndexedLoadAction(im, MVT::i1, Legal);
982 setIndexedLoadAction(im, MVT::i8, Legal);
983 setIndexedLoadAction(im, MVT::i16, Legal);
984 setIndexedLoadAction(im, MVT::i32, Legal);
985 setIndexedStoreAction(im, MVT::i1, Legal);
986 setIndexedStoreAction(im, MVT::i8, Legal);
987 setIndexedStoreAction(im, MVT::i16, Legal);
988 setIndexedStoreAction(im, MVT::i32, Legal);
989 }
990 } else {
991 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
994 }
995
1000
1003 if (Subtarget->hasDSP()) {
1012 }
1013 if (Subtarget->hasBaseDSP()) {
1016 }
1017
1018 // i64 operation support.
1021 if (Subtarget->isThumb1Only()) {
1024 }
1025 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1026 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1028
1036 setOperationAction(ISD::LOAD, MVT::i64, Custom);
1037 setOperationAction(ISD::STORE, MVT::i64, Custom);
1038
1039 // MVE lowers 64-bit shifts to lsll and lsrl,
 1040 // assuming that ISD::SRL and SRA of i64 are already marked Custom.
1041 if (Subtarget->hasMVEIntegerOps())
1043
1044 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1045 if (Subtarget->isThumb1Only()) {
1049 }
1050
1051 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1053
1054 // ARM does not have ROTL.
1059 }
1061 // TODO: These two should be set to LibCall, but this currently breaks
1062 // the Linux kernel build. See #101786.
1065 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1068 }
1069
1070 // @llvm.readcyclecounter requires the Performance Monitors extension.
1071 // Default to the 0 expansion on unsupported platforms.
1072 // FIXME: Technically there are older ARM CPUs that have
1073 // implementation-specific ways of obtaining this information.
1074 if (Subtarget->hasPerfMon())
1075 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
1076
1077 // Only ARMv6 has BSWAP.
1078 if (!Subtarget->hasV6Ops())
1080
1081 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1082 : Subtarget->hasDivideInARMMode();
1083 if (!hasDivide) {
1084 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1087 }
1088
1089 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1092
1095 }
1096
1099
1100 // Register based DivRem for AEABI (RTABI 4.2)
1101 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1102 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1105 HasStandaloneRem = false;
1106
1111 } else {
1114 }
1115
1120
1121 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1122 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
1123
1124 // Use the default implementation.
1125 setOperationAction(ISD::VASTART, MVT::Other, Custom);
1126 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1127 setOperationAction(ISD::VACOPY, MVT::Other, Expand);
1128 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1129 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
1130 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
1131
1132 if (TT.isOSWindows())
1133 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
1134 else
1135 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
1136
1137 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1138 // the default expansion.
1139 InsertFencesForAtomic = false;
1140 if (Subtarget->hasAnyDataBarrier() &&
1141 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1142 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1143 // to ldrex/strex loops already.
1144 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
1145 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1146 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
1147
1148 // On v8, we have particularly efficient implementations of atomic fences
1149 // if they can be combined with nearby atomic loads and stores.
1150 if (!Subtarget->hasAcquireRelease() ||
1151 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1152 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1153 InsertFencesForAtomic = true;
1154 }
1155 } else {
1156 // If there's anything we can use as a barrier, go through custom lowering
1157 // for ATOMIC_FENCE.
1158 // If the target has DMB in Thumb mode, fences can be inserted.
1159 if (Subtarget->hasDataBarrier())
1160 InsertFencesForAtomic = true;
1161
1162 setOperationAction(ISD::ATOMIC_FENCE, MVT::Other,
1163 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1164
1165 // Set them all for libcall, which will force libcalls.
1166 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
1167 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
1168 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
1169 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, LibCall);
1170 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, LibCall);
1171 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
1172 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
1173 setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, LibCall);
1174 setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, LibCall);
1175 setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, LibCall);
1176 setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, LibCall);
1177 setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, LibCall);
1178 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1179 // Unordered/Monotonic case.
1180 if (!InsertFencesForAtomic) {
1181 setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
1182 setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
1183 }
1184 }
1185
1186 // Compute supported atomic widths.
1187 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1188 // For targets where __sync_* routines are reliably available, we use them
1189 // if necessary.
1190 //
1191 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1192 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1193 //
1194 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1195 // such targets should provide __sync_* routines, which use the ARM mode
1196 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1197 // encoding; see ARMISD::MEMBARRIER_MCR.)
1199 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1200 Subtarget->hasForced32BitAtomics()) {
1201 // Cortex-M cores (other than Cortex-M0) have 32-bit atomics.
1203 } else {
1204 // We can't assume anything about other targets; just use libatomic
1205 // routines.
1207 }
1208
1210
1211 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
1212
1213 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1214 if (!Subtarget->hasV6Ops()) {
1217 }
1219
1220 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1221 !Subtarget->isThumb1Only()) {
1222 // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
 1223 // iff the target supports VFP2.
1224 setOperationAction(ISD::BITCAST, MVT::i64, Custom);
1226 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
1227 setOperationAction(ISD::GET_FPENV, MVT::i32, Legal);
1228 setOperationAction(ISD::SET_FPENV, MVT::i32, Legal);
1229 setOperationAction(ISD::RESET_FPENV, MVT::Other, Legal);
1230 setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
1231 setOperationAction(ISD::SET_FPMODE, MVT::i32, Custom);
1232 setOperationAction(ISD::RESET_FPMODE, MVT::Other, Custom);
1233 }
1234
1235 // We want to custom lower some of our intrinsics.
1240
1250 if (Subtarget->hasFullFP16()) {
1254 }
1255
1257
1258 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
1259 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
1260 if (Subtarget->hasFullFP16())
1261 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
1262 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
1263 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
1264 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1265
1266 // We don't support sin/cos/fmod/copysign/pow
1267 setOperationAction(ISD::FSIN, MVT::f64, Expand);
1268 setOperationAction(ISD::FSIN, MVT::f32, Expand);
1269 setOperationAction(ISD::FCOS, MVT::f32, Expand);
1270 setOperationAction(ISD::FCOS, MVT::f64, Expand);
1271 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1272 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1275 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1276 !Subtarget->isThumb1Only()) {
1279 }
1280 setOperationAction(ISD::FPOW, MVT::f64, Expand);
1281 setOperationAction(ISD::FPOW, MVT::f32, Expand);
1282
1283 if (!Subtarget->hasVFP4Base()) {
1286 }
1287
1288 // Various VFP goodness
1289 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1290 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1291 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1292 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
1293 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
1294 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f64, LibCall);
1295 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f64, LibCall);
1296 }
1297
1298 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1299 if (!Subtarget->hasFP16()) {
1300 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
1301 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
1302 setOperationAction(ISD::STRICT_FP16_TO_FP, MVT::f32, LibCall);
1303 setOperationAction(ISD::STRICT_FP_TO_FP16, MVT::f32, LibCall);
1304 }
1305
1306 // Strict floating-point comparisons need custom lowering.
1313 }
1314
1315 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
1316 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
1317
1318 // FP-ARMv8 implements a lot of rounding-like FP operations.
1319 if (Subtarget->hasFPARMv8Base()) {
1320 setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
1321 setOperationAction(ISD::FCEIL, MVT::f32, Legal);
1322 setOperationAction(ISD::FROUND, MVT::f32, Legal);
1323 setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
1324 setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
1325 setOperationAction(ISD::FRINT, MVT::f32, Legal);
1326 setOperationAction(ISD::FROUNDEVEN, MVT::f32, Legal);
1327 setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
1328 setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
1329 if (Subtarget->hasNEON()) {
1330 setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
1331 setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
1332 setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
1333 setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
1334 }
1335
1336 if (Subtarget->hasFP64()) {
1337 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
1338 setOperationAction(ISD::FCEIL, MVT::f64, Legal);
1339 setOperationAction(ISD::FROUND, MVT::f64, Legal);
1340 setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
1341 setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
1342 setOperationAction(ISD::FRINT, MVT::f64, Legal);
1343 setOperationAction(ISD::FROUNDEVEN, MVT::f64, Legal);
1344 setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
1345 setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
1346 }
1347 }
1348
1349 // FP16 operations often need to be promoted to call library functions.
1350 // clang-format off
1351 if (Subtarget->hasFullFP16()) {
1352 setOperationAction(ISD::LRINT, MVT::f16, Expand);
1353 setOperationAction(ISD::LROUND, MVT::f16, Expand);
1355
1356 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1357 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
1358 ISD::FSINCOSPI, ISD::FMODF, ISD::FACOS,
1359 ISD::FASIN, ISD::FATAN, ISD::FATAN2,
1360 ISD::FCOSH, ISD::FSINH, ISD::FTANH,
1361 ISD::FTAN, ISD::FEXP, ISD::FEXP2,
1362 ISD::FEXP10, ISD::FLOG, ISD::FLOG2,
1363 ISD::FLOG10, ISD::STRICT_FREM, ISD::STRICT_FPOW,
1370 setOperationAction(Op, MVT::f16, Promote);
1371 }
1372
1373 // Round-to-integer operations need custom lowering for fp16, as Promote
 1374 // doesn't work because the result type is integer.
1376 setOperationAction(Op, MVT::f16, Custom);
1377
1378 for (auto Op : {ISD::FROUND, ISD::FROUNDEVEN, ISD::FTRUNC,
1379 ISD::FNEARBYINT, ISD::FRINT, ISD::FFLOOR,
1383 setOperationAction(Op, MVT::f16, Legal);
1384 }
1385 // clang-format on
1386 }
1387
1388 if (Subtarget->hasNEON()) {
1389 // vmin and vmax aren't available in a scalar form, so we can use
1390 // a NEON instruction with an undef lane instead.
1391 setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
1392 setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
1393 setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
1394 setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
1395 setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
1396 setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
1397 setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
1398 setOperationAction(ISD::FMAXIMUM, MVT::v4f32, Legal);
1399
1400 if (Subtarget->hasV8Ops()) {
1401 setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
1402 setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
1403 setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
1404 setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
1405 setOperationAction(ISD::FROUNDEVEN, MVT::v2f32, Legal);
1406 setOperationAction(ISD::FROUNDEVEN, MVT::v4f32, Legal);
1407 setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
1408 setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
1409 setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
1410 setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
1411 setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
1412 setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
1413 }
1414
1415 if (Subtarget->hasFullFP16()) {
1416 setOperationAction(ISD::FMINNUM, MVT::v4f16, Legal);
1417 setOperationAction(ISD::FMAXNUM, MVT::v4f16, Legal);
1418 setOperationAction(ISD::FMINNUM, MVT::v8f16, Legal);
1419 setOperationAction(ISD::FMAXNUM, MVT::v8f16, Legal);
1420
1421 setOperationAction(ISD::FMINIMUM, MVT::v4f16, Legal);
1422 setOperationAction(ISD::FMAXIMUM, MVT::v4f16, Legal);
1423 setOperationAction(ISD::FMINIMUM, MVT::v8f16, Legal);
1424 setOperationAction(ISD::FMAXIMUM, MVT::v8f16, Legal);
1425
1426 setOperationAction(ISD::FFLOOR, MVT::v4f16, Legal);
1427 setOperationAction(ISD::FFLOOR, MVT::v8f16, Legal);
1428 setOperationAction(ISD::FROUND, MVT::v4f16, Legal);
1429 setOperationAction(ISD::FROUND, MVT::v8f16, Legal);
1430 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Legal);
1431 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Legal);
1432 setOperationAction(ISD::FCEIL, MVT::v4f16, Legal);
1433 setOperationAction(ISD::FCEIL, MVT::v8f16, Legal);
1434 setOperationAction(ISD::FTRUNC, MVT::v4f16, Legal);
1435 setOperationAction(ISD::FTRUNC, MVT::v8f16, Legal);
1436 setOperationAction(ISD::FRINT, MVT::v4f16, Legal);
1437 setOperationAction(ISD::FRINT, MVT::v8f16, Legal);
1438 }
1439 }
1440
1441 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1442 // it, but it's just a wrapper around ldexp.
1443 if (TT.isOSWindows()) {
1444 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1445 if (isOperationExpand(Op, MVT::f32))
1446 setOperationAction(Op, MVT::f32, Promote);
1447 }
1448
1449 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1450 // isn't legal.
1451 for (ISD::NodeType Op : {ISD::FLDEXP, ISD::STRICT_FLDEXP, ISD::FFREXP})
1452 if (isOperationExpand(Op, MVT::f16))
1453 setOperationAction(Op, MVT::f16, Promote);
1454
1455 // We have target-specific dag combine patterns for the following nodes:
1456 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1459
1460 if (Subtarget->hasMVEIntegerOps())
1462
1463 if (Subtarget->hasV6Ops())
1465 if (Subtarget->isThumb1Only())
1467 // Attempt to lower smin/smax to ssat/usat
1468 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1469 Subtarget->isThumb2()) {
1471 }
1472
1474
1475 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1476 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1478 else
1480
1481 //// temporary - rewrite interface to use type
1484 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1486 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1488
1489 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1490 // are at least 4 bytes aligned.
1492
1493 // Prefer likely predicted branches to selects on out-of-order cores.
1494 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1495
1496 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1498 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1499
1500 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1501}
1502
1504 return Subtarget->useSoftFloat();
1505}
1506
1508 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1509}
1510
1511// FIXME: It might make sense to define the representative register class as the
1512// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1513// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1514// SPR's representative would be DPR_VFP2. This should work well if register
1515// pressure tracking were modified such that a register use would increment the
1516// pressure of the register class's representative and all of its super
1517// classes' representatives transitively. We have not implemented this because
1518// of the difficulty, prior to coalescing, of modeling operand register classes
1519// due to the common occurrence of cross-class copies and subregister insertions
1520// and extractions.
1521std::pair<const TargetRegisterClass *, uint8_t>
1523 MVT VT) const {
1524 const TargetRegisterClass *RRC = nullptr;
1525 uint8_t Cost = 1;
1526 switch (VT.SimpleTy) {
1527 default:
1529 // Use DPR as representative register class for all floating point
1530 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1531 // the cost is 1 for both f32 and f64.
1532 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1533 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1534 RRC = &ARM::DPRRegClass;
1535 // When NEON is used for SP, only half of the register file is available
1536 // because operations that define both SP and DP results will be constrained
1537 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1538 // coalescing by double-counting the SP regs. See the FIXME above.
1539 if (Subtarget->useNEONForSinglePrecisionFP())
1540 Cost = 2;
1541 break;
1542 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1543 case MVT::v4f32: case MVT::v2f64:
1544 RRC = &ARM::DPRRegClass;
1545 Cost = 2;
1546 break;
1547 case MVT::v4i64:
1548 RRC = &ARM::DPRRegClass;
1549 Cost = 4;
1550 break;
1551 case MVT::v8i64:
1552 RRC = &ARM::DPRRegClass;
1553 Cost = 8;
1554 break;
1555 }
1556 return std::make_pair(RRC, Cost);
1557}
1558
1560 EVT VT) const {
1561 if (!VT.isVector())
1562 return getPointerTy(DL);
1563
1564 // MVE has a predicate register.
1565 if ((Subtarget->hasMVEIntegerOps() &&
1566 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1567 VT == MVT::v16i8)) ||
1568 (Subtarget->hasMVEFloatOps() &&
1569 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1570 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1572}
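
// For example, with MVE an ISD::SETCC comparing two v4i32 operands produces a
// v4i1 result, carried in one of the VCCR predicate registers registered in
// addMVEVectorTypes above, while scalar compares simply use the pointer-sized
// integer type.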
1573
1574/// getRegClassFor - Return the register class that should be used for the
1575/// specified value type.
1576const TargetRegisterClass *
1577ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1578 (void)isDivergent;
1579 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1580 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1581 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1582 // MVE Q registers.
1583 if (Subtarget->hasNEON()) {
1584 if (VT == MVT::v4i64)
1585 return &ARM::QQPRRegClass;
1586 if (VT == MVT::v8i64)
1587 return &ARM::QQQQPRRegClass;
1588 }
1589 if (Subtarget->hasMVEIntegerOps()) {
1590 if (VT == MVT::v4i64)
1591 return &ARM::MQQPRRegClass;
1592 if (VT == MVT::v8i64)
1593 return &ARM::MQQQQPRRegClass;
1594 }
1596}
1597
1598// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1599// source/dest is aligned and the copy size is large enough. We therefore want
1600// to align such objects passed to memory intrinsics.
1602 Align &PrefAlign) const {
1603 if (!isa<MemIntrinsic>(CI))
1604 return false;
1605 MinSize = 8;
1606 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1607 // cycle faster than 4-byte aligned LDM.
1608 PrefAlign =
1609 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1610 return true;
1611}
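
// A rough illustration of the effect (assuming the usual lowering of memcpy
// to the llvm.memcpy intrinsic): a copy of at least 8 bytes whose pointer
// arguments can be given 8-byte alignment may then be expanded using LDM/STM
// (or LDRD/STRD) sequences instead of narrower loads and stores.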
1612
1613// Create a fast isel object.
1614FastISel *
1616 const TargetLibraryInfo *libInfo) const {
1617 return ARM::createFastISel(funcInfo, libInfo);
1618}
1619
1621 unsigned NumVals = N->getNumValues();
1622 if (!NumVals)
1623 return Sched::RegPressure;
1624
1625 for (unsigned i = 0; i != NumVals; ++i) {
1626 EVT VT = N->getValueType(i);
1627 if (VT == MVT::Glue || VT == MVT::Other)
1628 continue;
1629 if (VT.isFloatingPoint() || VT.isVector())
1630 return Sched::ILP;
1631 }
1632
1633 if (!N->isMachineOpcode())
1634 return Sched::RegPressure;
1635
1636 // Loads are scheduled for latency even if the instruction itinerary
1637 // is not available.
1638 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1639 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1640
1641 if (MCID.getNumDefs() == 0)
1642 return Sched::RegPressure;
1643 if (!Itins->isEmpty() &&
1644 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1645 return Sched::ILP;
1646
1647 return Sched::RegPressure;
1648}
1649
1650//===----------------------------------------------------------------------===//
1651// Lowering Code
1652//===----------------------------------------------------------------------===//
1653
1654static bool isSRL16(const SDValue &Op) {
1655 if (Op.getOpcode() != ISD::SRL)
1656 return false;
1657 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1658 return Const->getZExtValue() == 16;
1659 return false;
1660}
1661
1662static bool isSRA16(const SDValue &Op) {
1663 if (Op.getOpcode() != ISD::SRA)
1664 return false;
1665 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1666 return Const->getZExtValue() == 16;
1667 return false;
1668}
1669
1670static bool isSHL16(const SDValue &Op) {
1671 if (Op.getOpcode() != ISD::SHL)
1672 return false;
1673 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1674 return Const->getZExtValue() == 16;
1675 return false;
1676}
1677
1678// Check for a signed 16-bit value. We special-case SRA because it makes
1679// things simpler when also looking for SRAs that aren't sign-extending a
1680// smaller value. Without the check, we'd need to take extra care with
1681// checking order for some operations.
1682static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1683 if (isSRA16(Op))
1684 return isSHL16(Op.getOperand(0));
1685 return DAG.ComputeNumSignBits(Op) == 17;
1686}
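
// For example, (sra (shl X, 16), 16) is accepted directly through the
// isSRA16/isSHL16 check above, and any other value for which the DAG can
// prove 17 sign bits (i.e. one that fits in a signed 16-bit quantity) also
// qualifies.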
1687
1688/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
1690 switch (CC) {
1691 default: llvm_unreachable("Unknown condition code!");
1692 case ISD::SETNE: return ARMCC::NE;
1693 case ISD::SETEQ: return ARMCC::EQ;
1694 case ISD::SETGT: return ARMCC::GT;
1695 case ISD::SETGE: return ARMCC::GE;
1696 case ISD::SETLT: return ARMCC::LT;
1697 case ISD::SETLE: return ARMCC::LE;
1698 case ISD::SETUGT: return ARMCC::HI;
1699 case ISD::SETUGE: return ARMCC::HS;
1700 case ISD::SETULT: return ARMCC::LO;
1701 case ISD::SETULE: return ARMCC::LS;
1702 }
1703}
1704
1705/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
1707 ARMCC::CondCodes &CondCode2) {
1708 CondCode2 = ARMCC::AL;
1709 switch (CC) {
1710 default: llvm_unreachable("Unknown FP condition!");
1711 case ISD::SETEQ:
1712 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1713 case ISD::SETGT:
1714 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1715 case ISD::SETGE:
1716 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1717 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1718 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1719 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1720 case ISD::SETO: CondCode = ARMCC::VC; break;
1721 case ISD::SETUO: CondCode = ARMCC::VS; break;
1722 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1723 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1724 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1725 case ISD::SETLT:
1726 case ISD::SETULT: CondCode = ARMCC::LT; break;
1727 case ISD::SETLE:
1728 case ISD::SETULE: CondCode = ARMCC::LE; break;
1729 case ISD::SETNE:
1730 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1731 }
1732}
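
// Note that the unordered-equal and ordered-not-equal cases need two ARM
// conditions: SETUEQ becomes "EQ or VS" and SETONE becomes "MI or GT", so
// callers are expected to emit a second conditional check whenever CondCode2
// is not ARMCC::AL.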
1733
1734//===----------------------------------------------------------------------===//
1735// Calling Convention Implementation
1736//===----------------------------------------------------------------------===//
1737
1738/// getEffectiveCallingConv - Get the effective calling convention, taking into
1739/// account the presence of floating-point hardware and calling-convention
1740/// limitations, such as support for variadic functions.
1742ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1743 bool isVarArg) const {
1744 switch (CC) {
1745 default:
1746 report_fatal_error("Unsupported calling convention");
1749 case CallingConv::GHC:
1751 return CC;
1757 case CallingConv::Swift:
1760 case CallingConv::C:
1761 case CallingConv::Tail:
1762 if (!getTM().isAAPCS_ABI())
1763 return CallingConv::ARM_APCS;
1764 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1765 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1766 !isVarArg)
1768 else
1770 case CallingConv::Fast:
1772 if (!getTM().isAAPCS_ABI()) {
1773 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
1774 return CallingConv::Fast;
1775 return CallingConv::ARM_APCS;
1776 } else if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() &&
1777 !isVarArg)
1779 else
1781 }
1782}
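
// One practical consequence of the logic above: variadic calls never use the
// VFP argument registers, so for a C (or Swift) call on a hard-float AAPCS
// target the VFP-register convention is only chosen when isVarArg is false.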
1783
1785 bool isVarArg) const {
1786 return CCAssignFnForNode(CC, false, isVarArg);
1787}
1788
1790 bool isVarArg) const {
1791 return CCAssignFnForNode(CC, true, isVarArg);
1792}
1793
1794/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1795/// CallingConvention.
1796CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1797 bool Return,
1798 bool isVarArg) const {
1799 switch (getEffectiveCallingConv(CC, isVarArg)) {
1800 default:
1801 report_fatal_error("Unsupported calling convention");
1803 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1805 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1807 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1808 case CallingConv::Fast:
1809 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1810 case CallingConv::GHC:
1811 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1813 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1815 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1817 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1818 }
1819}
1820
1821SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1822 MVT LocVT, MVT ValVT, SDValue Val) const {
1823 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1824 Val);
1825 if (Subtarget->hasFullFP16()) {
1826 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1827 } else {
1828 Val = DAG.getNode(ISD::TRUNCATE, dl,
1829 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1830 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1831 }
1832 return Val;
1833}
1834
1835SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1836 MVT LocVT, MVT ValVT,
1837 SDValue Val) const {
1838 if (Subtarget->hasFullFP16()) {
1839 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1840 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1841 } else {
1842 Val = DAG.getNode(ISD::BITCAST, dl,
1843 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1844 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1845 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1846 }
1847 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1848}
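
// MoveToHPR and MoveFromHPR are the two halves of f16/bf16 value passing:
// with +fullfp16 the value is moved directly between its 32-bit location and
// a half-precision register via VMOVhr/VMOVrh, otherwise it is shuffled
// through an integer truncate/zero-extend plus bitcasts.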
1849
1850/// LowerCallResult - Lower the result values of a call into the
1851/// appropriate copies out of appropriate physical registers.
1852SDValue ARMTargetLowering::LowerCallResult(
1853 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1854 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1855 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1856 SDValue ThisVal, bool isCmseNSCall) const {
1857 // Assign locations to each value returned by this call.
1859 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1860 *DAG.getContext());
1861 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1862
1863 // Copy all of the result registers out of their specified physreg.
1864 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1865 CCValAssign VA = RVLocs[i];
1866
1867 // Pass the 'this' value directly from the argument to the return value, to
 1868 // avoid register unit interference.
1869 if (i == 0 && isThisReturn) {
1870 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1871 "unexpected return calling convention register assignment");
1872 InVals.push_back(ThisVal);
1873 continue;
1874 }
1875
1876 SDValue Val;
1877 if (VA.needsCustom() &&
1878 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1879 // Handle f64 or half of a v2f64.
1880 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1881 InGlue);
1882 Chain = Lo.getValue(1);
1883 InGlue = Lo.getValue(2);
1884 VA = RVLocs[++i]; // skip ahead to next loc
1885 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1886 InGlue);
1887 Chain = Hi.getValue(1);
1888 InGlue = Hi.getValue(2);
1889 if (!Subtarget->isLittle())
1890 std::swap (Lo, Hi);
1891 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1892
1893 if (VA.getLocVT() == MVT::v2f64) {
1894 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1895 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1896 DAG.getConstant(0, dl, MVT::i32));
1897
1898 VA = RVLocs[++i]; // skip ahead to next loc
1899 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1900 Chain = Lo.getValue(1);
1901 InGlue = Lo.getValue(2);
1902 VA = RVLocs[++i]; // skip ahead to next loc
1903 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1904 Chain = Hi.getValue(1);
1905 InGlue = Hi.getValue(2);
1906 if (!Subtarget->isLittle())
1907 std::swap (Lo, Hi);
1908 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1909 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1910 DAG.getConstant(1, dl, MVT::i32));
1911 }
1912 } else {
1913 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1914 InGlue);
1915 Chain = Val.getValue(1);
1916 InGlue = Val.getValue(2);
1917 }
1918
1919 switch (VA.getLocInfo()) {
1920 default: llvm_unreachable("Unknown loc info!");
1921 case CCValAssign::Full: break;
1922 case CCValAssign::BCvt:
1923 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1924 break;
1925 }
1926
1927 // f16 arguments have their size extended to 4 bytes and passed as if they
1928 // had been copied to the LSBs of a 32-bit register.
1929 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1930 if (VA.needsCustom() &&
1931 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1932 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1933
1934 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1935 // is less than 32 bits must be sign- or zero-extended after the call for
1936 // security reasons. Although the ABI mandates an extension done by the
1937 // callee, the latter cannot be trusted to follow the rules of the ABI.
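// For example, an i8 result from a non-secure callee arrives in the low bits
// of r0; handleCMSEValue re-extends it to 32 bits on the caller's side, so
// whatever the untrusted callee left in the upper bits of r0 cannot leak into
// later uses of the value.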
1938 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1939 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1940 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1941 Val = handleCMSEValue(Val, Arg, DAG, dl);
1942
1943 InVals.push_back(Val);
1944 }
1945
1946 return Chain;
1947}
1948
1949std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1950 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1951 bool IsTailCall, int SPDiff) const {
1952 SDValue DstAddr;
1953 MachinePointerInfo DstInfo;
1954 int32_t Offset = VA.getLocMemOffset();
1955 MachineFunction &MF = DAG.getMachineFunction();
1956
1957 if (IsTailCall) {
1958 Offset += SPDiff;
1959 auto PtrVT = getPointerTy(DAG.getDataLayout());
1960 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1961 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1962 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1963 DstInfo =
1964 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
1965 } else {
1966 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1967 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1968 StackPtr, PtrOff);
1969 DstInfo =
1970 MachinePointerInfo::getStack(DAG.getMachineFunction(), Offset);
1971 }
1972
1973 return std::make_pair(DstAddr, DstInfo);
1974}
1975
1976// Returns the type of copying which is required to set up a byval argument to
1977// a tail-called function. This isn't needed for non-tail calls, because they
1978// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1979// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1980// optimised to zero copies when forwarding an argument from the caller's
1981// caller (NoCopy).
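// For example, if f(struct A a, struct B b) tail-calls g(b, a) with both
// arguments passed byval on the stack, writing b into a's slot could clobber
// a before it has been read, so each copy is routed through a local temporary
// (CopyViaTemp). If f instead forwards its own byval argument to the slot it
// arrived in, no copy is needed at all (NoCopy).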
1982ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1983 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1984 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1985 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1986
1987 // Globals are always safe to copy from.
1988 if (isa<GlobalAddressSDNode>(Src))
1989 return CopyOnce;
1990
1991 // Can only analyse frame index nodes, conservatively assume we need a
1992 // temporary.
1993 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1994 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1995 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1996 return CopyViaTemp;
1997
1998 int SrcFI = SrcFrameIdxNode->getIndex();
1999 int DstFI = DstFrameIdxNode->getIndex();
2000 assert(MFI.isFixedObjectIndex(DstFI) &&
2001 "byval passed in non-fixed stack slot");
2002
2003 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
2004 int64_t DstOffset = MFI.getObjectOffset(DstFI);
2005
2006 // If the source is in the local frame, then the copy to the argument memory
2007 // is always valid.
2008 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
2009 if (!FixedSrc ||
2010 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
2011 return CopyOnce;
2012
2013 // In the case of byval arguments split between registers and the stack,
2014 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
2015 // stack portion, but the Src SDValue will refer to the full value, including
2016 // the local stack memory that the register portion gets stored into. We only
2017 // need to compare them for equality, so normalise on the full value version.
2018 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
2019 DstOffset -= RegSize;
2020
2021 // If the value is already in the correct location, then no copying is
2022 // needed. If not, then we need to copy via a temporary.
2023 if (SrcOffset == DstOffset)
2024 return NoCopy;
2025 else
2026 return CopyViaTemp;
2027}
2028
2029void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2030 SDValue Chain, SDValue &Arg,
2031 RegsToPassVector &RegsToPass,
2032 CCValAssign &VA, CCValAssign &NextVA,
2033 SDValue &StackPtr,
2034 SmallVectorImpl<SDValue> &MemOpChains,
2035 bool IsTailCall,
2036 int SPDiff) const {
2037 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2038 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2039 unsigned id = Subtarget->isLittle() ? 0 : 1;
2040 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2041
2042 if (NextVA.isRegLoc())
2043 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2044 else {
2045 assert(NextVA.isMemLoc());
2046 if (!StackPtr.getNode())
2047 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2048 getPointerTy(DAG.getDataLayout()));
2049
2050 SDValue DstAddr;
2051 MachinePointerInfo DstInfo;
2052 std::tie(DstAddr, DstInfo) =
2053 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2054 MemOpChains.push_back(
2055 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2056 }
2057}
2058
2059static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2060 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2061 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
2062}
2063
2064/// LowerCall - Lowering a call into a callseq_start <-
2065/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2066/// nodes.
2067SDValue
2068ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2069 SmallVectorImpl<SDValue> &InVals) const {
2070 SelectionDAG &DAG = CLI.DAG;
2071 SDLoc &dl = CLI.DL;
2072 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2073 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2074 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2075 SDValue Chain = CLI.Chain;
2076 SDValue Callee = CLI.Callee;
2077 bool &isTailCall = CLI.IsTailCall;
2078 CallingConv::ID CallConv = CLI.CallConv;
2079 bool doesNotRet = CLI.DoesNotReturn;
2080 bool isVarArg = CLI.IsVarArg;
2081 const CallBase *CB = CLI.CB;
2082
2083 MachineFunction &MF = DAG.getMachineFunction();
2084 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2085 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2086 MachineFunction::CallSiteInfo CSInfo;
2087 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2088 bool isThisReturn = false;
2089 bool isCmseNSCall = false;
2090 bool isSibCall = false;
2091 bool PreferIndirect = false;
2092 bool GuardWithBTI = false;
2093
2094 // Analyze operands of the call, assigning locations to each operand.
2095 SmallVector<CCValAssign, 16> ArgLocs;
2096 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2097 *DAG.getContext());
2098 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2099
2100 // Lower 'returns_twice' calls to a pseudo-instruction.
2101 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2102 !Subtarget->noBTIAtReturnTwice())
2103 GuardWithBTI = AFI->branchTargetEnforcement();
2104
2105 // Set type id for call site info.
2106 if (MF.getTarget().Options.EmitCallGraphSection && CB && CB->isIndirectCall())
2107 CSInfo = MachineFunction::CallSiteInfo(*CB);
2108
2109 // Determine whether this is a non-secure function call.
2110 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2111 isCmseNSCall = true;
2112
2113 // Disable tail calls if they're not supported.
2114 if (!Subtarget->supportsTailCall())
2115 isTailCall = false;
2116
2117 // For both the non-secure calls and the returns from a CMSE entry function,
2118 // the function needs to do some extra work after the call, or before the
2119 // return, respectively, thus it cannot end with a tail call
2120 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2121 isTailCall = false;
2122
2123 if (isa<GlobalAddressSDNode>(Callee)) {
2124 // If we're optimizing for minimum size and the function is called three or
2125 // more times in this block, we can improve codesize by calling indirectly
2126 // as BLXr has a 16-bit encoding.
2127 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2128 if (CLI.CB) {
2129 auto *BB = CLI.CB->getParent();
2130 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2131 count_if(GV->users(), [&BB](const User *U) {
2132 return isa<Instruction>(U) &&
2133 cast<Instruction>(U)->getParent() == BB;
2134 }) > 2;
2135 }
2136 }
2137 if (isTailCall) {
2138 // Check if it's really possible to do a tail call.
2139 isTailCall =
2140 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2141
2142 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2143 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2144 isSibCall = true;
2145
2146 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2147 // detected sibcalls.
2148 if (isTailCall)
2149 ++NumTailCalls;
2150 }
2151
2152 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2153 report_fatal_error("failed to perform tail call elimination on a call "
2154 "site marked musttail");
2155
2156 // Get a count of how many bytes are to be pushed on the stack.
2157 unsigned NumBytes = CCInfo.getStackSize();
2158
2159 // SPDiff is the byte offset of the call's argument area from the callee's.
2160 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2161 // by this amount for a tail call. In a sibling call it must be 0 because the
2162 // caller will deallocate the entire stack and the callee still expects its
2163 // arguments to begin at SP+0. Completely unused for non-tail calls.
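// For example, if the caller's own argument area holds 16 bytes
// (NumReusableBytes) but this tail call needs 24 bytes of stack arguments,
// SPDiff is -8: the callee's arguments start 8 bytes below the current
// argument area and extra space must be reserved in the frame.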
2164 int SPDiff = 0;
2165
2166 if (isTailCall && !isSibCall) {
2167 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2168 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2169
2170 // Since callee will pop argument stack as a tail call, we must keep the
2171 // popped size 16-byte aligned.
2172 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2173 assert(StackAlign && "data layout string is missing stack alignment");
2174 NumBytes = alignTo(NumBytes, *StackAlign);
2175
2176 // SPDiff will be negative if this tail call requires more space than we
2177 // would automatically have in our incoming argument space. Positive if we
2178 // can actually shrink the stack.
2179 SPDiff = NumReusableBytes - NumBytes;
2180
2181 // If this call requires more stack than we have available from
2182 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2183 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2184 AFI->setArgRegsSaveSize(-SPDiff);
2185 }
2186
2187 if (isSibCall) {
2188 // For sibling tail calls, memory operands are available in our caller's stack.
2189 NumBytes = 0;
2190 } else {
2191 // Adjust the stack pointer for the new arguments...
2192 // These operations are automatically eliminated by the prolog/epilog pass
2193 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2194 }
2195
2197 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2198
2199 RegsToPassVector RegsToPass;
2200 SmallVector<SDValue, 8> MemOpChains;
2201
2202 // If we are doing a tail-call, any byval arguments will be written to stack
2203 // space which was used for incoming arguments. If any of the values being used
2204 // are incoming byval arguments to this function, then they might be
2205 // overwritten by the stores of the outgoing arguments. To avoid this, we
2206 // need to make a temporary copy of them in local stack space, then copy back
2207 // to the argument area.
2208 DenseMap<unsigned, SDValue> ByValTemporaries;
2209 SDValue ByValTempChain;
2210 if (isTailCall) {
2211 SmallVector<SDValue, 8> ByValCopyChains;
2212 for (const CCValAssign &VA : ArgLocs) {
2213 unsigned ArgIdx = VA.getValNo();
2214 SDValue Src = OutVals[ArgIdx];
2215 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2216
2217 if (!Flags.isByVal())
2218 continue;
2219
2220 SDValue Dst;
2221 MachinePointerInfo DstInfo;
2222 std::tie(Dst, DstInfo) =
2223 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2224 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2225
2226 if (Copy == NoCopy) {
2227 // If the argument is already at the correct offset on the stack
2228 // (because we are forwarding a byval argument from our caller), we
2229 // don't need any copying.
2230 continue;
2231 } else if (Copy == CopyOnce) {
2232 // If the argument is in our local stack frame, no other argument
2233 // preparation can clobber it, so we can copy it to the final location
2234 // later.
2235 ByValTemporaries[ArgIdx] = Src;
2236 } else {
2237 assert(Copy == CopyViaTemp && "unexpected enum value");
2238 // If we might be copying this argument from the outgoing argument
2239 // stack area, we need to copy via a temporary in the local stack
2240 // frame.
2241 int TempFrameIdx = MFI.CreateStackObject(
2242 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2243 SDValue Temp =
2244 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2245
2246 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2247 SDValue AlignNode =
2248 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2249
2250 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2251 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2252 ByValCopyChains.push_back(
2253 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2254 ByValTemporaries[ArgIdx] = Temp;
2255 }
2256 }
2257 if (!ByValCopyChains.empty())
2258 ByValTempChain =
2259 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2260 }
2261
2262 // During a tail call, stores to the argument area must happen after all of
2263 // the function's incoming arguments have been loaded because they may alias.
2264 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2265 // there's no point in doing so repeatedly so this tracks whether that's
2266 // happened yet.
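// For example, when an outgoing stack argument is loaded from one of this
// function's own incoming stack slots, that load has to be chained before the
// store which reuses the slot, otherwise the store could clobber the incoming
// value before it is read.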
2267 bool AfterFormalArgLoads = false;
2268
2269 // Walk the register/memloc assignments, inserting copies/loads. In the case
2270 // of tail call optimization, arguments are handled later.
2271 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2272 i != e;
2273 ++i, ++realArgIdx) {
2274 CCValAssign &VA = ArgLocs[i];
2275 SDValue Arg = OutVals[realArgIdx];
2276 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2277 bool isByVal = Flags.isByVal();
2278
2279 // Promote the value if needed.
2280 switch (VA.getLocInfo()) {
2281 default: llvm_unreachable("Unknown loc info!");
2282 case CCValAssign::Full: break;
2283 case CCValAssign::SExt:
2284 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2285 break;
2286 case CCValAssign::ZExt:
2287 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2288 break;
2289 case CCValAssign::AExt:
2290 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2291 break;
2292 case CCValAssign::BCvt:
2293 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2294 break;
2295 }
2296
2297 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2298 Chain = DAG.getStackArgumentTokenFactor(Chain);
2299 if (ByValTempChain) {
2300 // In case of large byval copies, re-using the stackframe for tail-calls
2301 // can lead to overwriting incoming arguments on the stack. Force
2302 // loading these stack arguments before the copy to avoid that.
2303 SmallVector<SDValue, 8> IncomingLoad;
2304 for (unsigned I = 0; I < OutVals.size(); ++I) {
2305 if (Outs[I].Flags.isByVal())
2306 continue;
2307
2308 SDValue OutVal = OutVals[I];
2309 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2310 if (!OutLN)
2311 continue;
2312
2313 FrameIndexSDNode *FIN =
2314 dyn_cast<FrameIndexSDNode>(OutLN->getBasePtr());
2315 if (!FIN)
2316 continue;
2317
2318 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2319 continue;
2320
2321 for (const CCValAssign &VA : ArgLocs) {
2322 if (VA.isMemLoc())
2323 IncomingLoad.push_back(OutVal.getValue(1));
2324 }
2325 }
2326
2327 // Update the chain to force loads for potentially clobbered argument
2328 // loads to happen before the byval copy.
2329 if (!IncomingLoad.empty()) {
2330 IncomingLoad.push_back(Chain);
2331 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2332 }
2333
2334 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2335 ByValTempChain);
2336 }
2337 AfterFormalArgLoads = true;
2338 }
2339
2340 // f16 arguments have their size extended to 4 bytes and passed as if they
2341 // had been copied to the LSBs of a 32-bit register.
2342 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2343 if (VA.needsCustom() &&
2344 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2345 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2346 } else {
2347 // f16 arguments could have been extended prior to argument lowering.
2348 // Mask these arguments if this is a CMSE nonsecure call.
2349 auto ArgVT = Outs[realArgIdx].ArgVT;
2350 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2351 auto LocBits = VA.getLocVT().getSizeInBits();
2352 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2353 SDValue Mask =
2354 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2355 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2356 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2357 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2358 }
2359 }
2360
2361 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2362 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2363 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2364 DAG.getConstant(0, dl, MVT::i32));
2365 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2366 DAG.getConstant(1, dl, MVT::i32));
2367
2368 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2369 StackPtr, MemOpChains, isTailCall, SPDiff);
2370
2371 VA = ArgLocs[++i]; // skip ahead to next loc
2372 if (VA.isRegLoc()) {
2373 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2374 StackPtr, MemOpChains, isTailCall, SPDiff);
2375 } else {
2376 assert(VA.isMemLoc());
2377 SDValue DstAddr;
2378 MachinePointerInfo DstInfo;
2379 std::tie(DstAddr, DstInfo) =
2380 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2381 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2382 }
2383 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2384 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2385 StackPtr, MemOpChains, isTailCall, SPDiff);
2386 } else if (VA.isRegLoc()) {
2387 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2388 Outs[0].VT == MVT::i32) {
2389 assert(VA.getLocVT() == MVT::i32 &&
2390 "unexpected calling convention register assignment");
2391 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2392 "unexpected use of 'returned'");
2393 isThisReturn = true;
2394 }
2395 const TargetOptions &Options = DAG.getTarget().Options;
2396 if (Options.EmitCallSiteInfo)
2397 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2398 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2399 } else if (isByVal) {
2400 assert(VA.isMemLoc());
2401 unsigned offset = 0;
2402
2403 // True if this byval aggregate will be split between registers
2404 // and memory.
2405 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2406 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2407
2408 SDValue ByValSrc;
2409 bool NeedsStackCopy;
2410 if (auto It = ByValTemporaries.find(realArgIdx);
2411 It != ByValTemporaries.end()) {
2412 ByValSrc = It->second;
2413 NeedsStackCopy = true;
2414 } else {
2415 ByValSrc = Arg;
2416 NeedsStackCopy = !isTailCall;
2417 }
2418
2419 // If part of the argument is in registers, load them.
2420 if (CurByValIdx < ByValArgsCount) {
2421 unsigned RegBegin, RegEnd;
2422 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2423
2424 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2425 unsigned int i, j;
2426 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2427 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2428 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2429 SDValue Load =
2430 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2431 DAG.InferPtrAlign(AddArg));
2432 MemOpChains.push_back(Load.getValue(1));
2433 RegsToPass.push_back(std::make_pair(j, Load));
2434 }
2435
2436 // If the parameter size exceeds the register area, the "offset" value
2437 // helps us calculate the stack slot for the remaining part properly.
2438 offset = RegEnd - RegBegin;
2439
2440 CCInfo.nextInRegsParam();
2441 }
2442
2443 // If the memory part of the argument isn't already in the correct place
2444 // (which can happen with tail calls), copy it into the argument area.
2445 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2446 auto PtrVT = getPointerTy(DAG.getDataLayout());
2447 SDValue Dst;
2448 MachinePointerInfo DstInfo;
2449 std::tie(Dst, DstInfo) =
2450 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2451 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2452 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2453 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2454 MVT::i32);
2455 SDValue AlignNode =
2456 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2457
2458 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2459 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2460 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2461 Ops));
2462 }
2463 } else {
2464 assert(VA.isMemLoc());
2465 SDValue DstAddr;
2466 MachinePointerInfo DstInfo;
2467 std::tie(DstAddr, DstInfo) =
2468 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2469
2470 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2471 MemOpChains.push_back(Store);
2472 }
2473 }
2474
2475 if (!MemOpChains.empty())
2476 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2477
2478 // Build a sequence of copy-to-reg nodes chained together with token chain
2479 // and flag operands which copy the outgoing args into the appropriate regs.
2480 SDValue InGlue;
2481 for (const auto &[Reg, N] : RegsToPass) {
2482 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2483 InGlue = Chain.getValue(1);
2484 }
2485
2486 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2487 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2488 // node so that legalize doesn't hack it.
2489 bool isDirect = false;
2490
2491 const TargetMachine &TM = getTargetMachine();
2492 const GlobalValue *GVal = nullptr;
2493 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2494 GVal = G->getGlobal();
2495 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2496
2497 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2498 bool isLocalARMFunc = false;
2499 auto PtrVt = getPointerTy(DAG.getDataLayout());
2500
2501 if (Subtarget->genLongCalls()) {
2502 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2503 "long-calls codegen is not position independent!");
2504 // Handle a global address or an external symbol. If it's not one of
2505 // those, the target's already in a register, so we don't need to do
2506 // anything extra.
2507 if (isa<GlobalAddressSDNode>(Callee)) {
2508 if (Subtarget->genExecuteOnly()) {
2509 if (Subtarget->useMovt())
2510 ++NumMovwMovt;
2511 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2512 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2513 } else {
2514 // Create a constant pool entry for the callee address
2515 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2516 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2517 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2518
2519 // Get the address of the callee into a register
2520 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2521 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2522 Callee = DAG.getLoad(
2523 PtrVt, dl, DAG.getEntryNode(), Addr,
2524 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2525 }
2526 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2527 const char *Sym = S->getSymbol();
2528
2529 if (Subtarget->genExecuteOnly()) {
2530 if (Subtarget->useMovt())
2531 ++NumMovwMovt;
2532 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2533 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2534 } else {
2535 // Create a constant pool entry for the callee address
2536 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2537 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2538 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2539
2540 // Get the address of the callee into a register
2541 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2542 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2543 Callee = DAG.getLoad(
2544 PtrVt, dl, DAG.getEntryNode(), Addr,
2545 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2546 }
2547 }
2548 } else if (isa<GlobalAddressSDNode>(Callee)) {
2549 if (!PreferIndirect) {
2550 isDirect = true;
2551 bool isDef = GVal->isStrongDefinitionForLinker();
2552
2553 // ARM call to a local ARM function is predicable.
2554 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2555 // tBX takes a register source operand.
2556 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2557 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2558 Callee = DAG.getNode(
2559 ARMISD::WrapperPIC, dl, PtrVt,
2560 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2561 Callee = DAG.getLoad(
2562 PtrVt, dl, DAG.getEntryNode(), Callee,
2566 } else if (Subtarget->isTargetCOFF()) {
2567 assert(Subtarget->isTargetWindows() &&
2568 "Windows is the only supported COFF target");
2569 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2570 if (GVal->hasDLLImportStorageClass())
2571 TargetFlags = ARMII::MO_DLLIMPORT;
2572 else if (!TM.shouldAssumeDSOLocal(GVal))
2573 TargetFlags = ARMII::MO_COFFSTUB;
2574 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2575 TargetFlags);
2576 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2577 Callee =
2578 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2579 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2580 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2581 } else {
2582 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2583 }
2584 }
2585 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2586 isDirect = true;
2587 // tBX takes a register source operand.
2588 const char *Sym = S->getSymbol();
2589 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2590 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2591 ARMConstantPoolValue *CPV =
2592 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2593 ARMPCLabelIndex, 4);
2594 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2595 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2596 Callee = DAG.getLoad(
2597 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2598 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2599 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2600 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2601 } else {
2602 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2603 }
2604 }
2605
2606 if (isCmseNSCall) {
2607 assert(!isARMFunc && !isDirect &&
2608 "Cannot handle call to ARM function or direct call");
2609 if (NumBytes > 0) {
2610 DAG.getContext()->diagnose(
2611 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2612 "call to non-secure function would require "
2613 "passing arguments on stack",
2614 dl.getDebugLoc()));
2615 }
2616 if (isStructRet) {
2617 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2618 DAG.getMachineFunction().getFunction(),
2619 "call to non-secure function would return value through pointer",
2620 dl.getDebugLoc()));
2621 }
2622 }
2623
2624 // FIXME: handle tail calls differently.
2625 unsigned CallOpc;
2626 if (Subtarget->isThumb()) {
2627 if (GuardWithBTI)
2628 CallOpc = ARMISD::t2CALL_BTI;
2629 else if (isCmseNSCall)
2630 CallOpc = ARMISD::tSECALL;
2631 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2632 CallOpc = ARMISD::CALL_NOLINK;
2633 else
2634 CallOpc = ARMISD::CALL;
2635 } else {
2636 if (!isDirect && !Subtarget->hasV5TOps())
2637 CallOpc = ARMISD::CALL_NOLINK;
2638 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2639 // Emit regular call when code size is the priority
2640 !Subtarget->hasMinSize())
2641 // "mov lr, pc; b _foo" to avoid confusing the RSP
2642 CallOpc = ARMISD::CALL_NOLINK;
2643 else
2644 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2645 }
2646
2647 // We don't usually want to end the call-sequence here because we would tidy
2648 // the frame up *after* the call, however in the ABI-changing tail-call case
2649 // we've carefully laid out the parameters so that when sp is reset they'll be
2650 // in the correct location.
2651 if (isTailCall && !isSibCall) {
2652 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2653 InGlue = Chain.getValue(1);
2654 }
2655
2656 std::vector<SDValue> Ops;
2657 Ops.push_back(Chain);
2658 Ops.push_back(Callee);
2659
2660 if (isTailCall) {
2661 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2662 }
2663
2664 // Add argument registers to the end of the list so that they are known live
2665 // into the call.
2666 for (const auto &[Reg, N] : RegsToPass)
2667 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2668
2669 // Add a register mask operand representing the call-preserved registers.
2670 const uint32_t *Mask;
2671 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2672 if (isThisReturn) {
2673 // For 'this' returns, use the R0-preserving mask if applicable
2674 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2675 if (!Mask) {
2676 // Set isThisReturn to false if the calling convention is not one that
2677 // allows 'returned' to be modeled in this way, so LowerCallResult does
2678 // not try to pass 'this' straight through
2679 isThisReturn = false;
2680 Mask = ARI->getCallPreservedMask(MF, CallConv);
2681 }
2682 } else
2683 Mask = ARI->getCallPreservedMask(MF, CallConv);
2684
2685 assert(Mask && "Missing call preserved mask for calling convention");
2686 Ops.push_back(DAG.getRegisterMask(Mask));
2687
2688 if (InGlue.getNode())
2689 Ops.push_back(InGlue);
2690
2691 if (isTailCall) {
2692 MF.getFrameInfo().setHasTailCall();
2693 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2694 if (CLI.CFIType)
2695 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2696 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2697 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2698 return Ret;
2699 }
2700
2701 // Returns a chain and a flag for retval copy to use.
2702 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2703 if (CLI.CFIType)
2704 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2705 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2706 InGlue = Chain.getValue(1);
2707 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2708
2709 // If we're guaranteeing tail-calls will be honoured, the callee must
2710 // pop its own argument stack on return. But this call is *not* a tail call so
2711 // we need to undo that after it returns to restore the status-quo.
2712 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2713 uint64_t CalleePopBytes =
2714 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2715
2716 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2717 if (!Ins.empty())
2718 InGlue = Chain.getValue(1);
2719
2720 // Handle result values, copying them out of physregs into vregs that we
2721 // return.
2722 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2723 InVals, isThisReturn,
2724 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2725}
2726
2727/// HandleByVal - Every parameter *after* a byval parameter is passed
2728/// on the stack. Remember the next parameter register to allocate,
2729 /// and then confiscate the rest of the parameter registers to ensure
2730/// this.
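/// For example, under the AAPCS a 4-byte-aligned, 12-byte byval argument
/// arriving when r1 is the next free register occupies r1-r3 and leaves
/// nothing in memory, while the same argument arriving at r3 (with no stack
/// arguments allocated yet) takes r3 and leaves 8 bytes for the stack.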
2731void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2732 Align Alignment) const {
2733 // Byval (as with any stack) slots are always at least 4 byte aligned.
2734 Alignment = std::max(Alignment, Align(4));
2735
2736 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2737 if (!Reg)
2738 return;
2739
2740 unsigned AlignInRegs = Alignment.value() / 4;
2741 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2742 for (unsigned i = 0; i < Waste; ++i)
2743 Reg = State->AllocateReg(GPRArgRegs);
2744
2745 if (!Reg)
2746 return;
2747
2748 unsigned Excess = 4 * (ARM::R4 - Reg);
2749
2750 // Special case when NSAA != SP and the parameter size is greater than the
2751 // size of all remaining GPR regs. In that case we can't split the parameter;
2752 // we must send it all to the stack. We also must set NCRN to R4, so all
2753 // remaining registers are wasted.
2754 const unsigned NSAAOffset = State->getStackSize();
2755 if (NSAAOffset != 0 && Size > Excess) {
2756 while (State->AllocateReg(GPRArgRegs))
2757 ;
2758 return;
2759 }
2760
2761 // The first register for the byval parameter is the first register that
2762 // wasn't allocated before this method call, so it is "reg".
2763 // If the parameter is small enough to be saved in the range [reg, r4), then
2764 // the end (one past the last) register is reg + param-size-in-regs;
2765 // otherwise the parameter is split between registers and the stack, and
2766 // the end register is r4 in that case.
2767 unsigned ByValRegBegin = Reg;
2768 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2769 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2770 // Note, the first register was already allocated at the beginning of this
2771 // function; allocate the remaining registers we need.
2772 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2773 State->AllocateReg(GPRArgRegs);
2774 // A byval parameter that is split between registers and memory needs its
2775 // size truncated here.
2776 // In the case where the entire structure fits in registers, we set the
2777 // size in memory to zero.
2778 Size = std::max<int>(Size - Excess, 0);
2779}
2780
2781/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2782/// for tail call optimization. Targets which want to do tail call
2783/// optimization should implement this function. Note that this function also
2784/// processes musttail calls, so when this function returns false on a valid
2785/// musttail call, a fatal backend error occurs.
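/// For example, a musttail call whose stack arguments do not fit into the
/// caller's own argument area is rejected here, and LowerCall then reports a
/// fatal error instead of silently emitting a normal call.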
2786bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2787 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2788 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2789 CallingConv::ID CalleeCC = CLI.CallConv;
2790 SDValue Callee = CLI.Callee;
2791 bool isVarArg = CLI.IsVarArg;
2792 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2793 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2794 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2795 const SelectionDAG &DAG = CLI.DAG;
2796 MachineFunction &MF = DAG.getMachineFunction();
2797 const Function &CallerF = MF.getFunction();
2798 CallingConv::ID CallerCC = CallerF.getCallingConv();
2799
2800 assert(Subtarget->supportsTailCall());
2801
2802 // Indirect tail-calls require a register to hold the target address. That
2803 // register must be:
2804 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2805 // * Not callee-saved, so must be one of r0-r3 or r12.
2806 // * Not used to hold an argument to the tail-called function, which might be
2807 // in r0-r3.
2808 // * Not used to hold the return address authentication code, which is in r12
2809 // if enabled.
2810 // Sometimes, no register matches all of these conditions, so we can't do a
2811 // tail-call.
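// For example, an indirect tail call that takes four integer arguments uses
// r0-r3 for those arguments, and on a Thumb1 target (or when r12 holds the
// return address authentication code) no suitable register is left for the
// target address, so the tail call has to be rejected.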
2812 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
2813 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2814 ARM::R3};
2815 if (!(Subtarget->isThumb1Only() ||
2816 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2817 AddressRegisters.insert(ARM::R12);
2818 for (const CCValAssign &AL : ArgLocs)
2819 if (AL.isRegLoc())
2820 AddressRegisters.erase(AL.getLocReg());
2821 if (AddressRegisters.empty()) {
2822 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2823 return false;
2824 }
2825 }
2826
2827 // Look for obvious safe cases to perform tail call optimization that do not
2828 // require ABI changes. This is what gcc calls sibcall.
2829
2830 // Exception-handling functions need a special set of instructions to indicate
2831 // a return to the hardware. Tail-calling another function would probably
2832 // break this.
2833 if (CallerF.hasFnAttribute("interrupt")) {
2834 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2835 return false;
2836 }
2837
2838 if (canGuaranteeTCO(CalleeCC,
2839 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2840 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2841 << " (guaranteed tail-call CC)\n");
2842 return CalleeCC == CallerCC;
2843 }
2844
2845 // Also avoid sibcall optimization if either caller or callee uses struct
2846 // return semantics.
2847 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2848 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2849 if (isCalleeStructRet != isCallerStructRet) {
2850 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2851 return false;
2852 }
2853
2854 // Externally-defined functions with weak linkage should not be
2855 // tail-called on ARM when the OS does not support dynamic
2856 // pre-emption of symbols, as the AAELF spec requires normal calls
2857 // to undefined weak functions to be replaced with a NOP or jump to the
2858 // next instruction. The behaviour of branch instructions in this
2859 // situation (as used for tail calls) is implementation-defined, so we
2860 // cannot rely on the linker replacing the tail call with a return.
2861 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2862 const GlobalValue *GV = G->getGlobal();
2863 const Triple &TT = getTargetMachine().getTargetTriple();
2864 if (GV->hasExternalWeakLinkage() &&
2865 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2866 TT.isOSBinFormatMachO())) {
2867 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2868 return false;
2869 }
2870 }
2871
2872 // Check that the call results are passed in the same way.
2873 LLVMContext &C = *DAG.getContext();
2874 if (!CCState::resultsCompatible(
2875 getEffectiveCallingConv(CalleeCC, isVarArg),
2876 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2877 CCAssignFnForReturn(CalleeCC, isVarArg),
2878 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2879 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2880 return false;
2881 }
2882 // The callee has to preserve all registers the caller needs to preserve.
2883 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2884 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2885 if (CalleeCC != CallerCC) {
2886 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2887 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2888 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2889 return false;
2890 }
2891 }
2892
2893 // If Caller's vararg argument has been split between registers and stack, do
2894 // not perform tail call, since part of the argument is in caller's local
2895 // frame.
2896 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2897 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2898 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2899 return false;
2900 }
2901
2902 // If the callee takes no arguments then go on to check the results of the
2903 // call.
2904 const MachineRegisterInfo &MRI = MF.getRegInfo();
2905 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2906 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2907 return false;
2908 }
2909
2910 // If the stack arguments for this call do not fit into our own save area then
2911 // the call cannot be made tail.
2912 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2913 return false;
2914
2915 LLVM_DEBUG(dbgs() << "true\n");
2916 return true;
2917}
2918
2919bool
2920ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2921 MachineFunction &MF, bool isVarArg,
2922 const SmallVectorImpl<ISD::OutputArg> &Outs,
2923 LLVMContext &Context, const Type *RetTy) const {
2924 SmallVector<CCValAssign, 16> RVLocs;
2925 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2926 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2927}
2928
2929 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
2930 const SDLoc &DL, SelectionDAG &DAG) {
2931 const MachineFunction &MF = DAG.getMachineFunction();
2932 const Function &F = MF.getFunction();
2933
2934 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2935
2936 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2937 // version of the "preferred return address". These offsets affect the return
2938 // instruction if this is a return from PL1 without hypervisor extensions.
2939 // IRQ/FIQ: +4 "subs pc, lr, #4"
2940 // SWI: 0 "subs pc, lr, #0"
2941 // ABORT: +4 "subs pc, lr, #4"
2942 // UNDEF: +4/+2 "subs pc, lr, #0"
2943 // UNDEF varies depending on whether the exception came from ARM or Thumb
2944 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
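// For example, an IRQ handler on a PL1 core without hypervisor extensions
// returns with "subs pc, lr, #4", which undoes the +4 bias and restores CPSR
// from SPSR in one instruction.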
2945
2946 int64_t LROffset;
2947 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2948 IntKind == "ABORT")
2949 LROffset = 4;
2950 else if (IntKind == "SWI" || IntKind == "UNDEF")
2951 LROffset = 0;
2952 else
2953 report_fatal_error("Unsupported interrupt attribute. If present, value "
2954 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2955
2956 RetOps.insert(RetOps.begin() + 1,
2957 DAG.getConstant(LROffset, DL, MVT::i32, false));
2958
2959 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2960}
2961
2962SDValue
2963ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2964 bool isVarArg,
2966 const SmallVectorImpl<SDValue> &OutVals,
2967 const SDLoc &dl, SelectionDAG &DAG) const {
2968 // CCValAssign - represent the assignment of the return value to a location.
2969 SmallVector<CCValAssign, 16> RVLocs;
2970
2971 // CCState - Info about the registers and stack slots.
2972 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2973 *DAG.getContext());
2974
2975 // Analyze outgoing return values.
2976 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2977
2978 SDValue Glue;
2979 SmallVector<SDValue, 4> RetOps;
2980 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2981 bool isLittleEndian = Subtarget->isLittle();
2982
2983 MachineFunction &MF = DAG.getMachineFunction();
2984 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2985 AFI->setReturnRegsCount(RVLocs.size());
2986
2987 // Report error if cmse entry function returns structure through first ptr arg.
2988 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2989 // Note: using an empty SDLoc(), as the first line of the function is a
2990 // better place to report than the last line.
2991 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2993 "secure entry function would return value through pointer",
2994 SDLoc().getDebugLoc()));
2995 }
2996
2997 // Copy the result values into the output registers.
2998 for (unsigned i = 0, realRVLocIdx = 0;
2999 i != RVLocs.size();
3000 ++i, ++realRVLocIdx) {
3001 CCValAssign &VA = RVLocs[i];
3002 assert(VA.isRegLoc() && "Can only return in registers!");
3003
3004 SDValue Arg = OutVals[realRVLocIdx];
3005 bool ReturnF16 = false;
3006
3007 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
3008 // Half-precision return values can be returned like this:
3009 //
3010 // t11 f16 = fadd ...
3011 // t12: i16 = bitcast t11
3012 // t13: i32 = zero_extend t12
3013 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3014 //
3015 // to avoid code generation for bitcasts, we simply set Arg to the node
3016 // that produces the f16 value, t11 in this case.
3017 //
3018 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3019 SDValue ZE = Arg.getOperand(0);
3020 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3021 SDValue BC = ZE.getOperand(0);
3022 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3023 Arg = BC.getOperand(0);
3024 ReturnF16 = true;
3025 }
3026 }
3027 }
3028 }
3029
3030 switch (VA.getLocInfo()) {
3031 default: llvm_unreachable("Unknown loc info!");
3032 case CCValAssign::Full: break;
3033 case CCValAssign::BCvt:
3034 if (!ReturnF16)
3035 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3036 break;
3037 }
3038
3039 // Mask f16 arguments if this is a CMSE nonsecure entry.
3040 auto RetVT = Outs[realRVLocIdx].ArgVT;
3041 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3042 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3043 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3044 } else {
3045 auto LocBits = VA.getLocVT().getSizeInBits();
3046 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3047 SDValue Mask =
3048 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3049 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3050 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3051 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3052 }
3053 }
3054
3055 if (VA.needsCustom() &&
3056 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3057 if (VA.getLocVT() == MVT::v2f64) {
3058 // Extract the first half and return it in two registers.
3059 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3060 DAG.getConstant(0, dl, MVT::i32));
3061 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3062 DAG.getVTList(MVT::i32, MVT::i32), Half);
3063
3064 Chain =
3065 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3066 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3067 Glue = Chain.getValue(1);
3068 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3069 VA = RVLocs[++i]; // skip ahead to next loc
3070 Chain =
3071 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3072 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3073 Glue = Chain.getValue(1);
3074 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3075 VA = RVLocs[++i]; // skip ahead to next loc
3076
3077 // Extract the 2nd half and fall through to handle it as an f64 value.
3078 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3079 DAG.getConstant(1, dl, MVT::i32));
3080 }
3081 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3082 // available.
3083 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3084 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3085 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3086 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3087 Glue = Chain.getValue(1);
3088 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3089 VA = RVLocs[++i]; // skip ahead to next loc
3090 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3091 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3092 } else
3093 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3094
3095 // Guarantee that all emitted copies are
3096 // stuck together, avoiding something bad.
3097 Glue = Chain.getValue(1);
3098 RetOps.push_back(DAG.getRegister(
3099 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3100 }
3101 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3102 const MCPhysReg *I =
3103 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3104 if (I) {
3105 for (; *I; ++I) {
3106 if (ARM::GPRRegClass.contains(*I))
3107 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3108 else if (ARM::DPRRegClass.contains(*I))
3109 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3110 else
3111 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3112 }
3113 }
3114
3115 // Update chain and glue.
3116 RetOps[0] = Chain;
3117 if (Glue.getNode())
3118 RetOps.push_back(Glue);
3119
3120 // CPUs which aren't M-class use a special sequence to return from
3121 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3122 // though we use "subs pc, lr, #N").
3123 //
3124 // M-class CPUs actually use a normal return sequence with a special
3125 // (hardware-provided) value in LR, so the normal code path works.
3126 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3127 !Subtarget->isMClass()) {
3128 if (Subtarget->isThumb1Only())
3129 report_fatal_error("interrupt attribute is not supported in Thumb1");
3130 return LowerInterruptReturn(RetOps, dl, DAG);
3131 }
3132
3133 unsigned RetNode =
3134 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3135 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3136}
3137
3138bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3139 if (N->getNumValues() != 1)
3140 return false;
3141 if (!N->hasNUsesOfValue(1, 0))
3142 return false;
3143
3144 SDValue TCChain = Chain;
3145 SDNode *Copy = *N->user_begin();
3146 if (Copy->getOpcode() == ISD::CopyToReg) {
3147 // If the copy has a glue operand, we conservatively assume it isn't safe to
3148 // perform a tail call.
3149 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3150 return false;
3151 TCChain = Copy->getOperand(0);
3152 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3153 SDNode *VMov = Copy;
3154 // f64 returned in a pair of GPRs.
3155 SmallPtrSet<SDNode*, 2> Copies;
3156 for (SDNode *U : VMov->users()) {
3157 if (U->getOpcode() != ISD::CopyToReg)
3158 return false;
3159 Copies.insert(U);
3160 }
3161 if (Copies.size() > 2)
3162 return false;
3163
3164 for (SDNode *U : VMov->users()) {
3165 SDValue UseChain = U->getOperand(0);
3166 if (Copies.count(UseChain.getNode()))
3167 // Second CopyToReg
3168 Copy = U;
3169 else {
3170 // We are at the top of this chain.
3171 // If the copy has a glue operand, we conservatively assume it
3172 // isn't safe to perform a tail call.
3173 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3174 return false;
3175 // First CopyToReg
3176 TCChain = UseChain;
3177 }
3178 }
3179 } else if (Copy->getOpcode() == ISD::BITCAST) {
3180 // f32 returned in a single GPR.
3181 if (!Copy->hasOneUse())
3182 return false;
3183 Copy = *Copy->user_begin();
3184 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3185 return false;
3186 // If the copy has a glue operand, we conservatively assume it isn't safe to
3187 // perform a tail call.
3188 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3189 return false;
3190 TCChain = Copy->getOperand(0);
3191 } else {
3192 return false;
3193 }
3194
3195 bool HasRet = false;
3196 for (const SDNode *U : Copy->users()) {
3197 if (U->getOpcode() != ARMISD::RET_GLUE &&
3198 U->getOpcode() != ARMISD::INTRET_GLUE)
3199 return false;
3200 HasRet = true;
3201 }
3202
3203 if (!HasRet)
3204 return false;
3205
3206 Chain = TCChain;
3207 return true;
3208}
3209
3210bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3211 if (!Subtarget->supportsTailCall())
3212 return false;
3213
3214 if (!CI->isTailCall())
3215 return false;
3216
3217 return true;
3218}
3219
3220 // Writing a 64-bit value, so we need to split it into two 32-bit values
3221 // first, and pass the lower and high parts through.
3222 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3223 SDLoc DL(Op);
3224 SDValue WriteValue = Op->getOperand(2);
3225
3226 // This function is only supposed to be called for i64 type argument.
3227 assert(WriteValue.getValueType() == MVT::i64
3228 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3229
3230 SDValue Lo, Hi;
3231 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3232 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3233 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3234}
3235
3236// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3237// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3238// one of the above mentioned nodes. It has to be wrapped because otherwise
3239// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3240// be used to form addressing mode. These wrapped nodes will be selected
3241// into MOVi.
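// For example, lowering a constant-pool reference CP produces
//   ARMISD::Wrapper (TargetConstantPool CP)
// and it is this Wrapper node, rather than the raw TargetConstantPool, that
// the instruction selector matches when forming the address (e.g. via MOVi or
// a literal-pool load).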
3242SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3243 SelectionDAG &DAG) const {
3244 EVT PtrVT = Op.getValueType();
3245 // FIXME there is no actual debug info here
3246 SDLoc dl(Op);
3247 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3248 SDValue Res;
3249
3250 // When generating execute-only code Constant Pools must be promoted to the
3251 // global data section. It's a bit ugly that we can't share them across basic
3252 // blocks, but this way we guarantee that execute-only behaves correctly with
3253 // position-independent addressing modes.
3254 if (Subtarget->genExecuteOnly()) {
3255 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3256 auto *T = CP->getType();
3257 auto C = const_cast<Constant*>(CP->getConstVal());
3258 auto M = DAG.getMachineFunction().getFunction().getParent();
3259 auto GV = new GlobalVariable(
3260 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3261 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3262 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3263 Twine(AFI->createPICLabelUId())
3264 );
3266 dl, PtrVT);
3267 return LowerGlobalAddress(GA, DAG);
3268 }
3269
3270 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3271 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3272 Align CPAlign = CP->getAlign();
3273 if (Subtarget->isThumb1Only())
3274 CPAlign = std::max(CPAlign, Align(4));
3275 if (CP->isMachineConstantPoolEntry())
3276 Res =
3277 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3278 else
3279 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3280 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3281}
3282
3283 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3284 // If we don't have a 32-bit pc-relative branch instruction then the jump
3285 // table consists of block addresses. Usually this is inline, but for
3286 // execute-only it must be placed out-of-line.
3287 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3288 return MachineJumpTableInfo::EK_BlockAddress;
3289 return MachineJumpTableInfo::EK_Inline;
3290}
3291
3292SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3293 SelectionDAG &DAG) const {
3294 MachineFunction &MF = DAG.getMachineFunction();
3295 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3296 unsigned ARMPCLabelIndex = 0;
3297 SDLoc DL(Op);
3298 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3299 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3300 SDValue CPAddr;
3301 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3302 if (!IsPositionIndependent) {
3303 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3304 } else {
3305 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3306 ARMPCLabelIndex = AFI->createPICLabelUId();
3307 ARMConstantPoolValue *CPV =
3308 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3309 ARMCP::CPBlockAddress, PCAdj);
3310 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3311 }
3312 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3313 SDValue Result = DAG.getLoad(
3314 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3315 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3316 if (!IsPositionIndependent)
3317 return Result;
3318 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3319 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3320}
3321
3322/// Convert a TLS address reference into the correct sequence of loads
3323/// and calls to compute the variable's address for Darwin, and return an
3324/// SDValue containing the final node.
3325
3326/// Darwin only has one TLS scheme which must be capable of dealing with the
3327/// fully general situation, in the worst case. This means:
3328/// + "extern __thread" declaration.
3329/// + Defined in a possibly unknown dynamic library.
3330///
3331/// The general system is that each __thread variable has a [3 x i32] descriptor
3332/// which contains information used by the runtime to calculate the address. The
3333/// only part of this the compiler needs to know about is the first word, which
3334/// contains a function pointer that must be called with the address of the
3335/// entire descriptor in "r0".
3336///
3337/// Since this descriptor may be in a different unit, in general access must
3338/// proceed along the usual ARM rules. A common sequence to produce is:
3339///
3340/// movw rT1, :lower16:_var$non_lazy_ptr
3341/// movt rT1, :upper16:_var$non_lazy_ptr
3342/// ldr r0, [rT1]
3343/// ldr rT2, [r0]
3344/// blx rT2
3345/// [...address now in r0...]
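///
/// Illustrative sketch (the struct and field names here are hypothetical, not
/// from the runtime headers): at the C level the lowering amounts to
///
///   struct TLVDescriptor { void *(*thunk)(struct TLVDescriptor *); void *opaque[2]; };
///   void *addr = var_descriptor->thunk(var_descriptor);   // the BLX above
///
/// i.e. the first word of the descriptor is called with r0 pointing at the
/// descriptor itself, and the variable's address comes back in r0.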
3346SDValue
3347ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3348 SelectionDAG &DAG) const {
3349 assert(Subtarget->isTargetDarwin() &&
3350 "This function expects a Darwin target");
3351 SDLoc DL(Op);
3352
3353 // The first step is to get the address of the actual global symbol. This is where
3354 // the TLS descriptor lives.
3355 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3356
3357 // The first entry in the descriptor is a function pointer that we must call
3358 // to obtain the address of the variable.
3359 SDValue Chain = DAG.getEntryNode();
3360 SDValue FuncTLVGet = DAG.getLoad(
3361 MVT::i32, DL, Chain, DescAddr,
3362 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3363 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3364 MachineMemOperand::MOInvariant);
3365 Chain = FuncTLVGet.getValue(1);
3366
3367 MachineFunction &F = DAG.getMachineFunction();
3368 MachineFrameInfo &MFI = F.getFrameInfo();
3369 MFI.setAdjustsStack(true);
3370
3371 // TLS calls preserve all registers except those that absolutely must be
3372 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3373 // silly).
3374 auto TRI =
3375 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3376 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3377 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3378
3379 // Finally, we can make the call. This is just a degenerate version of a
3380 // normal ARM call node: r0 takes the address of the descriptor, and
3381 // returns the address of the variable in this thread.
3382 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3383 Chain =
3384 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3385 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3386 DAG.getRegisterMask(Mask), Chain.getValue(1));
3387 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3388}
3389
3390SDValue
3391ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3392 SelectionDAG &DAG) const {
3393 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3394
3395 SDValue Chain = DAG.getEntryNode();
3396 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3397 SDLoc DL(Op);
3398
3399 // Load the current TEB (thread environment block)
3400 SDValue Ops[] = {Chain,
3401 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3402 DAG.getTargetConstant(15, DL, MVT::i32),
3403 DAG.getTargetConstant(0, DL, MVT::i32),
3404 DAG.getTargetConstant(13, DL, MVT::i32),
3405 DAG.getTargetConstant(0, DL, MVT::i32),
3406 DAG.getTargetConstant(2, DL, MVT::i32)};
3407 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3408 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3409
3410 SDValue TEB = CurrentTEB.getValue(0);
3411 Chain = CurrentTEB.getValue(1);
3412
3413 // Load the ThreadLocalStoragePointer from the TEB
3414 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3415 SDValue TLSArray =
3416 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3417 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3418
3419 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3420 // offset into the TLSArray.
3421
3422 // Load the TLS index from the C runtime
3423 SDValue TLSIndex =
3424 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3425 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3426 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3427
3428 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3429 DAG.getConstant(2, DL, MVT::i32));
3430 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3431 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3432 MachinePointerInfo());
3433
3434 // Get the offset of the start of the .tls section (section base)
3435 const auto *GA = cast<GlobalAddressSDNode>(Op);
3436 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3437 SDValue Offset = DAG.getLoad(
3438 PtrVT, DL, Chain,
3439 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3440 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3441 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3442
3443 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3444}
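// In rough C terms (illustrative only; the local names are not from the
// Windows headers), the sequence built above computes:
//
//   char **TlsArray = *(char ***)(TEB + 0x2c); // ThreadLocalStoragePointer
//   char  *TlsBase  = TlsArray[_tls_index];    // this module's TLS block
//   void  *Addr     = TlsBase + SECREL32(var); // section-relative offset
//
// The MRC above reads CP15 c13/c0/2 (TPIDRURW), which Windows uses to hold
// the TEB pointer.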
3445
3446// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3447SDValue
3448ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3449 SelectionDAG &DAG) const {
3450 SDLoc dl(GA);
3451 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3452 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3453 MachineFunction &MF = DAG.getMachineFunction();
3454 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3455 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3456 ARMConstantPoolValue *CPV =
3457 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3458 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3459 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3460 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3461 Argument = DAG.getLoad(
3462 PtrVT, dl, DAG.getEntryNode(), Argument,
3463 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3464 SDValue Chain = Argument.getValue(1);
3465
3466 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3467 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3468
3469 // call __tls_get_addr.
3470 ArgListTy Args;
3471 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3472
3473 // FIXME: is there useful debug info available here?
3474 TargetLowering::CallLoweringInfo CLI(DAG);
3475 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3476 CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
3477 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3478
3479 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3480 return CallResult.first;
3481}
3482
3483// Lower ISD::GlobalTLSAddress using the "initial exec" or
3484// "local exec" model.
3485SDValue
3486ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3487 SelectionDAG &DAG,
3488 TLSModel::Model model) const {
3489 const GlobalValue *GV = GA->getGlobal();
3490 SDLoc dl(GA);
3491 SDValue Offset;
3492 SDValue Chain = DAG.getEntryNode();
3493 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3494 // Get the Thread Pointer
3495 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3496
3497 if (model == TLSModel::InitialExec) {
3498 MachineFunction &MF = DAG.getMachineFunction();
3499 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3500 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3501 // Initial exec model.
3502 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3503 ARMConstantPoolValue *CPV =
3504 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3505 ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
3506 true);
3507 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3508 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3509 Offset = DAG.getLoad(
3510 PtrVT, dl, Chain, Offset,
3511 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3512 Chain = Offset.getValue(1);
3513
3514 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3515 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3516
3517 Offset = DAG.getLoad(
3518 PtrVT, dl, Chain, Offset,
3519 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3520 } else {
3521 // local exec model
3522 assert(model == TLSModel::LocalExec);
3523 ARMConstantPoolValue *CPV =
3524 ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
3525 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3526 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3527 Offset = DAG.getLoad(
3528 PtrVT, dl, Chain, Offset,
3529 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3530 }
3531
3532 // The address of the thread local variable is the add of the thread
3533 // pointer with the offset of the variable.
3534 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3535}
3536
3537SDValue
3538ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3539 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3540 if (DAG.getTarget().useEmulatedTLS())
3541 return LowerToTLSEmulatedModel(GA, DAG);
3542
3543 if (Subtarget->isTargetDarwin())
3544 return LowerGlobalTLSAddressDarwin(Op, DAG);
3545
3546 if (Subtarget->isTargetWindows())
3547 return LowerGlobalTLSAddressWindows(Op, DAG);
3548
3549 // TODO: implement the "local dynamic" model
3550 assert(Subtarget->isTargetELF() && "Only ELF implemented here");
3551 TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal());
3552
3553 switch (model) {
3554 case TLSModel::GeneralDynamic:
3555 case TLSModel::LocalDynamic:
3556 return LowerToTLSGeneralDynamicModel(GA, DAG);
3557 case TLSModel::InitialExec:
3558 case TLSModel::LocalExec:
3559 return LowerToTLSExecModels(GA, DAG, model);
3560 }
3561 llvm_unreachable("bogus TLS model");
3562}
3563
3564/// Return true if all users of V are within function F, looking through
3565/// ConstantExprs.
3566static bool allUsersAreInFunction(const Value *V, const Function *F) {
3567 SmallVector<const User*,4> Worklist(V->users());
3568 while (!Worklist.empty()) {
3569 auto *U = Worklist.pop_back_val();
3570 if (isa<ConstantExpr>(U)) {
3571 append_range(Worklist, U->users());
3572 continue;
3573 }
3574
3575 auto *I = dyn_cast<Instruction>(U);
3576 if (!I || I->getParent()->getParent() != F)
3577 return false;
3578 }
3579 return true;
3580}
3581
3582 static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
3583 const GlobalValue *GV, SelectionDAG &DAG,
3584 EVT PtrVT, const SDLoc &dl) {
3585 // If we're creating a pool entry for a constant global with unnamed address,
3586 // and the global is small enough, we can emit it inline into the constant pool
3587 // to save ourselves an indirection.
3588 //
3589 // This is a win if the constant is only used in one function (so it doesn't
3590 // need to be duplicated) or duplicating the constant wouldn't increase code
3591 // size (implying the constant is no larger than 4 bytes).
3592 const Function &F = DAG.getMachineFunction().getFunction();
3593
3594 // We rely on this decision to inline being idempotent and unrelated to the
3595 // use-site. We know that if we inline a variable at one use site, we'll
3596 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3597 // doesn't know about this optimization, so bail out if it's enabled; otherwise
3598 // we could decide to inline here (and thus never emit the GV) while fast-isel
3599 // generated code still requires the GV.
3600 if (!EnableConstpoolPromotion ||
3601 DAG.getMachineFunction().getTarget().Options.EnableFastISel)
3602 return SDValue();
3603
3604 auto *GVar = dyn_cast<GlobalVariable>(GV);
3605 if (!GVar || !GVar->hasInitializer() ||
3606 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3607 !GVar->hasLocalLinkage())
3608 return SDValue();
3609
3610 // If we inline a value that contains relocations, we move the relocations
3611 // from .data to .text. This is not allowed in position-independent code.
3612 auto *Init = GVar->getInitializer();
3613 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3614 Init->needsDynamicRelocation())
3615 return SDValue();
3616
3617 // The constant islands pass can only really deal with alignment requests
3618 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3619 // any type wanting greater alignment requirements than 4 bytes. We also
3620 // can only promote constants that are multiples of 4 bytes in size or
3621 // are paddable to a multiple of 4. Currently we only try to pad constants
3622 // that are strings for simplicity.
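// For example (illustrative): a 6-byte string initializer gets two zero bytes
// of padding so its 8-byte pool entry keeps 4-byte alignment, whereas a
// 5-byte non-string initializer is simply not promoted.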
3623 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3624 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3625 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3626 unsigned RequiredPadding = 4 - (Size % 4);
3627 bool PaddingPossible =
3628 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3629 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3630 Size == 0)
3631 return SDValue();
3632
3633 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3634 MachineFunction &MF = DAG.getMachineFunction();
3635 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3636
3637 // We can't bloat the constant pool too much, else the ConstantIslands pass
3638 // may fail to converge. If we haven't promoted this global yet (it may have
3639 // multiple uses), and promoting it would increase the constant pool size (Sz
3640 // > 4), ensure we have space to do so up to MaxTotal.
3641 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3642 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3643 ConstpoolPromotionMaxTotal)
3644 return SDValue();
3645
3646 // This is only valid if all users are in a single function; we can't clone
3647 // the constant in general. The LLVM IR unnamed_addr allows merging
3648 // constants, but not cloning them.
3649 //
3650 // We could potentially allow cloning if we could prove all uses of the
3651 // constant in the current function don't care about the address, like
3652 // printf format strings. But that isn't implemented for now.
3653 if (!allUsersAreInFunction(GVar, &F))
3654 return SDValue();
3655
3656 // We're going to inline this global. Pad it out if needed.
3657 if (RequiredPadding != 4) {
3658 StringRef S = CDAInit->getAsString();
3659
3660 SmallVector<uint8_t,16> V(S.size());
3661 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3662 while (RequiredPadding--)
3663 V.push_back(0);
3664 Init = ConstantDataArray::get(*DAG.getContext(), V);
3665 }
3666
3667 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3668 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3669 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3670 AFI->markGlobalAsPromotedToConstantPool(GVar);
3671 AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
3672 PaddedSize - 4);
3673 }
3674 ++NumConstpoolPromoted;
3675 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3676}
3677
3678 static bool isReadOnly(const GlobalValue *GV) {
3679 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3680 if (!(GV = GA->getAliaseeObject()))
3681 return false;
3682 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3683 return V->isConstant();
3684 return isa<Function>(GV);
3685}
3686
3687SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3688 SelectionDAG &DAG) const {
3689 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3690 default: llvm_unreachable("unknown object format");
3691 case Triple::COFF:
3692 return LowerGlobalAddressWindows(Op, DAG);
3693 case Triple::ELF:
3694 return LowerGlobalAddressELF(Op, DAG);
3695 case Triple::MachO:
3696 return LowerGlobalAddressDarwin(Op, DAG);
3697 }
3698}
3699
3700SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3701 SelectionDAG &DAG) const {
3702 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3703 SDLoc dl(Op);
3704 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3705 bool IsRO = isReadOnly(GV);
3706
3707 // promoteToConstantPool only if not generating XO text section
3708 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3709 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3710 return V;
3711
3712 if (isPositionIndependent()) {
3713 SDValue G = DAG.getTargetGlobalAddress(
3714 GV, dl, PtrVT, 0, GV->isDSOLocal() ? 0 : ARMII::MO_GOT);
3715 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3716 if (!GV->isDSOLocal())
3717 Result =
3718 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3719 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3720 return Result;
3721 } else if (Subtarget->isROPI() && IsRO) {
3722 // PC-relative.
3723 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3724 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3725 return Result;
3726 } else if (Subtarget->isRWPI() && !IsRO) {
3727 // SB-relative.
3728 SDValue RelAddr;
3729 if (Subtarget->useMovt()) {
3730 ++NumMovwMovt;
3731 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3732 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3733 } else { // use literal pool for address constant
3734 ARMConstantPoolValue *CPV =
3735 ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
3736 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3737 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3738 RelAddr = DAG.getLoad(
3739 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3740 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3741 }
3742 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3743 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3744 return Result;
3745 }
3746
3747 // If we have T2 ops, we can materialize the address directly via movt/movw
3748 // pair. This is always cheaper. If we need to generate execute-only code and
3749 // only have Thumb1 available, we can't use a constant pool and are forced to
3750 // use immediate relocations.
3751 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3752 if (Subtarget->useMovt())
3753 ++NumMovwMovt;
3754 // FIXME: Once remat is capable of dealing with instructions with register
3755 // operands, expand this into two nodes.
3756 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3757 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3758 } else {
3759 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3760 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3761 return DAG.getLoad(
3762 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3763 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3764 }
3765}
3766
3767SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3768 SelectionDAG &DAG) const {
3769 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3770 "ROPI/RWPI not currently supported for Darwin");
3771 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3772 SDLoc dl(Op);
3773 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3774
3775 if (Subtarget->useMovt())
3776 ++NumMovwMovt;
3777
3778 // FIXME: Once remat is capable of dealing with instructions with register
3779 // operands, expand this into multiple nodes
3780 unsigned Wrapper =
3781 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3782
3783 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3784 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3785
3786 if (Subtarget->isGVIndirectSymbol(GV))
3787 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3788 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3789 return Result;
3790}
3791
3792SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3793 SelectionDAG &DAG) const {
3794 assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
3795 assert(Subtarget->useMovt() &&
3796 "Windows on ARM expects to use movw/movt");
3797 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3798 "ROPI/RWPI not currently supported for Windows");
3799
3800 const TargetMachine &TM = getTargetMachine();
3801 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3802 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3803 if (GV->hasDLLImportStorageClass())
3804 TargetFlags = ARMII::MO_DLLIMPORT;
3805 else if (!TM.shouldAssumeDSOLocal(GV))
3806 TargetFlags = ARMII::MO_COFFSTUB;
3807 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3808 SDValue Result;
3809 SDLoc DL(Op);
3810
3811 ++NumMovwMovt;
3812
3813 // FIXME: Once remat is capable of dealing with instructions with register
3814 // operands, expand this into two nodes.
3815 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3816 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3817 TargetFlags));
3818 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3819 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3820 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3821 return Result;
3822}
3823
3824SDValue
3825ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3826 SDLoc dl(Op);
3827 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3828 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3829 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3830 Op.getOperand(1), Val);
3831}
3832
3833SDValue
3834ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3835 SDLoc dl(Op);
3836 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3837 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3838}
3839
3840SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3841 SelectionDAG &DAG) const {
3842 SDLoc dl(Op);
3843 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3844 Op.getOperand(0));
3845}
3846
3847SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3848 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3849 unsigned IntNo =
3850 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3851 switch (IntNo) {
3852 default:
3853 return SDValue(); // Don't custom lower most intrinsics.
3854 case Intrinsic::arm_gnu_eabi_mcount: {
3855 MachineFunction &MF = DAG.getMachineFunction();
3856 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3857 SDLoc dl(Op);
3858 SDValue Chain = Op.getOperand(0);
3859 // call "\01__gnu_mcount_nc"
3860 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3861 const uint32_t *Mask =
3862 ARI->getCallPreservedMask(DAG.getMachineFunction(), CallingConv::C);
3863 assert(Mask && "Missing call preserved mask for calling convention");
3864 // Mark LR an implicit live-in.
3865 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3866 SDValue ReturnAddress =
3867 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3868 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3869 SDValue Callee =
3870 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3871 SDValue RegisterMask = DAG.getRegisterMask(Mask);
3872 if (Subtarget->isThumb())
3873 return SDValue(
3874 DAG.getMachineNode(
3875 ARM::tBL_PUSHLR, dl, ResultTys,
3876 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3877 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3878 0);
3879 return SDValue(
3880 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3881 {ReturnAddress, Callee, RegisterMask, Chain}),
3882 0);
3883 }
3884 }
3885}
3886
3887SDValue
3888ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3889 const ARMSubtarget *Subtarget) const {
3890 unsigned IntNo = Op.getConstantOperandVal(0);
3891 SDLoc dl(Op);
3892 switch (IntNo) {
3893 default: return SDValue(); // Don't custom lower most intrinsics.
3894 case Intrinsic::thread_pointer: {
3895 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3896 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3897 }
3898 case Intrinsic::arm_cls: {
3899 const SDValue &Operand = Op.getOperand(1);
3900 const EVT VTy = Op.getValueType();
3901 SDValue SRA =
3902 DAG.getNode(ISD::SRA, dl, VTy, Operand, DAG.getConstant(31, dl, VTy));
3903 SDValue XOR = DAG.getNode(ISD::XOR, dl, VTy, SRA, Operand);
3904 SDValue SHL =
3905 DAG.getNode(ISD::SHL, dl, VTy, XOR, DAG.getConstant(1, dl, VTy));
3906 SDValue OR =
3907 DAG.getNode(ISD::OR, dl, VTy, SHL, DAG.getConstant(1, dl, VTy));
3908 SDValue Result = DAG.getNode(ISD::CTLZ, dl, VTy, OR);
3909 return Result;
3910 }
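// The arm_cls expansion above relies on the identity
//   cls(x) == ctlz((((x >> 31) ^ x) << 1) | 1)
// e.g. (illustrative) x = 0xFFFFFF12: (x >> 31) ^ x = 0xED, shifted and or'd
// to 0x1DB, and ctlz(0x1DB) = 23, matching the 23 sign-copy bits below bit 31.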
3911 case Intrinsic::arm_cls64: {
3912 // cls(x) = if cls(hi(x)) != 31 then cls(hi(x))
3913 // else 31 + clz(if hi(x) == 0 then lo(x) else not(lo(x)))
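// Worked example (illustrative): for x = 0xFFFFFFFFFFFFFF00 the high word
// gives cls(hi) = 31 and hi != 0, so the result is 31 + clz(~lo)
// = 31 + clz(0x000000FF) = 31 + 24 = 55 sign-copy bits after bit 63.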
3914 const SDValue &Operand = Op.getOperand(1);
3915 const EVT VTy = Op.getValueType();
3916 SDValue Lo, Hi;
3917 std::tie(Lo, Hi) = DAG.SplitScalar(Operand, dl, VTy, VTy);
3918 SDValue Constant0 = DAG.getConstant(0, dl, VTy);
3919 SDValue Constant1 = DAG.getConstant(1, dl, VTy);
3920 SDValue Constant31 = DAG.getConstant(31, dl, VTy);
3921 SDValue SRAHi = DAG.getNode(ISD::SRA, dl, VTy, Hi, Constant31);
3922 SDValue XORHi = DAG.getNode(ISD::XOR, dl, VTy, SRAHi, Hi);
3923 SDValue SHLHi = DAG.getNode(ISD::SHL, dl, VTy, XORHi, Constant1);
3924 SDValue ORHi = DAG.getNode(ISD::OR, dl, VTy, SHLHi, Constant1);
3925 SDValue CLSHi = DAG.getNode(ISD::CTLZ, dl, VTy, ORHi);
3926 SDValue CheckLo =
3927 DAG.getSetCC(dl, MVT::i1, CLSHi, Constant31, ISD::CondCode::SETEQ);
3928 SDValue HiIsZero =
3929 DAG.getSetCC(dl, MVT::i1, Hi, Constant0, ISD::CondCode::SETEQ);
3930 SDValue AdjustedLo =
3931 DAG.getSelect(dl, VTy, HiIsZero, Lo, DAG.getNOT(dl, Lo, VTy));
3932 SDValue CLZAdjustedLo = DAG.getNode(ISD::CTLZ, dl, VTy, AdjustedLo);
3933 SDValue Result =
3934 DAG.getSelect(dl, VTy, CheckLo,
3935 DAG.getNode(ISD::ADD, dl, VTy, CLZAdjustedLo, Constant31), CLSHi);
3936 return Result;
3937 }
3938 case Intrinsic::eh_sjlj_lsda: {
3939 MachineFunction &MF = DAG.getMachineFunction();
3940 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3941 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3942 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3943 SDValue CPAddr;
3944 bool IsPositionIndependent = isPositionIndependent();
3945 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3946 ARMConstantPoolValue *CPV =
3947 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3948 ARMCP::CPLSDA, PCAdj);
3949 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3950 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3951 SDValue Result = DAG.getLoad(
3952 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3953 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3954
3955 if (IsPositionIndependent) {
3956 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3957 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3958 }
3959 return Result;
3960 }
3961 case Intrinsic::arm_neon_vabs:
3962 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3963 Op.getOperand(1));
3964 case Intrinsic::arm_neon_vabds:
3965 if (Op.getValueType().isInteger())
3966 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3967 Op.getOperand(1), Op.getOperand(2));
3968 return SDValue();
3969 case Intrinsic::arm_neon_vabdu:
3970 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3971 Op.getOperand(1), Op.getOperand(2));
3972 case Intrinsic::arm_neon_vmulls:
3973 case Intrinsic::arm_neon_vmullu: {
3974 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3975 ? ARMISD::VMULLs : ARMISD::VMULLu;
3976 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3977 Op.getOperand(1), Op.getOperand(2));
3978 }
3979 case Intrinsic::arm_neon_vminnm:
3980 case Intrinsic::arm_neon_vmaxnm: {
3981 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3982 ? ISD::FMINNUM : ISD::FMAXNUM;
3983 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3984 Op.getOperand(1), Op.getOperand(2));
3985 }
3986 case Intrinsic::arm_neon_vminu:
3987 case Intrinsic::arm_neon_vmaxu: {
3988 if (Op.getValueType().isFloatingPoint())
3989 return SDValue();
3990 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3991 ? ISD::UMIN : ISD::UMAX;
3992 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3993 Op.getOperand(1), Op.getOperand(2));
3994 }
3995 case Intrinsic::arm_neon_vmins:
3996 case Intrinsic::arm_neon_vmaxs: {
3997 // v{min,max}s is overloaded between signed integers and floats.
3998 if (!Op.getValueType().isFloatingPoint()) {
3999 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4000 ? ISD::SMIN : ISD::SMAX;
4001 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4002 Op.getOperand(1), Op.getOperand(2));
4003 }
4004 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
4005 ? ISD::FMINIMUM : ISD::FMAXIMUM;
4006 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
4007 Op.getOperand(1), Op.getOperand(2));
4008 }
4009 case Intrinsic::arm_neon_vtbl1:
4010 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
4011 Op.getOperand(1), Op.getOperand(2));
4012 case Intrinsic::arm_neon_vtbl2:
4013 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
4014 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4015 case Intrinsic::arm_mve_pred_i2v:
4016 case Intrinsic::arm_mve_pred_v2i:
4017 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
4018 Op.getOperand(1));
4019 case Intrinsic::arm_mve_vreinterpretq:
4020 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
4021 Op.getOperand(1));
4022 case Intrinsic::arm_mve_lsll:
4023 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
4024 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4025 case Intrinsic::arm_mve_asrl:
4026 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
4027 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4028 }
4029}
4030
4031 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
4032 const ARMSubtarget *Subtarget) {
4033 SDLoc dl(Op);
4034 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
4035 if (SSID == SyncScope::SingleThread)
4036 return Op;
4037
4038 if (!Subtarget->hasDataBarrier()) {
4039 // Some ARMv6 cpus can support data barriers with an mcr instruction.
4040 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
4041 // here.
4042 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4043 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4044 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4045 DAG.getConstant(0, dl, MVT::i32));
4046 }
4047
4048 AtomicOrdering Ord =
4049 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4050 ARM_MB::MemBOpt Domain = ARM_MB::ISH;
4051 if (Subtarget->isMClass()) {
4052 // Only a full system barrier exists in the M-class architectures.
4053 Domain = ARM_MB::SY;
4054 } else if (Subtarget->preferISHSTBarriers() &&
4055 Ord == AtomicOrdering::Release) {
4056 // Swift happens to implement ISHST barriers in a way that's compatible with
4057 // Release semantics but weaker than ISH so we'd be fools not to use
4058 // it. Beware: other processors probably don't!
4059 Domain = ARM_MB::ISHST;
4060 }
4061
4062 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4063 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4064 DAG.getConstant(Domain, dl, MVT::i32));
4065}
4066
4067 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
4068 const ARMSubtarget *Subtarget) {
4069 // ARM pre v5TE and Thumb1 does not have preload instructions.
4070 if (!(Subtarget->isThumb2() ||
4071 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4072 // Just preserve the chain.
4073 return Op.getOperand(0);
4074
4075 SDLoc dl(Op);
4076 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4077 if (!isRead &&
4078 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4079 // ARMv7 with MP extension has PLDW.
4080 return Op.getOperand(0);
4081
4082 unsigned isData = Op.getConstantOperandVal(4);
4083 if (Subtarget->isThumb()) {
4084 // Invert the bits.
4085 isRead = ~isRead & 1;
4086 isData = ~isData & 1;
4087 }
4088
4089 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4090 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4091 DAG.getConstant(isData, dl, MVT::i32));
4092}
4093
4094 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
4095 MachineFunction &MF = DAG.getMachineFunction();
4096 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4097
4098 // vastart just stores the address of the VarArgsFrameIndex slot into the
4099 // memory location argument.
4100 SDLoc dl(Op);
4101 EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
4102 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4103 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4104 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4105 MachinePointerInfo(SV));
4106}
4107
4108SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4109 CCValAssign &NextVA,
4110 SDValue &Root,
4111 SelectionDAG &DAG,
4112 const SDLoc &dl) const {
4113 MachineFunction &MF = DAG.getMachineFunction();
4114 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4115
4116 const TargetRegisterClass *RC;
4117 if (AFI->isThumb1OnlyFunction())
4118 RC = &ARM::tGPRRegClass;
4119 else
4120 RC = &ARM::GPRRegClass;
4121
4122 // Transform the arguments stored in physical registers into virtual ones.
4123 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4124 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4125
4126 SDValue ArgValue2;
4127 if (NextVA.isMemLoc()) {
4128 MachineFrameInfo &MFI = MF.getFrameInfo();
4129 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4130
4131 // Create load node to retrieve arguments from the stack.
4132 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4133 ArgValue2 = DAG.getLoad(
4134 MVT::i32, dl, Root, FIN,
4135 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4136 } else {
4137 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4138 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4139 }
4140 if (!Subtarget->isLittle())
4141 std::swap (ArgValue, ArgValue2);
4142 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4143}
4144
4145// The remaining GPRs hold either the beginning of variable-argument
4146// data, or the beginning of an aggregate passed by value (usually
4147// byval). Either way, we allocate stack slots adjacent to the data
4148// provided by our caller, and store the unallocated registers there.
4149// If this is a variadic function, the va_list pointer will begin with
4150// these values; otherwise, this reassembles a (byval) structure that
4151// was split between registers and memory.
4152// Return: The frame index registers were stored into.
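// For example (illustrative): for `void f(int a, ...)` only r0 is consumed by
// the fixed argument, so r1-r3 are spilled here right below the incoming
// stack arguments; va_arg can then walk the register-save area and the
// caller's stack area as one contiguous block.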
4153int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4154 const SDLoc &dl, SDValue &Chain,
4155 const Value *OrigArg,
4156 unsigned InRegsParamRecordIdx,
4157 int ArgOffset, unsigned ArgSize) const {
4158 // Currently, two use-cases are possible:
4159 // Case #1. Non-var-args function, and we meet the first byval parameter.
4160 // Set up the first unallocated register as the first byval register;
4161 // eat all remaining registers
4162 // (these two actions are performed by the HandleByVal method).
4163 // Then, here, we initialize the stack frame with
4164 // "store-reg" instructions.
4165 // Case #2. Var-args function that doesn't contain byval parameters.
4166 // The same: eat all remaining unallocated registers and
4167 // initialize the stack frame.
4168
4169 MachineFunction &MF = DAG.getMachineFunction();
4170 MachineFrameInfo &MFI = MF.getFrameInfo();
4171 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4172 unsigned RBegin, REnd;
4173 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4174 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4175 } else {
4176 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4177 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4178 REnd = ARM::R4;
4179 }
4180
4181 if (REnd != RBegin)
4182 ArgOffset = -4 * (ARM::R4 - RBegin);
4183
4184 auto PtrVT = getPointerTy(DAG.getDataLayout());
4185 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4186 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4187
4188 SmallVector<SDValue, 4> MemOps;
4189 const TargetRegisterClass *RC =
4190 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4191
4192 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4193 Register VReg = MF.addLiveIn(Reg, RC);
4194 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4195 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4196 MachinePointerInfo(OrigArg, 4 * i));
4197 MemOps.push_back(Store);
4198 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4199 }
4200
4201 if (!MemOps.empty())
4202 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4203 return FrameIndex;
4204}
4205
4206 // Set up the stack frame that the va_list pointer will start from.
4207void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4208 const SDLoc &dl, SDValue &Chain,
4209 unsigned ArgOffset,
4210 unsigned TotalArgRegsSaveSize,
4211 bool ForceMutable) const {
4212 MachineFunction &MF = DAG.getMachineFunction();
4213 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4214
4215 // Try to store any remaining integer argument regs
4216 // to their spots on the stack so that they may be loaded by dereferencing
4217 // the result of va_next.
4218 // If there are no regs to be stored, just point the address after the last
4219 // argument passed via the stack.
4220 int FrameIndex = StoreByValRegs(
4221 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4222 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4223 AFI->setVarArgsFrameIndex(FrameIndex);
4224}
4225
4226bool ARMTargetLowering::splitValueIntoRegisterParts(
4227 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4228 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4229 EVT ValueVT = Val.getValueType();
4230 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4231 unsigned ValueBits = ValueVT.getSizeInBits();
4232 unsigned PartBits = PartVT.getSizeInBits();
4233 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4234 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4235 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4236 Parts[0] = Val;
4237 return true;
4238 }
4239 return false;
4240}
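// Illustrative example: an f16 argument with value 1.0 (bit pattern 0x3C00)
// is bitcast to i16, any-extended to i32 and bitcast to f32, so it occupies
// the low 16 bits of the single-precision register while the upper 16 bits
// are unspecified. joinRegisterPartsIntoValue below performs the inverse
// truncation when reading such a value back.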
4241
4242SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4243 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4244 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4245 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4246 unsigned ValueBits = ValueVT.getSizeInBits();
4247 unsigned PartBits = PartVT.getSizeInBits();
4248 SDValue Val = Parts[0];
4249
4250 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4251 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4252 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4253 return Val;
4254 }
4255 return SDValue();
4256}
4257
4258SDValue ARMTargetLowering::LowerFormalArguments(
4259 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4260 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4261 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4262 MachineFunction &MF = DAG.getMachineFunction();
4263 MachineFrameInfo &MFI = MF.getFrameInfo();
4264
4265 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4266
4267 // Assign locations to all of the incoming arguments.
4268 SmallVector<CCValAssign, 16> ArgLocs;
4269 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4270 *DAG.getContext());
4271 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4273 Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
4274 unsigned CurArgIdx = 0;
4275
4276 // Initially ArgRegsSaveSize is zero.
4277 // Then we increase this value each time we meet byval parameter.
4278 // We also increase this value in case of varargs function.
4279 AFI->setArgRegsSaveSize(0);
4280
4281 // Calculate the amount of stack space that we need to allocate to store
4282 // byval and variadic arguments that are passed in registers.
4283 // We need to know this before we allocate the first byval or variadic
4284 // argument, as they will be allocated a stack slot below the CFA (Canonical
4285 // Frame Address, the stack pointer at entry to the function).
4286 unsigned ArgRegBegin = ARM::R4;
4287 for (const CCValAssign &VA : ArgLocs) {
4288 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4289 break;
4290
4291 unsigned Index = VA.getValNo();
4292 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4293 if (!Flags.isByVal())
4294 continue;
4295
4296 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4297 unsigned RBegin, REnd;
4298 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4299 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4300
4301 CCInfo.nextInRegsParam();
4302 }
4303 CCInfo.rewindByValRegsInfo();
4304
4305 int lastInsIndex = -1;
4306 if (isVarArg && MFI.hasVAStart()) {
4307 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4308 if (RegIdx != std::size(GPRArgRegs))
4309 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4310 }
4311
4312 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4313 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4314 auto PtrVT = getPointerTy(DAG.getDataLayout());
4315
4316 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4317 CCValAssign &VA = ArgLocs[i];
4318 if (Ins[VA.getValNo()].isOrigArg()) {
4319 std::advance(CurOrigArg,
4320 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4321 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4322 }
4323 // Arguments stored in registers.
4324 if (VA.isRegLoc()) {
4325 EVT RegVT = VA.getLocVT();
4326 SDValue ArgValue;
4327
4328 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4329 // f64 and vector types are split up into multiple registers or
4330 // combinations of registers and stack slots.
4331 SDValue ArgValue1 =
4332 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4333 VA = ArgLocs[++i]; // skip ahead to next loc
4334 SDValue ArgValue2;
4335 if (VA.isMemLoc()) {
4336 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4337 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4338 ArgValue2 = DAG.getLoad(
4339 MVT::f64, dl, Chain, FIN,
4340 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4341 } else {
4342 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4343 }
4344 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4345 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4346 ArgValue1, DAG.getIntPtrConstant(0, dl));
4347 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4348 ArgValue2, DAG.getIntPtrConstant(1, dl));
4349 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4350 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4351 } else {
4352 const TargetRegisterClass *RC;
4353
4354 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4355 RC = &ARM::HPRRegClass;
4356 else if (RegVT == MVT::f32)
4357 RC = &ARM::SPRRegClass;
4358 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4359 RegVT == MVT::v4bf16)
4360 RC = &ARM::DPRRegClass;
4361 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4362 RegVT == MVT::v8bf16)
4363 RC = &ARM::QPRRegClass;
4364 else if (RegVT == MVT::i32)
4365 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4366 : &ARM::GPRRegClass;
4367 else
4368 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4369
4370 // Transform the arguments in physical registers into virtual ones.
4371 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4372 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4373
4374 // If this value is passed in r0 and has the returned attribute (e.g.
4375 // C++ 'structors), record this fact for later use.
4376 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4377 AFI->setPreservesR0();
4378 }
4379 }
4380
4381 // If this is an 8 or 16-bit value, it is really passed promoted
4382 // to 32 bits. Insert an assert[sz]ext to capture this, then
4383 // truncate to the right size.
4384 switch (VA.getLocInfo()) {
4385 default: llvm_unreachable("Unknown loc info!");
4386 case CCValAssign::Full: break;
4387 case CCValAssign::BCvt:
4388 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4389 break;
4390 }
4391
4392 // f16 arguments have their size extended to 4 bytes and passed as if they
4393 // had been copied to the LSBs of a 32-bit register.
4394 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4395 if (VA.needsCustom() &&
4396 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4397 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4398
4399 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4400 // less than 32 bits must be sign- or zero-extended in the callee for
4401 // security reasons. Although the ABI mandates an extension done by the
4402 // caller, the latter cannot be trusted to follow the rules of the ABI.
4403 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4404 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4405 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4406 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4407
4408 InVals.push_back(ArgValue);
4409 } else { // VA.isRegLoc()
4410 // Only arguments passed on the stack should make it here.
4411 assert(VA.isMemLoc());
4412 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4413
4414 int index = VA.getValNo();
4415
4416 // Some Ins[] entries become multiple ArgLoc[] entries.
4417 // Process them only once.
4418 if (index != lastInsIndex)
4419 {
4420 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4421 // FIXME: For now, all byval parameter objects are marked mutable.
4422 // This can be changed with more analysis.
4423 // In case of tail call optimization mark all arguments mutable.
4424 // Since they could be overwritten by lowering of arguments in case of
4425 // a tail call.
4426 if (Flags.isByVal()) {
4427 assert(Ins[index].isOrigArg() &&
4428 "Byval arguments cannot be implicit");
4429 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4430
4431 int FrameIndex = StoreByValRegs(
4432 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4433 VA.getLocMemOffset(), Flags.getByValSize());
4434 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4435 CCInfo.nextInRegsParam();
4436 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4437 VA.getValVT() == MVT::bf16)) {
4438 // f16 and bf16 values are passed in the least-significant half of
4439 // a 4 byte stack slot. This is done as-if the extension was done
4440 // in a 32-bit register, so the actual bytes used for the value
4441 // differ between little and big endian.
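// For example, with a 4-byte slot at offset N the two significant bytes
// live at N and N+1 on little-endian targets but at N+2 and N+3 on
// big-endian targets, which is why the address is nudged by +2 below.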
4442 assert(VA.getLocVT().getSizeInBits() == 32);
4443 unsigned FIOffset = VA.getLocMemOffset();
4444 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4445 FIOffset, true);
4446
4447 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4448 if (DAG.getDataLayout().isBigEndian())
4449 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4450
4451 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4452 MachinePointerInfo::getFixedStack(
4453 DAG.getMachineFunction(), FI)));
4454
4455 } else {
4456 unsigned FIOffset = VA.getLocMemOffset();
4457 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4458 FIOffset, true);
4459
4460 // Create load nodes to retrieve arguments from the stack.
4461 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4462 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4463 MachinePointerInfo::getFixedStack(
4464 DAG.getMachineFunction(), FI)));
4465 }
4466 lastInsIndex = index;
4467 }
4468 }
4469 }
4470
4471 // varargs
4472 if (isVarArg && MFI.hasVAStart()) {
4473 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4474 TotalArgRegsSaveSize);
4475 if (AFI->isCmseNSEntryFunction()) {
4476 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4477 DAG.getMachineFunction().getFunction(),
4478 "secure entry function must not be variadic", dl.getDebugLoc()));
4479 }
4480 }
4481
4482 unsigned StackArgSize = CCInfo.getStackSize();
4483 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4484 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4485 // The only way to guarantee a tail call is if the callee restores its
4486 // argument area, but it must also keep the stack aligned when doing so.
4487 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4488 assert(StackAlign && "data layout string is missing stack alignment");
4489 StackArgSize = alignTo(StackArgSize, *StackAlign);
4490
4491 AFI->setArgumentStackToRestore(StackArgSize);
4492 }
4493 AFI->setArgumentStackSize(StackArgSize);
4494
4495 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4496 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4497 DAG.getMachineFunction().getFunction(),
4498 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4499 }
4500
4501 return Chain;
4502}
4503
4504/// isFloatingPointZero - Return true if this is +0.0.
4505 static bool isFloatingPointZero(SDValue Op) {
4506 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
4507 return CFP->getValueAPF().isPosZero();
4508 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4509 // Maybe this has already been legalized into the constant pool?
4510 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4511 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4512 if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
4513 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4514 return CFP->getValueAPF().isPosZero();
4515 }
4516 } else if (Op->getOpcode() == ISD::BITCAST &&
4517 Op->getValueType(0) == MVT::f64) {
4518 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4519 // created by LowerConstantFP().
4520 SDValue BitcastOp = Op->getOperand(0);
4521 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4522 isNullConstant(BitcastOp->getOperand(0)))
4523 return true;
4524 }
4525 return false;
4526}
4527
4528/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4529/// the given operands.
4530SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4531 SDValue &ARMcc, SelectionDAG &DAG,
4532 const SDLoc &dl) const {
4533 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4534 unsigned C = RHSC->getZExtValue();
4535 if (!isLegalICmpImmediate((int32_t)C)) {
4536 // Constant does not fit, try adjusting it by one.
4537 switch (CC) {
4538 default: break;
4539 case ISD::SETLT:
4540 case ISD::SETGE:
4541 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4542 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4543 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4544 }
4545 break;
4546 case ISD::SETULT:
4547 case ISD::SETUGE:
4548 if (C != 0 && isLegalICmpImmediate(C-1)) {
4549 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4550 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4551 }
4552 break;
4553 case ISD::SETLE:
4554 case ISD::SETGT:
4555 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4556 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4557 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4558 }
4559 break;
4560 case ISD::SETULE:
4561 case ISD::SETUGT:
4562 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4563 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4564 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4565 }
4566 break;
4567 }
4568 }
4569 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
4570 (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) {
4571 // In ARM and Thumb-2, the compare instructions can shift their second
4572 // operand.
4573 CC = ISD::getSetCCSwappedOperands(CC);
4574 std::swap(LHS, RHS);
4575 }
4576
4577 // Thumb1 has very limited immediate modes, so turning an "and" into a
4578 // shift can save multiple instructions.
4579 //
4580 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4581 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4582 // own. If it's the operand to an unsigned comparison with an immediate,
4583 // we can eliminate one of the shifts: we transform
4584 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4585 //
4586 // We avoid transforming cases which aren't profitable due to encoding
4587 // details:
4588 //
4589 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4590 // would not; in that case, we're essentially trading one immediate load for
4591 // another.
4592 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4593 // 3. C2 is zero; we have other code for this special case.
4594 //
4595 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4596 // instruction, since the AND is always one instruction anyway, but we could
4597 // use narrow instructions in some cases.
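// For example (illustrative): "(x & 0x00ffffff) == 0x1234" becomes
// "(x << 8) == 0x123400", saving the materialization of the 0x00ffffff mask;
// neither constant fits a Thumb1 8-bit cmp immediate, so the comparison
// constant is no more expensive than before.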
4598 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4599 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4600 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4601 !isSignedIntSetCC(CC)) {
4602 unsigned Mask = LHS.getConstantOperandVal(1);
4603 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4604 uint64_t RHSV = RHSC->getZExtValue();
4605 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
4606 unsigned ShiftBits = llvm::countl_zero(Mask);
4607 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4608 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4609 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4610 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4611 }
4612 }
4613 }
4614
4615 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4616 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4617 // way a cmp would.
4618 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4619 // some tweaks to the heuristics for the previous and->shift transform.
4620 // FIXME: Optimize cases where the LHS isn't a shift.
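// For example, "(x << 3) > 0x80000000U" becomes "lsls rN, rN, #4": after the
// shift, C holds bit 31 of (x << 3) and Z says whether the remaining bits are
// zero, so the HI condition (C && !Z) is exactly the unsigned comparison.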
4621 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4622 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4623 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4624 LHS.getConstantOperandVal(1) < 31) {
4625 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4626 SDValue Shift =
4627 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4628 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
4629 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4630 return Shift.getValue(1);
4631 }
4632
4633 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
4634
4635 // If the RHS is a constant zero then the V (overflow) flag will never be
4636 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4637 // simpler for other passes (like the peephole optimiser) to deal with.
4638 if (isNullConstant(RHS)) {
4639 switch (CondCode) {
4640 default: break;
4641 case ARMCC::GE:
4642 CondCode = ARMCC::PL;
4643 break;
4644 case ARMCC::LT:
4645 CondCode = ARMCC::MI;
4646 break;
4647 }
4648 }
4649
4650 unsigned CompareType;
4651 switch (CondCode) {
4652 default:
4653 CompareType = ARMISD::CMP;
4654 break;
4655 case ARMCC::EQ:
4656 case ARMCC::NE:
4657 // Uses only Z Flag
4658 CompareType = ARMISD::CMPZ;
4659 break;
4660 }
4661 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4662 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4663}
4664
4665 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4666SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4667 SelectionDAG &DAG, const SDLoc &dl,
4668 bool Signaling) const {
4669 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4670 SDValue Flags;
4671 if (!isFloatingPointZero(RHS))
4672 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4673 LHS, RHS);
4674 else
4675 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4676 FlagsVT, LHS);
4677 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4678}
4679
4680// This function returns three things: the arithmetic computation itself
4681// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4682// comparison and the condition code define the case in which the arithmetic
4683// computation *does not* overflow.
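// Worked example for the SADDO case (illustrative): LHS = 0x7fffffff, RHS = 1
// gives Value = 0x80000000; the CMP of Value against LHS recomputes RHS and
// sets V (negative minus positive yielding positive), so the VC "no overflow"
// condition is false, as required.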
4684std::pair<SDValue, SDValue>
4685ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4686 SDValue &ARMcc) const {
4687 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4688
4689 SDValue Value, OverflowCmp;
4690 SDValue LHS = Op.getOperand(0);
4691 SDValue RHS = Op.getOperand(1);
4692 SDLoc dl(Op);
4693
4694 // FIXME: We are currently always generating CMPs because we don't support
4695 // generating CMN through the backend. This is not as good as the natural
4696 // CMP case because it causes a register dependency and cannot be folded
4697 // later.
4698
4699 switch (Op.getOpcode()) {
4700 default:
4701 llvm_unreachable("Unknown overflow instruction!");
4702 case ISD::SADDO:
4703 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4704 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4705 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4706 break;
4707 case ISD::UADDO:
4708 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4709 // We use ADDC here to correspond to its use in LowerUnsignedALUO.
4710 // We do not use it in the USUBO case as Value may not be used.
4711 Value = DAG.getNode(ARMISD::ADDC, dl,
4712 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4713 .getValue(0);
4714 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4715 break;
4716 case ISD::SSUBO:
4717 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4718 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4719 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4720 break;
4721 case ISD::USUBO:
4722 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4723 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4724 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4725 break;
4726 case ISD::UMULO:
4727 // We generate a UMUL_LOHI and then check if the high word is 0.
4728 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4729 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4730 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4731 LHS, RHS);
4732 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4733 DAG.getConstant(0, dl, MVT::i32));
4734 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4735 break;
4736 case ISD::SMULO:
4737 // We generate a SMUL_LOHI and then check if all the bits of the high word
4738 // are the same as the sign bit of the low word.
4739 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4740 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4741 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4742 LHS, RHS);
4743 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value.getValue(1),
4744 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4745 Value.getValue(0),
4746 DAG.getConstant(31, dl, MVT::i32)));
4747 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4748 break;
4749 } // switch (...)
4750
4751 return std::make_pair(Value, OverflowCmp);
4752}
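// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// A scalar model of the "no overflow" test produced above for ISD::UADDO,
// assuming i32 operands. HS on CMP(Value, LHS) holds exactly when the
// unsigned addition did not wrap: e.g. LHS = 0xFFFFFFFF, RHS = 1 gives
// Value = 0 < LHS, HS fails, and overflow is reported. The helper name is
// hypothetical; only the fixed-width types from <cstdint> are assumed.
static bool exampleUAddNoOverflow(uint32_t LHS, uint32_t RHS) {
  uint32_t Value = LHS + RHS; // the ARMISD::ADDC result
  return Value >= LHS;        // ARMCC::HS on CMP(Value, LHS)
}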
4753
4754SDValue
4755ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
4756 // Let legalize expand this if it isn't a legal type yet.
4757 if (!isTypeLegal(Op.getValueType()))
4758 return SDValue();
4759
4760 SDValue Value, OverflowCmp;
4761 SDValue ARMcc;
4762 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4763 SDLoc dl(Op);
4764 // We use 0 and 1 as false and true values.
4765 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
4766 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
4767 EVT VT = Op.getValueType();
4768
4769 SDValue Overflow =
4770 DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, OverflowCmp);
4771
4772 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
4773 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4774}
4775
4776static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
4777 SelectionDAG &DAG) {
4778 SDLoc DL(BoolCarry);
4779 EVT CarryVT = BoolCarry.getValueType();
4780
4781 // This converts the boolean value carry into the carry flag by doing
4782 // ARMISD::SUBC Carry, 1
4783 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL,
4784 DAG.getVTList(CarryVT, MVT::i32),
4785 BoolCarry, DAG.getConstant(1, DL, CarryVT));
4786 return Carry.getValue(1);
4787}
4788
4789static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
4790 SelectionDAG &DAG) {
4791 SDLoc DL(Flags);
4792
4793 // Now convert the carry flag into a boolean carry. We do this
4794 // using ARMISD:ADDE 0, 0, Carry
4795 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4796 DAG.getConstant(0, DL, MVT::i32),
4797 DAG.getConstant(0, DL, MVT::i32), Flags);
4798}
4799
4800SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
4801 SelectionDAG &DAG) const {
4802 // Let legalize expand this if it isn't a legal type yet.
4803 if (!isTypeLegal(Op.getValueType()))
4804 return SDValue();
4805
4806 SDValue LHS = Op.getOperand(0);
4807 SDValue RHS = Op.getOperand(1);
4808 SDLoc dl(Op);
4809
4810 EVT VT = Op.getValueType();
4811 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4812 SDValue Value;
4813 SDValue Overflow;
4814 switch (Op.getOpcode()) {
4815 default:
4816 llvm_unreachable("Unknown overflow instruction!");
4817 case ISD::UADDO:
4818 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4819 // Convert the carry flag into a boolean value.
4820 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4821 break;
4822 case ISD::USUBO: {
4823 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4824 // Convert the carry flag into a boolean value.
4825 Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
4826 // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
4827 // value. So compute 1 - C.
4828 Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32,
4829 DAG.getConstant(1, dl, MVT::i32), Overflow);
4830 break;
4831 }
4832 }
4833
4834 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4835}
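// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Scalar model of the USUBO path above, under the stated ARMISD::SUBC
// convention that the carry is 1 when no borrow occurred; the helper name is
// hypothetical.
static bool exampleUSubOverflow(uint32_t LHS, uint32_t RHS) {
  uint32_t C = (LHS >= RHS) ? 1u : 0u; // carry flag after SUBC
  return (1u - C) != 0u;               // overflow = 1 - C, set iff a borrow happened
}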
4836
4837static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG,
4838 const ARMSubtarget *Subtarget) {
4839 EVT VT = Op.getValueType();
4840 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4841 return SDValue();
4842 if (!VT.isSimple())
4843 return SDValue();
4844
4845 unsigned NewOpcode;
4846 switch (VT.getSimpleVT().SimpleTy) {
4847 default:
4848 return SDValue();
4849 case MVT::i8:
4850 switch (Op->getOpcode()) {
4851 case ISD::UADDSAT:
4852 NewOpcode = ARMISD::UQADD8b;
4853 break;
4854 case ISD::SADDSAT:
4855 NewOpcode = ARMISD::QADD8b;
4856 break;
4857 case ISD::USUBSAT:
4858 NewOpcode = ARMISD::UQSUB8b;
4859 break;
4860 case ISD::SSUBSAT:
4861 NewOpcode = ARMISD::QSUB8b;
4862 break;
4863 }
4864 break;
4865 case MVT::i16:
4866 switch (Op->getOpcode()) {
4867 case ISD::UADDSAT:
4868 NewOpcode = ARMISD::UQADD16b;
4869 break;
4870 case ISD::SADDSAT:
4871 NewOpcode = ARMISD::QADD16b;
4872 break;
4873 case ISD::USUBSAT:
4874 NewOpcode = ARMISD::UQSUB16b;
4875 break;
4876 case ISD::SSUBSAT:
4877 NewOpcode = ARMISD::QSUB16b;
4878 break;
4879 }
4880 break;
4881 }
4882
4883 SDLoc dl(Op);
4884 SDValue Add =
4885 DAG.getNode(NewOpcode, dl, MVT::i32,
4886 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4887 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4888 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4889}
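// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Scalar behaviour of the i8 signed-saturating add selected above (QADD8b
// performs this on four such lanes at once); the helper name is hypothetical.
static int8_t exampleQAdd8Lane(int8_t A, int8_t B) {
  int32_t S = (int32_t)A + (int32_t)B; // 9-bit exact sum
  if (S > 127)
    S = 127;
  if (S < -128)
    S = -128;
  return (int8_t)S;
}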
4890
4891SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4892 SDValue Cond = Op.getOperand(0);
4893 SDValue SelectTrue = Op.getOperand(1);
4894 SDValue SelectFalse = Op.getOperand(2);
4895 SDLoc dl(Op);
4896 unsigned Opc = Cond.getOpcode();
4897
4898 if (Cond.getResNo() == 1 &&
4899 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4900 Opc == ISD::USUBO)) {
4901 if (!isTypeLegal(Cond->getValueType(0)))
4902 return SDValue();
4903
4904 SDValue Value, OverflowCmp;
4905 SDValue ARMcc;
4906 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4907 EVT VT = Op.getValueType();
4908
4909 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4910 }
4911
4912 // Convert:
4913 //
4914 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4915 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4916 //
4917 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4918 const ConstantSDNode *CMOVTrue =
4919 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4920 const ConstantSDNode *CMOVFalse =
4921 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4922
4923 if (CMOVTrue && CMOVFalse) {
4924 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4925 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4926
4927 SDValue True;
4928 SDValue False;
4929 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4930 True = SelectTrue;
4931 False = SelectFalse;
4932 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4933 True = SelectFalse;
4934 False = SelectTrue;
4935 }
4936
4937 if (True.getNode() && False.getNode())
4938 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4939 Cond.getOperand(3), DAG);
4940 }
4941 }
4942
4943 // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
4944 // undefined bits before doing a full-word comparison with zero.
4945 Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond,
4946 DAG.getConstant(1, dl, Cond.getValueType()));
4947
4948 return DAG.getSelectCC(dl, Cond,
4949 DAG.getConstant(0, dl, Cond.getValueType()),
4950 SelectTrue, SelectFalse, ISD::SETNE);
4951}
4952
4953static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
4954 bool &swpCmpOps, bool &swpVselOps) {
4955 // Start by selecting the GE condition code for opcodes that return true for
4956 // 'equality'
4957 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4958 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4959 CondCode = ARMCC::GE;
4960
4961 // and GT for opcodes that return false for 'equality'.
4962 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4963 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4964 CondCode = ARMCC::GT;
4965
4966 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4967 // to swap the compare operands.
4968 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4969 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4970 swpCmpOps = true;
4971
4972 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4973 // If we have an unordered opcode, we need to swap the operands to the VSEL
4974 // instruction (effectively negating the condition).
4975 //
4976 // This also has the effect of swapping which one of 'less' or 'greater'
4977 // returns true, so we also swap the compare operands. It also switches
4978 // whether we return true for 'equality', so we compensate by picking the
4979 // opposite condition code to our original choice.
4980 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4981 CC == ISD::SETUGT) {
4982 swpCmpOps = !swpCmpOps;
4983 swpVselOps = !swpVselOps;
4984 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4985 }
4986
4987 // 'ordered' is 'anything but unordered', so use the VS condition code and
4988 // swap the VSEL operands.
4989 if (CC == ISD::SETO) {
4990 CondCode = ARMCC::VS;
4991 swpVselOps = true;
4992 }
4993
4994 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4995 // code and swap the VSEL operands. Also do this if we don't care about the
4996 // unordered case.
4997 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4998 CondCode = ARMCC::EQ;
4999 swpVselOps = true;
5000 }
5001}
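// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// One concrete case of the rewrites above: selecting on "a SETULT b"
// (unordered or less-than) ends up as GE with only the VSEL operands swapped,
// because the ordered GE(a, b) is true exactly when SETULT(a, b) is false
// (a NaN makes GE fail, so the "true" value is picked). The helper name is
// hypothetical.
static float exampleSelectULT(float A, float B, float T, float F) {
  // Original:  (A SETULT B) ? T : F
  // Rewritten: GE(A, B) ? F : T   -- VSEL operands swapped
  return (A >= B) ? F : T;
}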
5002
5003SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
5004 SDValue TrueVal, SDValue ARMcc,
5005 SDValue Flags, SelectionDAG &DAG) const {
5006 if (!Subtarget->hasFP64() && VT == MVT::f64) {
5007 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5008 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5009 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5010 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5011
5012 SDValue TrueLow = TrueVal.getValue(0);
5013 SDValue TrueHigh = TrueVal.getValue(1);
5014 SDValue FalseLow = FalseVal.getValue(0);
5015 SDValue FalseHigh = FalseVal.getValue(1);
5016
5017 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5018 ARMcc, Flags);
5019 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5020 ARMcc, Flags);
5021
5022 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5023 }
5024 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5025}
5026
5027static bool isGTorGE(ISD::CondCode CC) {
5028 return CC == ISD::SETGT || CC == ISD::SETGE;
5029}
5030
5031static bool isLTorLE(ISD::CondCode CC) {
5032 return CC == ISD::SETLT || CC == ISD::SETLE;
5033}
5034
5035// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5036// All of these conditions (and their <= and >= counterparts) will do:
5037// x < k ? k : x
5038// x > k ? x : k
5039// k < x ? x : k
5040// k > x ? k : x
5041static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5042 const SDValue TrueVal, const SDValue FalseVal,
5043 const ISD::CondCode CC, const SDValue K) {
5044 return (isGTorGE(CC) &&
5045 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5046 (isLTorLE(CC) &&
5047 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5048}
5049
5050// Check if two chained conditionals could be converted into SSAT or USAT.
5051//
5052// SSAT can replace a set of two conditional selectors that bound a number to an
5053// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5054//
5055// x < -k ? -k : (x > k ? k : x)
5056// x < -k ? -k : (x < k ? x : k)
5057// x > -k ? (x > k ? k : x) : -k
5058// x < k ? (x < -k ? -k : x) : k
5059// etc.
5060//
5061// LLVM canonicalizes these to either a min(max()) or a max(min())
5062// pattern. This function tries to match one of these and will return a SSAT
5063// node if successful.
5064//
5065 // USAT works similarly to SSAT, but saturates to the interval [0, k] where k + 1
5066// is a power of 2.
5067static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG) {
5068 EVT VT = Op.getValueType();
5069 SDValue V1 = Op.getOperand(0);
5070 SDValue K1 = Op.getOperand(1);
5071 SDValue TrueVal1 = Op.getOperand(2);
5072 SDValue FalseVal1 = Op.getOperand(3);
5073 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5074
5075 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5076 if (Op2.getOpcode() != ISD::SELECT_CC)
5077 return SDValue();
5078
5079 SDValue V2 = Op2.getOperand(0);
5080 SDValue K2 = Op2.getOperand(1);
5081 SDValue TrueVal2 = Op2.getOperand(2);
5082 SDValue FalseVal2 = Op2.getOperand(3);
5083 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5084
5085 SDValue V1Tmp = V1;
5086 SDValue V2Tmp = V2;
5087
5088 // Check that the registers and the constants match a max(min()) or min(max())
5089 // pattern
5090 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5091 K2 != FalseVal2 ||
5092 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5093 return SDValue();
5094
5095 // Check that the constant in the lower-bound check is
5096 // the opposite of the constant in the upper-bound check
5097 // in 1's complement.
5098 if (!isa<ConstantSDNode>(K1) || !isa<ConstantSDNode>(K2))
5099 return SDValue();
5100
5101 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5102 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5103 int64_t PosVal = std::max(Val1, Val2);
5104 int64_t NegVal = std::min(Val1, Val2);
5105
5106 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5107 !isPowerOf2_64(PosVal + 1))
5108 return SDValue();
5109
5110 // Handle the difference between USAT (unsigned) and SSAT (signed)
5111 // saturation
5112 // At this point, PosVal is guaranteed to be positive
5113 uint64_t K = PosVal;
5114 SDLoc dl(Op);
5115 if (Val1 == ~Val2)
5116 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5117 DAG.getConstant(llvm::countr_one(K), dl, VT));
5118 if (NegVal == 0)
5119 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5120 DAG.getConstant(llvm::countr_one(K), dl, VT));
5121
5122 return SDValue();
5123}
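// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Scalar form of the saturation matched above: for K with K + 1 a power of
// two (e.g. K = 127), the min(max())/max(min()) tree clamps the value to
// [~K, K] = [-(K + 1), K], which is what SSAT encodes. The helper name is
// hypothetical.
static int32_t exampleSignedSaturate(int32_t X, int32_t K) {
  if (X > K)
    return K;
  if (X < ~K) // ~K == -(K + 1)
    return ~K;
  return X;
}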
5124
5125// Check if a condition of the type x < k ? k : x can be converted into a
5126// bit operation instead of conditional moves.
5127// Currently this is allowed given:
5128// - The conditions and values match up
5129// - k is 0 or -1 (all ones)
5130 // This function will not check the last condition, that's up to the caller.
5131// It returns true if the transformation can be made, and in such case
5132// returns x in V, and k in SatK.
5133static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
5134 SDValue &SatK)
5135{
5136 SDValue LHS = Op.getOperand(0);
5137 SDValue RHS = Op.getOperand(1);
5138 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5139 SDValue TrueVal = Op.getOperand(2);
5140 SDValue FalseVal = Op.getOperand(3);
5141
5142 SDValue *K = isa<ConstantSDNode>(LHS) ? &LHS : isa<ConstantSDNode>(RHS)
5143 ? &RHS
5144 : nullptr;
5145
5146 // No constant operation in comparison, early out
5147 if (!K)
5148 return false;
5149
5150 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5151 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5152 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5153
5154 // If the constant in the compare does not match the constant in the select,
5155 // or the variables do not match, bail out early.
5156 if (*K != KTmp || V != VTmp)
5157 return false;
5158
5159 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5160 SatK = *K;
5161 return true;
5162 }
5163
5164 return false;
5165}
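// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// The bit trick the caller uses for the k == 0 case recognised here: max(X, 0)
// without a conditional move, because the arithmetic shift smears the sign bit
// into a mask. The helper name is hypothetical.
static int32_t exampleClampAtZero(int32_t X) {
  return X & ~(X >> 31); // X < 0: X >> 31 is all ones, so the AND clears every bit
}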
5166
5167bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5168 if (VT == MVT::f32)
5169 return !Subtarget->hasVFP2Base();
5170 if (VT == MVT::f64)
5171 return !Subtarget->hasFP64();
5172 if (VT == MVT::f16)
5173 return !Subtarget->hasFullFP16();
5174 return false;
5175}
5176
5177SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5178 EVT VT = Op.getValueType();
5179 SDLoc dl(Op);
5180
5181 // Try to convert two saturating conditional selects into a single SSAT
5182 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5183 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5184 return SatValue;
5185
5186 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5187 // into more efficient bit operations, which is possible when k is 0 or -1
5188 // On ARM and Thumb-2, which have the flexible Operand2, this will result in a
5189 // single instruction. On Thumb-1 the shift and the bit operation will be two
5190 // instructions.
5191 // Only allow this transformation on full-width (32-bit) operations
5192 SDValue LowerSatConstant;
5193 SDValue SatValue;
5194 if (VT == MVT::i32 &&
5195 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
5196 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5197 DAG.getConstant(31, dl, VT));
5198 if (isNullConstant(LowerSatConstant)) {
5199 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5200 DAG.getAllOnesConstant(dl, VT));
5201 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5202 } else if (isAllOnesConstant(LowerSatConstant))
5203 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5204 }
5205
5206 SDValue LHS = Op.getOperand(0);
5207 SDValue RHS = Op.getOperand(1);
5208 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5209 SDValue TrueVal = Op.getOperand(2);
5210 SDValue FalseVal = Op.getOperand(3);
5211 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5212 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5213 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5214 if (Op.getValueType().isInteger()) {
5215
5216 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5217 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5218 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5219 // Both require less instructions than compare and conditional select.
5220 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5221 RHSC->isZero() && CFVal && CFVal->isZero() &&
5222 LHS.getValueType() == RHS.getValueType()) {
5223 EVT VT = LHS.getValueType();
5224 SDValue Shift =
5225 DAG.getNode(ISD::SRA, dl, VT, LHS,
5226 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5227
5228 if (CC == ISD::SETGT)
5229 Shift = DAG.getNOT(dl, Shift, VT);
5230
5231 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5232 }
5233 }
5234
5235 if (Subtarget->hasV8_1MMainlineOps() && CFVal && CTVal &&
5236 LHS.getValueType() == MVT::i32 && RHS.getValueType() == MVT::i32) {
5237 unsigned TVal = CTVal->getZExtValue();
5238 unsigned FVal = CFVal->getZExtValue();
5239 unsigned Opcode = 0;
5240
5241 if (TVal == ~FVal) {
5242 Opcode = ARMISD::CSINV;
5243 } else if (TVal == ~FVal + 1) {
5244 Opcode = ARMISD::CSNEG;
5245 } else if (TVal + 1 == FVal) {
5246 Opcode = ARMISD::CSINC;
5247 } else if (TVal == FVal + 1) {
5248 Opcode = ARMISD::CSINC;
5249 std::swap(TrueVal, FalseVal);
5250 std::swap(TVal, FVal);
5251 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5252 }
5253
5254 if (Opcode) {
5255 // If one of the constants is cheaper than another, materialise the
5256 // cheaper one and let the csel generate the other.
5257 if (Opcode != ARMISD::CSINC &&
5258 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5259 std::swap(TrueVal, FalseVal);
5260 std::swap(TVal, FVal);
5261 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5262 }
5263
5264 // Attempt to use ZR by checking whether TVal is 0, possibly inverting the
5265 // condition to get there. CSINC is not invertible like the other two
5266 // (~(~a) == a and -(-a) == a, but (a+1)+1 != a).
5267 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5268 std::swap(TrueVal, FalseVal);
5269 std::swap(TVal, FVal);
5270 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5271 }
5272
5273 // Drops F's value because we can get it by inverting/negating TVal.
5274 FalseVal = TrueVal;
5275
5276 SDValue ARMcc;
5277 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5278 EVT VT = TrueVal.getValueType();
5279 return DAG.getNode(Opcode, dl, VT, TrueVal, FalseVal, ARMcc, Cmp);
5280 }
5281 }
5282
5283 if (isUnsupportedFloatingType(LHS.getValueType())) {
5284 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5285
5286 // If softenSetCCOperands only returned one value, we should compare it to
5287 // zero.
5288 if (!RHS.getNode()) {
5289 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5290 CC = ISD::SETNE;
5291 }
5292 }
5293
5294 if (LHS.getValueType() == MVT::i32) {
5295 // Try to generate VSEL on ARMv8.
5296 // The VSEL instruction can't use all the usual ARM condition
5297 // codes: it only has two bits to select the condition code, so it's
5298 // constrained to use only GE, GT, VS and EQ.
5299 //
5300 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5301 // swap the operands of the previous compare instruction (effectively
5302 // inverting the compare condition, swapping 'less' and 'greater') and
5303 // sometimes need to swap the operands to the VSEL (which inverts the
5304 // condition in the sense of firing whenever the previous condition didn't)
5305 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5306 TrueVal.getValueType() == MVT::f32 ||
5307 TrueVal.getValueType() == MVT::f64)) {
5308 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5309 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5310 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5311 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5312 std::swap(TrueVal, FalseVal);
5313 }
5314 }
5315
5316 SDValue ARMcc;
5317 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5318 // Choose GE over PL, which vsel does not support.
5319 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5320 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5321 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5322 }
5323
5324 ARMCC::CondCodes CondCode, CondCode2;
5325 FPCCToARMCC(CC, CondCode, CondCode2);
5326
5327 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5328 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5329 // must use VSEL (limited condition codes), due to not having conditional f16
5330 // moves.
5331 if (Subtarget->hasFPARMv8Base() &&
5332 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5333 (TrueVal.getValueType() == MVT::f16 ||
5334 TrueVal.getValueType() == MVT::f32 ||
5335 TrueVal.getValueType() == MVT::f64)) {
5336 bool swpCmpOps = false;
5337 bool swpVselOps = false;
5338 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5339
5340 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5341 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5342 if (swpCmpOps)
5343 std::swap(LHS, RHS);
5344 if (swpVselOps)
5345 std::swap(TrueVal, FalseVal);
5346 }
5347 }
5348
5349 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5350 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5351 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5352 if (CondCode2 != ARMCC::AL) {
5353 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5354 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5355 }
5356 return Result;
5357}
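// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// The value pair matched for CSINC above (FVal == TVal + 1): only TVal has to
// be materialised, the conditional increment recreates the other value. The
// helper name is hypothetical.
static int32_t exampleCsinc(bool Cond, int32_t TVal) {
  return Cond ? TVal : TVal + 1;
}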
5358
5359/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5360/// to morph to an integer compare sequence.
5361static bool canChangeToInt(SDValue Op, bool &SeenZero,
5362 const ARMSubtarget *Subtarget) {
5363 SDNode *N = Op.getNode();
5364 if (!N->hasOneUse())
5365 // Otherwise it requires moving the value from fp to integer registers.
5366 return false;
5367 if (!N->getNumValues())
5368 return false;
5369 EVT VT = Op.getValueType();
5370 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5371 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5372 // vmrs are very slow, e.g. cortex-a8.
5373 return false;
5374
5375 if (isFloatingPointZero(Op)) {
5376 SeenZero = true;
5377 return true;
5378 }
5379 return ISD::isNormalLoad(N);
5380}
5381
5382static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
5383 if (isFloatingPointZero(Op))
5384 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5385
5386 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
5387 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5388 Ld->getPointerInfo(), Ld->getAlign(),
5389 Ld->getMemOperand()->getFlags());
5390
5391 llvm_unreachable("Unknown VFP cmp argument!");
5392}
5393
5394static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
5395 SDValue &RetVal1, SDValue &RetVal2) {
5396 SDLoc dl(Op);
5397
5398 if (isFloatingPointZero(Op)) {
5399 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5400 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5401 return;
5402 }
5403
5404 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5405 SDValue Ptr = Ld->getBasePtr();
5406 RetVal1 =
5407 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5408 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5409
5410 EVT PtrType = Ptr.getValueType();
5411 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5412 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5413 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5414 Ld->getPointerInfo().getWithOffset(4),
5415 commonAlignment(Ld->getAlign(), 4),
5416 Ld->getMemOperand()->getFlags());
5417 return;
5418 }
5419
5420 llvm_unreachable("Unknown VFP cmp argument!");
5421}
5422
5423/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5424/// f32 and even f64 comparisons to integer ones.
5425SDValue
5426ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5427 SDValue Chain = Op.getOperand(0);
5428 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5429 SDValue LHS = Op.getOperand(2);
5430 SDValue RHS = Op.getOperand(3);
5431 SDValue Dest = Op.getOperand(4);
5432 SDLoc dl(Op);
5433
5434 bool LHSSeenZero = false;
5435 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5436 bool RHSSeenZero = false;
5437 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5438 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5439 // With no NaNs (nnan) and IEEE denormal handling, if there are no other uses
5440 // of the CMP operands and the condition code is EQ or NE, we can optimize the
5441 // comparison to an integer comparison.
5442 if (CC == ISD::SETOEQ)
5443 CC = ISD::SETEQ;
5444 else if (CC == ISD::SETUNE)
5445 CC = ISD::SETNE;
5446
5447 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5448 SDValue ARMcc;
5449 if (LHS.getValueType() == MVT::f32) {
5450 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5451 bitcastf32Toi32(LHS, DAG), Mask);
5452 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5453 bitcastf32Toi32(RHS, DAG), Mask);
5454 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5455 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5456 Cmp);
5457 }
5458
5459 SDValue LHS1, LHS2;
5460 SDValue RHS1, RHS2;
5461 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5462 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5463 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5464 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
5465 ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
5466 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5467 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5468 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5469 }
5470
5471 return SDValue();
5472}
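// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Bit-level view of the f32 transformation above for the case where one
// operand is known to be +/-0.0 and NaNs are excluded: clearing the sign bit
// of both IEEE-754 bit patterns lets a plain integer compare decide equality
// (so -0.0 still equals +0.0). The helper name is hypothetical.
static bool exampleF32EqViaInt(uint32_t ABits, uint32_t BBits) {
  return (ABits & 0x7fffffffu) == (BBits & 0x7fffffffu);
}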
5473
5474// Generate CMP + CMOV for integer abs.
5475SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5476 SDLoc DL(Op);
5477
5478 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5479
5480 // Generate CMP & CMOV.
5481 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5482 DAG.getConstant(0, DL, MVT::i32));
5483 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5484 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5485}
5486
5487SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5488 SDValue Chain = Op.getOperand(0);
5489 SDValue Cond = Op.getOperand(1);
5490 SDValue Dest = Op.getOperand(2);
5491 SDLoc dl(Op);
5492
5493 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5494 // instruction.
5495 unsigned Opc = Cond.getOpcode();
5496 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5497 !Subtarget->isThumb1Only();
5498 if (Cond.getResNo() == 1 &&
5499 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5500 Opc == ISD::USUBO || OptimizeMul)) {
5501 // Only lower legal XALUO ops.
5502 if (!isTypeLegal(Cond->getValueType(0)))
5503 return SDValue();
5504
5505 // The actual operation with overflow check.
5506 SDValue Value, OverflowCmp;
5507 SDValue ARMcc;
5508 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5509
5510 // Reverse the condition code.
5511 ARMCC::CondCodes CondCode =
5512 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5513 CondCode = ARMCC::getOppositeCondition(CondCode);
5514 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5515
5516 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5517 OverflowCmp);
5518 }
5519
5520 return SDValue();
5521}
5522
5523SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5524 SDValue Chain = Op.getOperand(0);
5525 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5526 SDValue LHS = Op.getOperand(2);
5527 SDValue RHS = Op.getOperand(3);
5528 SDValue Dest = Op.getOperand(4);
5529 SDLoc dl(Op);
5530
5531 if (isUnsupportedFloatingType(LHS.getValueType())) {
5532 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5533
5534 // If softenSetCCOperands only returned one value, we should compare it to
5535 // zero.
5536 if (!RHS.getNode()) {
5537 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5538 CC = ISD::SETNE;
5539 }
5540 }
5541
5542 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5543 // instruction.
5544 unsigned Opc = LHS.getOpcode();
5545 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5546 !Subtarget->isThumb1Only();
5547 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5548 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5549 Opc == ISD::USUBO || OptimizeMul) &&
5550 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5551 // Only lower legal XALUO ops.
5552 if (!isTypeLegal(LHS->getValueType(0)))
5553 return SDValue();
5554
5555 // The actual operation with overflow check.
5556 SDValue Value, OverflowCmp;
5557 SDValue ARMcc;
5558 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5559
5560 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5561 // Reverse the condition code.
5562 ARMCC::CondCodes CondCode =
5563 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
5564 CondCode = ARMCC::getOppositeCondition(CondCode);
5565 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5566 }
5567
5568 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5569 OverflowCmp);
5570 }
5571
5572 if (LHS.getValueType() == MVT::i32) {
5573 SDValue ARMcc;
5574 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5575 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5576 }
5577
5578 SDNodeFlags Flags = Op->getFlags();
5579 if (Flags.hasNoNaNs() &&
5580 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5581 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5582 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5583 CC == ISD::SETUNE)) {
5584 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5585 return Result;
5586 }
5587
5588 ARMCC::CondCodes CondCode, CondCode2;
5589 FPCCToARMCC(CC, CondCode, CondCode2);
5590
5591 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5592 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5593 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5594 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5595 if (CondCode2 != ARMCC::AL) {
5596 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5597 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5598 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5599 }
5600 return Res;
5601}
5602
5603SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5604 SDValue Chain = Op.getOperand(0);
5605 SDValue Table = Op.getOperand(1);
5606 SDValue Index = Op.getOperand(2);
5607 SDLoc dl(Op);
5608
5609 EVT PTy = getPointerTy(DAG.getDataLayout());
5610 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5611 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5612 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
5613 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5614 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5615 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5616 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5617 // which does another jump to the destination. This also makes it easier
5618 // to translate it to TBB / TBH later (Thumb2 only).
5619 // FIXME: This might not work if the function is extremely large.
5620 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5621 Addr, Op.getOperand(2), JTI);
5622 }
5623 if (isPositionIndependent() || Subtarget->isROPI()) {
5624 Addr =
5625 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5626 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5627 Chain = Addr.getValue(1);
5628 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5629 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5630 } else {
5631 Addr =
5632 DAG.getLoad(PTy, dl, Chain, Addr,
5633 MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
5634 Chain = Addr.getValue(1);
5635 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5636 }
5637}
5638
5639static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
5640 EVT VT = Op.getValueType();
5641 SDLoc dl(Op);
5642
5643 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5644 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5645 return Op;
5646 return DAG.UnrollVectorOp(Op.getNode());
5647 }
5648
5649 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5650
5651 EVT NewTy;
5652 const EVT OpTy = Op.getOperand(0).getValueType();
5653 if (OpTy == MVT::v4f32)
5654 NewTy = MVT::v4i32;
5655 else if (OpTy == MVT::v4f16 && HasFullFP16)
5656 NewTy = MVT::v4i16;
5657 else if (OpTy == MVT::v8f16 && HasFullFP16)
5658 NewTy = MVT::v8i16;
5659 else
5660 llvm_unreachable("Invalid type for custom lowering!");
5661
5662 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5663 return DAG.UnrollVectorOp(Op.getNode());
5664
5665 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5666 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5667}
5668
5669SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5670 EVT VT = Op.getValueType();
5671 if (VT.isVector())
5672 return LowerVectorFP_TO_INT(Op, DAG);
5673
5674 bool IsStrict = Op->isStrictFPOpcode();
5675 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5676
5677 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5678 RTLIB::Libcall LC;
5679 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5680 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5681 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5682 Op.getValueType());
5683 else
5684 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5685 Op.getValueType());
5686 SDLoc Loc(Op);
5687 MakeLibCallOptions CallOptions;
5688 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
5689 SDValue Result;
5690 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5691 CallOptions, Loc, Chain);
5692 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5693 }
5694
5695 // FIXME: Remove this when we have strict fp instruction selection patterns
5696 if (IsStrict) {
5697 SDLoc Loc(Op);
5698 SDValue Result =
5699 DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
5700 : ISD::FP_TO_UINT,
5701 Loc, Op.getValueType(), SrcVal);
5702 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5703 }
5704
5705 return Op;
5706}
5707
5708static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG,
5709 const ARMSubtarget *Subtarget) {
5710 EVT VT = Op.getValueType();
5711 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5712 EVT FromVT = Op.getOperand(0).getValueType();
5713
5714 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5715 return Op;
5716 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5717 Subtarget->hasFP64())
5718 return Op;
5719 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5720 Subtarget->hasFullFP16())
5721 return Op;
5722 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5723 Subtarget->hasMVEFloatOps())
5724 return Op;
5725 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5726 Subtarget->hasMVEFloatOps())
5727 return Op;
5728
5729 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5730 return SDValue();
5731
5732 SDLoc DL(Op);
5733 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
5734 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5735 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5736 DAG.getValueType(VT.getScalarType()));
5737 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5738 DAG.getConstant((1 << BW) - 1, DL, VT));
5739 if (IsSigned)
5740 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5741 DAG.getSignedConstant(-(1 << BW), DL, VT));
5742 return Max;
5743}
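// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// The SMIN/SMAX clamp built above for a signed 16-bit saturate: with
// BW = 16 - 1 = 15 the intermediate value is clamped to
// [-(1 << 15), (1 << 15) - 1]. The helper name is hypothetical.
static int32_t exampleClampToI16(int32_t V) {
  const int32_t Hi = (1 << 15) - 1, Lo = -(1 << 15);
  return V > Hi ? Hi : (V < Lo ? Lo : V);
}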
5744
5745static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
5746 EVT VT = Op.getValueType();
5747 SDLoc dl(Op);
5748
5749 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5750 if (VT.getVectorElementType() == MVT::f32)
5751 return Op;
5752 return DAG.UnrollVectorOp(Op.getNode());
5753 }
5754
5755 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5756 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5757 "Invalid type for custom lowering!");
5758
5759 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5760
5761 EVT DestVecType;
5762 if (VT == MVT::v4f32)
5763 DestVecType = MVT::v4i32;
5764 else if (VT == MVT::v4f16 && HasFullFP16)
5765 DestVecType = MVT::v4i16;
5766 else if (VT == MVT::v8f16 && HasFullFP16)
5767 DestVecType = MVT::v8i16;
5768 else
5769 return DAG.UnrollVectorOp(Op.getNode());
5770
5771 unsigned CastOpc;
5772 unsigned Opc;
5773 switch (Op.getOpcode()) {
5774 default: llvm_unreachable("Invalid opcode!");
5775 case ISD::SINT_TO_FP:
5776 CastOpc = ISD::SIGN_EXTEND;
5777 Opc = ISD::SINT_TO_FP;
5778 break;
5779 case ISD::UINT_TO_FP:
5780 CastOpc = ISD::ZERO_EXTEND;
5781 Opc = ISD::UINT_TO_FP;
5782 break;
5783 }
5784
5785 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5786 return DAG.getNode(Opc, dl, VT, Op);
5787}
5788
5789SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5790 EVT VT = Op.getValueType();
5791 if (VT.isVector())
5792 return LowerVectorINT_TO_FP(Op, DAG);
5793 if (isUnsupportedFloatingType(VT)) {
5794 RTLIB::Libcall LC;
5795 if (Op.getOpcode() == ISD::SINT_TO_FP)
5796 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5797 Op.getValueType());
5798 else
5799 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5800 Op.getValueType());
5801 MakeLibCallOptions CallOptions;
5802 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5803 CallOptions, SDLoc(Op)).first;
5804 }
5805
5806 return Op;
5807}
5808
5809SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5810 // Implement fcopysign with a fabs and a conditional fneg.
5811 SDValue Tmp0 = Op.getOperand(0);
5812 SDValue Tmp1 = Op.getOperand(1);
5813 SDLoc dl(Op);
5814 EVT VT = Op.getValueType();
5815 EVT SrcVT = Tmp1.getValueType();
5816 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5817 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5818 bool UseNEON = !InGPR && Subtarget->hasNEON();
5819
5820 if (UseNEON) {
5821 // Use VBSL to copy the sign bit.
5822 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5823 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5824 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5825 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5826 if (VT == MVT::f64)
5827 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5828 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5829 DAG.getConstant(32, dl, MVT::i32));
5830 else /*if (VT == MVT::f32)*/
5831 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5832 if (SrcVT == MVT::f32) {
5833 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5834 if (VT == MVT::f64)
5835 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5836 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5837 DAG.getConstant(32, dl, MVT::i32));
5838 } else if (VT == MVT::f32)
5839 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5840 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5841 DAG.getConstant(32, dl, MVT::i32));
5842 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5843 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5844
5845 SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff),
5846 dl, MVT::i32);
5847 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5848 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5849 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5850
5851 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5852 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5853 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5854 if (VT == MVT::f32) {
5855 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5856 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5857 DAG.getConstant(0, dl, MVT::i32));
5858 } else {
5859 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5860 }
5861
5862 return Res;
5863 }
5864
5865 // Bitcast operand 1 to i32.
5866 if (SrcVT == MVT::f64)
5867 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5868 Tmp1).getValue(1);
5869 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5870
5871 // Or in the signbit with integer operations.
5872 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5873 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5874 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5875 if (VT == MVT::f32) {
5876 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5877 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5878 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5879 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5880 }
5881
5882 // f64: Or the high part with signbit and then combine two parts.
5883 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5884 Tmp0);
5885 SDValue Lo = Tmp0.getValue(0);
5886 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5887 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5888 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5889}
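// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// The integer (non-NEON) f32 path above, expressed on raw IEEE-754 bit
// patterns: keep the magnitude bits of the first operand and the sign bit of
// the second. The helper name is hypothetical.
static uint32_t exampleCopySignBits(uint32_t MagBits, uint32_t SgnBits) {
  return (MagBits & 0x7fffffffu) | (SgnBits & 0x80000000u);
}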
5890
5891SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5892 MachineFunction &MF = DAG.getMachineFunction();
5893 MachineFrameInfo &MFI = MF.getFrameInfo();
5894 MFI.setReturnAddressIsTaken(true);
5895
5896 EVT VT = Op.getValueType();
5897 SDLoc dl(Op);
5898 unsigned Depth = Op.getConstantOperandVal(0);
5899 if (Depth) {
5900 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5901 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5902 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5903 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5904 MachinePointerInfo());
5905 }
5906
5907 // Return LR, which contains the return address. Mark it an implicit live-in.
5908 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5909 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5910}
5911
5912SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5913 const ARMBaseRegisterInfo &ARI =
5914 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5915 MachineFunction &MF = DAG.getMachineFunction();
5916 MachineFrameInfo &MFI = MF.getFrameInfo();
5917 MFI.setFrameAddressIsTaken(true);
5918
5919 EVT VT = Op.getValueType();
5920 SDLoc dl(Op); // FIXME probably not meaningful
5921 unsigned Depth = Op.getConstantOperandVal(0);
5922 Register FrameReg = ARI.getFrameRegister(MF);
5923 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5924 while (Depth--)
5925 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5926 MachinePointerInfo());
5927 return FrameAddr;
5928}
5929
5930// FIXME? Maybe this could be a TableGen attribute on some registers and
5931// this table could be generated automatically from RegInfo.
5932Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5933 const MachineFunction &MF) const {
5934 return StringSwitch<Register>(RegName)
5935 .Case("sp", ARM::SP)
5936 .Default(Register());
5937}
5938
5939// Result is 64 bit value so split into two 32 bit values and return as a
5940// pair of values.
5941static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
5942 SelectionDAG &DAG) {
5943 SDLoc DL(N);
5944
5945 // This function is only supposed to be called for i64 type destination.
5946 assert(N->getValueType(0) == MVT::i64
5947 && "ExpandREAD_REGISTER called for non-i64 type result.");
5948
5949 SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
5950 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5951 N->getOperand(0),
5952 N->getOperand(1));
5953
5954 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5955 Read.getValue(1)));
5956 Results.push_back(Read.getValue(2)); // Chain
5957}
5958
5959/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5960/// When \p DstVT, the destination type of \p BC, is on the vector
5961/// register bank and the source of bitcast, \p Op, operates on the same bank,
5962/// it might be possible to combine them, such that everything stays on the
5963/// vector register bank.
5964 /// \returns The node that would replace \p BC, if the combine
5965/// is possible.
5966static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
5967 SelectionDAG &DAG) {
5968 SDValue Op = BC->getOperand(0);
5969 EVT DstVT = BC->getValueType(0);
5970
5971 // The only vector instruction that can produce a scalar (remember,
5972 // since the bitcast was about to be turned into VMOVDRR, the source
5973 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5974 // Moreover, we can do this combine only if there is one use.
5975 // Finally, if the destination type is not a vector, there is not
5976 // much point on forcing everything on the vector bank.
5977 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5978 !Op.hasOneUse())
5979 return SDValue();
5980
5981 // If the index is not constant, we will introduce an additional
5982 // multiply that will stick.
5983 // Give up in that case.
5984 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5985 if (!Index)
5986 return SDValue();
5987 unsigned DstNumElt = DstVT.getVectorNumElements();
5988
5989 // Compute the new index.
5990 const APInt &APIntIndex = Index->getAPIntValue();
5991 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
5992 NewIndex *= APIntIndex;
5993 // Check if the new constant index fits into i32.
5994 if (NewIndex.getBitWidth() > 32)
5995 return SDValue();
5996
5997 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
5998 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
5999 SDLoc dl(Op);
6000 SDValue ExtractSrc = Op.getOperand(0);
6001 EVT VecVT = EVT::getVectorVT(
6002 *DAG.getContext(), DstVT.getScalarType(),
6003 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6004 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6005 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6006 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6007}
6008
6009/// ExpandBITCAST - If the target supports VFP, this function is called to
6010/// expand a bit convert where either the source or destination type is i64 to
6011/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6012/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6013/// vectors), since the legalizer won't know what to do with that.
6014SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6015 const ARMSubtarget *Subtarget) const {
6016 SDLoc dl(N);
6017 SDValue Op = N->getOperand(0);
6018
6019 // This function is only supposed to be called for i16 and i64 types, either
6020 // as the source or destination of the bit convert.
6021 EVT SrcVT = Op.getValueType();
6022 EVT DstVT = N->getValueType(0);
6023
6024 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6025 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6026 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6027 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6028
6029 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6030 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6031 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6032 Op = DAG.getBitcast(MVT::f16, Op);
6033 return DAG.getNode(
6034 ISD::TRUNCATE, SDLoc(N), DstVT,
6035 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6036 }
6037
6038 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6039 return SDValue();
6040
6041 // Turn i64->f64 into VMOVDRR.
6042 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6043 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6044 // if we can combine the bitcast with its source.
6045 if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
6046 return Val;
6047 SDValue Lo, Hi;
6048 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6049 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6050 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6051 }
6052
6053 // Turn f64->i64 into VMOVRRD.
6054 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6055 SDValue Cvt;
6056 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6057 SrcVT.getVectorNumElements() > 1)
6058 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6059 DAG.getVTList(MVT::i32, MVT::i32),
6060 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6061 else
6062 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6063 DAG.getVTList(MVT::i32, MVT::i32), Op);
6064 // Merge the pieces into a single i64 value.
6065 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6066 }
6067
6068 return SDValue();
6069}
6070
6071/// getZeroVector - Returns a vector of specified type with all zero elements.
6072/// Zero vectors are used to represent vector negation and in those cases
6073/// will be implemented with the NEON VNEG instruction. However, VNEG does
6074/// not support i64 elements, so sometimes the zero vectors will need to be
6075/// explicitly constructed. Regardless, use a canonical VMOV to create the
6076/// zero vector.
6077static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6078 assert(VT.isVector() && "Expected a vector type");
6079 // The canonical modified immediate encoding of a zero vector is....0!
6080 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6081 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6082 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6083 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6084}
6085
6086/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6087/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6088SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6089 SelectionDAG &DAG) const {
6090 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6091 EVT VT = Op.getValueType();
6092 unsigned VTBits = VT.getSizeInBits();
6093 SDLoc dl(Op);
6094 SDValue ShOpLo = Op.getOperand(0);
6095 SDValue ShOpHi = Op.getOperand(1);
6096 SDValue ShAmt = Op.getOperand(2);
6097 SDValue ARMcc;
6098 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6099
6100 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6101
6102 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6103 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6104 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6105 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6106 DAG.getConstant(VTBits, dl, MVT::i32));
6107 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6108 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6109 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6110 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6111 ISD::SETGE, ARMcc, DAG, dl);
6112 SDValue Lo =
6113 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6114
6115 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6116 SDValue HiBigShift = Opc == ISD::SRA
6117 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6118 DAG.getConstant(VTBits - 1, dl, VT))
6119 : DAG.getConstant(0, dl, VT);
6120 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6121 ISD::SETGE, ARMcc, DAG, dl);
6122 SDValue Hi =
6123 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6124
6125 SDValue Ops[2] = { Lo, Hi };
6126 return DAG.getMergeValues(Ops, dl);
6127}
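// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Reference model of the SRA_PARTS lowering above for shift amounts in 1..63:
// the CMOVs select between the "small shift" funnel and the "big shift" that
// only uses the high half. The helper name is hypothetical.
static uint64_t exampleSra64(uint64_t Val, unsigned Amt) {
  uint32_t Lo = (uint32_t)Val, Hi = (uint32_t)(Val >> 32);
  uint32_t LoRes = (Amt < 32)
                       ? (Lo >> Amt) | (Hi << (32 - Amt))       // LoSmallShift
                       : (uint32_t)((int32_t)Hi >> (Amt - 32)); // LoBigShift
  uint32_t HiRes = (Amt < 32) ? (uint32_t)((int32_t)Hi >> Amt)  // HiSmallShift
                              : (uint32_t)((int32_t)Hi >> 31);  // HiBigShift
  return ((uint64_t)HiRes << 32) | LoRes;
}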
6128
6129/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6130/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6131SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6132 SelectionDAG &DAG) const {
6133 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6134 EVT VT = Op.getValueType();
6135 unsigned VTBits = VT.getSizeInBits();
6136 SDLoc dl(Op);
6137 SDValue ShOpLo = Op.getOperand(0);
6138 SDValue ShOpHi = Op.getOperand(1);
6139 SDValue ShAmt = Op.getOperand(2);
6140 SDValue ARMcc;
6141
6142 assert(Op.getOpcode() == ISD::SHL_PARTS);
6143 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6144 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6145 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6146 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6147 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6148
6149 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6150 DAG.getConstant(VTBits, dl, MVT::i32));
6151 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6152 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6153 ISD::SETGE, ARMcc, DAG, dl);
6154 SDValue Hi =
6155 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6156
6157 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6158 ISD::SETGE, ARMcc, DAG, dl);
6159 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6160 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6161 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6162
6163 SDValue Ops[2] = { Lo, Hi };
6164 return DAG.getMergeValues(Ops, dl);
6165}
6166
6167SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6168 SelectionDAG &DAG) const {
6169 // The rounding mode is in bits 23:22 of the FPSCR.
6170 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6171 // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3
6172 // so that the shift + and get folded into a bitfield extract.
6173 SDLoc dl(Op);
6174 SDValue Chain = Op.getOperand(0);
6175 SDValue Ops[] = {Chain,
6176 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6177
6178 SDValue FPSCR =
6179 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6180 Chain = FPSCR.getValue(1);
6181 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6182 DAG.getConstant(1U << 22, dl, MVT::i32));
6183 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6184 DAG.getConstant(22, dl, MVT::i32));
6185 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6186 DAG.getConstant(3, dl, MVT::i32));
6187 return DAG.getMergeValues({And, Chain}, dl);
6188}
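// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Scalar model of the FLT_ROUNDS computation above: adding 1 << 22 bumps the
// two-bit field at FPSCR[23:22] by one modulo 4, giving the 0->1, 1->2, 2->3,
// 3->0 mapping. The helper name is hypothetical.
static unsigned exampleFltRounds(unsigned FPSCRVal) {
  return ((FPSCRVal + (1u << 22)) >> 22) & 3u;
}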
6189
6190SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6191 SelectionDAG &DAG) const {
6192 SDLoc DL(Op);
6193 SDValue Chain = Op->getOperand(0);
6194 SDValue RMValue = Op->getOperand(1);
6195
6196 // The rounding mode is in bits 23:22 of the FPSCR.
6197 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6198 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6199 // (((arg - 1) & 3) << 22).
6200 //
6201 // It is expected that the argument of llvm.set.rounding is within the
6202 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is the
6203 // responsibility of the code that generated the llvm.set.rounding call to
6204 // ensure this condition.
6205
6206 // Calculate new value of FPSCR[23:22].
6207 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6208 DAG.getConstant(1, DL, MVT::i32));
6209 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6210 DAG.getConstant(0x3, DL, MVT::i32));
6211 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6212 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6213
6214 // Get current value of FPSCR.
6215 SDValue Ops[] = {Chain,
6216 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6217 SDValue FPSCR =
6218 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6219 Chain = FPSCR.getValue(1);
6220 FPSCR = FPSCR.getValue(0);
6221
6222 // Put new rounding mode into FPSCR[23:22].
6223 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6224 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6225 DAG.getConstant(RMMask, DL, MVT::i32));
6226 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6227 SDValue Ops2[] = {
6228 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6229 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6230}
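// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// Scalar model of the field computed above: ((arg - 1) & 3) maps the
// llvm.set.rounding argument 0,1,2,3 to the ARM encoding 3,0,1,2, and the
// shift places it at FPSCR[23:22]. The helper name is hypothetical.
static unsigned exampleSetRoundingField(unsigned LlvmRM) {
  return ((LlvmRM - 1u) & 3u) << 22;
}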
6231
6232SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6233 SelectionDAG &DAG) const {
6234 SDLoc DL(Op);
6235 SDValue Chain = Op->getOperand(0);
6236 SDValue Mode = Op->getOperand(1);
6237
6238 // Generate nodes to build:
6239 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6240 SDValue Ops[] = {Chain,
6241 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6242 SDValue FPSCR =
6243 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6244 Chain = FPSCR.getValue(1);
6245 FPSCR = FPSCR.getValue(0);
6246
6247 SDValue FPSCRMasked =
6248 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6249 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6250 SDValue InputMasked =
6251 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6252 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6253 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6254
6255 SDValue Ops2[] = {
6256 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6257 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6258}
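// [Editor's sketch -- illustrative only, not part of ARMISelLowering.cpp.]
// The merge built above, as plain integer arithmetic: status flags are kept
// from the old FPSCR while every control bit comes from the new mode word.
// The helper name is hypothetical and StatusBits stands for ARM::FPStatusBits.
static uint32_t exampleSetFPMode(uint32_t OldFPSCR, uint32_t Mode,
                                 uint32_t StatusBits) {
  return (OldFPSCR & StatusBits) | (Mode & ~StatusBits);
}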
6259
6260SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6261 SelectionDAG &DAG) const {
6262 SDLoc DL(Op);
6263 SDValue Chain = Op->getOperand(0);
6264
6265 // To get the default FP mode all control bits are cleared:
6266 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6267 SDValue Ops[] = {Chain,
6268 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6269 SDValue FPSCR =
6270 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6271 Chain = FPSCR.getValue(1);
6272 FPSCR = FPSCR.getValue(0);
6273
6274 SDValue FPSCRMasked = DAG.getNode(
6275 ISD::AND, DL, MVT::i32, FPSCR,
6276 DAG.getConstant(ARM::FPStatusBits | ARM::FPReservedBits, DL, MVT::i32));
6277 SDValue Ops2[] = {Chain,
6278 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6279 FPSCRMasked};
6280 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6281}
6282
6283static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
6284 const ARMSubtarget *ST) {
6285 SDLoc dl(N);
6286 EVT VT = N->getValueType(0);
6287 if (VT.isVector() && ST->hasNEON()) {
6288
6289 // Compute the least significant set bit: LSB = X & -X
6290 SDValue X = N->getOperand(0);
6291 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6292 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6293
6294 EVT ElemTy = VT.getVectorElementType();
6295
6296 if (ElemTy == MVT::i8) {
6297 // Compute with: cttz(x) = ctpop(lsb - 1)
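// For example x = 0b00101000: lsb = 0b00001000, lsb - 1 = 0b00000111, and
// ctpop(0b00000111) = 3 = cttz(x).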
6298 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6299 DAG.getTargetConstant(1, dl, ElemTy));
6300 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6301 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6302 }
6303
6304 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6305 (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) {
6306 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
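// LSB has exactly one bit set, at position cttz(x), so its leading-zero
// count is (width - 1) - cttz(x).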
6307 unsigned NumBits = ElemTy.getSizeInBits();
6308 SDValue WidthMinus1 =
6309 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6310 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6311 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6312 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6313 }
6314
6315 // Compute with: cttz(x) = ctpop(lsb - 1)
6316
6317 // Compute LSB - 1.
6318 SDValue Bits;
6319 if (ElemTy == MVT::i64) {
6320 // Load constant 0xffff'ffff'ffff'ffff to register.
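// Adding all-ones is equivalent to subtracting one. The all-ones pattern
// is used because the 64-bit VMOV modified immediate can encode it (every
// byte is 0x00 or 0xff, giving the encoding 0x1eff), whereas the constant
// 1 has no 64-bit modified-immediate encoding.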
6321 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6322 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6323 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6324 } else {
6325 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6326 DAG.getTargetConstant(1, dl, ElemTy));
6327 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6328 }
6329 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6330 }
6331
6332 if (!ST->hasV6T2Ops())
6333 return SDValue();
6334
6335 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6336 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6337}
6338
6339static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
6340 const ARMSubtarget *ST) {
6341 EVT VT = N->getValueType(0);
6342 SDLoc DL(N);
6343
6344 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6345 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6346 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6347 "Unexpected type for custom ctpop lowering");
6348
6349 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6350 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6351 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6352 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6353
6354 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
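// For example, a v2i64 result starts from a v16i8 popcount and takes three
// pairwise-add steps: v16i8 -> v8i16 -> v4i32 -> v2i64.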
6355 unsigned EltSize = 8;
6356 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6357 while (EltSize != VT.getScalarSizeInBits()) {
6358 SmallVector<SDValue, 8> Ops;
6359 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6360 TLI.getPointerTy(DAG.getDataLayout())));
6361 Ops.push_back(Res);
6362
6363 EltSize *= 2;
6364 NumElts /= 2;
6365 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6366 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6367 }
6368
6369 return Res;
6370}
6371
6372/// getVShiftImm - Check if this is a valid build_vector for the immediate
6373/// operand of a vector shift operation, where all the elements of the
6374/// build_vector must have the same constant integer value.
6375static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6376 // Ignore bit_converts.
6377 while (Op.getOpcode() == ISD::BITCAST)
6378 Op = Op.getOperand(0);
6379 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
6380 APInt SplatBits, SplatUndef;
6381 unsigned SplatBitSize;
6382 bool HasAnyUndefs;
6383 if (!BVN ||
6384 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6385 ElementBits) ||
6386 SplatBitSize > ElementBits)
6387 return false;
6388 Cnt = SplatBits.getSExtValue();
6389 return true;
6390}
6391
6392/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6393/// operand of a vector shift left operation. That value must be in the range:
6394/// 0 <= Value < ElementBits for a left shift; or
6395/// 0 <= Value <= ElementBits for a long left shift.
6396static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6397 assert(VT.isVector() && "vector shift count is not a vector type");
6398 int64_t ElementBits = VT.getScalarSizeInBits();
6399 if (!getVShiftImm(Op, ElementBits, Cnt))
6400 return false;
6401 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6402}
6403
6404/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6405/// operand of a vector shift right operation. For a shift opcode, the value
6406/// is positive, but for an intrinsic the shift count must be negative. The
6407/// absolute value must be in the range:
6408/// 1 <= |Value| <= ElementBits for a right shift; or
6409/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6410static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6411 int64_t &Cnt) {
6412 assert(VT.isVector() && "vector shift count is not a vector type");
6413 int64_t ElementBits = VT.getScalarSizeInBits();
6414 if (!getVShiftImm(Op, ElementBits, Cnt))
6415 return false;
6416 if (!isIntrinsic)
6417 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6418 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6419 Cnt = -Cnt;
6420 return true;
6421 }
6422 return false;
6423}
6424
6425static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
6426 const ARMSubtarget *ST) {
6427 EVT VT = N->getValueType(0);
6428 SDLoc dl(N);
6429 int64_t Cnt;
6430
6431 if (!VT.isVector())
6432 return SDValue();
6433
6434 // We essentially have two forms here. Shift by an immediate and shift by a
6435 // vector register (there are also shifts by a GPR, but those are just handled
6436 // with a tablegen pattern). We cannot easily match shift by an immediate in
6437 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6438 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6439 // signed or unsigned, and a negative shift indicates a shift right).
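// For example, a uniform SRL by 3 becomes a VSHRuIMM with immediate 3,
// whereas an SRL by a non-constant vector becomes a VSHLu whose per-lane
// shift amounts are negated below.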
6440 if (N->getOpcode() == ISD::SHL) {
6441 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6442 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6443 DAG.getConstant(Cnt, dl, MVT::i32));
6444 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6445 N->getOperand(1));
6446 }
6447
6448 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6449 "unexpected vector shift opcode");
6450
6451 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6452 unsigned VShiftOpc =
6453 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6454 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6455 DAG.getConstant(Cnt, dl, MVT::i32));
6456 }
6457
6458 // Other right shifts we don't have operations for (we use a shift left by a
6459 // negative number).
6460 EVT ShiftVT = N->getOperand(1).getValueType();
6461 SDValue NegatedCount = DAG.getNode(
6462 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6463 unsigned VShiftOpc =
6464 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6465 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6466}
6467
6468static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
6469 const ARMSubtarget *ST) {
6470 EVT VT = N->getValueType(0);
6471 SDLoc dl(N);
6472
6473 // We can get here for a node like i32 = ISD::SHL i32, i64
6474 if (VT != MVT::i64)
6475 return SDValue();
6476
6477 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6478 N->getOpcode() == ISD::SHL) &&
6479 "Unknown shift to lower!");
6480
6481 unsigned ShOpc = N->getOpcode();
6482 if (ST->hasMVEIntegerOps()) {
6483 SDValue ShAmt = N->getOperand(1);
6484 unsigned ShPartsOpc = ARMISD::LSLL;
6485 ConstantSDNode *Con = dyn_cast<ConstantSDNode>(ShAmt);
6486
6487 // If the shift amount is a constant that is zero or at least 32, or is a
6488 // non-constant wider than 64 bits, then do the default optimisation
6489 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6490 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6491 return SDValue();
6492
6493 // Extract the lower 32 bits of the shift amount if it's not an i32
6494 if (ShAmt->getValueType(0) != MVT::i32)
6495 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6496
6497 if (ShOpc == ISD::SRL) {
6498 if (!Con)
6499 // There is no t2LSRLr instruction so negate and perform an lsll if the
6500 // shift amount is in a register, emulating a right shift.
6501 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6502 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6503 else
6504 // Else generate an lsrl on the immediate shift amount
6505 ShPartsOpc = ARMISD::LSRL;
6506 } else if (ShOpc == ISD::SRA)
6507 ShPartsOpc = ARMISD::ASRL;
6508
6509 // Split Lower/Upper 32 bits of the destination/source
6510 SDValue Lo, Hi;
6511 std::tie(Lo, Hi) =
6512 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6513 // Generate the shift operation as computed above
6514 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6515 ShAmt);
6516 // The upper 32 bits come from the second return value of lsll
6517 Hi = SDValue(Lo.getNode(), 1);
6518 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6519 }
6520
6521 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6522 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6523 return SDValue();
6524
6525 // If we are in thumb mode, we don't have RRX.
6526 if (ST->isThumb1Only())
6527 return SDValue();
6528
6529 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6530 SDValue Lo, Hi;
6531 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6532
6533 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6534 // captures the shifted out bit into a carry flag.
6535 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6536 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6537
6538 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6539 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6540
6541 // Merge the pieces into a single i64 value.
6542 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6543}
6544
6545static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG,
6546 const ARMSubtarget *ST) {
6547 bool Invert = false;
6548 bool Swap = false;
6549 unsigned Opc = ARMCC::AL;
6550
6551 SDValue Op0 = Op.getOperand(0);
6552 SDValue Op1 = Op.getOperand(1);
6553 SDValue CC = Op.getOperand(2);
6554 EVT VT = Op.getValueType();
6555 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6556 SDLoc dl(Op);
6557
6558 EVT CmpVT;
6559 if (ST->hasNEON())
6560 CmpVT = VT.changeVectorElementTypeToInteger();
6561 else {
6562 assert(ST->hasMVEIntegerOps() &&
6563 "No hardware support for integer vector comparison!");
6564
6565 if (Op.getValueType().getVectorElementType() != MVT::i1)
6566 return SDValue();
6567
6568 // Make sure we expand floating point setcc to scalar if we do not have
6569 // mve.fp, so that we can handle them from there.
6570 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6571 return SDValue();
6572
6573 CmpVT = VT;
6574 }
6575
6576 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6577 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6578 // Special-case integer 64-bit equality comparisons. They aren't legal,
6579 // but they can be lowered with a few vector instructions.
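// Each i64 lane is compared as two i32 lanes; ANDing the i32 comparison
// result with its VREV64-swapped neighbour leaves a lane all-ones only if
// both halves of the corresponding i64 lane compared equal.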
6580 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6581 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6582 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6583 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6584 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6585 DAG.getCondCode(ISD::SETEQ));
6586 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6587 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6588 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6589 if (SetCCOpcode == ISD::SETNE)
6590 Merged = DAG.getNOT(dl, Merged, CmpVT);
6591 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6592 return Merged;
6593 }
6594
6595 if (CmpVT.getVectorElementType() == MVT::i64)
6596 // 64-bit comparisons are not legal in general.
6597 return SDValue();
6598
6599 if (Op1.getValueType().isFloatingPoint()) {
6600 switch (SetCCOpcode) {
6601 default: llvm_unreachable("Illegal FP comparison");
6602 case ISD::SETUNE:
6603 case ISD::SETNE:
6604 if (ST->hasMVEFloatOps()) {
6605 Opc = ARMCC::NE; break;
6606 } else {
6607 Invert = true; [[fallthrough]];
6608 }
6609 case ISD::SETOEQ:
6610 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6611 case ISD::SETOLT:
6612 case ISD::SETLT: Swap = true; [[fallthrough]];
6613 case ISD::SETOGT:
6614 case ISD::SETGT: Opc = ARMCC::GT; break;
6615 case ISD::SETOLE:
6616 case ISD::SETLE: Swap = true; [[fallthrough]];
6617 case ISD::SETOGE:
6618 case ISD::SETGE: Opc = ARMCC::GE; break;
6619 case ISD::SETUGE: Swap = true; [[fallthrough]];
6620 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6621 case ISD::SETUGT: Swap = true; [[fallthrough]];
6622 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6623 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6624 case ISD::SETONE: {
6625 // Expand this to (OLT | OGT).
6626 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6627 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6628 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6629 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6630 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6631 if (Invert)
6632 Result = DAG.getNOT(dl, Result, VT);
6633 return Result;
6634 }
6635 case ISD::SETUO: Invert = true; [[fallthrough]];
6636 case ISD::SETO: {
6637 // Expand this to (OLT | OGE).
6638 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6639 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6640 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6641 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6642 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6643 if (Invert)
6644 Result = DAG.getNOT(dl, Result, VT);
6645 return Result;
6646 }
6647 }
6648 } else {
6649 // Integer comparisons.
6650 switch (SetCCOpcode) {
6651 default: llvm_unreachable("Illegal integer comparison");
6652 case ISD::SETNE:
6653 if (ST->hasMVEIntegerOps()) {
6654 Opc = ARMCC::NE; break;
6655 } else {
6656 Invert = true; [[fallthrough]];
6657 }
6658 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6659 case ISD::SETLT: Swap = true; [[fallthrough]];
6660 case ISD::SETGT: Opc = ARMCC::GT; break;
6661 case ISD::SETLE: Swap = true; [[fallthrough]];
6662 case ISD::SETGE: Opc = ARMCC::GE; break;
6663 case ISD::SETULT: Swap = true; [[fallthrough]];
6664 case ISD::SETUGT: Opc = ARMCC::HI; break;
6665 case ISD::SETULE: Swap = true; [[fallthrough]];
6666 case ISD::SETUGE: Opc = ARMCC::HS; break;
6667 }
6668
6669 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6670 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6671 SDValue AndOp;
6672 if (ISD::isBuildVectorAllZeros(Op1.getNode()))
6673 AndOp = Op0;
6674 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6675 AndOp = Op1;
6676
6677 // Ignore bitconvert.
6678 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6679 AndOp = AndOp.getOperand(0);
6680
6681 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6682 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6683 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6684 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
6685 if (!Invert)
6686 Result = DAG.getNOT(dl, Result, VT);
6687 return Result;
6688 }
6689 }
6690 }
6691
6692 if (Swap)
6693 std::swap(Op0, Op1);
6694
6695 // If one of the operands is a constant vector zero, attempt to fold the
6696 // comparison to a specialized compare-against-zero form.
6697 if (ISD::isBuildVectorAllZeros(Op0.getNode()) &&
6698 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6699 Opc == ARMCC::NE)) {
6700 if (Opc == ARMCC::GE)
6701 Opc = ARMCC::LE;
6702 else if (Opc == ARMCC::GT)
6703 Opc = ARMCC::LT;
6704 std::swap(Op0, Op1);
6705 }
6706
6707 SDValue Result;
6708 if (ISD::isBuildVectorAllZeros(Op1.getNode()) &&
6709 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6710 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6711 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6712 DAG.getConstant(Opc, dl, MVT::i32));
6713 else
6714 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6715 DAG.getConstant(Opc, dl, MVT::i32));
6716
6717 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6718
6719 if (Invert)
6720 Result = DAG.getNOT(dl, Result, VT);
6721
6722 return Result;
6723}
6724
6725static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
6726 SDValue LHS = Op.getOperand(0);
6727 SDValue RHS = Op.getOperand(1);
6728 SDValue Carry = Op.getOperand(2);
6729 SDValue Cond = Op.getOperand(3);
6730 SDLoc DL(Op);
6731
6732 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6733
6734 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6735 // have to invert the carry first.
6736 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
6737 DAG.getConstant(1, DL, MVT::i32), Carry);
6738 // This converts the boolean value carry into the carry flag.
6739 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
6740
6741 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6742 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry);
6743
6744 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6745 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
6746 SDValue ARMcc = DAG.getConstant(
6747 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6748 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6749 Cmp.getValue(1));
6750}
6751
6752/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6753/// valid vector constant for a NEON or MVE instruction with a "modified
6754/// immediate" operand (e.g., VMOV). If so, return the encoded value.
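/// For example, a v4i32 splat of 0x0000ff00 matches the "only one byte is
/// nonzero" 32-bit form below and is encoded with Cmode=001x and Imm=0xff.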
6755static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6756 unsigned SplatBitSize, SelectionDAG &DAG,
6757 const SDLoc &dl, EVT &VT, EVT VectorVT,
6758 VMOVModImmType type) {
6759 unsigned OpCmode, Imm;
6760 bool is128Bits = VectorVT.is128BitVector();
6761
6762 // SplatBitSize is set to the smallest size that splats the vector, so a
6763 // zero vector will always have SplatBitSize == 8. However, NEON modified
6764 // immediate instructions other than VMOV do not support the 8-bit encoding
6765 // of a zero vector, and the default encoding of zero is supposed to be the
6766 // 32-bit version.
6767 if (SplatBits == 0)
6768 SplatBitSize = 32;
6769
6770 switch (SplatBitSize) {
6771 case 8:
6772 if (type != VMOVModImm)
6773 return SDValue();
6774 // Any 1-byte value is OK. Op=0, Cmode=1110.
6775 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6776 OpCmode = 0xe;
6777 Imm = SplatBits;
6778 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6779 break;
6780
6781 case 16:
6782 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6783 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6784 if ((SplatBits & ~0xff) == 0) {
6785 // Value = 0x00nn: Op=x, Cmode=100x.
6786 OpCmode = 0x8;
6787 Imm = SplatBits;
6788 break;
6789 }
6790 if ((SplatBits & ~0xff00) == 0) {
6791 // Value = 0xnn00: Op=x, Cmode=101x.
6792 OpCmode = 0xa;
6793 Imm = SplatBits >> 8;
6794 break;
6795 }
6796 return SDValue();
6797
6798 case 32:
6799 // NEON's 32-bit VMOV supports splat values where:
6800 // * only one byte is nonzero, or
6801 // * the least significant byte is 0xff and the second byte is nonzero, or
6802 // * the least significant 2 bytes are 0xff and the third is nonzero.
6803 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6804 if ((SplatBits & ~0xff) == 0) {
6805 // Value = 0x000000nn: Op=x, Cmode=000x.
6806 OpCmode = 0;
6807 Imm = SplatBits;
6808 break;
6809 }
6810 if ((SplatBits & ~0xff00) == 0) {
6811 // Value = 0x0000nn00: Op=x, Cmode=001x.
6812 OpCmode = 0x2;
6813 Imm = SplatBits >> 8;
6814 break;
6815 }
6816 if ((SplatBits & ~0xff0000) == 0) {
6817 // Value = 0x00nn0000: Op=x, Cmode=010x.
6818 OpCmode = 0x4;
6819 Imm = SplatBits >> 16;
6820 break;
6821 }
6822 if ((SplatBits & ~0xff000000) == 0) {
6823 // Value = 0xnn000000: Op=x, Cmode=011x.
6824 OpCmode = 0x6;
6825 Imm = SplatBits >> 24;
6826 break;
6827 }
6828
6829 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6830 if (type == OtherModImm) return SDValue();
6831
6832 if ((SplatBits & ~0xffff) == 0 &&
6833 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6834 // Value = 0x0000nnff: Op=x, Cmode=1100.
6835 OpCmode = 0xc;
6836 Imm = SplatBits >> 8;
6837 break;
6838 }
6839
6840 // cmode == 0b1101 is not supported for MVE VMVN
6841 if (type == MVEVMVNModImm)
6842 return SDValue();
6843
6844 if ((SplatBits & ~0xffffff) == 0 &&
6845 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6846 // Value = 0x00nnffff: Op=x, Cmode=1101.
6847 OpCmode = 0xd;
6848 Imm = SplatBits >> 16;
6849 break;
6850 }
6851
6852 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6853 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6854 // VMOV.I32. A (very) minor optimization would be to replicate the value
6855 // and fall through here to test for a valid 64-bit splat. But, then the
6856 // caller would also need to check and handle the change in size.
6857 return SDValue();
6858
6859 case 64: {
6860 if (type != VMOVModImm)
6861 return SDValue();
6862 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6863 uint64_t BitMask = 0xff;
6864 unsigned ImmMask = 1;
6865 Imm = 0;
6866 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6867 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6868 Imm |= ImmMask;
6869 } else if ((SplatBits & BitMask) != 0) {
6870 return SDValue();
6871 }
6872 BitMask <<= 8;
6873 ImmMask <<= 1;
6874 }
6875
6876 // Op=1, Cmode=1110.
6877 OpCmode = 0x1e;
6878 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6879 break;
6880 }
6881
6882 default:
6883 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6884 }
6885
6886 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6887 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6888}
6889
6890SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6891 const ARMSubtarget *ST) const {
6892 EVT VT = Op.getValueType();
6893 bool IsDouble = (VT == MVT::f64);
6894 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6895 const APFloat &FPVal = CFP->getValueAPF();
6896
6897 // Prevent floating-point constants from using literal loads
6898 // when execute-only is enabled.
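// (Literal loads would read the constant from the code section, which an
// execute-only image is not allowed to read as data.)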
6899 if (ST->genExecuteOnly()) {
6900 // We shouldn't trigger this for v6m execute-only
6901 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6902 "Unexpected architecture");
6903
6904 // If we can represent the constant as an immediate, don't lower it
6905 if (isFPImmLegal(FPVal, VT))
6906 return Op;
6907 // Otherwise, construct as integer, and move to float register
6908 APInt INTVal = FPVal.bitcastToAPInt();
6909 SDLoc DL(CFP);
6910 switch (VT.getSimpleVT().SimpleTy) {
6911 default:
6912 llvm_unreachable("Unknown floating point type!");
6913 break;
6914 case MVT::f64: {
6915 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6916 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6917 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6918 }
6919 case MVT::f32:
6920 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6921 DAG.getConstant(INTVal, DL, MVT::i32));
6922 }
6923 }
6924
6925 if (!ST->hasVFP3Base())
6926 return SDValue();
6927
6928 // Use the default (constant pool) lowering for double constants when we have
6929 // an SP-only FPU
6930 if (IsDouble && !Subtarget->hasFP64())
6931 return SDValue();
6932
6933 // Try splatting with a VMOV.f32...
6934 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6935
6936 if (ImmVal != -1) {
6937 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6938 // We have code in place to select a valid ConstantFP already, no need to
6939 // do any mangling.
6940 return Op;
6941 }
6942
6943 // It's a float and we are trying to use NEON operations where
6944 // possible. Lower it to a splat followed by an extract.
6945 SDLoc DL(Op);
6946 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6947 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6948 NewVal);
6949 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6950 DAG.getConstant(0, DL, MVT::i32));
6951 }
6952
6953 // The rest of our options are NEON only, make sure that's allowed before
6954 // proceeding..
6955 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6956 return SDValue();
6957
6958 EVT VMovVT;
6959 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6960
6961 // It wouldn't really be worth bothering for doubles except for one very
6962 // important value, which does happen to match: 0.0. So make sure we don't do
6963 // anything stupid.
6964 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6965 return SDValue();
6966
6967 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6968 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6969 VMovVT, VT, VMOVModImm);
6970 if (NewVal != SDValue()) {
6971 SDLoc DL(Op);
6972 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6973 NewVal);
6974 if (IsDouble)
6975 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6976
6977 // It's a float: cast and extract a vector element.
6978 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6979 VecConstant);
6980 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6981 DAG.getConstant(0, DL, MVT::i32));
6982 }
6983
6984 // Finally, try a VMVN.i32
6985 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6986 VT, VMVNModImm);
6987 if (NewVal != SDValue()) {
6988 SDLoc DL(Op);
6989 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
6990
6991 if (IsDouble)
6992 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6993
6994 // It's a float: cast and extract a vector element.
6995 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6996 VecConstant);
6997 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6998 DAG.getConstant(0, DL, MVT::i32));
6999 }
7000
7001 return SDValue();
7002}
7003
7004// Check if a VEXT instruction can handle the shuffle mask when the
7005// vector sources of the shuffle are the same.
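// For example, for v8i8 the mask <3, 4, 5, 6, 7, 0, 1, 2> is handled with
// Imm = 3; the indices wrap around because both sources are the same vector.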
7006static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7007 unsigned NumElts = VT.getVectorNumElements();
7008
7009 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7010 if (M[0] < 0)
7011 return false;
7012
7013 Imm = M[0];
7014
7015 // If this is a VEXT shuffle, the immediate value is the index of the first
7016 // element. The other shuffle indices must be the successive elements after
7017 // the first one.
7018 unsigned ExpectedElt = Imm;
7019 for (unsigned i = 1; i < NumElts; ++i) {
7020 // Increment the expected index. If it wraps around, just follow it
7021 // back to index zero and keep going.
7022 ++ExpectedElt;
7023 if (ExpectedElt == NumElts)
7024 ExpectedElt = 0;
7025
7026 if (M[i] < 0) continue; // ignore UNDEF indices
7027 if (ExpectedElt != static_cast<unsigned>(M[i]))
7028 return false;
7029 }
7030
7031 return true;
7032}
7033
7034static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7035 bool &ReverseVEXT, unsigned &Imm) {
7036 unsigned NumElts = VT.getVectorNumElements();
7037 ReverseVEXT = false;
7038
7039 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7040 if (M[0] < 0)
7041 return false;
7042
7043 Imm = M[0];
7044
7045 // If this is a VEXT shuffle, the immediate value is the index of the first
7046 // element. The other shuffle indices must be the successive elements after
7047 // the first one.
7048 unsigned ExpectedElt = Imm;
7049 for (unsigned i = 1; i < NumElts; ++i) {
7050 // Increment the expected index. If it wraps around, it may still be
7051 // a VEXT but the source vectors must be swapped.
7052 ExpectedElt += 1;
7053 if (ExpectedElt == NumElts * 2) {
7054 ExpectedElt = 0;
7055 ReverseVEXT = true;
7056 }
7057
7058 if (M[i] < 0) continue; // ignore UNDEF indices
7059 if (ExpectedElt != static_cast<unsigned>(M[i]))
7060 return false;
7061 }
7062
7063 // Adjust the index value if the source operands will be swapped.
7064 if (ReverseVEXT)
7065 Imm -= NumElts;
7066
7067 return true;
7068}
7069
7070static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7071 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7072 // range, then 0 is placed into the resulting vector. So pretty much any mask
7073 // of 8 elements can work here.
7074 return VT == MVT::v8i8 && M.size() == 8;
7075}
7076
7077static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7078 unsigned Index) {
7079 if (Mask.size() == Elements * 2)
7080 return Index / Elements;
7081 return Mask[Index] == 0 ? 0 : 1;
7082}
7083
7084// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7085// checking that pairs of elements in the shuffle mask represent the same index
7086// in each vector, incrementing the expected index by 2 at each step.
7087// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7088// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7089// v2={e,f,g,h}
7090// WhichResult gives the offset for each element in the mask based on which
7091// of the two results it belongs to.
7092//
7093// The transpose can be represented either as:
7094// result1 = shufflevector v1, v2, result1_shuffle_mask
7095// result2 = shufflevector v1, v2, result2_shuffle_mask
7096// where v1/v2 and the shuffle masks have the same number of elements
7097// (here WhichResult (see below) indicates which result is being checked)
7098//
7099// or as:
7100// results = shufflevector v1, v2, shuffle_mask
7101// where both results are returned in one vector and the shuffle mask has twice
7102// as many elements as v1/v2 (here WhichResult will always be 0 if true). Here
7103// we want to check the low half and high half of the shuffle mask as if it
7104// were the other case.
7105static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7106 unsigned EltSz = VT.getScalarSizeInBits();
7107 if (EltSz == 64)
7108 return false;
7109
7110 unsigned NumElts = VT.getVectorNumElements();
7111 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7112 return false;
7113
7114 // If the mask is twice as long as the input vector then we need to check the
7115 // upper and lower parts of the mask with a matching value for WhichResult
7116 // FIXME: A mask with only even values will be rejected in case the first
7117 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7118 // M[0] is used to determine WhichResult
7119 for (unsigned i = 0; i < M.size(); i += NumElts) {
7120 WhichResult = SelectPairHalf(NumElts, M, i);
7121 for (unsigned j = 0; j < NumElts; j += 2) {
7122 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7123 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7124 return false;
7125 }
7126 }
7127
7128 if (M.size() == NumElts*2)
7129 WhichResult = 0;
7130
7131 return true;
7132}
7133
7134/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7135/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7136/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7137static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7138 unsigned EltSz = VT.getScalarSizeInBits();
7139 if (EltSz == 64)
7140 return false;
7141
7142 unsigned NumElts = VT.getVectorNumElements();
7143 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7144 return false;
7145
7146 for (unsigned i = 0; i < M.size(); i += NumElts) {
7147 WhichResult = SelectPairHalf(NumElts, M, i);
7148 for (unsigned j = 0; j < NumElts; j += 2) {
7149 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7150 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7151 return false;
7152 }
7153 }
7154
7155 if (M.size() == NumElts*2)
7156 WhichResult = 0;
7157
7158 return true;
7159}
7160
7161// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7162// that the mask elements are either all even and in steps of size 2 or all odd
7163// and in steps of size 2.
7164// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7165// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7166// v2={e,f,g,h}
7167// Requires similar checks to those of isVTRNMask with respect to how the
7168// results are returned.
7169static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7170 unsigned EltSz = VT.getScalarSizeInBits();
7171 if (EltSz == 64)
7172 return false;
7173
7174 unsigned NumElts = VT.getVectorNumElements();
7175 if (M.size() != NumElts && M.size() != NumElts*2)
7176 return false;
7177
7178 for (unsigned i = 0; i < M.size(); i += NumElts) {
7179 WhichResult = SelectPairHalf(NumElts, M, i);
7180 for (unsigned j = 0; j < NumElts; ++j) {
7181 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7182 return false;
7183 }
7184 }
7185
7186 if (M.size() == NumElts*2)
7187 WhichResult = 0;
7188
7189 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7190 if (VT.is64BitVector() && EltSz == 32)
7191 return false;
7192
7193 return true;
7194}
7195
7196/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7197/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7198/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
7199static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7200 unsigned EltSz = VT.getScalarSizeInBits();
7201 if (EltSz == 64)
7202 return false;
7203
7204 unsigned NumElts = VT.getVectorNumElements();
7205 if (M.size() != NumElts && M.size() != NumElts*2)
7206 return false;
7207
7208 unsigned Half = NumElts / 2;
7209 for (unsigned i = 0; i < M.size(); i += NumElts) {
7210 WhichResult = SelectPairHalf(NumElts, M, i);
7211 for (unsigned j = 0; j < NumElts; j += Half) {
7212 unsigned Idx = WhichResult;
7213 for (unsigned k = 0; k < Half; ++k) {
7214 int MIdx = M[i + j + k];
7215 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7216 return false;
7217 Idx += 2;
7218 }
7219 }
7220 }
7221
7222 if (M.size() == NumElts*2)
7223 WhichResult = 0;
7224
7225 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7226 if (VT.is64BitVector() && EltSz == 32)
7227 return false;
7228
7229 return true;
7230}
7231
7232// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7233// that pairs of elements of the shufflemask represent the same index in each
7234// vector incrementing sequentially through the vectors.
7235// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7236// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7237// v2={e,f,g,h}
7238// Requires similar checks to those of isVTRNMask with respect to how the
7239// results are returned.
7240static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7241 unsigned EltSz = VT.getScalarSizeInBits();
7242 if (EltSz == 64)
7243 return false;
7244
7245 unsigned NumElts = VT.getVectorNumElements();
7246 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7247 return false;
7248
7249 for (unsigned i = 0; i < M.size(); i += NumElts) {
7250 WhichResult = SelectPairHalf(NumElts, M, i);
7251 unsigned Idx = WhichResult * NumElts / 2;
7252 for (unsigned j = 0; j < NumElts; j += 2) {
7253 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7254 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7255 return false;
7256 Idx += 1;
7257 }
7258 }
7259
7260 if (M.size() == NumElts*2)
7261 WhichResult = 0;
7262
7263 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7264 if (VT.is64BitVector() && EltSz == 32)
7265 return false;
7266
7267 return true;
7268}
7269
7270/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7271/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7272/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7273static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7274 unsigned EltSz = VT.getScalarSizeInBits();
7275 if (EltSz == 64)
7276 return false;
7277
7278 unsigned NumElts = VT.getVectorNumElements();
7279 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7280 return false;
7281
7282 for (unsigned i = 0; i < M.size(); i += NumElts) {
7283 WhichResult = SelectPairHalf(NumElts, M, i);
7284 unsigned Idx = WhichResult * NumElts / 2;
7285 for (unsigned j = 0; j < NumElts; j += 2) {
7286 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7287 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7288 return false;
7289 Idx += 1;
7290 }
7291 }
7292
7293 if (M.size() == NumElts*2)
7294 WhichResult = 0;
7295
7296 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7297 if (VT.is64BitVector() && EltSz == 32)
7298 return false;
7299
7300 return true;
7301}
7302
7303/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7304/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7305static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7306 unsigned &WhichResult,
7307 bool &isV_UNDEF) {
7308 isV_UNDEF = false;
7309 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7310 return ARMISD::VTRN;
7311 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7312 return ARMISD::VUZP;
7313 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7314 return ARMISD::VZIP;
7315
7316 isV_UNDEF = true;
7317 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7318 return ARMISD::VTRN;
7319 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7320 return ARMISD::VUZP;
7321 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7322 return ARMISD::VZIP;
7323
7324 return 0;
7325}
7326
7327/// \return true if this is a reverse operation on a vector.
7328static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7329 unsigned NumElts = VT.getVectorNumElements();
7330 // Make sure the mask has the right size.
7331 if (NumElts != M.size())
7332 return false;
7333
7334 // Look for <15, ..., 3, -1, 1, 0>.
7335 for (unsigned i = 0; i != NumElts; ++i)
7336 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7337 return false;
7338
7339 return true;
7340}
7341
7342static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7343 unsigned NumElts = VT.getVectorNumElements();
7344 // Make sure the mask has the right size.
7345 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7346 return false;
7347
7348 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7349 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7350 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7351 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7352 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7353 int Ofs = Top ? 1 : 0;
7354 int Upper = SingleSource ? 0 : NumElts;
7355 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7356 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7357 return false;
7358 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7359 return false;
7360 }
7361 return true;
7362}
7363
7364static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7365 unsigned NumElts = VT.getVectorNumElements();
7366 // Make sure the mask has the right size.
7367 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7368 return false;
7369
7370 // If Top
7371 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7372 // This inserts Input2 into Input1
7373 // else if not Top
7374 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7375 // This inserts Input1 into Input2
7376 unsigned Offset = Top ? 0 : 1;
7377 unsigned N = SingleSource ? 0 : NumElts;
7378 for (unsigned i = 0; i < NumElts; i += 2) {
7379 if (M[i] >= 0 && M[i] != (int)i)
7380 return false;
7381 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7382 return false;
7383 }
7384
7385 return true;
7386}
7387
7388static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7389 unsigned NumElts = ToVT.getVectorNumElements();
7390 if (NumElts != M.size())
7391 return false;
7392
7393 // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
7394 // looking for patterns of:
7395 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7396 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7397
7398 unsigned Off0 = rev ? NumElts / 2 : 0;
7399 unsigned Off1 = rev ? 0 : NumElts / 2;
7400 for (unsigned i = 0; i < NumElts; i += 2) {
7401 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7402 return false;
7403 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7404 return false;
7405 }
7406
7407 return true;
7408}
7409
7410// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7411// from a pair of inputs. For example:
7412// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7413// FP_ROUND(EXTRACT_ELT(Y, 0),
7414// FP_ROUND(EXTRACT_ELT(X, 1),
7415// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
7416static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
7417 const ARMSubtarget *ST) {
7418 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7419 if (!ST->hasMVEFloatOps())
7420 return SDValue();
7421
7422 SDLoc dl(BV);
7423 EVT VT = BV.getValueType();
7424 if (VT != MVT::v8f16)
7425 return SDValue();
7426
7427 // We are looking for a buildvector of fptrunc elements, where all the
7428 // elements are extracted alternately from two sources. Check the first two
7429 // items are valid enough and extract some info from them (they are checked
7430 // properly in the loop below).
7431 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7432 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7433 !isa<ConstantSDNode>(BV.getOperand(0).getOperand(0).getOperand(1)))
7434 return SDValue();
7435 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7436 BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7437 !isa<ConstantSDNode>(BV.getOperand(1).getOperand(0).getOperand(1)))
7438 return SDValue();
7439 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7440 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7441 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7442 return SDValue();
7443
7444 // Check all the values in the BuildVector line up with our expectations.
7445 for (unsigned i = 1; i < 4; i++) {
7446 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7447 return Trunc.getOpcode() == ISD::FP_ROUND &&
7448 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7449 Trunc.getOperand(0).getOperand(0) == Op &&
7450 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7451 };
7452 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7453 return SDValue();
7454 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7455 return SDValue();
7456 }
7457
7458 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7459 DAG.getConstant(0, dl, MVT::i32));
7460 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7461 DAG.getConstant(1, dl, MVT::i32));
7462}
7463
7464// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7465// from a single input on alternating lanes. For example:
7466// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7467// FP_EXTEND(EXTRACT_ELT(X, 2),
7468// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
7469static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
7470 const ARMSubtarget *ST) {
7471 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7472 if (!ST->hasMVEFloatOps())
7473 return SDValue();
7474
7475 SDLoc dl(BV);
7476 EVT VT = BV.getValueType();
7477 if (VT != MVT::v4f32)
7478 return SDValue();
7479
7480 // We are looking for a buildvector of fpext elements, where all the
7481 // elements are alternating lanes from a single source. For example <0,2,4,6>
7482 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7483 // info from them (they are checked properly in the loop below).
7484 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7485 BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7486 return SDValue();
7487 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7488 int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
7489 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7490 return SDValue();
7491
7492 // Check all the values in the BuildVector line up with our expectations.
7493 for (unsigned i = 1; i < 4; i++) {
7494 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7495 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7496 Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7497 Trunc.getOperand(0).getOperand(0) == Op &&
7498 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7499 };
7500 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7501 return SDValue();
7502 }
7503
7504 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7505 DAG.getConstant(Offset, dl, MVT::i32));
7506}
7507
7508// If N is an integer constant that can be moved into a register in one
7509// instruction, return an SDValue of such a constant (will become a MOV
7510// instruction). Otherwise return null.
7511static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
7512 const ARMSubtarget *ST, const SDLoc &dl) {
7513 uint64_t Val;
7514 if (!isa<ConstantSDNode>(N))
7515 return SDValue();
7516 Val = N->getAsZExtVal();
7517
7518 if (ST->isThumb1Only()) {
7519 if (Val <= 255 || ~Val <= 255)
7520 return DAG.getConstant(Val, dl, MVT::i32);
7521 } else {
7522 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7523 return DAG.getConstant(Val, dl, MVT::i32);
7524 }
7525 return SDValue();
7526}
7527
7528static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG,
7529 const ARMSubtarget *ST) {
7530 SDLoc dl(Op);
7531 EVT VT = Op.getValueType();
7532
7533 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7534
7535 unsigned NumElts = VT.getVectorNumElements();
7536 unsigned BoolMask;
7537 unsigned BitsPerBool;
7538 if (NumElts == 2) {
7539 BitsPerBool = 8;
7540 BoolMask = 0xff;
7541 } else if (NumElts == 4) {
7542 BitsPerBool = 4;
7543 BoolMask = 0xf;
7544 } else if (NumElts == 8) {
7545 BitsPerBool = 2;
7546 BoolMask = 0x3;
7547 } else if (NumElts == 16) {
7548 BitsPerBool = 1;
7549 BoolMask = 0x1;
7550 } else
7551 return SDValue();
7552
7553 // If this is a single value copied into all lanes (a splat), we can just sign
7554 // extend that single value
7555 SDValue FirstOp = Op.getOperand(0);
7556 if (!isa<ConstantSDNode>(FirstOp) &&
7557 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7558 return U.get().isUndef() || U.get() == FirstOp;
7559 })) {
7560 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7561 DAG.getValueType(MVT::i1));
7562 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7563 }
7564
7565 // First create base with bits set where known
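// Each boolean lane occupies BitsPerBool bits of the 16-bit predicate, so a
// known-true lane contributes BoolMask shifted to its lane position.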
7566 unsigned Bits32 = 0;
7567 for (unsigned i = 0; i < NumElts; ++i) {
7568 SDValue V = Op.getOperand(i);
7569 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7570 continue;
7571 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7572 if (BitSet)
7573 Bits32 |= BoolMask << (i * BitsPerBool);
7574 }
7575
7576 // Add in unknown nodes
7577 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7578 DAG.getConstant(Bits32, dl, MVT::i32));
7579 for (unsigned i = 0; i < NumElts; ++i) {
7580 SDValue V = Op.getOperand(i);
7581 if (isa<ConstantSDNode>(V) || V.isUndef())
7582 continue;
7583 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7584 DAG.getConstant(i, dl, MVT::i32));
7585 }
7586
7587 return Base;
7588}
7589
7590static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG,
7591 const ARMSubtarget *ST) {
7592 if (!ST->hasMVEIntegerOps())
7593 return SDValue();
7594
7595 // We are looking for a buildvector where each element is Op[0] + i*N
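// e.g. <x, x+2, x+4, x+6> becomes a VIDUP with base x and step N = 2.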
7596 EVT VT = Op.getValueType();
7597 SDValue Op0 = Op.getOperand(0);
7598 unsigned NumElts = VT.getVectorNumElements();
7599
7600 // Get the increment value from operand 1
7601 SDValue Op1 = Op.getOperand(1);
7602 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7603 !isa<ConstantSDNode>(Op1.getOperand(1)))
7604 return SDValue();
7605 unsigned N = Op1.getConstantOperandVal(1);
7606 if (N != 1 && N != 2 && N != 4 && N != 8)
7607 return SDValue();
7608
7609 // Check that each other operand matches
7610 for (unsigned I = 2; I < NumElts; I++) {
7611 SDValue OpI = Op.getOperand(I);
7612 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7613 !isa<ConstantSDNode>(OpI.getOperand(1)) ||
7614 OpI.getConstantOperandVal(1) != I * N)
7615 return SDValue();
7616 }
7617
7618 SDLoc DL(Op);
7619 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7620 DAG.getConstant(N, DL, MVT::i32));
7621}
7622
7623// Returns true if the operation N can be treated as a qr instruction variant
7624// at operand Op.
7625static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7626 switch (N->getOpcode()) {
7627 case ISD::ADD:
7628 case ISD::MUL:
7629 case ISD::SADDSAT:
7630 case ISD::UADDSAT:
7631 case ISD::AVGFLOORS:
7632 case ISD::AVGFLOORU:
7633 return true;
7634 case ISD::SUB:
7635 case ISD::SSUBSAT:
7636 case ISD::USUBSAT:
7637 return N->getOperand(1).getNode() == Op;
7638 case ISD::INTRINSIC_WO_CHAIN:
7639 switch (N->getConstantOperandVal(0)) {
7640 case Intrinsic::arm_mve_add_predicated:
7641 case Intrinsic::arm_mve_mul_predicated:
7642 case Intrinsic::arm_mve_qadd_predicated:
7643 case Intrinsic::arm_mve_vhadd:
7644 case Intrinsic::arm_mve_hadd_predicated:
7645 case Intrinsic::arm_mve_vqdmulh:
7646 case Intrinsic::arm_mve_qdmulh_predicated:
7647 case Intrinsic::arm_mve_vqrdmulh:
7648 case Intrinsic::arm_mve_qrdmulh_predicated:
7649 case Intrinsic::arm_mve_vqdmull:
7650 case Intrinsic::arm_mve_vqdmull_predicated:
7651 return true;
7652 case Intrinsic::arm_mve_sub_predicated:
7653 case Intrinsic::arm_mve_qsub_predicated:
7654 case Intrinsic::arm_mve_vhsub:
7655 case Intrinsic::arm_mve_hsub_predicated:
7656 return N->getOperand(2).getNode() == Op;
7657 default:
7658 return false;
7659 }
7660 default:
7661 return false;
7662 }
7663}
7664
7665// If this is a case we can't handle, return null and let the default
7666// expansion code take care of it.
7667SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7668 const ARMSubtarget *ST) const {
7669 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7670 SDLoc dl(Op);
7671 EVT VT = Op.getValueType();
7672
7673 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7674 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7675
7676 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7677 return R;
7678
7679 APInt SplatBits, SplatUndef;
7680 unsigned SplatBitSize;
7681 bool HasAnyUndefs;
7682 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7683 if (SplatUndef.isAllOnes())
7684 return DAG.getUNDEF(VT);
7685
7686 // If all the users of this constant splat are qr instruction variants,
7687 // generate a vdup of the constant.
7688 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7689 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7690 all_of(BVN->users(),
7691 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7692 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7693 : SplatBitSize == 16 ? MVT::v8i16
7694 : MVT::v16i8;
7695 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7696 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7697 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7698 }
7699
7700 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7701 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7702 // Check if an immediate VMOV works.
7703 EVT VmovVT;
7704 SDValue Val =
7705 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7706 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7707
7708 if (Val.getNode()) {
7709 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7710 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7711 }
7712
7713 // Try an immediate VMVN.
7714 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7715 Val = isVMOVModifiedImm(
7716 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7717 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7718 if (Val.getNode()) {
7719 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7720 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7721 }
7722
7723 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7724 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7725 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7726 if (ImmVal != -1) {
7727 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7728 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7729 }
7730 }
7731
7732 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7733 // type.
7734 if (ST->hasMVEIntegerOps() &&
7735 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7736 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7737 : SplatBitSize == 16 ? MVT::v8i16
7738 : MVT::v16i8;
7739 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7740 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7741 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7742 }
7743 }
7744 }
7745
7746 // Scan through the operands to see if only one value is used.
7747 //
7748 // As an optimisation, even if more than one value is used it may be more
7749 // profitable to splat with one value then change some lanes.
7750 //
7751 // Heuristically we decide to do this if the vector has a "dominant" value,
7752 // defined as splatted to more than half of the lanes.
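// For example, in <a, a, a, b> the value 'a' fills three of the four lanes,
// so the vector is built as a VDUP of 'a' followed by a single lane insert.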
7753 unsigned NumElts = VT.getVectorNumElements();
7754 bool isOnlyLowElement = true;
7755 bool usesOnlyOneValue = true;
7756 bool hasDominantValue = false;
7757 bool isConstant = true;
7758
7759 // Map of the number of times a particular SDValue appears in the
7760 // element list.
7761 DenseMap<SDValue, unsigned> ValueCounts;
7762 SDValue Value;
7763 for (unsigned i = 0; i < NumElts; ++i) {
7764 SDValue V = Op.getOperand(i);
7765 if (V.isUndef())
7766 continue;
7767 if (i > 0)
7768 isOnlyLowElement = false;
7769 if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
7770 isConstant = false;
7771
7772 unsigned &Count = ValueCounts[V];
7773
7774 // Is this value dominant? (takes up more than half of the lanes)
7775 if (++Count > (NumElts / 2)) {
7776 hasDominantValue = true;
7777 Value = V;
7778 }
7779 }
7780 if (ValueCounts.size() != 1)
7781 usesOnlyOneValue = false;
7782 if (!Value.getNode() && !ValueCounts.empty())
7783 Value = ValueCounts.begin()->first;
7784
7785 if (ValueCounts.empty())
7786 return DAG.getUNDEF(VT);
7787
7788 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7789 // Keep going if we are hitting this case.
7790 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
7791 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7792
7793 unsigned EltSize = VT.getScalarSizeInBits();
7794
7795 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7796 // i32 and try again.
7797 if (hasDominantValue && EltSize <= 32) {
7798 if (!isConstant) {
7799 SDValue N;
7800
7801 // If we are VDUPing a value that comes directly from a vector, that will
7802 // cause an unnecessary move to and from a GPR, where instead we could
7803 // just use VDUPLANE. We can only do this if the lane being extracted
7804 // is at a constant index, as the VDUP from lane instructions only have
7805 // constant-index forms.
7806 ConstantSDNode *constIndex;
7807 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7808 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7809 // We need to create a new undef vector to use for the VDUPLANE if the
7810 // size of the vector from which we get the value is different than the
7811 // size of the vector that we need to create. We will insert the element
7812 // such that the register coalescer will remove unnecessary copies.
7813 if (VT != Value->getOperand(0).getValueType()) {
7814 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7815 VT.getVectorNumElements();
7816 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7817 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7818 Value, DAG.getConstant(index, dl, MVT::i32)),
7819 DAG.getConstant(index, dl, MVT::i32));
7820 } else
7821 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7822 Value->getOperand(0), Value->getOperand(1));
7823 } else
7824 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7825
7826 if (!usesOnlyOneValue) {
7827 // The dominant value was splatted as 'N', but we now have to insert
7828 // all differing elements.
7829 for (unsigned I = 0; I < NumElts; ++I) {
7830 if (Op.getOperand(I) == Value)
7831 continue;
7832 SmallVector<SDValue, 3> Ops;
7833 Ops.push_back(N);
7834 Ops.push_back(Op.getOperand(I));
7835 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7836 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7837 }
7838 }
7839 return N;
7840 }
7841 if (VT.getVectorElementType().isFloatingPoint()) {
7842 SmallVector<SDValue, 8> Ops;
7843 MVT FVT = VT.getVectorElementType().getSimpleVT();
7844 assert(FVT == MVT::f32 || FVT == MVT::f16);
7845 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7846 for (unsigned i = 0; i < NumElts; ++i)
7847 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7848 Op.getOperand(i)));
7849 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7850 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7851 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7852 if (Val.getNode())
7853 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7854 }
7855 if (usesOnlyOneValue) {
7856 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7857 if (isConstant && Val.getNode())
7858 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7859 }
7860 }
7861
7862 // If all elements are constants and the case above didn't get hit, fall back
7863 // to the default expansion, which will generate a load from the constant
7864 // pool.
7865 if (isConstant)
7866 return SDValue();
7867
7868 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7869 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7870 // length <= 2.
7871 if (NumElts >= 4)
7872 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7873 return shuffle;
7874
7875 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7876 // VCVT's
7877 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7878 return VCVT;
7879 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7880 return VCVT;
7881
7882 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7883 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7884 // into two 64-bit vectors; we might discover a better way to lower it.
7885 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7886 EVT ExtVT = VT.getVectorElementType();
7887 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7888 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7889 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7890 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7891 SDValue Upper =
7892 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7893 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7894 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
7895 if (Lower && Upper)
7896 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7897 }
7898
7899 // Vectors with 32- or 64-bit elements can be built by directly assigning
7900 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7901 // will be legalized.
7902 if (EltSize >= 32) {
7903 // Do the expansion with floating-point types, since that is what the VFP
7904 // registers are defined to use, and since i64 is not legal.
7905 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7906 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
7907 SmallVector<SDValue, 8> Ops;
7908 for (unsigned i = 0; i < NumElts; ++i)
7909 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7910 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7911 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7912 }
7913
7914 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7915 // know the default expansion would otherwise fall back on something even
7916 // worse. For a vector with one or two non-undef values, that's
7917 // scalar_to_vector for the elements followed by a shuffle (provided the
7918 // shuffle is valid for the target) and materialization element by element
7919 // on the stack followed by a load for everything else.
7920 if (!isConstant && !usesOnlyOneValue) {
7921 SDValue Vec = DAG.getUNDEF(VT);
7922 for (unsigned i = 0 ; i < NumElts; ++i) {
7923 SDValue V = Op.getOperand(i);
7924 if (V.isUndef())
7925 continue;
7926 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7927 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7928 }
7929 return Vec;
7930 }
7931
7932 return SDValue();
7933}
7934
7935// Gather data to see if the operation can be modelled as a
7936// shuffle in combination with VEXTs.
7937SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7938 SelectionDAG &DAG) const {
7939 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7940 SDLoc dl(Op);
7941 EVT VT = Op.getValueType();
7942 unsigned NumElts = VT.getVectorNumElements();
7943
7944 struct ShuffleSourceInfo {
7945 SDValue Vec;
7946 unsigned MinElt = std::numeric_limits<unsigned>::max();
7947 unsigned MaxElt = 0;
7948
7949 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7950 // be compatible with the shuffle we intend to construct. As a result
7951 // ShuffleVec will be some sliding window into the original Vec.
7952 SDValue ShuffleVec;
7953
7954 // Code should guarantee that element i in Vec starts at element "WindowBase
7955 // + i * WindowScale in ShuffleVec".
7956 int WindowBase = 0;
7957 int WindowScale = 1;
7958
7959 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7960
7961 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7962 };
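// As an illustration of the windowing above: if a 16-bit-element source is
// viewed through 8-bit shuffle lanes, WindowScale is 2 and element i of Vec
// occupies shuffle lanes WindowBase + 2*i and WindowBase + 2*i + 1.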
7963
7964 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7965 // node.
7966 SmallVector<ShuffleSourceInfo, 2> Sources;
7967 for (unsigned i = 0; i < NumElts; ++i) {
7968 SDValue V = Op.getOperand(i);
7969 if (V.isUndef())
7970 continue;
7971 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7972 // A shuffle can only come from building a vector from various
7973 // elements of other vectors.
7974 return SDValue();
7975 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7976 // Furthermore, shuffles require a constant mask, whereas extractelts
7977 // accept variable indices.
7978 return SDValue();
7979 }
7980
7981 // Add this element source to the list if it's not already there.
7982 SDValue SourceVec = V.getOperand(0);
7983 auto Source = llvm::find(Sources, SourceVec);
7984 if (Source == Sources.end())
7985 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7986
7987 // Update the minimum and maximum lane number seen.
7988 unsigned EltNo = V.getConstantOperandVal(1);
7989 Source->MinElt = std::min(Source->MinElt, EltNo);
7990 Source->MaxElt = std::max(Source->MaxElt, EltNo);
7991 }
7992
7993 // Currently only do something sane when at most two source vectors
7994 // are involved.
7995 if (Sources.size() > 2)
7996 return SDValue();
7997
7998 // Find out the smallest element size among result and two sources, and use
7999 // it as element size to build the shuffle_vector.
8000 EVT SmallestEltTy = VT.getVectorElementType();
8001 for (auto &Source : Sources) {
8002 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8003 if (SrcEltTy.bitsLT(SmallestEltTy))
8004 SmallestEltTy = SrcEltTy;
8005 }
8006 unsigned ResMultiplier =
8007 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8008 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8009 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8010
8011 // If the source vector is too wide or too narrow, we may nevertheless be able
8012 // to construct a compatible shuffle either by concatenating it with UNDEF or
8013 // extracting a suitable range of elements.
8014 for (auto &Src : Sources) {
8015 EVT SrcVT = Src.ShuffleVec.getValueType();
8016
8017 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8018 uint64_t VTSize = VT.getFixedSizeInBits();
8019 if (SrcVTSize == VTSize)
8020 continue;
8021
8022 // This stage of the search produces a source with the same element type as
8023 // the original, but with a total width matching the BUILD_VECTOR output.
8024 EVT EltVT = SrcVT.getVectorElementType();
8025 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8026 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8027
8028 if (SrcVTSize < VTSize) {
8029 if (2 * SrcVTSize != VTSize)
8030 return SDValue();
8031 // We can pad out the smaller vector for free, so if it's part of a
8032 // shuffle...
8033 Src.ShuffleVec =
8034 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8035 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8036 continue;
8037 }
8038
8039 if (SrcVTSize != 2 * VTSize)
8040 return SDValue();
8041
8042 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8043 // Span too large for a VEXT to cope
8044 return SDValue();
8045 }
8046
8047 if (Src.MinElt >= NumSrcElts) {
8048 // The extraction can just take the second half
8049 Src.ShuffleVec =
8050 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8051 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8052 Src.WindowBase = -NumSrcElts;
8053 } else if (Src.MaxElt < NumSrcElts) {
8054 // The extraction can just take the first half
8055 Src.ShuffleVec =
8056 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8057 DAG.getConstant(0, dl, MVT::i32));
8058 } else {
8059 // An actual VEXT is needed
8060 SDValue VEXTSrc1 =
8061 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8062 DAG.getConstant(0, dl, MVT::i32));
8063 SDValue VEXTSrc2 =
8064 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8065 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8066
8067 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8068 VEXTSrc2,
8069 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8070 Src.WindowBase = -Src.MinElt;
8071 }
8072 }
8073
8074 // Another possible incompatibility occurs from the vector element types. We
8075 // can fix this by bitcasting the source vectors to the same type we intend
8076 // for the shuffle.
8077 for (auto &Src : Sources) {
8078 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8079 if (SrcEltTy == SmallestEltTy)
8080 continue;
8081 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8082 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
8083 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8084 Src.WindowBase *= Src.WindowScale;
8085 }
8086
8087 // Final check before we try to actually produce a shuffle.
8088 LLVM_DEBUG({
8089 for (auto Src : Sources)
8090 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8091 });
8092
8093 // The stars all align, our next step is to produce the mask for the shuffle.
8094 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8095 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8096 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8097 SDValue Entry = Op.getOperand(i);
8098 if (Entry.isUndef())
8099 continue;
8100
8101 auto Src = llvm::find(Sources, Entry.getOperand(0));
8102 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8103
8104 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8105 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8106 // segment.
8107 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8108 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8109 VT.getScalarSizeInBits());
8110 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8111
8112 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8113 // starting at the appropriate offset.
8114 int *LaneMask = &Mask[i * ResMultiplier];
8115
8116 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
8117 ExtractBase += NumElts * (Src - Sources.begin());
8118 for (int j = 0; j < LanesDefined; ++j)
8119 LaneMask[j] = ExtractBase + j;
8120 }
8121
8122
8123 // We can't handle more than two sources. This should have already
8124 // been checked before this point.
8125 assert(Sources.size() <= 2 && "Too many sources!");
8126
8127 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8128 for (unsigned i = 0; i < Sources.size(); ++i)
8129 ShuffleOps[i] = Sources[i].ShuffleVec;
8130
8131 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8132 ShuffleOps[1], Mask, DAG);
8133 if (!Shuffle)
8134 return SDValue();
8135 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8136}
8137
8138 enum ShuffleOpCodes {
8139 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8140 OP_VREV,
8141 OP_VDUP0,
8142 OP_VDUP1,
8143 OP_VDUP2,
8144 OP_VDUP3,
8145 OP_VEXT1,
8146 OP_VEXT2,
8147 OP_VEXT3,
8148 OP_VUZPL, // VUZP, left result
8149 OP_VUZPR, // VUZP, right result
8150 OP_VZIPL, // VZIP, left result
8151 OP_VZIPR, // VZIP, right result
8152 OP_VTRNL, // VTRN, left result
8153 OP_VTRNR // VTRN, right result
8154};
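// Each entry in PerfectShuffleTable packs a shuffle recipe into 32 bits:
// bits [31:30] hold the cost, bits [29:26] one of the opcodes above, bits
// [25:13] the LHS entry index and bits [12:0] the RHS entry index (see the
// decoding in isLegalMVEShuffleOp and GeneratePerfectShuffle below). The
// table is indexed by the four mask elements encoded base-9, with 8 standing
// for an undef lane.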
8155
8156static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8157 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8158 switch (OpNum) {
8159 case OP_COPY:
8160 case OP_VREV:
8161 case OP_VDUP0:
8162 case OP_VDUP1:
8163 case OP_VDUP2:
8164 case OP_VDUP3:
8165 return true;
8166 }
8167 return false;
8168}
8169
8170/// isShuffleMaskLegal - Targets can use this to indicate that they only
8171/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8172/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8173/// are assumed to be legal.
8174 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
8175 if (VT.getVectorNumElements() == 4 &&
8176 (VT.is128BitVector() || VT.is64BitVector())) {
8177 unsigned PFIndexes[4];
8178 for (unsigned i = 0; i != 4; ++i) {
8179 if (M[i] < 0)
8180 PFIndexes[i] = 8;
8181 else
8182 PFIndexes[i] = M[i];
8183 }
8184
8185 // Compute the index in the perfect shuffle table.
8186 unsigned PFTableIndex =
8187 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8188 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8189 unsigned Cost = (PFEntry >> 30);
8190
8191 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8192 return true;
8193 }
8194
8195 bool ReverseVEXT, isV_UNDEF;
8196 unsigned Imm, WhichResult;
8197
8198 unsigned EltSize = VT.getScalarSizeInBits();
8199 if (EltSize >= 32 ||
8200 ShuffleVectorSDNode::isSplatMask(M) ||
8201 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8202 isVREVMask(M, VT, 64) ||
8203 isVREVMask(M, VT, 32) ||
8204 isVREVMask(M, VT, 16))
8205 return true;
8206 else if (Subtarget->hasNEON() &&
8207 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8208 isVTBLMask(M, VT) ||
8209 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8210 return true;
8211 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8212 isReverseMask(M, VT))
8213 return true;
8214 else if (Subtarget->hasMVEIntegerOps() &&
8215 (isVMOVNMask(M, VT, true, false) ||
8216 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8217 return true;
8218 else if (Subtarget->hasMVEIntegerOps() &&
8219 (isTruncMask(M, VT, false, false) ||
8220 isTruncMask(M, VT, false, true) ||
8221 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8222 return true;
8223 else
8224 return false;
8225}
8226
8227/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8228/// the specified operations to build the shuffle.
8229 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
8230 SDValue RHS, SelectionDAG &DAG,
8231 const SDLoc &dl) {
8232 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8233 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8234 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8235
8236 if (OpNum == OP_COPY) {
8237 if (LHSID == (1*9+2)*9+3) return LHS;
8238 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8239 return RHS;
8240 }
8241
8242 SDValue OpLHS, OpRHS;
8243 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8244 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8245 EVT VT = OpLHS.getValueType();
8246
8247 switch (OpNum) {
8248 default: llvm_unreachable("Unknown shuffle opcode!");
8249 case OP_VREV:
8250 // VREV divides the vector in half and swaps within the half.
8251 if (VT.getScalarSizeInBits() == 32)
8252 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8253 // vrev <4 x i16> -> VREV32
8254 if (VT.getScalarSizeInBits() == 16)
8255 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8256 // vrev <4 x i8> -> VREV16
8257 assert(VT.getScalarSizeInBits() == 8);
8258 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8259 case OP_VDUP0:
8260 case OP_VDUP1:
8261 case OP_VDUP2:
8262 case OP_VDUP3:
8263 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8264 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8265 case OP_VEXT1:
8266 case OP_VEXT2:
8267 case OP_VEXT3:
8268 return DAG.getNode(ARMISD::VEXT, dl, VT,
8269 OpLHS, OpRHS,
8270 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
8271 case OP_VUZPL:
8272 case OP_VUZPR:
8273 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8274 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8275 case OP_VZIPL:
8276 case OP_VZIPR:
8277 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8278 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8279 case OP_VTRNL:
8280 case OP_VTRNR:
8281 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8282 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8283 }
8284}
8285
8286 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
8287 ArrayRef<int> ShuffleMask,
8288 SelectionDAG &DAG) {
8289 // Check to see if we can use the VTBL instruction.
8290 SDValue V1 = Op.getOperand(0);
8291 SDValue V2 = Op.getOperand(1);
8292 SDLoc DL(Op);
8293
8294 SmallVector<SDValue, 8> VTBLMask;
8295 for (int I : ShuffleMask)
8296 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8297
8298 if (V2.getNode()->isUndef())
8299 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8300 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8301
8302 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8303 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8304}
8305
8306 static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
8307 SDLoc DL(Op);
8308 EVT VT = Op.getValueType();
8309
8310 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8311 "Expect an v8i16/v16i8 type");
8312 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8313 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8314 // extract the first 8 bytes into the top double word and the last 8 bytes
8315 // into the bottom double word, through a new vector shuffle that will be
8316 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
8317 std::vector<int> NewMask;
8318 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8319 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8320 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8321 NewMask.push_back(i);
8322 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8323}
8324
8325 static EVT getVectorTyFromPredicateVector(EVT VT) {
8326 switch (VT.getSimpleVT().SimpleTy) {
8327 case MVT::v2i1:
8328 return MVT::v2f64;
8329 case MVT::v4i1:
8330 return MVT::v4i32;
8331 case MVT::v8i1:
8332 return MVT::v8i16;
8333 case MVT::v16i1:
8334 return MVT::v16i8;
8335 default:
8336 llvm_unreachable("Unexpected vector predicate type");
8337 }
8338}
8339
8340 static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT,
8341 SelectionDAG &DAG) {
8342 // Converting from boolean predicates to integers involves creating a vector
8343 // of all ones or all zeroes and selecting the lanes based upon the real
8344 // predicate.
8345 SDValue AllOnes =
8346 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8347 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8348
8349 SDValue AllZeroes =
8350 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8351 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8352
8353 // Get full vector type from predicate type
8354 EVT NewVT = getVectorTyFromPredicateVector(VT);
8355
8356 SDValue RecastV1;
8357 // If the real predicate is an v8i1 or v4i1 (not v16i1) then we need to recast
8358 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8359 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8360 // since we know in hardware the sizes are really the same.
8361 if (VT != MVT::v16i1)
8362 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8363 else
8364 RecastV1 = Pred;
8365
8366 // Select either all ones or zeroes depending upon the real predicate bits.
8367 SDValue PredAsVector =
8368 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8369
8370 // Recast our new predicate-as-integer v16i8 vector into something
8371 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8372 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8373}
8374
8375 static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
8376 const ARMSubtarget *ST) {
8377 EVT VT = Op.getValueType();
8378 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8379 ArrayRef<int> ShuffleMask = SVN->getMask();
8380
8381 assert(ST->hasMVEIntegerOps() &&
8382 "No support for vector shuffle of boolean predicates");
8383
8384 SDValue V1 = Op.getOperand(0);
8385 SDValue V2 = Op.getOperand(1);
8386 SDLoc dl(Op);
8387 if (isReverseMask(ShuffleMask, VT)) {
8388 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8389 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8390 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8391 DAG.getConstant(16, dl, MVT::i32));
8392 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8393 }
8394
8395 // Until we can come up with optimised cases for every single vector
8396 // shuffle in existence we have chosen the least painful strategy. This is
8397 // to essentially promote the boolean predicate to an 8-bit integer, where
8398 // each predicate represents a byte. Then we fall back on a normal integer
8399 // vector shuffle and convert the result back into a predicate vector. In
8400 // many cases the generated code might be even better than scalar code
8401 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8402 // fields in a register into 8 other arbitrary 2-bit fields!
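// For example, a v4i1 shuffle is handled by promoting the predicate to a
// v4i32 of all-ones/all-zeroes lanes (PromoteMVEPredVector), shuffling that
// as an ordinary integer vector, and then comparing the result against zero
// (VCMPZ NE) to recover a v4i1 predicate.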
8403 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8404 EVT NewVT = PredAsVector1.getValueType();
8405 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8406 : PromoteMVEPredVector(dl, V2, VT, DAG);
8407 assert(PredAsVector2.getValueType() == NewVT &&
8408 "Expected identical vector type in expanded i1 shuffle!");
8409
8410 // Do the shuffle!
8411 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8412 PredAsVector2, ShuffleMask);
8413
8414 // Now return the result of comparing the shuffled vector with zero,
8415 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8416 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8417 if (VT == MVT::v2i1) {
8418 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8419 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8420 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8421 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8422 }
8423 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8424 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8425}
8426
8427 static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
8428 ArrayRef<int> ShuffleMask,
8429 SelectionDAG &DAG) {
8430 // Attempt to lower the vector shuffle using as many whole register movs as
8431 // possible. This is useful for types smaller than 32bits, which would
8432 // often otherwise become a series of GPR movs.
8433 SDLoc dl(Op);
8434 EVT VT = Op.getValueType();
8435 if (VT.getScalarSizeInBits() >= 32)
8436 return SDValue();
8437
8438 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8439 "Unexpected vector type");
8440 int NumElts = VT.getVectorNumElements();
8441 int QuarterSize = NumElts / 4;
8442 // The four final parts of the vector, as i32's
8443 SDValue Parts[4];
8444
8445 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8446 // <u,u,u,u>), returning the vmov lane index
8447 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8448 // Detect which mov lane this would be from the first non-undef element.
8449 int MovIdx = -1;
8450 for (int i = 0; i < Length; i++) {
8451 if (ShuffleMask[Start + i] >= 0) {
8452 if (ShuffleMask[Start + i] % Length != i)
8453 return -1;
8454 MovIdx = ShuffleMask[Start + i] / Length;
8455 break;
8456 }
8457 }
8458 // If all items are undef, leave this for other combines
8459 if (MovIdx == -1)
8460 return -1;
8461 // Check the remaining values are the correct part of the same mov
8462 for (int i = 1; i < Length; i++) {
8463 if (ShuffleMask[Start + i] >= 0 &&
8464 (ShuffleMask[Start + i] / Length != MovIdx ||
8465 ShuffleMask[Start + i] % Length != i))
8466 return -1;
8467 }
8468 return MovIdx;
8469 };
8470
8471 for (int Part = 0; Part < 4; ++Part) {
8472 // Does this part look like a mov
8473 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8474 if (Elt != -1) {
8475 SDValue Input = Op->getOperand(0);
8476 if (Elt >= 4) {
8477 Input = Op->getOperand(1);
8478 Elt -= 4;
8479 }
8480 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8481 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8482 DAG.getConstant(Elt, dl, MVT::i32));
8483 }
8484 }
8485
8486 // Nothing interesting found, just return
8487 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8488 return SDValue();
8489
8490 // The other parts need to be built with the old shuffle vector, cast to a
8491 // v4i32 and extract_vector_elts
8492 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8493 SmallVector<int, 16> NewShuffleMask;
8494 for (int Part = 0; Part < 4; ++Part)
8495 for (int i = 0; i < QuarterSize; i++)
8496 NewShuffleMask.push_back(
8497 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8498 SDValue NewShuffle = DAG.getVectorShuffle(
8499 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8500 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8501
8502 for (int Part = 0; Part < 4; ++Part)
8503 if (!Parts[Part])
8504 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8505 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8506 }
8507 // Build a vector out of the various parts and bitcast it back to the original
8508 // type.
8509 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8510 return DAG.getBitcast(VT, NewVec);
8511}
8512
8513 static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op,
8514 ArrayRef<int> ShuffleMask,
8515 SelectionDAG &DAG) {
8516 SDValue V1 = Op.getOperand(0);
8517 SDValue V2 = Op.getOperand(1);
8518 EVT VT = Op.getValueType();
8519 unsigned NumElts = VT.getVectorNumElements();
8520
8521 // A One-Off Identity mask is one that is mostly an identity mask from a
8522 // single source but contains a single element out-of-place, either from a
8523 // different vector or from another position in the same vector. As opposed to
8524 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8525 // pair directly.
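// For example, with four lanes the mask <0,1,7,3> is an identity of V1
// except for lane 2, which takes lane 3 of V2; it is lowered as
// insert(V1, extract(V2, 3), 2) instead of a full shuffle.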
8526 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8527 int &OffElement) {
8528 OffElement = -1;
8529 int NonUndef = 0;
8530 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8531 if (Mask[i] == -1)
8532 continue;
8533 NonUndef++;
8534 if (Mask[i] != i + BaseOffset) {
8535 if (OffElement == -1)
8536 OffElement = i;
8537 else
8538 return false;
8539 }
8540 }
8541 return NonUndef > 2 && OffElement != -1;
8542 };
8543 int OffElement;
8544 SDValue VInput;
8545 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8546 VInput = V1;
8547 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8548 VInput = V2;
8549 else
8550 return SDValue();
8551
8552 SDLoc dl(Op);
8553 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8554 ? MVT::i32
8555 : VT.getScalarType();
8556 SDValue Elt = DAG.getNode(
8557 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8558 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8559 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8560 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8561 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8562}
8563
8564 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8565 const ARMSubtarget *ST) {
8566 SDValue V1 = Op.getOperand(0);
8567 SDValue V2 = Op.getOperand(1);
8568 SDLoc dl(Op);
8569 EVT VT = Op.getValueType();
8570 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
8571 unsigned EltSize = VT.getScalarSizeInBits();
8572
8573 if (ST->hasMVEIntegerOps() && EltSize == 1)
8574 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8575
8576 // Convert shuffles that are directly supported on NEON to target-specific
8577 // DAG nodes, instead of keeping them as shuffles and matching them again
8578 // during code selection. This is more efficient and avoids the possibility
8579 // of inconsistencies between legalization and selection.
8580 // FIXME: floating-point vectors should be canonicalized to integer vectors
8581 // of the same size so that they get CSEd properly.
8582 ArrayRef<int> ShuffleMask = SVN->getMask();
8583
8584 if (EltSize <= 32) {
8585 if (SVN->isSplat()) {
8586 int Lane = SVN->getSplatIndex();
8587 // If this is undef splat, generate it via "just" vdup, if possible.
8588 if (Lane == -1) Lane = 0;
8589
8590 // Test if V1 is a SCALAR_TO_VECTOR.
8591 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8592 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8593 }
8594 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8595 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8596 // reaches it).
8597 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8598 !isa<ConstantSDNode>(V1.getOperand(0))) {
8599 bool IsScalarToVector = true;
8600 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8601 if (!V1.getOperand(i).isUndef()) {
8602 IsScalarToVector = false;
8603 break;
8604 }
8605 if (IsScalarToVector)
8606 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8607 }
8608 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8609 DAG.getConstant(Lane, dl, MVT::i32));
8610 }
8611
8612 bool ReverseVEXT = false;
8613 unsigned Imm = 0;
8614 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8615 if (ReverseVEXT)
8616 std::swap(V1, V2);
8617 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8618 DAG.getConstant(Imm, dl, MVT::i32));
8619 }
8620
8621 if (isVREVMask(ShuffleMask, VT, 64))
8622 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8623 if (isVREVMask(ShuffleMask, VT, 32))
8624 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8625 if (isVREVMask(ShuffleMask, VT, 16))
8626 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8627
8628 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8629 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8630 DAG.getConstant(Imm, dl, MVT::i32));
8631 }
8632
8633 // Check for Neon shuffles that modify both input vectors in place.
8634 // If both results are used, i.e., if there are two shuffles with the same
8635 // source operands and with masks corresponding to both results of one of
8636 // these operations, DAG memoization will ensure that a single node is
8637 // used for both shuffles.
8638 unsigned WhichResult = 0;
8639 bool isV_UNDEF = false;
8640 if (ST->hasNEON()) {
8641 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8642 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8643 if (isV_UNDEF)
8644 V2 = V1;
8645 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8646 .getValue(WhichResult);
8647 }
8648 }
8649 if (ST->hasMVEIntegerOps()) {
8650 if (isVMOVNMask(ShuffleMask, VT, false, false))
8651 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8652 DAG.getConstant(0, dl, MVT::i32));
8653 if (isVMOVNMask(ShuffleMask, VT, true, false))
8654 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8655 DAG.getConstant(1, dl, MVT::i32));
8656 if (isVMOVNMask(ShuffleMask, VT, true, true))
8657 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8658 DAG.getConstant(1, dl, MVT::i32));
8659 }
8660
8661 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8662 // shuffles that produce a result larger than their operands with:
8663 // shuffle(concat(v1, undef), concat(v2, undef))
8664 // ->
8665 // shuffle(concat(v1, v2), undef)
8666 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8667 //
8668 // This is useful in the general case, but there are special cases where
8669 // native shuffles produce larger results: the two-result ops.
8670 //
8671 // Look through the concat when lowering them:
8672 // shuffle(concat(v1, v2), undef)
8673 // ->
8674 // concat(VZIP(v1, v2):0, :1)
8675 //
8676 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8677 SDValue SubV1 = V1->getOperand(0);
8678 SDValue SubV2 = V1->getOperand(1);
8679 EVT SubVT = SubV1.getValueType();
8680
8681 // We expect these to have been canonicalized to -1.
8682 assert(llvm::all_of(ShuffleMask, [&](int i) {
8683 return i < (int)VT.getVectorNumElements();
8684 }) && "Unexpected shuffle index into UNDEF operand!");
8685
8686 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8687 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8688 if (isV_UNDEF)
8689 SubV2 = SubV1;
8690 assert((WhichResult == 0) &&
8691 "In-place shuffle of concat can only have one result!");
8692 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8693 SubV1, SubV2);
8694 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8695 Res.getValue(1));
8696 }
8697 }
8698 }
8699
8700 if (ST->hasMVEIntegerOps() && EltSize <= 32) {
8701 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8702 return V;
8703
8704 for (bool Top : {false, true}) {
8705 for (bool SingleSource : {false, true}) {
8706 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8707 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8708 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8709 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8710 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8711 SingleSource ? V1 : V2);
8712 if (Top) {
8713 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8714 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8715 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8716 }
8717 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8718 }
8719 }
8720 }
8721 }
8722
8723 // If the shuffle is not directly supported and it has 4 elements, use
8724 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8725 unsigned NumElts = VT.getVectorNumElements();
8726 if (NumElts == 4) {
8727 unsigned PFIndexes[4];
8728 for (unsigned i = 0; i != 4; ++i) {
8729 if (ShuffleMask[i] < 0)
8730 PFIndexes[i] = 8;
8731 else
8732 PFIndexes[i] = ShuffleMask[i];
8733 }
8734
8735 // Compute the index in the perfect shuffle table.
8736 unsigned PFTableIndex =
8737 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8738 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8739 unsigned Cost = (PFEntry >> 30);
8740
8741 if (Cost <= 4) {
8742 if (ST->hasNEON())
8743 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8744 else if (isLegalMVEShuffleOp(PFEntry)) {
8745 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8746 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8747 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8748 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8749 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8750 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8751 }
8752 }
8753 }
8754
8755 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8756 if (EltSize >= 32) {
8757 // Do the expansion with floating-point types, since that is what the VFP
8758 // registers are defined to use, and since i64 is not legal.
8759 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8760 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8761 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8762 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
8763 SmallVector<SDValue, 8> Ops;
8764 for (unsigned i = 0; i < NumElts; ++i) {
8765 if (ShuffleMask[i] < 0)
8766 Ops.push_back(DAG.getUNDEF(EltVT));
8767 else
8768 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8769 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8770 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8771 dl, MVT::i32)));
8772 }
8773 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8774 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8775 }
8776
8777 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8778 isReverseMask(ShuffleMask, VT))
8779 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8780
8781 if (ST->hasNEON() && VT == MVT::v8i8)
8782 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8783 return NewOp;
8784
8785 if (ST->hasMVEIntegerOps())
8786 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8787 return NewOp;
8788
8789 return SDValue();
8790}
8791
8792 static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8793 const ARMSubtarget *ST) {
8794 EVT VecVT = Op.getOperand(0).getValueType();
8795 SDLoc dl(Op);
8796
8797 assert(ST->hasMVEIntegerOps() &&
8798 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8799
8800 SDValue Conv =
8801 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8802 unsigned Lane = Op.getConstantOperandVal(2);
8803 unsigned LaneWidth =
8804 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8805 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
8806 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8807 Op.getOperand(1), DAG.getValueType(MVT::i1));
8808 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8809 DAG.getConstant(~Mask, dl, MVT::i32));
8810 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8811}
8812
8813SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8814 SelectionDAG &DAG) const {
8815 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8816 SDValue Lane = Op.getOperand(2);
8817 if (!isa<ConstantSDNode>(Lane))
8818 return SDValue();
8819
8820 SDValue Elt = Op.getOperand(1);
8821 EVT EltVT = Elt.getValueType();
8822
8823 if (Subtarget->hasMVEIntegerOps() &&
8824 Op.getValueType().getScalarSizeInBits() == 1)
8825 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8826
8827 if (getTypeAction(*DAG.getContext(), EltVT) ==
8828 TargetLowering::TypePromoteFloat) {
8829 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8830 // but the type system will try to do that if we don't intervene.
8831 // Reinterpret any such vector-element insertion as one with the
8832 // corresponding integer types.
8833
8834 SDLoc dl(Op);
8835
8836 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8837 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8838 TargetLowering::TypePromoteFloat);
8839
8840 SDValue VecIn = Op.getOperand(0);
8841 EVT VecVT = VecIn.getValueType();
8842 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8843 VecVT.getVectorNumElements());
8844
8845 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8846 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8847 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8848 IVecIn, IElt, Lane);
8849 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8850 }
8851
8852 return Op;
8853}
8854
8855 static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG,
8856 const ARMSubtarget *ST) {
8857 EVT VecVT = Op.getOperand(0).getValueType();
8858 SDLoc dl(Op);
8859
8860 assert(ST->hasMVEIntegerOps() &&
8861 "LowerEXTRACT_VECTOR_ELT_i1 called without MVE!");
8862
8863 SDValue Conv =
8864 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
8865 unsigned Lane = Op.getConstantOperandVal(1);
8866 unsigned LaneWidth =
8867 getVectorTyFromPredicateVector(VecVT).getScalarSizeInBits() / 8;
8868 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8869 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8870 return Shift;
8871}
8872
8873 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG,
8874 const ARMSubtarget *ST) {
8875 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8876 SDValue Lane = Op.getOperand(1);
8877 if (!isa<ConstantSDNode>(Lane))
8878 return SDValue();
8879
8880 SDValue Vec = Op.getOperand(0);
8881 EVT VT = Vec.getValueType();
8882
8883 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8884 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8885
8886 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8887 SDLoc dl(Op);
8888 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8889 }
8890
8891 return Op;
8892}
8893
8894 static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
8895 const ARMSubtarget *ST) {
8896 SDLoc dl(Op);
8897 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8898 "Unexpected custom CONCAT_VECTORS lowering");
8899 assert(isPowerOf2_32(Op.getNumOperands()) &&
8900 "Unexpected custom CONCAT_VECTORS lowering");
8901 assert(ST->hasMVEIntegerOps() &&
8902 "CONCAT_VECTORS lowering only supported for MVE");
8903
8904 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8905 EVT Op1VT = V1.getValueType();
8906 EVT Op2VT = V2.getValueType();
8907 assert(Op1VT == Op2VT && "Operand types don't match!");
8908 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8909 "Unexpected i1 concat operations!");
8910 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8911
8912 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8913 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8914
8915 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8916 // promoted to v8i16, etc.
8917 MVT ElType =
8918 getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
8919 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8920
8921 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8922 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8923 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8924 // ConcatVT.
8925 SDValue ConVec =
8926 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8927 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8928 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8929 }
8930
8931 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8932 // to be the right size for the destination. For example, if Op1 is v4i1
8933 // then the promoted vector is v4i32. The result of concatenation gives a
8934 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8935 // needs truncating to i16 and inserting in the result.
8936 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8937 EVT NewVT = NewV.getValueType();
8938 EVT ConcatVT = ConVec.getValueType();
8939 unsigned ExtScale = 1;
8940 if (NewVT == MVT::v2f64) {
8941 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8942 ExtScale = 2;
8943 }
8944 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8945 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8946 DAG.getIntPtrConstant(i * ExtScale, dl));
8947 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8948 DAG.getConstant(j, dl, MVT::i32));
8949 }
8950 return ConVec;
8951 };
8952 unsigned j = 0;
8953 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8954 ConVec = ExtractInto(NewV1, ConVec, j);
8955 ConVec = ExtractInto(NewV2, ConVec, j);
8956
8957 // Now return the result of comparing the subvector with zero, which will
8958 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8959 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8960 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8961 };
8962
8963 // Concat each pair of subvectors and pack into the lower half of the array.
8964 SmallVector<SDValue> ConcatOps(Op->ops());
8965 while (ConcatOps.size() > 1) {
8966 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8967 SDValue V1 = ConcatOps[I];
8968 SDValue V2 = ConcatOps[I + 1];
8969 ConcatOps[I / 2] = ConcatPair(V1, V2);
8970 }
8971 ConcatOps.resize(ConcatOps.size() / 2);
8972 }
8973 return ConcatOps[0];
8974}
8975
8976 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8977 const ARMSubtarget *ST) {
8978 EVT VT = Op->getValueType(0);
8979 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8980 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
8981
8982 // The only time a CONCAT_VECTORS operation can have legal types is when
8983 // two 64-bit vectors are concatenated to a 128-bit vector.
8984 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
8985 "unexpected CONCAT_VECTORS");
8986 SDLoc dl(Op);
8987 SDValue Val = DAG.getUNDEF(MVT::v2f64);
8988 SDValue Op0 = Op.getOperand(0);
8989 SDValue Op1 = Op.getOperand(1);
8990 if (!Op0.isUndef())
8991 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8992 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
8993 DAG.getIntPtrConstant(0, dl));
8994 if (!Op1.isUndef())
8995 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
8996 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
8997 DAG.getIntPtrConstant(1, dl));
8998 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
8999}
9000
9001 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG,
9002 const ARMSubtarget *ST) {
9003 SDValue V1 = Op.getOperand(0);
9004 SDValue V2 = Op.getOperand(1);
9005 SDLoc dl(Op);
9006 EVT VT = Op.getValueType();
9007 EVT Op1VT = V1.getValueType();
9008 unsigned NumElts = VT.getVectorNumElements();
9009 unsigned Index = V2->getAsZExtVal();
9010
9011 assert(VT.getScalarSizeInBits() == 1 &&
9012 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9013 assert(ST->hasMVEIntegerOps() &&
9014 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9015
9016 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9017
9018 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9019 // promoted to v8i16, etc.
9020
9021 MVT ElType = getVectorTyFromPredicateVector(VT).getScalarType().getSimpleVT();
9022
9023 if (NumElts == 2) {
9024 EVT SubVT = MVT::v4i32;
9025 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9026 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9027 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9028 DAG.getIntPtrConstant(i, dl));
9029 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9030 DAG.getConstant(j, dl, MVT::i32));
9031 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9032 DAG.getConstant(j + 1, dl, MVT::i32));
9033 }
9034 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9035 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9036 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9037 }
9038
9039 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9040 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9041 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9042 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9043 DAG.getIntPtrConstant(i, dl));
9044 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9045 DAG.getConstant(j, dl, MVT::i32));
9046 }
9047
9048 // Now return the result of comparing the subvector with zero,
9049 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9050 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9051 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9052}
9053
9054// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
9055 static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG,
9056 const ARMSubtarget *ST) {
9057 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9058 EVT VT = N->getValueType(0);
9059 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9060 "Expected a vector i1 type!");
9061 SDValue Op = N->getOperand(0);
9062 EVT FromVT = Op.getValueType();
9063 SDLoc DL(N);
9064
9065 SDValue And =
9066 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9067 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9068 DAG.getCondCode(ISD::SETNE));
9069}
9070
9071 static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
9072 const ARMSubtarget *Subtarget) {
9073 if (!Subtarget->hasMVEIntegerOps())
9074 return SDValue();
9075
9076 EVT ToVT = N->getValueType(0);
9077 if (ToVT.getScalarType() == MVT::i1)
9078 return LowerTruncatei1(N, DAG, Subtarget);
9079
9080 // MVE does not have a single instruction to perform the truncation of a v4i32
9081 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9082 // Most of the instructions in MVE follow the 'Beats' system, where moving
9083 // values from different lanes is usually something that the instructions
9084 // avoid.
9085 //
9086 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9087 // which take the top/bottom half of a larger lane and extend it (or do the
9088 // opposite, truncating into the top/bottom lane from a larger lane). Note
9089 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9090 // bottom 16bits from each vector lane. This works really well with T/B
9091 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9092 // to move order.
9093 //
9094 // But truncates and sext/zext are always going to be fairly common from llvm.
9095 // We have several options for how to deal with them:
9096 // - Wherever possible combine them into an instruction that makes them
9097 // "free". This includes loads/stores, which can perform the trunc as part
9098 // of the memory operation. Or certain shuffles that can be turned into
9099 // VMOVN/VMOVL.
9100 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9101 // trunc(mul(sext(a), sext(b))) may become
9102 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9103 // this case can use VMULL). This is performed in the
9104 // MVELaneInterleavingPass.
9105 // - Otherwise we have an option. By default we would expand the
9106 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9107 // registers. One for each vector lane in the vector. This can obviously be
9108 // very expensive.
9109 // - The other option is to use the fact that loads/store can extend/truncate
9110 // to turn a trunc into two truncating stack stores and a stack reload. This
9111 // becomes 3 back-to-back memory operations, but at least that is less than
9112 // all the insert/extracts.
9113 //
9114 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9115 // are either optimized where they can be, or eventually lowered into stack
9116 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9117 // too early, where other instructions would be better, and stops us from
9118 // having to reconstruct multiple buildvector shuffles into loads/stores.
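// For example, a trunc from v8i32 to v8i16 is split into two v4i32 halves
// and emitted as MVETRUNC(lo, hi); if no cheaper pattern (such as a
// truncating store or a VMOVN) is found later, the MVETRUNC is expanded
// through the stack as described above.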
9119 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9120 return SDValue();
9121 EVT FromVT = N->getOperand(0).getValueType();
9122 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9123 return SDValue();
9124
9125 SDValue Lo, Hi;
9126 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9127 SDLoc DL(N);
9128 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9129}
9130
9131 static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG,
9132 const ARMSubtarget *Subtarget) {
9133 if (!Subtarget->hasMVEIntegerOps())
9134 return SDValue();
9135
9136 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9137
9138 EVT ToVT = N->getValueType(0);
9139 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9140 return SDValue();
9141 SDValue Op = N->getOperand(0);
9142 EVT FromVT = Op.getValueType();
9143 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9144 return SDValue();
9145
9146 SDLoc DL(N);
9147 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
9148 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9149 ExtVT = MVT::v8i16;
9150
9151 unsigned Opcode =
9152 N->getOpcode() == ISD::SIGN_EXTEND ? ARMISD::MVESEXT : ARMISD::MVEZEXT;
9153 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
9154 SDValue Ext1 = Ext.getValue(1);
9155
9156 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9157 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9158 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9159 }
9160
9161 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9162}
9163
9164/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9165/// element has been zero/sign-extended, depending on the isSigned parameter,
9166/// from an integer type half its size.
9167 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
9168 bool isSigned) {
9169 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9170 EVT VT = N->getValueType(0);
9171 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9172 SDNode *BVN = N->getOperand(0).getNode();
9173 if (BVN->getValueType(0) != MVT::v4i32 ||
9174 BVN->getOpcode() != ISD::BUILD_VECTOR)
9175 return false;
9176 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9177 unsigned HiElt = 1 - LoElt;
9178 ConstantSDNode *Lo0 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt));
9179 ConstantSDNode *Hi0 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt));
9180 ConstantSDNode *Lo1 = dyn_cast<ConstantSDNode>(BVN->getOperand(LoElt+2));
9181 ConstantSDNode *Hi1 = dyn_cast<ConstantSDNode>(BVN->getOperand(HiElt+2));
9182 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9183 return false;
9184 if (isSigned) {
9185 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9186 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9187 return true;
9188 } else {
9189 if (Hi0->isZero() && Hi1->isZero())
9190 return true;
9191 }
9192 return false;
9193 }
9194
9195 if (N->getOpcode() != ISD::BUILD_VECTOR)
9196 return false;
9197
9198 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9199 SDNode *Elt = N->getOperand(i).getNode();
9200 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
9201 unsigned EltSize = VT.getScalarSizeInBits();
9202 unsigned HalfSize = EltSize / 2;
9203 if (isSigned) {
9204 if (!isIntN(HalfSize, C->getSExtValue()))
9205 return false;
9206 } else {
9207 if (!isUIntN(HalfSize, C->getZExtValue()))
9208 return false;
9209 }
9210 continue;
9211 }
9212 return false;
9213 }
9214
9215 return true;
9216}
9217
9218/// isSignExtended - Check if a node is a vector value that is sign-extended
9219/// or a constant BUILD_VECTOR with sign-extended elements.
9220 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
9221 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9222 return true;
9223 if (isExtendedBUILD_VECTOR(N, DAG, true))
9224 return true;
9225 return false;
9226}
9227
9228/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9229/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
9230 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
9231 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
9232 ISD::isZEXTLoad(N))
9233 return true;
9234 if (isExtendedBUILD_VECTOR(N, DAG, false))
9235 return true;
9236 return false;
9237}
9238
9239static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9240 if (OrigVT.getSizeInBits() >= 64)
9241 return OrigVT;
9242
9243 assert(OrigVT.isSimple() && "Expecting a simple value type");
9244
9245 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9246 switch (OrigSimpleTy) {
9247 default: llvm_unreachable("Unexpected Vector Type");
9248 case MVT::v2i8:
9249 case MVT::v2i16:
9250 return MVT::v2i32;
9251 case MVT::v4i8:
9252 return MVT::v4i16;
9253 }
9254}
9255
9256/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9257/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9258/// We insert the required extension here to get the vector to fill a D register.
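// For example, a v4i8 value whose original extension went straight to v4i32
// is re-extended here to v4i16, so that VMULL sees a full 64-bit D-register
// operand.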
9259 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
9260 const EVT &OrigTy,
9261 const EVT &ExtTy,
9262 unsigned ExtOpcode) {
9263 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9264 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9265 // 64-bits we need to insert a new extension so that it will be 64-bits.
9266 assert(ExtTy.is128BitVector() && "Unexpected extension size");
9267 if (OrigTy.getSizeInBits() >= 64)
9268 return N;
9269
9270 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9271 EVT NewVT = getExtensionTo64Bits(OrigTy);
9272
9273 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9274}
9275
9276/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9277/// does not do any sign/zero extension. If the original vector is less
9278/// than 64 bits, an appropriate extension will be added after the load to
9279/// reach a total size of 64 bits. We have to add the extension separately
9280/// because ARM does not have a sign/zero extending load for vectors.
9281 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG) {
9282 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9283
9284 // The load already has the right type.
9285 if (ExtendedTy == LD->getMemoryVT())
9286 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9287 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9288 LD->getMemOperand()->getFlags());
9289
9290 // We need to create a zextload/sextload. We cannot just create a load
9291 // followed by a zext/zext node because LowerMUL is also run during normal
9292 // operation legalization where we can't create illegal types.
9293 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9294 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9295 LD->getMemoryVT(), LD->getAlign(),
9296 LD->getMemOperand()->getFlags());
9297}
9298
9299/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9300/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9301/// the unextended value. The unextended vector should be 64 bits so that it can
9302/// be used as an operand to a VMULL instruction. If the original vector size
9303 /// before extension is less than 64 bits we add an extension to resize
9304/// the vector to 64 bits.
9305 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
9306 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9307 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9308 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9309 N->getOperand(0)->getValueType(0),
9310 N->getValueType(0),
9311 N->getOpcode());
9312
9313 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9314 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9315 "Expected extending load");
9316
9317 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
9318 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9319 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9320 SDValue extLoad =
9321 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
9322 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9323
9324 return newLoad;
9325 }
9326
9327 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9328 // have been legalized as a BITCAST from v4i32.
9329 if (N->getOpcode() == ISD::BITCAST) {
9330 SDNode *BVN = N->getOperand(0).getNode();
9331 assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
9332 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
9333 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
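// On little-endian targets the low 32 bits of each v2i64 element are
// BUILD_VECTOR operands 0 and 2; on big-endian they are operands 1 and 3,
// which is what the LowElt selection above picks out.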
9334 return DAG.getBuildVector(
9335 MVT::v2i32, SDLoc(N),
9336 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9337 }
9338 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9339 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9340 EVT VT = N->getValueType(0);
9341 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9342 unsigned NumElts = VT.getVectorNumElements();
9343 MVT TruncVT = MVT::getIntegerVT(EltSize);
9344 SmallVector<SDValue, 8> Ops;
9345 SDLoc dl(N);
9346 for (unsigned i = 0; i != NumElts; ++i) {
9347 const APInt &CInt = N->getConstantOperandAPInt(i);
9348 // Element types smaller than 32 bits are not legal, so use i32 elements.
9349 // The values are implicitly truncated so sext vs. zext doesn't matter.
9350 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9351 }
9352 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9353}
9354
9355static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9356 unsigned Opcode = N->getOpcode();
9357 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9358 SDNode *N0 = N->getOperand(0).getNode();
9359 SDNode *N1 = N->getOperand(1).getNode();
9360 return N0->hasOneUse() && N1->hasOneUse() &&
9361 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9362 }
9363 return false;
9364}
9365
9366static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9367 unsigned Opcode = N->getOpcode();
9368 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9369 SDNode *N0 = N->getOperand(0).getNode();
9370 SDNode *N1 = N->getOperand(1).getNode();
9371 return N0->hasOneUse() && N1->hasOneUse() &&
9372 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9373 }
9374 return false;
9375}
9376
9377static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
9378 // Multiplications are only custom-lowered for 128-bit vectors so that
9379 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9380 EVT VT = Op.getValueType();
9381 assert(VT.is128BitVector() && VT.isInteger() &&
9382 "unexpected type for custom-lowering ISD::MUL");
9383 SDNode *N0 = Op.getOperand(0).getNode();
9384 SDNode *N1 = Op.getOperand(1).getNode();
9385 unsigned NewOpc = 0;
9386 bool isMLA = false;
9387 bool isN0SExt = isSignExtended(N0, DAG);
9388 bool isN1SExt = isSignExtended(N1, DAG);
9389 if (isN0SExt && isN1SExt)
9390 NewOpc = ARMISD::VMULLs;
9391 else {
9392 bool isN0ZExt = isZeroExtended(N0, DAG);
9393 bool isN1ZExt = isZeroExtended(N1, DAG);
9394 if (isN0ZExt && isN1ZExt)
9395 NewOpc = ARMISD::VMULLu;
9396 else if (isN1SExt || isN1ZExt) {
9397 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9398 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9399 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9400 NewOpc = ARMISD::VMULLs;
9401 isMLA = true;
9402 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9403 NewOpc = ARMISD::VMULLu;
9404 isMLA = true;
9405 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
9406 std::swap(N0, N1);
9407 NewOpc = ARMISD::VMULLu;
9408 isMLA = true;
9409 }
9410 }
9411
9412 if (!NewOpc) {
9413 if (VT == MVT::v2i64)
9414 // Fall through to expand this. It is not legal.
9415 return SDValue();
9416 else
9417 // Other vector multiplications are legal.
9418 return Op;
9419 }
9420 }
9421
9422 // Legalize to a VMULL instruction.
9423 SDLoc DL(Op);
9424 SDValue Op0;
9425 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9426 if (!isMLA) {
9427 Op0 = SkipExtensionForVMULL(N0, DAG);
9428 assert(Op0.getValueType().is64BitVector() &&
9429 Op1.getValueType().is64BitVector() &&
9430 "unexpected types for extended operands to VMULL");
9431 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9432 }
9433
9434 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9435 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9436 // vmull q0, d4, d6
9437 // vmlal q0, d5, d6
9438 // is faster than
9439 // vaddl q0, d4, d5
9440 // vmovl q1, d6
9441 // vmul q0, q0, q1
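// At this point N0 is the (add/sub (ext A), (ext B)) node: strip the
// extension from each addend so that each half can feed its own VMULL, then
// recombine the two VMULL results with the original add/sub opcode.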
9442 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9443 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9444 EVT Op1VT = Op1.getValueType();
9445 return DAG.getNode(N0->getOpcode(), DL, VT,
9446 DAG.getNode(NewOpc, DL, VT,
9447 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9448 DAG.getNode(NewOpc, DL, VT,
9449 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9450}
9451
9452static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
9453 SelectionDAG &DAG) {
9454 // TODO: Should this propagate fast-math-flags?
9455
9456 // Convert to float
9457 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9458 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9459 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9460 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9461 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9462 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9463 // Get reciprocal estimate.
9464 // float4 recip = vrecpeq_f32(yf);
9465 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9466 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9467 Y);
9468 // Because char has a smaller range than uchar, we can actually get away
9469 // without any newton steps. This requires that we use a weird bias
9470 // of 0xb000, however (again, this has been exhaustively tested).
9471 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9472 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
9473 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9474 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9475 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9476 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9477 // Convert back to short.
9478 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9479 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9480 return X;
9481}
9482
9483static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
9484 SelectionDAG &DAG) {
9485 // TODO: Should this propagate fast-math-flags?
9486
9487 SDValue N2;
9488 // Convert to float.
9489 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9490 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9491 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9492 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9493 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9494 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9495
9496 // Use reciprocal estimate and one refinement step.
9497 // float4 recip = vrecpeq_f32(yf);
9498 // recip *= vrecpsq_f32(yf, recip);
9499 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9500 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9501 N1);
9502 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9503 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9504 N1, N2);
9505 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
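// N2 now holds the reciprocal of yf refined by one Newton-Raphson step
// (recip *= vrecps(yf, recip)).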
9506 // Because short has a smaller range than ushort, we can actually get away
9507 // with only a single Newton step. This requires that we use a weird bias
9508 // of 0x89, however (again, this has been exhaustively tested).
9509 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9510 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9511 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9512 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9513 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9514 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9515 // Convert back to integer and return.
9516 // return vmovn_s32(vcvt_s32_f32(result));
9517 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9518 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9519 return N0;
9520}
9521
9522static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG,
9523 const ARMSubtarget *ST) {
9524 EVT VT = Op.getValueType();
9525 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9526 "unexpected type for custom-lowering ISD::SDIV");
9527
9528 SDLoc dl(Op);
9529 SDValue N0 = Op.getOperand(0);
9530 SDValue N1 = Op.getOperand(1);
9531 SDValue N2, N3;
9532
9533 if (VT == MVT::v8i8) {
9534 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9535 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9536
9537 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9538 DAG.getIntPtrConstant(4, dl));
9539 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9540 DAG.getIntPtrConstant(4, dl));
9541 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9542 DAG.getIntPtrConstant(0, dl));
9543 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9544 DAG.getIntPtrConstant(0, dl));
9545
9546 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9547 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9548
9549 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9550 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9551
9552 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9553 return N0;
9554 }
9555 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9556}
9557
9558static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG,
9559 const ARMSubtarget *ST) {
9560 // TODO: Should this propagate fast-math-flags?
9561 EVT VT = Op.getValueType();
9562 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9563 "unexpected type for custom-lowering ISD::UDIV");
9564
9565 SDLoc dl(Op);
9566 SDValue N0 = Op.getOperand(0);
9567 SDValue N1 = Op.getOperand(1);
9568 SDValue N2, N3;
9569
9570 if (VT == MVT::v8i8) {
9571 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9572 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9573
9574 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9575 DAG.getIntPtrConstant(4, dl));
9576 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9577 DAG.getIntPtrConstant(4, dl));
9578 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9579 DAG.getIntPtrConstant(0, dl));
9580 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9581 DAG.getIntPtrConstant(0, dl));
9582
9583 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9584 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9585
9586 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9587 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9588
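 // Narrow the v8i16 quotients back to v8i8 using the saturating
 // signed-to-unsigned narrowing intrinsic; a u8/u8 quotient always fits.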
9589 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9590 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9591 MVT::i32),
9592 N0);
9593 return N0;
9594 }
9595
9596 // v4i16 sdiv ... Convert to float.
9597 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9598 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9599 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9600 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9601 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9602 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9603
9604 // Use reciprocal estimate and two refinement steps.
9605 // float4 recip = vrecpeq_f32(yf);
9606 // recip *= vrecpsq_f32(yf, recip);
9607 // recip *= vrecpsq_f32(yf, recip);
9608 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9609 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9610 BN1);
9611 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9612 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9613 BN1, N2);
9614 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9615 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9616 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9617 BN1, N2);
9618 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9619 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9620 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9621 // and that it will never cause us to return an answer too large).
9622 // float4 result = as_float4(as_int4(xf*recip) + 2);
9623 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9624 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9625 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9626 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9627 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9628 // Convert back to integer and return.
9629 // return vmovn_u32(vcvt_s32_f32(result));
9630 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9631 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9632 return N0;
9633}
9634
9635static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
9636 SDNode *N = Op.getNode();
9637 EVT VT = N->getValueType(0);
9638 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
9639
9640 SDValue Carry = Op.getOperand(2);
9641
9642 SDLoc DL(Op);
9643
9644 SDValue Result;
9645 if (Op.getOpcode() == ISD::UADDO_CARRY) {
9646 // This converts the boolean value carry into the carry flag.
9647 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9648
9649 // Do the addition proper using the carry flag we wanted.
9650 Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0),
9651 Op.getOperand(1), Carry);
9652
9653 // Now convert the carry flag into a boolean value.
9654 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9655 } else {
9656 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
9657 // have to invert the carry first.
9658 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9659 DAG.getConstant(1, DL, MVT::i32), Carry);
9660 // This converts the boolean value carry into the carry flag.
9661 Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG);
9662
9663 // Do the subtraction proper using the carry flag we wanted.
9664 Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0),
9665 Op.getOperand(1), Carry);
9666
9667 // Now convert the carry flag into a boolean value.
9668 Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG);
9669 // But the carry returned by ARMISD::SUBE is not a borrow as expected
9670 // by ISD::USUBO_CARRY, so compute 1 - C.
9671 Carry = DAG.getNode(ISD::SUB, DL, MVT::i32,
9672 DAG.getConstant(1, DL, MVT::i32), Carry);
9673 }
9674
9675 // Return both values.
9676 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry);
9677}
9678
9679SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9680 bool Signed,
9681 SDValue &Chain) const {
9682 EVT VT = Op.getValueType();
9683 assert((VT == MVT::i32 || VT == MVT::i64) &&
9684 "unexpected type for custom lowering DIV");
9685 SDLoc dl(Op);
9686
9687 const auto &DL = DAG.getDataLayout();
9688 RTLIB::Libcall LC;
9689 if (Signed)
9690 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9691 else
9692 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9693
9694 const char *Name = getLibcallName(LC);
9695 SDValue ES = DAG.getExternalSymbol(Name, getPointerTy(DL));
9696
9696
9697 ARMTargetLowering::ArgListTy Args;
9698
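 // Operand 1 (the divisor) is passed before operand 0 (the dividend); the
 // Windows RT ABI division helpers expect the denominator first.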
9699 for (auto AI : {1, 0}) {
9700 SDValue Operand = Op.getOperand(AI);
9701 Args.emplace_back(Operand,
9702 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9703 }
9704
9705 CallLoweringInfo CLI(DAG);
9706 CLI.setDebugLoc(dl)
9707 .setChain(Chain)
9709 ES, std::move(Args));
9710
9711 return LowerCallTo(CLI).first;
9712}
9713
9714// This is a code size optimisation: return the original SDIV node to
9715// DAGCombiner when we don't want to expand SDIV into a sequence of
9716// instructions, and an empty node otherwise which will cause the
9717// SDIV to be expanded in DAGCombine.
9718SDValue
9719ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9720 SelectionDAG &DAG,
9721 SmallVectorImpl<SDNode *> &Created) const {
9722 // TODO: Support SREM
9723 if (N->getOpcode() != ISD::SDIV)
9724 return SDValue();
9725
9726 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9727 const bool MinSize = ST.hasMinSize();
9728 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9729 : ST.hasDivideInARMMode();
9730
9731 // Don't touch vector types; rewriting this may lead to scalarizing
9732 // the int divs.
9733 if (N->getOperand(0).getValueType().isVector())
9734 return SDValue();
9735
9736 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9737 // hwdiv support for this to be really profitable.
9738 if (!(MinSize && HasDivide))
9739 return SDValue();
9740
9741 // ARM mode is a bit simpler than Thumb: we can handle large power
9742 // of 2 immediates with 1 mov instruction; no further checks required,
9743 // just return the sdiv node.
9744 if (!ST.isThumb())
9745 return SDValue(N, 0);
9746
9747 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9748 // and thus lose the code size benefits of a MOVS that requires only 2.
9749 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9750 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9751 if (Divisor.sgt(128))
9752 return SDValue();
9753
9754 return SDValue(N, 0);
9755}
9756
9757SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9758 bool Signed) const {
9759 assert(Op.getValueType() == MVT::i32 &&
9760 "unexpected type for custom lowering DIV");
9761 SDLoc dl(Op);
9762
9763 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9764 DAG.getEntryNode(), Op.getOperand(1));
9765
9766 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9767}
9768
9769static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
9770 SDLoc DL(N);
9771 SDValue Op = N->getOperand(1);
9772 if (N->getValueType(0) == MVT::i32)
9773 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
9774 SDValue Lo, Hi;
9775 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
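 // A 64-bit denominator is zero iff both 32-bit halves are zero, so the
 // divide-by-zero check is performed on (Lo | Hi).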
9776 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9777 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9778}
9779
9780void ARMTargetLowering::ExpandDIV_Windows(
9781 SDValue Op, SelectionDAG &DAG, bool Signed,
9782 SmallVectorImpl<SDValue> &Results) const {
9783 const auto &DL = DAG.getDataLayout();
9784
9785 assert(Op.getValueType() == MVT::i64 &&
9786 "unexpected type for custom lowering DIV");
9787 SDLoc dl(Op);
9788
9789 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9790
9791 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9792
9793 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9794 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9795 DAG.getConstant(32, dl, getPointerTy(DL)));
9796 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9797
9798 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9799}
9800
9801static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
9802 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9803 EVT MemVT = LD->getMemoryVT();
9804 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9805 MemVT == MVT::v16i1) &&
9806 "Expected a predicate type!");
9807 assert(MemVT == Op.getValueType());
9808 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9809 "Expected a non-extending load");
9810 assert(LD->isUnindexed() && "Expected a unindexed load");
9811
9812 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
9813 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9814 // need to make sure that 8/4/2 bits are actually loaded into the correct
9815 // place, which means loading the value and then shuffling the values into
9816 // the bottom bits of the predicate.
9817 // Equally, a VLDR for a v16i1 will actually load 32 bits (so will be
9818 // incorrect for BE).
9819 // For big-endian, the rest of LLVM expects the predicate bits in the reverse
9820 // order of a natural VMSR(load), so the loaded value needs to be reversed.
9821
9822 SDLoc dl(Op);
9823 SDValue Load = DAG.getExtLoad(
9824 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
9825 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9826 LD->getMemOperand());
9827 SDValue Val = Load;
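 // For big-endian the loaded word is bit-reversed and shifted so that the
 // MemVT predicate bits end up in the low bits in the expected lane order.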
9828 if (DAG.getDataLayout().isBigEndian())
9829 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9830 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9831 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9832 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
9833 if (MemVT != MVT::v16i1)
9834 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9835 DAG.getConstant(0, dl, MVT::i32));
9836 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9837}
9838
9839void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9840 SelectionDAG &DAG) const {
9841 LoadSDNode *LD = cast<LoadSDNode>(N);
9842 EVT MemVT = LD->getMemoryVT();
9843 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9844
9845 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9846 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9847 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9848 SDLoc dl(N);
9849 SDValue Result = DAG.getMemIntrinsicNode(
9850 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9851 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
9852 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9853 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9854 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
9855 Results.append({Pair, Result.getValue(2)});
9856 }
9857}
9858
9859static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
9860 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9861 EVT MemVT = ST->getMemoryVT();
9862 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9863 MemVT == MVT::v16i1) &&
9864 "Expected a predicate type!");
9865 assert(MemVT == ST->getValue().getValueType());
9866 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9867 assert(ST->isUnindexed() && "Expected a unindexed store");
9868
9869 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
9870 // top bits unset and a scalar store.
9871 SDLoc dl(Op);
9872 SDValue Build = ST->getValue();
9873 if (MemVT != MVT::v16i1) {
9874 SmallVector<SDValue, 16> Ops;
9875 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9876 unsigned Elt = DAG.getDataLayout().isBigEndian()
9877 ? MemVT.getVectorNumElements() - I - 1
9878 : I;
9879 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9880 DAG.getConstant(Elt, dl, MVT::i32)));
9881 }
9882 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9883 Ops.push_back(DAG.getUNDEF(MVT::i32));
9884 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9885 }
9886 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
9887 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9888 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9889 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9890 DAG.getConstant(16, dl, MVT::i32));
9891 return DAG.getTruncStore(
9892 ST->getChain(), dl, GRP, ST->getBasePtr(),
9893 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()),
9894 ST->getMemOperand());
9895}
9896
9897static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
9898 const ARMSubtarget *Subtarget) {
9899 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9900 EVT MemVT = ST->getMemoryVT();
9901 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
9902
9903 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9904 !Subtarget->isThumb1Only() && ST->isVolatile() &&
9905 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9906 SDNode *N = Op.getNode();
9907 SDLoc dl(N);
9908
9909 SDValue Lo = DAG.getNode(
9910 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9911 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
9912 MVT::i32));
9913 SDValue Hi = DAG.getNode(
9914 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
9915 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
9916 MVT::i32));
9917
9918 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
9919 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
9920 MemVT, ST->getMemOperand());
9921 } else if (Subtarget->hasMVEIntegerOps() &&
9922 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9923 MemVT == MVT::v16i1))) {
9924 return LowerPredicateStore(Op, DAG);
9925 }
9926
9927 return SDValue();
9928}
9929
9930static bool isZeroVector(SDValue N) {
9931 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
9932 (N->getOpcode() == ARMISD::VMOVIMM &&
9933 isNullConstant(N->getOperand(0))));
9934}
9935
9936static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
9937 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
9938 MVT VT = Op.getSimpleValueType();
9939 SDValue Mask = N->getMask();
9940 SDValue PassThru = N->getPassThru();
9941 SDLoc dl(Op);
9942
9943 if (isZeroVector(PassThru))
9944 return Op;
9945
9946 // MVE Masked loads use zero as the passthru value. Here we convert undef to
9947 // zero too, and other values are lowered to a select.
9948 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
9949 DAG.getTargetConstant(0, dl, MVT::i32));
9950 SDValue NewLoad = DAG.getMaskedLoad(
9951 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
9952 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
9953 N->getExtensionType(), N->isExpandingLoad());
9954 SDValue Combo = NewLoad;
9955 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
9956 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
9957 isZeroVector(PassThru->getOperand(0));
9958 if (!PassThru.isUndef() && !PassThruIsCastZero)
9959 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
9960 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
9961}
9962
9963static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
9964 const ARMSubtarget *ST) {
9965 if (!ST->hasMVEIntegerOps())
9966 return SDValue();
9967
9968 SDLoc dl(Op);
9969 unsigned BaseOpcode = 0;
9970 switch (Op->getOpcode()) {
9971 default: llvm_unreachable("Expected VECREDUCE opcode");
9972 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
9973 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
9974 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
9975 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
9976 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
9977 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
9978 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
9979 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
9980 }
9981
9982 SDValue Op0 = Op->getOperand(0);
9983 EVT VT = Op0.getValueType();
9984 EVT EltVT = VT.getVectorElementType();
9985 unsigned NumElts = VT.getVectorNumElements();
9986 unsigned NumActiveLanes = NumElts;
9987
9988 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
9989 NumActiveLanes == 2) &&
9990 "Only expected a power 2 vector size");
9991
9992 // Use Mul(X, Rev(X)) until 4 items remain. Going down to 4 vector elements
9993 // allows us to easily extract vector elements from the lanes.
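 // For example, with v8i16 the VREV32 swaps adjacent lanes, so a single
 // BaseOpcode application folds 8 active lanes down to 4 partial results.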
9994 while (NumActiveLanes > 4) {
9995 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
9996 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
9997 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
9998 NumActiveLanes /= 2;
9999 }
10000
10001 SDValue Res;
10002 if (NumActiveLanes == 4) {
10003 // The remaining 4 elements are combined sequentially.
10004 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10005 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10006 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10007 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10008 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10009 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10010 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10011 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10012 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10013 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10014 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10015 } else {
10016 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10017 DAG.getConstant(0, dl, MVT::i32));
10018 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10019 DAG.getConstant(1, dl, MVT::i32));
10020 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10021 }
10022
10023 // Result type may be wider than element type.
10024 if (EltVT != Op->getValueType(0))
10025 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10026 return Res;
10027}
10028
10029static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
10030 const ARMSubtarget *ST) {
10031 if (!ST->hasMVEFloatOps())
10032 return SDValue();
10033 return LowerVecReduce(Op, DAG, ST);
10034}
10035
10036static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG,
10037 const ARMSubtarget *ST) {
10038 if (!ST->hasNEON())
10039 return SDValue();
10040
10041 SDLoc dl(Op);
10042 SDValue Op0 = Op->getOperand(0);
10043 EVT VT = Op0.getValueType();
10044 EVT EltVT = VT.getVectorElementType();
10045
10046 unsigned PairwiseIntrinsic = 0;
10047 switch (Op->getOpcode()) {
10048 default:
10049 llvm_unreachable("Expected VECREDUCE opcode");
10050 case ISD::VECREDUCE_UMIN:
10051 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10052 break;
10053 case ISD::VECREDUCE_UMAX:
10054 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10055 break;
10056 case ISD::VECREDUCE_SMIN:
10057 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10058 break;
10059 case ISD::VECREDUCE_SMAX:
10060 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10061 break;
10062 }
10063 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10064
10065 unsigned NumElts = VT.getVectorNumElements();
10066 unsigned NumActiveLanes = NumElts;
10067
10068 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10069 NumActiveLanes == 2) &&
10070 "Only expected a power 2 vector size");
10071
10072 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10073 if (VT.is128BitVector()) {
10074 SDValue Lo, Hi;
10075 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10076 VT = Lo.getValueType();
10077 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10078 NumActiveLanes /= 2;
10079 }
10080
10081 // Use pairwise reductions until one lane remains
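 // For example, a v16i8 reduction performs one vpmin/vpmax across the two
 // 64-bit halves and then three more pairwise steps, leaving the result in
 // lane 0.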
10082 while (NumActiveLanes > 1) {
10083 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10084 NumActiveLanes /= 2;
10085 }
10086
10087 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10088 DAG.getConstant(0, dl, MVT::i32));
10089
10090 // Result type may be wider than element type.
10091 if (EltVT != Op.getValueType()) {
10092 unsigned Extend = 0;
10093 switch (Op->getOpcode()) {
10094 default:
10095 llvm_unreachable("Expected VECREDUCE opcode");
10096 case ISD::VECREDUCE_UMIN:
10097 case ISD::VECREDUCE_UMAX:
10098 Extend = ISD::ZERO_EXTEND;
10099 break;
10100 case ISD::VECREDUCE_SMIN:
10101 case ISD::VECREDUCE_SMAX:
10102 Extend = ISD::SIGN_EXTEND;
10103 break;
10104 }
10105 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10106 }
10107 return Res;
10108}
10109
10110static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
10111 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10112 // Acquire/Release load/store is not legal for targets without a dmb or
10113 // equivalent available.
10114 return SDValue();
10115
10116 // Monotonic load/store is legal for all targets.
10117 return Op;
10118}
10119
10120static void ReplaceREADCYCLECOUNTER(SDNode *N,
10121 SmallVectorImpl<SDValue> &Results,
10122 SelectionDAG &DAG,
10123 const ARMSubtarget *Subtarget) {
10124 SDLoc DL(N);
10125 // Under Power Management extensions, the cycle-count is:
10126 // mrc p15, #0, <Rt>, c9, c13, #0
10127 SDValue Ops[] = { N->getOperand(0), // Chain
10128 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10129 DAG.getTargetConstant(15, DL, MVT::i32),
10130 DAG.getTargetConstant(0, DL, MVT::i32),
10131 DAG.getTargetConstant(9, DL, MVT::i32),
10132 DAG.getTargetConstant(13, DL, MVT::i32),
10133 DAG.getTargetConstant(0, DL, MVT::i32)
10134 };
10135
10136 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10137 DAG.getVTList(MVT::i32, MVT::Other), Ops);
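 // READCYCLECOUNTER produces an i64, so pair the 32-bit counter with a zero
 // high word.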
10138 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10139 DAG.getConstant(0, DL, MVT::i32)));
10140 Results.push_back(Cycles32.getValue(1));
10141}
10142
10143static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0,
10144 SDValue V1) {
10145 SDLoc dl(V0.getNode());
10146 SDValue RegClass =
10147 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10148 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10149 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10150 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10151 return SDValue(
10152 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10153}
10154
10155static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V) {
10156 SDLoc dl(V.getNode());
10157 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10158 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10159 if (isBigEndian)
10160 std::swap(VLo, VHi);
10161 return createGPRPairNode2xi32(DAG, VLo, VHi);
10162}
10163
10164static void ReplaceCMP_SWAP_64Results(SDNode *N,
10165 SmallVectorImpl<SDValue> &Results,
10166 SelectionDAG &DAG) {
10167 assert(N->getValueType(0) == MVT::i64 &&
10168 "AtomicCmpSwap on types less than 64 should be legal");
10169 SDValue Ops[] = {
10170 createGPRPairNode2xi32(DAG, N->getOperand(1),
10171 DAG.getUNDEF(MVT::i32)), // pointer, temp
10172 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10173 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10174 N->getOperand(0), // chain in
10175 };
10176 SDNode *CmpSwap = DAG.getMachineNode(
10177 ARM::CMP_SWAP_64, SDLoc(N),
10178 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10179
10180 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10181 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10182
10183 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10184
10185 SDValue Lo =
10186 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10187 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10188 SDValue Hi =
10189 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10190 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10191 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10192 Results.push_back(SDValue(CmpSwap, 2));
10193}
10194
10195SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10196 SDLoc dl(Op);
10197 EVT VT = Op.getValueType();
10198 SDValue Chain = Op.getOperand(0);
10199 SDValue LHS = Op.getOperand(1);
10200 SDValue RHS = Op.getOperand(2);
10201 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10202 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10203
10204 // If we don't have instructions of this float type then soften to a libcall
10205 // and use SETCC instead.
10206 if (isUnsupportedFloatingType(LHS.getValueType())) {
10207 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10208 Chain, IsSignaling);
10209 if (!RHS.getNode()) {
10210 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10211 CC = ISD::SETNE;
10212 }
10213 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10214 DAG.getCondCode(CC));
10215 return DAG.getMergeValues({Result, Chain}, dl);
10216 }
10217
10218 ARMCC::CondCodes CondCode, CondCode2;
10219 FPCCToARMCC(CC, CondCode, CondCode2);
10220
10221 SDValue True = DAG.getConstant(1, dl, VT);
10222 SDValue False = DAG.getConstant(0, dl, VT);
10223 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10224 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10225 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10226 if (CondCode2 != ARMCC::AL) {
10227 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10228 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10229 }
10230 return DAG.getMergeValues({Result, Chain}, dl);
10231}
10232
10233SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10234 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10235
10236 EVT VT = getPointerTy(DAG.getDataLayout());
10237 int FI = MFI.CreateFixedObject(4, 0, false);
10238 return DAG.getFrameIndex(FI, VT);
10239}
10240
10241SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10242 SelectionDAG &DAG) const {
10243 SDLoc DL(Op);
10244 MakeLibCallOptions CallOptions;
10245 MVT SVT = Op.getOperand(0).getSimpleValueType();
10246 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10247 SDValue Res =
10248 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10249 return DAG.getBitcast(MVT::i32, Res);
10250}
10251
10252SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10253 SDLoc dl(Op);
10254 SDValue LHS = Op.getOperand(0);
10255 SDValue RHS = Op.getOperand(1);
10256
10257 // Determine if this is signed or unsigned comparison
10258 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10259
10260 // Special case for Thumb1 UCMP only
10261 if (!IsSigned && Subtarget->isThumb1Only()) {
10262 // For Thumb unsigned comparison, use this sequence:
10263 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10264 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10265 // cmp r1, r0 ; compare RHS with LHS
10266 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10267 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
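 // Net effect: result = (LHS < RHS ? -1 : 0) - (RHS < LHS ? -1 : 0), which is
 // -1, 0 or 1 as required by ucmp.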
10268
10269 // First subtraction: LHS - RHS
10270 SDValue Sub1WithFlags = DAG.getNode(
10271 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10272 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10273 SDValue Flags1 = Sub1WithFlags.getValue(1);
10274
10275 // SUBE: Sub1Result - Sub1Result - !carry
10276 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10277 SDValue Sbc1 =
10278 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10279 Sub1Result, Sub1Result, Flags1);
10280 SDValue Sbc1Result = Sbc1.getValue(0);
10281
10282 // Second comparison: RHS vs LHS (reverse comparison)
10283 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10284
10285 // SUBE: RHS - RHS - !carry
10286 // This gives 0 if RHS >= LHS (unsigned), -1 if RHS < LHS (unsigned)
10287 SDValue Sbc2 = DAG.getNode(
10288 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10289 SDValue Sbc2Result = Sbc2.getValue(0);
10290
10291 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10292 SDValue Result =
10293 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10294 if (Op.getValueType() != MVT::i32)
10295 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10296
10297 return Result;
10298 }
10299
10300 // For the ARM assembly pattern:
10301 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10302 // movgt r0, #1 ; if LHS > RHS, set result to 1 (GT for signed, HI for unsigned)
10303 // mvnlt r0, #0 ; if LHS < RHS, set result to -1 (LT for signed, LO for unsigned)
10304 //
10305 // ; if LHS == RHS, result remains 0 from the subs
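 // For example, scmp(3, 7): the subtraction produces -4 with N set, so GT
 // fails and LT fires, giving a final result of -1.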
10306
10307 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10308 unsigned Opcode = ARMISD::SUBC;
10309
10310 // Check if RHS is a subtraction against 0: (0 - X)
10311 if (RHS.getOpcode() == ISD::SUB) {
10312 SDValue SubLHS = RHS.getOperand(0);
10313 SDValue SubRHS = RHS.getOperand(1);
10314
10315 // Check if it's 0 - X
10316 if (isNullConstant(SubLHS)) {
10317 bool CanUseAdd = false;
10318 if (IsSigned) {
10319 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
10320 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10322 .isMinSignedValue()) {
10323 CanUseAdd = true;
10324 }
10325 } else {
10326 // For UCMP: only if X is known to never be zero
10327 if (DAG.isKnownNeverZero(SubRHS)) {
10328 CanUseAdd = true;
10329 }
10330 }
10331
10332 if (CanUseAdd) {
10333 Opcode = ARMISD::ADDC;
10334 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10335 // LHS - (0 - X)
10336 }
10337 }
10338 }
10339
10340 // Generate the operation with flags
10341 SDValue OpWithFlags =
10342 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10343
10344 SDValue OpResult = OpWithFlags.getValue(0);
10345 SDValue Flags = OpWithFlags.getValue(1);
10346
10347 // Constants for conditional moves
10348 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10349 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10350
10351 // Select condition codes based on signed vs unsigned
10352 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10353 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10354
10355 // First conditional move: if greater than, set to 1
10356 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10357 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10358 GTCondValue, Flags);
10359
10360 // Second conditional move: if less than, set to -1
10361 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10362 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10363 LTCondValue, Flags);
10364
10365 if (Op.getValueType() != MVT::i32)
10366 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10367
10368 return Result2;
10369}
10370
10371SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
10372 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10373 switch (Op.getOpcode()) {
10374 default: llvm_unreachable("Don't know how to custom lower this!");
10375 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10376 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10377 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10378 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10379 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10380 case ISD::SELECT: return LowerSELECT(Op, DAG);
10381 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10382 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10383 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10384 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10385 case ISD::VASTART: return LowerVASTART(Op, DAG);
10386 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10387 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10388 case ISD::SINT_TO_FP:
10389 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10390 case ISD::STRICT_FP_TO_SINT:
10391 case ISD::STRICT_FP_TO_UINT:
10392 case ISD::FP_TO_SINT:
10393 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10394 case ISD::FP_TO_SINT_SAT:
10395 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10396 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10397 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10398 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10399 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10400 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10401 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10402 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10403 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10404 Subtarget);
10405 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10406 case ISD::SHL:
10407 case ISD::SRL:
10408 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10409 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10410 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10411 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10412 case ISD::SRL_PARTS:
10413 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10414 case ISD::CTTZ:
10415 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10416 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10417 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10418 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10419 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10420 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10421 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10422 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10423 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10424 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10425 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10426 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10427 case ISD::SIGN_EXTEND:
10428 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10429 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10430 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10431 case ISD::SET_FPMODE:
10432 return LowerSET_FPMODE(Op, DAG);
10433 case ISD::RESET_FPMODE:
10434 return LowerRESET_FPMODE(Op, DAG);
10435 case ISD::MUL: return LowerMUL(Op, DAG);
10436 case ISD::SDIV:
10437 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10438 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10439 return LowerSDIV(Op, DAG, Subtarget);
10440 case ISD::UDIV:
10441 if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
10442 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10443 return LowerUDIV(Op, DAG, Subtarget);
10444 case ISD::UADDO_CARRY:
10445 case ISD::USUBO_CARRY:
10446 return LowerUADDSUBO_CARRY(Op, DAG);
10447 case ISD::SADDO:
10448 case ISD::SSUBO:
10449 return LowerSignedALUO(Op, DAG);
10450 case ISD::UADDO:
10451 case ISD::USUBO:
10452 return LowerUnsignedALUO(Op, DAG);
10453 case ISD::SADDSAT:
10454 case ISD::SSUBSAT:
10455 case ISD::UADDSAT:
10456 case ISD::USUBSAT:
10457 return LowerADDSUBSAT(Op, DAG, Subtarget);
10458 case ISD::LOAD:
10459 return LowerPredicateLoad(Op, DAG);
10460 case ISD::STORE:
10461 return LowerSTORE(Op, DAG, Subtarget);
10462 case ISD::MLOAD:
10463 return LowerMLOAD(Op, DAG);
10464 case ISD::VECREDUCE_MUL:
10465 case ISD::VECREDUCE_AND:
10466 case ISD::VECREDUCE_OR:
10467 case ISD::VECREDUCE_XOR:
10468 return LowerVecReduce(Op, DAG, Subtarget);
10469 case ISD::VECREDUCE_FADD:
10470 case ISD::VECREDUCE_FMUL:
10471 case ISD::VECREDUCE_FMIN:
10472 case ISD::VECREDUCE_FMAX:
10473 return LowerVecReduceF(Op, DAG, Subtarget);
10474 case ISD::VECREDUCE_UMIN:
10475 case ISD::VECREDUCE_UMAX:
10476 case ISD::VECREDUCE_SMIN:
10477 case ISD::VECREDUCE_SMAX:
10478 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10479 case ISD::ATOMIC_LOAD:
10480 case ISD::ATOMIC_STORE:
10481 return LowerAtomicLoadStore(Op, DAG);
10482 case ISD::SDIVREM:
10483 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10484 case ISD::DYNAMIC_STACKALLOC:
10485 if (Subtarget->isTargetWindows())
10486 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10487 llvm_unreachable("Don't know how to custom lower this!");
10488 case ISD::STRICT_FP_ROUND:
10489 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10490 case ISD::STRICT_FP_EXTEND:
10491 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10492 case ISD::STRICT_FSETCC:
10493 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10494 case ISD::SPONENTRY:
10495 return LowerSPONENTRY(Op, DAG);
10496 case ISD::FP_TO_BF16:
10497 return LowerFP_TO_BF16(Op, DAG);
10498 case ARMISD::WIN__DBZCHK: return SDValue();
10499 case ISD::UCMP:
10500 case ISD::SCMP:
10501 return LowerCMP(Op, DAG);
10502 case ISD::ABS:
10503 return LowerABS(Op, DAG);
10504 case ISD::STRICT_LROUND:
10505 case ISD::STRICT_LLROUND:
10506 case ISD::STRICT_LRINT:
10507 case ISD::STRICT_LLRINT: {
10508 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10509 Op.getOperand(1).getValueType() == MVT::bf16) &&
10510 "Expected custom lowering of rounding operations only for f16");
10511 SDLoc DL(Op);
10512 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10513 {Op.getOperand(0), Op.getOperand(1)});
10514 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10515 {Ext.getValue(1), Ext.getValue(0)});
10516 }
10517 }
10518}
10519
10520static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
10521 SelectionDAG &DAG) {
10522 unsigned IntNo = N->getConstantOperandVal(0);
10523 unsigned Opc = 0;
10524 if (IntNo == Intrinsic::arm_smlald)
10525 Opc = ARMISD::SMLALD;
10526 else if (IntNo == Intrinsic::arm_smlaldx)
10527 Opc = ARMISD::SMLALDX;
10528 else if (IntNo == Intrinsic::arm_smlsld)
10529 Opc = ARMISD::SMLSLD;
10530 else if (IntNo == Intrinsic::arm_smlsldx)
10531 Opc = ARMISD::SMLSLDX;
10532 else
10533 return;
10534
10535 SDLoc dl(N);
10536 SDValue Lo, Hi;
10537 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10538
10539 SDValue LongMul = DAG.getNode(Opc, dl,
10540 DAG.getVTList(MVT::i32, MVT::i32),
10541 N->getOperand(1), N->getOperand(2),
10542 Lo, Hi);
10543 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10544 LongMul.getValue(0), LongMul.getValue(1)));
10545}
10546
10547/// ReplaceNodeResults - Replace the results of node with an illegal result
10548/// type with new values built out of custom code.
10549void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
10550 SmallVectorImpl<SDValue> &Results,
10551 SelectionDAG &DAG) const {
10552 SDValue Res;
10553 switch (N->getOpcode()) {
10554 default:
10555 llvm_unreachable("Don't know how to custom expand this!");
10556 case ISD::READ_REGISTER:
10557 ExpandREAD_REGISTER(N, Results, DAG);
10558 break;
10559 case ISD::BITCAST:
10560 Res = ExpandBITCAST(N, DAG, Subtarget);
10561 break;
10562 case ISD::SRL:
10563 case ISD::SRA:
10564 case ISD::SHL:
10565 Res = Expand64BitShift(N, DAG, Subtarget);
10566 break;
10567 case ISD::SREM:
10568 case ISD::UREM:
10569 Res = LowerREM(N, DAG);
10570 break;
10571 case ISD::SDIVREM:
10572 case ISD::UDIVREM:
10573 Res = LowerDivRem(SDValue(N, 0), DAG);
10574 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10575 Results.push_back(Res.getValue(0));
10576 Results.push_back(Res.getValue(1));
10577 return;
10578 case ISD::SADDSAT:
10579 case ISD::SSUBSAT:
10580 case ISD::UADDSAT:
10581 case ISD::USUBSAT:
10582 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10583 break;
10584 case ISD::READCYCLECOUNTER:
10585 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10586 return;
10587 case ISD::UDIV:
10588 case ISD::SDIV:
10589 assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
10590 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10591 Results);
10592 case ISD::ATOMIC_CMP_SWAP:
10593 ReplaceCMP_SWAP_64Results(N, Results, DAG);
10594 return;
10595 case ISD::INTRINSIC_WO_CHAIN:
10596 return ReplaceLongIntrinsic(N, Results, DAG);
10597 case ISD::LOAD:
10598 LowerLOAD(N, Results, DAG);
10599 break;
10600 case ISD::TRUNCATE:
10601 Res = LowerTruncate(N, DAG, Subtarget);
10602 break;
10603 case ISD::SIGN_EXTEND:
10604 case ISD::ZERO_EXTEND:
10605 Res = LowerVectorExtend(N, DAG, Subtarget);
10606 break;
10607 case ISD::FP_TO_SINT_SAT:
10608 case ISD::FP_TO_UINT_SAT:
10609 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10610 break;
10611 }
10612 if (Res.getNode())
10613 Results.push_back(Res);
10614}
10615
10616//===----------------------------------------------------------------------===//
10617// ARM Scheduler Hooks
10618//===----------------------------------------------------------------------===//
10619
10620/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10621/// registers the function context.
10622void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10624 MachineBasicBlock *DispatchBB,
10625 int FI) const {
10626 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10627 "ROPI/RWPI not currently supported with SjLj");
10628 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10629 DebugLoc dl = MI.getDebugLoc();
10630 MachineFunction *MF = MBB->getParent();
10631 MachineRegisterInfo *MRI = &MF->getRegInfo();
10632 MachineConstantPool *MCP = MF->getConstantPool();
10633 ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
10634 const Function &F = MF->getFunction();
10635
10636 bool isThumb = Subtarget->isThumb();
10637 bool isThumb2 = Subtarget->isThumb2();
10638
10639 unsigned PCLabelId = AFI->createPICLabelUId();
10640 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10641 ARMConstantPoolValue *CPV =
10642 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10643 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10644
10645 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10646 : &ARM::GPRRegClass;
10647
10648 // Grab constant pool and fixed stack memory operands.
10649 MachineMemOperand *CPMMO =
10650 MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
10651 MachineMemOperand::MOLoad, 4, Align(4));
10652
10653 MachineMemOperand *FIMMOSt =
10654 MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
10655 MachineMemOperand::MOStore, 4, Align(4));
10656
10657 // Load the address of the dispatch MBB into the jump buffer.
10658 if (isThumb2) {
10659 // Incoming value: jbuf
10660 // ldr.n r5, LCPI1_1
10661 // orr r5, r5, #1
10662 // add r5, pc
10663 // str r5, [$jbuf, #+4] ; &jbuf[1]
10664 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10665 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10667 .addMemOperand(CPMMO)
10669 // Set the low bit because of thumb mode.
10670 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10671 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10672 .addReg(NewVReg1, RegState::Kill)
10673 .addImm(0x01)
10675 .add(condCodeOp());
10676 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10677 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10678 .addReg(NewVReg2, RegState::Kill)
10679 .addImm(PCLabelId);
10680 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10681 .addReg(NewVReg3, RegState::Kill)
10682 .addFrameIndex(FI)
10683 .addImm(36) // &jbuf[1] :: pc
10684 .addMemOperand(FIMMOSt)
10686 } else if (isThumb) {
10687 // Incoming value: jbuf
10688 // ldr.n r1, LCPI1_4
10689 // add r1, pc
10690 // mov r2, #1
10691 // orrs r1, r2
10692 // add r2, $jbuf, #+4 ; &jbuf[1]
10693 // str r1, [r2]
10694 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10695 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10697 .addMemOperand(CPMMO)
10699 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10700 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10701 .addReg(NewVReg1, RegState::Kill)
10702 .addImm(PCLabelId);
10703 // Set the low bit because of thumb mode.
10704 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10705 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10706 .addReg(ARM::CPSR, RegState::Define)
10707 .addImm(1)
10709 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10710 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10711 .addReg(ARM::CPSR, RegState::Define)
10712 .addReg(NewVReg2, RegState::Kill)
10713 .addReg(NewVReg3, RegState::Kill)
10715 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10716 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10717 .addFrameIndex(FI)
10718 .addImm(36); // &jbuf[1] :: pc
10719 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10720 .addReg(NewVReg4, RegState::Kill)
10721 .addReg(NewVReg5, RegState::Kill)
10722 .addImm(0)
10723 .addMemOperand(FIMMOSt)
10725 } else {
10726 // Incoming value: jbuf
10727 // ldr r1, LCPI1_1
10728 // add r1, pc, r1
10729 // str r1, [$jbuf, #+4] ; &jbuf[1]
10730 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10731 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10733 .addImm(0)
10734 .addMemOperand(CPMMO)
10736 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10737 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10738 .addReg(NewVReg1, RegState::Kill)
10739 .addImm(PCLabelId)
10741 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10742 .addReg(NewVReg2, RegState::Kill)
10743 .addFrameIndex(FI)
10744 .addImm(36) // &jbuf[1] :: pc
10745 .addMemOperand(FIMMOSt)
10747 }
10748}
10749
10750void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10751 MachineBasicBlock *MBB) const {
10752 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10753 DebugLoc dl = MI.getDebugLoc();
10754 MachineFunction *MF = MBB->getParent();
10755 MachineRegisterInfo *MRI = &MF->getRegInfo();
10756 MachineFrameInfo &MFI = MF->getFrameInfo();
10757 int FI = MFI.getFunctionContextIndex();
10758
10759 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10760 : &ARM::GPRnopcRegClass;
10761
10762 // Get a mapping of the call site numbers to all of the landing pads they're
10763 // associated with.
10764 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10765 unsigned MaxCSNum = 0;
10766 for (MachineBasicBlock &BB : *MF) {
10767 if (!BB.isEHPad())
10768 continue;
10769
10770 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10771 // pad.
10772 for (MachineInstr &II : BB) {
10773 if (!II.isEHLabel())
10774 continue;
10775
10776 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10777 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10778
10779 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10780 for (unsigned Idx : CallSiteIdxs) {
10781 CallSiteNumToLPad[Idx].push_back(&BB);
10782 MaxCSNum = std::max(MaxCSNum, Idx);
10783 }
10784 break;
10785 }
10786 }
10787
10788 // Get an ordered list of the machine basic blocks for the jump table.
10789 std::vector<MachineBasicBlock*> LPadList;
10790 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10791 LPadList.reserve(CallSiteNumToLPad.size());
10792 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10793 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10794 for (MachineBasicBlock *MBB : MBBList) {
10795 LPadList.push_back(MBB);
10796 InvokeBBs.insert_range(MBB->predecessors());
10797 }
10798 }
10799
10800 assert(!LPadList.empty() &&
10801 "No landing pad destinations for the dispatch jump table!");
10802
10803 // Create the jump table and associated information.
10804 MachineJumpTableInfo *JTI =
10805 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10806 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10807
10808 // Create the MBBs for the dispatch code.
10809
10810 // Shove the dispatch's address into the return slot in the function context.
10811 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10812 DispatchBB->setIsEHPad();
10813
10814 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10815
10816 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10817 DispatchBB->addSuccessor(TrapBB);
10818
10819 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10820 DispatchBB->addSuccessor(DispContBB);
10821
10822 // Insert the MBBs.
10823 MF->insert(MF->end(), DispatchBB);
10824 MF->insert(MF->end(), DispContBB);
10825 MF->insert(MF->end(), TrapBB);
10826
10827 // Insert code into the entry block that creates and registers the function
10828 // context.
10829 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10830
10831 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10834
10835 MachineInstrBuilder MIB;
10836 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10837
10838 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10839 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10840
10841 // Add a register mask with no preserved registers. This results in all
10842 // registers being marked as clobbered. This can't work if the dispatch block
10843 // is in a Thumb1 function and is linked with ARM code which uses the FP
10844 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10845 MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
10846
10847 bool IsPositionIndependent = isPositionIndependent();
10848 unsigned NumLPads = LPadList.size();
10849 if (Subtarget->isThumb2()) {
10850 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10851 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10852 .addFrameIndex(FI)
10853 .addImm(4)
10854 .addMemOperand(FIMMOLd)
10856
10857 if (NumLPads < 256) {
10858 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10859 .addReg(NewVReg1)
10860 .addImm(LPadList.size())
10862 } else {
10863 Register VReg1 = MRI->createVirtualRegister(TRC);
10864 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10865 .addImm(NumLPads & 0xFFFF)
10867
10868 unsigned VReg2 = VReg1;
10869 if ((NumLPads & 0xFFFF0000) != 0) {
10870 VReg2 = MRI->createVirtualRegister(TRC);
10871 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10872 .addReg(VReg1)
10873 .addImm(NumLPads >> 16)
10875 }
10876
10877 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10878 .addReg(NewVReg1)
10879 .addReg(VReg2)
10881 }
10882
10883 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
10884 .addMBB(TrapBB)
10886 .addReg(ARM::CPSR);
10887
10888 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10889 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
10890 .addJumpTableIndex(MJTI)
10892
10893 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10894 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
10895 .addReg(NewVReg3, RegState::Kill)
10896 .addReg(NewVReg1)
10899 .add(condCodeOp());
10900
10901 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
10902 .addReg(NewVReg4, RegState::Kill)
10903 .addReg(NewVReg1)
10904 .addJumpTableIndex(MJTI);
10905 } else if (Subtarget->isThumb()) {
10906 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10907 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
10908 .addFrameIndex(FI)
10909 .addImm(1)
10910 .addMemOperand(FIMMOLd)
10912
10913 if (NumLPads < 256) {
10914 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
10915 .addReg(NewVReg1)
10916 .addImm(NumLPads)
10918 } else {
10919 MachineConstantPool *ConstantPool = MF->getConstantPool();
10920 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
10921 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
10922
10923 // MachineConstantPool wants an explicit alignment.
10924 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
10925 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
10926
10927 Register VReg1 = MRI->createVirtualRegister(TRC);
10928 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
10929 .addReg(VReg1, RegState::Define)
10932 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
10933 .addReg(NewVReg1)
10934 .addReg(VReg1)
10936 }
10937
10938 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
10939 .addMBB(TrapBB)
10941 .addReg(ARM::CPSR);
10942
10943 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10944 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
10945 .addReg(ARM::CPSR, RegState::Define)
10946 .addReg(NewVReg1)
10947 .addImm(2)
10949
10950 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10951 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
10952 .addJumpTableIndex(MJTI)
10954
10955 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10956 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
10957 .addReg(ARM::CPSR, RegState::Define)
10958 .addReg(NewVReg2, RegState::Kill)
10959 .addReg(NewVReg3)
10961
10962 MachineMemOperand *JTMMOLd =
10963 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
10965
10966 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10967 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
10968 .addReg(NewVReg4, RegState::Kill)
10969 .addImm(0)
10970 .addMemOperand(JTMMOLd)
10972
10973 unsigned NewVReg6 = NewVReg5;
10974 if (IsPositionIndependent) {
10975 NewVReg6 = MRI->createVirtualRegister(TRC);
10976 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
10977 .addReg(ARM::CPSR, RegState::Define)
10978 .addReg(NewVReg5, RegState::Kill)
10979 .addReg(NewVReg3)
10981 }
10982
10983 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
10984 .addReg(NewVReg6, RegState::Kill)
10985 .addJumpTableIndex(MJTI);
10986 } else {
10987 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10988 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
10989 .addFrameIndex(FI)
10990 .addImm(4)
10991 .addMemOperand(FIMMOLd)
10993
10994 if (NumLPads < 256) {
10995 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
10996 .addReg(NewVReg1)
10997 .addImm(NumLPads)
10999 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11000 Register VReg1 = MRI->createVirtualRegister(TRC);
11001 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11002 .addImm(NumLPads & 0xFFFF)
11004
11005 unsigned VReg2 = VReg1;
11006 if ((NumLPads & 0xFFFF0000) != 0) {
11007 VReg2 = MRI->createVirtualRegister(TRC);
11008 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11009 .addReg(VReg1)
11010 .addImm(NumLPads >> 16)
11012 }
11013
11014 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11015 .addReg(NewVReg1)
11016 .addReg(VReg2)
11018 } else {
11019 MachineConstantPool *ConstantPool = MF->getConstantPool();
11020 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11021 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11022
11023 // MachineConstantPool wants an explicit alignment.
11024 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11025 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11026
11027 Register VReg1 = MRI->createVirtualRegister(TRC);
11028 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11029 .addReg(VReg1, RegState::Define)
11031 .addImm(0)
11033 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11034 .addReg(NewVReg1)
11035 .addReg(VReg1, RegState::Kill)
11037 }
11038
11039 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11040 .addMBB(TrapBB)
11042 .addReg(ARM::CPSR);
11043
11044 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11045 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11046 .addReg(NewVReg1)
11049 .add(condCodeOp());
11050 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11051 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11052 .addJumpTableIndex(MJTI)
11054
11055 MachineMemOperand *JTMMOLd =
11056 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11058 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11059 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11060 .addReg(NewVReg3, RegState::Kill)
11061 .addReg(NewVReg4)
11062 .addImm(0)
11063 .addMemOperand(JTMMOLd)
11065
11066 if (IsPositionIndependent) {
11067 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11068 .addReg(NewVReg5, RegState::Kill)
11069 .addReg(NewVReg4)
11070 .addJumpTableIndex(MJTI);
11071 } else {
11072 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11073 .addReg(NewVReg5, RegState::Kill)
11074 .addJumpTableIndex(MJTI);
11075 }
11076 }
11077
11078 // Add the jump table entries as successors to the MBB.
11079 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11080 for (MachineBasicBlock *CurMBB : LPadList) {
11081 if (SeenMBBs.insert(CurMBB).second)
11082 DispContBB->addSuccessor(CurMBB);
11083 }
11084
11085 // N.B. the order the invoke BBs are processed in doesn't matter here.
11086 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
11088 for (MachineBasicBlock *BB : InvokeBBs) {
11089
11090 // Remove the landing pad successor from the invoke block and replace it
11091 // with the new dispatch block.
11092 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11093 while (!Successors.empty()) {
11094 MachineBasicBlock *SMBB = Successors.pop_back_val();
11095 if (SMBB->isEHPad()) {
11096 BB->removeSuccessor(SMBB);
11097 MBBLPads.push_back(SMBB);
11098 }
11099 }
11100
11101 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11102 BB->normalizeSuccProbs();
11103
11104 // Find the invoke call and mark all of the callee-saved registers as
11105 // 'implicitly defined' so that they're spilled. This prevents code from
11106 // moving instructions to before the EH block, where they will never be
11107 // executed.
11109 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11110 if (!II->isCall()) continue;
11111
11112 DenseSet<unsigned> DefRegs;
11114 OI = II->operands_begin(), OE = II->operands_end();
11115 OI != OE; ++OI) {
11116 if (!OI->isReg()) continue;
11117 DefRegs.insert(OI->getReg());
11118 }
11119
11120 MachineInstrBuilder MIB(*MF, &*II);
11121
11122 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11123 unsigned Reg = SavedRegs[i];
11124 if (Subtarget->isThumb2() &&
11125 !ARM::tGPRRegClass.contains(Reg) &&
11126 !ARM::hGPRRegClass.contains(Reg))
11127 continue;
11128 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11129 continue;
11130 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11131 continue;
11132 if (!DefRegs.contains(Reg))
11134 }
11135
11136 break;
11137 }
11138 }
11139
11140 // Mark all former landing pads as non-landing pads. The dispatch is the only
11141 // landing pad now.
11142 for (MachineBasicBlock *MBBLPad : MBBLPads)
11143 MBBLPad->setIsEHPad(false);
11144
11145 // The instruction is gone now.
11146 MI.eraseFromParent();
11147}
11148
11149 static
11150MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
11151 for (MachineBasicBlock *S : MBB->successors())
11152 if (S != Succ)
11153 return S;
11154 llvm_unreachable("Expecting a BB with two successors!");
11155}
11156
11157 /// Return the load opcode for a given load size. If the load size is >= 8,
11158 /// a NEON opcode will be returned.
11159static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11160 if (LdSize >= 8)
11161 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11162 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11163 if (IsThumb1)
11164 return LdSize == 4 ? ARM::tLDRi
11165 : LdSize == 2 ? ARM::tLDRHi
11166 : LdSize == 1 ? ARM::tLDRBi : 0;
11167 if (IsThumb2)
11168 return LdSize == 4 ? ARM::t2LDR_POST
11169 : LdSize == 2 ? ARM::t2LDRH_POST
11170 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11171 return LdSize == 4 ? ARM::LDR_POST_IMM
11172 : LdSize == 2 ? ARM::LDRH_POST
11173 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11174}
11175
11176 /// Return the store opcode for a given store size. If the store size is >= 8,
11177 /// a NEON opcode will be returned.
11178static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11179 if (StSize >= 8)
11180 return StSize == 16 ? ARM::VST1q32wb_fixed
11181 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11182 if (IsThumb1)
11183 return StSize == 4 ? ARM::tSTRi
11184 : StSize == 2 ? ARM::tSTRHi
11185 : StSize == 1 ? ARM::tSTRBi : 0;
11186 if (IsThumb2)
11187 return StSize == 4 ? ARM::t2STR_POST
11188 : StSize == 2 ? ARM::t2STRH_POST
11189 : StSize == 1 ? ARM::t2STRB_POST : 0;
11190 return StSize == 4 ? ARM::STR_POST_IMM
11191 : StSize == 2 ? ARM::STRH_POST
11192 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11193}
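// For example, with UnitSize = 4 in ARM mode the helpers below pair
// LDR_POST_IMM with STR_POST_IMM, while UnitSize = 16 pairs the NEON
// VLD1q32wb_fixed / VST1q32wb_fixed post-incrementing forms.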
11194
11195 /// Emit a post-increment load operation with the given size. The instructions
11196 /// will be added to BB at Pos.
11197 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11198 const TargetInstrInfo *TII, const DebugLoc &dl,
11199 unsigned LdSize, unsigned Data, unsigned AddrIn,
11200 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11201 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11202 assert(LdOpc != 0 && "Should have a load opcode");
11203 if (LdSize >= 8) {
11204 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11205 .addReg(AddrOut, RegState::Define)
11206 .addReg(AddrIn)
11207 .addImm(0)
11209 } else if (IsThumb1) {
11210 // load + update AddrIn
11211 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11212 .addReg(AddrIn)
11213 .addImm(0)
11215 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11216 .add(t1CondCodeOp())
11217 .addReg(AddrIn)
11218 .addImm(LdSize)
11220 } else if (IsThumb2) {
11221 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11222 .addReg(AddrOut, RegState::Define)
11223 .addReg(AddrIn)
11224 .addImm(LdSize)
11226 } else { // arm
11227 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11228 .addReg(AddrOut, RegState::Define)
11229 .addReg(AddrIn)
11230 .addReg(0)
11231 .addImm(LdSize)
11233 }
11234}
11235
11236 /// Emit a post-increment store operation with the given size. The instructions
11237 /// will be added to BB at Pos.
11238 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
11239 const TargetInstrInfo *TII, const DebugLoc &dl,
11240 unsigned StSize, unsigned Data, unsigned AddrIn,
11241 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11242 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11243 assert(StOpc != 0 && "Should have a store opcode");
11244 if (StSize >= 8) {
11245 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11246 .addReg(AddrIn)
11247 .addImm(0)
11248 .addReg(Data)
11250 } else if (IsThumb1) {
11251 // store + update AddrIn
11252 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11253 .addReg(Data)
11254 .addReg(AddrIn)
11255 .addImm(0)
11257 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11258 .add(t1CondCodeOp())
11259 .addReg(AddrIn)
11260 .addImm(StSize)
11262 } else if (IsThumb2) {
11263 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11264 .addReg(Data)
11265 .addReg(AddrIn)
11266 .addImm(StSize)
11268 } else { // arm
11269 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11270 .addReg(Data)
11271 .addReg(AddrIn)
11272 .addReg(0)
11273 .addImm(StSize)
11275 }
11276}
11277
11278MachineBasicBlock *
11279ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11280 MachineBasicBlock *BB) const {
11281 // This pseudo instruction has 4 operands: dst, src, size, alignment.
11282 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11283 // Otherwise, we will generate unrolled scalar copies.
11284 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11285 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11287
11288 Register dest = MI.getOperand(0).getReg();
11289 Register src = MI.getOperand(1).getReg();
11290 unsigned SizeVal = MI.getOperand(2).getImm();
11291 unsigned Alignment = MI.getOperand(3).getImm();
11292 DebugLoc dl = MI.getDebugLoc();
11293
11294 MachineFunction *MF = BB->getParent();
11295 MachineRegisterInfo &MRI = MF->getRegInfo();
11296 unsigned UnitSize = 0;
11297 const TargetRegisterClass *TRC = nullptr;
11298 const TargetRegisterClass *VecTRC = nullptr;
11299
11300 bool IsThumb1 = Subtarget->isThumb1Only();
11301 bool IsThumb2 = Subtarget->isThumb2();
11302 bool IsThumb = Subtarget->isThumb();
11303
11304 if (Alignment & 1) {
11305 UnitSize = 1;
11306 } else if (Alignment & 2) {
11307 UnitSize = 2;
11308 } else {
11309 // Check whether we can use NEON instructions.
11310 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11311 Subtarget->hasNEON()) {
11312 if ((Alignment % 16 == 0) && SizeVal >= 16)
11313 UnitSize = 16;
11314 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11315 UnitSize = 8;
11316 }
11317 // Can't use NEON instructions.
11318 if (UnitSize == 0)
11319 UnitSize = 4;
11320 }
11321
11322 // Select the correct opcode and register class for unit size load/store
11323 bool IsNeon = UnitSize >= 8;
11324 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11325 if (IsNeon)
11326 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11327 : UnitSize == 8 ? &ARM::DPRRegClass
11328 : nullptr;
11329
11330 unsigned BytesLeft = SizeVal % UnitSize;
11331 unsigned LoopSize = SizeVal - BytesLeft;
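// For example, copying 37 bytes with 16-byte alignment (and NEON available)
// selects UnitSize = 16, giving LoopSize = 32 and BytesLeft = 5; the five
// trailing bytes are handled by the byte-sized epilogue below.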
11332
11333 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11334 // Use LDR and STR to copy.
11335 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11336 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11337 unsigned srcIn = src;
11338 unsigned destIn = dest;
11339 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11340 Register srcOut = MRI.createVirtualRegister(TRC);
11341 Register destOut = MRI.createVirtualRegister(TRC);
11342 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11343 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11344 IsThumb1, IsThumb2);
11345 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11346 IsThumb1, IsThumb2);
11347 srcIn = srcOut;
11348 destIn = destOut;
11349 }
11350
11351 // Handle the leftover bytes with LDRB and STRB.
11352 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11353 // [destOut] = STRB_POST(scratch, destIn, 1)
11354 for (unsigned i = 0; i < BytesLeft; i++) {
11355 Register srcOut = MRI.createVirtualRegister(TRC);
11356 Register destOut = MRI.createVirtualRegister(TRC);
11357 Register scratch = MRI.createVirtualRegister(TRC);
11358 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11359 IsThumb1, IsThumb2);
11360 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11361 IsThumb1, IsThumb2);
11362 srcIn = srcOut;
11363 destIn = destOut;
11364 }
11365 MI.eraseFromParent(); // The instruction is gone now.
11366 return BB;
11367 }
11368
11369 // Expand the pseudo op to a loop.
11370 // thisMBB:
11371 // ...
11372 // movw varEnd, # --> with thumb2
11373 // movt varEnd, #
11374 // ldrcp varEnd, idx --> without thumb2
11375 // fallthrough --> loopMBB
11376 // loopMBB:
11377 // PHI varPhi, varEnd, varLoop
11378 // PHI srcPhi, src, srcLoop
11379 // PHI destPhi, dst, destLoop
11380 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11381 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11382 // subs varLoop, varPhi, #UnitSize
11383 // bne loopMBB
11384 // fallthrough --> exitMBB
11385 // exitMBB:
11386 // epilogue to handle left-over bytes
11387 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11388 // [destOut] = STRB_POST(scratch, destLoop, 1)
11389 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11390 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11391 MF->insert(It, loopMBB);
11392 MF->insert(It, exitMBB);
11393
11394 // Set the call frame size on entry to the new basic blocks.
11395 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11396 loopMBB->setCallFrameSize(CallFrameSize);
11397 exitMBB->setCallFrameSize(CallFrameSize);
11398
11399 // Transfer the remainder of BB and its successor edges to exitMBB.
11400 exitMBB->splice(exitMBB->begin(), BB,
11401 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11403
11404 // Load an immediate to varEnd.
11405 Register varEnd = MRI.createVirtualRegister(TRC);
11406 if (Subtarget->useMovt()) {
11407 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11408 varEnd)
11409 .addImm(LoopSize);
11410 } else if (Subtarget->genExecuteOnly()) {
11411 assert(IsThumb && "Non-thumb expected to have used movt");
11412 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11413 } else {
11414 MachineConstantPool *ConstantPool = MF->getConstantPool();
11416 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11417
11418 // MachineConstantPool wants an explicit alignment.
11419 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11420 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11421 MachineMemOperand *CPMMO =
11424
11425 if (IsThumb)
11426 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11427 .addReg(varEnd, RegState::Define)
11430 .addMemOperand(CPMMO);
11431 else
11432 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11433 .addReg(varEnd, RegState::Define)
11435 .addImm(0)
11437 .addMemOperand(CPMMO);
11438 }
11439 BB->addSuccessor(loopMBB);
11440
11441 // Generate the loop body:
11442 // varPhi = PHI(varLoop, varEnd)
11443 // srcPhi = PHI(srcLoop, src)
11444 // destPhi = PHI(destLoop, dst)
11445 MachineBasicBlock *entryBB = BB;
11446 BB = loopMBB;
11447 Register varLoop = MRI.createVirtualRegister(TRC);
11448 Register varPhi = MRI.createVirtualRegister(TRC);
11449 Register srcLoop = MRI.createVirtualRegister(TRC);
11450 Register srcPhi = MRI.createVirtualRegister(TRC);
11451 Register destLoop = MRI.createVirtualRegister(TRC);
11452 Register destPhi = MRI.createVirtualRegister(TRC);
11453
11454 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11455 .addReg(varLoop).addMBB(loopMBB)
11456 .addReg(varEnd).addMBB(entryBB);
11457 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11458 .addReg(srcLoop).addMBB(loopMBB)
11459 .addReg(src).addMBB(entryBB);
11460 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11461 .addReg(destLoop).addMBB(loopMBB)
11462 .addReg(dest).addMBB(entryBB);
11463
11464 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11465 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11466 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11467 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11468 IsThumb1, IsThumb2);
11469 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11470 IsThumb1, IsThumb2);
11471
11472 // Decrement loop variable by UnitSize.
11473 if (IsThumb1) {
11474 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11475 .add(t1CondCodeOp())
11476 .addReg(varPhi)
11477 .addImm(UnitSize)
11479 } else {
11480 MachineInstrBuilder MIB =
11481 BuildMI(*BB, BB->end(), dl,
11482 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11483 MIB.addReg(varPhi)
11484 .addImm(UnitSize)
11486 .add(condCodeOp());
11487 MIB->getOperand(5).setReg(ARM::CPSR);
11488 MIB->getOperand(5).setIsDef(true);
11489 }
11490 BuildMI(*BB, BB->end(), dl,
11491 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11492 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11493
11494 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11495 BB->addSuccessor(loopMBB);
11496 BB->addSuccessor(exitMBB);
11497
11498 // Add epilogue to handle BytesLeft.
11499 BB = exitMBB;
11500 auto StartOfExit = exitMBB->begin();
11501
11502 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11503 // [destOut] = STRB_POST(scratch, destLoop, 1)
11504 unsigned srcIn = srcLoop;
11505 unsigned destIn = destLoop;
11506 for (unsigned i = 0; i < BytesLeft; i++) {
11507 Register srcOut = MRI.createVirtualRegister(TRC);
11508 Register destOut = MRI.createVirtualRegister(TRC);
11509 Register scratch = MRI.createVirtualRegister(TRC);
11510 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11511 IsThumb1, IsThumb2);
11512 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11513 IsThumb1, IsThumb2);
11514 srcIn = srcOut;
11515 destIn = destOut;
11516 }
11517
11518 MI.eraseFromParent(); // The instruction is gone now.
11519 return BB;
11520}
11521
11522MachineBasicBlock *
11523ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11524 MachineBasicBlock *MBB) const {
11525 const TargetMachine &TM = getTargetMachine();
11526 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11527 DebugLoc DL = MI.getDebugLoc();
11528
11529 assert(Subtarget->isTargetWindows() &&
11530 "__chkstk is only supported on Windows");
11531 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11532
11533 // __chkstk takes the number of words to allocate on the stack in R4, and
11534 // returns the stack adjustment in number of bytes in R4. This will not
11535 // clobber any other registers (other than the obvious lr).
11536 //
11537 // Although, technically, IP should be considered a register which may be
11538 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11539 // thumb-2 environment, so there is no interworking required. As a result, we
11540 // do not expect a veneer to be emitted by the linker, clobbering IP.
11541 //
11542 // Each module receives its own copy of __chkstk, so no import thunk is
11543 // required, again, ensuring that IP is not clobbered.
11544 //
11545 // Finally, although some linkers may theoretically provide a trampoline for
11546 // out of range calls (which is quite common due to a 32M range limitation of
11547 // branches for Thumb), we can generate the long-call version via
11548 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11549 // IP.
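// For example, a 4096-byte probe passes 1024 (words) in R4; on return R4
// holds the byte adjustment, which the t2SUBrr below subtracts from SP.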
11550
11551 switch (TM.getCodeModel()) {
11552 case CodeModel::Tiny:
11553 llvm_unreachable("Tiny code model not available on ARM.");
11554 case CodeModel::Small:
11555 case CodeModel::Medium:
11556 case CodeModel::Kernel:
11557 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11559 .addExternalSymbol("__chkstk")
11562 .addReg(ARM::R12,
11564 .addReg(ARM::CPSR,
11566 break;
11567 case CodeModel::Large: {
11568 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11569 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11570
11571 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11572 .addExternalSymbol("__chkstk");
11578 .addReg(ARM::R12,
11580 .addReg(ARM::CPSR,
11582 break;
11583 }
11584 }
11585
11586 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11587 .addReg(ARM::SP, RegState::Kill)
11588 .addReg(ARM::R4, RegState::Kill)
11591 .add(condCodeOp());
11592
11593 MI.eraseFromParent();
11594 return MBB;
11595}
11596
11597MachineBasicBlock *
11598ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11599 MachineBasicBlock *MBB) const {
11600 DebugLoc DL = MI.getDebugLoc();
11601 MachineFunction *MF = MBB->getParent();
11602 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11603
11604 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11605 MF->insert(++MBB->getIterator(), ContBB);
11606 ContBB->splice(ContBB->begin(), MBB,
11607 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11609 MBB->addSuccessor(ContBB);
11610
11611 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11612 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11613 MF->push_back(TrapBB);
11614 MBB->addSuccessor(TrapBB);
11615
11616 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11617 .addReg(MI.getOperand(0).getReg())
11618 .addImm(0)
11620 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11621 .addMBB(TrapBB)
11623 .addReg(ARM::CPSR);
11624
11625 MI.eraseFromParent();
11626 return ContBB;
11627}
11628
11629// The CPSR operand of SelectItr might be missing a kill marker
11630// because there were multiple uses of CPSR, and ISel didn't know
11631// which to mark. Figure out whether SelectItr should have had a
11632// kill marker, and set it if it should. Returns the correct kill
11633// marker value.
11634static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr,
11635 MachineBasicBlock* BB,
11636 const TargetRegisterInfo* TRI) {
11637 // Scan forward through BB for a use/def of CPSR.
11638 MachineBasicBlock::iterator miI(std::next(SelectItr));
11639 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11640 const MachineInstr& mi = *miI;
11641 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11642 return false;
11643 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11644 break; // Should have kill-flag - update below.
11645 }
11646
11647 // If we hit the end of the block, check whether CPSR is live into a
11648 // successor.
11649 if (miI == BB->end()) {
11650 for (MachineBasicBlock *Succ : BB->successors())
11651 if (Succ->isLiveIn(ARM::CPSR))
11652 return false;
11653 }
11654
11655 // We found a def, or hit the end of the basic block and CPSR wasn't live
11656 // out. SelectMI should have a kill flag on CPSR.
11657 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11658 return true;
11659}
11660
11661 /// Adds logic in the loop entry MBB to calculate the loop iteration count and
11662 /// adds t2WhileLoopSetup and t2WhileLoopStart to generate a WLS loop.
11663static Register genTPEntry(MachineBasicBlock *TpEntry,
11664 MachineBasicBlock *TpLoopBody,
11665 MachineBasicBlock *TpExit, Register OpSizeReg,
11666 const TargetInstrInfo *TII, DebugLoc Dl,
11667 MachineRegisterInfo &MRI) {
11668 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
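// For example, n = 100 gives (100 + 15) >> 4 = 7 iterations; the final
// iteration is predicated (via VCTP in the loop body) to cover only the
// remaining 100 - 6*16 = 4 elements.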
11669 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11670 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11671 .addUse(OpSizeReg)
11672 .addImm(15)
11674 .addReg(0);
11675
11676 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11677 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11678 .addUse(AddDestReg, RegState::Kill)
11679 .addImm(4)
11681 .addReg(0);
11682
11683 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11684 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11685 .addUse(LsrDestReg, RegState::Kill);
11686
11687 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11688 .addUse(TotalIterationsReg)
11689 .addMBB(TpExit);
11690
11691 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11692 .addMBB(TpLoopBody)
11694
11695 return TotalIterationsReg;
11696}
11697
11698/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and
11699/// t2DoLoopEnd. These are used by later passes to generate tail predicated
11700/// loops.
11701static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11702 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11703 const TargetInstrInfo *TII, DebugLoc Dl,
11704 MachineRegisterInfo &MRI, Register OpSrcReg,
11705 Register OpDestReg, Register ElementCountReg,
11706 Register TotalIterationsReg, bool IsMemcpy) {
11707 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11708 // array, loop iteration counter, predication counter.
11709
11710 Register SrcPhiReg, CurrSrcReg;
11711 if (IsMemcpy) {
11712 // Current position in the src array
11713 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11714 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11715 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11716 .addUse(OpSrcReg)
11717 .addMBB(TpEntry)
11718 .addUse(CurrSrcReg)
11719 .addMBB(TpLoopBody);
11720 }
11721
11722 // Current position in the dest array
11723 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11724 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11725 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11726 .addUse(OpDestReg)
11727 .addMBB(TpEntry)
11728 .addUse(CurrDestReg)
11729 .addMBB(TpLoopBody);
11730
11731 // Current loop counter
11732 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11733 Register RemainingLoopIterationsReg =
11734 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11735 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11736 .addUse(TotalIterationsReg)
11737 .addMBB(TpEntry)
11738 .addUse(RemainingLoopIterationsReg)
11739 .addMBB(TpLoopBody);
11740
11741 // Predication counter
11742 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11743 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11744 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11745 .addUse(ElementCountReg)
11746 .addMBB(TpEntry)
11747 .addUse(RemainingElementsReg)
11748 .addMBB(TpLoopBody);
11749
11750 // Pass predication counter to VCTP
11751 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11752 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11753 .addUse(PredCounterPhiReg)
11755 .addReg(0)
11756 .addReg(0);
11757
11758 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11759 .addUse(PredCounterPhiReg)
11760 .addImm(16)
11762 .addReg(0);
11763
11764 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11765 Register SrcValueReg;
11766 if (IsMemcpy) {
11767 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11768 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11769 .addDef(CurrSrcReg)
11770 .addDef(SrcValueReg)
11771 .addReg(SrcPhiReg)
11772 .addImm(16)
11774 .addUse(VccrReg)
11775 .addReg(0);
11776 } else
11777 SrcValueReg = OpSrcReg;
11778
11779 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11780 .addDef(CurrDestReg)
11781 .addUse(SrcValueReg)
11782 .addReg(DestPhiReg)
11783 .addImm(16)
11785 .addUse(VccrReg)
11786 .addReg(0);
11787
11788 // Add the pseudoInstrs for decrementing the loop counter and marking the
11789 // end:t2DoLoopDec and t2DoLoopEnd
11790 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11791 .addUse(LoopCounterPhiReg)
11792 .addImm(1);
11793
11794 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11795 .addUse(RemainingLoopIterationsReg)
11796 .addMBB(TpLoopBody);
11797
11798 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11799 .addMBB(TpExit)
11801}
11802
11803bool ARMTargetLowering::supportKCFIBundles() const {
11804 // KCFI is supported in all ARM/Thumb modes
11805 return true;
11806}
11807
11808MachineInstr *
11809ARMTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
11810 MachineBasicBlock::iterator &MBBI,
11811 const TargetInstrInfo *TII) const {
11812 assert(MBBI->isCall() && MBBI->getCFIType() &&
11813 "Invalid call instruction for a KCFI check");
11814
11815 MachineOperand *TargetOp = nullptr;
11816 switch (MBBI->getOpcode()) {
11817 // ARM mode opcodes
11818 case ARM::BLX:
11819 case ARM::BLX_pred:
11820 case ARM::BLX_noip:
11821 case ARM::BLX_pred_noip:
11822 case ARM::BX_CALL:
11823 TargetOp = &MBBI->getOperand(0);
11824 break;
11825 case ARM::TCRETURNri:
11826 case ARM::TCRETURNrinotr12:
11827 case ARM::TAILJMPr:
11828 case ARM::TAILJMPr4:
11829 TargetOp = &MBBI->getOperand(0);
11830 break;
11831 // Thumb mode opcodes (Thumb1 and Thumb2)
11832 // Note: Most Thumb call instructions have predicate operands before the
11833 // target register. Format: tBLXr pred, predreg, target_register, ...
11834 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11835 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11836 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11837 TargetOp = &MBBI->getOperand(2);
11838 break;
11839 // Tail call instructions don't have predicates, target is operand 0
11840 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11841 TargetOp = &MBBI->getOperand(0);
11842 break;
11843 default:
11844 llvm_unreachable("Unexpected CFI call opcode");
11845 }
11846
11847 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
11848 TargetOp->setIsRenamable(false);
11849
11850 // Select the appropriate KCFI_CHECK variant based on the instruction set
11851 unsigned KCFICheckOpcode;
11852 if (Subtarget->isThumb()) {
11853 if (Subtarget->isThumb2()) {
11854 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11855 } else {
11856 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11857 }
11858 } else {
11859 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11860 }
11861
11862 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11863 .addReg(TargetOp->getReg())
11864 .addImm(MBBI->getCFIType())
11865 .getInstr();
11866}
11867
11868MachineBasicBlock *
11869ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
11870 MachineBasicBlock *BB) const {
11871 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11872 DebugLoc dl = MI.getDebugLoc();
11873 bool isThumb2 = Subtarget->isThumb2();
11874 switch (MI.getOpcode()) {
11875 default: {
11876 MI.print(errs());
11877 llvm_unreachable("Unexpected instr type to insert");
11878 }
11879
11880 // Thumb1 post-indexed loads are really just single-register LDMs.
11881 case ARM::tLDR_postidx: {
11882 MachineOperand Def(MI.getOperand(1));
11883 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
11884 .add(Def) // Rn_wb
11885 .add(MI.getOperand(2)) // Rn
11886 .add(MI.getOperand(3)) // PredImm
11887 .add(MI.getOperand(4)) // PredReg
11888 .add(MI.getOperand(0)) // Rt
11889 .cloneMemRefs(MI);
11890 MI.eraseFromParent();
11891 return BB;
11892 }
11893
11894 case ARM::MVE_MEMCPYLOOPINST:
11895 case ARM::MVE_MEMSETLOOPINST: {
11896
11897 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
11898 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
11899 // the iteration count = ceil(size_in_bytes/16) in the TP entry block and
11900 // adds the relevant instructions in the TP loop Body for generation of a
11901 // WLSTP loop.
11902
11903 // Below is relevant portion of the CFG after the transformation.
11904 // The Machine Basic Blocks are shown along with branch conditions (in
11905 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
11906 // portion of the CFG and may not necessarily be the entry/exit of the
11907 // function.
11908
11909 // (Relevant) CFG after transformation:
11910 // TP entry MBB
11911 // |
11912 // |-----------------|
11913 // (n <= 0) (n > 0)
11914 // | |
11915 // | TP loop Body MBB<--|
11916 // | | |
11917 // \ |___________|
11918 // \ /
11919 // TP exit MBB
11920
11921 MachineFunction *MF = BB->getParent();
11922 MachineFunctionProperties &Properties = MF->getProperties();
11924
11925 Register OpDestReg = MI.getOperand(0).getReg();
11926 Register OpSrcReg = MI.getOperand(1).getReg();
11927 Register OpSizeReg = MI.getOperand(2).getReg();
11928
11929 // Allocate the required MBBs and add to parent function.
11930 MachineBasicBlock *TpEntry = BB;
11931 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
11932 MachineBasicBlock *TpExit;
11933
11934 MF->push_back(TpLoopBody);
11935
11936 // If any instructions are present in the current block after
11937 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
11938 // move the instructions into the newly created exit block. If there are no
11939 // instructions, add an explicit branch to the FallThrough block and then
11940 // split.
11941 //
11942 // The split is required for two reasons:
11943 // 1) A terminator (t2WhileLoopStart) will be placed at that site.
11944 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
11945 // need to be updated. splitAt() already handles this.
11946 TpExit = BB->splitAt(MI, false);
11947 if (TpExit == BB) {
11948 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
11949 "block containing memcpy/memset Pseudo");
11950 TpExit = BB->getFallThrough();
11951 BuildMI(BB, dl, TII->get(ARM::t2B))
11952 .addMBB(TpExit)
11954 TpExit = BB->splitAt(MI, false);
11955 }
11956
11957 // Add logic for iteration count
11958 Register TotalIterationsReg =
11959 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
11960
11961 // Add the vectorized (and predicated) loads/store instructions
11962 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
11963 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
11964 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
11965
11966 // Required to avoid conflict with the MachineVerifier during testing.
11967 Properties.resetNoPHIs();
11968
11969 // Connect the blocks
11970 TpEntry->addSuccessor(TpLoopBody);
11971 TpLoopBody->addSuccessor(TpLoopBody);
11972 TpLoopBody->addSuccessor(TpExit);
11973
11974 // Reorder for a more natural layout
11975 TpLoopBody->moveAfter(TpEntry);
11976 TpExit->moveAfter(TpLoopBody);
11977
11978 // Finally, remove the memcpy Pseudo Instruction
11979 MI.eraseFromParent();
11980
11981 // Return the exit block as it may contain other instructions requiring a
11982 // custom inserter
11983 return TpExit;
11984 }
11985
11986 // The Thumb2 pre-indexed stores have the same MI operands; they are just
11987 // defined differently in the .td files than in the isel patterns, so
11988 // they need pseudos.
11989 case ARM::t2STR_preidx:
11990 MI.setDesc(TII->get(ARM::t2STR_PRE));
11991 return BB;
11992 case ARM::t2STRB_preidx:
11993 MI.setDesc(TII->get(ARM::t2STRB_PRE));
11994 return BB;
11995 case ARM::t2STRH_preidx:
11996 MI.setDesc(TII->get(ARM::t2STRH_PRE));
11997 return BB;
11998
11999 case ARM::STRi_preidx:
12000 case ARM::STRBi_preidx: {
12001 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12002 : ARM::STRB_PRE_IMM;
12003 // Decode the offset.
12004 unsigned Offset = MI.getOperand(4).getImm();
12005 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12007 if (isSub)
12008 Offset = -Offset;
12009
12010 MachineMemOperand *MMO = *MI.memoperands_begin();
12011 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12012 .add(MI.getOperand(0)) // Rn_wb
12013 .add(MI.getOperand(1)) // Rt
12014 .add(MI.getOperand(2)) // Rn
12015 .addImm(Offset) // offset (skip GPR==zero_reg)
12016 .add(MI.getOperand(5)) // pred
12017 .add(MI.getOperand(6))
12018 .addMemOperand(MMO);
12019 MI.eraseFromParent();
12020 return BB;
12021 }
12022 case ARM::STRr_preidx:
12023 case ARM::STRBr_preidx:
12024 case ARM::STRH_preidx: {
12025 unsigned NewOpc;
12026 switch (MI.getOpcode()) {
12027 default: llvm_unreachable("unexpected opcode!");
12028 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12029 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12030 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12031 }
12032 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12033 for (const MachineOperand &MO : MI.operands())
12034 MIB.add(MO);
12035 MI.eraseFromParent();
12036 return BB;
12037 }
12038
12039 case ARM::tMOVCCr_pseudo: {
12040 // To "insert" a SELECT_CC instruction, we actually have to insert the
12041 // diamond control-flow pattern. The incoming instruction knows the
12042 // destination vreg to set, the condition code register to branch on, the
12043 // true/false values to select between, and a branch opcode to use.
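// The resulting CFG is thisMBB -> {copy0MBB, sinkMBB} and copy0MBB ->
// sinkMBB, with a PHI in sinkMBB merging the true value (from thisMBB) and
// the false value (from copy0MBB).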
12044 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12046
12047 // thisMBB:
12048 // ...
12049 // TrueVal = ...
12050 // cmpTY ccX, r1, r2
12051 // bCC copy1MBB
12052 // fallthrough --> copy0MBB
12053 MachineBasicBlock *thisMBB = BB;
12054 MachineFunction *F = BB->getParent();
12055 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12056 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12057 F->insert(It, copy0MBB);
12058 F->insert(It, sinkMBB);
12059
12060 // Set the call frame size on entry to the new basic blocks.
12061 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12062 copy0MBB->setCallFrameSize(CallFrameSize);
12063 sinkMBB->setCallFrameSize(CallFrameSize);
12064
12065 // Check whether CPSR is live past the tMOVCCr_pseudo.
12066 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12067 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12068 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12069 copy0MBB->addLiveIn(ARM::CPSR);
12070 sinkMBB->addLiveIn(ARM::CPSR);
12071 }
12072
12073 // Transfer the remainder of BB and its successor edges to sinkMBB.
12074 sinkMBB->splice(sinkMBB->begin(), BB,
12075 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12077
12078 BB->addSuccessor(copy0MBB);
12079 BB->addSuccessor(sinkMBB);
12080
12081 BuildMI(BB, dl, TII->get(ARM::tBcc))
12082 .addMBB(sinkMBB)
12083 .addImm(MI.getOperand(3).getImm())
12084 .addReg(MI.getOperand(4).getReg());
12085
12086 // copy0MBB:
12087 // %FalseValue = ...
12088 // # fallthrough to sinkMBB
12089 BB = copy0MBB;
12090
12091 // Update machine-CFG edges
12092 BB->addSuccessor(sinkMBB);
12093
12094 // sinkMBB:
12095 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12096 // ...
12097 BB = sinkMBB;
12098 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12099 .addReg(MI.getOperand(1).getReg())
12100 .addMBB(copy0MBB)
12101 .addReg(MI.getOperand(2).getReg())
12102 .addMBB(thisMBB);
12103
12104 MI.eraseFromParent(); // The pseudo instruction is gone now.
12105 return BB;
12106 }
12107
12108 case ARM::BCCi64:
12109 case ARM::BCCZi64: {
12110 // If there is an unconditional branch to the other successor, remove it.
12111 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12112
12113 // Compare both parts that make up the double comparison separately for
12114 // equality.
12115 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12116
12117 Register LHS1 = MI.getOperand(1).getReg();
12118 Register LHS2 = MI.getOperand(2).getReg();
12119 if (RHSisZero) {
12120 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12121 .addReg(LHS1)
12122 .addImm(0)
12124 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12125 .addReg(LHS2).addImm(0)
12126 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12127 } else {
12128 Register RHS1 = MI.getOperand(3).getReg();
12129 Register RHS2 = MI.getOperand(4).getReg();
12130 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12131 .addReg(LHS1)
12132 .addReg(RHS1)
12134 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12135 .addReg(LHS2).addReg(RHS2)
12136 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12137 }
12138
12139 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12140 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12141 if (MI.getOperand(0).getImm() == ARMCC::NE)
12142 std::swap(destMBB, exitMBB);
12143
12144 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12145 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12146 if (isThumb2)
12147 BuildMI(BB, dl, TII->get(ARM::t2B))
12148 .addMBB(exitMBB)
12150 else
12151 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12152
12153 MI.eraseFromParent(); // The pseudo instruction is gone now.
12154 return BB;
12155 }
12156
12157 case ARM::Int_eh_sjlj_setjmp:
12158 case ARM::Int_eh_sjlj_setjmp_nofp:
12159 case ARM::tInt_eh_sjlj_setjmp:
12160 case ARM::t2Int_eh_sjlj_setjmp:
12161 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12162 return BB;
12163
12164 case ARM::Int_eh_sjlj_setup_dispatch:
12165 EmitSjLjDispatchBlock(MI, BB);
12166 return BB;
12167 case ARM::COPY_STRUCT_BYVAL_I32:
12168 ++NumLoopByVals;
12169 return EmitStructByval(MI, BB);
12170 case ARM::WIN__CHKSTK:
12171 return EmitLowered__chkstk(MI, BB);
12172 case ARM::WIN__DBZCHK:
12173 return EmitLowered__dbzchk(MI, BB);
12174 }
12175}
12176
12177/// Attaches vregs to MEMCPY that it will use as scratch registers
12178/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12179/// instead of as a custom inserter because we need the use list from the SDNode.
12180static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12181 MachineInstr &MI, const SDNode *Node) {
12182 bool isThumb1 = Subtarget->isThumb1Only();
12183
12184 MachineFunction *MF = MI.getParent()->getParent();
12186 MachineInstrBuilder MIB(*MF, MI);
12187
12188 // If the new dst/src is unused mark it as dead.
12189 if (!Node->hasAnyUseOfValue(0)) {
12190 MI.getOperand(0).setIsDead(true);
12191 }
12192 if (!Node->hasAnyUseOfValue(1)) {
12193 MI.getOperand(1).setIsDead(true);
12194 }
12195
12196 // The MEMCPY both defines and kills the scratch registers.
12197 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12198 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12199 : &ARM::GPRRegClass);
12201 }
12202}
12203
12204void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
12205 SDNode *Node) const {
12206 if (MI.getOpcode() == ARM::MEMCPY) {
12207 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12208 return;
12209 }
12210
12211 const MCInstrDesc *MCID = &MI.getDesc();
12212 // Adjust instructions that potentially set the 's' bit after isel, i.e. ADC,
12213 // SBC, RSB, RSC. Coming out of isel, they have an implicit CPSR def, but the
12214 // optional operand is still set to noreg. If needed, set the optional
12215 // operand's register to CPSR, and remove the redundant implicit def.
12216 //
12217 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12218
12219 // Rename pseudo opcodes.
12220 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12221 unsigned ccOutIdx;
12222 if (NewOpc) {
12223 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12224 MCID = &TII->get(NewOpc);
12225
12226 assert(MCID->getNumOperands() ==
12227 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12228 && "converted opcode should be the same except for cc_out"
12229 " (and, on Thumb1, pred)");
12230
12231 MI.setDesc(*MCID);
12232
12233 // Add the optional cc_out operand
12234 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12235
12236 // On Thumb1, move all input operands to the end, then add the predicate
12237 if (Subtarget->isThumb1Only()) {
12238 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12239 MI.addOperand(MI.getOperand(1));
12240 MI.removeOperand(1);
12241 }
12242
12243 // Restore the ties
12244 for (unsigned i = MI.getNumOperands(); i--;) {
12245 const MachineOperand& op = MI.getOperand(i);
12246 if (op.isReg() && op.isUse()) {
12247 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12248 if (DefIdx != -1)
12249 MI.tieOperands(DefIdx, i);
12250 }
12251 }
12252
12254 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
12255 ccOutIdx = 1;
12256 } else
12257 ccOutIdx = MCID->getNumOperands() - 1;
12258 } else
12259 ccOutIdx = MCID->getNumOperands() - 1;
12260
12261 // Any ARM instruction that sets the 's' bit should specify an optional
12262 // "cc_out" operand in the last operand position.
12263 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12264 assert(!NewOpc && "Optional cc_out operand required");
12265 return;
12266 }
12267 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12268 // since we already have an optional CPSR def.
12269 bool definesCPSR = false;
12270 bool deadCPSR = false;
12271 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12272 ++i) {
12273 const MachineOperand &MO = MI.getOperand(i);
12274 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12275 definesCPSR = true;
12276 if (MO.isDead())
12277 deadCPSR = true;
12278 MI.removeOperand(i);
12279 break;
12280 }
12281 }
12282 if (!definesCPSR) {
12283 assert(!NewOpc && "Optional cc_out operand required");
12284 return;
12285 }
12286 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12287 if (deadCPSR) {
12288 assert(!MI.getOperand(ccOutIdx).getReg() &&
12289 "expect uninitialized optional cc_out operand");
12290 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12291 if (!Subtarget->isThumb1Only())
12292 return;
12293 }
12294
12295 // If this instruction was defined with an optional CPSR def and its dag node
12296 // had a live implicit CPSR def, then activate the optional CPSR def.
12297 MachineOperand &MO = MI.getOperand(ccOutIdx);
12298 MO.setReg(ARM::CPSR);
12299 MO.setIsDef(true);
12300}
12301
12302//===----------------------------------------------------------------------===//
12303// ARM Optimization Hooks
12304//===----------------------------------------------------------------------===//
12305
12306// Helper function that checks if N is a null or all ones constant.
12307static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12308 return AllOnes ? isAllOnesConstant(N) : isNullConstant(N);
12309}
12310
12311// Return true if N is conditionally 0 or all ones.
12312// Detects these expressions where cc is an i1 value:
12313//
12314// (select cc 0, y) [AllOnes=0]
12315// (select cc y, 0) [AllOnes=0]
12316// (zext cc) [AllOnes=0]
12317// (sext cc) [AllOnes=0/1]
12318// (select cc -1, y) [AllOnes=1]
12319// (select cc y, -1) [AllOnes=1]
12320//
12321// Invert is set when N is the null/all ones constant when CC is false.
12322// OtherOp is set to the alternative value of N.
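// For example, with AllOnes=0 and N = (select cc, y, 0), Invert is set to
// true and OtherOp to y, because N is the null constant when cc is false.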
12323static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
12324 SDValue &CC, bool &Invert,
12325 SDValue &OtherOp,
12326 SelectionDAG &DAG) {
12327 switch (N->getOpcode()) {
12328 default: return false;
12329 case ISD::SELECT: {
12330 CC = N->getOperand(0);
12331 SDValue N1 = N->getOperand(1);
12332 SDValue N2 = N->getOperand(2);
12333 if (isZeroOrAllOnes(N1, AllOnes)) {
12334 Invert = false;
12335 OtherOp = N2;
12336 return true;
12337 }
12338 if (isZeroOrAllOnes(N2, AllOnes)) {
12339 Invert = true;
12340 OtherOp = N1;
12341 return true;
12342 }
12343 return false;
12344 }
12345 case ISD::ZERO_EXTEND:
12346 // (zext cc) can never be the all ones value.
12347 if (AllOnes)
12348 return false;
12349 [[fallthrough]];
12350 case ISD::SIGN_EXTEND: {
12351 SDLoc dl(N);
12352 EVT VT = N->getValueType(0);
12353 CC = N->getOperand(0);
12354 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12355 return false;
12356 Invert = !AllOnes;
12357 if (AllOnes)
12358 // When looking for an AllOnes constant, N is an sext, and the 'other'
12359 // value is 0.
12360 OtherOp = DAG.getConstant(0, dl, VT);
12361 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12362 // When looking for a 0 constant, N can be zext or sext.
12363 OtherOp = DAG.getConstant(1, dl, VT);
12364 else
12365 OtherOp = DAG.getAllOnesConstant(dl, VT);
12366 return true;
12367 }
12368 }
12369}
12370
12371// Combine a constant select operand into its use:
12372//
12373// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12374// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12375// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12376// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12377// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12378//
12379// The transform is rejected if the select doesn't have a constant operand that
12380// is null, or all ones when AllOnes is set.
12381//
12382// Also recognize sext/zext from i1:
12383//
12384// (add (zext cc), x) -> (select cc (add x, 1), x)
12385// (add (sext cc), x) -> (select cc (add x, -1), x)
12386//
12387// These transformations eventually create predicated instructions.
12388//
12389// @param N The node to transform.
12390// @param Slct The N operand that is a select.
12391// @param OtherOp The other N operand (x above).
12392// @param DCI Context.
12393// @param AllOnes Require the select constant to be all ones instead of null.
12394// @returns The new node, or SDValue() on failure.
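// For example, (add (select cc, 0, 3), x) becomes (select cc, x, (add x, 3)),
// which later folds into a predicated add.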
12395static
12396SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
12397 TargetLowering::DAGCombinerInfo &DCI,
12398 bool AllOnes = false) {
12399 SelectionDAG &DAG = DCI.DAG;
12400 EVT VT = N->getValueType(0);
12401 SDValue NonConstantVal;
12402 SDValue CCOp;
12403 bool SwapSelectOps;
12404 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12405 NonConstantVal, DAG))
12406 return SDValue();
12407
12408 // Slct is now known to be the desired identity constant when CC is true.
12409 SDValue TrueVal = OtherOp;
12410 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12411 OtherOp, NonConstantVal);
12412 // Unless SwapSelectOps says CC should be false.
12413 if (SwapSelectOps)
12414 std::swap(TrueVal, FalseVal);
12415
12416 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12417 CCOp, TrueVal, FalseVal);
12418}
12419
12420// Attempt combineSelectAndUse on each operand of a commutative operator N.
12421static
12422SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
12423 TargetLowering::DAGCombinerInfo &DCI) {
12424 SDValue N0 = N->getOperand(0);
12425 SDValue N1 = N->getOperand(1);
12426 if (N0.getNode()->hasOneUse())
12427 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12428 return Result;
12429 if (N1.getNode()->hasOneUse())
12430 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12431 return Result;
12432 return SDValue();
12433}
12434
12435static bool IsVUZPShuffleNode(SDNode *N) {
12436 // VUZP shuffle node.
12437 if (N->getOpcode() == ARMISD::VUZP)
12438 return true;
12439
12440 // "VUZP" on i32 is an alias for VTRN.
12441 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12442 return true;
12443
12444 return false;
12445}
12446
12447static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
12448 TargetLowering::DAGCombinerInfo &DCI,
12449 const ARMSubtarget *Subtarget) {
12450 // Look for ADD(VUZP.0, VUZP.1).
12451 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12452 N0 == N1)
12453 return SDValue();
12454
12455 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12456 if (!N->getValueType(0).is64BitVector())
12457 return SDValue();
12458
12459 // Generate vpadd.
12460 SelectionDAG &DAG = DCI.DAG;
12461 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12462 SDLoc dl(N);
12463 SDNode *Unzip = N0.getNode();
12464 EVT VT = N->getValueType(0);
12465
12467 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12468 TLI.getPointerTy(DAG.getDataLayout())));
12469 Ops.push_back(Unzip->getOperand(0));
12470 Ops.push_back(Unzip->getOperand(1));
12471
12472 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12473}
12474
12475static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12476 TargetLowering::DAGCombinerInfo &DCI,
12477 const ARMSubtarget *Subtarget) {
12478 // Check for two extended operands.
12479 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12480 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12481 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12482 N1.getOpcode() == ISD::ZERO_EXTEND))
12483 return SDValue();
12484
12485 SDValue N00 = N0.getOperand(0);
12486 SDValue N10 = N1.getOperand(0);
12487
12488 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
12489 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12490 N00 == N10)
12491 return SDValue();
12492
12493 // We only recognize Q register paddl here; this can't be reached until
12494 // after type legalization.
12495 if (!N00.getValueType().is64BitVector() ||
12497 return SDValue();
12498
12499 // Generate vpaddl.
12500 SelectionDAG &DAG = DCI.DAG;
12501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12502 SDLoc dl(N);
12503 EVT VT = N->getValueType(0);
12504
12505 SmallVector<SDValue, 8> Ops;
12506 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12507 unsigned Opcode;
12508 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12509 Opcode = Intrinsic::arm_neon_vpaddls;
12510 else
12511 Opcode = Intrinsic::arm_neon_vpaddlu;
12512 Ops.push_back(DAG.getConstant(Opcode, dl,
12513 TLI.getPointerTy(DAG.getDataLayout())));
12514 EVT ElemTy = N00.getValueType().getVectorElementType();
12515 unsigned NumElts = VT.getVectorNumElements();
12516 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12517 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12518 N00.getOperand(0), N00.getOperand(1));
12519 Ops.push_back(Concat);
12520
12521 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12522}
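// Illustrative example (not from the source): with 64-bit inputs a and b,
//   (add (sext (vuzp a, b):0), (sext (vuzp a, b):1))
// widens and adds each even/odd pair of the concatenation of a and b, which
// is what vpaddl.sN computes, so it is emitted as arm.neon.vpaddls on
// (concat_vectors a, b).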
12523
12524// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12525// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12526// much easier to match.
12527static SDValue
12528AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
12529 TargetLowering::DAGCombinerInfo &DCI,
12530 const ARMSubtarget *Subtarget) {
12531 // Only perform this optimization after legalization and if NEON is available. We
12532 // also expect both operands to be BUILD_VECTORs.
12533 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12534 || N0.getOpcode() != ISD::BUILD_VECTOR
12535 || N1.getOpcode() != ISD::BUILD_VECTOR)
12536 return SDValue();
12537
12538 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12539 EVT VT = N->getValueType(0);
12540 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12541 return SDValue();
12542
12543 // Check that the vector operands are of the right form.
12544 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12545 // operands, where N is the size of the formed vector.
12546 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12548 // index such that we have a pairwise add pattern.
12548
12549 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12550 if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
12551 return SDValue();
12552 SDValue Vec = N0->getOperand(0)->getOperand(0);
12553 SDNode *V = Vec.getNode();
12554 unsigned nextIndex = 0;
12555
12556 // For each operands to the ADD which are BUILD_VECTORs,
12557 // check to see if each of their operands are an EXTRACT_VECTOR with
12558 // the same vector and appropriate index.
12559 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12560 if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
12561 N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
12562
12563 SDValue ExtVec0 = N0->getOperand(i);
12564 SDValue ExtVec1 = N1->getOperand(i);
12565
12566 // First operand is the vector; verify it's the same.
12567 if (V != ExtVec0->getOperand(0).getNode() ||
12568 V != ExtVec1->getOperand(0).getNode())
12569 return SDValue();
12570
12571 // Second is the constant; verify it's correct.
12572 ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(ExtVec0->getOperand(1));
12573 ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(ExtVec1->getOperand(1));
12574
12575 // For the constant, we want to see all the even or all the odd.
12576 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12577 || C1->getZExtValue() != nextIndex+1)
12578 return SDValue();
12579
12580 // Increment index.
12581 nextIndex+=2;
12582 } else
12583 return SDValue();
12584 }
12585
12586 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12587 // we're using the entire input vector, otherwise there's a size/legality
12588 // mismatch somewhere.
12589 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12590 Vec.getValueType().getVectorElementType() == VT.getVectorElementType())
12591 return SDValue();
12592
12593 // Create VPADDL node.
12594 SelectionDAG &DAG = DCI.DAG;
12595 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12596
12597 SDLoc dl(N);
12598
12599 // Build operand list.
12600 SmallVector<SDValue, 8> Ops;
12601 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12602 TLI.getPointerTy(DAG.getDataLayout())));
12603
12604 // Input is the vector.
12605 Ops.push_back(Vec);
12606
12607 // Get widened type and narrowed type.
12608 MVT widenType;
12609 unsigned numElem = VT.getVectorNumElements();
12610
12611 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12612 switch (inputLaneType.getSimpleVT().SimpleTy) {
12613 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12614 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12615 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12616 default:
12617 llvm_unreachable("Invalid vector element type for padd optimization.");
12618 }
12619
12620 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
12621 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12622 return DAG.getNode(ExtOp, dl, VT, tmp);
12623}
12624
12625static SDValue findMUL_LOHI(SDValue V) {
12626 if (V->getOpcode() == ISD::UMUL_LOHI ||
12627 V->getOpcode() == ISD::SMUL_LOHI)
12628 return V;
12629 return SDValue();
12630}
12631
12632static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12633 TargetLowering::DAGCombinerInfo &DCI,
12634 const ARMSubtarget *Subtarget) {
12635 if (!Subtarget->hasBaseDSP())
12636 return SDValue();
12637
12638 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12639 // accumulates the product into a 64-bit value. The 16-bit values will
12640 // be sign extended somehow or SRA'd into 32-bit values
12641 // (addc (adde (mul 16bit, 16bit), lo), hi)
12642 SDValue Mul = AddcNode->getOperand(0);
12643 SDValue Lo = AddcNode->getOperand(1);
12644 if (Mul.getOpcode() != ISD::MUL) {
12645 Lo = AddcNode->getOperand(0);
12646 Mul = AddcNode->getOperand(1);
12647 if (Mul.getOpcode() != ISD::MUL)
12648 return SDValue();
12649 }
12650
12651 SDValue SRA = AddeNode->getOperand(0);
12652 SDValue Hi = AddeNode->getOperand(1);
12653 if (SRA.getOpcode() != ISD::SRA) {
12654 SRA = AddeNode->getOperand(1);
12655 Hi = AddeNode->getOperand(0);
12656 if (SRA.getOpcode() != ISD::SRA)
12657 return SDValue();
12658 }
12659 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12660 if (Const->getZExtValue() != 31)
12661 return SDValue();
12662 } else
12663 return SDValue();
12664
12665 if (SRA.getOperand(0) != Mul)
12666 return SDValue();
12667
12668 SelectionDAG &DAG = DCI.DAG;
12669 SDLoc dl(AddcNode);
12670 unsigned Opcode = 0;
12671 SDValue Op0;
12672 SDValue Op1;
12673
12674 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12675 Opcode = ARMISD::SMLALBB;
12676 Op0 = Mul.getOperand(0);
12677 Op1 = Mul.getOperand(1);
12678 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12679 Opcode = ARMISD::SMLALBT;
12680 Op0 = Mul.getOperand(0);
12681 Op1 = Mul.getOperand(1).getOperand(0);
12682 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12683 Opcode = ARMISD::SMLALTB;
12684 Op0 = Mul.getOperand(0).getOperand(0);
12685 Op1 = Mul.getOperand(1);
12686 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12687 Opcode = ARMISD::SMLALTT;
12688 Op0 = Mul->getOperand(0).getOperand(0);
12689 Op1 = Mul->getOperand(1).getOperand(0);
12690 }
12691
12692 if (!Op0 || !Op1)
12693 return SDValue();
12694
12695 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12696 Op0, Op1, Lo, Hi);
12697 // Replace the ADD nodes' uses with the MLA node's values.
12698 SDValue HiMLALResult(SMLAL.getNode(), 1);
12699 SDValue LoMLALResult(SMLAL.getNode(), 0);
12700
12701 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12702 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12703
12704 // Return original node to notify the driver to stop replacing.
12705 SDValue resNode(AddcNode, 0);
12706 return resNode;
12707}
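// Sketch of one accepted shape (illustrative): with a and b already known to
// be sign-extended 16-bit values,
//   (ARMISD::ADDE (sra (mul a, b), 31), hi, (ARMISD::ADDC (mul a, b), lo))
// becomes ARMISD::SMLALBB a, b, lo, hi, a single 16x16 multiply with 64-bit
// accumulate.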
12708
12709static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
12710 TargetLowering::DAGCombinerInfo &DCI,
12711 const ARMSubtarget *Subtarget) {
12712 // Look for multiply add opportunities.
12713 // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
12714 // each add node consumes a value from ISD::UMUL_LOHI and there is
12715 // a glue link from the first add to the second add.
12716 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12717 // a S/UMLAL instruction.
12718 //            UMUL_LOHI
12719 //             / :lo    \ :hi
12720 //            V          \          [no multiline comment]
12721 //  loAdd ->  ADDC        |
12722 //              \ :carry  /
12723 //               V       V
12724 //                 ADDE <- hiAdd
12725 //
12726 // In the special case where only the higher part of a signed result is used
12727 // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
12728 // a constant with the exact value of 0x80000000, we recognize we are dealing
12729 // with a "rounded multiply and add" (or subtract) and transform it into
12730 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12731
12732 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12733 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12734 "Expect an ADDE or SUBE");
12735
12736 assert(AddeSubeNode->getNumOperands() == 3 &&
12737 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12738 "ADDE node has the wrong inputs");
12739
12740 // Check that we are chained to the right ADDC or SUBC node.
12741 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12742 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12743 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12744 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12745 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12746 return SDValue();
12747
12748 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12749 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12750
12751 // Check if the two operands are from the same mul_lohi node.
12752 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12753 return SDValue();
12754
12755 assert(AddcSubcNode->getNumValues() == 2 &&
12756 AddcSubcNode->getValueType(0) == MVT::i32 &&
12757 "Expect ADDC with two result values. First: i32");
12758
12759 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12760 // may be an SMLAL which multiplies two 16-bit values.
12761 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12762 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12763 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12764 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12765 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12766 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12767
12768 // Check for the triangle shape.
12769 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12770 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12771
12772 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12773 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12774 return SDValue();
12775
12776 // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
12777 bool IsLeftOperandMUL = false;
12778 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12779 if (MULOp == SDValue())
12780 MULOp = findMUL_LOHI(AddeSubeOp1);
12781 else
12782 IsLeftOperandMUL = true;
12783 if (MULOp == SDValue())
12784 return SDValue();
12785
12786 // Figure out the right opcode.
12787 unsigned Opc = MULOp->getOpcode();
12788 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12789
12790 // Figure out the high and low input values to the MLAL node.
12791 SDValue *HiAddSub = nullptr;
12792 SDValue *LoMul = nullptr;
12793 SDValue *LowAddSub = nullptr;
12794
12795 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12796 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12797 return SDValue();
12798
12799 if (IsLeftOperandMUL)
12800 HiAddSub = &AddeSubeOp1;
12801 else
12802 HiAddSub = &AddeSubeOp0;
12803
12804 // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
12805 // whose low result is fed to the ADDC/SUBC we are checking.
12806
12807 if (AddcSubcOp0 == MULOp.getValue(0)) {
12808 LoMul = &AddcSubcOp0;
12809 LowAddSub = &AddcSubcOp1;
12810 }
12811 if (AddcSubcOp1 == MULOp.getValue(0)) {
12812 LoMul = &AddcSubcOp1;
12813 LowAddSub = &AddcSubcOp0;
12814 }
12815
12816 if (!LoMul)
12817 return SDValue();
12818
12819 // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
12820 // the replacement below will create a cycle.
12821 if (AddcSubcNode == HiAddSub->getNode() ||
12822 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12823 return SDValue();
12824
12825 // Create the merged node.
12826 SelectionDAG &DAG = DCI.DAG;
12827
12828 // Start building operand list.
12829 SmallVector<SDValue, 8> Ops;
12830 Ops.push_back(LoMul->getOperand(0));
12831 Ops.push_back(LoMul->getOperand(1));
12832
12833 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12834 // the case, we must be doing signed multiplication and only use the higher
12835 // part of the result of the MLAL; furthermore, the LowAddSub must be a constant
12836 // addition or subtraction with the value of 0x80000000.
12837 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12838 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12839 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12840 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12841 0x80000000) {
12842 Ops.push_back(*HiAddSub);
12843 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12844 FinalOpc = ARMISD::SMMLSR;
12845 } else {
12846 FinalOpc = ARMISD::SMMLAR;
12847 }
12848 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12849 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12850
12851 return SDValue(AddeSubeNode, 0);
12852 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12853 // SMMLS is generated during instruction selection and the rest of this
12854 // function can not handle the case where AddcSubcNode is a SUBC.
12855 return SDValue();
12856
12857 // Finish building the operand list for {U/S}MLAL
12858 Ops.push_back(*LowAddSub);
12859 Ops.push_back(*HiAddSub);
12860
12861 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12862 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12863
12864 // Replace the ADD nodes' uses with the MLA node's values.
12865 SDValue HiMLALResult(MLALNode.getNode(), 1);
12866 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12867
12868 SDValue LoMLALResult(MLALNode.getNode(), 0);
12869 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12870
12871 // Return original node to notify the driver to stop replacing.
12872 return SDValue(AddeSubeNode, 0);
12873}
12874
12875static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
12876 TargetLowering::DAGCombinerInfo &DCI,
12877 const ARMSubtarget *Subtarget) {
12878 // UMAAL is similar to UMLAL except that it adds two unsigned values.
12879 // While trying to combine for the other MLAL nodes, first search for the
12880 // chance to use UMAAL. Check if Addc uses a node which has already
12881 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
12882 // as the addend, and it's handled in PerformUMLALCombine.
12883
12884 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12885 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12886
12887 // Check that we have a glued ADDC node.
12888 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
12889 if (AddcNode->getOpcode() != ARMISD::ADDC)
12890 return SDValue();
12891
12892 // Find the converted UMAAL or quit if it doesn't exist.
12893 SDNode *UmlalNode = nullptr;
12894 SDValue AddHi;
12895 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
12896 UmlalNode = AddcNode->getOperand(0).getNode();
12897 AddHi = AddcNode->getOperand(1);
12898 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
12899 UmlalNode = AddcNode->getOperand(1).getNode();
12900 AddHi = AddcNode->getOperand(0);
12901 } else {
12902 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
12903 }
12904
12905 // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
12906 // the ADDC as well as Zero.
12907 if (!isNullConstant(UmlalNode->getOperand(3)))
12908 return SDValue();
12909
12910 if ((isNullConstant(AddeNode->getOperand(0)) &&
12911 AddeNode->getOperand(1).getNode() == UmlalNode) ||
12912 (AddeNode->getOperand(0).getNode() == UmlalNode &&
12913 isNullConstant(AddeNode->getOperand(1)))) {
12914 SelectionDAG &DAG = DCI.DAG;
12915 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
12916 UmlalNode->getOperand(2), AddHi };
12917 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
12918 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12919
12920 // Replace the ADD nodes' uses with the UMAAL node's values.
12921 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
12922 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
12923
12924 // Return original node to notify the driver to stop replacing.
12925 return SDValue(AddeNode, 0);
12926 }
12927 return SDValue();
12928}
12929
12930static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
12931 const ARMSubtarget *Subtarget) {
12932 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
12933 return SDValue();
12934
12935 // Check that we have a pair of ADDC and ADDE as operands.
12936 // Both addends of the ADDE must be zero.
12937 SDNode* AddcNode = N->getOperand(2).getNode();
12938 SDNode* AddeNode = N->getOperand(3).getNode();
12939 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
12940 (AddeNode->getOpcode() == ARMISD::ADDE) &&
12941 isNullConstant(AddeNode->getOperand(0)) &&
12942 isNullConstant(AddeNode->getOperand(1)) &&
12943 (AddeNode->getOperand(2).getNode() == AddcNode))
12944 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
12945 DAG.getVTList(MVT::i32, MVT::i32),
12946 {N->getOperand(0), N->getOperand(1),
12947 AddcNode->getOperand(0), AddcNode->getOperand(1)});
12948 else
12949 return SDValue();
12950}
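// Minimal sketch of the fold above (illustrative): when a UMLAL's lo/hi
// accumulator inputs come from a zero-based ADDC/ADDE carry chain,
//   (ARMISD::UMLAL a, b, (ARMISD::ADDC x, y), (ARMISD::ADDE 0, 0, carry))
// is rewritten as ARMISD::UMAAL a, b, x, y, which folds both 32-bit addends
// into the 64-bit multiply-accumulate.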
12951
12952static SDValue PerformAddcSubcCombine(SDNode *N,
12953 TargetLowering::DAGCombinerInfo &DCI,
12954 const ARMSubtarget *Subtarget) {
12955 SelectionDAG &DAG(DCI.DAG);
12956
12957 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
12958 // (SUBC (ADDE 0, 0, C), 1) -> C
12959 SDValue LHS = N->getOperand(0);
12960 SDValue RHS = N->getOperand(1);
12961 if (LHS->getOpcode() == ARMISD::ADDE &&
12962 isNullConstant(LHS->getOperand(0)) &&
12963 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
12964 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
12965 }
12966 }
12967
12968 if (Subtarget->isThumb1Only()) {
12969 SDValue RHS = N->getOperand(1);
12970 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12971 int32_t imm = C->getSExtValue();
12972 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
12973 SDLoc DL(N);
12974 RHS = DAG.getConstant(-imm, DL, MVT::i32);
12975 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
12976 : ARMISD::ADDC;
12977 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
12978 }
12979 }
12980 }
12981
12982 return SDValue();
12983}
12984
12985static SDValue PerformAddeSubeCombine(SDNode *N,
12986 TargetLowering::DAGCombinerInfo &DCI,
12987 const ARMSubtarget *Subtarget) {
12988 if (Subtarget->isThumb1Only()) {
12989 SelectionDAG &DAG = DCI.DAG;
12990 SDValue RHS = N->getOperand(1);
12991 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
12992 int64_t imm = C->getSExtValue();
12993 if (imm < 0) {
12994 SDLoc DL(N);
12995
12996 // The with-carry-in form matches bitwise not instead of the negation.
12997 // Effectively, the inverse interpretation of the carry flag already
12998 // accounts for part of the negation.
12999 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13000
13001 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13002 : ARMISD::ADDE;
13003 return DAG.getNode(Opcode, DL, N->getVTList(),
13004 N->getOperand(0), RHS, N->getOperand(2));
13005 }
13006 }
13007 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13008 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13009 }
13010 return SDValue();
13011}
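// Worked example for the Thumb1 path above (illustrative): (ARMISD::ADDE x,
// -5, carry) computes x + (-5) + carry, while (ARMISD::SUBE x, 4, carry)
// computes x - 4 - !carry, the same value; using ~(-5) == 4 avoids
// materializing a negative immediate.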
13012
13013static SDValue PerformSELECTCombine(SDNode *N,
13014 TargetLowering::DAGCombinerInfo &DCI,
13015 const ARMSubtarget *Subtarget) {
13016 if (!Subtarget->hasMVEIntegerOps())
13017 return SDValue();
13018
13019 SDLoc dl(N);
13020 SDValue SetCC;
13021 SDValue LHS;
13022 SDValue RHS;
13023 ISD::CondCode CC;
13024 SDValue TrueVal;
13025 SDValue FalseVal;
13026
13027 if (N->getOpcode() == ISD::SELECT &&
13028 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13029 SetCC = N->getOperand(0);
13030 LHS = SetCC->getOperand(0);
13031 RHS = SetCC->getOperand(1);
13032 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13033 TrueVal = N->getOperand(1);
13034 FalseVal = N->getOperand(2);
13035 } else if (N->getOpcode() == ISD::SELECT_CC) {
13036 LHS = N->getOperand(0);
13037 RHS = N->getOperand(1);
13038 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13039 TrueVal = N->getOperand(2);
13040 FalseVal = N->getOperand(3);
13041 } else {
13042 return SDValue();
13043 }
13044
13045 unsigned int Opcode = 0;
13046 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13047 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13048 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13049 Opcode = ARMISD::VMINVu;
13050 if (CC == ISD::SETUGT)
13051 std::swap(TrueVal, FalseVal);
13052 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13053 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13054 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13055 Opcode = ARMISD::VMINVs;
13056 if (CC == ISD::SETGT)
13057 std::swap(TrueVal, FalseVal);
13058 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13059 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13060 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13061 Opcode = ARMISD::VMAXVu;
13062 if (CC == ISD::SETULT)
13063 std::swap(TrueVal, FalseVal);
13064 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13065 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13066 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13067 Opcode = ARMISD::VMAXVs;
13068 if (CC == ISD::SETLT)
13069 std::swap(TrueVal, FalseVal);
13070 } else
13071 return SDValue();
13072
13073 // Normalise to the right hand side being the vector reduction
13074 switch (TrueVal->getOpcode()) {
13075 case ISD::VECREDUCE_UMIN:
13076 case ISD::VECREDUCE_SMIN:
13077 case ISD::VECREDUCE_UMAX:
13078 case ISD::VECREDUCE_SMAX:
13079 std::swap(LHS, RHS);
13080 std::swap(TrueVal, FalseVal);
13081 break;
13082 }
13083
13084 EVT VectorType = FalseVal->getOperand(0).getValueType();
13085
13086 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13087 VectorType != MVT::v4i32)
13088 return SDValue();
13089
13090 EVT VectorScalarType = VectorType.getVectorElementType();
13091
13092 // The values being selected must also be the ones being compared
13093 if (TrueVal != LHS || FalseVal != RHS)
13094 return SDValue();
13095
13096 EVT LeftType = LHS->getValueType(0);
13097 EVT RightType = RHS->getValueType(0);
13098
13099 // The types must match the reduced type too
13100 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13101 return SDValue();
13102
13103 // Legalise the scalar to an i32
13104 if (VectorScalarType != MVT::i32)
13105 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13106
13107 // Generate the reduction as an i32 for legalisation purposes
13108 auto Reduction =
13109 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13110
13111 // The result isn't actually an i32 so truncate it back to its original type
13112 if (VectorScalarType != MVT::i32)
13113 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13114
13115 return Reduction;
13116}
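// Illustrative instance of the combine above: for an i32 scalar a and a
// v4i32 vector v,
//   (select (setult a, (vecreduce_umin v)), a, (vecreduce_umin v))
// computes the minimum of a and all lanes of v and is emitted as
// ARMISD::VMINVu a, v.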
13117
13118// A special combine for the vqdmulh family of instructions. This is one of the
13119// potential set of patterns that could match this instruction. The base pattern
13120// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13121// This matches the different min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13122// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15)) as
13123// the max is unnecessary.
13124static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG) {
13125 EVT VT = N->getValueType(0);
13126 SDValue Shft;
13127 ConstantSDNode *Clamp;
13128
13129 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13130 return SDValue();
13131
13132 if (N->getOpcode() == ISD::SMIN) {
13133 Shft = N->getOperand(0);
13134 Clamp = isConstOrConstSplat(N->getOperand(1));
13135 } else if (N->getOpcode() == ISD::VSELECT) {
13136 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13137 SDValue Cmp = N->getOperand(0);
13138 if (Cmp.getOpcode() != ISD::SETCC ||
13139 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13140 Cmp.getOperand(0) != N->getOperand(1) ||
13141 Cmp.getOperand(1) != N->getOperand(2))
13142 return SDValue();
13143 Shft = N->getOperand(1);
13144 Clamp = isConstOrConstSplat(N->getOperand(2));
13145 } else
13146 return SDValue();
13147
13148 if (!Clamp)
13149 return SDValue();
13150
13151 MVT ScalarType;
13152 int ShftAmt = 0;
13153 switch (Clamp->getSExtValue()) {
13154 case (1 << 7) - 1:
13155 ScalarType = MVT::i8;
13156 ShftAmt = 7;
13157 break;
13158 case (1 << 15) - 1:
13159 ScalarType = MVT::i16;
13160 ShftAmt = 15;
13161 break;
13162 case (1ULL << 31) - 1:
13163 ScalarType = MVT::i32;
13164 ShftAmt = 31;
13165 break;
13166 default:
13167 return SDValue();
13168 }
13169
13170 if (Shft.getOpcode() != ISD::SRA)
13171 return SDValue();
13173 if (!N1 || N1->getSExtValue() != ShftAmt)
13174 return SDValue();
13175
13176 SDValue Mul = Shft.getOperand(0);
13177 if (Mul.getOpcode() != ISD::MUL)
13178 return SDValue();
13179
13180 SDValue Ext0 = Mul.getOperand(0);
13181 SDValue Ext1 = Mul.getOperand(1);
13182 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13183 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13184 return SDValue();
13185 EVT VecVT = Ext0.getOperand(0).getValueType();
13186 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13187 return SDValue();
13188 if (Ext1.getOperand(0).getValueType() != VecVT ||
13189 VecVT.getScalarType() != ScalarType ||
13190 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13191 return SDValue();
13192
13193 SDLoc DL(Mul);
13194 unsigned LegalLanes = 128 / (ShftAmt + 1);
13195 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13196 // For types smaller than legal vectors, extend to be legal and only use the
13197 // needed lanes.
13198 if (VecVT.getSizeInBits() < 128) {
13199 EVT ExtVecVT =
13200 MVT::getVectorVT(MVT::getIntegerVT(128 / VecVT.getVectorNumElements()),
13201 VecVT.getVectorNumElements());
13202 SDValue Inp0 =
13203 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13204 SDValue Inp1 =
13205 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13206 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13207 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13208 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13209 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13210 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13211 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13212 }
13213
13214 // For larger types, split into legal sized chunks.
13215 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13216 unsigned NumParts = VecVT.getSizeInBits() / 128;
13217 SmallVector<SDValue> Parts;
13218 for (unsigned I = 0; I < NumParts; ++I) {
13219 SDValue Inp0 =
13220 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13221 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13222 SDValue Inp1 =
13223 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13224 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13225 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13226 Parts.push_back(VQDMULH);
13227 }
13228 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13229 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13230}
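// Illustrative instance (shapes chosen for exposition): for v8i16 inputs a
// and b, the v8i32 pattern
//   (smin (sra (mul (sext a), (sext b)), 15), 32767)
// is matched here and emitted as ARMISD::VQDMULH a, b, the saturating
// doubling multiply returning the high halves, then sign-extended back.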
13231
13232static SDValue PerformVSELECTCombine(SDNode *N,
13233 TargetLowering::DAGCombinerInfo &DCI,
13234 const ARMSubtarget *Subtarget) {
13235 if (!Subtarget->hasMVEIntegerOps())
13236 return SDValue();
13237
13238 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13239 return V;
13240
13241 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13242 //
13243 // We need to re-implement this optimization here as the implementation in the
13244 // Target-Independent DAGCombiner does not handle the kind of constant we make
13245 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13246 // good reason, allowing truncation there would break other targets).
13247 //
13248 // Currently, this is only done for MVE, as it's the only target that benefits
13249 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13250 if (N->getOperand(0).getOpcode() != ISD::XOR)
13251 return SDValue();
13252 SDValue XOR = N->getOperand(0);
13253
13254 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13255 // It is important to check with truncation allowed as the BUILD_VECTORs we
13256 // generate in those situations will truncate their operands.
13257 ConstantSDNode *Const =
13258 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13259 /*AllowTruncation*/ true);
13260 if (!Const || !Const->isOne())
13261 return SDValue();
13262
13263 // Rewrite into vselect(cond, rhs, lhs).
13264 SDValue Cond = XOR->getOperand(0);
13265 SDValue LHS = N->getOperand(1);
13266 SDValue RHS = N->getOperand(2);
13267 EVT Type = N->getValueType(0);
13268 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13269}
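// Small example of the rewrite above (illustrative): with a v4i1 predicate p,
//   (vselect (xor p, splat 1), lhs, rhs)  -->  (vselect p, rhs, lhs)
// so the VPNOT disappears and a single VPSEL is selected.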
13270
13271// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
13272static SDValue PerformVSetCCToVCTPCombine(SDNode *N,
13273 TargetLowering::DAGCombinerInfo &DCI,
13274 const ARMSubtarget *Subtarget) {
13275 SDValue Op0 = N->getOperand(0);
13276 SDValue Op1 = N->getOperand(1);
13277 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13278 EVT VT = N->getValueType(0);
13279
13280 if (!Subtarget->hasMVEIntegerOps() ||
13281 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
13282 return SDValue();
13283
13284 if (CC == ISD::SETUGE) {
13285 std::swap(Op0, Op1);
13286 CC = ISD::SETULT;
13287 }
13288
13289 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13290 Op0.getOpcode() != ISD::BUILD_VECTOR)
13291 return SDValue();
13292
13293 // Check first operand is BuildVector of 0,1,2,...
13294 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13295 if (!Op0.getOperand(I).isUndef() &&
13296 !(isa<ConstantSDNode>(Op0.getOperand(I)) &&
13297 Op0.getConstantOperandVal(I) == I))
13298 return SDValue();
13299 }
13300
13301 // The second is a Splat of Op1S
13302 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13303 if (!Op1S)
13304 return SDValue();
13305
13306 unsigned Opc;
13307 switch (VT.getVectorNumElements()) {
13308 case 2:
13309 Opc = Intrinsic::arm_mve_vctp64;
13310 break;
13311 case 4:
13312 Opc = Intrinsic::arm_mve_vctp32;
13313 break;
13314 case 8:
13315 Opc = Intrinsic::arm_mve_vctp16;
13316 break;
13317 case 16:
13318 Opc = Intrinsic::arm_mve_vctp8;
13319 break;
13320 default:
13321 return SDValue();
13322 }
13323
13324 SDLoc DL(N);
13325 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13326 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13327 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13328}
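// Illustrative instance of the conversion above: for a v4i1 result,
//   (setcc build_vector<0,1,2,3>, (splat n), setult)
// sets exactly the lanes with index < n, so it is emitted as the
// arm.mve.vctp32 intrinsic on n.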
13329
13330/// PerformADDECombine - Target-specific dag combine transform from
13331/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13332/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
13333static SDValue PerformADDECombine(SDNode *N,
13334 TargetLowering::DAGCombinerInfo &DCI,
13335 const ARMSubtarget *Subtarget) {
13336 // Only ARM and Thumb2 support UMLAL/SMLAL.
13337 if (Subtarget->isThumb1Only())
13338 return PerformAddeSubeCombine(N, DCI, Subtarget);
13339
13340 // Only perform the checks after legalize when the pattern is available.
13341 if (DCI.isBeforeLegalize()) return SDValue();
13342
13343 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13344}
13345
13346/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13347/// operands N0 and N1. This is a helper for PerformADDCombine that is
13348/// called with the default operands, and if that fails, with commuted
13349/// operands.
13350static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
13351 TargetLowering::DAGCombinerInfo &DCI,
13352 const ARMSubtarget *Subtarget){
13353 // Attempt to create vpadd for this add.
13354 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13355 return Result;
13356
13357 // Attempt to create vpaddl for this add.
13358 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13359 return Result;
13360 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13361 Subtarget))
13362 return Result;
13363
13364 // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
13365 if (N0.getNode()->hasOneUse())
13366 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13367 return Result;
13368 return SDValue();
13369}
13370
13371static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG) {
13372 EVT VT = N->getValueType(0);
13373 SDValue N0 = N->getOperand(0);
13374 SDValue N1 = N->getOperand(1);
13375 SDLoc dl(N);
13376
13377 auto IsVecReduce = [](SDValue Op) {
13378 switch (Op.getOpcode()) {
13379 case ISD::VECREDUCE_ADD:
13380 case ARMISD::VADDVs:
13381 case ARMISD::VADDVu:
13382 case ARMISD::VMLAVs:
13383 case ARMISD::VMLAVu:
13384 return true;
13385 }
13386 return false;
13387 };
13388
13389 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13390 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13391 // add(add(X, vecreduce(Y)), vecreduce(Z))
13392 // to make better use of vaddva style instructions.
13393 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13394 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13395 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13396 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13397 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13398 }
13399 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13400 // add(add(add(A, C), reduce(B)), reduce(D))
13401 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13402 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13403 unsigned N0RedOp = 0;
13404 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13405 N0RedOp = 1;
13406 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13407 return SDValue();
13408 }
13409
13410 unsigned N1RedOp = 0;
13411 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13412 N1RedOp = 1;
13413 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13414 return SDValue();
13415
13416 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13417 N1.getOperand(1 - N1RedOp));
13418 SDValue Add1 =
13419 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13420 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13421 }
13422 return SDValue();
13423 };
13424 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13425 return R;
13426 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13427 return R;
13428
13429 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13430 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13431 // by ascending load offsets. This can help cores prefetch if the order of
13432 // loads is more predictable.
13433 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13434 // Check if two reductions are known to load data where one is before/after
13435 // another. Return negative if N0 loads data before N1, positive if N1 is
13436 // before N0, and 0 if nothing is known.
13437 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13438 // Look through to the first operand of a MUL, for the VMLA case.
13439 // Currently only looks at the first operand, in the hope they are equal.
13440 if (N0.getOpcode() == ISD::MUL)
13441 N0 = N0.getOperand(0);
13442 if (N1.getOpcode() == ISD::MUL)
13443 N1 = N1.getOperand(0);
13444
13445 // Return true if the two operands are loads to the same object and the
13446 // offset of the first is known to be less than the offset of the second.
13447 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13448 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13449 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13450 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13451 Load1->isIndexed())
13452 return 0;
13453
13454 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13455 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13456
13457 if (!BaseLocDecomp0.getBase() ||
13458 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13459 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13460 return 0;
13461 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13462 return -1;
13463 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13464 return 1;
13465 return 0;
13466 };
13467
13468 SDValue X;
13469 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13470 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13471 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13472 N0.getOperand(1).getOperand(0));
13473 if (IsBefore < 0) {
13474 X = N0.getOperand(0);
13475 N0 = N0.getOperand(1);
13476 } else if (IsBefore > 0) {
13477 X = N0.getOperand(1);
13478 N0 = N0.getOperand(0);
13479 } else
13480 return SDValue();
13481 } else if (IsVecReduce(N0.getOperand(0))) {
13482 X = N0.getOperand(1);
13483 N0 = N0.getOperand(0);
13484 } else if (IsVecReduce(N0.getOperand(1))) {
13485 X = N0.getOperand(0);
13486 N0 = N0.getOperand(1);
13487 } else
13488 return SDValue();
13489 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13490 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13491 // Note this is backward to how you would expect. We create
13492 // add(reduce(load + 16), reduce(load + 0)) so that the
13493 // add(reduce(load+16), X) is combined into VADDVA(X, load+16), leaving
13494 // the X as VADDV(load + 0).
13495 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13496 } else
13497 return SDValue();
13498
13499 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13500 return SDValue();
13501
13502 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13503 return SDValue();
13504
13505 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13506 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13507 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13508 };
13509 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13510 return R;
13511 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13512 return R;
13513 return SDValue();
13514}
13515
13516static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG,
13517 const ARMSubtarget *Subtarget) {
13518 if (!Subtarget->hasMVEIntegerOps())
13519 return SDValue();
13520
13521 if (SDValue R = TryDistrubutionADDVecReduce(N, DAG))
13522 return R;
13523
13524 EVT VT = N->getValueType(0);
13525 SDValue N0 = N->getOperand(0);
13526 SDValue N1 = N->getOperand(1);
13527 SDLoc dl(N);
13528
13529 if (VT != MVT::i64)
13530 return SDValue();
13531
13532 // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
13533 // will look like:
13534 // t1: i32,i32 = ARMISD::VADDLVs x
13535 // t2: i64 = build_pair t1, t1:1
13536 // t3: i64 = add t2, y
13537 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13538 // the add to be simplified separately.
13539 // We also need to check for sext / zext and commutative adds.
13540 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13541 SDValue NB) {
13542 if (NB->getOpcode() != ISD::BUILD_PAIR)
13543 return SDValue();
13544 SDValue VecRed = NB->getOperand(0);
13545 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13546 VecRed.getResNo() != 0 ||
13547 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13548 return SDValue();
13549
13550 if (VecRed->getOpcode() == OpcodeA) {
13551 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13552 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13553 VecRed.getOperand(0), VecRed.getOperand(1));
13554 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13555 }
13556
13557 SmallVector<SDValue, 4> Ops(2);
13558 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13559
13560 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13561 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13562 Ops.push_back(VecRed->getOperand(I));
13563 SDValue Red =
13564 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13565 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13566 SDValue(Red.getNode(), 1));
13567 };
13568
13569 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13570 return M;
13571 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13572 return M;
13573 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13574 return M;
13575 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13576 return M;
13577 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13578 return M;
13579 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13580 return M;
13581 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13582 return M;
13583 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13584 return M;
13585 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13586 return M;
13587 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13588 return M;
13589 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13590 return M;
13591 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13592 return M;
13593 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13594 return M;
13595 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13596 return M;
13597 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13598 return M;
13599 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13600 return M;
13601 return SDValue();
13602}
13603
13604bool
13605ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13606 CombineLevel Level) const {
13607 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13608 N->getOpcode() == ISD::SRL) &&
13609 "Expected shift op");
13610
13611 SDValue ShiftLHS = N->getOperand(0);
13612 if (!ShiftLHS->hasOneUse())
13613 return false;
13614
13615 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13616 !ShiftLHS.getOperand(0)->hasOneUse())
13617 return false;
13618
13619 if (Level == BeforeLegalizeTypes)
13620 return true;
13621
13622 if (N->getOpcode() != ISD::SHL)
13623 return true;
13624
13625 if (Subtarget->isThumb1Only()) {
13626 // Avoid making expensive immediates by commuting shifts. (This logic
13627 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13628 // for free.)
13629 if (N->getOpcode() != ISD::SHL)
13630 return true;
13631 SDValue N1 = N->getOperand(0);
13632 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13633 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13634 return true;
13635 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
13636 if (Const->getAPIntValue().ult(256))
13637 return false;
13638 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13639 Const->getAPIntValue().sgt(-256))
13640 return false;
13641 }
13642 return true;
13643 }
13644
13645 // Turn off commute-with-shift transform after legalization, so it doesn't
13646 // conflict with PerformSHLSimplify. (We could try to detect when
13647 // PerformSHLSimplify would trigger more precisely, but it isn't
13648 // really necessary.)
13649 return false;
13650}
13651
13652bool ARMTargetLowering::isDesirableToCommuteXorWithShift(
13653 const SDNode *N) const {
13654 assert(N->getOpcode() == ISD::XOR &&
13655 (N->getOperand(0).getOpcode() == ISD::SHL ||
13656 N->getOperand(0).getOpcode() == ISD::SRL) &&
13657 "Expected XOR(SHIFT) pattern");
13658
13659 // Only commute if the entire NOT mask is a hidden shifted mask.
13660 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13661 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13662 if (XorC && ShiftC) {
13663 unsigned MaskIdx, MaskLen;
13664 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13665 unsigned ShiftAmt = ShiftC->getZExtValue();
13666 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13667 if (N->getOperand(0).getOpcode() == ISD::SHL)
13668 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13669 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13670 }
13671 }
13672
13673 return false;
13674}
13675
13676bool ARMTargetLowering::shouldFoldConstantShiftPairToMask(
13677 const SDNode *N) const {
13678 assert(((N->getOpcode() == ISD::SHL &&
13679 N->getOperand(0).getOpcode() == ISD::SRL) ||
13680 (N->getOpcode() == ISD::SRL &&
13681 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13682 "Expected shift-shift mask");
13683
13684 if (!Subtarget->isThumb1Only())
13685 return true;
13686
13687 EVT VT = N->getValueType(0);
13688 if (VT.getScalarSizeInBits() > 32)
13689 return true;
13690
13691 return false;
13692}
13693
13694bool ARMTargetLowering::shouldFoldSelectWithIdentityConstant(
13695 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13696 SDValue Y) const {
13697 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13698 SelectOpcode == ISD::VSELECT;
13699}
13700
13701bool ARMTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
13702 if (!Subtarget->hasNEON()) {
13703 if (Subtarget->isThumb1Only())
13704 return VT.getScalarSizeInBits() <= 32;
13705 return true;
13706 }
13707 return VT.isScalarInteger();
13708}
13709
13710bool ARMTargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
13711 EVT VT) const {
13712 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13713 return false;
13714
13715 switch (FPVT.getSimpleVT().SimpleTy) {
13716 case MVT::f16:
13717 return Subtarget->hasVFP2Base();
13718 case MVT::f32:
13719 return Subtarget->hasVFP2Base();
13720 case MVT::f64:
13721 return Subtarget->hasFP64();
13722 case MVT::v4f32:
13723 case MVT::v8f16:
13724 return Subtarget->hasMVEFloatOps();
13725 default:
13726 return false;
13727 }
13728}
13729
13730static SDValue PerformSHLSimplify(SDNode *N,
13731 TargetLowering::DAGCombinerInfo &DCI,
13732 const ARMSubtarget *ST) {
13733 // Allow the generic combiner to identify potential bswaps.
13734 if (DCI.isBeforeLegalize())
13735 return SDValue();
13736
13737 // DAG combiner will fold:
13738 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13739 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13740 // Other code patterns that can also be modified have the following form:
13741 // b + ((a << 1) | 510)
13742 // b + ((a << 1) & 510)
13743 // b + ((a << 1) ^ 510)
13744 // b + ((a << 1) + 510)
13745
13746 // Many instructions can perform the shift for free, but it requires both
13747 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13748 // instruction will be needed. So, unfold back to the original pattern if:
13749 // - c1 and c2 are small enough that they don't require mov imms.
13750 // - the user(s) of the node can perform a shl
13751
13752 // No shifted operands for 16-bit instructions.
13753 if (ST->isThumb() && ST->isThumb1Only())
13754 return SDValue();
13755
13756 // Check that all the users could perform the shl themselves.
13757 for (auto *U : N->users()) {
13758 switch(U->getOpcode()) {
13759 default:
13760 return SDValue();
13761 case ISD::SUB:
13762 case ISD::ADD:
13763 case ISD::AND:
13764 case ISD::OR:
13765 case ISD::XOR:
13766 case ISD::SETCC:
13767 case ARMISD::CMP:
13768 // Check that the user isn't already using a constant because there
13769 // aren't any instructions that support an immediate operand and a
13770 // shifted operand.
13771 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13772 isa<ConstantSDNode>(U->getOperand(1)))
13773 return SDValue();
13774
13775 // Check that it's not already using a shift.
13776 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13777 U->getOperand(1).getOpcode() == ISD::SHL)
13778 return SDValue();
13779 break;
13780 }
13781 }
13782
13783 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13784 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13785 return SDValue();
13786
13787 if (N->getOperand(0).getOpcode() != ISD::SHL)
13788 return SDValue();
13789
13790 SDValue SHL = N->getOperand(0);
13791
13792 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13793 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13794 if (!C1ShlC2 || !C2)
13795 return SDValue();
13796
13797 APInt C2Int = C2->getAPIntValue();
13798 APInt C1Int = C1ShlC2->getAPIntValue();
13799 unsigned C2Width = C2Int.getBitWidth();
13800 if (C2Int.uge(C2Width))
13801 return SDValue();
13802 uint64_t C2Value = C2Int.getZExtValue();
13803
13804 // Check that performing a lshr will not lose any information.
13805 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13806 if ((C1Int & Mask) != C1Int)
13807 return SDValue();
13808
13809 // Shift the first constant.
13810 C1Int.lshrInPlace(C2Int);
13811
13812 // The immediates are encoded as an 8-bit value that can be rotated.
13813 auto LargeImm = [](const APInt &Imm) {
13814 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13815 return Imm.getBitWidth() - Zeros > 8;
13816 };
13817
13818 if (LargeImm(C1Int) || LargeImm(C2Int))
13819 return SDValue();
13820
13821 SelectionDAG &DAG = DCI.DAG;
13822 SDLoc dl(N);
13823 SDValue X = SHL.getOperand(0);
13824 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13825 DAG.getConstant(C1Int, dl, MVT::i32));
13826 // Shift left to compensate for the lshr of C1Int.
13827 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13828
13829 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13830 SHL.dump(); N->dump());
13831 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13832 return Res;
13833}
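// Worked example of the unfolding above (illustrative): for
//   (add (shl x, 1), 510)
// c1 = 510 and c2 = 1, so c1 >> c2 = 255 still fits a rotated 8-bit immediate
// and the node becomes (shl (add x, 255), 1), letting the users fold the
// outer shift into their shifted-operand form.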
13834
13835
13836/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13837///
13838static SDValue PerformADDCombine(SDNode *N,
13839 TargetLowering::DAGCombinerInfo &DCI,
13840 const ARMSubtarget *Subtarget) {
13841 SDValue N0 = N->getOperand(0);
13842 SDValue N1 = N->getOperand(1);
13843
13844 // Only works one way, because it needs an immediate operand.
13845 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13846 return Result;
13847
13848 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13849 return Result;
13850
13851 // First try with the default operand order.
13852 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13853 return Result;
13854
13855 // If that didn't work, try again with the operands commuted.
13856 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13857}
13858
13859// Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13860// providing -X is as cheap as X (currently, just a constant).
13861static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG) {
13862 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13863 return SDValue();
13864 SDValue CSINC = N->getOperand(1);
13865 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
13866 return SDValue();
13867
13868 ConstantSDNode *X = isConstOrConstSplat(CSINC.getOperand(0));
13869 if (!X)
13870 return SDValue();
13871
13872 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
13873 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
13874 CSINC.getOperand(0)),
13875 CSINC.getOperand(1), CSINC.getOperand(2),
13876 CSINC.getOperand(3));
13877}
13878
13879static bool isNegatedInteger(SDValue Op) {
13880 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
13881}
13882
13883// Try to fold
13884//
13885// (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
13886//
13887// The folding helps cmov to be matched with csneg without generating
13888// redundant neg instruction.
13889static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG) {
13890 if (!isNegatedInteger(SDValue(N, 0)))
13891 return SDValue();
13892
13893 SDValue CMov = N->getOperand(1);
13894 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
13895 return SDValue();
13896
13897 SDValue N0 = CMov.getOperand(0);
13898 SDValue N1 = CMov.getOperand(1);
13899
13900 // If neither of them are negations, it's not worth the folding as it
13901 // introduces two additional negations while reducing one negation.
13902 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
13903 return SDValue();
13904
13905 SDLoc DL(N);
13906 EVT VT = CMov.getValueType();
13907
13908 SDValue N0N = DAG.getNegative(N0, DL, VT);
13909 SDValue N1N = DAG.getNegative(N1, DL, VT);
13910 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
13911 CMov.getOperand(3));
13912}
13913
13914/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
13915///
13916static SDValue PerformSUBCombine(SDNode *N,
13917 TargetLowering::DAGCombinerInfo &DCI,
13918 const ARMSubtarget *Subtarget) {
13919 SDValue N0 = N->getOperand(0);
13920 SDValue N1 = N->getOperand(1);
13921
13922 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
13923 if (N1.getNode()->hasOneUse())
13924 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
13925 return Result;
13926
13927 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
13928 return R;
13929
13930 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
13931 return Val;
13932
13933 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
13934 return SDValue();
13935
13936 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
13937 // so that we can readily pattern match more mve instructions which can use
13938 // a scalar operand.
13939 SDValue VDup = N->getOperand(1);
13940 if (VDup->getOpcode() != ARMISD::VDUP)
13941 return SDValue();
13942
13943 SDValue VMov = N->getOperand(0);
13944 if (VMov->getOpcode() == ISD::BITCAST)
13945 VMov = VMov->getOperand(0);
13946
13947 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
13948 return SDValue();
13949
13950 SDLoc dl(N);
13951 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
13952 DCI.DAG.getConstant(0, dl, MVT::i32),
13953 VDup->getOperand(0));
13954 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
13955}
13956
13957/// PerformVMULCombine
13958/// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
13959/// special multiplier accumulator forwarding.
13960/// vmul d3, d0, d2
13961/// vmla d3, d1, d2
13962/// is faster than
13963/// vadd d3, d0, d1
13964/// vmul d3, d3, d2
13965// However, for (A + B) * (A + B),
13966// vadd d2, d0, d1
13967// vmul d3, d0, d2
13968// vmla d3, d1, d2
13969// is slower than
13970// vadd d2, d0, d1
13971// vmul d3, d2, d2
13972static SDValue PerformVMULCombine(SDNode *N,
13973 TargetLowering::DAGCombinerInfo &DCI,
13974 const ARMSubtarget *Subtarget) {
13975 if (!Subtarget->hasVMLxForwarding())
13976 return SDValue();
13977
13978 SelectionDAG &DAG = DCI.DAG;
13979 SDValue N0 = N->getOperand(0);
13980 SDValue N1 = N->getOperand(1);
13981 unsigned Opcode = N0.getOpcode();
13982 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13983 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
13984 Opcode = N1.getOpcode();
13985 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
13986 Opcode != ISD::FADD && Opcode != ISD::FSUB)
13987 return SDValue();
13988 std::swap(N0, N1);
13989 }
13990
13991 if (N0 == N1)
13992 return SDValue();
13993
13994 EVT VT = N->getValueType(0);
13995 SDLoc DL(N);
13996 SDValue N00 = N0->getOperand(0);
13997 SDValue N01 = N0->getOperand(1);
13998 return DAG.getNode(Opcode, DL, VT,
13999 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14000 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14001}
14002
14003static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
14004 const ARMSubtarget *Subtarget) {
14005 EVT VT = N->getValueType(0);
14006 if (VT != MVT::v2i64)
14007 return SDValue();
14008
14009 SDValue N0 = N->getOperand(0);
14010 SDValue N1 = N->getOperand(1);
14011
14012 auto IsSignExt = [&](SDValue Op) {
14013 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14014 return SDValue();
14015 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14016 if (VT.getScalarSizeInBits() == 32)
14017 return Op->getOperand(0);
14018 return SDValue();
14019 };
14020 auto IsZeroExt = [&](SDValue Op) {
14021 // Zero extends are a little more awkward. At the point we are matching
14022 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14023 // That might be before or after a bitcast depending on how the and is
14024 // placed. Because this has to look through bitcasts, it is currently only
14025 // supported on LE.
14026 if (!Subtarget->isLittle())
14027 return SDValue();
14028
14029 SDValue And = Op;
14030 if (And->getOpcode() == ISD::BITCAST)
14031 And = And->getOperand(0);
14032 if (And->getOpcode() != ISD::AND)
14033 return SDValue();
14034 SDValue Mask = And->getOperand(1);
14035 if (Mask->getOpcode() == ISD::BITCAST)
14036 Mask = Mask->getOperand(0);
14037
14038 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14039 Mask.getValueType() != MVT::v4i32)
14040 return SDValue();
14041 if (isAllOnesConstant(Mask->getOperand(0)) &&
14042 isNullConstant(Mask->getOperand(1)) &&
14043 isAllOnesConstant(Mask->getOperand(2)) &&
14044 isNullConstant(Mask->getOperand(3)))
14045 return And->getOperand(0);
14046 return SDValue();
14047 };
14048
14049 SDLoc dl(N);
14050 if (SDValue Op0 = IsSignExt(N0)) {
14051 if (SDValue Op1 = IsSignExt(N1)) {
14052 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14053 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14054 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14055 }
14056 }
14057 if (SDValue Op0 = IsZeroExt(N0)) {
14058 if (SDValue Op1 = IsZeroExt(N1)) {
14059 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14060 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14061 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14062 }
14063 }
14064
14065 return SDValue();
14066}
14067
14068static SDValue PerformMULCombine(SDNode *N,
14069 TargetLowering::DAGCombinerInfo &DCI,
14070 const ARMSubtarget *Subtarget) {
14071 SelectionDAG &DAG = DCI.DAG;
14072
14073 EVT VT = N->getValueType(0);
14074 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14075 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14076
14077 if (Subtarget->isThumb1Only())
14078 return SDValue();
14079
14080 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14081 return SDValue();
14082
14083 if (VT.is64BitVector() || VT.is128BitVector())
14084 return PerformVMULCombine(N, DCI, Subtarget);
14085 if (VT != MVT::i32)
14086 return SDValue();
14087
14088 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14089 if (!C)
14090 return SDValue();
14091
14092 int64_t MulAmt = C->getSExtValue();
14093 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14094
14095 ShiftAmt = ShiftAmt & (32 - 1);
14096 SDValue V = N->getOperand(0);
14097 SDLoc DL(N);
14098
14099 SDValue Res;
14100 MulAmt >>= ShiftAmt;
14101
14102 if (MulAmt >= 0) {
14103 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14104 // (mul x, 2^N + 1) => (add (shl x, N), x)
14105 Res = DAG.getNode(ISD::ADD, DL, VT,
14106 V,
14107 DAG.getNode(ISD::SHL, DL, VT,
14108 V,
14109 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14110 MVT::i32)));
14111 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14112 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14113 Res = DAG.getNode(ISD::SUB, DL, VT,
14114 DAG.getNode(ISD::SHL, DL, VT,
14115 V,
14116 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14117 MVT::i32)),
14118 V);
14119 } else
14120 return SDValue();
14121 } else {
14122 uint64_t MulAmtAbs = -MulAmt;
14123 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14124 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14125 Res = DAG.getNode(ISD::SUB, DL, VT,
14126 V,
14127 DAG.getNode(ISD::SHL, DL, VT,
14128 V,
14129 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14130 MVT::i32)));
14131 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14132 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14133 Res = DAG.getNode(ISD::ADD, DL, VT,
14134 V,
14135 DAG.getNode(ISD::SHL, DL, VT,
14136 V,
14137 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14138 MVT::i32)));
14139 Res = DAG.getNode(ISD::SUB, DL, VT,
14140 DAG.getConstant(0, DL, MVT::i32), Res);
14141 } else
14142 return SDValue();
14143 }
14144
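 // Worked example: for (mul x, 24), ShiftAmt is 3 and 24 >> 3 == 3 == 2^1 + 1,
 // so Res is (add x, (shl x, 1)) and the shift below yields ((x << 1) + x) << 3.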
14145 if (ShiftAmt != 0)
14146 Res = DAG.getNode(ISD::SHL, DL, VT,
14147 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14148
14149 // Do not add new nodes to DAG combiner worklist.
14150 DCI.CombineTo(N, Res, false);
14151 return SDValue();
14152}
14153
14154static SDValue CombineANDShift(SDNode *N,
14155 TargetLowering::DAGCombinerInfo &DCI,
14156 const ARMSubtarget *Subtarget) {
14157 // Allow DAGCombine to pattern-match before we touch the canonical form.
14158 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14159 return SDValue();
14160
14161 if (N->getValueType(0) != MVT::i32)
14162 return SDValue();
14163
14164 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14165 if (!N1C)
14166 return SDValue();
14167
14168 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14169 // Don't transform uxtb/uxth.
14170 if (C1 == 255 || C1 == 65535)
14171 return SDValue();
14172
14173 SDNode *N0 = N->getOperand(0).getNode();
14174 if (!N0->hasOneUse())
14175 return SDValue();
14176
14177 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14178 return SDValue();
14179
14180 bool LeftShift = N0->getOpcode() == ISD::SHL;
14181
14182 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
14183 if (!N01C)
14184 return SDValue();
14185
14186 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14187 if (!C2 || C2 >= 32)
14188 return SDValue();
14189
14190 // Clear irrelevant bits in the mask.
14191 if (LeftShift)
14192 C1 &= (-1U << C2);
14193 else
14194 C1 &= (-1U >> C2);
14195
14196 SelectionDAG &DAG = DCI.DAG;
14197 SDLoc DL(N);
14198
14199 // We have a pattern of the form "(and (shl x, c2) c1)" or
14200 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14201 // transform to a pair of shifts, to save materializing c1.
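 // Worked example: (and (srl x, 4), 0x0fff) has C2 = 4 and C3 = 20, so it becomes
 // (srl (shl x, 16), 20), avoiding a separate constant for the 0x0fff mask.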
14202
14203 // First pattern: right shift, then mask off leading bits.
14204 // FIXME: Use demanded bits?
14205 if (!LeftShift && isMask_32(C1)) {
14206 uint32_t C3 = llvm::countl_zero(C1);
14207 if (C2 < C3) {
14208 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14209 DAG.getConstant(C3 - C2, DL, MVT::i32));
14210 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14211 DAG.getConstant(C3, DL, MVT::i32));
14212 }
14213 }
14214
14215 // First pattern, reversed: left shift, then mask off trailing bits.
14216 if (LeftShift && isMask_32(~C1)) {
14217 uint32_t C3 = llvm::countr_zero(C1);
14218 if (C2 < C3) {
14219 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14220 DAG.getConstant(C3 - C2, DL, MVT::i32));
14221 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14222 DAG.getConstant(C3, DL, MVT::i32));
14223 }
14224 }
14225
14226 // Second pattern: left shift, then mask off leading bits.
14227 // FIXME: Use demanded bits?
14228 if (LeftShift && isShiftedMask_32(C1)) {
14229 uint32_t Trailing = llvm::countr_zero(C1);
14230 uint32_t C3 = llvm::countl_zero(C1);
14231 if (Trailing == C2 && C2 + C3 < 32) {
14232 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14233 DAG.getConstant(C2 + C3, DL, MVT::i32));
14234 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14235 DAG.getConstant(C3, DL, MVT::i32));
14236 }
14237 }
14238
14239 // Second pattern, reversed: right shift, then mask off trailing bits.
14240 // FIXME: Handle other patterns of known/demanded bits.
14241 if (!LeftShift && isShiftedMask_32(C1)) {
14242 uint32_t Leading = llvm::countl_zero(C1);
14243 uint32_t C3 = llvm::countr_zero(C1);
14244 if (Leading == C2 && C2 + C3 < 32) {
14245 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14246 DAG.getConstant(C2 + C3, DL, MVT::i32));
14247 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14248 DAG.getConstant(C3, DL, MVT::i32));
14249 }
14250 }
14251
14252 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14253 // if "c1 >> c2" is a cheaper immediate than "c1"
14254 if (LeftShift &&
14255 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14256
14257 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14258 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14259 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14260 DAG.getConstant(C2, DL, MVT::i32));
14261 }
14262
14263 return SDValue();
14264}
14265
14266static SDValue PerformANDCombine(SDNode *N,
14267 TargetLowering::DAGCombinerInfo &DCI,
14268 const ARMSubtarget *Subtarget) {
14269 // Attempt to use immediate-form VBIC
14270 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14271 SDLoc dl(N);
14272 EVT VT = N->getValueType(0);
14273 SelectionDAG &DAG = DCI.DAG;
14274
14275 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14276 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14277 return SDValue();
14278
14279 APInt SplatBits, SplatUndef;
14280 unsigned SplatBitSize;
14281 bool HasAnyUndefs;
14282 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14283 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14284 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14285 SplatBitSize == 64) {
14286 EVT VbicVT;
14287 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14288 SplatUndef.getZExtValue(), SplatBitSize,
14289 DAG, dl, VbicVT, VT, OtherModImm);
14290 if (Val.getNode()) {
14291 SDValue Input =
14292 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14293 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14294 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14295 }
14296 }
14297 }
14298
14299 if (!Subtarget->isThumb1Only()) {
14300 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14301 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14302 return Result;
14303
14304 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14305 return Result;
14306 }
14307
14308 if (Subtarget->isThumb1Only())
14309 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14310 return Result;
14311
14312 return SDValue();
14313}
14314
14315// Try combining OR nodes to SMULWB, SMULWT.
14316static SDValue PerformORCombineToSMULWBT(SDNode *OR,
14317 TargetLowering::DAGCombinerInfo &DCI,
14318 const ARMSubtarget *Subtarget) {
14319 if (!Subtarget->hasV6Ops() ||
14320 (Subtarget->isThumb() &&
14321 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14322 return SDValue();
14323
14324 SDValue SRL = OR->getOperand(0);
14325 SDValue SHL = OR->getOperand(1);
14326
14327 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14328 SRL = OR->getOperand(1);
14329 SHL = OR->getOperand(0);
14330 }
14331 if (!isSRL16(SRL) || !isSHL16(SHL))
14332 return SDValue();
14333
14334 // The first operands to the shifts need to be the two results from the
14335 // same smul_lohi node.
14336 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14337 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14338 return SDValue();
14339
14340 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14341 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14342 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14343 return SDValue();
14344
14345 // Now we have:
14346 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14347 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
14348 // For SMULWB the 16-bit value will be sign-extended somehow.
14349 // For SMULWT only the SRA is required.
14350 // Check both sides of SMUL_LOHI
14351 SDValue OpS16 = SMULLOHI->getOperand(0);
14352 SDValue OpS32 = SMULLOHI->getOperand(1);
14353
14354 SelectionDAG &DAG = DCI.DAG;
14355 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14356 OpS16 = OpS32;
14357 OpS32 = SMULLOHI->getOperand(0);
14358 }
14359
14360 SDLoc dl(OR);
14361 unsigned Opcode = 0;
14362 if (isS16(OpS16, DAG))
14363 Opcode = ARMISD::SMULWB;
14364 else if (isSRA16(OpS16)) {
14365 Opcode = ARMISD::SMULWT;
14366 OpS16 = OpS16->getOperand(0);
14367 }
14368 else
14369 return SDValue();
14370
14371 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14372 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14373 return SDValue(OR, 0);
14374}
14375
14376static SDValue PerformORCombineToBFI(SDNode *N,
14377 TargetLowering::DAGCombinerInfo &DCI,
14378 const ARMSubtarget *Subtarget) {
14379 // BFI is only available on V6T2+
14380 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14381 return SDValue();
14382
14383 EVT VT = N->getValueType(0);
14384 SDValue N0 = N->getOperand(0);
14385 SDValue N1 = N->getOperand(1);
14386 SelectionDAG &DAG = DCI.DAG;
14387 SDLoc DL(N);
14388 // 1) or (and A, mask), val => ARMbfi A, val, mask
14389 // iff (val & ~mask) == val
14390 //
14391 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14392 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14393 // && mask == ~mask2
14394 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14395 // && ~mask == mask2
14396 // (i.e., copy a bitfield value into another bitfield of the same width)
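 // For example, (or (and A, 0xffff00ff), 0x00004200) satisfies case (1) and
 // becomes (ARMbfi A, 0x42, 0xffff00ff), inserting 0x42 into bits [15:8] of A.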
14397
14398 if (VT != MVT::i32)
14399 return SDValue();
14400
14401 SDValue N00 = N0.getOperand(0);
14402
14403 // The value and the mask need to be constants so we can verify this is
14404 // actually a bitfield set. If the mask is 0xffff, we can do better
14405 // via a movt instruction, so don't use BFI in that case.
14406 SDValue MaskOp = N0.getOperand(1);
14407 ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
14408 if (!MaskC)
14409 return SDValue();
14410 unsigned Mask = MaskC->getZExtValue();
14411 if (Mask == 0xffff)
14412 return SDValue();
14413 SDValue Res;
14414 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
14415 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
14416 if (N1C) {
14417 unsigned Val = N1C->getZExtValue();
14418 if ((Val & ~Mask) != Val)
14419 return SDValue();
14420
14421 if (ARM::isBitFieldInvertedMask(Mask)) {
14422 Val >>= llvm::countr_zero(~Mask);
14423
14424 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14425 DAG.getConstant(Val, DL, MVT::i32),
14426 DAG.getConstant(Mask, DL, MVT::i32));
14427
14428 DCI.CombineTo(N, Res, false);
14429 // Return value from the original node to inform the combiner that N is
14430 // now dead.
14431 return SDValue(N, 0);
14432 }
14433 } else if (N1.getOpcode() == ISD::AND) {
14434 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14435 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14436 if (!N11C)
14437 return SDValue();
14438 unsigned Mask2 = N11C->getZExtValue();
14439
14440 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14441 // as is to match.
14442 if (ARM::isBitFieldInvertedMask(Mask) &&
14443 (Mask == ~Mask2)) {
14444 // The pack halfword instruction works better for masks that fit it,
14445 // so use that when it's available.
14446 if (Subtarget->hasDSP() &&
14447 (Mask == 0xffff || Mask == 0xffff0000))
14448 return SDValue();
14449 // 2a
14450 unsigned amt = llvm::countr_zero(Mask2);
14451 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14452 DAG.getConstant(amt, DL, MVT::i32));
14453 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14454 DAG.getConstant(Mask, DL, MVT::i32));
14455 DCI.CombineTo(N, Res, false);
14456 // Return value from the original node to inform the combiner that N is
14457 // now dead.
14458 return SDValue(N, 0);
14459 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14460 (~Mask == Mask2)) {
14461 // The pack halfword instruction works better for masks that fit it,
14462 // so use that when it's available.
14463 if (Subtarget->hasDSP() &&
14464 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14465 return SDValue();
14466 // 2b
14467 unsigned lsb = llvm::countr_zero(Mask);
14468 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14469 DAG.getConstant(lsb, DL, MVT::i32));
14470 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14471 DAG.getConstant(Mask2, DL, MVT::i32));
14472 DCI.CombineTo(N, Res, false);
14473 // Return value from the original node to inform the combiner that N is
14474 // now dead.
14475 return SDValue(N, 0);
14476 }
14477 }
14478
14479 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14480 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14481 ARM::isBitFieldInvertedMask(~Mask)) {
14482 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14483 // where lsb(mask) == #shamt and masked bits of B are known zero.
14484 SDValue ShAmt = N00.getOperand(1);
14485 unsigned ShAmtC = ShAmt->getAsZExtVal();
14486 unsigned LSB = llvm::countr_zero(Mask);
14487 if (ShAmtC != LSB)
14488 return SDValue();
14489
14490 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14491 DAG.getConstant(~Mask, DL, MVT::i32));
14492
14493 DCI.CombineTo(N, Res, false);
14494 // Return value from the original node to inform the combiner that N is
14495 // now dead.
14496 return SDValue(N, 0);
14497 }
14498
14499 return SDValue();
14500}
14501
14502static bool isValidMVECond(unsigned CC, bool IsFloat) {
14503 switch (CC) {
14504 case ARMCC::EQ:
14505 case ARMCC::NE:
14506 case ARMCC::LE:
14507 case ARMCC::GT:
14508 case ARMCC::GE:
14509 case ARMCC::LT:
14510 return true;
14511 case ARMCC::HS:
14512 case ARMCC::HI:
14513 return !IsFloat;
14514 default:
14515 return false;
14516 };
14517}
14518
14519static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
14520 if (N->getOpcode() == ARMISD::VCMP)
14521 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14522 else if (N->getOpcode() == ARMISD::VCMPZ)
14523 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14524 else
14525 llvm_unreachable("Not a VCMP/VCMPZ!");
14526}
14527
14528static bool CanInvertMVEVCMP(SDValue N) {
14529 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
14530 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14531}
14532
14533static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG,
14534 const ARMSubtarget *Subtarget) {
14535 // Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
14536 // together with predicates
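 // For example, (or (vcmp a, b, eq), (vcmpz c, ne)) is rewritten as
 // not(and(not(vcmp a, b, eq), not(vcmpz c, ne))); the inner nots are expected
 // to fold into vcmps with the inverted condition later.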
14537 EVT VT = N->getValueType(0);
14538 SDLoc DL(N);
14539 SDValue N0 = N->getOperand(0);
14540 SDValue N1 = N->getOperand(1);
14541
14542 auto IsFreelyInvertable = [&](SDValue V) {
14543 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14544 return CanInvertMVEVCMP(V);
14545 return false;
14546 };
14547
14548 // At least one operand must be freely invertible.
14549 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14550 return SDValue();
14551
14552 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14553 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14554 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14555 return DAG.getLogicalNOT(DL, And, VT);
14556}
14557
14558/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
14559static SDValue PerformORCombine(SDNode *N,
14560 TargetLowering::DAGCombinerInfo &DCI,
14561 const ARMSubtarget *Subtarget) {
14562 // Attempt to use immediate-form VORR
14563 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14564 SDLoc dl(N);
14565 EVT VT = N->getValueType(0);
14566 SelectionDAG &DAG = DCI.DAG;
14567
14568 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14569 return SDValue();
14570
14571 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14572 VT == MVT::v8i1 || VT == MVT::v16i1))
14573 return PerformORCombine_i1(N, DAG, Subtarget);
14574
14575 APInt SplatBits, SplatUndef;
14576 unsigned SplatBitSize;
14577 bool HasAnyUndefs;
14578 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14579 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14580 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14581 SplatBitSize == 64) {
14582 EVT VorrVT;
14583 SDValue Val =
14584 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14585 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14586 if (Val.getNode()) {
14587 SDValue Input =
14588 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14589 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14590 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14591 }
14592 }
14593 }
14594
14595 if (!Subtarget->isThumb1Only()) {
14596 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14597 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14598 return Result;
14599 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14600 return Result;
14601 }
14602
14603 SDValue N0 = N->getOperand(0);
14604 SDValue N1 = N->getOperand(1);
14605
14606 // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
14607 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14608 DCI.isBeforeLegalizeOps()) {
14609
14610 // The code below optimizes (or (and X, Y), Z).
14611 // The AND operand needs to have a single user to make these optimizations
14612 // profitable.
14613 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14614 return SDValue();
14615
14616 APInt SplatUndef;
14617 unsigned SplatBitSize;
14618 bool HasAnyUndefs;
14619
14620 APInt SplatBits0, SplatBits1;
14621 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
14622 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
14623 // Ensure that the second operands of both ANDs are constants.
14624 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14625 HasAnyUndefs) && !HasAnyUndefs) {
14626 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14627 HasAnyUndefs) && !HasAnyUndefs) {
14628 // Ensure that the bit width of the constants are the same and that
14629 // the splat arguments are logical inverses as per the pattern we
14630 // are trying to simplify.
14631 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14632 SplatBits0 == ~SplatBits1) {
14633 // Canonicalize the vector type to make instruction selection
14634 // simpler.
14635 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14636 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14637 N0->getOperand(1),
14638 N0->getOperand(0),
14639 N1->getOperand(0));
14640 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14641 }
14642 }
14643 }
14644 }
14645
14646 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14647 // reasonable.
14648 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14649 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14650 return Res;
14651 }
14652
14653 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14654 return Result;
14655
14656 return SDValue();
14657}
14658
14659static SDValue PerformXORCombine(SDNode *N,
14660 TargetLowering::DAGCombinerInfo &DCI,
14661 const ARMSubtarget *Subtarget) {
14662 EVT VT = N->getValueType(0);
14663 SelectionDAG &DAG = DCI.DAG;
14664
14665 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14666 return SDValue();
14667
14668 if (!Subtarget->isThumb1Only()) {
14669 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14670 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14671 return Result;
14672
14673 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14674 return Result;
14675 }
14676
14677 if (Subtarget->hasMVEIntegerOps()) {
14678 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14679 SDValue N0 = N->getOperand(0);
14680 SDValue N1 = N->getOperand(1);
14681 const TargetLowering *TLI = Subtarget->getTargetLowering();
14682 if (TLI->isConstTrueVal(N1) &&
14683 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14684 if (CanInvertMVEVCMP(N0)) {
14685 SDLoc DL(N0);
14686 ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
14687
14688 SmallVector<SDValue, 4> Ops;
14689 Ops.push_back(N0->getOperand(0));
14690 if (N0->getOpcode() == ARMISD::VCMP)
14691 Ops.push_back(N0->getOperand(1));
14692 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14693 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14694 }
14695 }
14696 }
14697
14698 return SDValue();
14699}
14700
14701// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14702// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14703// their position in "to" (Rd).
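// For example, for (ARMISD::BFI Rd, (srl Rn, 8), 0xffff00ff) this returns Rn with
// ToMask = 0x0000ff00 and FromMask = 0x0000ff00.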
14704static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14705 assert(N->getOpcode() == ARMISD::BFI);
14706
14707 SDValue From = N->getOperand(1);
14708 ToMask = ~N->getConstantOperandAPInt(2);
14709 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14710
14711 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14712 // #C in the base of the SHR.
14713 if (From->getOpcode() == ISD::SRL &&
14714 isa<ConstantSDNode>(From->getOperand(1))) {
14715 APInt Shift = From->getConstantOperandAPInt(1);
14716 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14717 FromMask <<= Shift.getLimitedValue(31);
14718 From = From->getOperand(0);
14719 }
14720
14721 return From;
14722}
14723
14724// If A and B contain one contiguous set of bits, does A | B == A . B?
14725//
14726// Neither A nor B may be zero.
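// For example, A = 0b111000 and B = 0b000111 concatenate properly: A's lowest set
// bit (index 3) sits directly above B's highest set bit (index 2), so A | B is the
// contiguous mask 0b111111.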
14727static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14728 unsigned LastActiveBitInA = A.countr_zero();
14729 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14730 return LastActiveBitInA - 1 == FirstActiveBitInB;
14731}
14732
14733static SDValue FindBFIToCombineWith(SDNode *N) {
14734 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14735 APInt ToMask, FromMask;
14736 SDValue From = ParseBFI(N, ToMask, FromMask);
14737 SDValue To = N->getOperand(0);
14738
14739 SDValue V = To;
14740 if (V.getOpcode() != ARMISD::BFI)
14741 return SDValue();
14742
14743 APInt NewToMask, NewFromMask;
14744 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14745 if (NewFrom != From)
14746 return SDValue();
14747
14748 // Do the written bits conflict with any we've seen so far?
14749 if ((NewToMask & ToMask).getBoolValue())
14750 // Conflicting bits.
14751 return SDValue();
14752
14753 // Are the new bits contiguous when combined with the old bits?
14754 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14755 BitsProperlyConcatenate(FromMask, NewFromMask))
14756 return V;
14757 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14758 BitsProperlyConcatenate(NewFromMask, FromMask))
14759 return V;
14760
14761 return SDValue();
14762}
14763
14764static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG) {
14765 SDValue N0 = N->getOperand(0);
14766 SDValue N1 = N->getOperand(1);
14767
14768 if (N1.getOpcode() == ISD::AND) {
14769 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14770 // the bits being cleared by the AND are not demanded by the BFI.
14771 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
14772 if (!N11C)
14773 return SDValue();
14774 unsigned InvMask = N->getConstantOperandVal(2);
14775 unsigned LSB = llvm::countr_zero(~InvMask);
14776 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
14777 assert(Width <
14778 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
14779 "undefined behavior");
14780 unsigned Mask = (1u << Width) - 1;
14781 unsigned Mask2 = N11C->getZExtValue();
14782 if ((Mask & (~Mask2)) == 0)
14783 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
14784 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
14785 return SDValue();
14786 }
14787
14788 // Look for another BFI to combine with.
14789 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
14790 // We've found a BFI.
14791 APInt ToMask1, FromMask1;
14792 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
14793
14794 APInt ToMask2, FromMask2;
14795 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
14796 assert(From1 == From2);
14797 (void)From2;
14798
14799 // Create a new BFI, combining the two together.
14800 APInt NewFromMask = FromMask1 | FromMask2;
14801 APInt NewToMask = ToMask1 | ToMask2;
14802
14803 EVT VT = N->getValueType(0);
14804 SDLoc dl(N);
14805
14806 if (NewFromMask[0] == 0)
14807 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
14808 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
14809 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
14810 DAG.getConstant(~NewToMask, dl, VT));
14811 }
14812
14813 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
14814 // that lower bit insertions are performed first, providing that M1 and M2
14815 // do not overlap. This can allow multiple BFI instructions to be combined
14816 // together by the other folds above.
14817 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
14818 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
14819 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
14820
14821 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
14822 ToMask1.countl_zero() < ToMask2.countl_zero())
14823 return SDValue();
14824
14825 EVT VT = N->getValueType(0);
14826 SDLoc dl(N);
14827 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
14828 N->getOperand(1), N->getOperand(2));
14829 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
14830 N0.getOperand(2));
14831 }
14832
14833 return SDValue();
14834}
14835
14836// Check that N is CMPZ(CSINC(0, 0, CC, X)),
14837// or CMPZ(CMOV(1, 0, CC, X))
14838// return X if valid.
14839static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC) {
14840 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
14841 return SDValue();
14842 SDValue CSInc = Cmp->getOperand(0);
14843
14844 // Ignore any `And 1` nodes that may not yet have been removed. We are
14845 // looking for a value that produces 1/0, so these have no effect on the
14846 // code.
14847 while (CSInc.getOpcode() == ISD::AND &&
14848 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
14849 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
14850 CSInc = CSInc.getOperand(0);
14851
14852 if (CSInc.getOpcode() == ARMISD::CSINC &&
14853 isNullConstant(CSInc.getOperand(0)) &&
14854 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14855 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14856 return CSInc.getOperand(3);
14857 }
14858 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
14859 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
14860 CC = (ARMCC::CondCodes)CSInc.getConstantOperandVal(2);
14861 return CSInc.getOperand(3);
14862 }
14863 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
14864 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
14865 CC = ARMCC::getOppositeCondition(
14866 (ARMCC::CondCodes)CSInc.getConstantOperandVal(2));
14867 return CSInc.getOperand(3);
14868 }
14869 return SDValue();
14870}
14871
14872static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG) {
14873 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
14874 // t92: flags = ARMISD::CMPZ t74, 0
14875 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
14876 // t96: flags = ARMISD::CMPZ t93, 0
14877 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
14878 ARMCC::CondCodes Cond;
14879 if (SDValue C = IsCMPZCSINC(N, Cond))
14880 if (Cond == ARMCC::EQ)
14881 return C;
14882 return SDValue();
14883}
14884
14885static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG) {
14886 // Fold away an unnecessary CMPZ/CSINC
14887 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
14888 // if C1==EQ -> CSXYZ A, B, C2, D
14889 // if C1==NE -> CSXYZ A, B, NOT(C2), D
14890 ARMCC::CondCodes Cond;
14891 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
14892 if (N->getConstantOperandVal(2) == ARMCC::EQ)
14893 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14894 N->getOperand(1),
14895 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
14896 if (N->getConstantOperandVal(2) == ARMCC::NE)
14897 return DAG.getNode(
14898 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
14899 N->getOperand(1),
14900 DAG.getConstant(ARMCC::getOppositeCondition(Cond), SDLoc(N), MVT::i32), C);
14901 }
14902 return SDValue();
14903}
14904
14905/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
14906/// ARMISD::VMOVRRD.
14907static SDValue PerformVMOVRRDCombine(SDNode *N,
14908 TargetLowering::DAGCombinerInfo &DCI,
14909 const ARMSubtarget *Subtarget) {
14910 // vmovrrd(vmovdrr x, y) -> x,y
14911 SDValue InDouble = N->getOperand(0);
14912 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
14913 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
14914
14915 // vmovrrd(load f64) -> (load i32), (load i32)
14916 SDNode *InNode = InDouble.getNode();
14917 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
14918 InNode->getValueType(0) == MVT::f64 &&
14919 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
14920 !cast<LoadSDNode>(InNode)->isVolatile()) {
14921 // TODO: Should this be done for non-FrameIndex operands?
14922 LoadSDNode *LD = cast<LoadSDNode>(InNode);
14923
14924 SelectionDAG &DAG = DCI.DAG;
14925 SDLoc DL(LD);
14926 SDValue BasePtr = LD->getBasePtr();
14927 SDValue NewLD1 =
14928 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
14929 LD->getAlign(), LD->getMemOperand()->getFlags());
14930
14931 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
14932 DAG.getConstant(4, DL, MVT::i32));
14933
14934 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
14935 LD->getPointerInfo().getWithOffset(4),
14936 commonAlignment(LD->getAlign(), 4),
14937 LD->getMemOperand()->getFlags());
14938
14939 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
14940 if (DCI.DAG.getDataLayout().isBigEndian())
14941 std::swap (NewLD1, NewLD2);
14942 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
14943 return Result;
14944 }
14945
14946 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
14947 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
14948 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
14949 isa<ConstantSDNode>(InDouble.getOperand(1))) {
14950 SDValue BV = InDouble.getOperand(0);
14951 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
14952 // change lane order under big endian.
14953 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
14954 while (
14955 (BV.getOpcode() == ISD::BITCAST ||
14956 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
14957 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
14958 BVSwap = BV.getOpcode() == ISD::BITCAST;
14959 BV = BV.getOperand(0);
14960 }
14961 if (BV.getValueType() != MVT::v4i32)
14962 return SDValue();
14963
14964 // Handle buildvectors, pulling out the correct lane depending on
14965 // endianness.
14966 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
14967 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14968 SDValue Op0 = BV.getOperand(Offset);
14969 SDValue Op1 = BV.getOperand(Offset + 1);
14970 if (!Subtarget->isLittle() && BVSwap)
14971 std::swap(Op0, Op1);
14972
14973 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14974 }
14975
14976 // A chain of insert_vectors, grabbing the correct value of the chain of
14977 // inserts.
14978 SDValue Op0, Op1;
14979 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
14980 if (isa<ConstantSDNode>(BV.getOperand(2))) {
14981 if (BV.getConstantOperandVal(2) == Offset && !Op0)
14982 Op0 = BV.getOperand(1);
14983 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
14984 Op1 = BV.getOperand(1);
14985 }
14986 BV = BV.getOperand(0);
14987 }
14988 if (!Subtarget->isLittle() && BVSwap)
14989 std::swap(Op0, Op1);
14990 if (Op0 && Op1)
14991 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
14992 }
14993
14994 return SDValue();
14995}
14996
14997/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
14998/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
14999static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
15000 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15001 SDValue Op0 = N->getOperand(0);
15002 SDValue Op1 = N->getOperand(1);
15003 if (Op0.getOpcode() == ISD::BITCAST)
15004 Op0 = Op0.getOperand(0);
15005 if (Op1.getOpcode() == ISD::BITCAST)
15006 Op1 = Op1.getOperand(0);
15007 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15008 Op0.getNode() == Op1.getNode() &&
15009 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15010 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15011 N->getValueType(0), Op0.getOperand(0));
15012 return SDValue();
15013}
15014
15015static SDValue PerformVMOVhrCombine(SDNode *N,
15016 TargetLowering::DAGCombinerInfo &DCI) {
15017 SDValue Op0 = N->getOperand(0);
15018
15019 // VMOVhr (VMOVrh (X)) -> X
15020 if (Op0->getOpcode() == ARMISD::VMOVrh)
15021 return Op0->getOperand(0);
15022
15023 // FullFP16: half values are passed in S-registers, and we don't
15024 // need any of the bitcast and moves:
15025 //
15026 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15027 // t5: i32 = bitcast t2
15028 // t18: f16 = ARMISD::VMOVhr t5
15029 // =>
15030 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15031 if (Op0->getOpcode() == ISD::BITCAST) {
15032 SDValue Copy = Op0->getOperand(0);
15033 if (Copy.getValueType() == MVT::f32 &&
15034 Copy->getOpcode() == ISD::CopyFromReg) {
15035 bool HasGlue = Copy->getNumOperands() == 3;
15036 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15037 HasGlue ? Copy->getOperand(2) : SDValue()};
15038 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
15039 SDValue NewCopy =
15041 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15042 ArrayRef(Ops, HasGlue ? 3 : 2));
15043
15044 // Update Users, Chains, and Potential Glue.
15045 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15046 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15047 if (HasGlue)
15048 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15049 NewCopy.getValue(2));
15050
15051 return NewCopy;
15052 }
15053 }
15054
15055 // fold (VMOVhr (load x)) -> (load (f16*)x)
15056 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15057 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15058 LN0->getMemoryVT() == MVT::i16) {
15059 SDValue Load =
15060 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15061 LN0->getBasePtr(), LN0->getMemOperand());
15062 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15063 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15064 return Load;
15065 }
15066 }
15067
15068 // Only the bottom 16 bits of the source register are used.
15069 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15070 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15071 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15072 return SDValue(N, 0);
15073
15074 return SDValue();
15075}
15076
15077static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG) {
15078 SDValue N0 = N->getOperand(0);
15079 EVT VT = N->getValueType(0);
15080
15081 // fold (VMOVrh (fpconst x)) -> const x
15082 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N0)) {
15083 APFloat V = C->getValueAPF();
15084 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15085 }
15086
15087 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15088 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15089 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15090
15091 SDValue Load =
15092 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15093 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
15094 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15095 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15096 return Load;
15097 }
15098
15099 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
15100 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15101 isa<ConstantSDNode>(N0->getOperand(1)))
15102 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15103 N0->getOperand(1));
15104
15105 return SDValue();
15106}
15107
15108/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15109/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15110/// i64 vector to have f64 elements, since the value can then be loaded
15111/// directly into a VFP register.
15112static bool hasNormalLoadOperand(SDNode *N) {
15113 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15114 for (unsigned i = 0; i < NumElts; ++i) {
15115 SDNode *Elt = N->getOperand(i).getNode();
15116 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15117 return true;
15118 }
15119 return false;
15120}
15121
15122/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15123/// ISD::BUILD_VECTOR.
15124static SDValue PerformBUILD_VECTORCombine(SDNode *N,
15125 TargetLowering::DAGCombinerInfo &DCI,
15126 const ARMSubtarget *Subtarget) {
15127 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15128 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15129 // into a pair of GPRs, which is fine when the value is used as a scalar,
15130 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15131 SelectionDAG &DAG = DCI.DAG;
15132 if (N->getNumOperands() == 2)
15133 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15134 return RV;
15135
15136 // Load i64 elements as f64 values so that type legalization does not split
15137 // them up into i32 values.
15138 EVT VT = N->getValueType(0);
15139 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15140 return SDValue();
15141 SDLoc dl(N);
15142 SmallVector<SDValue, 8> Ops;
15143 unsigned NumElts = VT.getVectorNumElements();
15144 for (unsigned i = 0; i < NumElts; ++i) {
15145 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15146 Ops.push_back(V);
15147 // Make the DAGCombiner fold the bitcast.
15148 DCI.AddToWorklist(V.getNode());
15149 }
15150 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15151 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15152 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15153}
15154
15155/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15156static SDValue
15157PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15158 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15159 // At that time, we may have inserted bitcasts from integer to float.
15160 // If these bitcasts have survived DAGCombine, change the lowering of this
15161 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15162 // force to use floating point types.
15163
15164 // Make sure we can change the type of the vector.
15165 // This is possible iff:
15166 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15167 // 1.1. Vector is used only once.
15168 // 1.2. Use is a bit convert to an integer type.
15169 // 2. The size of its operands are 32-bits (64-bits are not legal).
15170 EVT VT = N->getValueType(0);
15171 EVT EltVT = VT.getVectorElementType();
15172
15173 // Check 1.1. and 2.
15174 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15175 return SDValue();
15176
15177 // By construction, the input type must be float.
15178 assert(EltVT == MVT::f32 && "Unexpected type!");
15179
15180 // Check 1.2.
15181 SDNode *Use = *N->user_begin();
15182 if (Use->getOpcode() != ISD::BITCAST ||
15183 Use->getValueType(0).isFloatingPoint())
15184 return SDValue();
15185
15186 // Check profitability.
15187 // Model is, if more than half of the relevant operands are bitcast from
15188 // i32, turn the build_vector into a sequence of insert_vector_elt.
15189 // Relevant operands are everything that is not statically
15190 // (i.e., at compile time) bitcasted.
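 // For example, in a v4f32 build_vector with one constant operand and three
 // operands bitcast from i32, NumOfRelevantElts is 3 and NumOfBitCastedElts is 3,
 // so the rewrite below is considered profitable.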
15191 unsigned NumOfBitCastedElts = 0;
15192 unsigned NumElts = VT.getVectorNumElements();
15193 unsigned NumOfRelevantElts = NumElts;
15194 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15195 SDValue Elt = N->getOperand(Idx);
15196 if (Elt->getOpcode() == ISD::BITCAST) {
15197 // Assume only bit cast to i32 will go away.
15198 if (Elt->getOperand(0).getValueType() == MVT::i32)
15199 ++NumOfBitCastedElts;
15200 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15201 // Constants are statically casted, thus do not count them as
15202 // relevant operands.
15203 --NumOfRelevantElts;
15204 }
15205
15206 // Check if more than half of the elements require a non-free bitcast.
15207 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15208 return SDValue();
15209
15210 SelectionDAG &DAG = DCI.DAG;
15211 // Create the new vector type.
15212 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15213 // Check if the type is legal.
15214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15215 if (!TLI.isTypeLegal(VecVT))
15216 return SDValue();
15217
15218 // Combine:
15219 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15220 // => BITCAST INSERT_VECTOR_ELT
15221 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15222 // (BITCAST EN), N.
15223 SDValue Vec = DAG.getUNDEF(VecVT);
15224 SDLoc dl(N);
15225 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15226 SDValue V = N->getOperand(Idx);
15227 if (V.isUndef())
15228 continue;
15229 if (V.getOpcode() == ISD::BITCAST &&
15230 V->getOperand(0).getValueType() == MVT::i32)
15231 // Fold obvious case.
15232 V = V.getOperand(0);
15233 else {
15234 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15235 // Make the DAGCombiner fold the bitcasts.
15236 DCI.AddToWorklist(V.getNode());
15237 }
15238 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15239 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15240 }
15241 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15242 // Make the DAGCombiner fold the bitcasts.
15243 DCI.AddToWorklist(Vec.getNode());
15244 return Vec;
15245}
15246
15247static SDValue
15248PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15249 EVT VT = N->getValueType(0);
15250 SDValue Op = N->getOperand(0);
15251 SDLoc dl(N);
15252
15253 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15254 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15255 // If the valuetypes are the same, we can remove the cast entirely.
15256 if (Op->getOperand(0).getValueType() == VT)
15257 return Op->getOperand(0);
15258 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15259 }
15260
15261 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15262 // more VPNOT which might get folded as else predicates.
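 // Only the bottom 16 bits of the i32 carry predicate state, which is why the
 // constant used below is 65535 rather than -1.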
15263 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15264 SDValue X =
15265 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15266 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15267 DCI.DAG.getConstant(65535, dl, MVT::i32));
15268 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15269 }
15270
15271 // Only the bottom 16 bits of the source register are used.
15272 if (Op.getValueType() == MVT::i32) {
15273 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15274 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15275 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15276 return SDValue(N, 0);
15277 }
15278 return SDValue();
15279}
15280
15281static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG,
15282 const ARMSubtarget *ST) {
15283 EVT VT = N->getValueType(0);
15284 SDValue Op = N->getOperand(0);
15285 SDLoc dl(N);
15286
15287 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15288 if (ST->isLittle())
15289 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15290
15291 // VT VECTOR_REG_CAST (VT Op) -> Op
15292 if (Op.getValueType() == VT)
15293 return Op;
15294 // VECTOR_REG_CAST undef -> undef
15295 if (Op.isUndef())
15296 return DAG.getUNDEF(VT);
15297
15298 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15299 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15300 // If the valuetypes are the same, we can remove the cast entirely.
15301 if (Op->getOperand(0).getValueType() == VT)
15302 return Op->getOperand(0);
15303 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15304 }
15305
15306 return SDValue();
15307}
15308
15309static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG,
15310 const ARMSubtarget *Subtarget) {
15311 if (!Subtarget->hasMVEIntegerOps())
15312 return SDValue();
15313
15314 EVT VT = N->getValueType(0);
15315 SDValue Op0 = N->getOperand(0);
15316 SDValue Op1 = N->getOperand(1);
15317 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15318 SDLoc dl(N);
15319
15320 // vcmp X, 0, cc -> vcmpz X, cc
15321 if (isZeroVector(Op1))
15322 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15323
15324 unsigned SwappedCond = getSwappedCondition(Cond);
15325 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15326 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15327 if (isZeroVector(Op0))
15328 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15329 DAG.getConstant(SwappedCond, dl, MVT::i32));
15330 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15331 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15332 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15333 DAG.getConstant(SwappedCond, dl, MVT::i32));
15334 }
15335
15336 return SDValue();
15337}
15338
15339/// PerformInsertEltCombine - Target-specific dag combine xforms for
15340/// ISD::INSERT_VECTOR_ELT.
15341static SDValue PerformInsertEltCombine(SDNode *N,
15342 TargetLowering::DAGCombinerInfo &DCI) {
15343 // Bitcast an i64 load inserted into a vector to f64.
15344 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15345 EVT VT = N->getValueType(0);
15346 SDNode *Elt = N->getOperand(1).getNode();
15347 if (VT.getVectorElementType() != MVT::i64 ||
15348 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15349 return SDValue();
15350
15351 SelectionDAG &DAG = DCI.DAG;
15352 SDLoc dl(N);
15353 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15354 VT.getVectorNumElements());
15355 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15356 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15357 // Make the DAGCombiner fold the bitcasts.
15358 DCI.AddToWorklist(Vec.getNode());
15359 DCI.AddToWorklist(V.getNode());
15360 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15361 Vec, V, N->getOperand(2));
15362 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15363}
15364
15365// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15366// directly or bitcast to an integer if the original is a float vector.
15367// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15368// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
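// For example, (i32 extract v4i32 X, 2) and (i32 extract v4i32 X, 3) become the
// two results of VMOVRRD(extract v2f64 (vector_reg_cast X), 1).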
15369static SDValue
15370PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15371 EVT VT = N->getValueType(0);
15372 SDLoc dl(N);
15373
15374 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15375 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15376 return SDValue();
15377
15378 SDValue Ext = SDValue(N, 0);
15379 if (Ext.getOpcode() == ISD::BITCAST &&
15380 Ext.getOperand(0).getValueType() == MVT::f32)
15381 Ext = Ext.getOperand(0);
15382 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15383 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
15384 Ext.getConstantOperandVal(1) % 2 != 0)
15385 return SDValue();
15386 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15387 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15388 return SDValue();
15389
15390 SDValue Op0 = Ext.getOperand(0);
15391 EVT VecVT = Op0.getValueType();
15392 unsigned ResNo = Op0.getResNo();
15393 unsigned Lane = Ext.getConstantOperandVal(1);
15394 if (VecVT.getVectorNumElements() != 4)
15395 return SDValue();
15396
15397 // Find another extract, of Lane + 1
15398 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15399 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15400 isa<ConstantSDNode>(V->getOperand(1)) &&
15401 V->getConstantOperandVal(1) == Lane + 1 &&
15402 V->getOperand(0).getResNo() == ResNo;
15403 });
15404 if (OtherIt == Op0->users().end())
15405 return SDValue();
15406
15407 // For float extracts, we need to be converting to an i32 for both vector
15408 // lanes.
15409 SDValue OtherExt(*OtherIt, 0);
15410 if (OtherExt.getValueType() != MVT::i32) {
15411 if (!OtherExt->hasOneUse() ||
15412 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15413 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15414 return SDValue();
15415 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15416 }
15417
15418 // Convert the type to a f64 and extract with a VMOVRRD.
15419 SDValue F64 = DCI.DAG.getNode(
15420 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15421 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15422 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15423 SDValue VMOVRRD =
15424 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15425
15426 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15427 return VMOVRRD;
15428}
15429
15430static SDValue PerformExtractEltCombine(SDNode *N,
15431 TargetLowering::DAGCombinerInfo &DCI,
15432 const ARMSubtarget *ST) {
15433 SDValue Op0 = N->getOperand(0);
15434 EVT VT = N->getValueType(0);
15435 SDLoc dl(N);
15436
15437 // extract (vdup x) -> x
15438 if (Op0->getOpcode() == ARMISD::VDUP) {
15439 SDValue X = Op0->getOperand(0);
15440 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15441 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15442 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15443 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15444 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15445 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15446
15447 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15448 X = X->getOperand(0);
15449 if (X.getValueType() == VT)
15450 return X;
15451 }
15452
15453 // extract ARM_BUILD_VECTOR -> x
15454 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15455 isa<ConstantSDNode>(N->getOperand(1)) &&
15456 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15457 return Op0.getOperand(N->getConstantOperandVal(1));
15458 }
15459
15460 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15461 if (Op0.getValueType() == MVT::v4i32 &&
15462 isa<ConstantSDNode>(N->getOperand(1)) &&
15463 Op0.getOpcode() == ISD::BITCAST &&
15464 Op0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
15465 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15466 SDValue BV = Op0.getOperand(0);
15467 unsigned Offset = N->getConstantOperandVal(1);
15468 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
15469 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15470 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15471 }
15472
15473 // extract x, n; extract x, n+1 -> VMOVRRD x
15474 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15475 return R;
15476
15477 // extract (MVETrunc(x)) -> extract x
15478 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15479 unsigned Idx = N->getConstantOperandVal(1);
15480 unsigned Vec =
15481 Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
15482 unsigned SubIdx =
15483 Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
15484 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15485 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15486 }
15487
15488 return SDValue();
15489}
15490
15491static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG) {
15492 SDValue Op = N->getOperand(0);
15493 EVT VT = N->getValueType(0);
15494
15495 // sext_inreg(VGETLANEu) -> VGETLANEs
15496 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15497 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15498 Op.getOperand(0).getValueType().getScalarType())
15499 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15500 Op.getOperand(1));
15501
15502 return SDValue();
15503}
15504
15505static SDValue
15506PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
15507 SDValue Vec = N->getOperand(0);
15508 SDValue SubVec = N->getOperand(1);
15509 uint64_t IdxVal = N->getConstantOperandVal(2);
15510 EVT VecVT = Vec.getValueType();
15511 EVT SubVT = SubVec.getValueType();
15512
15513 // Only do this for legal fixed vector types.
15514 if (!VecVT.isFixedLengthVector() ||
15515 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15516 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15517 return SDValue();
15518
15519 // Ignore widening patterns.
15520 if (IdxVal == 0 && Vec.isUndef())
15521 return SDValue();
15522
15523 // Subvector must be half the width and an "aligned" insertion.
15524 unsigned NumSubElts = SubVT.getVectorNumElements();
15525 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15526 (IdxVal != 0 && IdxVal != NumSubElts))
15527 return SDValue();
15528
15529 // Fold insert_subvector -> concat_vectors
15530 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15531 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
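 // For example, insert_subvector(v8i16 Vec, v4i16 Sub, 4) becomes
 // concat_vectors(extract_subvector(Vec, 0), Sub).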
15532 SDLoc DL(N);
15533 SDValue Lo, Hi;
15534 if (IdxVal == 0) {
15535 Lo = SubVec;
15536 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15537 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15538 } else {
15539 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15540 DCI.DAG.getVectorIdxConstant(0, DL));
15541 Hi = SubVec;
15542 }
15543 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15544}
15545
15546// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
15547static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N,
15548 SelectionDAG &DAG) {
15549 SDValue Trunc = N->getOperand(0);
15550 EVT VT = Trunc.getValueType();
15551 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15552 return SDValue();
15553
15554 SDLoc DL(Trunc);
15555 if (isVMOVNTruncMask(N->getMask(), VT, false))
15556 return DAG.getNode(
15557 ARMISD::VMOVN, DL, VT,
15558 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15559 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15560 DAG.getConstant(1, DL, MVT::i32));
15561 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15562 return DAG.getNode(
15563 ARMISD::VMOVN, DL, VT,
15564 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15565 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15566 DAG.getConstant(1, DL, MVT::i32));
15567 return SDValue();
15568}
15569
15570/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15571/// ISD::VECTOR_SHUFFLE.
15572static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
15573 if (SDValue R = PerformShuffleVMOVNCombine(cast<ShuffleVectorSDNode>(N), DAG))
15574 return R;
15575
15576 // The LLVM shufflevector instruction does not require the shuffle mask
15577 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15578 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15579 // operands do not match the mask length, they are extended by concatenating
15580 // them with undef vectors. That is probably the right thing for other
15581 // targets, but for NEON it is better to concatenate two double-register
15582 // size vector operands into a single quad-register size vector. Do that
15583 // transformation here:
15584 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15585 // shuffle(concat(v1, v2), undef)
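 // For example, with v8i16 operands and mask <0,1,8,9,-1,-1,-1,-1>, elements 8..11
 // of the old operand pair map to lanes 4..7 of concat(v1, v2), giving the new
 // mask <0,1,4,5,-1,-1,-1,-1>.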
15586 SDValue Op0 = N->getOperand(0);
15587 SDValue Op1 = N->getOperand(1);
15588 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15589 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15590 Op0.getNumOperands() != 2 ||
15591 Op1.getNumOperands() != 2)
15592 return SDValue();
15593 SDValue Concat0Op1 = Op0.getOperand(1);
15594 SDValue Concat1Op1 = Op1.getOperand(1);
15595 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15596 return SDValue();
15597 // Skip the transformation if any of the types are illegal.
15598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15599 EVT VT = N->getValueType(0);
15600 if (!TLI.isTypeLegal(VT) ||
15601 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15602 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15603 return SDValue();
15604
15605 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15606 Op0.getOperand(0), Op1.getOperand(0));
15607 // Translate the shuffle mask.
15608 SmallVector<int, 16> NewMask;
15609 unsigned NumElts = VT.getVectorNumElements();
15610 unsigned HalfElts = NumElts/2;
15611 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
15612 for (unsigned n = 0; n < NumElts; ++n) {
15613 int MaskElt = SVN->getMaskElt(n);
15614 int NewElt = -1;
15615 if (MaskElt < (int)HalfElts)
15616 NewElt = MaskElt;
15617 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15618 NewElt = HalfElts + MaskElt - NumElts;
15619 NewMask.push_back(NewElt);
15620 }
15621 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15622 DAG.getUNDEF(VT), NewMask);
15623}
15624
15625/// Load/store instruction that can be merged with a base address
15626/// update
15627struct BaseUpdateTarget {
15628 SDNode *N;
15629 bool isIntrinsic;
15630 bool isStore;
15631 unsigned AddrOpIdx;
15632};
15633
15634struct BaseUpdateUser {
15635 /// Instruction that updates a pointer
15636 SDNode *N;
15637 /// Pointer increment operand
15638 SDValue Inc;
15639 /// Pointer increment value if it is a constant, or 0 otherwise
15640 unsigned ConstInc;
15641};
15642
15643static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15644 // Check that the add is independent of the load/store.
15645 // Otherwise, folding it would create a cycle. Search through Addr
15646 // as well, since the User may not be a direct user of Addr and
15647 // only share a base pointer.
15648 SmallPtrSet<const SDNode *, 32> Visited;
15649 SmallVector<const SDNode *, 16> Worklist;
15650 Worklist.push_back(N);
15651 Worklist.push_back(User);
15652 const unsigned MaxSteps = 1024;
15653 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15654 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15655 return false;
15656 return true;
15657}
15658
15659static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
15660 struct BaseUpdateUser &User,
15661 bool SimpleConstIncOnly,
15662 TargetLowering::DAGCombinerInfo &DCI) {
15663 SelectionDAG &DAG = DCI.DAG;
15664 SDNode *N = Target.N;
15665 MemSDNode *MemN = cast<MemSDNode>(N);
15666 SDLoc dl(N);
15667
15668 // Find the new opcode for the updating load/store.
15669 bool isLoadOp = true;
15670 bool isLaneOp = false;
15671 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15672 // as an operand.
15673 bool hasAlignment = true;
15674 unsigned NewOpc = 0;
15675 unsigned NumVecs = 0;
15676 if (Target.isIntrinsic) {
15677 unsigned IntNo = N->getConstantOperandVal(1);
15678 switch (IntNo) {
15679 default:
15680 llvm_unreachable("unexpected intrinsic for Neon base update");
15681 case Intrinsic::arm_neon_vld1:
15682 NewOpc = ARMISD::VLD1_UPD;
15683 NumVecs = 1;
15684 break;
15685 case Intrinsic::arm_neon_vld2:
15686 NewOpc = ARMISD::VLD2_UPD;
15687 NumVecs = 2;
15688 break;
15689 case Intrinsic::arm_neon_vld3:
15690 NewOpc = ARMISD::VLD3_UPD;
15691 NumVecs = 3;
15692 break;
15693 case Intrinsic::arm_neon_vld4:
15694 NewOpc = ARMISD::VLD4_UPD;
15695 NumVecs = 4;
15696 break;
15697 case Intrinsic::arm_neon_vld1x2:
15698 NewOpc = ARMISD::VLD1x2_UPD;
15699 NumVecs = 2;
15700 hasAlignment = false;
15701 break;
15702 case Intrinsic::arm_neon_vld1x3:
15703 NewOpc = ARMISD::VLD1x3_UPD;
15704 NumVecs = 3;
15705 hasAlignment = false;
15706 break;
15707 case Intrinsic::arm_neon_vld1x4:
15708 NewOpc = ARMISD::VLD1x4_UPD;
15709 NumVecs = 4;
15710 hasAlignment = false;
15711 break;
15712 case Intrinsic::arm_neon_vld2dup:
15713 NewOpc = ARMISD::VLD2DUP_UPD;
15714 NumVecs = 2;
15715 break;
15716 case Intrinsic::arm_neon_vld3dup:
15717 NewOpc = ARMISD::VLD3DUP_UPD;
15718 NumVecs = 3;
15719 break;
15720 case Intrinsic::arm_neon_vld4dup:
15721 NewOpc = ARMISD::VLD4DUP_UPD;
15722 NumVecs = 4;
15723 break;
15724 case Intrinsic::arm_neon_vld2lane:
15725 NewOpc = ARMISD::VLD2LN_UPD;
15726 NumVecs = 2;
15727 isLaneOp = true;
15728 break;
15729 case Intrinsic::arm_neon_vld3lane:
15730 NewOpc = ARMISD::VLD3LN_UPD;
15731 NumVecs = 3;
15732 isLaneOp = true;
15733 break;
15734 case Intrinsic::arm_neon_vld4lane:
15735 NewOpc = ARMISD::VLD4LN_UPD;
15736 NumVecs = 4;
15737 isLaneOp = true;
15738 break;
15739 case Intrinsic::arm_neon_vst1:
15740 NewOpc = ARMISD::VST1_UPD;
15741 NumVecs = 1;
15742 isLoadOp = false;
15743 break;
15744 case Intrinsic::arm_neon_vst2:
15745 NewOpc = ARMISD::VST2_UPD;
15746 NumVecs = 2;
15747 isLoadOp = false;
15748 break;
15749 case Intrinsic::arm_neon_vst3:
15750 NewOpc = ARMISD::VST3_UPD;
15751 NumVecs = 3;
15752 isLoadOp = false;
15753 break;
15754 case Intrinsic::arm_neon_vst4:
15755 NewOpc = ARMISD::VST4_UPD;
15756 NumVecs = 4;
15757 isLoadOp = false;
15758 break;
15759 case Intrinsic::arm_neon_vst2lane:
15760 NewOpc = ARMISD::VST2LN_UPD;
15761 NumVecs = 2;
15762 isLoadOp = false;
15763 isLaneOp = true;
15764 break;
15765 case Intrinsic::arm_neon_vst3lane:
15766 NewOpc = ARMISD::VST3LN_UPD;
15767 NumVecs = 3;
15768 isLoadOp = false;
15769 isLaneOp = true;
15770 break;
15771 case Intrinsic::arm_neon_vst4lane:
15772 NewOpc = ARMISD::VST4LN_UPD;
15773 NumVecs = 4;
15774 isLoadOp = false;
15775 isLaneOp = true;
15776 break;
15777 case Intrinsic::arm_neon_vst1x2:
15778 NewOpc = ARMISD::VST1x2_UPD;
15779 NumVecs = 2;
15780 isLoadOp = false;
15781 hasAlignment = false;
15782 break;
15783 case Intrinsic::arm_neon_vst1x3:
15784 NewOpc = ARMISD::VST1x3_UPD;
15785 NumVecs = 3;
15786 isLoadOp = false;
15787 hasAlignment = false;
15788 break;
15789 case Intrinsic::arm_neon_vst1x4:
15790 NewOpc = ARMISD::VST1x4_UPD;
15791 NumVecs = 4;
15792 isLoadOp = false;
15793 hasAlignment = false;
15794 break;
15795 }
15796 } else {
15797 isLaneOp = true;
15798 switch (N->getOpcode()) {
15799 default:
15800 llvm_unreachable("unexpected opcode for Neon base update");
15801 case ARMISD::VLD1DUP:
15802 NewOpc = ARMISD::VLD1DUP_UPD;
15803 NumVecs = 1;
15804 break;
15805 case ARMISD::VLD2DUP:
15806 NewOpc = ARMISD::VLD2DUP_UPD;
15807 NumVecs = 2;
15808 break;
15809 case ARMISD::VLD3DUP:
15810 NewOpc = ARMISD::VLD3DUP_UPD;
15811 NumVecs = 3;
15812 break;
15813 case ARMISD::VLD4DUP:
15814 NewOpc = ARMISD::VLD4DUP_UPD;
15815 NumVecs = 4;
15816 break;
15817 case ISD::LOAD:
15818 NewOpc = ARMISD::VLD1_UPD;
15819 NumVecs = 1;
15820 isLaneOp = false;
15821 break;
15822 case ISD::STORE:
15823 NewOpc = ARMISD::VST1_UPD;
15824 NumVecs = 1;
15825 isLaneOp = false;
15826 isLoadOp = false;
15827 break;
15828 }
15829 }
15830
15831 // Find the size of memory referenced by the load/store.
15832 EVT VecTy;
15833 if (isLoadOp) {
15834 VecTy = N->getValueType(0);
15835 } else if (Target.isIntrinsic) {
15836 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
15837 } else {
15838 assert(Target.isStore &&
15839 "Node has to be a load, a store, or an intrinsic!");
15840 VecTy = N->getOperand(1).getValueType();
15841 }
15842
15843 bool isVLDDUPOp =
15844 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
15845 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
15846
15847 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
15848 if (isLaneOp || isVLDDUPOp)
15849 NumBytes /= VecTy.getVectorNumElements();
15850
15851 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
15852 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
15853 // separate instructions that make it harder to use a non-constant update.
15854 return false;
15855 }
15856
15857 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
15858 return false;
15859
15860 if (!isValidBaseUpdate(N, User.N))
15861 return false;
15862
15863 // OK, we found an ADD we can fold into the base update.
15864 // Now, create a _UPD node, taking care of not breaking alignment.
15865
15866 EVT AlignedVecTy = VecTy;
15867 Align Alignment = MemN->getAlign();
15868
15869 // If this is a less-than-standard-aligned load/store, change the type to
15870 // match the standard alignment.
15871 // The alignment is overlooked when selecting _UPD variants; and it's
15872 // easier to introduce bitcasts here than fix that.
15873 // There are 3 ways to get to this base-update combine:
15874 // - intrinsics: they are assumed to be properly aligned (to the standard
15875 // alignment of the memory type), so we don't need to do anything.
15876 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
15877 // intrinsics, so, likewise, there's nothing to do.
15878 // - generic load/store instructions: the alignment is specified as an
15879 // explicit operand, rather than implicitly as the standard alignment
15880 // of the memory type (like the intrisics). We need to change the
15881 // memory type to match the explicit alignment. That way, we don't
15882 // generate non-standard-aligned ARMISD::VLDx nodes.
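  // Illustrative example (not from the original source): a v2i64 load that is
  // only 4-byte aligned is retyped here as v4i32, so the combine never creates
  // an ARMISD::VLD1_UPD whose element type implies a stricter alignment than
  // the original access actually had.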
15883 if (isa<LSBaseSDNode>(N)) {
15884 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
15885 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
15886 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
15887 assert(!isLaneOp && "Unexpected generic load/store lane.");
15888 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
15889 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
15890 }
15891 // Don't set an explicit alignment on regular load/stores that we want
15892 // to transform to VLD/VST 1_UPD nodes.
15893 // This matches the behavior of regular load/stores, which only get an
15894 // explicit alignment if the MMO alignment is larger than the standard
15895 // alignment of the memory type.
15896 // Intrinsics, however, always get an explicit alignment, set to the
15897 // alignment of the MMO.
15898 Alignment = Align(1);
15899 }
15900
15901 // Create the new updating load/store node.
15902 // First, create an SDVTList for the new updating node's results.
15903 EVT Tys[6];
15904 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
15905 unsigned n;
15906 for (n = 0; n < NumResultVecs; ++n)
15907 Tys[n] = AlignedVecTy;
15908 Tys[n++] = MVT::i32;
15909 Tys[n] = MVT::Other;
15910 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
15911
15912 // Then, gather the new node's operands.
15913 SmallVector<SDValue, 8> Ops;
15914 Ops.push_back(N->getOperand(0)); // incoming chain
15915 Ops.push_back(N->getOperand(Target.AddrOpIdx));
15916 Ops.push_back(User.Inc);
15917
15918 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
15919 // Try to match the intrinsic's signature
15920 Ops.push_back(StN->getValue());
15921 } else {
15922 // Loads (and of course intrinsics) match the intrinsics' signature,
15923 // so just add all but the alignment operand.
15924 unsigned LastOperand =
15925 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
15926 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
15927 Ops.push_back(N->getOperand(i));
15928 }
15929
15930 // For all node types, the alignment operand is always the last one.
15931 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
15932
15933 // If this is a non-standard-aligned STORE, the penultimate operand is the
15934 // stored value. Bitcast it to the aligned type.
15935 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
15936 SDValue &StVal = Ops[Ops.size() - 2];
15937 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
15938 }
15939
15940 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
15941 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
15942 MemN->getMemOperand());
15943
15944 // Update the uses.
15945 SmallVector<SDValue, 5> NewResults;
15946 for (unsigned i = 0; i < NumResultVecs; ++i)
15947 NewResults.push_back(SDValue(UpdN.getNode(), i));
15948
15949 // If this is a non-standard-aligned LOAD, the first result is the loaded
15950 // value. Bitcast it to the expected result type.
15951 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
15952 SDValue &LdVal = NewResults[0];
15953 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
15954 }
15955
15956 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
15957 DCI.CombineTo(N, NewResults);
15958 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
15959
15960 return true;
15961}
15962
15963 // If (opcode ptr inc) is an ADD-like instruction, return the
15964// increment value. Otherwise return 0.
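// Illustrative examples (not from the original source):
//   (ADD ptr, 16)  -> 16
//   (OR  ptr, 8)   -> 8, but only when the DAG proves ptr and 8 share no set
//                    bits (so the OR behaves like an ADD)
//   (ADD ptr, %r)  -> 0, since the increment is not a constant
// A VLD1_UPD node also reports its own constant increment operand.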
15965static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
15966 SDValue Inc, const SelectionDAG &DAG) {
15967 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
15968 if (!CInc)
15969 return 0;
15970
15971 switch (Opcode) {
15972 case ARMISD::VLD1_UPD:
15973 case ISD::ADD:
15974 return CInc->getZExtValue();
15975 case ISD::OR: {
15976 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
15977 // (OR ptr inc) is the same as (ADD ptr inc)
15978 return CInc->getZExtValue();
15979 }
15980 return 0;
15981 }
15982 default:
15983 return 0;
15984 }
15985}
15986
15987 static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
15988 switch (N->getOpcode()) {
15989 case ISD::ADD:
15990 case ISD::OR: {
15991 if (isa<ConstantSDNode>(N->getOperand(1))) {
15992 *Ptr = N->getOperand(0);
15993 *CInc = N->getOperand(1);
15994 return true;
15995 }
15996 return false;
15997 }
15998 case ARMISD::VLD1_UPD: {
15999 if (isa<ConstantSDNode>(N->getOperand(2))) {
16000 *Ptr = N->getOperand(1);
16001 *CInc = N->getOperand(2);
16002 return true;
16003 }
16004 return false;
16005 }
16006 default:
16007 return false;
16008 }
16009}
16010
16011/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16012/// NEON load/store intrinsics, and generic vector load/stores, to merge
16013/// base address updates.
16014/// For generic load/stores, the memory type is assumed to be a vector.
16015/// The caller is assumed to have checked legality.
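/// Illustrative example (not from the original source):
///   vld1.32 {d16}, [r0]
///   add     r0, r0, #8
/// can be merged into the post-indexed form
///   vld1.32 {d16}, [r0]!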
16016 static SDValue CombineBaseUpdate(SDNode *N,
16017 TargetLowering::DAGCombinerInfo &DCI) {
16018 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16019 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16020 const bool isStore = N->getOpcode() == ISD::STORE;
16021 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16022 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16023
16024 // Limit the number of possible base-updates we look at to prevent degenerate
16025 // cases.
16026 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16027
16028 SDValue Addr = N->getOperand(AddrOpIdx);
16029
16029 
16030 SmallVector<BaseUpdateUser, 8> BaseUpdates;
16031 
16032 // Search for a use of the address operand that is an increment.
16033 for (SDUse &Use : Addr->uses()) {
16034 SDNode *User = Use.getUser();
16035 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16036 continue;
16037
16038 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16039 unsigned ConstInc =
16040 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16041
16042 if (ConstInc || User->getOpcode() == ISD::ADD) {
16043 BaseUpdates.push_back({User, Inc, ConstInc});
16044 if (BaseUpdates.size() >= MaxBaseUpdates)
16045 break;
16046 }
16047 }
16048
16049 // If the address is a constant pointer increment itself, find
16050 // another constant increment that has the same base operand
16051 SDValue Base;
16052 SDValue CInc;
16053 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16054 unsigned Offset =
16055 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16056 for (SDUse &Use : Base->uses()) {
16057
16058 SDNode *User = Use.getUser();
16059 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16060 User->getNumOperands() != 2)
16061 continue;
16062
16063 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16064 unsigned UserOffset =
16065 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16066
16067 if (!UserOffset || UserOffset <= Offset)
16068 continue;
16069
16070 unsigned NewConstInc = UserOffset - Offset;
16071 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16072 BaseUpdates.push_back({User, NewInc, NewConstInc});
16073 if (BaseUpdates.size() >= MaxBaseUpdates)
16074 break;
16075 }
16076 }
16077
16078 // Try to fold the load/store with an update that matches memory
16079 // access size. This should work well for sequential loads.
16080 unsigned NumValidUpd = BaseUpdates.size();
16081 for (unsigned I = 0; I < NumValidUpd; I++) {
16082 BaseUpdateUser &User = BaseUpdates[I];
16083 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16084 return SDValue();
16085 }
16086
16087 // Try to fold with other users. Non-constant updates are considered
16088 // first, and constant updates are sorted to not break a sequence of
16089 // strided accesses (if there is any).
16090 llvm::stable_sort(BaseUpdates,
16091 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16092 return LHS.ConstInc < RHS.ConstInc;
16093 });
16094 for (BaseUpdateUser &User : BaseUpdates) {
16095 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16096 return SDValue();
16097 }
16098 return SDValue();
16099}
16100
16101 static SDValue PerformVLDCombine(SDNode *N,
16102 TargetLowering::DAGCombinerInfo &DCI) {
16103 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16104 return SDValue();
16105
16106 return CombineBaseUpdate(N, DCI);
16107}
16108
16109 static SDValue PerformMVEVLDCombine(SDNode *N,
16110 TargetLowering::DAGCombinerInfo &DCI) {
16111 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16112 return SDValue();
16113
16114 SelectionDAG &DAG = DCI.DAG;
16115 SDValue Addr = N->getOperand(2);
16116 MemSDNode *MemN = cast<MemSDNode>(N);
16117 SDLoc dl(N);
16118
16119 // For the stores, where there are multiple intrinsics we only actually want
16120 // to post-inc the last of them.
16121 unsigned IntNo = N->getConstantOperandVal(1);
16122 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16123 return SDValue();
16124 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16125 return SDValue();
16126
16127 // Search for a use of the address operand that is an increment.
16128 for (SDUse &Use : Addr->uses()) {
16129 SDNode *User = Use.getUser();
16130 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16131 continue;
16132
16133 // Check that the add is independent of the load/store. Otherwise, folding
16134 // it would create a cycle. We can avoid searching through Addr as it's a
16135 // predecessor to both.
16136 SmallPtrSet<const SDNode *, 32> Visited;
16137 SmallVector<const SDNode *, 16> Worklist;
16138 Visited.insert(Addr.getNode());
16139 Worklist.push_back(N);
16140 Worklist.push_back(User);
16141 const unsigned MaxSteps = 1024;
16142 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16143 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16144 continue;
16145
16146 // Find the new opcode for the updating load/store.
16147 bool isLoadOp = true;
16148 unsigned NewOpc = 0;
16149 unsigned NumVecs = 0;
16150 switch (IntNo) {
16151 default:
16152 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16153 case Intrinsic::arm_mve_vld2q:
16154 NewOpc = ARMISD::VLD2_UPD;
16155 NumVecs = 2;
16156 break;
16157 case Intrinsic::arm_mve_vld4q:
16158 NewOpc = ARMISD::VLD4_UPD;
16159 NumVecs = 4;
16160 break;
16161 case Intrinsic::arm_mve_vst2q:
16162 NewOpc = ARMISD::VST2_UPD;
16163 NumVecs = 2;
16164 isLoadOp = false;
16165 break;
16166 case Intrinsic::arm_mve_vst4q:
16167 NewOpc = ARMISD::VST4_UPD;
16168 NumVecs = 4;
16169 isLoadOp = false;
16170 break;
16171 }
16172
16173 // Find the size of memory referenced by the load/store.
16174 EVT VecTy;
16175 if (isLoadOp) {
16176 VecTy = N->getValueType(0);
16177 } else {
16178 VecTy = N->getOperand(3).getValueType();
16179 }
16180
16181 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16182
16183 // If the increment is a constant, it must match the memory ref size.
16184 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16185 ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc);
16186 if (!CInc || CInc->getZExtValue() != NumBytes)
16187 continue;
16188
16189 // Create the new updating load/store node.
16190 // First, create an SDVTList for the new updating node's results.
16191 EVT Tys[6];
16192 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16193 unsigned n;
16194 for (n = 0; n < NumResultVecs; ++n)
16195 Tys[n] = VecTy;
16196 Tys[n++] = MVT::i32;
16197 Tys[n] = MVT::Other;
16198 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16199
16200 // Then, gather the new node's operands.
16201 SmallVector<SDValue, 8> Ops;
16202 Ops.push_back(N->getOperand(0)); // incoming chain
16203 Ops.push_back(N->getOperand(2)); // ptr
16204 Ops.push_back(Inc);
16205
16206 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16207 Ops.push_back(N->getOperand(i));
16208
16209 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16210 MemN->getMemOperand());
16211
16212 // Update the uses.
16213 SmallVector<SDValue, 5> NewResults;
16214 for (unsigned i = 0; i < NumResultVecs; ++i)
16215 NewResults.push_back(SDValue(UpdN.getNode(), i));
16216
16217 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16218 DCI.CombineTo(N, NewResults);
16219 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16220
16221 break;
16222 }
16223
16224 return SDValue();
16225}
16226
16227/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16228/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16229/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16230/// return true.
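/// Illustrative example (not from the original source): if every value result
/// of a vld3lane intrinsic only feeds VDUPLANEs of the loaded lane, the whole
/// group is rewritten as a single VLD3DUP, which loads one element per result
/// register and replicates it across all of that register's lanes.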
16231 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
16232 SelectionDAG &DAG = DCI.DAG;
16233 EVT VT = N->getValueType(0);
16234 // vldN-dup instructions only support 64-bit vectors for N > 1.
16235 if (!VT.is64BitVector())
16236 return false;
16237
16238 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16239 SDNode *VLD = N->getOperand(0).getNode();
16240 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16241 return false;
16242 unsigned NumVecs = 0;
16243 unsigned NewOpc = 0;
16244 unsigned IntNo = VLD->getConstantOperandVal(1);
16245 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16246 NumVecs = 2;
16247 NewOpc = ARMISD::VLD2DUP;
16248 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16249 NumVecs = 3;
16250 NewOpc = ARMISD::VLD3DUP;
16251 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16252 NumVecs = 4;
16253 NewOpc = ARMISD::VLD4DUP;
16254 } else {
16255 return false;
16256 }
16257
16258 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16259 // numbers match the load.
16260 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16261 for (SDUse &Use : VLD->uses()) {
16262 // Ignore uses of the chain result.
16263 if (Use.getResNo() == NumVecs)
16264 continue;
16265 SDNode *User = Use.getUser();
16266 if (User->getOpcode() != ARMISD::VDUPLANE ||
16267 VLDLaneNo != User->getConstantOperandVal(1))
16268 return false;
16269 }
16270
16271 // Create the vldN-dup node.
16272 EVT Tys[5];
16273 unsigned n;
16274 for (n = 0; n < NumVecs; ++n)
16275 Tys[n] = VT;
16276 Tys[n] = MVT::Other;
16277 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16278 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16279 MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
16280 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16281 Ops, VLDMemInt->getMemoryVT(),
16282 VLDMemInt->getMemOperand());
16283
16284 // Update the uses.
16285 for (SDUse &Use : VLD->uses()) {
16286 unsigned ResNo = Use.getResNo();
16287 // Ignore uses of the chain result.
16288 if (ResNo == NumVecs)
16289 continue;
16290 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16291 }
16292
16293 // Now the vldN-lane intrinsic is dead except for its chain result.
16294 // Update uses of the chain.
16295 std::vector<SDValue> VLDDupResults;
16296 for (unsigned n = 0; n < NumVecs; ++n)
16297 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16298 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16299 DCI.CombineTo(VLD, VLDDupResults);
16300
16301 return true;
16302}
16303
16304/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16305/// ARMISD::VDUPLANE.
16306 static SDValue PerformVDUPLANECombine(SDNode *N,
16307 TargetLowering::DAGCombinerInfo &DCI,
16308 const ARMSubtarget *Subtarget) {
16309 SDValue Op = N->getOperand(0);
16310 EVT VT = N->getValueType(0);
16311
16312 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16313 if (Subtarget->hasMVEIntegerOps()) {
16314 EVT ExtractVT = VT.getVectorElementType();
16315 // We need to ensure we are creating a legal type.
16316 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16317 ExtractVT = MVT::i32;
16318 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16319 N->getOperand(0), N->getOperand(1));
16320 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16321 }
16322
16323 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16324 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16325 if (CombineVLDDUP(N, DCI))
16326 return SDValue(N, 0);
16327
16328 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16329 // redundant. Ignore bit_converts for now; element sizes are checked below.
16330 while (Op.getOpcode() == ISD::BITCAST)
16331 Op = Op.getOperand(0);
16332 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16333 return SDValue();
16334
16335 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16336 unsigned EltSize = Op.getScalarValueSizeInBits();
16337 // The canonical VMOV for a zero vector uses a 32-bit element size.
16338 unsigned Imm = Op.getConstantOperandVal(0);
16339 unsigned EltBits;
16340 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16341 EltSize = 8;
16342 if (EltSize > VT.getScalarSizeInBits())
16343 return SDValue();
16344
16345 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16346}
16347
16348/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16349 static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG,
16350 const ARMSubtarget *Subtarget) {
16351 SDValue Op = N->getOperand(0);
16352 SDLoc dl(N);
16353
16354 if (Subtarget->hasMVEIntegerOps()) {
16355 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16356 // need to come from a GPR.
16357 if (Op.getValueType() == MVT::f32)
16358 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16359 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16360 else if (Op.getValueType() == MVT::f16)
16361 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16362 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16363 }
16364
16365 if (!Subtarget->hasNEON())
16366 return SDValue();
16367
16368 // Match VDUP(LOAD) -> VLD1DUP.
16369 // We match this pattern here rather than waiting for isel because the
16370 // transform is only legal for unindexed loads.
16371 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16372 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16373 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16374 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16375 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16376 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16377 SDValue VLDDup =
16378 DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops,
16379 LD->getMemoryVT(), LD->getMemOperand());
16380 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16381 return VLDDup;
16382 }
16383
16384 return SDValue();
16385}
16386
16387 static SDValue PerformLOADCombine(SDNode *N,
16388 TargetLowering::DAGCombinerInfo &DCI,
16389 const ARMSubtarget *Subtarget) {
16390 EVT VT = N->getValueType(0);
16391
16392 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16393 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16394 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16395 return CombineBaseUpdate(N, DCI);
16396
16397 return SDValue();
16398}
16399
16400// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16401// pack all of the elements in one place. Next, store to memory in fewer
16402// chunks.
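// Illustrative example (not from the original source): a truncating store of
// v4i32 to v4i8 is handled by bitcasting to v16i8, shuffling the four low
// bytes of the elements down to the bottom of the register, and issuing a
// single 32-bit store of that packed value.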
16403 static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
16404 SelectionDAG &DAG) {
16405 SDValue StVal = St->getValue();
16406 EVT VT = StVal.getValueType();
16407 if (!St->isTruncatingStore() || !VT.isVector())
16408 return SDValue();
16409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16410 EVT StVT = St->getMemoryVT();
16411 unsigned NumElems = VT.getVectorNumElements();
16412 assert(StVT != VT && "Cannot truncate to the same type");
16413 unsigned FromEltSz = VT.getScalarSizeInBits();
16414 unsigned ToEltSz = StVT.getScalarSizeInBits();
16415
16416 // From, To sizes and ElemCount must be pow of two
16417 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16418 return SDValue();
16419
16420 // We are going to use the original vector elt for storing.
16421 // Accumulated smaller vector elements must be a multiple of the store size.
16422 if (0 != (NumElems * FromEltSz) % ToEltSz)
16423 return SDValue();
16424
16425 unsigned SizeRatio = FromEltSz / ToEltSz;
16426 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16427
16428 // Create a type on which we perform the shuffle.
16429 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16430 NumElems * SizeRatio);
16431 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16432
16433 SDLoc DL(St);
16434 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16435 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16436 for (unsigned i = 0; i < NumElems; ++i)
16437 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16438 : i * SizeRatio;
16439
16440 // Can't shuffle using an illegal type.
16441 if (!TLI.isTypeLegal(WideVecVT))
16442 return SDValue();
16443
16444 SDValue Shuff = DAG.getVectorShuffle(
16445 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16446 // At this point all of the data is stored at the bottom of the
16447 // register. We now need to save it to mem.
16448
16449 // Find the largest store unit
16450 MVT StoreType = MVT::i8;
16451 for (MVT Tp : MVT::integer_valuetypes()) {
16452 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16453 StoreType = Tp;
16454 }
16455 // Didn't find a legal store type.
16456 if (!TLI.isTypeLegal(StoreType))
16457 return SDValue();
16458
16459 // Bitcast the original vector into a vector of store-size units
16460 EVT StoreVecVT =
16461 EVT::getVectorVT(*DAG.getContext(), StoreType,
16462 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16463 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16464 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16465 SmallVector<SDValue, 8> Chains;
16466 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16467 TLI.getPointerTy(DAG.getDataLayout()));
16468 SDValue BasePtr = St->getBasePtr();
16469
16470 // Perform one or more big stores into memory.
16471 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16472 for (unsigned I = 0; I < E; I++) {
16473 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16474 ShuffWide, DAG.getIntPtrConstant(I, DL));
16475 SDValue Ch =
16476 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16477 St->getAlign(), St->getMemOperand()->getFlags());
16478 BasePtr =
16479 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16480 Chains.push_back(Ch);
16481 }
16482 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16483}
16484
16485// Try taking a single vector store from an fpround (which would otherwise turn
16486// into an expensive buildvector) and splitting it into a series of narrowing
16487// stores.
16488 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
16489 SelectionDAG &DAG) {
16490 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16491 return SDValue();
16492 SDValue Trunc = St->getValue();
16493 if (Trunc->getOpcode() != ISD::FP_ROUND)
16494 return SDValue();
16495 EVT FromVT = Trunc->getOperand(0).getValueType();
16496 EVT ToVT = Trunc.getValueType();
16497 if (!ToVT.isVector())
16498 return SDValue();
16500 EVT ToEltVT = ToVT.getVectorElementType();
16501 EVT FromEltVT = FromVT.getVectorElementType();
16502
16503 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16504 return SDValue();
16505
16506 unsigned NumElements = 4;
16507 if (FromVT.getVectorNumElements() % NumElements != 0)
16508 return SDValue();
16509
16510 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16511 // use the VMOVN over splitting the store. We are looking for patterns of:
16512 // !rev: 0 N 1 N+1 2 N+2 ...
16513 // rev: N 0 N+1 1 N+2 2 ...
16514 // The shuffle may either be a single source (in which case N = NumElts/2) or
16515 // two inputs extended with concat to the same size (in which case N =
16516 // NumElts).
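  // Illustrative masks (not from the original source), for NumElts = 8 with
  // two concatenated inputs:
  //   !rev: <0, 8, 1, 9, 2, 10, 3, 11>
  //   rev:  <8, 0, 9, 1, 10, 2, 11, 3>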
16517 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16518 ArrayRef<int> M = SVN->getMask();
16519 unsigned NumElts = ToVT.getVectorNumElements();
16520 if (SVN->getOperand(1).isUndef())
16521 NumElts /= 2;
16522
16523 unsigned Off0 = Rev ? NumElts : 0;
16524 unsigned Off1 = Rev ? 0 : NumElts;
16525
16526 for (unsigned I = 0; I < NumElts; I += 2) {
16527 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16528 return false;
16529 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16530 return false;
16531 }
16532
16533 return true;
16534 };
16535
16536 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16537 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16538 return SDValue();
16539
16540 LLVMContext &C = *DAG.getContext();
16541 SDLoc DL(St);
16542 // Details about the old store
16543 SDValue Ch = St->getChain();
16544 SDValue BasePtr = St->getBasePtr();
16545 Align Alignment = St->getBaseAlign();
16546 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16547 AAMDNodes AAInfo = St->getAAInfo();
16548
16549 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16550 // and then stored as truncating integer stores.
16551 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16552 EVT NewToVT = EVT::getVectorVT(
16553 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16554
16556 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16557 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16558 SDValue NewPtr =
16559 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16560
16561 SDValue Extract =
16562 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16563 DAG.getConstant(i * NumElements, DL, MVT::i32));
16564
16565 SDValue FPTrunc =
16566 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16567 Extract, DAG.getConstant(0, DL, MVT::i32));
16568 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16569
16570 SDValue Store = DAG.getTruncStore(
16571 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16572 NewToVT, Alignment, MMOFlags, AAInfo);
16573 Stores.push_back(Store);
16574 }
16575 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16576}
16577
16578// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16579// into an expensive buildvector) and splitting it into a series of narrowing
16580// stores.
16581 static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
16582 SelectionDAG &DAG) {
16583 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16584 return SDValue();
16585 SDValue Trunc = St->getValue();
16586 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16587 return SDValue();
16588 EVT FromVT = Trunc->getOperand(0).getValueType();
16589 EVT ToVT = Trunc.getValueType();
16590
16591 LLVMContext &C = *DAG.getContext();
16592 SDLoc DL(St);
16593 // Details about the old store
16594 SDValue Ch = St->getChain();
16595 SDValue BasePtr = St->getBasePtr();
16596 Align Alignment = St->getBaseAlign();
16597 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16598 AAMDNodes AAInfo = St->getAAInfo();
16599
16600 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16601 FromVT.getVectorNumElements());
16602
16603 SmallVector<SDValue, 4> Stores;
16604 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16605 unsigned NewOffset =
16606 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16607 SDValue NewPtr =
16608 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16609
16610 SDValue Extract = Trunc.getOperand(i);
16611 SDValue Store = DAG.getTruncStore(
16612 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16613 NewToVT, Alignment, MMOFlags, AAInfo);
16614 Stores.push_back(Store);
16615 }
16616 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16617}
16618
16619// Given a floating point store from an extracted vector, with an integer
16620// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16621// help reduce fp register pressure, doesn't require the fp extract and allows
16622// use of more integer post-inc stores not available with vstr.
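// Illustrative example (not from the original source): an f16 value produced
// by extractelement from a vector and then stored with vstr.16 is instead
// stored as a truncating integer store (i32 truncated to i16) of the existing
// VGETLANEu of the same lane, so no fp-register extract is needed.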
16623 static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG) {
16624 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16625 return SDValue();
16626 SDValue Extract = St->getValue();
16627 EVT VT = Extract.getValueType();
16628 // For now only uses f16. This may be useful for f32 too, but that will
16629 // be bitcast(extract), not the VGETLANEu we currently check here.
16630 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16631 return SDValue();
16632
16633 SDNode *GetLane =
16634 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16635 {Extract.getOperand(0), Extract.getOperand(1)});
16636 if (!GetLane)
16637 return SDValue();
16638
16639 LLVMContext &C = *DAG.getContext();
16640 SDLoc DL(St);
16641 // Create a new integer store to replace the existing floating point version.
16642 SDValue Ch = St->getChain();
16643 SDValue BasePtr = St->getBasePtr();
16644 Align Alignment = St->getBaseAlign();
16645 MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
16646 AAMDNodes AAInfo = St->getAAInfo();
16647 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16648 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16649 St->getPointerInfo(), NewToVT, Alignment,
16650 MMOFlags, AAInfo);
16651
16652 return Store;
16653}
16654
16655/// PerformSTORECombine - Target-specific dag combine xforms for
16656/// ISD::STORE.
16657 static SDValue PerformSTORECombine(SDNode *N,
16658 TargetLowering::DAGCombinerInfo &DCI,
16659 const ARMSubtarget *Subtarget) {
16660 StoreSDNode *St = cast<StoreSDNode>(N);
16661 if (St->isVolatile())
16662 return SDValue();
16663 SDValue StVal = St->getValue();
16664 EVT VT = StVal.getValueType();
16665
16666 if (Subtarget->hasNEON())
16667 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16668 return Store;
16669
16670 if (Subtarget->hasMVEFloatOps())
16671 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16672 return NewToken;
16673
16674 if (Subtarget->hasMVEIntegerOps()) {
16675 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16676 return NewChain;
16677 if (SDValue NewToken =
16678 PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
16679 return NewToken;
16680 }
16681
16682 if (!ISD::isNormalStore(St))
16683 return SDValue();
16684
16685 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16686 // ARM stores of arguments in the same cache line.
16687 if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
16688 StVal.getNode()->hasOneUse()) {
16689 SelectionDAG &DAG = DCI.DAG;
16690 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16691 SDLoc DL(St);
16692 SDValue BasePtr = St->getBasePtr();
16693 SDValue NewST1 = DAG.getStore(
16694 St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
16695 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16696 St->getMemOperand()->getFlags());
16697
16698 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16699 DAG.getConstant(4, DL, MVT::i32));
16700 return DAG.getStore(NewST1.getValue(0), DL,
16701 StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
16702 OffsetPtr, St->getPointerInfo().getWithOffset(4),
16703 St->getBaseAlign(), St->getMemOperand()->getFlags());
16704 }
16705
16706 if (StVal.getValueType() == MVT::i64 &&
16707 StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
16708 
16709 // Bitcast an i64 store extracted from a vector to f64.
16710 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16711 SelectionDAG &DAG = DCI.DAG;
16712 SDLoc dl(StVal);
16713 SDValue IntVec = StVal.getOperand(0);
16714 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16715 IntVec.getValueType().getVectorNumElements()/2);
16716 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16717 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16718 Vec, StVal.getOperand(1));
16719 dl = SDLoc(N);
16720 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16721 // Make the DAGCombiner fold the bitcasts.
16722 DCI.AddToWorklist(Vec.getNode());
16723 DCI.AddToWorklist(ExtElt.getNode());
16724 DCI.AddToWorklist(V.getNode());
16725 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16726 St->getPointerInfo(), St->getAlign(),
16727 St->getMemOperand()->getFlags(), St->getAAInfo());
16728 }
16729
16730 // If this is a legal vector store, try to combine it into a VST1_UPD.
16731 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16732 DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
16733 return CombineBaseUpdate(N, DCI);
16734
16735 return SDValue();
16736}
16737
16738/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16739/// can replace combinations of VMUL and VCVT (floating-point to integer)
16740/// when the VMUL has a constant operand that is a power of 2.
16741///
16742/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
16743/// vmul.f32 d16, d17, d16
16744/// vcvt.s32.f32 d16, d16
16745/// becomes:
16746/// vcvt.s32.f32 d16, d16, #3
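/// (Here #3 is the number of fixed-point fraction bits: multiplying by
/// 8.0 = 2^3 before converting is the same as converting directly to a
/// fixed-point value with 3 fraction bits.)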
16747 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
16748 const ARMSubtarget *Subtarget) {
16749 if (!Subtarget->hasNEON())
16750 return SDValue();
16751
16752 SDValue Op = N->getOperand(0);
16753 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
16754 Op.getOpcode() != ISD::FMUL)
16755 return SDValue();
16756
16757 SDValue ConstVec = Op->getOperand(1);
16758 if (!isa<BuildVectorSDNode>(ConstVec))
16759 return SDValue();
16760
16761 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
16762 uint32_t FloatBits = FloatTy.getSizeInBits();
16763 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
16764 uint32_t IntBits = IntTy.getSizeInBits();
16765 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16766 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16767 // These instructions only exist converting from f32 to i32. We can handle
16768 // smaller integers by generating an extra truncate, but larger ones would
16769 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16770 // these instructions only support v2i32/v4i32 types.
16771 return SDValue();
16772 }
16773
16774 BitVector UndefElements;
16775 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
16776 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
16777 if (C == -1 || C == 0 || C > 32)
16778 return SDValue();
16779
16780 SDLoc dl(N);
16781 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
16782 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
16783 Intrinsic::arm_neon_vcvtfp2fxu;
16784 SDValue FixConv = DAG.getNode(
16785 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
16786 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
16787 DAG.getConstant(C, dl, MVT::i32));
16788
16789 if (IntBits < FloatBits)
16790 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
16791
16792 return FixConv;
16793}
16794
16795 static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG,
16796 const ARMSubtarget *Subtarget) {
16797 if (!Subtarget->hasMVEFloatOps())
16798 return SDValue();
16799
16800 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
16801 // The second form can be more easily turned into a predicated vadd, and
16802 // possibly combined into a fma to become a predicated vfma.
16803 SDValue Op0 = N->getOperand(0);
16804 SDValue Op1 = N->getOperand(1);
16805 EVT VT = N->getValueType(0);
16806 SDLoc DL(N);
16807
16808 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
16809 // which these VMOV's represent.
16810 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
16811 if (Op.getOpcode() != ISD::BITCAST ||
16812 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
16813 return false;
16814 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
16815 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
16816 return true;
16817 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
16818 return true;
16819 return false;
16820 };
16821
16822 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
16823 std::swap(Op0, Op1);
16824
16825 if (Op1.getOpcode() != ISD::VSELECT)
16826 return SDValue();
16827
16828 SDNodeFlags FaddFlags = N->getFlags();
16829 bool NSZ = FaddFlags.hasNoSignedZeros();
16830 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
16831 return SDValue();
16832
16833 SDValue FAdd =
16834 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
16835 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
16836}
16837
16838 static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG) {
16839 SDValue LHS = N->getOperand(0);
16840 SDValue RHS = N->getOperand(1);
16841 EVT VT = N->getValueType(0);
16842 SDLoc DL(N);
16843
16844 if (!N->getFlags().hasAllowReassociation())
16845 return SDValue();
16846
16847 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), c, d)
16848 auto ReassocComplex = [&](SDValue A, SDValue B) {
16849 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
16850 return SDValue();
16851 unsigned Opc = A.getConstantOperandVal(0);
16852 if (Opc != Intrinsic::arm_mve_vcmlaq)
16853 return SDValue();
16854 SDValue VCMLA = DAG.getNode(
16855 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
16856 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
16857 A.getOperand(3), A.getOperand(4));
16858 VCMLA->setFlags(A->getFlags());
16859 return VCMLA;
16860 };
16861 if (SDValue R = ReassocComplex(LHS, RHS))
16862 return R;
16863 if (SDValue R = ReassocComplex(RHS, LHS))
16864 return R;
16865
16866 return SDValue();
16867}
16868
16869 static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
16870 const ARMSubtarget *Subtarget) {
16871 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
16872 return S;
16873 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
16874 return S;
16875 return SDValue();
16876}
16877
16878 /// PerformVMulVCVTCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
16879/// can replace combinations of VCVT (integer to floating-point) and VMUL
16880/// when the VMUL has a constant operand that is a power of 2.
16881///
16882/// Example (assume d17 = <float 0.125, float 0.125>):
16883/// vcvt.f32.s32 d16, d16
16884/// vmul.f32 d16, d16, d17
16885/// becomes:
16886/// vcvt.f32.s32 d16, d16, #3
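/// (Here 0.125 = 2^-3, so the multiply after the convert is folded into a
/// fixed-point convert with 3 fraction bits.)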
16887 static SDValue PerformVMulVCVTCombine(SDNode *N, SelectionDAG &DAG,
16888 const ARMSubtarget *Subtarget) {
16889 if (!Subtarget->hasNEON())
16890 return SDValue();
16891
16892 SDValue Op = N->getOperand(0);
16893 unsigned OpOpcode = Op.getNode()->getOpcode();
16894 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
16895 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
16896 return SDValue();
16897
16898 SDValue ConstVec = N->getOperand(1);
16899 if (!isa<BuildVectorSDNode>(ConstVec))
16900 return SDValue();
16901
16902 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
16903 uint32_t FloatBits = FloatTy.getSizeInBits();
16904 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
16905 uint32_t IntBits = IntTy.getSizeInBits();
16906 unsigned NumLanes = Op.getValueType().getVectorNumElements();
16907 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
16908 // These instructions only exist converting from i32 to f32. We can handle
16909 // smaller integers by generating an extra extend, but larger ones would
16910 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
16911 // these instructions only support v2i32/v4i32 types.
16912 return SDValue();
16913 }
16914
16915 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
16916 APFloat Recip(0.0f);
16917 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
16918 return SDValue();
16919
16920 bool IsExact;
16921 APSInt IntVal(33);
16922 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
16923 APFloat::opOK ||
16924 !IsExact)
16925 return SDValue();
16926
16927 int32_t C = IntVal.exactLogBase2();
16928 if (C == -1 || C == 0 || C > 32)
16929 return SDValue();
16930
16931 SDLoc DL(N);
16932 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
16933 SDValue ConvInput = Op.getOperand(0);
16934 if (IntBits < FloatBits)
16935 ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
16936 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
16937
16938 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
16939 : Intrinsic::arm_neon_vcvtfxu2fp;
16940 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
16941 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
16942 DAG.getConstant(C, DL, MVT::i32));
16943}
16944
16945 static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
16946 const ARMSubtarget *ST) {
16947 if (!ST->hasMVEIntegerOps())
16948 return SDValue();
16949
16950 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
16951 EVT ResVT = N->getValueType(0);
16952 SDValue N0 = N->getOperand(0);
16953 SDLoc dl(N);
16954
16955 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
16956 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
16957 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
16958 N0.getValueType() == MVT::v16i8)) {
16959 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
16960 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
16961 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
16962 }
16963
16964 // We are looking for something that will have illegal types if left alone,
16965 // but that we can convert to a single instruction under MVE. For example
16966 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
16967 // or
16968 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
16969
16970 // The legal cases are:
16971 // VADDV u/s 8/16/32
16972 // VMLAV u/s 8/16/32
16973 // VADDLV u/s 32
16974 // VMLALV u/s 16/32
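  // Illustrative example (not from the original source):
  //   vecreduce_add(mul(sext(<16 x i8> A to <16 x i32>),
  //                     sext(<16 x i8> B to <16 x i32>))) => VMLADAV.s8 A, B
  // The i64-result forms below are built from the long reduction nodes, whose
  // value is returned as a pair of i32 halves and reassembled with BUILD_PAIR.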
16975
16976 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
16977 // extend it and use v4i32 instead.
16978 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
16979 EVT AVT = A.getValueType();
16980 return any_of(ExtTypes, [&](MVT Ty) {
16981 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
16982 AVT.bitsLE(Ty);
16983 });
16984 };
16985 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
16986 EVT AVT = A.getValueType();
16987 if (!AVT.is128BitVector())
16988 A = DAG.getNode(ExtendCode, dl,
16989 AVT.changeVectorElementType(MVT::getIntegerVT(
16990 128 / AVT.getVectorMinNumElements())),
16991 A);
16992 return A;
16993 };
16994 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
16995 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
16996 return SDValue();
16997 SDValue A = N0->getOperand(0);
16998 if (ExtTypeMatches(A, ExtTypes))
16999 return ExtendIfNeeded(A, ExtendCode);
17000 return SDValue();
17001 };
17002 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17003 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17004 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17005 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17006 return SDValue();
17007 Mask = N0->getOperand(0);
17008 SDValue Ext = N0->getOperand(1);
17009 if (Ext->getOpcode() != ExtendCode)
17010 return SDValue();
17011 SDValue A = Ext->getOperand(0);
17012 if (ExtTypeMatches(A, ExtTypes))
17013 return ExtendIfNeeded(A, ExtendCode);
17014 return SDValue();
17015 };
17016 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17017 SDValue &A, SDValue &B) {
17018 // For a vmla we are trying to match a larger pattern:
17019 // ExtA = sext/zext A
17020 // ExtB = sext/zext B
17021 // Mul = mul ExtA, ExtB
17022 // vecreduce.add Mul
17023 // There might also be an extra extend between the mul and the addreduce, so
17024 // long as the bitwidth is high enough to make them equivalent (for example
17025 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17026 if (ResVT != RetTy)
17027 return false;
17028 SDValue Mul = N0;
17029 if (Mul->getOpcode() == ExtendCode &&
17030 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17031 ResVT.getScalarSizeInBits())
17032 Mul = Mul->getOperand(0);
17033 if (Mul->getOpcode() != ISD::MUL)
17034 return false;
17035 SDValue ExtA = Mul->getOperand(0);
17036 SDValue ExtB = Mul->getOperand(1);
17037 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17038 return false;
17039 A = ExtA->getOperand(0);
17040 B = ExtB->getOperand(0);
17041 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17042 A = ExtendIfNeeded(A, ExtendCode);
17043 B = ExtendIfNeeded(B, ExtendCode);
17044 return true;
17045 }
17046 return false;
17047 };
17048 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17049 SDValue &A, SDValue &B, SDValue &Mask) {
17050 // Same as the pattern above with a select for the zero predicated lanes
17051 // ExtA = sext/zext A
17052 // ExtB = sext/zext B
17053 // Mul = mul ExtA, ExtB
17054 // N0 = select Mask, Mul, 0
17055 // vecreduce.add N0
17056 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17057 !ISD::isBuildVectorAllZeros(N0->getOperand(2).getNode()))
17058 return false;
17059 Mask = N0->getOperand(0);
17060 SDValue Mul = N0->getOperand(1);
17061 if (Mul->getOpcode() == ExtendCode &&
17062 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17063 ResVT.getScalarSizeInBits())
17064 Mul = Mul->getOperand(0);
17065 if (Mul->getOpcode() != ISD::MUL)
17066 return false;
17067 SDValue ExtA = Mul->getOperand(0);
17068 SDValue ExtB = Mul->getOperand(1);
17069 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17070 return false;
17071 A = ExtA->getOperand(0);
17072 B = ExtB->getOperand(0);
17073 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17074 A = ExtendIfNeeded(A, ExtendCode);
17075 B = ExtendIfNeeded(B, ExtendCode);
17076 return true;
17077 }
17078 return false;
17079 };
17080 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17081 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17082 // reductions. The operands are extended with MVEEXT, but as they are
17083 // reductions the lane orders do not matter. MVEEXT may be combined with
17084 // loads to produce two extending loads, or else they will be expanded to
17085 // VREV/VMOVL.
17086 EVT VT = Ops[0].getValueType();
17087 if (VT == MVT::v16i8) {
17088 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17089 "Unexpected illegal long reduction opcode");
17090 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17091
17092 SDValue Ext0 =
17093 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17094 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17095 SDValue Ext1 =
17096 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17097 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17098
17099 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17100 Ext0, Ext1);
17101 SDValue MLA1 =
17102 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17103 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17104 Ext0.getValue(1), Ext1.getValue(1));
17105 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17106 }
17107 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17108 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17109 SDValue(Node.getNode(), 1));
17110 };
17111
17112 SDValue A, B;
17113 SDValue Mask;
17114 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17115 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17116 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17117 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17118 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17119 A, B))
17120 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17121 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17122 A, B))
17123 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17124 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17125 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17126 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17127 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17128 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17129 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17130
17131 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17132 Mask))
17133 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17134 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17135 Mask))
17136 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17137 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17138 Mask))
17139 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17140 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17141 Mask))
17142 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17143 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17144 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17145 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17146 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17147 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17148 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17149
17150 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17151 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17152 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17153 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17154 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17155 return Create64bitNode(ARMISD::VADDLVs, {A});
17156 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17157 return Create64bitNode(ARMISD::VADDLVu, {A});
17158 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17159 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17160 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17161 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17162 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17163 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17164
17165 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17166 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17167 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17168 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17169 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17170 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17171 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17172 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17173 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17174 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17175 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17176 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17177 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17178 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17179
17180 // Some complications. We can get a case where the two inputs of the mul are
17181 // the same, then the output sext will have been helpfully converted to a
17182 // zext. Turn it back.
17183 SDValue Op = N0;
17184 if (Op->getOpcode() == ISD::VSELECT)
17185 Op = Op->getOperand(1);
17186 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17187 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17188 SDValue Mul = Op->getOperand(0);
17189 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17190 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17191 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
17192 if (Op != N0)
17193 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17194 N0->getOperand(0), Ext, N0->getOperand(2));
17195 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17196 }
17197 }
17198
17199 return SDValue();
17200}
17201
17202// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17203// the lanes are used. Due to the reduction being commutative the shuffle can be
17204// removed.
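// Illustrative example (not from the original source):
//   vaddv(shuffle x, undef, <3, 2, 1, 0>) -> vaddv(x)
// since the sum over all lanes does not depend on lane order, provided every
// lane index appears exactly once in the mask (checked below).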
17205 static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG) {
17206 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17207 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17208 if (!Shuf || !Shuf->getOperand(1).isUndef())
17209 return SDValue();
17210
17211 // Check all elements are used once in the mask.
17212 ArrayRef<int> Mask = Shuf->getMask();
17213 APInt SetElts(Mask.size(), 0);
17214 for (int E : Mask) {
17215 if (E < 0 || E >= (int)Mask.size())
17216 return SDValue();
17217 SetElts.setBit(E);
17218 }
17219 if (!SetElts.isAllOnes())
17220 return SDValue();
17221
17222 if (N->getNumOperands() != VecOp + 1) {
17223 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17224 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17225 return SDValue();
17226 }
17227
17228  SmallVector<SDValue> Ops;
17229  for (SDValue Op : N->ops()) {
17230 if (Op.getValueType().isVector())
17231 Ops.push_back(Op.getOperand(0));
17232 else
17233 Ops.push_back(Op);
17234 }
17235 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17236}
17237
17238static SDValue PerformVMOVNCombine(SDNode *N,
17239                                   TargetLowering::DAGCombinerInfo &DCI) {
17240  SDValue Op0 = N->getOperand(0);
17241 SDValue Op1 = N->getOperand(1);
17242 unsigned IsTop = N->getConstantOperandVal(2);
17243
17244 // VMOVNT a undef -> a
17245 // VMOVNB a undef -> a
17246 // VMOVNB undef a -> a
17247 if (Op1->isUndef())
17248 return Op0;
17249 if (Op0->isUndef() && !IsTop)
17250 return Op1;
17251
17252 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17253 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17254 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17255 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17256 Op1->getConstantOperandVal(2) == 0)
17257 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17258 Op0, Op1->getOperand(1), N->getOperand(2));
17259
17260 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17261 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17262 // into the top or bottom lanes.
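  // As a worked example for v8i16: Op1DemandedElts is 0x55 (elements 0,2,4,6,
  // the bottom lanes); Op0DemandedElts is the same 0x55 when inserting into
  // the top lanes and 0xAA (elements 1,3,5,7) when inserting into the bottom.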
17263 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17264 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17265 APInt Op0DemandedElts =
17266 IsTop ? Op1DemandedElts
17267 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17268
17269 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17270 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17271 return SDValue(N, 0);
17272 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17273 return SDValue(N, 0);
17274
17275 return SDValue();
17276}
17277
17278static SDValue PerformVQMOVNCombine(SDNode *N,
17279                                    TargetLowering::DAGCombinerInfo &DCI) {
17280  SDValue Op0 = N->getOperand(0);
17281 unsigned IsTop = N->getConstantOperandVal(2);
17282
17283 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17284 APInt Op0DemandedElts =
17285 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17286 : APInt::getHighBitsSet(2, 1));
17287
17288 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17289 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17290 return SDValue(N, 0);
17291 return SDValue();
17292}
17293
17294static SDValue PerformVQDMULHCombine(SDNode *N,
17295                                     TargetLowering::DAGCombinerInfo &DCI) {
17296  EVT VT = N->getValueType(0);
17297 SDValue LHS = N->getOperand(0);
17298 SDValue RHS = N->getOperand(1);
17299
17300 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17301 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17302 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
17303 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17304 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17305 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17306 SDLoc DL(N);
17307 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17308 LHS.getOperand(0), RHS.getOperand(0));
17309 SDValue UndefV = LHS.getOperand(1);
17310 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17311 }
17312 return SDValue();
17313}
17314
17315static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
17316  SDLoc DL(N);
17317 SDValue Op0 = N->getOperand(0);
17318 SDValue Op1 = N->getOperand(1);
17319
17320  // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17321 // uses of the intrinsics.
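  // For instance, (LSLL x, -3) is rewritten as (LSRL x, 3) and (LSRL x, -3)
  // as (LSLL x, 3); a shift amount of 0 simply forwards the two input
  // operands unchanged.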
17322 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17323 int ShiftAmt = C->getSExtValue();
17324 if (ShiftAmt == 0) {
17325 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17326 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17327 return SDValue();
17328 }
17329
17330 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17331 unsigned NewOpcode =
17332 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17333 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17334 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17335 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17336 return NewShift;
17337 }
17338 }
17339
17340 return SDValue();
17341}
17342
17343/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
17344SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
17345                                                   DAGCombinerInfo &DCI) const {
17346 SelectionDAG &DAG = DCI.DAG;
17347 unsigned IntNo = N->getConstantOperandVal(0);
17348 switch (IntNo) {
17349 default:
17350 // Don't do anything for most intrinsics.
17351 break;
17352
17353 // Vector shifts: check for immediate versions and lower them.
17354 // Note: This is done during DAG combining instead of DAG legalizing because
17355 // the build_vectors for 64-bit vector element shift counts are generally
17356 // not legal, and it is hard to see their values after they get legalized to
17357 // loads from a constant pool.
17358 case Intrinsic::arm_neon_vshifts:
17359 case Intrinsic::arm_neon_vshiftu:
17360 case Intrinsic::arm_neon_vrshifts:
17361 case Intrinsic::arm_neon_vrshiftu:
17362 case Intrinsic::arm_neon_vrshiftn:
17363 case Intrinsic::arm_neon_vqshifts:
17364 case Intrinsic::arm_neon_vqshiftu:
17365 case Intrinsic::arm_neon_vqshiftsu:
17366 case Intrinsic::arm_neon_vqshiftns:
17367 case Intrinsic::arm_neon_vqshiftnu:
17368 case Intrinsic::arm_neon_vqshiftnsu:
17369 case Intrinsic::arm_neon_vqrshiftns:
17370 case Intrinsic::arm_neon_vqrshiftnu:
17371 case Intrinsic::arm_neon_vqrshiftnsu: {
17372 EVT VT = N->getOperand(1).getValueType();
17373 int64_t Cnt;
17374 unsigned VShiftOpc = 0;
17375
17376 switch (IntNo) {
17377 case Intrinsic::arm_neon_vshifts:
17378 case Intrinsic::arm_neon_vshiftu:
17379 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17380 VShiftOpc = ARMISD::VSHLIMM;
17381 break;
17382 }
17383 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17384 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17385 : ARMISD::VSHRuIMM);
17386 break;
17387 }
17388 return SDValue();
17389
17390 case Intrinsic::arm_neon_vrshifts:
17391 case Intrinsic::arm_neon_vrshiftu:
17392 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17393 break;
17394 return SDValue();
17395
17396 case Intrinsic::arm_neon_vqshifts:
17397 case Intrinsic::arm_neon_vqshiftu:
17398 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17399 break;
17400 return SDValue();
17401
17402 case Intrinsic::arm_neon_vqshiftsu:
17403 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17404 break;
17405 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17406
17407 case Intrinsic::arm_neon_vrshiftn:
17408 case Intrinsic::arm_neon_vqshiftns:
17409 case Intrinsic::arm_neon_vqshiftnu:
17410 case Intrinsic::arm_neon_vqshiftnsu:
17411 case Intrinsic::arm_neon_vqrshiftns:
17412 case Intrinsic::arm_neon_vqrshiftnu:
17413 case Intrinsic::arm_neon_vqrshiftnsu:
17414 // Narrowing shifts require an immediate right shift.
17415 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17416 break;
17417 llvm_unreachable("invalid shift count for narrowing vector shift "
17418 "intrinsic");
17419
17420 default:
17421 llvm_unreachable("unhandled vector shift");
17422 }
17423
17424 switch (IntNo) {
17425 case Intrinsic::arm_neon_vshifts:
17426 case Intrinsic::arm_neon_vshiftu:
17427 // Opcode already set above.
17428 break;
17429 case Intrinsic::arm_neon_vrshifts:
17430 VShiftOpc = ARMISD::VRSHRsIMM;
17431 break;
17432 case Intrinsic::arm_neon_vrshiftu:
17433 VShiftOpc = ARMISD::VRSHRuIMM;
17434 break;
17435 case Intrinsic::arm_neon_vrshiftn:
17436 VShiftOpc = ARMISD::VRSHRNIMM;
17437 break;
17438 case Intrinsic::arm_neon_vqshifts:
17439 VShiftOpc = ARMISD::VQSHLsIMM;
17440 break;
17441 case Intrinsic::arm_neon_vqshiftu:
17442 VShiftOpc = ARMISD::VQSHLuIMM;
17443 break;
17444 case Intrinsic::arm_neon_vqshiftsu:
17445 VShiftOpc = ARMISD::VQSHLsuIMM;
17446 break;
17447 case Intrinsic::arm_neon_vqshiftns:
17448 VShiftOpc = ARMISD::VQSHRNsIMM;
17449 break;
17450 case Intrinsic::arm_neon_vqshiftnu:
17451 VShiftOpc = ARMISD::VQSHRNuIMM;
17452 break;
17453 case Intrinsic::arm_neon_vqshiftnsu:
17454 VShiftOpc = ARMISD::VQSHRNsuIMM;
17455 break;
17456 case Intrinsic::arm_neon_vqrshiftns:
17457 VShiftOpc = ARMISD::VQRSHRNsIMM;
17458 break;
17459 case Intrinsic::arm_neon_vqrshiftnu:
17460 VShiftOpc = ARMISD::VQRSHRNuIMM;
17461 break;
17462 case Intrinsic::arm_neon_vqrshiftnsu:
17463 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17464 break;
17465 }
17466
17467 SDLoc dl(N);
17468 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17469 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17470 }
17471
17472 case Intrinsic::arm_neon_vshiftins: {
17473 EVT VT = N->getOperand(1).getValueType();
17474 int64_t Cnt;
17475 unsigned VShiftOpc = 0;
17476
17477 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17478 VShiftOpc = ARMISD::VSLIIMM;
17479 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17480 VShiftOpc = ARMISD::VSRIIMM;
17481 else {
17482 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17483 }
17484
17485 SDLoc dl(N);
17486 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17487 N->getOperand(1), N->getOperand(2),
17488 DAG.getConstant(Cnt, dl, MVT::i32));
17489 }
17490
17491 case Intrinsic::arm_neon_vqrshifts:
17492 case Intrinsic::arm_neon_vqrshiftu:
17493 // No immediate versions of these to check for.
17494 break;
17495
17496 case Intrinsic::arm_neon_vbsl: {
17497 SDLoc dl(N);
17498 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17499 N->getOperand(2), N->getOperand(3));
17500 }
17501 case Intrinsic::arm_mve_vqdmlah:
17502 case Intrinsic::arm_mve_vqdmlash:
17503 case Intrinsic::arm_mve_vqrdmlah:
17504 case Intrinsic::arm_mve_vqrdmlash:
17505 case Intrinsic::arm_mve_vmla_n_predicated:
17506 case Intrinsic::arm_mve_vmlas_n_predicated:
17507 case Intrinsic::arm_mve_vqdmlah_predicated:
17508 case Intrinsic::arm_mve_vqdmlash_predicated:
17509 case Intrinsic::arm_mve_vqrdmlah_predicated:
17510 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17511 // These intrinsics all take an i32 scalar operand which is narrowed to the
17512 // size of a single lane of the vector type they return. So we don't need
17513 // any bits of that operand above that point, which allows us to eliminate
17514 // uxth/sxth.
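    // For example, for a v8i16 vqdmlah only the low 16 bits of the i32 scalar
    // operand are demanded, so a preceding sxth/uxth of that scalar can be
    // removed.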
17515 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17516 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17517 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17518 return SDValue();
17519 break;
17520 }
17521
17522 case Intrinsic::arm_mve_minv:
17523 case Intrinsic::arm_mve_maxv:
17524 case Intrinsic::arm_mve_minav:
17525 case Intrinsic::arm_mve_maxav:
17526 case Intrinsic::arm_mve_minv_predicated:
17527 case Intrinsic::arm_mve_maxv_predicated:
17528 case Intrinsic::arm_mve_minav_predicated:
17529 case Intrinsic::arm_mve_maxav_predicated: {
17530 // These intrinsics all take an i32 scalar operand which is narrowed to the
17531 // size of a single lane of the vector type they take as the other input.
17532 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17533 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17534 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17535 return SDValue();
17536 break;
17537 }
17538
17539 case Intrinsic::arm_mve_addv: {
17540 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17541    // which allows PerformADDVecReduce to turn it into VADDLV when possible.
17542 bool Unsigned = N->getConstantOperandVal(2);
17543 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17544 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17545 }
17546
17547 case Intrinsic::arm_mve_addlv:
17548 case Intrinsic::arm_mve_addlv_predicated: {
17549 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17550 // which recombines the two outputs into an i64
17551 bool Unsigned = N->getConstantOperandVal(2);
17552 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17553 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17554 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17555
17556    SmallVector<SDValue, 4> Ops;
17557    for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17558 if (i != 2) // skip the unsigned flag
17559 Ops.push_back(N->getOperand(i));
17560
17561 SDLoc dl(N);
17562 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17563 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17564 val.getValue(1));
17565 }
17566 }
17567
17568 return SDValue();
17569}
17570
17571/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17572/// lowers them. As with the vector shift intrinsics, this is done during DAG
17573/// combining instead of DAG legalizing because the build_vectors for 64-bit
17574/// vector element shift counts are generally not legal, and it is hard to see
17575/// their values after they get legalized to loads from a constant pool.
17576static SDValue PerformShiftCombine(SDNode *N,
17577                                   TargetLowering::DAGCombinerInfo &DCI,
17578                                   const ARMSubtarget *ST) {
17579 SelectionDAG &DAG = DCI.DAG;
17580 EVT VT = N->getValueType(0);
17581
17582 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17583 N->getOperand(0)->getOpcode() == ISD::AND &&
17584 N->getOperand(0)->hasOneUse()) {
17585 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17586 return SDValue();
17587 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17588 // usually show up because instcombine prefers to canonicalize it to
17589 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17590 // out of GEP lowering in some cases.
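    // Worked example: with AndMask = 0x3f (MaskedBits = 26) and ShiftAmt = 4,
    // (shl (and x, 0x3f), 4) becomes (srl (shl x, 26), 22), which needs no
    // materialized mask constant.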
17591 SDValue N0 = N->getOperand(0);
17592 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17593 if (!ShiftAmtNode)
17594 return SDValue();
17595 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17596 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17597 if (!AndMaskNode)
17598 return SDValue();
17599 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17600 // Don't transform uxtb/uxth.
17601 if (AndMask == 255 || AndMask == 65535)
17602 return SDValue();
17603 if (isMask_32(AndMask)) {
17604 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17605 if (MaskedBits > ShiftAmt) {
17606 SDLoc DL(N);
17607 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17608 DAG.getConstant(MaskedBits, DL, MVT::i32));
17609 return DAG.getNode(
17610 ISD::SRL, DL, MVT::i32, SHL,
17611 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17612 }
17613 }
17614 }
17615
17616 // Nothing to be done for scalar shifts.
17617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17618 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17619 return SDValue();
17620 if (ST->hasMVEIntegerOps())
17621 return SDValue();
17622
17623 int64_t Cnt;
17624
17625 switch (N->getOpcode()) {
17626 default: llvm_unreachable("unexpected shift opcode");
17627
17628 case ISD::SHL:
17629 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17630 SDLoc dl(N);
17631 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17632 DAG.getConstant(Cnt, dl, MVT::i32));
17633 }
17634 break;
17635
17636 case ISD::SRA:
17637 case ISD::SRL:
17638 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17639 unsigned VShiftOpc =
17640 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17641 SDLoc dl(N);
17642 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17643 DAG.getConstant(Cnt, dl, MVT::i32));
17644 }
17645 }
17646 return SDValue();
17647}
17648
17649// Look for a sign/zero/fp extend of a larger-than-legal load. This can be
17650// split into multiple extending loads, which are simpler to deal with than an
17651// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17652// to convert the type to an f32.
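// For example, a (sext (v8i8 load)) to v8i32 becomes two v4i8->v4i32
// sextloads at byte offsets 0 and 4 that are then concatenated; the f16->f32
// case instead loads integer halves and converts each one with a VCVTL.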
17653static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
17654  SDValue N0 = N->getOperand(0);
17655 if (N0.getOpcode() != ISD::LOAD)
17656 return SDValue();
17657  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
17658  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17659 LD->getExtensionType() != ISD::NON_EXTLOAD)
17660 return SDValue();
17661 EVT FromVT = LD->getValueType(0);
17662 EVT ToVT = N->getValueType(0);
17663 if (!ToVT.isVector())
17664 return SDValue();
17666 EVT ToEltVT = ToVT.getVectorElementType();
17667 EVT FromEltVT = FromVT.getVectorElementType();
17668
17669 unsigned NumElements = 0;
17670 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17671 NumElements = 4;
17672 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17673 NumElements = 4;
17674 if (NumElements == 0 ||
17675 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17676 FromVT.getVectorNumElements() % NumElements != 0 ||
17677 !isPowerOf2_32(NumElements))
17678 return SDValue();
17679
17680 LLVMContext &C = *DAG.getContext();
17681 SDLoc DL(LD);
17682 // Details about the old load
17683 SDValue Ch = LD->getChain();
17684 SDValue BasePtr = LD->getBasePtr();
17685 Align Alignment = LD->getBaseAlign();
17686 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17687 AAMDNodes AAInfo = LD->getAAInfo();
17688
17689 ISD::LoadExtType NewExtType =
17690 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17691 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17692 EVT NewFromVT = EVT::getVectorVT(
17693 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17694 EVT NewToVT = EVT::getVectorVT(
17695 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17696
17697  SmallVector<SDValue, 4> Loads;
17698  SmallVector<SDValue, 4> Chains;
17699  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17700 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17701 SDValue NewPtr =
17702 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17703
17704 SDValue NewLoad =
17705 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17706 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17707 Alignment, MMOFlags, AAInfo);
17708 Loads.push_back(NewLoad);
17709 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17710 }
17711
17712  // Float truncs need to be extended with VCVTBs into their floating point types.
17713  if (FromEltVT == MVT::f16) {
17714    SmallVector<SDValue, 4> Extends;
17715
17716 for (unsigned i = 0; i < Loads.size(); i++) {
17717 SDValue LoadBC =
17718 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17719 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17720 DAG.getConstant(0, DL, MVT::i32));
17721 Extends.push_back(FPExt);
17722 }
17723
17724 Loads = Extends;
17725 }
17726
17727 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17728 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
17729 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
17730}
17731
17732/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
17733/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
17734static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
17735                                    const ARMSubtarget *ST) {
17736 SDValue N0 = N->getOperand(0);
17737
17738 // Check for sign- and zero-extensions of vector extract operations of 8- and
17739 // 16-bit vector elements. NEON and MVE support these directly. They are
17740 // handled during DAG combining because type legalization will promote them
17741 // to 32-bit types and it is messy to recognize the operations after that.
17742 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
17743      N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
17744    SDValue Vec = N0.getOperand(0);
17745 SDValue Lane = N0.getOperand(1);
17746 EVT VT = N->getValueType(0);
17747 EVT EltVT = N0.getValueType();
17748 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17749
17750 if (VT == MVT::i32 &&
17751 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
17752 TLI.isTypeLegal(Vec.getValueType()) &&
17753 isa<ConstantSDNode>(Lane)) {
17754
17755 unsigned Opc = 0;
17756 switch (N->getOpcode()) {
17757 default: llvm_unreachable("unexpected opcode");
17758 case ISD::SIGN_EXTEND:
17759 Opc = ARMISD::VGETLANEs;
17760 break;
17761 case ISD::ZERO_EXTEND:
17762 case ISD::ANY_EXTEND:
17763 Opc = ARMISD::VGETLANEu;
17764 break;
17765 }
17766 return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
17767 }
17768 }
17769
17770 if (ST->hasMVEIntegerOps())
17771 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17772 return NewLoad;
17773
17774 return SDValue();
17775}
17776
17777static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
17778                                      const ARMSubtarget *ST) {
17779 if (ST->hasMVEFloatOps())
17780 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
17781 return NewLoad;
17782
17783 return SDValue();
17784}
17785
17786// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
17787// constant bounds.
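// For example, (smin (smax x, -128), 127) becomes (ARMISD::SSAT x, 7) and
// (smin (smax x, 0), 255) becomes (ARMISD::USAT x, 8), where the operand is
// the number of trailing ones in the upper bound.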
17788static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG,
17789                                         const ARMSubtarget *Subtarget) {
17790 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
17791 !Subtarget->isThumb2())
17792 return SDValue();
17793
17794 EVT VT = Op.getValueType();
17795 SDValue Op0 = Op.getOperand(0);
17796
17797 if (VT != MVT::i32 ||
17798 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
17799 !isa<ConstantSDNode>(Op.getOperand(1)) ||
17800      !isa<ConstantSDNode>(Op0.getOperand(1)))
17801    return SDValue();
17802
17803 SDValue Min = Op;
17804 SDValue Max = Op0;
17805 SDValue Input = Op0.getOperand(0);
17806 if (Min.getOpcode() == ISD::SMAX)
17807 std::swap(Min, Max);
17808
17809 APInt MinC = Min.getConstantOperandAPInt(1);
17810 APInt MaxC = Max.getConstantOperandAPInt(1);
17811
17812 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX ||
17813 !(MinC + 1).isPowerOf2())
17814 return SDValue();
17815
17816 SDLoc DL(Op);
17817 if (MinC == ~MaxC)
17818 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
17819 DAG.getConstant(MinC.countr_one(), DL, VT));
17820 if (MaxC == 0)
17821 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
17822 DAG.getConstant(MinC.countr_one(), DL, VT));
17823
17824 return SDValue();
17825}
17826
17827/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
17828/// saturates.
17829static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
17830                                    const ARMSubtarget *ST) {
17831 EVT VT = N->getValueType(0);
17832 SDValue N0 = N->getOperand(0);
17833
17834 if (VT == MVT::i32)
17835 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
17836
17837 if (!ST->hasMVEIntegerOps())
17838 return SDValue();
17839
17840 if (SDValue V = PerformVQDMULHCombine(N, DAG))
17841 return V;
17842
17843 if (VT != MVT::v4i32 && VT != MVT::v8i16)
17844 return SDValue();
17845
17846 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
17847 // Check one is a smin and the other is a smax
17848 if (Min->getOpcode() != ISD::SMIN)
17849 std::swap(Min, Max);
17850 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
17851 return false;
17852
17853 APInt SaturateC;
17854 if (VT == MVT::v4i32)
17855 SaturateC = APInt(32, (1 << 15) - 1, true);
17856 else //if (VT == MVT::v8i16)
17857 SaturateC = APInt(16, (1 << 7) - 1, true);
17858
17859 APInt MinC, MaxC;
17860 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17861 MinC != SaturateC)
17862 return false;
17863 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
17864 MaxC != ~SaturateC)
17865 return false;
17866 return true;
17867 };
17868
17869 if (IsSignedSaturate(N, N0.getNode())) {
17870 SDLoc DL(N);
17871 MVT ExtVT, HalfVT;
17872 if (VT == MVT::v4i32) {
17873 HalfVT = MVT::v8i16;
17874 ExtVT = MVT::v4i16;
17875 } else { // if (VT == MVT::v8i16)
17876 HalfVT = MVT::v16i8;
17877 ExtVT = MVT::v8i8;
17878 }
17879
17880 // Create a VQMOVNB with undef top lanes, then signed extended into the top
17881 // half. That extend will hopefully be removed if only the bottom bits are
17882    // demanded (through a truncating store, for example).
17883 SDValue VQMOVN =
17884 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
17885 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
17886 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17887 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
17888 DAG.getValueType(ExtVT));
17889 }
17890
17891 auto IsUnsignedSaturate = [&](SDNode *Min) {
17892 // For unsigned, we just need to check for <= 0xffff
17893 if (Min->getOpcode() != ISD::UMIN)
17894 return false;
17895
17896 APInt SaturateC;
17897 if (VT == MVT::v4i32)
17898 SaturateC = APInt(32, (1 << 16) - 1, true);
17899 else //if (VT == MVT::v8i16)
17900 SaturateC = APInt(16, (1 << 8) - 1, true);
17901
17902 APInt MinC;
17903 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
17904 MinC != SaturateC)
17905 return false;
17906 return true;
17907 };
17908
17909 if (IsUnsignedSaturate(N)) {
17910 SDLoc DL(N);
17911 MVT HalfVT;
17912 unsigned ExtConst;
17913 if (VT == MVT::v4i32) {
17914 HalfVT = MVT::v8i16;
17915 ExtConst = 0x0000FFFF;
17916 } else { //if (VT == MVT::v8i16)
17917 HalfVT = MVT::v16i8;
17918 ExtConst = 0x00FF;
17919 }
17920
17921 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
17922 // an AND. That extend will hopefully be removed if only the bottom bits are
17923    // demanded (through a truncating store, for example).
17924 SDValue VQMOVN =
17925 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
17926 DAG.getConstant(0, DL, MVT::i32));
17927 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
17928 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
17929 DAG.getConstant(ExtConst, DL, VT));
17930 }
17931
17932 return SDValue();
17933}
17934
17935static const APInt *isPowerOf2Constant(SDValue V) {
17936  const auto *C = dyn_cast<ConstantSDNode>(V);
17937  if (!C)
17938 return nullptr;
17939 const APInt *CV = &C->getAPIntValue();
17940 return CV->isPowerOf2() ? CV : nullptr;
17941}
17942
17943static SDValue PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) {
17944  // If we have a CMOV, OR and AND combination such as:
17945 // if (x & CN)
17946 // y |= CM;
17947 //
17948 // And:
17949 // * CN is a single bit;
17950 // * All bits covered by CM are known zero in y
17951 //
17952 // Then we can convert this into a sequence of BFI instructions. This will
17953 // always be a win if CM is a single bit, will always be no worse than the
17954 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
17955 // three bits (due to the extra IT instruction).
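  // For example, with CN = 0x8 and CM = 0x6 (and bits 1-2 of y known zero),
  // x is first shifted right by 3 and the resulting bit is then BFI'd into
  // bit 1 and bit 2 of y, one BFI per set bit of CM.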
17956
17957 SDValue Op0 = CMOV->getOperand(0);
17958 SDValue Op1 = CMOV->getOperand(1);
17959 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
17960 SDValue CmpZ = CMOV->getOperand(3);
17961
17962 // The compare must be against zero.
17963 if (!isNullConstant(CmpZ->getOperand(1)))
17964 return SDValue();
17965
17966 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
17967 SDValue And = CmpZ->getOperand(0);
17968 if (And->getOpcode() != ISD::AND)
17969 return SDValue();
17970 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
17971 if (!AndC)
17972 return SDValue();
17973 SDValue X = And->getOperand(0);
17974
17975 if (CC == ARMCC::EQ) {
17976 // We're performing an "equal to zero" compare. Swap the operands so we
17977 // canonicalize on a "not equal to zero" compare.
17978 std::swap(Op0, Op1);
17979 } else {
17980 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
17981 }
17982
17983 if (Op1->getOpcode() != ISD::OR)
17984 return SDValue();
17985
17986  ConstantSDNode *OrC = dyn_cast<ConstantSDNode>(Op1->getOperand(1));
17987  if (!OrC)
17988 return SDValue();
17989 SDValue Y = Op1->getOperand(0);
17990
17991 if (Op0 != Y)
17992 return SDValue();
17993
17994 // Now, is it profitable to continue?
17995 APInt OrCI = OrC->getAPIntValue();
17996 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
17997 if (OrCI.popcount() > Heuristic)
17998 return SDValue();
17999
18000 // Lastly, can we determine that the bits defined by OrCI
18001 // are zero in Y?
18002 KnownBits Known = DAG.computeKnownBits(Y);
18003 if ((OrCI & Known.Zero) != OrCI)
18004 return SDValue();
18005
18006 // OK, we can do the combine.
18007 SDValue V = Y;
18008 SDLoc dl(X);
18009 EVT VT = X.getValueType();
18010 unsigned BitInX = AndC->logBase2();
18011
18012 if (BitInX != 0) {
18013 // We must shift X first.
18014 X = DAG.getNode(ISD::SRL, dl, VT, X,
18015 DAG.getConstant(BitInX, dl, VT));
18016 }
18017
18018 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18019 BitInY < NumActiveBits; ++BitInY) {
18020 if (OrCI[BitInY] == 0)
18021 continue;
18022 APInt Mask(VT.getSizeInBits(), 0);
18023 Mask.setBit(BitInY);
18024 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18025 // Confusingly, the operand is an *inverted* mask.
18026 DAG.getConstant(~Mask, dl, VT));
18027 }
18028
18029 return V;
18030}
18031
18032// Given N, the value controlling the conditional branch, search for the loop
18033// intrinsic, returning it, along with how the value is used. We need to handle
18034// patterns such as the following:
18035// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18036// (brcond (setcc (loop.decrement), 0, eq), exit)
18037// (brcond (setcc (loop.decrement), 0, ne), header)
18038static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm,
18039                                   bool &Negate) {
18040 switch (N->getOpcode()) {
18041 default:
18042 break;
18043 case ISD::XOR: {
18044 if (!isa<ConstantSDNode>(N.getOperand(1)))
18045 return SDValue();
18046 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18047 return SDValue();
18048 Negate = !Negate;
18049 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18050 }
18051 case ISD::SETCC: {
18052 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18053 if (!Const)
18054 return SDValue();
18055 if (Const->isZero())
18056 Imm = 0;
18057 else if (Const->isOne())
18058 Imm = 1;
18059 else
18060 return SDValue();
18061 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18062 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18063 }
18064  case ISD::INTRINSIC_W_CHAIN: {
18065    unsigned IntOp = N.getConstantOperandVal(1);
18066 if (IntOp != Intrinsic::test_start_loop_iterations &&
18067 IntOp != Intrinsic::loop_decrement_reg)
18068 return SDValue();
18069 return N;
18070 }
18071 }
18072 return SDValue();
18073}
18074
18075static SDValue PerformHWLoopCombine(SDNode *N,
18076                                    TargetLowering::DAGCombinerInfo &DCI,
18077                                    const ARMSubtarget *ST) {
18078
18079  // The hwloop intrinsics that we're interested in are used for control-flow,
18080  // either for entering or exiting the loop:
18081  // - test.start.loop.iterations will test whether its operand is zero. If it
18082  //   is zero, the following branch should not enter the loop.
18083  // - loop.decrement.reg also tests whether its operand is zero. If it is
18084  //   zero, the following branch should not branch back to the beginning of
18085  //   the loop.
18086  // So here, we need to check how the brcond is using the result of each
18087 // of the intrinsics to ensure that we're branching to the right place at the
18088 // right time.
18089
18090 ISD::CondCode CC;
18091 SDValue Cond;
18092 int Imm = 1;
18093 bool Negate = false;
18094 SDValue Chain = N->getOperand(0);
18095 SDValue Dest;
18096
18097 if (N->getOpcode() == ISD::BRCOND) {
18098 CC = ISD::SETEQ;
18099 Cond = N->getOperand(1);
18100 Dest = N->getOperand(2);
18101 } else {
18102 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18103 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18104 Cond = N->getOperand(2);
18105 Dest = N->getOperand(4);
18106 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18107 if (!Const->isOne() && !Const->isZero())
18108 return SDValue();
18109 Imm = Const->getZExtValue();
18110 } else
18111 return SDValue();
18112 }
18113
18114 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18115 if (!Int)
18116 return SDValue();
18117
18118 if (Negate)
18119 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18120
18121 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18122 return (CC == ISD::SETEQ && Imm == 0) ||
18123 (CC == ISD::SETNE && Imm == 1) ||
18124 (CC == ISD::SETLT && Imm == 1) ||
18125 (CC == ISD::SETULT && Imm == 1);
18126 };
18127
18128 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18129 return (CC == ISD::SETEQ && Imm == 1) ||
18130 (CC == ISD::SETNE && Imm == 0) ||
18131 (CC == ISD::SETGT && Imm == 0) ||
18132 (CC == ISD::SETUGT && Imm == 0) ||
18133 (CC == ISD::SETGE && Imm == 1) ||
18134 (CC == ISD::SETUGE && Imm == 1);
18135 };
18136
18137 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18138 "unsupported condition");
18139
18140 SDLoc dl(Int);
18141 SelectionDAG &DAG = DCI.DAG;
18142 SDValue Elements = Int.getOperand(2);
18143 unsigned IntOp = Int->getConstantOperandVal(1);
18144 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18145 "expected single br user");
18146 SDNode *Br = *N->user_begin();
18147 SDValue OtherTarget = Br->getOperand(1);
18148
18149 // Update the unconditional branch to branch to the given Dest.
18150 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18151 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18152 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18153 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18154 };
18155
18156 if (IntOp == Intrinsic::test_start_loop_iterations) {
18157 SDValue Res;
18158 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18159 // We expect this 'instruction' to branch when the counter is zero.
18160 if (IsTrueIfZero(CC, Imm)) {
18161 SDValue Ops[] = {Chain, Setup, Dest};
18162 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18163 } else {
18164 // The logic is the reverse of what we need for WLS, so find the other
18165      // basic block target: the target of the following br.
18166 UpdateUncondBr(Br, Dest, DAG);
18167
18168 SDValue Ops[] = {Chain, Setup, OtherTarget};
18169 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18170 }
18171 // Update LR count to the new value
18172 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18173 // Update chain
18174 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18175 return Res;
18176 } else {
18177 SDValue Size =
18178 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18179 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18180 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18181 DAG.getVTList(MVT::i32, MVT::Other), Args);
18182 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18183
18184 // We expect this instruction to branch when the count is not zero.
18185 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18186
18187 // Update the unconditional branch to target the loop preheader if we've
18188 // found the condition has been reversed.
18189 if (Target == OtherTarget)
18190 UpdateUncondBr(Br, Dest, DAG);
18191
18192 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18193 SDValue(LoopDec.getNode(), 1), Chain);
18194
18195 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18196 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18197 }
18198 return SDValue();
18199}
18200
18201/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
18202SDValue
18203ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
18204  SDValue Cmp = N->getOperand(3);
18205 if (Cmp.getOpcode() != ARMISD::CMPZ)
18206 // Only looking at NE cases.
18207 return SDValue();
18208
18209 SDLoc dl(N);
18210 SDValue LHS = Cmp.getOperand(0);
18211 SDValue RHS = Cmp.getOperand(1);
18212 SDValue Chain = N->getOperand(0);
18213 SDValue BB = N->getOperand(1);
18214  SDValue ARMcc = N->getOperand(2);
18215  ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18216
18217 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18218 // -> (brcond Chain BB CC Flags)
18219 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18220 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18221 LHS->getOperand(0)->hasOneUse() &&
18222 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18223 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18224 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18225 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18226 LHS->getOperand(0)->getOperand(2),
18227 LHS->getOperand(0)->getOperand(3));
18228 }
18229
18230 return SDValue();
18231}
18232
18233/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
18234SDValue
18235ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
18236  SDValue Cmp = N->getOperand(3);
18237 if (Cmp.getOpcode() != ARMISD::CMPZ)
18238 // Only looking at EQ and NE cases.
18239 return SDValue();
18240
18241 EVT VT = N->getValueType(0);
18242 SDLoc dl(N);
18243 SDValue LHS = Cmp.getOperand(0);
18244 SDValue RHS = Cmp.getOperand(1);
18245 SDValue FalseVal = N->getOperand(0);
18246 SDValue TrueVal = N->getOperand(1);
18247  SDValue ARMcc = N->getOperand(2);
18248  ARMCC::CondCodes CC = (ARMCC::CondCodes)ARMcc->getAsZExtVal();
18249
18250 // BFI is only available on V6T2+.
18251 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
18252    SDValue R = PerformCMOVToBFICombine(N, DAG);
18253    if (R)
18254 return R;
18255 }
18256
18257 // Simplify
18258 // mov r1, r0
18259 // cmp r1, x
18260 // mov r0, y
18261 // moveq r0, x
18262 // to
18263 // cmp r0, x
18264 // movne r0, y
18265 //
18266 // mov r1, r0
18267 // cmp r1, x
18268 // mov r0, x
18269 // movne r0, y
18270 // to
18271 // cmp r0, x
18272 // movne r0, y
18273 /// FIXME: Turn this into a target neutral optimization?
18274 SDValue Res;
18275 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18276 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18277 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
18278 SDValue ARMcc;
18279 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18280 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18281 }
18282
18283 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18284 // -> (cmov F T CC Flags)
18285 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18286 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18287 isNullConstant(RHS)) {
18288 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18289 LHS->getOperand(2), LHS->getOperand(3));
18290 }
18291
18292 if (!VT.isInteger())
18293 return SDValue();
18294
18295  // Fold away an unnecessary CMPZ/CMOV
18296 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18297 // if C1==EQ -> CMOV A, B, C2, D
18298 // if C1==NE -> CMOV A, B, NOT(C2), D
18299 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18300 N->getConstantOperandVal(2) == ARMCC::NE) {
18301    ARMCC::CondCodes Cond;
18302    if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18303 if (N->getConstantOperandVal(2) == ARMCC::NE)
18304        Cond = ARMCC::getOppositeCondition(Cond);
18305      return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18306 N->getOperand(1),
18307 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18308 }
18309 }
18310
18311 // Materialize a boolean comparison for integers so we can avoid branching.
18312 if (isNullConstant(FalseVal)) {
18313 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18314 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18315 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18316 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18317 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
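        // For example, if x == y the subtraction is 0 and CLZ gives 32
        // (0b100000), so the SRL by 5 yields 1; any nonzero difference has
        // CLZ <= 31, so the shift yields 0.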
18318 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18319 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18320 DAG.getConstant(5, dl, MVT::i32));
18321 } else {
18322 // CMOV 0, 1, ==, (CMPZ x, y) ->
18323 // (UADDO_CARRY (SUB x, y), t:0, t:1)
18324 // where t = (USUBO_CARRY 0, (SUB x, y), 0)
18325 //
18326 // The USUBO_CARRY computes 0 - (x - y) and this will give a borrow when
18327 // x != y. In other words, a carry C == 1 when x == y, C == 0
18328 // otherwise.
18329 // The final UADDO_CARRY computes
18330 // x - y + (0 - (x - y)) + C == C
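        // For example, if x == y then Sub == 0, 0 - 0 produces no borrow, so
        // C == 1 and the result is 0 + 0 + 1 == 1; if x != y the borrow makes
        // C == 0 and Sub + (0 - Sub) + 0 wraps back to 0.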
18331 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18332 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18333 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18334 // ISD::USUBO_CARRY returns a borrow but we want the carry here
18335 // actually.
18336 SDValue Carry =
18337 DAG.getNode(ISD::SUB, dl, MVT::i32,
18338 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18339 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18340 }
18341 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18342 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18343 // This seems pointless but will allow us to combine it further below.
18344 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18345 SDValue Sub =
18346 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18347 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18348 Sub.getValue(1));
18349 FalseVal = Sub;
18350 }
18351 } else if (isNullConstant(TrueVal)) {
18352 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18353 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18354      // This seems pointless but will allow us to combine it further below.
18355      // Note that we change == to != as this is the dual of the case above.
18356 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18357 SDValue Sub =
18358 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18359 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18360 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18361 Sub.getValue(1));
18362 FalseVal = Sub;
18363 }
18364 }
18365
18366 // On Thumb1, the DAG above may be further combined if z is a power of 2
18367 // (z == 2 ^ K).
18368 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18369 // t1 = (USUBO (SUB x, y), 1)
18370 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18371 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18372 //
18373 // This also handles the special case of comparing against zero; it's
18374 // essentially, the same pattern, except there's no SUBC:
18375 // CMOV x, z, !=, (CMPZ x, 0) ->
18376 // t1 = (USUBO x, 1)
18377 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18378 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18379 const APInt *TrueConst;
18380 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18381 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18382 FalseVal.getOperand(1) == RHS) ||
18383 (FalseVal == LHS && isNullConstant(RHS))) &&
18384 (TrueConst = isPowerOf2Constant(TrueVal))) {
18385 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18386 unsigned ShiftAmount = TrueConst->logBase2();
18387 if (ShiftAmount)
18388 TrueVal = DAG.getConstant(1, dl, VT);
18389 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18390 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18391 Subc.getValue(1));
18392
18393 if (ShiftAmount)
18394 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18395 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18396 }
18397
18398 if (Res.getNode()) {
18399 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18400 // Capture demanded bits information that would be otherwise lost.
18401 if (Known.Zero == 0xfffffffe)
18402 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18403 DAG.getValueType(MVT::i1));
18404 else if (Known.Zero == 0xffffff00)
18405 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18406 DAG.getValueType(MVT::i8));
18407 else if (Known.Zero == 0xffff0000)
18408 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18409 DAG.getValueType(MVT::i16));
18410 }
18411
18412 return Res;
18413}
18414
18415static SDValue PerformBITCASTCombine(SDNode *N,
18416                                     TargetLowering::DAGCombinerInfo &DCI,
18417                                     const ARMSubtarget *ST) {
18418 SelectionDAG &DAG = DCI.DAG;
18419 SDValue Src = N->getOperand(0);
18420 EVT DstVT = N->getValueType(0);
18421
18422 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18423 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18424 EVT SrcVT = Src.getValueType();
18425 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18426 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18427 }
18428
18429 // We may have a bitcast of something that has already had this bitcast
18430 // combine performed on it, so skip past any VECTOR_REG_CASTs.
18431 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18432 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18433 Src.getValueType().getScalarSizeInBits())
18434 Src = Src.getOperand(0);
18435
18436 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18437 // would be generated is at least the width of the element type.
18438 EVT SrcVT = Src.getValueType();
18439 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18440 Src.getOpcode() == ARMISD::VMVNIMM ||
18441 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18442 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18443 DAG.getDataLayout().isBigEndian())
18444 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18445
18446 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18447 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18448 return R;
18449
18450 return SDValue();
18451}
18452
18453// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18454// node into stack operations after legalizeOps.
18455static SDValue PerformMVETruncCombine(SDNode *N,
18456                                      TargetLowering::DAGCombinerInfo &DCI) {
18457  SelectionDAG &DAG = DCI.DAG;
18458 EVT VT = N->getValueType(0);
18459 SDLoc DL(N);
18460
18461 // MVETrunc(Undef, Undef) -> Undef
18462 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18463 return DAG.getUNDEF(VT);
18464
18465 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18466 if (N->getNumOperands() == 2 &&
18467 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18468 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18469 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18470 N->getOperand(0).getOperand(1),
18471 N->getOperand(1).getOperand(0),
18472 N->getOperand(1).getOperand(1));
18473
18474 // MVETrunc(shuffle, shuffle) -> VMOVN
18475 if (N->getNumOperands() == 2 &&
18476 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18477 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18478 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18479 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18480
18481 if (S0->getOperand(0) == S1->getOperand(0) &&
18482 S0->getOperand(1) == S1->getOperand(1)) {
18483 // Construct complete shuffle mask
18484 SmallVector<int, 8> Mask(S0->getMask());
18485 Mask.append(S1->getMask().begin(), S1->getMask().end());
18486
18487 if (isVMOVNTruncMask(Mask, VT, false))
18488 return DAG.getNode(
18489 ARMISD::VMOVN, DL, VT,
18490 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18491 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18492 DAG.getConstant(1, DL, MVT::i32));
18493 if (isVMOVNTruncMask(Mask, VT, true))
18494 return DAG.getNode(
18495 ARMISD::VMOVN, DL, VT,
18496 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18497 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18498 DAG.getConstant(1, DL, MVT::i32));
18499 }
18500 }
18501
18502 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18503 // truncate to a buildvector to allow the generic optimisations to kick in.
18504 if (all_of(N->ops(), [](SDValue Op) {
18505 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18506 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18507 (Op.getOpcode() == ISD::BITCAST &&
18508 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18509 })) {
18510 SmallVector<SDValue, 8> Extracts;
18511 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18512 SDValue O = N->getOperand(Op);
18513 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18514 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18515 DAG.getConstant(i, DL, MVT::i32));
18516 Extracts.push_back(Ext);
18517 }
18518 }
18519 return DAG.getBuildVector(VT, DL, Extracts);
18520 }
18521
18522 // If we are late in the legalization process and nothing has optimised
18523 // the trunc to anything better, lower it to a stack store and reload,
18524 // performing the truncation whilst keeping the lanes in the correct order:
18525 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
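  // For example, a v8i16 MVETRUNC of two v4i32 operands truncating-stores
  // each operand as v4i16 at stack offsets 0 and 8, then reloads the packed
  // result with a single v8i16 load.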
18526 if (!DCI.isAfterLegalizeDAG())
18527 return SDValue();
18528
18529 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18530 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18531 int NumIns = N->getNumOperands();
18532 assert((NumIns == 2 || NumIns == 4) &&
18533 "Expected 2 or 4 inputs to an MVETrunc");
18534 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18535 if (N->getNumOperands() == 4)
18536 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18537
18538 SmallVector<SDValue> Chains;
18539 for (int I = 0; I < NumIns; I++) {
18540 SDValue Ptr = DAG.getNode(
18541 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18542 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
18543    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18544        DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18545 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18546 Ptr, MPI, StoreVT, Align(4));
18547 Chains.push_back(Ch);
18548 }
18549
18550 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18551 MachinePointerInfo MPI =
18552      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18553  return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18554}
18555
18556// Take a MVEEXT(load x) and split that into (extload x, extload x+8)
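// For example, an MVESEXT of a v16i8 load producing two v8i16 results becomes
// two v8i8->v8i16 sextloads at byte offsets 0 and 8, merged back into the
// original value pair.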
18557static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N,
18558                                                    SelectionDAG &DAG) {
18559 SDValue N0 = N->getOperand(0);
18560  LoadSDNode *LD = dyn_cast<LoadSDNode>(N0.getNode());
18561  if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18562 return SDValue();
18563
18564 EVT FromVT = LD->getMemoryVT();
18565 EVT ToVT = N->getValueType(0);
18566 if (!ToVT.isVector())
18567 return SDValue();
18568 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18569 EVT ToEltVT = ToVT.getVectorElementType();
18570 EVT FromEltVT = FromVT.getVectorElementType();
18571
18572 unsigned NumElements = 0;
18573 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18574 NumElements = 4;
18575 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18576 NumElements = 8;
18577 assert(NumElements != 0);
18578
18579 ISD::LoadExtType NewExtType =
18580 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18581 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18582 LD->getExtensionType() != ISD::EXTLOAD &&
18583 LD->getExtensionType() != NewExtType)
18584 return SDValue();
18585
18586 LLVMContext &C = *DAG.getContext();
18587 SDLoc DL(LD);
18588 // Details about the old load
18589 SDValue Ch = LD->getChain();
18590 SDValue BasePtr = LD->getBasePtr();
18591 Align Alignment = LD->getBaseAlign();
18592 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18593 AAMDNodes AAInfo = LD->getAAInfo();
18594
18595 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18596 EVT NewFromVT = EVT::getVectorVT(
18597 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18598 EVT NewToVT = EVT::getVectorVT(
18599 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18600
18601  SmallVector<SDValue, 4> Loads;
18602  SmallVector<SDValue, 4> Chains;
18603  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18604 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18605 SDValue NewPtr =
18606 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18607
18608 SDValue NewLoad =
18609 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18610 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18611 Alignment, MMOFlags, AAInfo);
18612 Loads.push_back(NewLoad);
18613 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18614 }
18615
18616 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18617 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18618 return DAG.getMergeValues(Loads, DL);
18619}
18620
18621// Perform combines for MVEEXT. If it has not been optimized to anything better
18622// before lowering, it gets converted to a stack store and extloads performing the
18623// extend whilst still keeping the same lane ordering.
18624static SDValue PerformMVEExtCombine(SDNode *N,
18625                                    TargetLowering::DAGCombinerInfo &DCI) {
18626  SelectionDAG &DAG = DCI.DAG;
18627 EVT VT = N->getValueType(0);
18628 SDLoc DL(N);
18629 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18630 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18631
18632 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18633 *DAG.getContext());
18634 auto Extend = [&](SDValue V) {
18635 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18636 return N->getOpcode() == ARMISD::MVESEXT
18637 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18638 DAG.getValueType(ExtVT))
18639 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18640 };
18641
18642 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
18643 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18644 SDValue Ext = Extend(N->getOperand(0));
18645 return DAG.getMergeValues({Ext, Ext}, DL);
18646 }
18647
18648 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18649 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18650 ArrayRef<int> Mask = SVN->getMask();
18651 assert(Mask.size() == 2 * VT.getVectorNumElements());
18652 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18653 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18654 SDValue Op0 = SVN->getOperand(0);
18655 SDValue Op1 = SVN->getOperand(1);
18656
18657 auto CheckInregMask = [&](int Start, int Offset) {
18658 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18659 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18660 return false;
18661 return true;
18662 };
18663 SDValue V0 = SDValue(N, 0);
18664 SDValue V1 = SDValue(N, 1);
18665 if (CheckInregMask(0, 0))
18666 V0 = Extend(Op0);
18667 else if (CheckInregMask(0, 1))
18668 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18669 else if (CheckInregMask(0, Mask.size()))
18670 V0 = Extend(Op1);
18671 else if (CheckInregMask(0, Mask.size() + 1))
18672 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18673
18674 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
18675 V1 = Extend(Op1);
18676 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
18677 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
18678 else if (CheckInregMask(VT.getVectorNumElements(), 0))
18679 V1 = Extend(Op0);
18680 else if (CheckInregMask(VT.getVectorNumElements(), 1))
18681 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
18682
18683 if (V0.getNode() != N || V1.getNode() != N)
18684 return DAG.getMergeValues({V0, V1}, DL);
18685 }
18686
18687 // MVEEXT(load) -> extload, extload
18688 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
18689    if (SDValue L = PerformSplittingMVEEXTToWideningLoad(N, DAG))
18690      return L;
18691
18692 if (!DCI.isAfterLegalizeDAG())
18693 return SDValue();
18694
18695 // Lower to a stack store and reload:
18696 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
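  // For example, an MVESEXT of a v8i16 input to two v4i32 results stores the
  // whole v8i16 once, then performs two v4i16->v4i32 sextloads at stack
  // offsets 0 and 8.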
18697 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18698 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18699 int NumOuts = N->getNumValues();
18700 assert((NumOuts == 2 || NumOuts == 4) &&
18701 "Expected 2 or 4 outputs to an MVEEXT");
18702 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18703 *DAG.getContext());
18704 if (N->getNumOperands() == 4)
18705 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
18706
18707 MachinePointerInfo MPI =
18708      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI, 0);
18709  SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
18710 StackPtr, MPI, Align(4));
18711
18712  SmallVector<SDValue> Loads;
18713  for (int I = 0; I < NumOuts; I++) {
18714 SDValue Ptr = DAG.getNode(
18715 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18716 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
18717    MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(
18718        DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
18719 SDValue Load = DAG.getExtLoad(
18720 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
18721 VT, Chain, Ptr, MPI, LoadVT, Align(4));
18722 Loads.push_back(Load);
18723 }
18724
18725 return DAG.getMergeValues(Loads, DL);
18726}
18727
18728SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
18729                                             DAGCombinerInfo &DCI) const {
18730 switch (N->getOpcode()) {
18731 default: break;
18732 case ISD::SELECT_CC:
18733 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
18734 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
18735 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
18736 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
18737 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
18738 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
18739 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
18740 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
18741 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
18742 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
18743 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
18744 case ISD::BRCOND:
18745 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
18746 case ARMISD::ADDC:
18747 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
18748 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
18749 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
18750 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
18751 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
18752 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
18753 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
18754 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
18755 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
18756 case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
18757 case ISD::EXTRACT_VECTOR_ELT:
18758 return PerformExtractEltCombine(N, DCI, Subtarget);
18759 case ISD::SIGN_EXTEND_INREG: return PerformSignExtendInregCombine(N, DCI.DAG);
18760 case ISD::INSERT_SUBVECTOR: return PerformInsertSubvectorCombine(N, DCI);
18761 case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
18762 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
18763 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
18764 case ISD::FP_TO_SINT:
18765 case ISD::FP_TO_UINT:
18766 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
18767 case ISD::FADD:
18768 return PerformFADDCombine(N, DCI.DAG, Subtarget);
18769 case ISD::FMUL:
18770 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
18771 case ISD::INTRINSIC_WO_CHAIN:
18772 return PerformIntrinsicCombine(N, DCI);
18773 case ISD::SHL:
18774 case ISD::SRA:
18775 case ISD::SRL:
18776 return PerformShiftCombine(N, DCI, Subtarget);
18777 case ISD::SIGN_EXTEND:
18778 case ISD::ZERO_EXTEND:
18779 case ISD::ANY_EXTEND:
18780 return PerformExtendCombine(N, DCI.DAG, Subtarget);
18781 case ISD::FP_EXTEND:
18782 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
18783 case ISD::SMIN:
18784 case ISD::UMIN:
18785 case ISD::SMAX:
18786 case ISD::UMAX:
18787 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
18788 case ARMISD::CMOV:
18789 return PerformCMOVCombine(N, DCI.DAG);
18790 case ARMISD::BRCOND:
18791 return PerformBRCONDCombine(N, DCI.DAG);
18792 case ARMISD::CMPZ:
18793 return PerformCMPZCombine(N, DCI.DAG);
18794 case ARMISD::CSINC:
18795 case ARMISD::CSINV:
18796 case ARMISD::CSNEG:
18797 return PerformCSETCombine(N, DCI.DAG);
18798 case ISD::LOAD:
18799 return PerformLOADCombine(N, DCI, Subtarget);
18800 case ARMISD::VLD1DUP:
18801 case ARMISD::VLD2DUP:
18802 case ARMISD::VLD3DUP:
18803 case ARMISD::VLD4DUP:
18804 return PerformVLDCombine(N, DCI);
18805 case ARMISD::BUILD_VECTOR:
18806 return PerformARMBUILD_VECTORCombine(N, DCI);
18807 case ISD::BITCAST:
18808 return PerformBITCASTCombine(N, DCI, Subtarget);
18809 case ARMISD::PREDICATE_CAST:
18810 return PerformPREDICATE_CASTCombine(N, DCI);
18811 case ARMISD::VECTOR_REG_CAST:
18812 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
18813 case ARMISD::MVETRUNC:
18814 return PerformMVETruncCombine(N, DCI);
18815 case ARMISD::MVESEXT:
18816 case ARMISD::MVEZEXT:
18817 return PerformMVEExtCombine(N, DCI);
18818 case ARMISD::VCMP:
18819 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
18820 case ISD::VECREDUCE_ADD:
18821 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
18822 case ARMISD::VADDVs:
18823 case ARMISD::VADDVu:
18824 case ARMISD::VADDLVs:
18825 case ARMISD::VADDLVu:
18826 case ARMISD::VADDLVAs:
18827 case ARMISD::VADDLVAu:
18828 case ARMISD::VMLAVs:
18829 case ARMISD::VMLAVu:
18830 case ARMISD::VMLALVs:
18831 case ARMISD::VMLALVu:
18832 case ARMISD::VMLALVAs:
18833 case ARMISD::VMLALVAu:
18834 return PerformReduceShuffleCombine(N, DCI.DAG);
18835 case ARMISD::VMOVN:
18836 return PerformVMOVNCombine(N, DCI);
18837 case ARMISD::VQMOVNs:
18838 case ARMISD::VQMOVNu:
18839 return PerformVQMOVNCombine(N, DCI);
18840 case ARMISD::VQDMULH:
18841 return PerformVQDMULHCombine(N, DCI);
18842 case ARMISD::ASRL:
18843 case ARMISD::LSRL:
18844 case ARMISD::LSLL:
18845 return PerformLongShiftCombine(N, DCI.DAG);
18846 case ARMISD::SMULWB: {
18847 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18848 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18849 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18850 return SDValue();
18851 break;
18852 }
18853 case ARMISD::SMULWT: {
18854 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18855 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18856 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
18857 return SDValue();
18858 break;
18859 }
18860 case ARMISD::SMLALBB:
18861 case ARMISD::QADD16b:
18862 case ARMISD::QSUB16b:
18863 case ARMISD::UQADD16b:
18864 case ARMISD::UQSUB16b: {
18865 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18866 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
18867 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18868 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18869 return SDValue();
18870 break;
18871 }
18872 case ARMISD::SMLALBT: {
18873 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
18874 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18875 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
18876 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18877 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
18878 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
18879 return SDValue();
18880 break;
18881 }
18882 case ARMISD::SMLALTB: {
18883 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
18884 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
18885 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
18886 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
18887 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
18888 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
18889 return SDValue();
18890 break;
18891 }
18892 case ARMISD::SMLALTT: {
18893 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18894 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
18895 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18896 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18897 return SDValue();
18898 break;
18899 }
18900 case ARMISD::QADD8b:
18901 case ARMISD::QSUB8b:
18902 case ARMISD::UQADD8b:
18903 case ARMISD::UQSUB8b: {
18904 unsigned BitWidth = N->getValueType(0).getSizeInBits();
18905 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
18906 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
18907 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
18908 return SDValue();
18909 break;
18910 }
18911 case ARMISD::VBSP:
18912 if (N->getOperand(1) == N->getOperand(2))
18913 return N->getOperand(1);
18914 return SDValue();
18915 case ISD::INTRINSIC_VOID:
18916 case ISD::INTRINSIC_W_CHAIN:
18917 switch (N->getConstantOperandVal(1)) {
18918 case Intrinsic::arm_neon_vld1:
18919 case Intrinsic::arm_neon_vld1x2:
18920 case Intrinsic::arm_neon_vld1x3:
18921 case Intrinsic::arm_neon_vld1x4:
18922 case Intrinsic::arm_neon_vld2:
18923 case Intrinsic::arm_neon_vld3:
18924 case Intrinsic::arm_neon_vld4:
18925 case Intrinsic::arm_neon_vld2lane:
18926 case Intrinsic::arm_neon_vld3lane:
18927 case Intrinsic::arm_neon_vld4lane:
18928 case Intrinsic::arm_neon_vld2dup:
18929 case Intrinsic::arm_neon_vld3dup:
18930 case Intrinsic::arm_neon_vld4dup:
18931 case Intrinsic::arm_neon_vst1:
18932 case Intrinsic::arm_neon_vst1x2:
18933 case Intrinsic::arm_neon_vst1x3:
18934 case Intrinsic::arm_neon_vst1x4:
18935 case Intrinsic::arm_neon_vst2:
18936 case Intrinsic::arm_neon_vst3:
18937 case Intrinsic::arm_neon_vst4:
18938 case Intrinsic::arm_neon_vst2lane:
18939 case Intrinsic::arm_neon_vst3lane:
18940 case Intrinsic::arm_neon_vst4lane:
18941 return PerformVLDCombine(N, DCI);
18942 case Intrinsic::arm_mve_vld2q:
18943 case Intrinsic::arm_mve_vld4q:
18944 case Intrinsic::arm_mve_vst2q:
18945 case Intrinsic::arm_mve_vst4q:
18946 return PerformMVEVLDCombine(N, DCI);
18947 default: break;
18948 }
18949 break;
18950 }
18951 return SDValue();
18952}
18953
18954 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
18955 EVT VT) const {
18956 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
18957}
18958
18959 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
18960 Align Alignment,
18961 MachineMemOperand::Flags,
18962 unsigned *Fast) const {
18963 // Depends what it gets converted into if the type is weird.
18964 if (!VT.isSimple())
18965 return false;
18966
18967 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
18968 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
18969 auto Ty = VT.getSimpleVT().SimpleTy;
18970
18971 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
18972 // Unaligned access can use (for example) LDRB, LDRH, LDR
18973 if (AllowsUnaligned) {
18974 if (Fast)
18975 *Fast = Subtarget->hasV7Ops();
18976 return true;
18977 }
18978 }
18979
18980 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
18981 // For any little-endian targets with neon, we can support unaligned ld/st
18982 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
18983 // A big-endian target may also explicitly support unaligned accesses
18984 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
18985 if (Fast)
18986 *Fast = 1;
18987 return true;
18988 }
18989 }
18990
18991 if (!Subtarget->hasMVEIntegerOps())
18992 return false;
18993
18994 // These are for predicates
18995 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
18996 Ty == MVT::v2i1)) {
18997 if (Fast)
18998 *Fast = 1;
18999 return true;
19000 }
19001
19002 // These are for truncated stores/narrowing loads. They are fine so long as
19003 // the alignment is at least the size of the item being loaded
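// For example, a v4i8 or v8i8 access only needs byte alignment here, while a
// v4i16 access needs at least 2-byte alignment.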
19004 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19005 Alignment >= VT.getScalarSizeInBits() / 8) {
19006 if (Fast)
19007 *Fast = true;
19008 return true;
19009 }
19010
19011 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19012 // VSTRW.U32 all store the vector register in exactly the same format, and
19013 // differ only in the range of their immediate offset field and the required
19014 // alignment. So there is always a store that can be used, regardless of
19015 // actual type.
19016 //
19017 // For big endian, that is not the case. But we can still emit a (VSTRB.U8;
19018 // VREV64.8) pair and get the same effect. This will likely be better than
19019 // aligning the vector through the stack.
19020 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19021 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19022 Ty == MVT::v2f64) {
19023 if (Fast)
19024 *Fast = 1;
19025 return true;
19026 }
19027
19028 return false;
19029}
19030
19031 EVT ARMTargetLowering::getOptimalMemOpType(
19032 LLVMContext &Context, const MemOp &Op,
19033 const AttributeList &FuncAttributes) const {
19034 // See if we can use NEON instructions for this...
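// For example, a memcpy of 16+ bytes can usually be done with 128-bit (v2f64)
// NEON loads/stores when suitably aligned (or when misaligned accesses are
// cheap), falling back to 64-bit (f64) accesses below.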
19035 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19036 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19037 unsigned Fast;
19038 if (Op.size() >= 16 &&
19039 (Op.isAligned(Align(16)) ||
19040 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19041 MachineMemOperand::MONone, &Fast) &&
19042 Fast))) {
19043 return MVT::v2f64;
19044 } else if (Op.size() >= 8 &&
19045 (Op.isAligned(Align(8)) ||
19046 (allowsMisalignedMemoryAccesses(
19047 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19048 Fast))) {
19049 return MVT::f64;
19050 }
19051 }
19052
19053 // Let the target-independent logic figure it out.
19054 return MVT::Other;
19055}
19056
19057// 64-bit integers are split into their high and low parts and held in two
19058// different registers, so the trunc is free since the low register can just
19059// be used.
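// For example, truncating an i64 held in a GPR pair down to i32 just uses the
// register holding the low word; no instruction is needed.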
19060bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19061 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19062 return false;
19063 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19064 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19065 return (SrcBits == 64 && DestBits == 32);
19066}
19067
19068 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
19069 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19070 !DstVT.isInteger())
19071 return false;
19072 unsigned SrcBits = SrcVT.getSizeInBits();
19073 unsigned DestBits = DstVT.getSizeInBits();
19074 return (SrcBits == 64 && DestBits == 32);
19075}
19076
19077 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
19078 if (Val.getOpcode() != ISD::LOAD)
19079 return false;
19080
19081 EVT VT1 = Val.getValueType();
19082 if (!VT1.isSimple() || !VT1.isInteger() ||
19083 !VT2.isSimple() || !VT2.isInteger())
19084 return false;
19085
19086 switch (VT1.getSimpleVT().SimpleTy) {
19087 default: break;
19088 case MVT::i1:
19089 case MVT::i8:
19090 case MVT::i16:
19091 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19092 return true;
19093 }
19094
19095 return false;
19096}
19097
19098 bool ARMTargetLowering::isFNegFree(EVT VT) const {
19099 if (!VT.isSimple())
19100 return false;
19101
19102 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19103 // negate values directly (fneg is free). So, we don't want to let the DAG
19104 // combiner rewrite fneg into xors and some other instructions. For f16 and
19105 // FullFP16 argument passing, some bitcast nodes may be introduced,
19106 // triggering this DAG combine rewrite, so we are avoiding that with this.
19107 switch (VT.getSimpleVT().SimpleTy) {
19108 default: break;
19109 case MVT::f16:
19110 return Subtarget->hasFullFP16();
19111 }
19112
19113 return false;
19114}
19115
19116 Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
19117 if (!Subtarget->hasMVEIntegerOps())
19118 return nullptr;
19119 Type *SVIType = SVI->getType();
19120 Type *ScalarType = SVIType->getScalarType();
19121
19122 if (ScalarType->isFloatTy())
19123 return Type::getInt32Ty(SVIType->getContext());
19124 if (ScalarType->isHalfTy())
19125 return Type::getInt16Ty(SVIType->getContext());
19126 return nullptr;
19127}
19128
19129 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
19130 EVT VT = ExtVal.getValueType();
19131
19132 if (!isTypeLegal(VT))
19133 return false;
19134
19135 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19136 if (Ld->isExpandingLoad())
19137 return false;
19138 }
19139
19140 if (Subtarget->hasMVEIntegerOps())
19141 return true;
19142
19143 // Don't create a loadext if we can fold the extension into a wide/long
19144 // instruction.
19145 // If there's more than one user instruction, the loadext is desirable no
19146 // matter what. There can be two uses by the same instruction.
19147 if (ExtVal->use_empty() ||
19148 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19149 return true;
19150
19151 SDNode *U = *ExtVal->user_begin();
19152 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19153 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19154 return false;
19155
19156 return true;
19157}
19158
19159 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
19160 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19161 return false;
19162
19163 if (!isTypeLegal(EVT::getEVT(Ty1)))
19164 return false;
19165
19166 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19167
19168 // Assuming the caller doesn't have a zeroext or signext return parameter,
19169 // truncation all the way down to i1 is valid.
19170 return true;
19171}
19172
19173/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19174/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19175/// expanded to FMAs when this method returns true, otherwise fmuladd is
19176/// expanded to fmul + fadd.
19177///
19178/// ARM supports both fused and unfused multiply-add operations; we already
19179/// lower a pair of fmul and fadd to the latter so it's not clear that there
19180/// would be a gain or that the gain would be worthwhile enough to risk
19181/// correctness bugs.
19182///
19183/// For MVE, we set this to true as it helps simplify the need for some
19184/// patterns (and we don't have the non-fused floating point instruction).
19185bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19186 EVT VT) const {
19187 if (Subtarget->useSoftFloat())
19188 return false;
19189
19190 if (!VT.isSimple())
19191 return false;
19192
19193 switch (VT.getSimpleVT().SimpleTy) {
19194 case MVT::v4f32:
19195 case MVT::v8f16:
19196 return Subtarget->hasMVEFloatOps();
19197 case MVT::f16:
19198 return Subtarget->useFPVFMx16();
19199 case MVT::f32:
19200 return Subtarget->useFPVFMx();
19201 case MVT::f64:
19202 return Subtarget->useFPVFMx64();
19203 default:
19204 break;
19205 }
19206
19207 return false;
19208}
19209
19210static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19211 if (V < 0)
19212 return false;
19213
19214 unsigned Scale = 1;
19215 switch (VT.getSimpleVT().SimpleTy) {
19216 case MVT::i1:
19217 case MVT::i8:
19218 // Scale == 1;
19219 break;
19220 case MVT::i16:
19221 // Scale == 2;
19222 Scale = 2;
19223 break;
19224 default:
19225 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19226 // Scale == 4;
19227 Scale = 4;
19228 break;
19229 }
19230
19231 if ((V & (Scale - 1)) != 0)
19232 return false;
19233 return isUInt<5>(V / Scale);
19234}
19235
19236static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19237 const ARMSubtarget *Subtarget) {
19238 if (!VT.isInteger() && !VT.isFloatingPoint())
19239 return false;
19240 if (VT.isVector() && Subtarget->hasNEON())
19241 return false;
19242 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19243 !Subtarget->hasMVEFloatOps())
19244 return false;
19245
19246 bool IsNeg = false;
19247 if (V < 0) {
19248 IsNeg = true;
19249 V = -V;
19250 }
19251
19252 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19253
19254 // MVE: size * imm7
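// For example, with 32-bit elements the offset must be a multiple of 4 within
// roughly +/-508 (a 7-bit immediate scaled by the element size).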
19255 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19256 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19257 case MVT::i32:
19258 case MVT::f32:
19259 return isShiftedUInt<7,2>(V);
19260 case MVT::i16:
19261 case MVT::f16:
19262 return isShiftedUInt<7,1>(V);
19263 case MVT::i8:
19264 return isUInt<7>(V);
19265 default:
19266 return false;
19267 }
19268 }
19269
19270 // half VLDR: 2 * imm8
19271 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19272 return isShiftedUInt<8, 1>(V);
19273 // VLDR and LDRD: 4 * imm8
19274 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19275 return isShiftedUInt<8, 2>(V);
19276
19277 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19278 // + imm12 or - imm8
19279 if (IsNeg)
19280 return isUInt<8>(V);
19281 return isUInt<12>(V);
19282 }
19283
19284 return false;
19285}
19286
19287/// isLegalAddressImmediate - Return true if the integer value can be used
19288/// as the offset of the target addressing mode for load / store of the
19289/// given type.
19290static bool isLegalAddressImmediate(int64_t V, EVT VT,
19291 const ARMSubtarget *Subtarget) {
19292 if (V == 0)
19293 return true;
19294
19295 if (!VT.isSimple())
19296 return false;
19297
19298 if (Subtarget->isThumb1Only())
19299 return isLegalT1AddressImmediate(V, VT);
19300 else if (Subtarget->isThumb2())
19301 return isLegalT2AddressImmediate(V, VT, Subtarget);
19302
19303 // ARM mode.
19304 if (V < 0)
19305 V = - V;
19306 switch (VT.getSimpleVT().SimpleTy) {
19307 default: return false;
19308 case MVT::i1:
19309 case MVT::i8:
19310 case MVT::i32:
19311 // +- imm12
19312 return isUInt<12>(V);
19313 case MVT::i16:
19314 // +- imm8
19315 return isUInt<8>(V);
19316 case MVT::f32:
19317 case MVT::f64:
19318 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19319 return false;
19320 return isShiftedUInt<8, 2>(V);
19321 }
19322}
19323
19324 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
19325 EVT VT) const {
19326 int Scale = AM.Scale;
19327 if (Scale < 0)
19328 return false;
19329
19330 switch (VT.getSimpleVT().SimpleTy) {
19331 default: return false;
19332 case MVT::i1:
19333 case MVT::i8:
19334 case MVT::i16:
19335 case MVT::i32:
19336 if (Scale == 1)
19337 return true;
19338 // r + r << imm
19339 Scale = Scale & ~1;
19340 return Scale == 2 || Scale == 4 || Scale == 8;
19341 case MVT::i64:
19342 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19343 // version in Thumb mode.
19344 // r + r
19345 if (Scale == 1)
19346 return true;
19347 // r * 2 (this can be lowered to r + r).
19348 if (!AM.HasBaseReg && Scale == 2)
19349 return true;
19350 return false;
19351 case MVT::isVoid:
19352 // Note, we allow "void" uses (basically, uses that aren't loads or
19353 // stores), because arm allows folding a scale into many arithmetic
19354 // operations. This should be made more precise and revisited later.
19355
19356 // Allow r << imm, but the imm has to be a multiple of two.
19357 if (Scale & 1) return false;
19358 return isPowerOf2_32(Scale);
19359 }
19360}
19361
19362 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
19363 EVT VT) const {
19364 const int Scale = AM.Scale;
19365
19366 // Negative scales are not supported in Thumb1.
19367 if (Scale < 0)
19368 return false;
19369
19370 // Thumb1 addressing modes do not support register scaling, except in the
19371 // following cases:
19372 // 1. Scale == 1 means no scaling.
19373 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19374 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19375}
19376
19377/// isLegalAddressingMode - Return true if the addressing mode represented
19378/// by AM is legal for this target, for a load/store of the specified type.
19379 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
19380 const AddrMode &AM, Type *Ty,
19381 unsigned AS, Instruction *I) const {
19382 EVT VT = getValueType(DL, Ty, true);
19383 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19384 return false;
19385
19386 // Can never fold addr of global into load/store.
19387 if (AM.BaseGV)
19388 return false;
19389
19390 switch (AM.Scale) {
19391 case 0: // no scale reg, must be "r+i" or "r", or "i".
19392 break;
19393 default:
19394 // ARM doesn't support any R+R*scale+imm addr modes.
19395 if (AM.BaseOffs)
19396 return false;
19397
19398 if (!VT.isSimple())
19399 return false;
19400
19401 if (Subtarget->isThumb1Only())
19402 return isLegalT1ScaledAddressingMode(AM, VT);
19403
19404 if (Subtarget->isThumb2())
19405 return isLegalT2ScaledAddressingMode(AM, VT);
19406
19407 int Scale = AM.Scale;
19408 switch (VT.getSimpleVT().SimpleTy) {
19409 default: return false;
19410 case MVT::i1:
19411 case MVT::i8:
19412 case MVT::i32:
19413 if (Scale < 0) Scale = -Scale;
19414 if (Scale == 1)
19415 return true;
19416 // r + r << imm
19417 return isPowerOf2_32(Scale & ~1);
19418 case MVT::i16:
19419 case MVT::i64:
19420 // r +/- r
19421 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19422 return true;
19423 // r * 2 (this can be lowered to r + r).
19424 if (!AM.HasBaseReg && Scale == 2)
19425 return true;
19426 return false;
19427
19428 case MVT::isVoid:
19429 // Note, we allow "void" uses (basically, uses that aren't loads or
19430 // stores), because arm allows folding a scale into many arithmetic
19431 // operations. This should be made more precise and revisited later.
19432
19433 // Allow r << imm, but the imm has to be a multiple of two.
19434 if (Scale & 1) return false;
19435 return isPowerOf2_32(Scale);
19436 }
19437 }
19438 return true;
19439}
19440
19441/// isLegalICmpImmediate - Return true if the specified immediate is legal
19442/// icmp immediate, that is the target has icmp instructions which can compare
19443/// a register against the immediate without having to materialize the
19444/// immediate into a register.
19445 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
19446 // Thumb2 and ARM modes can use cmn for negative immediates.
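// For example, a compare against -2 has no encodable immediate, but the
// equivalent cmn rN, #2 does.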
19447 if (!Subtarget->isThumb())
19448 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19449 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19450 if (Subtarget->isThumb2())
19451 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19452 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19453 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19454 return Imm >= 0 && Imm <= 255;
19455}
19456
19457/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19458/// *or sub* immediate, that is the target has add or sub instructions which can
19459/// add a register with the immediate without having to materialize the
19460/// immediate into a register.
19461 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
19462 // Same encoding for add/sub, just flip the sign.
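// For example, add r0, r0, #-8 can simply be emitted as sub r0, r0, #8.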
19463 uint64_t AbsImm = AbsoluteValue(Imm);
19464 if (!Subtarget->isThumb())
19465 return ARM_AM::getSOImmVal(AbsImm) != -1;
19466 if (Subtarget->isThumb2())
19467 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19468 // Thumb1 only has 8-bit unsigned immediate.
19469 return AbsImm <= 255;
19470}
19471
19472// Return false to prevent folding
19473// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19474// if the folding leads to worse code.
19475 bool ARMTargetLowering::isMulAddWithConstProfitable(SDValue AddNode,
19476 SDValue ConstNode) const {
19477 // Let the DAGCombiner decide for vector types and large types.
19478 const EVT VT = AddNode.getValueType();
19479 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19480 return true;
19481
19482 // It is worse if c0 is legal add immediate, while c1*c0 is not
19483 // and has to be composed by at least two instructions.
19484 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19485 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19486 const int64_t C0 = C0Node->getSExtValue();
19487 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
19488 if (!isLegalAddImmediate(C0) || isLegalAddImmediate(CA.getSExtValue()))
19489 return true;
19490 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19491 return false;
19492
19493 // Default to true and let the DAGCombiner decide.
19494 return true;
19495}
19496
19497 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
19498 bool isSEXTLoad, SDValue &Base,
19499 SDValue &Offset, bool &isInc,
19500 SelectionDAG &DAG) {
19501 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19502 return false;
19503
19504 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19505 // AddressingMode 3
19506 Base = Ptr->getOperand(0);
19507 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19508 int RHSC = (int)RHS->getZExtValue();
19509 if (RHSC < 0 && RHSC > -256) {
19510 assert(Ptr->getOpcode() == ISD::ADD);
19511 isInc = false;
19512 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19513 return true;
19514 }
19515 }
19516 isInc = (Ptr->getOpcode() == ISD::ADD);
19517 Offset = Ptr->getOperand(1);
19518 return true;
19519 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19520 // AddressingMode 2
19521 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19522 int RHSC = (int)RHS->getZExtValue();
19523 if (RHSC < 0 && RHSC > -0x1000) {
19524 assert(Ptr->getOpcode() == ISD::ADD);
19525 isInc = false;
19526 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19527 Base = Ptr->getOperand(0);
19528 return true;
19529 }
19530 }
19531
19532 if (Ptr->getOpcode() == ISD::ADD) {
19533 isInc = true;
19534 ARM_AM::ShiftOpc ShOpcVal=
19535 ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode());
19536 if (ShOpcVal != ARM_AM::no_shift) {
19537 Base = Ptr->getOperand(1);
19538 Offset = Ptr->getOperand(0);
19539 } else {
19540 Base = Ptr->getOperand(0);
19541 Offset = Ptr->getOperand(1);
19542 }
19543 return true;
19544 }
19545
19546 isInc = (Ptr->getOpcode() == ISD::ADD);
19547 Base = Ptr->getOperand(0);
19548 Offset = Ptr->getOperand(1);
19549 return true;
19550 }
19551
19552 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19553 return false;
19554}
19555
19556 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
19557 bool isSEXTLoad, SDValue &Base,
19558 SDValue &Offset, bool &isInc,
19559 SelectionDAG &DAG) {
19560 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19561 return false;
19562
19563 Base = Ptr->getOperand(0);
19564 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
19565 int RHSC = (int)RHS->getZExtValue();
19566 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19567 assert(Ptr->getOpcode() == ISD::ADD);
19568 isInc = false;
19569 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19570 return true;
19571 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19572 isInc = Ptr->getOpcode() == ISD::ADD;
19573 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19574 return true;
19575 }
19576 }
19577
19578 return false;
19579}
19580
19581static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19582 bool isSEXTLoad, bool IsMasked, bool isLE,
19583 SDValue &Base, SDValue &Offset,
19584 bool &isInc, SelectionDAG &DAG) {
19585 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19586 return false;
19587 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19588 return false;
19589
19590 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19591 // as opposed to a vldrw.32). This can allow extra addressing modes or
19592 // alignments for what is otherwise an equivalent instruction.
19593 bool CanChangeType = isLE && !IsMasked;
19594
19596 int RHSC = (int)RHS->getZExtValue();
19597
19598 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19599 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19600 assert(Ptr->getOpcode() == ISD::ADD);
19601 isInc = false;
19602 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19603 return true;
19604 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19605 isInc = Ptr->getOpcode() == ISD::ADD;
19606 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19607 return true;
19608 }
19609 return false;
19610 };
19611
19612 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19613 // (in BE/masked) type.
19614 Base = Ptr->getOperand(0);
19615 if (VT == MVT::v4i16) {
19616 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19617 return true;
19618 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19619 if (IsInRange(RHSC, 0x80, 1))
19620 return true;
19621 } else if (Alignment >= 4 &&
19622 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19623 IsInRange(RHSC, 0x80, 4))
19624 return true;
19625 else if (Alignment >= 2 &&
19626 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19627 IsInRange(RHSC, 0x80, 2))
19628 return true;
19629 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19630 return true;
19631 return false;
19632}
19633
19634/// getPreIndexedAddressParts - returns true by value, base pointer and
19635/// offset pointer and addressing mode by reference if the node's address
19636/// can be legally represented as pre-indexed load / store address.
19637bool
19638 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
19639 SDValue &Offset,
19640 ISD::MemIndexedMode &AM,
19641 SelectionDAG &DAG) const {
19642 if (Subtarget->isThumb1Only())
19643 return false;
19644
19645 EVT VT;
19646 SDValue Ptr;
19647 Align Alignment;
19648 bool isSEXTLoad = false;
19649 bool IsMasked = false;
19650 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19651 Ptr = LD->getBasePtr();
19652 VT = LD->getMemoryVT();
19653 Alignment = LD->getAlign();
19654 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19655 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19656 Ptr = ST->getBasePtr();
19657 VT = ST->getMemoryVT();
19658 Alignment = ST->getAlign();
19659 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19660 Ptr = LD->getBasePtr();
19661 VT = LD->getMemoryVT();
19662 Alignment = LD->getAlign();
19663 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19664 IsMasked = true;
19665 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19666 Ptr = ST->getBasePtr();
19667 VT = ST->getMemoryVT();
19668 Alignment = ST->getAlign();
19669 IsMasked = true;
19670 } else
19671 return false;
19672
19673 bool isInc;
19674 bool isLegal = false;
19675 if (VT.isVector())
19676 isLegal = Subtarget->hasMVEIntegerOps() &&
19677 getMVEIndexedAddressParts(
19678 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
19679 Subtarget->isLittle(), Base, Offset, isInc, DAG);
19680 else {
19681 if (Subtarget->isThumb2())
19682 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19683 Offset, isInc, DAG);
19684 else
19685 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
19686 Offset, isInc, DAG);
19687 }
19688 if (!isLegal)
19689 return false;
19690
19691 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
19692 return true;
19693}
19694
19695/// getPostIndexedAddressParts - returns true by value, base pointer and
19696/// offset pointer and addressing mode by reference if this node can be
19697/// combined with a load / store to form a post-indexed load / store.
19698 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
19699 SDValue &Base,
19700 SDValue &Offset,
19701 ISD::MemIndexedMode &AM,
19702 SelectionDAG &DAG) const {
19703 EVT VT;
19704 SDValue Ptr;
19705 Align Alignment;
19706 bool isSEXTLoad = false, isNonExt;
19707 bool IsMasked = false;
19708 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19709 VT = LD->getMemoryVT();
19710 Ptr = LD->getBasePtr();
19711 Alignment = LD->getAlign();
19712 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19713 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19714 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19715 VT = ST->getMemoryVT();
19716 Ptr = ST->getBasePtr();
19717 Alignment = ST->getAlign();
19718 isNonExt = !ST->isTruncatingStore();
19719 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
19720 VT = LD->getMemoryVT();
19721 Ptr = LD->getBasePtr();
19722 Alignment = LD->getAlign();
19723 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19724 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
19725 IsMasked = true;
19726 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
19727 VT = ST->getMemoryVT();
19728 Ptr = ST->getBasePtr();
19729 Alignment = ST->getAlign();
19730 isNonExt = !ST->isTruncatingStore();
19731 IsMasked = true;
19732 } else
19733 return false;
19734
19735 if (Subtarget->isThumb1Only()) {
19736 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
19737 // must be non-extending/truncating, i32, with an offset of 4.
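// For example, ldr r0, [r1] followed by add r1, r1, #4 can be folded into
// ldm r1!, {r0}.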
19738 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
19739 if (Op->getOpcode() != ISD::ADD || !isNonExt)
19740 return false;
19741 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
19742 if (!RHS || RHS->getZExtValue() != 4)
19743 return false;
19744 if (Alignment < Align(4))
19745 return false;
19746
19747 Offset = Op->getOperand(1);
19748 Base = Op->getOperand(0);
19749 AM = ISD::POST_INC;
19750 return true;
19751 }
19752
19753 bool isInc;
19754 bool isLegal = false;
19755 if (VT.isVector())
19756 isLegal = Subtarget->hasMVEIntegerOps() &&
19757 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
19758 Subtarget->isLittle(), Base, Offset,
19759 isInc, DAG);
19760 else {
19761 if (Subtarget->isThumb2())
19762 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19763 isInc, DAG);
19764 else
19765 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
19766 isInc, DAG);
19767 }
19768 if (!isLegal)
19769 return false;
19770
19771 if (Ptr != Base) {
19772 // Swap base ptr and offset to catch more post-index load / store when
19773 // it's legal. In Thumb2 mode, offset must be an immediate.
19774 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
19775 !Subtarget->isThumb2())
19776 std::swap(Base, Offset);
19777
19778 // Post-indexed load / store update the base pointer.
19779 if (Ptr != Base)
19780 return false;
19781 }
19782
19783 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
19784 return true;
19785}
19786
19787 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19788 KnownBits &Known,
19789 const APInt &DemandedElts,
19790 const SelectionDAG &DAG,
19791 unsigned Depth) const {
19792 unsigned BitWidth = Known.getBitWidth();
19793 Known.resetAll();
19794 switch (Op.getOpcode()) {
19795 default: break;
19796 case ARMISD::ADDC:
19797 case ARMISD::ADDE:
19798 case ARMISD::SUBC:
19799 case ARMISD::SUBE:
19800 // Special cases when we convert a carry to a boolean.
19801 if (Op.getResNo() == 0) {
19802 SDValue LHS = Op.getOperand(0);
19803 SDValue RHS = Op.getOperand(1);
19804 // (ADDE 0, 0, C) will give us a single bit.
19805 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
19806 isNullConstant(RHS)) {
19807 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
19808 return;
19809 }
19810 }
19811 break;
19812 case ARMISD::CMOV: {
19813 // Bits are known zero/one if known on the LHS and RHS.
19814 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
19815 if (Known.isUnknown())
19816 return;
19817
19818 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
19819 Known = Known.intersectWith(KnownRHS);
19820 return;
19821 }
19822 case ISD::INTRINSIC_W_CHAIN: {
19823 Intrinsic::ID IntID =
19824 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
19825 switch (IntID) {
19826 default: return;
19827 case Intrinsic::arm_ldaex:
19828 case Intrinsic::arm_ldrex: {
19829 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
19830 unsigned MemBits = VT.getScalarSizeInBits();
19831 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
19832 return;
19833 }
19834 }
19835 }
19836 case ARMISD::BFI: {
19837 // Conservatively, we can recurse down the first operand
19838 // and just mask out all affected bits.
19839 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19840
19841 // The operand to BFI is already a mask suitable for removing the bits it
19842 // sets.
19843 const APInt &Mask = Op.getConstantOperandAPInt(2);
19844 Known.Zero &= Mask;
19845 Known.One &= Mask;
19846 return;
19847 }
19848 case ARMISD::VGETLANEs:
19849 case ARMISD::VGETLANEu: {
19850 const SDValue &SrcSV = Op.getOperand(0);
19851 EVT VecVT = SrcSV.getValueType();
19852 assert(VecVT.isVector() && "VGETLANE expected a vector type");
19853 const unsigned NumSrcElts = VecVT.getVectorNumElements();
19854 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
19855 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
19856 "VGETLANE index out of bounds");
19857 unsigned Idx = Pos->getZExtValue();
19858 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
19859 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
19860
19861 EVT VT = Op.getValueType();
19862 const unsigned DstSz = VT.getScalarSizeInBits();
19863 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
19864 (void)SrcSz;
19865 assert(SrcSz == Known.getBitWidth());
19866 assert(DstSz > SrcSz);
19867 if (Op.getOpcode() == ARMISD::VGETLANEs)
19868 Known = Known.sext(DstSz);
19869 else {
19870 Known = Known.zext(DstSz);
19871 }
19872 assert(DstSz == Known.getBitWidth());
19873 break;
19874 }
19875 case ARMISD::VMOVrh: {
19876 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19877 assert(KnownOp.getBitWidth() == 16);
19878 Known = KnownOp.zext(32);
19879 break;
19880 }
19881 case ARMISD::CSINC:
19882 case ARMISD::CSINV:
19883 case ARMISD::CSNEG: {
19884 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
19885 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
19886
19887 // The result is either:
19888 // CSINC: KnownOp0 or KnownOp1 + 1
19889 // CSINV: KnownOp0 or ~KnownOp1
19890 // CSNEG: KnownOp0 or KnownOp1 * -1
19891 if (Op.getOpcode() == ARMISD::CSINC)
19892 KnownOp1 =
19893 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
19894 else if (Op.getOpcode() == ARMISD::CSINV)
19895 std::swap(KnownOp1.Zero, KnownOp1.One);
19896 else if (Op.getOpcode() == ARMISD::CSNEG)
19897 KnownOp1 = KnownBits::mul(KnownOp1,
19899 KnownBits::makeConstant(APInt::getAllOnes(32)));
19900 Known = KnownOp0.intersectWith(KnownOp1);
19901 break;
19902 }
19903 case ARMISD::VORRIMM:
19904 case ARMISD::VBICIMM: {
19905 unsigned Encoded = Op.getConstantOperandVal(1);
19906 unsigned DecEltBits = 0;
19907 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
19908
19909 unsigned EltBits = Op.getScalarValueSizeInBits();
19910 if (EltBits != DecEltBits) {
19911 // Be conservative: only update Known when EltBits == DecEltBits.
19912 // This is believed to always be true for VORRIMM/VBICIMM today, but if
19913 // that changes in the future, doing nothing here is safer than risking
19914 // subtle bugs.
19915 break;
19916 }
19917
19918 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
19919 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
19920 APInt Imm(DecEltBits, DecodedVal);
19921
19922 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
19923 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
19924 break;
19925 }
19926 }
19927}
19928
19929 bool ARMTargetLowering::targetShrinkDemandedConstant(
19930 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
19931 TargetLoweringOpt &TLO) const {
19932 // Delay optimization, so we don't have to deal with illegal types, or block
19933 // optimizations.
19934 if (!TLO.LegalOps)
19935 return false;
19936
19937 // Only optimize AND for now.
19938 if (Op.getOpcode() != ISD::AND)
19939 return false;
19940
19941 EVT VT = Op.getValueType();
19942
19943 // Ignore vectors.
19944 if (VT.isVector())
19945 return false;
19946
19947 assert(VT == MVT::i32 && "Unexpected integer type");
19948
19949 // Make sure the RHS really is a constant.
19950 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
19951 if (!C)
19952 return false;
19953
19954 unsigned Mask = C->getZExtValue();
19955
19956 unsigned Demanded = DemandedBits.getZExtValue();
19957 unsigned ShrunkMask = Mask & Demanded;
19958 unsigned ExpandedMask = Mask | ~Demanded;
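// Example: Mask == 0x1FF with Demanded == 0xFF gives ShrunkMask == 0xFF and
// ExpandedMask == 0xFFFFFFFF, so the checks below rewrite the AND to use 0xFF
// (a uxtb).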
19959
19960 // If the mask is all zeros, let the target-independent code replace the
19961 // result with zero.
19962 if (ShrunkMask == 0)
19963 return false;
19964
19965 // If the mask is all ones, erase the AND. (Currently, the target-independent
19966 // code won't do this, so we have to do it explicitly to avoid an infinite
19967 // loop in obscure cases.)
19968 if (ExpandedMask == ~0U)
19969 return TLO.CombineTo(Op, Op.getOperand(0));
19970
19971 auto IsLegalMask = [ShrunkMask, ExpandedMask](unsigned Mask) -> bool {
19972 return (ShrunkMask & Mask) == ShrunkMask && (~ExpandedMask & Mask) == 0;
19973 };
19974 auto UseMask = [Mask, Op, VT, &TLO](unsigned NewMask) -> bool {
19975 if (NewMask == Mask)
19976 return true;
19977 SDLoc DL(Op);
19978 SDValue NewC = TLO.DAG.getConstant(NewMask, DL, VT);
19979 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
19980 return TLO.CombineTo(Op, NewOp);
19981 };
19982
19983 // Prefer uxtb mask.
19984 if (IsLegalMask(0xFF))
19985 return UseMask(0xFF);
19986
19987 // Prefer uxth mask.
19988 if (IsLegalMask(0xFFFF))
19989 return UseMask(0xFFFF);
19990
19991 // [1, 255] is Thumb1 movs+ands, legal immediate for ARM/Thumb2.
19992 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19993 if (ShrunkMask < 256)
19994 return UseMask(ShrunkMask);
19995
19996 // [-256, -2] is Thumb1 movs+bics, legal immediate for ARM/Thumb2.
19997 // FIXME: Prefer a contiguous sequence of bits for other optimizations.
19998 if ((int)ExpandedMask <= -2 && (int)ExpandedMask >= -256)
19999 return UseMask(ExpandedMask);
20000
20001 // Potential improvements:
20002 //
20003 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20004 // We could try to prefer Thumb1 immediates which can be lowered to a
20005 // two-instruction sequence.
20006 // We could try to recognize more legal ARM/Thumb2 immediates here.
20007
20008 return false;
20009}
20010
20011 bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
20012 SDValue Op, const APInt &OriginalDemandedBits,
20013 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20014 unsigned Depth) const {
20015 unsigned Opc = Op.getOpcode();
20016
20017 switch (Opc) {
20018 case ARMISD::ASRL:
20019 case ARMISD::LSRL: {
20020 // If this is result 0 and the other result is unused, see if the demand
20021 // bits allow us to shrink this long shift into a standard small shift in
20022 // the opposite direction.
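// For example, for (asrl Lo, Hi, 16), if only the top 16 bits of the low
// result are demanded, they can instead be produced by (shl Hi, 16).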
20023 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20024 isa<ConstantSDNode>(Op->getOperand(2))) {
20025 unsigned ShAmt = Op->getConstantOperandVal(2);
20026 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20027 << (32 - ShAmt)))
20028 return TLO.CombineTo(
20029 Op, TLO.DAG.getNode(
20030 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20031 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20032 }
20033 break;
20034 }
20035 case ARMISD::VBICIMM: {
20036 SDValue Op0 = Op.getOperand(0);
20037 unsigned ModImm = Op.getConstantOperandVal(1);
20038 unsigned EltBits = 0;
20039 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
20040 if ((OriginalDemandedBits & Mask) == 0)
20041 return TLO.CombineTo(Op, Op0);
20042 }
20043 }
20044
20045 return TargetLowering::SimplifyDemandedBitsForTargetNode(
20046 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20047}
20048
20049//===----------------------------------------------------------------------===//
20050// ARM Inline Assembly Support
20051//===----------------------------------------------------------------------===//
20052
20053const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20054 // At this point, we have to lower this constraint to something else, so we
20055 // lower it to an "r" or "w". However, by doing this we will force the result
20056 // to be in register, while the X constraint is much more permissive.
20057 //
20058 // Although we are correct (we are free to emit anything, without
20059 // constraints), we might break use cases that would expect us to be more
20060 // efficient and emit something else.
20061 if (!Subtarget->hasVFP2Base())
20062 return "r";
20063 if (ConstraintVT.isFloatingPoint())
20064 return "w";
20065 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20066 (ConstraintVT.getSizeInBits() == 64 ||
20067 ConstraintVT.getSizeInBits() == 128))
20068 return "w";
20069
20070 return "r";
20071}
20072
20073/// getConstraintType - Given a constraint letter, return the type of
20074/// constraint it is for this target.
20075 ARMTargetLowering::ConstraintType
20076 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
20077 unsigned S = Constraint.size();
20078 if (S == 1) {
20079 switch (Constraint[0]) {
20080 default: break;
20081 case 'l': return C_RegisterClass;
20082 case 'w': return C_RegisterClass;
20083 case 'h': return C_RegisterClass;
20084 case 'x': return C_RegisterClass;
20085 case 't': return C_RegisterClass;
20086 case 'j': return C_Immediate; // Constant for movw.
20087 // An address with a single base register. Due to the way we
20088 // currently handle addresses it is the same as an 'r' memory constraint.
20089 case 'Q': return C_Memory;
20090 }
20091 } else if (S == 2) {
20092 switch (Constraint[0]) {
20093 default: break;
20094 case 'T': return C_RegisterClass;
20095 // All 'U+' constraints are addresses.
20096 case 'U': return C_Memory;
20097 }
20098 }
20099 return TargetLowering::getConstraintType(Constraint);
20100}
20101
20102/// Examine constraint type and operand type and determine a weight value.
20103/// This object must already have been set up with the operand type
20104/// and the current alternative constraint selected.
20105 TargetLowering::ConstraintWeight
20106 ARMTargetLowering::getSingleConstraintMatchWeight(
20107 AsmOperandInfo &info, const char *constraint) const {
20108 ConstraintWeight weight = CW_Invalid;
20109 Value *CallOperandVal = info.CallOperandVal;
20110 // If we don't have a value, we can't do a match,
20111 // but allow it at the lowest weight.
20112 if (!CallOperandVal)
20113 return CW_Default;
20114 Type *type = CallOperandVal->getType();
20115 // Look at the constraint type.
20116 switch (*constraint) {
20117 default:
20118 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
20119 break;
20120 case 'l':
20121 if (type->isIntegerTy()) {
20122 if (Subtarget->isThumb())
20123 weight = CW_SpecificReg;
20124 else
20125 weight = CW_Register;
20126 }
20127 break;
20128 case 'w':
20129 if (type->isFloatingPointTy())
20130 weight = CW_Register;
20131 break;
20132 }
20133 return weight;
20134}
20135
20136static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20137 if (PR == 0 || VT == MVT::Other)
20138 return false;
20139 if (ARM::SPRRegClass.contains(PR))
20140 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20141 if (ARM::DPRRegClass.contains(PR))
20142 return VT != MVT::f64 && !VT.is64BitVector();
20143 return false;
20144}
20145
20146using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20147
20148 RCPair
20149 ARMTargetLowering::getRegForInlineAsmConstraint(
20150 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20150 switch (Constraint.size()) {
20151 case 1:
20152 // GCC ARM Constraint Letters
20153 switch (Constraint[0]) {
20154 case 'l': // Low regs or general regs.
20155 if (Subtarget->isThumb())
20156 return RCPair(0U, &ARM::tGPRRegClass);
20157 return RCPair(0U, &ARM::GPRRegClass);
20158 case 'h': // High regs or no regs.
20159 if (Subtarget->isThumb())
20160 return RCPair(0U, &ARM::hGPRRegClass);
20161 break;
20162 case 'r':
20163 if (Subtarget->isThumb1Only())
20164 return RCPair(0U, &ARM::tGPRRegClass);
20165 return RCPair(0U, &ARM::GPRRegClass);
20166 case 'w':
20167 if (VT == MVT::Other)
20168 break;
20169 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20170 return RCPair(0U, &ARM::SPRRegClass);
20171 if (VT.getSizeInBits() == 64)
20172 return RCPair(0U, &ARM::DPRRegClass);
20173 if (VT.getSizeInBits() == 128)
20174 return RCPair(0U, &ARM::QPRRegClass);
20175 break;
20176 case 'x':
20177 if (VT == MVT::Other)
20178 break;
20179 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20180 return RCPair(0U, &ARM::SPR_8RegClass);
20181 if (VT.getSizeInBits() == 64)
20182 return RCPair(0U, &ARM::DPR_8RegClass);
20183 if (VT.getSizeInBits() == 128)
20184 return RCPair(0U, &ARM::QPR_8RegClass);
20185 break;
20186 case 't':
20187 if (VT == MVT::Other)
20188 break;
20189 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20190 return RCPair(0U, &ARM::SPRRegClass);
20191 if (VT.getSizeInBits() == 64)
20192 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20193 if (VT.getSizeInBits() == 128)
20194 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20195 break;
20196 }
20197 break;
20198
20199 case 2:
20200 if (Constraint[0] == 'T') {
20201 switch (Constraint[1]) {
20202 default:
20203 break;
20204 case 'e':
20205 return RCPair(0U, &ARM::tGPREvenRegClass);
20206 case 'o':
20207 return RCPair(0U, &ARM::tGPROddRegClass);
20208 }
20209 }
20210 break;
20211
20212 default:
20213 break;
20214 }
20215
20216 if (StringRef("{cc}").equals_insensitive(Constraint))
20217 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20218
20219 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20220 if (isIncompatibleReg(RCP.first, VT))
20221 return {0, nullptr};
20222 return RCP;
20223}
20224
20225/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20226/// vector. If it is invalid, don't add anything to Ops.
20227 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
20228 StringRef Constraint,
20229 std::vector<SDValue> &Ops,
20230 SelectionDAG &DAG) const {
20231 SDValue Result;
20232
20233 // Currently only support length 1 constraints.
20234 if (Constraint.size() != 1)
20235 return;
20236
20237 char ConstraintLetter = Constraint[0];
20238 switch (ConstraintLetter) {
20239 default: break;
20240 case 'j':
20241 case 'I': case 'J': case 'K': case 'L':
20242 case 'M': case 'N': case 'O':
20243 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
20244 if (!C)
20245 return;
20246
20247 int64_t CVal64 = C->getSExtValue();
20248 int CVal = (int) CVal64;
20249 // None of these constraints allow values larger than 32 bits. Check
20250 // that the value fits in an int.
20251 if (CVal != CVal64)
20252 return;
20253
20254 switch (ConstraintLetter) {
20255 case 'j':
20256 // Constant suitable for movw, must be between 0 and
20257 // 65535.
20258 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20259 if (CVal >= 0 && CVal <= 65535)
20260 break;
20261 return;
20262 case 'I':
20263 if (Subtarget->isThumb1Only()) {
20264 // This must be a constant between 0 and 255, for ADD
20265 // immediates.
20266 if (CVal >= 0 && CVal <= 255)
20267 break;
20268 } else if (Subtarget->isThumb2()) {
20269 // A constant that can be used as an immediate value in a
20270 // data-processing instruction.
20271 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20272 break;
20273 } else {
20274 // A constant that can be used as an immediate value in a
20275 // data-processing instruction.
20276 if (ARM_AM::getSOImmVal(CVal) != -1)
20277 break;
20278 }
20279 return;
20280
20281 case 'J':
20282 if (Subtarget->isThumb1Only()) {
20283 // This must be a constant between -255 and -1, for negated ADD
20284 // immediates. This can be used in GCC with an "n" modifier that
20285 // prints the negated value, for use with SUB instructions. It is
20286 // not useful otherwise but is implemented for compatibility.
20287 if (CVal >= -255 && CVal <= -1)
20288 break;
20289 } else {
20290 // This must be a constant between -4095 and 4095. This is suitable
20291 // for use as the immediate offset field in LDR and STR instructions
20292 // such as LDR r0,[r1,#offset].
20293 if (CVal >= -4095 && CVal <= 4095)
20294 break;
20295 }
20296 return;
20297
20298 case 'K':
20299 if (Subtarget->isThumb1Only()) {
20300 // A 32-bit value where only one byte has a nonzero value. Exclude
20301 // zero to match GCC. This constraint is used by GCC internally for
20302 // constants that can be loaded with a move/shift combination.
20303 // It is not useful otherwise but is implemented for compatibility.
20304 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20305 break;
20306 } else if (Subtarget->isThumb2()) {
20307 // A constant whose bitwise inverse can be used as an immediate
20308 // value in a data-processing instruction. This can be used in GCC
20309 // with a "B" modifier that prints the inverted value, for use with
20310 // BIC and MVN instructions. It is not useful otherwise but is
20311 // implemented for compatibility.
20312 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20313 break;
20314 } else {
20315 // A constant whose bitwise inverse can be used as an immediate
20316 // value in a data-processing instruction. This can be used in GCC
20317 // with a "B" modifier that prints the inverted value, for use with
20318 // BIC and MVN instructions. It is not useful otherwise but is
20319 // implemented for compatibility.
20320 if (ARM_AM::getSOImmVal(~CVal) != -1)
20321 break;
20322 }
20323 return;
20324
20325 case 'L':
20326 if (Subtarget->isThumb1Only()) {
20327 // This must be a constant between -7 and 7,
20328 // for 3-operand ADD/SUB immediate instructions.
20329 if (CVal >= -7 && CVal < 7)
20330 break;
20331 } else if (Subtarget->isThumb2()) {
20332 // A constant whose negation can be used as an immediate value in a
20333 // data-processing instruction. This can be used in GCC with an "n"
20334 // modifier that prints the negated value, for use with SUB
20335 // instructions. It is not useful otherwise but is implemented for
20336 // compatibility.
20337 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20338 break;
20339 } else {
20340 // A constant whose negation can be used as an immediate value in a
20341 // data-processing instruction. This can be used in GCC with an "n"
20342 // modifier that prints the negated value, for use with SUB
20343 // instructions. It is not useful otherwise but is implemented for
20344 // compatibility.
20345 if (ARM_AM::getSOImmVal(-CVal) != -1)
20346 break;
20347 }
20348 return;
20349
20350 case 'M':
20351 if (Subtarget->isThumb1Only()) {
20352 // This must be a multiple of 4 between 0 and 1020, for
20353 // ADD sp + immediate.
20354 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20355 break;
20356 } else {
20357 // A power of two or a constant between 0 and 32. This is used in
20358 // GCC for the shift amount on shifted register operands, but it is
20359 // useful in general for any shift amounts.
20360 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20361 break;
20362 }
20363 return;
20364
20365 case 'N':
20366 if (Subtarget->isThumb1Only()) {
20367 // This must be a constant between 0 and 31, for shift amounts.
20368 if (CVal >= 0 && CVal <= 31)
20369 break;
20370 }
20371 return;
20372
20373 case 'O':
20374 if (Subtarget->isThumb1Only()) {
20375 // This must be a multiple of 4 between -508 and 508, for
20376 // ADD/SUB sp = sp + immediate.
20377 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20378 break;
20379 }
20380 return;
20381 }
20382 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20383 break;
20384 }
20385
20386 if (Result.getNode()) {
20387 Ops.push_back(Result);
20388 return;
20389 }
20390 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20391}
20392
20393static RTLIB::Libcall getDivRemLibcall(
20394 const SDNode *N, MVT::SimpleValueType SVT) {
20395 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20396 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20397 "Unhandled Opcode in getDivRemLibcall");
20398 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20399 N->getOpcode() == ISD::SREM;
20400 RTLIB::Libcall LC;
20401 switch (SVT) {
20402 default: llvm_unreachable("Unexpected request for libcall!");
20403 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20404 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20405 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20406 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20407 }
20408 return LC;
20409}
20410
20411 static TargetLowering::ArgListTy getDivRemArgList(
20412 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20413 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20414 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20415 "Unhandled Opcode in getDivRemArgList");
20416 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20417 N->getOpcode() == ISD::SREM;
20418 TargetLowering::ArgListTy Args;
20419 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20420 EVT ArgVT = N->getOperand(i).getValueType();
20421 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20422 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20423 Entry.IsSExt = isSigned;
20424 Entry.IsZExt = !isSigned;
20425 Args.push_back(Entry);
20426 }
20427 if (Subtarget->isTargetWindows() && Args.size() >= 2)
20428 std::swap(Args[0], Args[1]);
20429 return Args;
20430}
20431
20432SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20433 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20434 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20435 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20436 "Register-based DivRem lowering only");
20437 unsigned Opcode = Op->getOpcode();
20438 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20439 "Invalid opcode for Div/Rem lowering");
20440 bool isSigned = (Opcode == ISD::SDIVREM);
20441 EVT VT = Op->getValueType(0);
20442 SDLoc dl(Op);
20443
20444 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20446 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20447 SDValue Res0 =
20448 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20449 SDValue Res1 =
20450 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20451 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20452 {Res0, Res1});
20453 }
20454 }
20455
20456 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20457
20458 // If the target has hardware divide, use divide + multiply + subtract:
20459 // div = a / b
20460 // rem = a - b * div
20461 // return {div, rem}
20462 // This should be lowered into UDIV/SDIV + MLS later on.
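  // e.g. for i32:  sdiv r2, r0, r1
  //                mls  r1, r2, r1, r0    ; rem = a - div * b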
20463 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20464 : Subtarget->hasDivideInARMMode();
20465 if (hasDivide && Op->getValueType(0).isSimple() &&
20466 Op->getSimpleValueType(0) == MVT::i32) {
20467 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20468 const SDValue Dividend = Op->getOperand(0);
20469 const SDValue Divisor = Op->getOperand(1);
20470 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20471 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20472 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20473
20474 SDValue Values[2] = {Div, Rem};
20475 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20476 }
20477
20478 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20479 VT.getSimpleVT().SimpleTy);
20480 SDValue InChain = DAG.getEntryNode();
20481
20483 DAG.getContext(),
20484 Subtarget);
20485
20488
20489 Type *RetTy = StructType::get(Ty, Ty);
20490
20491 if (Subtarget->isTargetWindows())
20492 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20493
20494 TargetLowering::CallLoweringInfo CLI(DAG);
20495 CLI.setDebugLoc(dl).setChain(InChain)
20496 .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
20498
20499 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20500 return CallInfo.first;
20501}
20502
20503 // Lowers REM using the divmod helpers;
20504 // see RTABI section 4.2/4.3.
20505SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20506 EVT VT = N->getValueType(0);
20507
20508 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20510 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20511 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20512 Result[0], Result[1]);
20513 }
20514
20515 // Build return types (div and rem)
20516 std::vector<Type*> RetTyParams;
20517 Type *RetTyElement;
20518
20519 switch (VT.getSimpleVT().SimpleTy) {
20520 default: llvm_unreachable("Unexpected request for libcall!");
20521 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20522 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20523 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20524 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20525 }
20526
20527 RetTyParams.push_back(RetTyElement);
20528 RetTyParams.push_back(RetTyElement);
20529 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20530 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20531
20532 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20533 SimpleTy);
20534 SDValue InChain = DAG.getEntryNode();
20536 Subtarget);
20537 bool isSigned = N->getOpcode() == ISD::SREM;
20540
20541 if (Subtarget->isTargetWindows())
20542 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20543
20544 // Lower call
20545 CallLoweringInfo CLI(DAG);
20546 CLI.setChain(InChain)
20547 .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args))
20549 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20550
20551 // Return second (rem) result operand (first contains div)
20552 SDNode *ResNode = CallResult.first.getNode();
20553 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20554 return ResNode->getOperand(1);
20555}
20556
20557SDValue
20558ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20559 assert(Subtarget->isTargetWindows() && "unsupported target platform");
20560 SDLoc DL(Op);
20561
20562 // Get the inputs.
20563 SDValue Chain = Op.getOperand(0);
20564 SDValue Size = Op.getOperand(1);
20565
20567 "no-stack-arg-probe")) {
20568 MaybeAlign Align =
20569 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
20570 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20571 Chain = SP.getValue(1);
20572 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
20573 if (Align)
20574 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
20575 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
20576 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
20577 SDValue Ops[2] = { SP, Chain };
20578 return DAG.getMergeValues(Ops, DL);
20579 }
20580
20581 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
20582 DAG.getConstant(2, DL, MVT::i32));
20583
20584 SDValue Glue;
20585 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
20586 Glue = Chain.getValue(1);
20587
20588 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20589 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
20590
20591 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
20592 Chain = NewSP.getValue(1);
20593
20594 SDValue Ops[2] = { NewSP, Chain };
20595 return DAG.getMergeValues(Ops, DL);
20596}
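// Note that the allocation size is passed in R4 as a count of 4-byte words;
// ARMISD::WIN__CHKSTK expands to the Windows stack-probe call sequence, after
// which the adjusted SP is read back and returned.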
20597
20598SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
20599 bool IsStrict = Op->isStrictFPOpcode();
20600 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20601 const unsigned DstSz = Op.getValueType().getSizeInBits();
20602 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
20603 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
20604 "Unexpected type for custom-lowering FP_EXTEND");
20605
20606 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20607 "With both FP DP and 16, any FP conversion is legal!");
20608
20609 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
20610 "With FP16, 16 to 32 conversion is legal!");
20611
20612 // Converting from 32 -> 64 is valid if we have FP64.
20613 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
20614 // FIXME: Remove this when we have strict fp instruction selection patterns
20615 if (IsStrict) {
20616 SDLoc Loc(Op);
20617 SDValue Result = DAG.getNode(ISD::FP_EXTEND,
20618 Loc, Op.getValueType(), SrcVal);
20619 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
20620 }
20621 return Op;
20622 }
20623
20624 // Either we are converting from 16 -> 64 without FP16 and/or
20625 // double-precision FP, or without Armv8 FP, so we must do it in two
20626 // steps.
20627 // Or we are converting from 32 -> 64 without double-precision FP, or 16 -> 32
20628 // without FP16. So we must make a function call.
20629 SDLoc Loc(Op);
20630 RTLIB::Libcall LC;
20631 MakeLibCallOptions CallOptions;
20632 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20633 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
20634 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
20635 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
20636 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
20637 if (Supported) {
20638 if (IsStrict) {
20639 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
20640 {DstVT, MVT::Other}, {Chain, SrcVal});
20641 Chain = SrcVal.getValue(1);
20642 } else {
20643 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
20644 }
20645 } else {
20646 LC = RTLIB::getFPEXT(SrcVT, DstVT);
20647 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20648 "Unexpected type for custom-lowering FP_EXTEND");
20649 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20650 Loc, Chain);
20651 }
20652 }
20653
20654 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
20655}
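// e.g. an f16 -> f64 extend without FP16 support is done as an f16 -> f32
// libcall followed by an f32 -> f64 step (a hardware extend if FP64 is
// available, otherwise another libcall).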
20656
20657SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
20658 bool IsStrict = Op->isStrictFPOpcode();
20659
20660 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
20661 EVT SrcVT = SrcVal.getValueType();
20662 EVT DstVT = Op.getValueType();
20663 const unsigned DstSz = Op.getValueType().getSizeInBits();
20664 const unsigned SrcSz = SrcVT.getSizeInBits();
20665 (void)DstSz;
20666 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
20667 "Unexpected type for custom-lowering FP_ROUND");
20668
20669 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
20670 "With both FP DP and 16, any FP conversion is legal!");
20671
20672 SDLoc Loc(Op);
20673
20674 // A 32 -> 16 conversion instruction is valid if we have FP16.
20675 if (SrcSz == 32 && Subtarget->hasFP16())
20676 return Op;
20677
20678 // Libcall for 32 -> 16, or 64 -> 32 / 64 -> 16
20679 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
20680 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
20681 "Unexpected type for custom-lowering FP_ROUND");
20682 MakeLibCallOptions CallOptions;
20683 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
20685 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
20686 Loc, Chain);
20687 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
20688}
20689
20690bool
20692 // The ARM target isn't yet aware of offsets.
20693 return false;
20694}
20695
20697 if (v == 0xffffffff)
20698 return false;
20699
20700 // There can be 1's on either or both "outsides"; all the "inside"
20701 // bits must be 0's.
20702 return isShiftedMask_32(~v);
20703}
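// e.g. 0xffff00ff and 0xff0000ff qualify (the zero bits form one contiguous
// run), whereas 0x00ff00ff does not.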
20704
20705/// isFPImmLegal - Returns true if the target can instruction select the
20706/// specified FP immediate natively. If false, the legalizer will
20707/// materialize the FP immediate as a load from a constant pool.
20709 bool ForCodeSize) const {
20710 if (!Subtarget->hasVFP3Base())
20711 return false;
20712 if (VT == MVT::f16 && Subtarget->hasFullFP16())
20713 return ARM_AM::getFP16Imm(Imm) != -1;
20714 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
20715 ARM_AM::getFP32FP16Imm(Imm) != -1)
20716 return true;
20717 if (VT == MVT::f32)
20718 return ARM_AM::getFP32Imm(Imm) != -1;
20719 if (VT == MVT::f64 && Subtarget->hasFP64())
20720 return ARM_AM::getFP64Imm(Imm) != -1;
20721 return false;
20722}
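// e.g. 1.0, 0.5 and 31.0 fit the 8-bit VMOV immediate encoding; 0.1 does not
// and would be materialized from a constant pool instead.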
20723
20724/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
20725/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
20726/// specified in the intrinsic calls.
20728 const CallInst &I,
20729 MachineFunction &MF,
20730 unsigned Intrinsic) const {
20731 switch (Intrinsic) {
20732 case Intrinsic::arm_neon_vld1:
20733 case Intrinsic::arm_neon_vld2:
20734 case Intrinsic::arm_neon_vld3:
20735 case Intrinsic::arm_neon_vld4:
20736 case Intrinsic::arm_neon_vld2lane:
20737 case Intrinsic::arm_neon_vld3lane:
20738 case Intrinsic::arm_neon_vld4lane:
20739 case Intrinsic::arm_neon_vld2dup:
20740 case Intrinsic::arm_neon_vld3dup:
20741 case Intrinsic::arm_neon_vld4dup: {
20742 Info.opc = ISD::INTRINSIC_W_CHAIN;
20743 // Conservatively set memVT to the entire set of vectors loaded.
20744 auto &DL = I.getDataLayout();
20745 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20746 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20747 Info.ptrVal = I.getArgOperand(0);
20748 Info.offset = 0;
20749 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20750 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20751 // volatile loads with NEON intrinsics not supported
20752 Info.flags = MachineMemOperand::MOLoad;
20753 return true;
20754 }
20755 case Intrinsic::arm_neon_vld1x2:
20756 case Intrinsic::arm_neon_vld1x3:
20757 case Intrinsic::arm_neon_vld1x4: {
20758 Info.opc = ISD::INTRINSIC_W_CHAIN;
20759 // Conservatively set memVT to the entire set of vectors loaded.
20760 auto &DL = I.getDataLayout();
20761 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
20762 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20763 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
20764 Info.offset = 0;
20765 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
20766 // volatile loads with NEON intrinsics not supported
20767 Info.flags = MachineMemOperand::MOLoad;
20768 return true;
20769 }
20770 case Intrinsic::arm_neon_vst1:
20771 case Intrinsic::arm_neon_vst2:
20772 case Intrinsic::arm_neon_vst3:
20773 case Intrinsic::arm_neon_vst4:
20774 case Intrinsic::arm_neon_vst2lane:
20775 case Intrinsic::arm_neon_vst3lane:
20776 case Intrinsic::arm_neon_vst4lane: {
20777 Info.opc = ISD::INTRINSIC_VOID;
20778 // Conservatively set memVT to the entire set of vectors stored.
20779 auto &DL = I.getDataLayout();
20780 unsigned NumElts = 0;
20781 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20782 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20783 if (!ArgTy->isVectorTy())
20784 break;
20785 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20786 }
20787 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20788 Info.ptrVal = I.getArgOperand(0);
20789 Info.offset = 0;
20790 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
20791 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
20792 // volatile stores with NEON intrinsics not supported
20793 Info.flags = MachineMemOperand::MOStore;
20794 return true;
20795 }
20796 case Intrinsic::arm_neon_vst1x2:
20797 case Intrinsic::arm_neon_vst1x3:
20798 case Intrinsic::arm_neon_vst1x4: {
20799 Info.opc = ISD::INTRINSIC_VOID;
20800 // Conservatively set memVT to the entire set of vectors stored.
20801 auto &DL = I.getDataLayout();
20802 unsigned NumElts = 0;
20803 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
20804 Type *ArgTy = I.getArgOperand(ArgI)->getType();
20805 if (!ArgTy->isVectorTy())
20806 break;
20807 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
20808 }
20809 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
20810 Info.ptrVal = I.getArgOperand(0);
20811 Info.offset = 0;
20812 Info.align = I.getParamAlign(0).valueOrOne();
20813 // volatile stores with NEON intrinsics not supported
20814 Info.flags = MachineMemOperand::MOStore;
20815 return true;
20816 }
20817 case Intrinsic::arm_mve_vld2q:
20818 case Intrinsic::arm_mve_vld4q: {
20819 Info.opc = ISD::INTRINSIC_W_CHAIN;
20820 // Conservatively set memVT to the entire set of vectors loaded.
20821 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
20822 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
20823 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20824 Info.ptrVal = I.getArgOperand(0);
20825 Info.offset = 0;
20826 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20827 // volatile loads with MVE intrinsics not supported
20828 Info.flags = MachineMemOperand::MOLoad;
20829 return true;
20830 }
20831 case Intrinsic::arm_mve_vst2q:
20832 case Intrinsic::arm_mve_vst4q: {
20833 Info.opc = ISD::INTRINSIC_VOID;
20834 // Conservatively set memVT to the entire set of vectors stored.
20835 Type *VecTy = I.getArgOperand(1)->getType();
20836 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
20837 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
20838 Info.ptrVal = I.getArgOperand(0);
20839 Info.offset = 0;
20840 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
20841 // volatile stores with MVE intrinsics not supported
20842 Info.flags = MachineMemOperand::MOStore;
20843 return true;
20844 }
20845 case Intrinsic::arm_mve_vldr_gather_base:
20846 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
20847 Info.opc = ISD::INTRINSIC_W_CHAIN;
20848 Info.ptrVal = nullptr;
20849 Info.memVT = MVT::getVT(I.getType());
20850 Info.align = Align(1);
20851 Info.flags |= MachineMemOperand::MOLoad;
20852 return true;
20853 }
20854 case Intrinsic::arm_mve_vldr_gather_base_wb:
20855 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
20856 Info.opc = ISD::INTRINSIC_W_CHAIN;
20857 Info.ptrVal = nullptr;
20858 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
20859 Info.align = Align(1);
20860 Info.flags |= MachineMemOperand::MOLoad;
20861 return true;
20862 }
20863 case Intrinsic::arm_mve_vldr_gather_offset:
20864 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
20865 Info.opc = ISD::INTRINSIC_W_CHAIN;
20866 Info.ptrVal = nullptr;
20867 MVT DataVT = MVT::getVT(I.getType());
20868 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
20869 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20870 DataVT.getVectorNumElements());
20871 Info.align = Align(1);
20872 Info.flags |= MachineMemOperand::MOLoad;
20873 return true;
20874 }
20875 case Intrinsic::arm_mve_vstr_scatter_base:
20876 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
20877 Info.opc = ISD::INTRINSIC_VOID;
20878 Info.ptrVal = nullptr;
20879 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20880 Info.align = Align(1);
20881 Info.flags |= MachineMemOperand::MOStore;
20882 return true;
20883 }
20884 case Intrinsic::arm_mve_vstr_scatter_base_wb:
20885 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
20886 Info.opc = ISD::INTRINSIC_W_CHAIN;
20887 Info.ptrVal = nullptr;
20888 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
20889 Info.align = Align(1);
20890 Info.flags |= MachineMemOperand::MOStore;
20891 return true;
20892 }
20893 case Intrinsic::arm_mve_vstr_scatter_offset:
20894 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
20895 Info.opc = ISD::INTRINSIC_VOID;
20896 Info.ptrVal = nullptr;
20897 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
20898 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
20899 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
20900 DataVT.getVectorNumElements());
20901 Info.align = Align(1);
20902 Info.flags |= MachineMemOperand::MOStore;
20903 return true;
20904 }
20905 case Intrinsic::arm_ldaex:
20906 case Intrinsic::arm_ldrex: {
20907 auto &DL = I.getDataLayout();
20908 Type *ValTy = I.getParamElementType(0);
20909 Info.opc = ISD::INTRINSIC_W_CHAIN;
20910 Info.memVT = MVT::getVT(ValTy);
20911 Info.ptrVal = I.getArgOperand(0);
20912 Info.offset = 0;
20913 Info.align = DL.getABITypeAlign(ValTy);
20915 return true;
20916 }
20917 case Intrinsic::arm_stlex:
20918 case Intrinsic::arm_strex: {
20919 auto &DL = I.getDataLayout();
20920 Type *ValTy = I.getParamElementType(1);
20921 Info.opc = ISD::INTRINSIC_W_CHAIN;
20922 Info.memVT = MVT::getVT(ValTy);
20923 Info.ptrVal = I.getArgOperand(1);
20924 Info.offset = 0;
20925 Info.align = DL.getABITypeAlign(ValTy);
20927 return true;
20928 }
20929 case Intrinsic::arm_stlexd:
20930 case Intrinsic::arm_strexd:
20931 Info.opc = ISD::INTRINSIC_W_CHAIN;
20932 Info.memVT = MVT::i64;
20933 Info.ptrVal = I.getArgOperand(2);
20934 Info.offset = 0;
20935 Info.align = Align(8);
20937 return true;
20938
20939 case Intrinsic::arm_ldaexd:
20940 case Intrinsic::arm_ldrexd:
20941 Info.opc = ISD::INTRINSIC_W_CHAIN;
20942 Info.memVT = MVT::i64;
20943 Info.ptrVal = I.getArgOperand(0);
20944 Info.offset = 0;
20945 Info.align = Align(8);
20947 return true;
20948
20949 default:
20950 break;
20951 }
20952
20953 return false;
20954}
20955
20956/// Returns true if it is beneficial to convert a load of a constant
20957/// to just the constant itself.
20959 Type *Ty) const {
20960 assert(Ty->isIntegerTy());
20961
20962 unsigned Bits = Ty->getPrimitiveSizeInBits();
20963 if (Bits == 0 || Bits > 32)
20964 return false;
20965 return true;
20966}
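// e.g. a 32-bit constant such as 0x12345678 can usually be materialized with a
// MOVW/MOVT pair (or a MOV/ORR sequence) instead of a literal-pool load.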
20967
20969 unsigned Index) const {
20971 return false;
20972
20973 return (Index == 0 || Index == ResVT.getVectorNumElements());
20974}
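// e.g. extracting the low or high <2 x i32> half of a <4 x i32> is just a
// D-register subregister copy, so it is considered cheap.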
20975
20977 ARM_MB::MemBOpt Domain) const {
20978 // First, if the target has no DMB, see what fallback we can use.
20979 if (!Subtarget->hasDataBarrier()) {
20980 // Some ARMv6 cpus can support data barriers with an mcr instruction.
20981 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
20982 // here.
20983 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
20984 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
20985 Builder.getInt32(0), Builder.getInt32(7),
20986 Builder.getInt32(10), Builder.getInt32(5)};
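      // i.e. mcr p15, 0, <Rt>, c7, c10, 5 -- the ARMv6 CP15 data memory barrier.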
20987 return Builder.CreateIntrinsic(Intrinsic::arm_mcr, args);
20988 } else {
20989 // Instead of using barriers, atomic accesses on these subtargets use
20990 // libcalls.
20991 llvm_unreachable("makeDMB on a target so old that it has no barriers");
20992 }
20993 } else {
20994 // Only a full system barrier exists in the M-class architectures.
20995 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
20996 Constant *CDomain = Builder.getInt32(Domain);
20997 return Builder.CreateIntrinsic(Intrinsic::arm_dmb, CDomain);
20998 }
20999}
21000
21001// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21003 Instruction *Inst,
21004 AtomicOrdering Ord) const {
21005 switch (Ord) {
21008 llvm_unreachable("Invalid fence: unordered/non-atomic");
21011 return nullptr; // Nothing to do
21013 if (!Inst->hasAtomicStore())
21014 return nullptr; // Nothing to do
21015 [[fallthrough]];
21018 if (Subtarget->preferISHSTBarriers())
21019 return makeDMB(Builder, ARM_MB::ISHST);
21020 // FIXME: add a comment with a link to documentation justifying this.
21021 else
21022 return makeDMB(Builder, ARM_MB::ISH);
21023 }
21024 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21025}
21026
21028 Instruction *Inst,
21029 AtomicOrdering Ord) const {
21030 switch (Ord) {
21033 llvm_unreachable("Invalid fence: unordered/not-atomic");
21036 return nullptr; // Nothing to do
21040 return makeDMB(Builder, ARM_MB::ISH);
21041 }
21042 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21043}
21044
21045 // Loads and stores less than 64 bits are already atomic; ones above that
21046// are doomed anyway, so defer to the default libcall and blame the OS when
21047// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21048// anything for those.
21051 bool has64BitAtomicStore;
21052 if (Subtarget->isMClass())
21053 has64BitAtomicStore = false;
21054 else if (Subtarget->isThumb())
21055 has64BitAtomicStore = Subtarget->hasV7Ops();
21056 else
21057 has64BitAtomicStore = Subtarget->hasV6Ops();
21058
21059 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21060 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21062}
21063
21064 // Loads and stores less than 64 bits are already atomic; ones above that
21065// are doomed anyway, so defer to the default libcall and blame the OS when
21066// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21067// anything for those.
21068// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21069// guarantee, see DDI0406C ARM architecture reference manual,
21070// sections A8.8.72-74 LDRD)
21073 bool has64BitAtomicLoad;
21074 if (Subtarget->isMClass())
21075 has64BitAtomicLoad = false;
21076 else if (Subtarget->isThumb())
21077 has64BitAtomicLoad = Subtarget->hasV7Ops();
21078 else
21079 has64BitAtomicLoad = Subtarget->hasV6Ops();
21080
21081 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21082 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21084}
21085
21086// For the real atomic operations, we have ldrex/strex up to 32 bits,
21087// and up to 64 bits on the non-M profiles
21090 if (AI->isFloatingPointOperation())
21092
21093 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21094 bool hasAtomicRMW;
21095 if (Subtarget->isMClass())
21096 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21097 else if (Subtarget->isThumb())
21098 hasAtomicRMW = Subtarget->hasV7Ops();
21099 else
21100 hasAtomicRMW = Subtarget->hasV6Ops();
21101 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21102 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21103 // implement atomicrmw without spilling. If the target address is also on
21104 // the stack and close enough to the spill slot, this can lead to a
21105 // situation where the monitor always gets cleared and the atomic operation
21106 // can never succeed. So at -O0 lower this operation to a CAS loop.
21107 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21110 }
21112}
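// e.g. an i32 'atomicrmw add' is expanded to a load-linked/store-conditional
// loop, roughly:
//   1: ldrex r2, [r0]
//      add   r2, r2, r1
//      strex r3, r2, [r0]
//      cmp   r3, #0
//      bne   1b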
21113
21114// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21115// bits, and up to 64 bits on the non-M profiles.
21118 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21119 // implement cmpxchg without spilling. If the address being exchanged is also
21120 // on the stack and close enough to the spill slot, this can lead to a
21121 // situation where the monitor always gets cleared and the atomic operation
21122 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21123 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21124 bool HasAtomicCmpXchg;
21125 if (Subtarget->isMClass())
21126 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21127 else if (Subtarget->isThumb())
21128 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21129 else
21130 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21131 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21132 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21135}
21136
21138 const Instruction *I) const {
21139 return InsertFencesForAtomic;
21140}
21141
21143 // ROPI/RWPI are not supported currently.
21144 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21145}
21146
21148 // MSVC CRT provides functionalities for stack protection.
21149 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21150 getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21151
21152 RTLIB::LibcallImpl SecurityCookieVar =
21153 getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21154 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21155 SecurityCookieVar != RTLIB::Unsupported) {
21156 // MSVC CRT has a global variable holding security cookie.
21157 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21158 PointerType::getUnqual(M.getContext()));
21159
21160 // MSVC CRT has a function to validate security cookie.
21161 FunctionCallee SecurityCheckCookie =
21162 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21163 Type::getVoidTy(M.getContext()),
21164 PointerType::getUnqual(M.getContext()));
21165 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21166 F->addParamAttr(0, Attribute::AttrKind::InReg);
21167 }
21168
21170}
21171
21173 unsigned &Cost) const {
21174 // If we do not have NEON, vector types are not natively supported.
21175 if (!Subtarget->hasNEON())
21176 return false;
21177
21178 // Floating point values and vector values map to the same register file.
21179 // Therefore, although we could do a store extract of a vector type, this is
21180 // better to leave at float as we have more freedom in the addressing mode for
21181 // those.
21182 if (VectorTy->isFPOrFPVectorTy())
21183 return false;
21184
21185 // If the index is unknown at compile time, this is very expensive to lower
21186 // and it is not possible to combine the store with the extract.
21187 if (!isa<ConstantInt>(Idx))
21188 return false;
21189
21190 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21191 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21192 // We can do a store + vector extract on any vector that fits perfectly in a D
21193 // or Q register.
21194 if (BitWidth == 64 || BitWidth == 128) {
21195 Cost = 0;
21196 return true;
21197 }
21198 return false;
21199}
21200
21202 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21203 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
21204 unsigned Opcode = Op.getOpcode();
21205 switch (Opcode) {
21206 case ARMISD::VORRIMM:
21207 case ARMISD::VBICIMM:
21208 return false;
21209 }
21211 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
21212}
21213
21215 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21216}
21217
21219 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21220}
21221
21223 const Instruction &AndI) const {
21224 if (!Subtarget->hasV7Ops())
21225 return false;
21226
21227 // Sink the `and` instruction only if the mask would fit into a modified
21228 // immediate operand.
21230 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21231 return false;
21232 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21233 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21234 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21235}
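// e.g. a mask of 0xff00 encodes as a modified immediate and is worth sinking;
// 0x12345 does not, so the `and` is left where it is.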
21236
21239 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21240 if (Subtarget->hasMinSize() && !Subtarget->isTargetWindows())
21243 ExpansionFactor);
21244}
21245
21247 Value *Addr,
21248 AtomicOrdering Ord) const {
21249 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21250 bool IsAcquire = isAcquireOrStronger(Ord);
21251
21252 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21253 // intrinsic must return {i32, i32} and we have to recombine them into a
21254 // single i64 here.
21255 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21257 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21258
21259 Value *LoHi =
21260 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21261
21262 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21263 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21264 if (!Subtarget->isLittle())
21265 std::swap (Lo, Hi);
21266 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21267 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21268 return Builder.CreateOr(
21269 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21270 }
21271
21272 Type *Tys[] = { Addr->getType() };
21273 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21274 CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
21275
21276 CI->addParamAttr(
21277 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21278 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21279}
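// e.g. a 64-bit acquire load becomes "ldaexd r0, r1, [addr]"; the two halves
// are then zero-extended, shifted and OR'd back together into an i64.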
21280
21282 IRBuilderBase &Builder) const {
21283 if (!Subtarget->hasV7Ops())
21284 return;
21285 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21286}
21287
21289 Value *Val, Value *Addr,
21290 AtomicOrdering Ord) const {
21291 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21292 bool IsRelease = isReleaseOrStronger(Ord);
21293
21294 // Since the intrinsics must have legal type, the i64 intrinsics take two
21295 // parameters: "i32, i32". We must marshal Val into the appropriate form
21296 // before the call.
21297 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21299 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21300 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21301
21302 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21303 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21304 if (!Subtarget->isLittle())
21305 std::swap(Lo, Hi);
21306 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21307 }
21308
21309 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21310 Type *Tys[] = { Addr->getType() };
21312
21313 CallInst *CI = Builder.CreateCall(
21314 Strex, {Builder.CreateZExtOrBitCast(
21315 Val, Strex->getFunctionType()->getParamType(0)),
21316 Addr});
21317 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21318 Val->getType()));
21319 return CI;
21320}
21321
21322
21324 return Subtarget->isMClass();
21325}
21326
21327/// A helper function for determining the number of interleaved accesses we
21328/// will generate when lowering accesses of the given type.
21329unsigned
21331 const DataLayout &DL) const {
21332 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21333}
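// e.g. a 512-bit <16 x i32> is handled as four 128-bit interleaved accesses,
// while a 64-bit <2 x i32> needs only one.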
21334
21336 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21337 const DataLayout &DL) const {
21338
21339 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21340 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21341
21342 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21343 return false;
21344
21345 // Ensure the vector doesn't have f16 elements. Even though we could do an
21346 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21347 // f32.
21348 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21349 return false;
21350 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21351 return false;
21352
21353 // Ensure the number of vector elements is greater than 1.
21354 if (VecTy->getNumElements() < 2)
21355 return false;
21356
21357 // Ensure the element type is legal.
21358 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21359 return false;
21360 // And the alignment is high enough under MVE.
21361 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21362 return false;
21363
21364 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21365 // 128 will be split into multiple interleaved accesses.
21366 if (Subtarget->hasNEON() && VecSize == 64)
21367 return true;
21368 return VecSize % 128 == 0;
21369}
21370
21372 if (Subtarget->hasNEON())
21373 return 4;
21374 if (Subtarget->hasMVEIntegerOps())
21377}
21378
21379/// Lower an interleaved load into a vldN intrinsic.
21380///
21381/// E.g. Lower an interleaved load (Factor = 2):
21382/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21383/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21384/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21385///
21386/// Into:
21387/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21388/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
21389/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
21391 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21392 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21393 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21394 "Invalid interleave factor");
21395 assert(!Shuffles.empty() && "Empty shufflevector input");
21396 assert(Shuffles.size() == Indices.size() &&
21397 "Unmatched number of shufflevectors and indices");
21398
21399 auto *LI = dyn_cast<LoadInst>(Load);
21400 if (!LI)
21401 return false;
21402 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21403
21404 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21405 Type *EltTy = VecTy->getElementType();
21406
21407 const DataLayout &DL = LI->getDataLayout();
21408 Align Alignment = LI->getAlign();
21409
21410 // Skip if we do not have NEON and skip illegal vector types. We can
21411 // "legalize" wide vector types into multiple interleaved accesses as long as
21412 // the vector types are divisible by 128.
21413 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21414 return false;
21415
21416 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21417
21418 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21419 // load integer vectors first and then convert to pointer vectors.
21420 if (EltTy->isPointerTy())
21421 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21422
21423 IRBuilder<> Builder(LI);
21424
21425 // The base address of the load.
21426 Value *BaseAddr = LI->getPointerOperand();
21427
21428 if (NumLoads > 1) {
21429 // If we're going to generate more than one load, reset the sub-vector type
21430 // to something legal.
21431 VecTy = FixedVectorType::get(VecTy->getElementType(),
21432 VecTy->getNumElements() / NumLoads);
21433 }
21434
21435 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21436
21437 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21438 if (Subtarget->hasNEON()) {
21439 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21440 Type *Tys[] = {VecTy, PtrTy};
21441 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21442 Intrinsic::arm_neon_vld3,
21443 Intrinsic::arm_neon_vld4};
21444
21446 Ops.push_back(BaseAddr);
21447 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21448
21449 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21450 /*FMFSource=*/nullptr, "vldN");
21451 } else {
21452 assert((Factor == 2 || Factor == 4) &&
21453 "expected interleave factor of 2 or 4 for MVE");
21454 Intrinsic::ID LoadInts =
21455 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21456 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21457 Type *Tys[] = {VecTy, PtrTy};
21458
21460 Ops.push_back(BaseAddr);
21461 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21462 "vldN");
21463 }
21464 };
21465
21466 // Holds sub-vectors extracted from the load intrinsic return values. The
21467 // sub-vectors are associated with the shufflevector instructions they will
21468 // replace.
21470
21471 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21472 // If we're generating more than one load, compute the base address of
21473 // subsequent loads as an offset from the previous.
21474 if (LoadCount > 0)
21475 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21476 VecTy->getNumElements() * Factor);
21477
21478 CallInst *VldN = createLoadIntrinsic(BaseAddr);
21479
21480 // Replace uses of each shufflevector with the corresponding vector loaded
21481 // by ldN.
21482 for (unsigned i = 0; i < Shuffles.size(); i++) {
21483 ShuffleVectorInst *SV = Shuffles[i];
21484 unsigned Index = Indices[i];
21485
21486 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21487
21488 // Convert the integer vector to pointer vector if the element is pointer.
21489 if (EltTy->isPointerTy())
21490 SubVec = Builder.CreateIntToPtr(
21491 SubVec,
21493
21494 SubVecs[SV].push_back(SubVec);
21495 }
21496 }
21497
21498 // Replace uses of the shufflevector instructions with the sub-vectors
21499 // returned by the load intrinsic. If a shufflevector instruction is
21500 // associated with more than one sub-vector, those sub-vectors will be
21501 // concatenated into a single wide vector.
21502 for (ShuffleVectorInst *SVI : Shuffles) {
21503 auto &SubVec = SubVecs[SVI];
21504 auto *WideVec =
21505 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21506 SVI->replaceAllUsesWith(WideVec);
21507 }
21508
21509 return true;
21510}
21511
21512/// Lower an interleaved store into a vstN intrinsic.
21513///
21514/// E.g. Lower an interleaved store (Factor = 3):
21515/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21516/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21517/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21518///
21519/// Into:
21520/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
21521/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
21522/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
21523/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21524///
21525/// Note that the new shufflevectors will be removed and we'll only generate one
21526/// vst3 instruction in CodeGen.
21527///
21528/// Example for a more general valid mask (Factor 3). Lower:
21529/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21530/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21531/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21532///
21533/// Into:
21534/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
21535/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
21536/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
21537/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21539 Value *LaneMask,
21540 ShuffleVectorInst *SVI,
21541 unsigned Factor,
21542 const APInt &GapMask) const {
21543 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21544 "Invalid interleave factor");
21545 auto *SI = dyn_cast<StoreInst>(Store);
21546 if (!SI)
21547 return false;
21548 assert(!LaneMask && GapMask.popcount() == Factor &&
21549 "Unexpected mask on store");
21550
21551 auto *VecTy = cast<FixedVectorType>(SVI->getType());
21552 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
21553
21554 unsigned LaneLen = VecTy->getNumElements() / Factor;
21555 Type *EltTy = VecTy->getElementType();
21556 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
21557
21558 const DataLayout &DL = SI->getDataLayout();
21559 Align Alignment = SI->getAlign();
21560
21561 // Skip if we do not have NEON and skip illegal vector types. We can
21562 // "legalize" wide vector types into multiple interleaved accesses as long as
21563 // the vector types are divisible by 128.
21564 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
21565 return false;
21566
21567 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
21568
21569 Value *Op0 = SVI->getOperand(0);
21570 Value *Op1 = SVI->getOperand(1);
21571 IRBuilder<> Builder(SI);
21572
21573 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
21574 // vectors to integer vectors.
21575 if (EltTy->isPointerTy()) {
21576 Type *IntTy = DL.getIntPtrType(EltTy);
21577
21578 // Convert to the corresponding integer vector.
21579 auto *IntVecTy =
21581 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
21582 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
21583
21584 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
21585 }
21586
21587 // The base address of the store.
21588 Value *BaseAddr = SI->getPointerOperand();
21589
21590 if (NumStores > 1) {
21591 // If we're going to generate more than one store, reset the lane length
21592 // and sub-vector type to something legal.
21593 LaneLen /= NumStores;
21594 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
21595 }
21596
21597 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
21598
21599 auto Mask = SVI->getShuffleMask();
21600
21601 auto createStoreIntrinsic = [&](Value *BaseAddr,
21602 SmallVectorImpl<Value *> &Shuffles) {
21603 if (Subtarget->hasNEON()) {
21604 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
21605 Intrinsic::arm_neon_vst3,
21606 Intrinsic::arm_neon_vst4};
21607 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21608 Type *Tys[] = {PtrTy, SubVecTy};
21609
21611 Ops.push_back(BaseAddr);
21612 append_range(Ops, Shuffles);
21613 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
21614 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
21615 } else {
21616 assert((Factor == 2 || Factor == 4) &&
21617 "expected interleave factor of 2 or 4 for MVE");
21618 Intrinsic::ID StoreInts =
21619 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
21620 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
21621 Type *Tys[] = {PtrTy, SubVecTy};
21622
21624 Ops.push_back(BaseAddr);
21625 append_range(Ops, Shuffles);
21626 for (unsigned F = 0; F < Factor; F++) {
21627 Ops.push_back(Builder.getInt32(F));
21628 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
21629 Ops.pop_back();
21630 }
21631 }
21632 };
21633
21634 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
21635 // If we're generating more than one store, compute the base address of
21636 // subsequent stores as an offset from the previous.
21637 if (StoreCount > 0)
21638 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
21639 BaseAddr, LaneLen * Factor);
21640
21641 SmallVector<Value *, 4> Shuffles;
21642
21643 // Split the shufflevector operands into sub vectors for the new vstN call.
21644 for (unsigned i = 0; i < Factor; i++) {
21645 unsigned IdxI = StoreCount * LaneLen * Factor + i;
21646 if (Mask[IdxI] >= 0) {
21647 Shuffles.push_back(Builder.CreateShuffleVector(
21648 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
21649 } else {
21650 unsigned StartMask = 0;
21651 for (unsigned j = 1; j < LaneLen; j++) {
21652 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
21653 if (Mask[IdxJ * Factor + IdxI] >= 0) {
21654 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
21655 break;
21656 }
21657 }
21658 // Note: If all elements in a chunk are undefs, StartMask=0!
21659 // Note: Filling undef gaps with random elements is ok, since
21660 // those elements were being written anyway (with undefs).
21661 // In the case of all undefs we default to using elements from 0.
21662 // Note: StartMask cannot be negative; it's checked in
21663 // isReInterleaveMask.
21664 Shuffles.push_back(Builder.CreateShuffleVector(
21665 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
21666 }
21667 }
21668
21669 createStoreIntrinsic(BaseAddr, Shuffles);
21670 }
21671 return true;
21672}
21673
21681
21683 uint64_t &Members) {
21684 if (auto *ST = dyn_cast<StructType>(Ty)) {
21685 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
21686 uint64_t SubMembers = 0;
21687 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
21688 return false;
21689 Members += SubMembers;
21690 }
21691 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
21692 uint64_t SubMembers = 0;
21693 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
21694 return false;
21695 Members += SubMembers * AT->getNumElements();
21696 } else if (Ty->isFloatTy()) {
21697 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
21698 return false;
21699 Members = 1;
21700 Base = HA_FLOAT;
21701 } else if (Ty->isDoubleTy()) {
21702 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
21703 return false;
21704 Members = 1;
21705 Base = HA_DOUBLE;
21706 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
21707 Members = 1;
21708 switch (Base) {
21709 case HA_FLOAT:
21710 case HA_DOUBLE:
21711 return false;
21712 case HA_VECT64:
21713 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
21714 case HA_VECT128:
21715 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
21716 case HA_UNKNOWN:
21717 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
21718 case 64:
21719 Base = HA_VECT64;
21720 return true;
21721 case 128:
21722 Base = HA_VECT128;
21723 return true;
21724 default:
21725 return false;
21726 }
21727 }
21728 }
21729
21730 return (Members > 0 && Members <= 4);
21731}
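// e.g. struct { float x, y, z; } is an HA with Base = HA_FLOAT and Members = 3,
// while struct { float f; double d; } mixes base types and is rejected.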
21732
21733/// Return the correct alignment for the current calling convention.
21735 Type *ArgTy, const DataLayout &DL) const {
21736 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
21737 if (!ArgTy->isVectorTy())
21738 return ABITypeAlign;
21739
21740 // Avoid over-aligning vector parameters. It would require realigning the
21741 // stack and waste space for no real benefit.
21742 MaybeAlign StackAlign = DL.getStackAlignment();
21743 assert(StackAlign && "data layout string is missing stack alignment");
21744 return std::min(ABITypeAlign, *StackAlign);
21745}
21746
21747/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
21748/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
21749/// passing according to AAPCS rules.
21751 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
21752 const DataLayout &DL) const {
21753 if (getEffectiveCallingConv(CallConv, isVarArg) !=
21755 return false;
21756
21758 uint64_t Members = 0;
21759 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
21760 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
21761
21762 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
21763 return IsHA || IsIntArray;
21764}
21765
21767 const Constant *PersonalityFn) const {
21768 // Platforms which do not use SjLj EH may return values in these registers
21769 // via the personality function.
21771 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
21772}
21773
21775 const Constant *PersonalityFn) const {
21776 // Platforms which do not use SjLj EH may return values in these registers
21777 // via the personality function.
21779 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
21780}
21781
21782void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
21783 // Update IsSplitCSR in ARMFunctionInfo.
21784 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
21785 AFI->setIsSplitCSR(true);
21786}
21787
21788void ARMTargetLowering::insertCopiesSplitCSR(
21789 MachineBasicBlock *Entry,
21790 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
21791 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
21792 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
21793 if (!IStart)
21794 return;
21795
21796 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
21797 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
21798 MachineBasicBlock::iterator MBBI = Entry->begin();
21799 for (const MCPhysReg *I = IStart; *I; ++I) {
21800 const TargetRegisterClass *RC = nullptr;
21801 if (ARM::GPRRegClass.contains(*I))
21802 RC = &ARM::GPRRegClass;
21803 else if (ARM::DPRRegClass.contains(*I))
21804 RC = &ARM::DPRRegClass;
21805 else
21806 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
21807
21808 Register NewVR = MRI->createVirtualRegister(RC);
21809 // Create copy from CSR to a virtual register.
21810 // FIXME: this currently does not emit CFI pseudo-instructions, it works
21811 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
21812 // nounwind. If we want to generalize this later, we may need to emit
21813 // CFI pseudo-instructions.
21814 assert(Entry->getParent()->getFunction().hasFnAttribute(
21815 Attribute::NoUnwind) &&
21816 "Function should be nounwind in insertCopiesSplitCSR!");
21817 Entry->addLiveIn(*I);
21818 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
21819 .addReg(*I);
21820
21821 // Insert the copy-back instructions right before the terminator.
21822 for (auto *Exit : Exits)
21823 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
21824 TII->get(TargetOpcode::COPY), *I)
21825 .addReg(NewVR);
21826 }
21827}
21828
21833
21835 return Subtarget->hasMVEIntegerOps();
21836}
21837
21840 auto *VTy = dyn_cast<FixedVectorType>(Ty);
21841 if (!VTy)
21842 return false;
21843
21844 auto *ScalarTy = VTy->getScalarType();
21845 unsigned NumElements = VTy->getNumElements();
21846
21847 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
21848 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
21849 return false;
21850
21851 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
21852 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
21853 return Subtarget->hasMVEFloatOps();
21854
21856 return false;
21857
21858 return Subtarget->hasMVEIntegerOps() &&
21859 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
21860 ScalarTy->isIntegerTy(32));
21861}
21862
21864 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
21865 return RCRegs;
21866}
21867
21870 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
21871 Value *Accumulator) const {
21872
21874
21875 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
21876
21877 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
21878
21879 if (TyWidth > 128) {
21880 int Stride = Ty->getNumElements() / 2;
21881 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
21882 auto SplitSeqVec = llvm::to_vector(SplitSeq);
21883 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
21884 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
21885
21886 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
21887 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
21888 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
21889 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
21890 Value *LowerSplitAcc = nullptr;
21891 Value *UpperSplitAcc = nullptr;
21892
21893 if (Accumulator) {
21894 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
21895 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
21896 }
21897
21898 auto *LowerSplitInt = createComplexDeinterleavingIR(
21899 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
21900 auto *UpperSplitInt = createComplexDeinterleavingIR(
21901 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
21902
21903 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
21904 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
21905 }
21906
21907 auto *IntTy = Type::getInt32Ty(B.getContext());
21908
21909 ConstantInt *ConstRotation = nullptr;
21910 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
21911 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
21912
21913 if (Accumulator)
21914 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
21915 {ConstRotation, Accumulator, InputB, InputA});
21916 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
21917 {ConstRotation, InputB, InputA});
21918 }
21919
21920 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
21921 // 1 means the value is not halved.
21922 auto *ConstHalving = ConstantInt::get(IntTy, 1);
21923
21925 ConstRotation = ConstantInt::get(IntTy, 0);
21927 ConstRotation = ConstantInt::get(IntTy, 1);
21928
21929 if (!ConstRotation)
21930 return nullptr; // Invalid rotation for arm_mve_vcaddq
21931
21932 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
21933 {ConstHalving, ConstRotation, InputA, InputB});
21934 }
21935
21936 return nullptr;
21937}
static constexpr roundingMode rmTowardZero
Definition APFloat.h:348
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5995
APInt bitcastToAPInt() const
Definition APFloat.h:1335
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1314
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1671
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1033
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:936
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1331
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1202
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1112
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1640
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1599
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
unsigned logBase2() const
Definition APInt.h:1762
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1258
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1563
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:859
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:852
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1657
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool isTargetWindows() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
Align getDualLoadStoreAlignment() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode representing by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: sub y, (xor x, -1) add (add x, 1), y The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void insertSSPDeclarations(Module &M) const override
Inserts necessary declarations for SSP (stack protection) purpose.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type FromTy to type ToTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from FromTy to ToTy is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:899
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power or 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:277
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:163
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:207
bool isBigEndian() const
Definition DataLayout.h:208
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:237
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
StringRef getPrivateGlobalPrefix() const
Definition DataLayout.h:295
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:124
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
unsigned size() const
Definition DenseMap.h:110
bool empty() const
Definition DenseMap.h:109
iterator begin()
Definition DenseMap.h:78
iterator end()
Definition DenseMap.h:81
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
arg_iterator arg_begin()
Definition Function.h:866
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:687
const Argument * const_arg_iterator
Definition Function.h:73
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:227
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:730
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
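A minimal sketch of how these MVT helpers compose (the values are arbitrary examples, not taken from the lowering code):
  MVT EltVT = MVT::i32;
  MVT VecVT = MVT::getVectorVT(EltVT, 4);       // v4i32
  uint64_t Bits = VecVT.getFixedSizeInBits();   // 128
  bool Is64 = VecVT.is64BitVector();            // false: v4i32 is 128 bits wide
  MVT Scalar = VecVT.getVectorElementType();    // i32 again
  (void)Bits; (void)Is64; (void)Scalar;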
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
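A hedged sketch of the usual block-splitting pattern built from the MachineBasicBlock API above, in the style used when a pseudo-instruction is expanded into explicit control flow (MI and MBB are assumed to already exist; this is illustrative, not the exact code from this file):
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *ContMBB =
      MF->CreateMachineBasicBlock(MBB->getBasicBlock());
  MF->insert(std::next(MBB->getIterator()), ContMBB);
  // Move every instruction after MI into the continuation block, let it
  // inherit MBB's successors (updating PHIs), and make it MBB's successor.
  ContMBB->splice(ContMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  ContMBB->transferSuccessorsAndUpdatePHIs(MBB);
  MBB->addSuccessor(ContMBB);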
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
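For illustration only, the builder calls above chain off BuildMI like this (the opcode, registers, insertion point and memory operand are placeholders, not a real ARM instruction):
  MachineInstrBuilder MIB =
      BuildMI(*MBB, InsertPt, DL, TII->get(Opcode), DestReg)
          .addReg(SrcReg)                     // register use
          .addImm(Imm)                        // immediate operand
          .addMemOperand(MMO)                 // attach a MachineMemOperand
          .setMIFlags(MachineInstr::NoFlags);
  MachineInstr *NewMI = MIB.getInstr();       // recover the underlying instruction
  (void)NewMI;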
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there are any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
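A rough sketch of how these SelectionDAG factory methods combine: load a value, add a constant to it, and store the result (Chain, Ptr, PtrInfo and N are placeholders for values a real lowering routine would already have):
  SDLoc dl(N);
  EVT VT = MVT::i32;
  SDValue Load = DAG.getLoad(VT, dl, Chain, Ptr, PtrInfo);
  SDValue One  = DAG.getConstant(1, dl, VT);
  SDValue Add  = DAG.getNode(ISD::ADD, dl, VT, Load, One);
  // Thread the load's output chain (value #1) into the store.
  SDValue Store = DAG.getStore(Load.getValue(1), dl, Add, Ptr, PtrInfo,
                               Align(4));
  (void)Store;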
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
bool empty() const
Definition SmallSet.h:168
bool erase(const T &V)
Definition SmallSet.h:199
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
const unsigned char * bytes_end() const
Definition StringRef.h:127
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
const unsigned char * bytes_begin() const
Definition StringRef.h:124
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const
Get the CallingConv that should be used for the specified libcall.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
void setLibcallImpl(RTLIB::Libcall Call, RTLIB::LibcallImpl Impl)
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
const char * getLibcallName(RTLIB::Libcall Call) const
Get the libcall routine name for the specified libcall.
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void insertSSPDeclarations(Module &M) const
Inserts necessary declarations for SSP (stack protection) purpose.
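These hooks are normally invoked from a target's TargetLowering constructor; a hedged sketch of that configuration pattern follows (the specific actions, types and scheduling preference are illustrative placeholders, not the ARM settings):
  addRegisterClass(MVT::i32, &ARM::GPRRegClass);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, MVT::i16, Legal);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTargetDAGCombine(ISD::ADD);
  setSchedulingPreference(Sched::Hybrid);
  // Must run after all register classes have been added.
  computeRegisterProperties(Subtarget->getRegisterInfo());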
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit urem by constant and other arit...
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return if the N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned EmitCallGraphSection
Emit section containing call graph metadata.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:439
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:273
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:153
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:352
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:142
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:225
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:232
unsigned getNumOperands() const
Definition User.h:254
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:202
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:175
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:48
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative (used for RWPI).
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into a shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
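A small sketch of how the immediate-encoding helpers above are usually consulted (the helper name isCheapImmediate is hypothetical; a negative return value from the encoders means the 32-bit immediate has no shifter-operand encoding):
  static bool isCheapImmediate(uint32_t Imm, bool IsThumb2) {
    int Enc = IsThumb2 ? ARM_AM::getT2SOImmVal(Imm)
                       : ARM_AM::getSOImmVal(Imm);
    return Enc != -1;   // encodable directly, no constant-pool load needed
  }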
const unsigned FPStatusBits
const unsigned FPReservedBits
const unsigned RoundingBitsPos
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Flow Guard check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserve most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserve (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:256
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:504
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:163
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:531
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:215
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:167
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:744
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:431
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:712
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:662
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:815
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:347
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:369
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:228
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:225
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:343
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined; 0 Round to 0; 1 Round to nearest, ties to even; 2 Round to ...
Definition ISDOpcodes.h:958
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:134
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:351
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:457
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:471
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:493
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:470
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:498
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:732
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:707
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:420
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:451
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:933
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:157
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:821
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:360
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:719
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Returns true if the specified node is a constant splat vector, i.e. a BUILD_VECTOR whose defined elements are all the same constant; on success, SplatValue is set to that constant.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
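The RTLIB getters above all follow one pattern: map an (operand type, result type) pair to a libcall enumerator, or UNKNOWN_LIBCALL when no such routine exists. A hedged sketch of the usual call-site check:
// Sketch: pick the soft-float libcall for an f64 -> i32 conversion.
RTLIB::Libcall LC = RTLIB::getFPTOSINT(MVT::f64, MVT::i32);
if (LC == RTLIB::UNKNOWN_LIBCALL)
  report_fatal_error("Unsupported FP_TO_SINT conversion");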
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
This is the llvm namespace, which contains most of the entities referenced in this index.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:532
@ Length
Definition DWP.cpp:532
void stable_sort(R &&Range)
Definition STLExtras.h:2058
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
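find, all_of, and the related drop_begin listed above take whole ranges rather than iterator pairs. A small illustrative sketch with a made-up shuffle mask:
SmallVector<int, 8> Mask = {0, 1, 2, 3, -1, -1};
bool AllDefined = all_of(Mask, [](int M) { return M >= 0; }); // false: the mask has undef lanes
auto UndefIt = find(Mask, -1);                                // iterator to the first -1
for (int M : drop_begin(Mask, 2))                             // visit every lane except the first two
  (void)M;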
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
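isNullConstant and the related predicates listed elsewhere in this index (isOneConstant, isAllOnesConstant) are the usual way DAG combines test for specific immediates. A hedged sketch of a typical fold, where N is the node being combined:
// Sketch only: fold (sub X, 0) -> X.
if (N->getOpcode() == ISD::SUB && isNullConstant(N->getOperand(1)))
  return N->getOperand(0);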
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:293
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero; returns 0 if Value is zero.
Definition bit.h:303
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1516
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
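predOps, together with condCodeOp further down in this index, is how the ARM backend appends the standard predicate operands and the optional 's'-bit operand when building machine instructions. A sketch of the common BuildMI pattern; MBB, InsertPt, DL, TII, DestReg and SrcReg are assumed to come from the surrounding context:
// Sketch: emit an unconditional, non-flag-setting MOVr.
BuildMI(MBB, InsertPt, DL, TII->get(ARM::MOVr), DestReg)
    .addReg(SrcReg)
    .add(predOps(ARMCC::AL)) // always-execute predicate
    .add(condCodeOp());      // no CPSR definition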
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
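isMask_32 (listed above) accepts values whose set bits form one contiguous run starting at bit 0, while isShiftedMask_32 also accepts runs that start higher up. Illustrative values:
isMask_32(0x000000FF);        // true:  ones start at bit 0
isMask_32(0x00000FF0);        // false: the run does not start at bit 0
isShiftedMask_32(0x00000FF0); // true:  one contiguous run, remainder zero
isShiftedMask_32(0x00000F0F); // false: two separate runs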
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
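The bit-counting helpers above (countr_one, countr_zero, countl_zero) mirror C++20 <bit>, and Log2_32 can be expressed via countl_zero for nonzero inputs. Illustrative values for 32-bit arguments:
countr_zero(0x00000008u); // 3:  trailing zero bits
countr_one(0x00000007u);  // 3:  trailing one bits
countl_zero(0x00000008u); // 28: leading zero bits
Log2_32(0x00000008u);     // 3, i.e. 31 - countl_zero(0x8)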
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
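isUInt<N> is the compile-time-width counterpart of the isUIntN helper listed earlier; both test whether a value fits in the given number of unsigned bits. Illustrative values:
isUInt<8>(255);    // true
isUInt<8>(256);    // false
isUIntN(12, 4095); // true: dynamic-width variant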
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
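ConstantMaterializationCost and HasLowerConstantMaterializationCost above are ARM-specific helpers used when deciding whether an immediate, or some transformed form of it, is cheaper to build in a register. A hedged sketch; Imm is a made-up constant and Subtarget is the current ARMSubtarget:
// Sketch: check whether the bitwise-inverted immediate is cheaper to materialize.
unsigned Imm = 0xFFFF0000;
if (HasLowerConstantMaterializationCost(~Imm, Imm, Subtarget)) {
  // Prefer materializing ~Imm and inverting (e.g. with MVN).
}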
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
uint64_t alignTo(uint64_t Size, Align A)
Returns Size rounded up to the next multiple of A, i.e. the smallest multiple of A that can hold Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
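isAcquireOrStronger, isReleaseOrStronger and isStrongerThanMonotonic (all listed in this index) classify an AtomicOrdering by strength; backends typically use them to decide where barriers are needed. A sketch only; emitFenceBefore and emitFenceAfter are hypothetical helpers:
void placeFences(AtomicOrdering Ord) {
  if (isReleaseOrStronger(Ord)) // release, acq_rel, seq_cst
    emitFenceBefore();
  if (isAcquireOrStronger(Ord)) // acquire, acq_rel, seq_cst
    emitFenceAfter();
}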
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:1961
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1758
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment known to hold for an address at byte offset Offset from an A-aligned address.
Definition Alignment.h:201
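alignTo (listed above) rounds a size up to the next multiple of an alignment, while commonAlignment gives the alignment still guaranteed after adding a byte offset to an aligned address. Illustrative values:
alignTo(10, Align(8));         // 16: next multiple of 8
alignTo(16, Align(8));         // 16: already aligned
commonAlignment(Align(16), 8); // Align(8): +8 from a 16-aligned base is only 8-aligned
commonAlignment(Align(4), 16); // Align(4): an offset of 16 preserves 4-byte alignment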
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
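createSequentialMask builds the mask Start, Start+1, ..., Start+NumInts-1 and then appends NumUndefs undef (-1) entries. Illustrative results:
createSequentialMask(0, 4, 0); // {0, 1, 2, 3}
createSequentialMask(2, 3, 2); // {2, 3, 4, -1, -1}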
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
static const unsigned PerfectShuffleTable[6561+1]
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:94
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:284
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:300
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
ElementCount getVectorElementCount() const
Definition ValueTypes.h:350
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:359
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:470
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:207
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
bool isFixedLengthVector() const
Definition ValueTypes.h:181
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:59
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:157
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:453
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:202
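A small sketch of the EVT queries above, applied to a v4i32 type; Ctx is an LLVMContext assumed to be available from the surrounding code:
EVT VT = EVT::getVectorVT(Ctx, MVT::i32, 4);     // v4i32
bool Is128 = VT.is128BitVector();                // true: 4 x 32 bits
EVT Elt = VT.getVectorElementType();             // i32
EVT Half = VT.getHalfNumVectorElementsVT(Ctx);   // v2i32
EVT AsFP = VT.changeVectorElementType(MVT::f32); // v4f32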
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:301
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:172
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:311
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:180
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:347
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:135
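A sketch of how the KnownBits pieces above fit together, in the style of a computeKnownBitsForTargetNode-like helper; refineKnown and the concrete facts are invented for illustration, and Known is assumed to be 32 bits wide:
void refineKnown(KnownBits &Known) {
  KnownBits LHS = KnownBits::makeConstant(APInt(32, 0x0F)); // all 32 bits known exactly
  KnownBits RHS(32);                                        // nothing known yet
  RHS.Zero.setHighBits(24);                                 // now the top 24 bits are known zero
  KnownBits Sum = KnownBits::add(LHS, RHS);                 // known bits of the addition
  KnownBits Wide = Sum.zext(64);                            // the same facts after zero-extension (illustration only)
  Known = Known.intersectWith(Sum);                         // keep only facts true on both paths
}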
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...