ARMISelLowering.cpp
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
87#include "llvm/IR/Type.h"
88#include "llvm/IR/User.h"
89#include "llvm/IR/Value.h"
90#include "llvm/MC/MCInstrDesc.h"
93#include "llvm/MC/MCSchedule.h"
100#include "llvm/Support/Debug.h"
108#include <algorithm>
109#include <cassert>
110#include <cstdint>
111#include <cstdlib>
112#include <iterator>
113#include <limits>
114#include <optional>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
153
154// The APCS parameter registers.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
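// For example, under APCS/AAPCS the first four word-sized integer arguments
// are assigned to R0-R3 in order and any further arguments go on the stack
// (rough sketch):
//   int f(int a, int b, int c, int d, int e);
//   //    a->R0, b->R1, c->R2, d->R3, e->[SP]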
158
159void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
160 if (VT != PromotedLdStVT) {
162 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
163
165 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
166 }
167
168 MVT ElemTy = VT.getVectorElementType();
169 if (ElemTy != MVT::f64)
173 if (ElemTy == MVT::i32) {
178 } else {
183 }
192 if (VT.isInteger()) {
196 }
197
198 // Neon does not support vector divide/remainder operations.
207
208 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
209 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
211 setOperationAction(Opcode, VT, Legal);
212 if (!VT.isFloatingPoint())
213 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
214 setOperationAction(Opcode, VT, Legal);
215}
216
217void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
218 addRegisterClass(VT, &ARM::DPRRegClass);
219 addTypeForNEON(VT, MVT::f64);
220}
221
222void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
223 addRegisterClass(VT, &ARM::DPairRegClass);
224 addTypeForNEON(VT, MVT::v2f64);
225}
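// Rough usage note: 64-bit NEON vector types (e.g. v8i8, v2i32) are placed in
// D registers via addDRTypeForNEON with loads/stores promoted to f64, while
// 128-bit types (e.g. v16i8, v4i32) use pairs of D registers (Q registers)
// via addQRTypeForNEON with loads/stores promoted to v2f64.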
226
227void ARMTargetLowering::setAllExpand(MVT VT) {
228 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
229 setOperationAction(Opc, VT, Expand);
230
231 // We support these really simple operations even on types where all
232 // the actual arithmetic has to be broken down into simpler
233 // operations or turned into library calls.
238}
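// Usage sketch: setAllExpand(MVT::f64) below (when the target has no FP64
// hardware) marks every f64 operation Expand, so only the really simple
// operations mentioned above remain available natively.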
239
240void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
241 LegalizeAction Action) {
242 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
243 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
244 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
245}
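// For example, addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal) marks the any-,
// zero- and sign-extending loads from v8i8 to v8i16 as legal in one call.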
246
247void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
248 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
249
250 for (auto VT : IntTypes) {
251 addRegisterClass(VT, &ARM::MQPRRegClass);
281
282 // No native support for these.
292
293 // Vector reductions
303
304 if (!HasMVEFP) {
309 } else {
312 }
313
314 // Pre and Post inc are supported on loads and stores
315 for (unsigned im = (unsigned)ISD::PRE_INC;
321 }
322 }
323
324 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
325 for (auto VT : FloatTypes) {
326 addRegisterClass(VT, &ARM::MQPRRegClass);
327 if (!HasMVEFP)
328 setAllExpand(VT);
329
330 // These are legal or custom whether we have MVE.fp or not
343
344 // Pre and Post inc are supported on loads and stores
345 for (unsigned im = (unsigned)ISD::PRE_INC;
351 }
352
353 if (HasMVEFP) {
361
362 // No native support for these.
377 }
378 }
379
380 // Custom-expand vector reductions that are smaller than legal to prevent
381 // false zero items being added.
390
391 // We 'support' these types up to bitcast/load/store level, regardless of
392 // MVE integer-only / float support. Only doing FP data processing on the FP
393 // vector types is inhibited at integer-only level.
394 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
395 for (auto VT : LongTypes) {
396 addRegisterClass(VT, &ARM::MQPRRegClass);
397 setAllExpand(VT);
403 }
405
406 // We can do bitwise operations on v2i64 vectors
407 setOperationAction(ISD::AND, MVT::v2i64, Legal);
408 setOperationAction(ISD::OR, MVT::v2i64, Legal);
409 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
410
411 // It is legal to extload from v4i8 to v4i16 or v4i32.
412 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
413 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
414 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
415
416 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
422
423 // Some truncating stores are legal too.
424 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
425 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
426 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
427
428 // Pre and Post inc on these are legal, given the correct extends
429 for (unsigned im = (unsigned)ISD::PRE_INC;
431 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
436 }
437 }
438
439 // Predicate types
440 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
441 for (auto VT : pTypes) {
442 addRegisterClass(VT, &ARM::VCCRRegClass);
457
458 if (!HasMVEFP) {
463 }
464 }
468 setOperationAction(ISD::OR, MVT::v2i1, Expand);
474
483}
484
486 const ARMSubtarget &STI)
487 : TargetLowering(TM), Subtarget(&STI) {
488 RegInfo = Subtarget->getRegisterInfo();
489 Itins = Subtarget->getInstrItineraryData();
490
493
494 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
495 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
496 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
497 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
498 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
499 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
501 }
502
503 if (Subtarget->isTargetMachO()) {
504 // Uses VFP for Thumb libfuncs if available.
505 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
506 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
507 static const struct {
508 const RTLIB::Libcall Op;
509 const char * const Name;
510 const ISD::CondCode Cond;
511 } LibraryCalls[] = {
512 // Single-precision floating-point arithmetic.
513 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
514 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
515 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
516 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
517
518 // Double-precision floating-point arithmetic.
519 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
520 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
521 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
522 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
523
524 // Single-precision comparisons.
525 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
526 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
527 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
528 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
529 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
530 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
531 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
532
533 // Double-precision comparisons.
534 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
535 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
536 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
537 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
538 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
539 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
540 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
541
542 // Floating-point to integer conversions.
543 // i64 conversions are done via library routines even when generating VFP
544 // instructions, so use the same ones.
545 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
546 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
547 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
548 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
549
550 // Conversions between floating types.
551 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
552 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
553
554 // Integer to floating-point conversions.
555 // i64 conversions are done via library routines even when generating VFP
556 // instructions, so use the same ones.
557 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
558 // e.g., __floatunsidf vs. __floatunssidfvfp.
559 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
560 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
561 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
562 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
563 };
564
565 for (const auto &LC : LibraryCalls) {
566 setLibcallName(LC.Op, LC.Name);
567 if (LC.Cond != ISD::SETCC_INVALID)
568 setCmpLibcallCC(LC.Op, LC.Cond);
569 }
570 }
571 }
572
573 // These libcalls are not available on 32-bit ARM.
574 setLibcallName(RTLIB::SHL_I128, nullptr);
575 setLibcallName(RTLIB::SRL_I128, nullptr);
576 setLibcallName(RTLIB::SRA_I128, nullptr);
577 setLibcallName(RTLIB::MUL_I128, nullptr);
578 setLibcallName(RTLIB::MULO_I64, nullptr);
579 setLibcallName(RTLIB::MULO_I128, nullptr);
580
581 // RTLIB
582 if (Subtarget->isAAPCS_ABI() &&
583 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
584 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
585 static const struct {
586 const RTLIB::Libcall Op;
587 const char * const Name;
588 const CallingConv::ID CC;
589 const ISD::CondCode Cond;
590 } LibraryCalls[] = {
591 // Double-precision floating-point arithmetic helper functions
592 // RTABI chapter 4.1.2, Table 2
593 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
594 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
596 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
597
598 // Double-precision floating-point comparison helper functions
599 // RTABI chapter 4.1.2, Table 3
600 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
601 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
602 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
603 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
604 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
605 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
606 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
607
608 // Single-precision floating-point arithmetic helper functions
609 // RTABI chapter 4.1.2, Table 4
610 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
611 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
613 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
614
615 // Single-precision floating-point comparison helper functions
616 // RTABI chapter 4.1.2, Table 5
617 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
618 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
619 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
620 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
621 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
622 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
623 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
624
625 // Floating-point to integer conversions.
626 // RTABI chapter 4.1.2, Table 6
627 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
628 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
629 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
635
636 // Conversions between floating types.
637 // RTABI chapter 4.1.2, Table 7
638 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
639 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
640 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
641
642 // Integer to floating-point conversions.
643 // RTABI chapter 4.1.2, Table 8
644 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
645 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
646 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
647 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
648 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
649 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
651 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
652
653 // Long long helper functions
654 // RTABI chapter 4.2, Table 9
655 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
659
660 // Integer division functions
661 // RTABI chapter 4.3.1
662 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
663 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
664 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
665 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
666 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
669 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
670 };
671
672 for (const auto &LC : LibraryCalls) {
673 setLibcallName(LC.Op, LC.Name);
674 setLibcallCallingConv(LC.Op, LC.CC);
675 if (LC.Cond != ISD::SETCC_INVALID)
676 setCmpLibcallCC(LC.Op, LC.Cond);
677 }
678
679 // EABI dependent RTLIB
680 if (TM.Options.EABIVersion == EABI::EABI4 ||
681 TM.Options.EABIVersion == EABI::EABI5) {
682 static const struct {
683 const RTLIB::Libcall Op;
684 const char *const Name;
685 const CallingConv::ID CC;
686 const ISD::CondCode Cond;
687 } MemOpsLibraryCalls[] = {
688 // Memory operations
689 // RTABI chapter 4.3.4
690 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
691 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
692 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
693 };
694
695 for (const auto &LC : MemOpsLibraryCalls) {
696 setLibcallName(LC.Op, LC.Name);
697 setLibcallCallingConv(LC.Op, LC.CC);
698 if (LC.Cond != ISD::SETCC_INVALID)
699 setCmpLibcallCC(LC.Op, LC.Cond);
700 }
701 }
702 }
703
704 if (Subtarget->isTargetWindows()) {
705 static const struct {
706 const RTLIB::Libcall Op;
707 const char * const Name;
708 const CallingConv::ID CC;
709 } LibraryCalls[] = {
710 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
711 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
712 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
713 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
714 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
715 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
716 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
717 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
718 };
719
720 for (const auto &LC : LibraryCalls) {
721 setLibcallName(LC.Op, LC.Name);
722 setLibcallCallingConv(LC.Op, LC.CC);
723 }
724 }
725
726 // Use divmod compiler-rt calls for iOS 5.0 and later.
727 if (Subtarget->isTargetMachO() &&
728 !(Subtarget->isTargetIOS() &&
729 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
730 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
731 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
732 }
733
734 // The half <-> float conversion functions are always soft-float on
735 // non-watchos platforms, but are needed for some targets which use a
736 // hard-float calling convention by default.
737 if (!Subtarget->isTargetWatchABI()) {
738 if (Subtarget->isAAPCS_ABI()) {
739 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
740 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
741 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
742 } else {
743 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
744 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
745 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
746 }
747 }
748
749 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
750 // a __gnu_ prefix (which is the default).
751 if (Subtarget->isTargetAEABI()) {
752 static const struct {
753 const RTLIB::Libcall Op;
754 const char * const Name;
755 const CallingConv::ID CC;
756 } LibraryCalls[] = {
757 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
758 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
759 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
760 };
761
762 for (const auto &LC : LibraryCalls) {
763 setLibcallName(LC.Op, LC.Name);
764 setLibcallCallingConv(LC.Op, LC.CC);
765 }
766 }
767
768 if (Subtarget->isThumb1Only())
769 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
770 else
771 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
772
773 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
774 Subtarget->hasFPRegs()) {
775 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
776 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
777
782
783 if (!Subtarget->hasVFP2Base())
784 setAllExpand(MVT::f32);
785 if (!Subtarget->hasFP64())
786 setAllExpand(MVT::f64);
787 }
788
789 if (Subtarget->hasFullFP16()) {
790 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
793
796 }
797
798 if (Subtarget->hasBF16()) {
799 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
800 setAllExpand(MVT::bf16);
801 if (!Subtarget->hasFullFP16())
803 }
804
806 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
807 setTruncStoreAction(VT, InnerVT, Expand);
808 addAllExtLoads(VT, InnerVT, Expand);
809 }
810
813
815 }
816
819
822
823 if (Subtarget->hasMVEIntegerOps())
824 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
825
826 // Combine low-overhead loop intrinsics so that we can lower i1 types.
827 if (Subtarget->hasLOB()) {
829 }
830
831 if (Subtarget->hasNEON()) {
832 addDRTypeForNEON(MVT::v2f32);
833 addDRTypeForNEON(MVT::v8i8);
834 addDRTypeForNEON(MVT::v4i16);
835 addDRTypeForNEON(MVT::v2i32);
836 addDRTypeForNEON(MVT::v1i64);
837
838 addQRTypeForNEON(MVT::v4f32);
839 addQRTypeForNEON(MVT::v2f64);
840 addQRTypeForNEON(MVT::v16i8);
841 addQRTypeForNEON(MVT::v8i16);
842 addQRTypeForNEON(MVT::v4i32);
843 addQRTypeForNEON(MVT::v2i64);
844
845 if (Subtarget->hasFullFP16()) {
846 addQRTypeForNEON(MVT::v8f16);
847 addDRTypeForNEON(MVT::v4f16);
848 }
849
850 if (Subtarget->hasBF16()) {
851 addQRTypeForNEON(MVT::v8bf16);
852 addDRTypeForNEON(MVT::v4bf16);
853 }
854 }
855
856 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
857 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
858 // none of Neon, MVE or VFP supports any arithmetic operations on it.
859 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
860 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
861 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
862 // FIXME: Code duplication: FDIV and FREM are expanded always, see
863 // ARMTargetLowering::addTypeForNEON method for details.
864 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
865 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
866 // FIXME: Create unittest.
867 // In other words, find a way to exercise the case where "copysign" appears
868 // in a DAG with vector operands.
870 // FIXME: Code duplication: SETCC has custom operation action, see
871 // ARMTargetLowering::addTypeForNEON method for details.
873 // FIXME: Create unittest for FNEG and for FABS.
874 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
875 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
877 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
878 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
879 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
880 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
881 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
884 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
887 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
893 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
894 }
895
896 if (Subtarget->hasNEON()) {
897 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
898 // natively supported for v4f32.
900 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
901 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
902 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
903 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
904 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
907 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
915
916 // Mark v2f32 intrinsics.
918 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
919 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
920 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
921 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
922 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
925 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
933
934 // Neon does not support some operations on v1i64 and v2i64 types.
935 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
936 // Custom handling for some quad-vector types to detect VMULL.
937 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
938 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
939 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
940 // Custom handling for some vector types to avoid expensive expansions
941 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
943 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
945 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
946 // a destination type that is wider than the source, nor does it have a
947 // FP_TO_[SU]INT instruction with a destination narrower than the
948 // source.
957
960
961 // NEON does not have a single-instruction CTPOP for vectors with element
962 // types wider than 8 bits. However, custom lowering can leverage the
963 // v8i8/v16i8 vcnt instruction.
970
971 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
972 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
973
974 // NEON does not have a single-instruction CTTZ for vectors.
976 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
977 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
978 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
979
980 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
981 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
982 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
983 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
984
989
994
998 }
999
1000 // NEON only has FMA instructions as of VFP4.
1001 if (!Subtarget->hasVFP4Base()) {
1002 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
1003 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
1004 }
1005
1008
1009 // It is legal to extload from v4i8 to v4i16 or v4i32.
1010 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
1011 MVT::v2i32}) {
1016 }
1017 }
1018
1019 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1020 MVT::v4i32}) {
1025 }
1026 }
1027
1028 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1035 }
1036 if (Subtarget->hasMVEIntegerOps()) {
1039 ISD::SETCC});
1040 }
1041 if (Subtarget->hasMVEFloatOps()) {
1043 }
1044
1045 if (!Subtarget->hasFP64()) {
1046 // When targeting a floating-point unit with only single-precision
1047 // operations, f64 is legal for the few double-precision instructions which
1048 // are present. However, no double-precision operations other than moves,
1049 // loads and stores are provided by the hardware.
1087 }
1088
1089 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1092 if (Subtarget->hasFullFP16()) {
1095 }
1096 }
1097
1098 if (!Subtarget->hasFP16()) {
1101 }
1102
1104
1105 // ARM does not have floating-point extending loads.
1106 for (MVT VT : MVT::fp_valuetypes()) {
1107 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1108 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1109 }
1110
1111 // ... or truncating stores
1112 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1113 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1114 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1115
1116 // ARM does not have an i1 sign-extending load.
1117 for (MVT VT : MVT::integer_valuetypes())
1118 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1119
1120 // ARM supports all 4 flavors of integer indexed load / store.
1121 if (!Subtarget->isThumb1Only()) {
1122 for (unsigned im = (unsigned)ISD::PRE_INC;
1124 setIndexedLoadAction(im, MVT::i1, Legal);
1125 setIndexedLoadAction(im, MVT::i8, Legal);
1126 setIndexedLoadAction(im, MVT::i16, Legal);
1127 setIndexedLoadAction(im, MVT::i32, Legal);
1128 setIndexedStoreAction(im, MVT::i1, Legal);
1129 setIndexedStoreAction(im, MVT::i8, Legal);
1130 setIndexedStoreAction(im, MVT::i16, Legal);
1131 setIndexedStoreAction(im, MVT::i32, Legal);
1132 }
1133 } else {
1134 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1137 }
1138
1143
1146 if (Subtarget->hasDSP()) {
1155 }
1156 if (Subtarget->hasBaseDSP()) {
1159 }
1160
1161 // i64 operation support.
1164 if (Subtarget->isThumb1Only()) {
1167 }
1168 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1169 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1171
1181
1182 // MVE lowers 64-bit shifts to lsll and lsrl,
1183 // assuming that ISD::SRL and SRA of i64 are already marked Custom.
1184 if (Subtarget->hasMVEIntegerOps())
1186
1187 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1188 if (Subtarget->isThumb1Only()) {
1192 }
1193
1194 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1196
1197 // ARM does not have ROTL.
1202 }
1205 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1208 }
1209
1210 // @llvm.readcyclecounter requires the Performance Monitors extension.
1211 // Default to the 0 expansion on unsupported platforms.
1212 // FIXME: Technically there are older ARM CPUs that have
1213 // implementation-specific ways of obtaining this information.
1214 if (Subtarget->hasPerfMon())
1216
1217 // Only ARMv6 has BSWAP.
1218 if (!Subtarget->hasV6Ops())
1220
1221 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1222 : Subtarget->hasDivideInARMMode();
1223 if (!hasDivide) {
1224 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1227 }
1228
1229 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1232
1235 }
1236
1239
1240 // Register based DivRem for AEABI (RTABI 4.2)
1241 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1242 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1243 Subtarget->isTargetWindows()) {
1246 HasStandaloneRem = false;
1247
1248 if (Subtarget->isTargetWindows()) {
1249 const struct {
1250 const RTLIB::Libcall Op;
1251 const char * const Name;
1252 const CallingConv::ID CC;
1253 } LibraryCalls[] = {
1254 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1255 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1256 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1257 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1258
1259 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1260 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1261 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1262 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1263 };
1264
1265 for (const auto &LC : LibraryCalls) {
1266 setLibcallName(LC.Op, LC.Name);
1267 setLibcallCallingConv(LC.Op, LC.CC);
1268 }
1269 } else {
1270 const struct {
1271 const RTLIB::Libcall Op;
1272 const char * const Name;
1273 const CallingConv::ID CC;
1274 } LibraryCalls[] = {
1275 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1276 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1277 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1278 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1279
1280 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1281 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1282 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1283 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1284 };
1285
1286 for (const auto &LC : LibraryCalls) {
1287 setLibcallName(LC.Op, LC.Name);
1288 setLibcallCallingConv(LC.Op, LC.CC);
1289 }
1290 }
1291
1296 } else {
1299 }
1300
1301 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1302 // MSVCRT doesn't have powi; fall back to pow
1303 setLibcallName(RTLIB::POWI_F32, nullptr);
1304 setLibcallName(RTLIB::POWI_F64, nullptr);
1305 }
1306
1311
1312 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1314
1315 // Use the default implementation.
1317 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1319 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1322
1323 if (Subtarget->isTargetWindows())
1325 else
1327
1328 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1329 // the default expansion.
1330 InsertFencesForAtomic = false;
1331 if (Subtarget->hasAnyDataBarrier() &&
1332 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1333 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1334 // to ldrex/strex loops already.
1336 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1338
1339 // On v8, we have particularly efficient implementations of atomic fences
1340 // if they can be combined with nearby atomic loads and stores.
1341 if (!Subtarget->hasAcquireRelease() ||
1342 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1343 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1344 InsertFencesForAtomic = true;
1345 }
1346 } else {
1347 // If there's anything we can use as a barrier, go through custom lowering
1348 // for ATOMIC_FENCE.
1349 // If the target has DMB in Thumb mode, fences can be inserted.
1350 if (Subtarget->hasDataBarrier())
1351 InsertFencesForAtomic = true;
1352
1354 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1355
1356 // Set them all to LibCall, which will force libcalls.
1369 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1370 // Unordered/Monotonic case.
1371 if (!InsertFencesForAtomic) {
1374 }
1375 }
1376
1377 // Compute supported atomic widths.
1378 if (Subtarget->isTargetLinux() ||
1379 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1380 // For targets where __sync_* routines are reliably available, we use them
1381 // if necessary.
1382 //
1383 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1384 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1385 //
1386 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1387 // such targets should provide __sync_* routines, which use the ARM mode
1388 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1389 // encoding; see ARMISD::MEMBARRIER_MCR.)
1391 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1392 Subtarget->hasForced32BitAtomics()) {
1393 // Cortex-M (besides Cortex-M0) has 32-bit atomics.
1395 } else {
1396 // We can't assume anything about other targets; just use libatomic
1397 // routines.
1399 }
1400
1402
1404
1405 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1406 if (!Subtarget->hasV6Ops()) {
1409 }
1411
1412 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1413 !Subtarget->isThumb1Only()) {
1414 // Turn f64 -> i64 into VMOVRRD and i64 -> f64 into VMOVDRR,
1415 // iff the target supports VFP2.
1425 }
1426
1427 // We want to custom lower some of our intrinsics.
1432 if (Subtarget->useSjLjEH())
1433 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1434
1444 if (Subtarget->hasFullFP16()) {
1448 }
1449
1451
1454 if (Subtarget->hasFullFP16())
1458 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1459
1460 // We don't support sin/cos/fmod/copysign/pow
1469 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1470 !Subtarget->isThumb1Only()) {
1473 }
1476
1477 if (!Subtarget->hasVFP4Base()) {
1480 }
1481
1482 // Various VFP goodness
1483 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1484 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1485 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1488 }
1489
1490 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1491 if (!Subtarget->hasFP16()) {
1494 }
1495
1496 // Strict floating-point comparisons need custom lowering.
1503 }
1504
1505 // Use __sincos_stret if available.
1506 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1507 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1510 }
1511
1512 // FP-ARMv8 implements a lot of rounding-like FP operations.
1513 if (Subtarget->hasFPARMv8Base()) {
1522 if (Subtarget->hasNEON()) {
1527 }
1528
1529 if (Subtarget->hasFP64()) {
1538 }
1539 }
1540
1541 // FP16 often needs to be promoted to call library functions.
1542 if (Subtarget->hasFullFP16()) {
1557
1559 }
1560
1561 if (Subtarget->hasNEON()) {
1562 // vmin and vmax aren't available in a scalar form, so we can use
1563 // a NEON instruction with an undef lane instead.
1572
1573 if (Subtarget->hasFullFP16()) {
1578
1583 }
1584 }
1585
1586 // We have target-specific dag combine patterns for the following nodes:
1587 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1590
1591 if (Subtarget->hasMVEIntegerOps())
1593
1594 if (Subtarget->hasV6Ops())
1596 if (Subtarget->isThumb1Only())
1598 // Attempt to lower smin/smax to ssat/usat
1599 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1600 Subtarget->isThumb2()) {
1602 }
1603
1605
1606 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1607 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1609 else
1611
1612 //// temporary - rewrite interface to use type
1615 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1617 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1619
1620 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1621 // are at least 4 bytes aligned.
1623
1624 // Prefer likely predicted branches to selects on out-of-order cores.
1625 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1626
1627 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1629
1630 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1631}
1632
1634 return Subtarget->useSoftFloat();
1635}
1636
1637// FIXME: It might make sense to define the representative register class as the
1638// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1639// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1640// SPR's representative would be DPR_VFP2. This should work well if register
1641// pressure tracking were modified such that a register use would increment the
1642// pressure of the register class's representative and all of its super
1643// classes' representatives transitively. We have not implemented this because
1644// of the difficulty prior to coalescing of modeling operand register classes
1645// due to the common occurrence of cross class copies and subregister insertions
1646// and extractions.
1647std::pair<const TargetRegisterClass *, uint8_t>
1649 MVT VT) const {
1650 const TargetRegisterClass *RRC = nullptr;
1651 uint8_t Cost = 1;
1652 switch (VT.SimpleTy) {
1653 default:
1655 // Use DPR as representative register class for all floating point
1656 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1657 // the cost is 1 for both f32 and f64.
1658 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1659 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1660 RRC = &ARM::DPRRegClass;
1661 // When NEON is used for SP, only half of the register file is available
1662 // because operations that define both SP and DP results will be constrained
1663 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1664 // coalescing by double-counting the SP regs. See the FIXME above.
1665 if (Subtarget->useNEONForSinglePrecisionFP())
1666 Cost = 2;
1667 break;
1668 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1669 case MVT::v4f32: case MVT::v2f64:
1670 RRC = &ARM::DPRRegClass;
1671 Cost = 2;
1672 break;
1673 case MVT::v4i64:
1674 RRC = &ARM::DPRRegClass;
1675 Cost = 4;
1676 break;
1677 case MVT::v8i64:
1678 RRC = &ARM::DPRRegClass;
1679 Cost = 8;
1680 break;
1681 }
1682 return std::make_pair(RRC, Cost);
1683}
1684
1685const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1686#define MAKE_CASE(V) \
1687 case V: \
1688 return #V;
1689 switch ((ARMISD::NodeType)Opcode) {
1691 break;
1894#undef MAKE_CASE
1895 }
1896 return nullptr;
1897}
1898
1900 EVT VT) const {
1901 if (!VT.isVector())
1902 return getPointerTy(DL);
1903
1904 // MVE has a predicate register.
1905 if ((Subtarget->hasMVEIntegerOps() &&
1906 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1907 VT == MVT::v16i8)) ||
1908 (Subtarget->hasMVEFloatOps() &&
1909 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1910 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1912}
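// For example, a setcc comparing two v4i32 vectors under MVE yields v4i1,
// which lives in the predicate register mentioned above; scalar comparisons
// simply use the pointer-sized integer type.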
1913
1914/// getRegClassFor - Return the register class that should be used for the
1915/// specified value type.
1916const TargetRegisterClass *
1917ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1918 (void)isDivergent;
1919 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1920 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1921 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1922 // MVE Q registers.
1923 if (Subtarget->hasNEON()) {
1924 if (VT == MVT::v4i64)
1925 return &ARM::QQPRRegClass;
1926 if (VT == MVT::v8i64)
1927 return &ARM::QQQQPRRegClass;
1928 }
1929 if (Subtarget->hasMVEIntegerOps()) {
1930 if (VT == MVT::v4i64)
1931 return &ARM::MQQPRRegClass;
1932 if (VT == MVT::v8i64)
1933 return &ARM::MQQQQPRRegClass;
1934 }
1936}
1937
1938// memcpy and other memory intrinsics typically try to use LDM/STM if the
1939// source/dest is aligned and the copy size is large enough. We therefore want
1940// to align such objects passed to memory intrinsics.
1942 Align &PrefAlign) const {
1943 if (!isa<MemIntrinsic>(CI))
1944 return false;
1945 MinSize = 8;
1946 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1947 // cycle faster than 4-byte aligned LDM.
1948 PrefAlign =
1949 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1950 return true;
1951}
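// Usage sketch for the helper above: a memcpy or memset call gets
// MinSize = 8 and PrefAlign = 8 (4 on M-class or pre-v6 cores), hinting
// generic code to raise the alignment of objects passed to such intrinsics
// so LDM/STM can be used.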
1952
1953// Create a fast isel object.
1954FastISel *
1956 const TargetLibraryInfo *libInfo) const {
1957 return ARM::createFastISel(funcInfo, libInfo);
1958}
1959
1961 unsigned NumVals = N->getNumValues();
1962 if (!NumVals)
1963 return Sched::RegPressure;
1964
1965 for (unsigned i = 0; i != NumVals; ++i) {
1966 EVT VT = N->getValueType(i);
1967 if (VT == MVT::Glue || VT == MVT::Other)
1968 continue;
1969 if (VT.isFloatingPoint() || VT.isVector())
1970 return Sched::ILP;
1971 }
1972
1973 if (!N->isMachineOpcode())
1974 return Sched::RegPressure;
1975
1976 // Loads are scheduled for latency even if the instruction itinerary
1977 // is not available.
1978 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1979 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1980
1981 if (MCID.getNumDefs() == 0)
1982 return Sched::RegPressure;
1983 if (!Itins->isEmpty() &&
1984 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1985 return Sched::ILP;
1986
1987 return Sched::RegPressure;
1988}
1989
1990//===----------------------------------------------------------------------===//
1991// Lowering Code
1992//===----------------------------------------------------------------------===//
1993
1994static bool isSRL16(const SDValue &Op) {
1995 if (Op.getOpcode() != ISD::SRL)
1996 return false;
1997 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1998 return Const->getZExtValue() == 16;
1999 return false;
2000}
2001
2002static bool isSRA16(const SDValue &Op) {
2003 if (Op.getOpcode() != ISD::SRA)
2004 return false;
2005 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2006 return Const->getZExtValue() == 16;
2007 return false;
2008}
2009
2010static bool isSHL16(const SDValue &Op) {
2011 if (Op.getOpcode() != ISD::SHL)
2012 return false;
2013 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2014 return Const->getZExtValue() == 16;
2015 return false;
2016}
2017
2018// Check for a signed 16-bit value. We special-case SRA because it keeps
2019// things simpler when also looking for SRAs that aren't sign-extending a
2020// smaller value. Without the check, we'd need to take extra care with
2021// checking order for some operations.
2022static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2023 if (isSRA16(Op))
2024 return isSHL16(Op.getOperand(0));
2025 return DAG.ComputeNumSignBits(Op) == 17;
2026}
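// Sketch of the reasoning above: on a 32-bit value, 17 known sign bits leave
// at most 15 significant bits plus the sign, so the value behaves as a
// sign-extended i16, e.g. (sra (shl x, 16), 16) or a sign-extended i16 load.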
2027
2028/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2030 switch (CC) {
2031 default: llvm_unreachable("Unknown condition code!");
2032 case ISD::SETNE: return ARMCC::NE;
2033 case ISD::SETEQ: return ARMCC::EQ;
2034 case ISD::SETGT: return ARMCC::GT;
2035 case ISD::SETGE: return ARMCC::GE;
2036 case ISD::SETLT: return ARMCC::LT;
2037 case ISD::SETLE: return ARMCC::LE;
2038 case ISD::SETUGT: return ARMCC::HI;
2039 case ISD::SETUGE: return ARMCC::HS;
2040 case ISD::SETULT: return ARMCC::LO;
2041 case ISD::SETULE: return ARMCC::LS;
2042 }
2043}
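// For example, (setcc a, b, SETUGT) on i32 lowers to a CMP of a and b
// followed by an instruction predicated on HI (unsigned higher), using the
// mapping above.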
2044
2045/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2047 ARMCC::CondCodes &CondCode2) {
2048 CondCode2 = ARMCC::AL;
2049 switch (CC) {
2050 default: llvm_unreachable("Unknown FP condition!");
2051 case ISD::SETEQ:
2052 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2053 case ISD::SETGT:
2054 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2055 case ISD::SETGE:
2056 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2057 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2058 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2059 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2060 case ISD::SETO: CondCode = ARMCC::VC; break;
2061 case ISD::SETUO: CondCode = ARMCC::VS; break;
2062 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2063 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2064 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2065 case ISD::SETLT:
2066 case ISD::SETULT: CondCode = ARMCC::LT; break;
2067 case ISD::SETLE:
2068 case ISD::SETULE: CondCode = ARMCC::LE; break;
2069 case ISD::SETNE:
2070 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2071 }
2072}
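// When CondCode2 != ARMCC::AL, two predicated checks are required: e.g.
// SETONE (ordered and not equal) maps to MI plus a second check on GT, and
// SETUEQ maps to EQ plus a second check on VS (unordered).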
2073
2074//===----------------------------------------------------------------------===//
2075// Calling Convention Implementation
2076//===----------------------------------------------------------------------===//
2077
2078/// getEffectiveCallingConv - Get the effective calling convention, taking into
2079/// account the presence of floating-point hardware and calling-convention
2080/// limitations, such as support for variadic functions.
2082ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2083 bool isVarArg) const {
2084 switch (CC) {
2085 default:
2086 report_fatal_error("Unsupported calling convention");
2089 case CallingConv::GHC:
2091 return CC;
2097 case CallingConv::Swift:
2100 case CallingConv::C:
2101 case CallingConv::Tail:
2102 if (!Subtarget->isAAPCS_ABI())
2103 return CallingConv::ARM_APCS;
2104 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2105 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2106 !isVarArg)
2108 else
2110 case CallingConv::Fast:
2112 if (!Subtarget->isAAPCS_ABI()) {
2113 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2114 return CallingConv::Fast;
2115 return CallingConv::ARM_APCS;
2116 } else if (Subtarget->hasVFP2Base() &&
2117 !Subtarget->isThumb1Only() && !isVarArg)
2119 else
2121 }
2122}
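// For example, a plain C call on an AAPCS target with FP registers, a
// hard-float ABI and no varargs is treated as ARM_AAPCS_VFP, while the same
// call with varargs falls back to ARM_AAPCS (rough summary of the switch
// above).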
2123
2125 bool isVarArg) const {
2126 return CCAssignFnForNode(CC, false, isVarArg);
2127}
2128
2130 bool isVarArg) const {
2131 return CCAssignFnForNode(CC, true, isVarArg);
2132}
2133
2134/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2135/// CallingConvention.
2136CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2137 bool Return,
2138 bool isVarArg) const {
2139 switch (getEffectiveCallingConv(CC, isVarArg)) {
2140 default:
2141 report_fatal_error("Unsupported calling convention");
2143 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2145 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2147 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2148 case CallingConv::Fast:
2149 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2150 case CallingConv::GHC:
2151 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2153 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2155 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2157 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2158 }
2159}
2160
2161SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2162 MVT LocVT, MVT ValVT, SDValue Val) const {
2163 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2164 Val);
2165 if (Subtarget->hasFullFP16()) {
2166 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2167 } else {
2168 Val = DAG.getNode(ISD::TRUNCATE, dl,
2169 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2170 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2171 }
2172 return Val;
2173}
2174
2175SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2176 MVT LocVT, MVT ValVT,
2177 SDValue Val) const {
2178 if (Subtarget->hasFullFP16()) {
2179 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2180 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2181 } else {
2182 Val = DAG.getNode(ISD::BITCAST, dl,
2183 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2184 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2185 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2186 }
2187 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2188}
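// Rough sketch of the f16/bf16 handling in the two helpers above: a half
// value travels in the low 16 bits of a 32-bit location, so MoveToHPR
// reinterprets the i32/f32 location as a half (VMOVhr with full fp16,
// otherwise truncate + bitcast), and MoveFromHPR widens the half back into
// the 32-bit location (VMOVrh, otherwise bitcast + zero-extend).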
2189
2190/// LowerCallResult - Lower the result values of a call into the
2191/// appropriate copies out of appropriate physical registers.
2192SDValue ARMTargetLowering::LowerCallResult(
2193 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2194 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2195 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2196 SDValue ThisVal) const {
2197 // Assign locations to each value returned by this call.
2199 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2200 *DAG.getContext());
2201 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2202
2203 // Copy all of the result registers out of their specified physreg.
2204 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2205 CCValAssign VA = RVLocs[i];
2206
2207 // Pass 'this' value directly from the argument to return value, to avoid
2208 // reg unit interference
2209 if (i == 0 && isThisReturn) {
2210 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2211 "unexpected return calling convention register assignment");
2212 InVals.push_back(ThisVal);
2213 continue;
2214 }
2215
2216 SDValue Val;
2217 if (VA.needsCustom() &&
2218 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2219 // Handle f64 or half of a v2f64.
2220 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2221 InGlue);
2222 Chain = Lo.getValue(1);
2223 InGlue = Lo.getValue(2);
2224 VA = RVLocs[++i]; // skip ahead to next loc
2225 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2226 InGlue);
2227 Chain = Hi.getValue(1);
2228 InGlue = Hi.getValue(2);
2229 if (!Subtarget->isLittle())
2230 std::swap (Lo, Hi);
2231 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2232
2233 if (VA.getLocVT() == MVT::v2f64) {
2234 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2235 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2236 DAG.getConstant(0, dl, MVT::i32));
2237
2238 VA = RVLocs[++i]; // skip ahead to next loc
2239 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2240 Chain = Lo.getValue(1);
2241 InGlue = Lo.getValue(2);
2242 VA = RVLocs[++i]; // skip ahead to next loc
2243 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2244 Chain = Hi.getValue(1);
2245 InGlue = Hi.getValue(2);
2246 if (!Subtarget->isLittle())
2247 std::swap (Lo, Hi);
2248 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2249 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2250 DAG.getConstant(1, dl, MVT::i32));
2251 }
2252 } else {
2253 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2254 InGlue);
2255 Chain = Val.getValue(1);
2256 InGlue = Val.getValue(2);
2257 }
2258
2259 switch (VA.getLocInfo()) {
2260 default: llvm_unreachable("Unknown loc info!");
2261 case CCValAssign::Full: break;
2262 case CCValAssign::BCvt:
2263 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2264 break;
2265 }
2266
2267 // f16 arguments have their size extended to 4 bytes and passed as if they
2268 // had been copied to the LSBs of a 32-bit register.
2269 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2270 if (VA.needsCustom() &&
2271 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2272 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2273
2274 InVals.push_back(Val);
2275 }
2276
2277 return Chain;
2278}
2279
2280std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2281 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2282 bool IsTailCall, int SPDiff) const {
2283 SDValue DstAddr;
2284 MachinePointerInfo DstInfo;
2285 int32_t Offset = VA.getLocMemOffset();
2287
2288 if (IsTailCall) {
2289 Offset += SPDiff;
2290 auto PtrVT = getPointerTy(DAG.getDataLayout());
2291 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2292 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2293 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2294 DstInfo =
2296 } else {
2297 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2298 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2299 StackPtr, PtrOff);
2300 DstInfo =
2302 }
2303
2304 return std::make_pair(DstAddr, DstInfo);
2305}
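// Sketch: for a normal call the destination is simply SP + LocMemOffset; for
// a tail call the store targets a fixed stack object at LocMemOffset + SPDiff
// in the caller's frame, since the argument area is reused in place.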
2306
2307void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2308 SDValue Chain, SDValue &Arg,
2309 RegsToPassVector &RegsToPass,
2310 CCValAssign &VA, CCValAssign &NextVA,
2311 SDValue &StackPtr,
2312 SmallVectorImpl<SDValue> &MemOpChains,
2313 bool IsTailCall,
2314 int SPDiff) const {
2315 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2316 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2317 unsigned id = Subtarget->isLittle() ? 0 : 1;
2318 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2319
2320 if (NextVA.isRegLoc())
2321 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2322 else {
2323 assert(NextVA.isMemLoc());
2324 if (!StackPtr.getNode())
2325 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2327
2328 SDValue DstAddr;
2329 MachinePointerInfo DstInfo;
2330 std::tie(DstAddr, DstInfo) =
2331 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2332 MemOpChains.push_back(
2333 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2334 }
2335}
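// For example, an f64 argument assigned to R2/R3 under soft-float AAPCS is
// split by VMOVRRD into two i32 halves here; when only one register remains,
// one half goes in that register and the other is stored to the stack slot
// described by NextVA (which half depends on endianness, per the id above).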
2336
2337static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2338 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2340}
2341
2342/// LowerCall - Lower a call into a callseq_start <-
2343/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2344/// nodes.
2345SDValue
2346ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2347 SmallVectorImpl<SDValue> &InVals) const {
2348 SelectionDAG &DAG = CLI.DAG;
2349 SDLoc &dl = CLI.DL;
2351 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2353 SDValue Chain = CLI.Chain;
2354 SDValue Callee = CLI.Callee;
2355 bool &isTailCall = CLI.IsTailCall;
2356 CallingConv::ID CallConv = CLI.CallConv;
2357 bool doesNotRet = CLI.DoesNotReturn;
2358 bool isVarArg = CLI.IsVarArg;
2359
2363 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2364 bool isThisReturn = false;
2365 bool isCmseNSCall = false;
2366 bool isSibCall = false;
2367 bool PreferIndirect = false;
2368 bool GuardWithBTI = false;
2369
2370 // Analyze operands of the call, assigning locations to each operand.
2371 SmallVector<CCValAssign, 16> ArgLocs;
2372 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2373 *DAG.getContext());
2374 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2375
2376 // Lower 'returns_twice' calls to a pseudo-instruction.
2377 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2378 !Subtarget->noBTIAtReturnTwice())
2379 GuardWithBTI = AFI->branchTargetEnforcement();
2380
2381 // Determine whether this is a non-secure function call.
2382 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2383 isCmseNSCall = true;
2384
2385 // Disable tail calls if they're not supported.
2386 if (!Subtarget->supportsTailCall())
2387 isTailCall = false;
2388
2389 // For both the non-secure calls and the returns from a CMSE entry function,
2390 // the function needs to do some extra work after the call, or before the
2391 // return, respectively, thus it cannot end with a tail call
2392 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2393 isTailCall = false;
2394
2395 if (isa<GlobalAddressSDNode>(Callee)) {
2396 // If we're optimizing for minimum size and the function is called three or
2397 // more times in this block, we can improve codesize by calling indirectly
2398 // as BLXr has a 16-bit encoding.
2399 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2400 if (CLI.CB) {
2401 auto *BB = CLI.CB->getParent();
2402 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2403 count_if(GV->users(), [&BB](const User *U) {
2404 return isa<Instruction>(U) &&
2405 cast<Instruction>(U)->getParent() == BB;
2406 }) > 2;
2407 }
2408 }
2409 if (isTailCall) {
2410 // Check if it's really possible to do a tail call.
2411 isTailCall =
2412 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2413
2414 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2415 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2416 isSibCall = true;
2417
2418 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2419 // detected sibcalls.
2420 if (isTailCall)
2421 ++NumTailCalls;
2422 }
2423
2424 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2425 report_fatal_error("failed to perform tail call elimination on a call "
2426 "site marked musttail");
2427
2428 // Get a count of how many bytes are to be pushed on the stack.
2429 unsigned NumBytes = CCInfo.getStackSize();
2430
2431 // SPDiff is the byte offset of the call's argument area from the callee's.
2432 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2433 // by this amount for a tail call. In a sibling call it must be 0 because the
2434 // caller will deallocate the entire stack and the callee still expects its
2435 // arguments to begin at SP+0. Completely unused for non-tail calls.
2436 int SPDiff = 0;
2437
2438 if (isTailCall && !isSibCall) {
2439 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2440 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2441
2442 // Since callee will pop argument stack as a tail call, we must keep the
2443 // popped size 16-byte aligned.
2444 Align StackAlign = DAG.getDataLayout().getStackAlignment();
2445 NumBytes = alignTo(NumBytes, StackAlign);
2446
2447 // SPDiff will be negative if this tail call requires more space than we
2448 // would automatically have in our incoming argument space. Positive if we
2449 // can actually shrink the stack.
2450 SPDiff = NumReusableBytes - NumBytes;
2451
2452 // If this call requires more stack than we have available from
2453 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2454 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2455 AFI->setArgRegsSaveSize(-SPDiff);
2456 }
2457
2458 if (isSibCall) {
2459 // For sibling tail calls, memory operands are available in our caller's stack.
2460 NumBytes = 0;
2461 } else {
2462 // Adjust the stack pointer for the new arguments...
2463 // These operations are automatically eliminated by the prolog/epilog pass
2464 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2465 }
2466
2467 SDValue StackPtr =
2468 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2469
2470 RegsToPassVector RegsToPass;
2471 SmallVector<SDValue, 8> MemOpChains;
2472
2473 // During a tail call, stores to the argument area must happen after all of
2474 // the function's incoming arguments have been loaded because they may alias.
2475 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2476 // there's no point in doing so repeatedly so this tracks whether that's
2477 // happened yet.
2478 bool AfterFormalArgLoads = false;
2479
2480 // Walk the register/memloc assignments, inserting copies/loads. In the case
2481 // of tail call optimization, arguments are handled later.
2482 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2483 i != e;
2484 ++i, ++realArgIdx) {
2485 CCValAssign &VA = ArgLocs[i];
2486 SDValue Arg = OutVals[realArgIdx];
2487 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2488 bool isByVal = Flags.isByVal();
2489
2490 // Promote the value if needed.
2491 switch (VA.getLocInfo()) {
2492 default: llvm_unreachable("Unknown loc info!");
2493 case CCValAssign::Full: break;
2494 case CCValAssign::SExt:
2495 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2496 break;
2497 case CCValAssign::ZExt:
2498 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2499 break;
2500 case CCValAssign::AExt:
2501 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2502 break;
2503 case CCValAssign::BCvt:
2504 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2505 break;
2506 }
2507
2508 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2509 Chain = DAG.getStackArgumentTokenFactor(Chain);
2510 AfterFormalArgLoads = true;
2511 }
2512
2513 // f16 arguments have their size extended to 4 bytes and passed as if they
2514 // had been copied to the LSBs of a 32-bit register.
2515 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2516 if (VA.needsCustom() &&
2517 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2518 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2519 } else {
2520 // f16 arguments could have been extended prior to argument lowering.
2521 // Mask these arguments if this is a CMSE nonsecure call.
2522 auto ArgVT = Outs[realArgIdx].ArgVT;
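// For a CMSE non-secure call, clear the bits above the f16 payload of the
// 32-bit location so no stale data from the wider register is exposed to the
// non-secure callee.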
2523 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2524 auto LocBits = VA.getLocVT().getSizeInBits();
2525 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2526 SDValue Mask =
2527 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2528 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2529 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2530 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2531 }
2532 }
2533
2534 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2535 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2536 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2537 DAG.getConstant(0, dl, MVT::i32));
2538 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2539 DAG.getConstant(1, dl, MVT::i32));
2540
2541 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2542 StackPtr, MemOpChains, isTailCall, SPDiff);
2543
2544 VA = ArgLocs[++i]; // skip ahead to next loc
2545 if (VA.isRegLoc()) {
2546 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2547 StackPtr, MemOpChains, isTailCall, SPDiff);
2548 } else {
2549 assert(VA.isMemLoc());
2550 SDValue DstAddr;
2551 MachinePointerInfo DstInfo;
2552 std::tie(DstAddr, DstInfo) =
2553 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2554 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2555 }
2556 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2557 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2558 StackPtr, MemOpChains, isTailCall, SPDiff);
2559 } else if (VA.isRegLoc()) {
2560 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2561 Outs[0].VT == MVT::i32) {
2562 assert(VA.getLocVT() == MVT::i32 &&
2563 "unexpected calling convention register assignment");
2564 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2565 "unexpected use of 'returned'");
2566 isThisReturn = true;
2567 }
2568 const TargetOptions &Options = DAG.getTarget().Options;
2569 if (Options.EmitCallSiteInfo)
2570 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2571 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2572 } else if (isByVal) {
2573 assert(VA.isMemLoc());
2574 unsigned offset = 0;
2575
2576 // True if this byval aggregate will be split between registers
2577 // and memory.
2578 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2579 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2580
2581 if (CurByValIdx < ByValArgsCount) {
2582
2583 unsigned RegBegin, RegEnd;
2584 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2585
2586 EVT PtrVT =
2587 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2588 unsigned int i, j;
2589 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2590 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2591 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2592 SDValue Load =
2593 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2594 DAG.InferPtrAlign(AddArg));
2595 MemOpChains.push_back(Load.getValue(1));
2596 RegsToPass.push_back(std::make_pair(j, Load));
2597 }
2598
2599 // If the parameter size exceeds the register area, the "offset" value
2600 // helps us to calculate the stack slot for the remaining part properly.
2601 offset = RegEnd - RegBegin;
2602
2603 CCInfo.nextInRegsParam();
2604 }
2605
2606 if (Flags.getByValSize() > 4*offset) {
2607 auto PtrVT = getPointerTy(DAG.getDataLayout());
2608 SDValue Dst;
2609 MachinePointerInfo DstInfo;
2610 std::tie(Dst, DstInfo) =
2611 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2612 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2613 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2614 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2615 MVT::i32);
2616 SDValue AlignNode =
2617 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2618
2619 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2620 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2621 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2622 Ops));
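// The portion of the byval aggregate that did not fit in registers is copied
// into the outgoing argument area by ARMISD::COPY_STRUCT_BYVAL, which is
// expanded later (typically into a copy loop).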
2623 }
2624 } else {
2625 assert(VA.isMemLoc());
2626 SDValue DstAddr;
2627 MachinePointerInfo DstInfo;
2628 std::tie(DstAddr, DstInfo) =
2629 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2630
2631 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2632 MemOpChains.push_back(Store);
2633 }
2634 }
2635
2636 if (!MemOpChains.empty())
2637 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2638
2639 // Build a sequence of copy-to-reg nodes chained together with token chain
2640 // and flag operands which copy the outgoing args into the appropriate regs.
2641 SDValue InGlue;
2642 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2643 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2644 RegsToPass[i].second, InGlue);
2645 InGlue = Chain.getValue(1);
2646 }
2647
2648 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2649 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2650 // node so that legalize doesn't hack it.
2651 bool isDirect = false;
2652 const TargetMachine &TM = getTargetMachine();
2652
2654 const GlobalValue *GVal = nullptr;
2655 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2656 GVal = G->getGlobal();
2657 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && Subtarget->isTargetMachO();
2658
2659 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2660 bool isLocalARMFunc = false;
2661 auto PtrVt = getPointerTy(DAG.getDataLayout());
2662
2663 if (Subtarget->genLongCalls()) {
2664 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2665 "long-calls codegen is not position independent!");
2666 // Handle a global address or an external symbol. If it's not one of
2667 // those, the target's already in a register, so we don't need to do
2668 // anything extra.
2669 if (isa<GlobalAddressSDNode>(Callee)) {
2670 if (Subtarget->genExecuteOnly()) {
2671 if (Subtarget->useMovt())
2672 ++NumMovwMovt;
2673 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2674 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2675 } else {
2676 // Create a constant pool entry for the callee address
2677 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2678 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2679 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2680
2681 // Get the address of the callee into a register
2682 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2683 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2684 Callee = DAG.getLoad(
2685 PtrVt, dl, DAG.getEntryNode(), Addr,
2686 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2687 }
2688 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2689 const char *Sym = S->getSymbol();
2690
2691 if (Subtarget->genExecuteOnly()) {
2692 if (Subtarget->useMovt())
2693 ++NumMovwMovt;
2694 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2695 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2696 } else {
2697 // Create a constant pool entry for the callee address
2698 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2699 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2700 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2701
2702 // Get the address of the callee into a register
2703 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2704 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2705 Callee = DAG.getLoad(
2706 PtrVt, dl, DAG.getEntryNode(), Addr,
2707 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2708 }
2709 }
2710 } else if (isa<GlobalAddressSDNode>(Callee)) {
2711 if (!PreferIndirect) {
2712 isDirect = true;
2713 bool isDef = GVal->isStrongDefinitionForLinker();
2714
2715 // ARM call to a local ARM function is predicable.
2716 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2717 // tBX takes a register source operand.
2718 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2719 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2720 Callee = DAG.getNode(
2721 ARMISD::WrapperPIC, dl, PtrVt,
2722 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2723 Callee = DAG.getLoad(
2724 PtrVt, dl, DAG.getEntryNode(), Callee,
2725 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2726 MachineMemOperand::MODereferenceable |
2727 MachineMemOperand::MOInvariant);
2728 } else if (Subtarget->isTargetCOFF()) {
2729 assert(Subtarget->isTargetWindows() &&
2730 "Windows is the only supported COFF target");
2731 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2732 if (GVal->hasDLLImportStorageClass())
2733 TargetFlags = ARMII::MO_DLLIMPORT;
2734 else if (!TM.shouldAssumeDSOLocal(GVal))
2735 TargetFlags = ARMII::MO_COFFSTUB;
2736 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2737 TargetFlags);
2738 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2739 Callee =
2740 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2741 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2742 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2743 } else {
2744 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2745 }
2746 }
2747 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2748 isDirect = true;
2749 // tBX takes a register source operand.
2750 const char *Sym = S->getSymbol();
2751 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2752 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2753 ARMConstantPoolValue *CPV =
2754 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2755 ARMPCLabelIndex, 4);
2756 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2757 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2758 Callee = DAG.getLoad(
2759 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2760 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2761 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2762 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2763 } else {
2764 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2765 }
2766 }
2767
2768 if (isCmseNSCall) {
2769 assert(!isARMFunc && !isDirect &&
2770 "Cannot handle call to ARM function or direct call");
2771 if (NumBytes > 0) {
2773 "call to non-secure function would "
2774 "require passing arguments on stack",
2775 dl.getDebugLoc());
2776 DAG.getContext()->diagnose(Diag);
2777 }
2778 if (isStructRet) {
2781 "call to non-secure function would return value through pointer",
2782 dl.getDebugLoc());
2783 DAG.getContext()->diagnose(Diag);
2784 }
2785 }
2786
2787 // FIXME: handle tail calls differently.
2788 unsigned CallOpc;
2789 if (Subtarget->isThumb()) {
2790 if (GuardWithBTI)
2791 CallOpc = ARMISD::t2CALL_BTI;
2792 else if (isCmseNSCall)
2793 CallOpc = ARMISD::tSECALL;
2794 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2795 CallOpc = ARMISD::CALL_NOLINK;
2796 else
2797 CallOpc = ARMISD::CALL;
2798 } else {
2799 if (!isDirect && !Subtarget->hasV5TOps())
2800 CallOpc = ARMISD::CALL_NOLINK;
2801 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2802 // Emit regular call when code size is the priority
2803 !Subtarget->hasMinSize())
2804 // "mov lr, pc; b _foo" to avoid confusing the RSP
2805 CallOpc = ARMISD::CALL_NOLINK;
2806 else
2807 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2808 }
2809
2810 // We don't usually want to end the call-sequence here because we would tidy
2811 // the frame up *after* the call, however in the ABI-changing tail-call case
2812 // we've carefully laid out the parameters so that when sp is reset they'll be
2813 // in the correct location.
2814 if (isTailCall && !isSibCall) {
2815 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2816 InGlue = Chain.getValue(1);
2817 }
2818
2819 std::vector<SDValue> Ops;
2820 Ops.push_back(Chain);
2821 Ops.push_back(Callee);
2822
2823 if (isTailCall) {
2824 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2825 }
2826
2827 // Add argument registers to the end of the list so that they are known live
2828 // into the call.
2829 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2830 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2831 RegsToPass[i].second.getValueType()));
2832
2833 // Add a register mask operand representing the call-preserved registers.
2834 const uint32_t *Mask;
2835 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2836 if (isThisReturn) {
2837 // For 'this' returns, use the R0-preserving mask if applicable
2838 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2839 if (!Mask) {
2840 // Set isThisReturn to false if the calling convention is not one that
2841 // allows 'returned' to be modeled in this way, so LowerCallResult does
2842 // not try to pass 'this' straight through
2843 isThisReturn = false;
2844 Mask = ARI->getCallPreservedMask(MF, CallConv);
2845 }
2846 } else
2847 Mask = ARI->getCallPreservedMask(MF, CallConv);
2848
2849 assert(Mask && "Missing call preserved mask for calling convention");
2850 Ops.push_back(DAG.getRegisterMask(Mask));
2851
2852 if (InGlue.getNode())
2853 Ops.push_back(InGlue);
2854
2855 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2856 if (isTailCall) {
2857 MF.getFrameInfo().setHasTailCall();
2858 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2859 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2860 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2861 return Ret;
2862 }
2863
2864 // Returns a chain and a flag for retval copy to use.
2865 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2866 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2867 InGlue = Chain.getValue(1);
2868 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2869
2870 // If we're guaranteeing tail-calls will be honoured, the callee must
2871 // pop its own argument stack on return. But this call is *not* a tail call so
2872 // we need to undo that after it returns to restore the status-quo.
2873 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2874 uint64_t CalleePopBytes =
2875 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2876
2877 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2878 if (!Ins.empty())
2879 InGlue = Chain.getValue(1);
2880
2881 // Handle result values, copying them out of physregs into vregs that we
2882 // return.
2883 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2884 InVals, isThisReturn,
2885 isThisReturn ? OutVals[0] : SDValue());
2886}
2887
2888/// HandleByVal - Every parameter *after* a byval parameter is passed
2889/// on the stack. Remember the next parameter register to allocate,
2890 /// and then confiscate the rest of the parameter registers to ensure
2891/// this.
2892void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2893 Align Alignment) const {
2894 // Byval (as with any stack) slots are always at least 4 byte aligned.
2895 Alignment = std::max(Alignment, Align(4));
2896
2897 unsigned Reg = State->AllocateReg(GPRArgRegs);
2898 if (!Reg)
2899 return;
2900
2901 unsigned AlignInRegs = Alignment.value() / 4;
2902 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
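// Note: this arithmetic relies on ARM::R0..ARM::R4 being consecutive enum
// values, so (ARM::R4 - Reg) counts the argument registers still available;
// 'Waste' skips registers to satisfy the byval alignment.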
2903 for (unsigned i = 0; i < Waste; ++i)
2904 Reg = State->AllocateReg(GPRArgRegs);
2905
2906 if (!Reg)
2907 return;
2908
2909 unsigned Excess = 4 * (ARM::R4 - Reg);
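// Excess is the number of bytes of the byval argument that can still be
// passed in the remaining argument registers.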
2910
2911 // Special case when NSAA != SP and the parameter size is greater than the
2912 // size of all remaining GPR regs. In that case we can't split the parameter,
2913 // we must send it to the stack. We also must set NCRN to R4, so we waste all
2914 // remaining registers.
2915 const unsigned NSAAOffset = State->getStackSize();
2916 if (NSAAOffset != 0 && Size > Excess) {
2917 while (State->AllocateReg(GPRArgRegs))
2918 ;
2919 return;
2920 }
2921
2922 // The first register for the byval parameter is the first register that
2923 // wasn't allocated before this method call, so it would be "reg".
2924 // If the parameter is small enough to be saved in the range [reg, r4), then
2925 // the end (first after last) register would be reg + param-size-in-regs;
2926 // otherwise the parameter is split between registers and stack, and the
2927 // end register would be r4 in this case.
2928 unsigned ByValRegBegin = Reg;
2929 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2930 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2931 // Note, the first register was already allocated at the beginning of this
2932 // function; allocate the remaining number of registers we need.
2933 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2934 State->AllocateReg(GPRArgRegs);
2935 // A byval parameter that is split between registers and memory needs its
2936 // size truncated here.
2937 // In the case where the entire structure fits in registers, we set the
2938 // size in memory to zero.
2939 Size = std::max<int>(Size - Excess, 0);
2940}
2941
2942/// MatchingStackOffset - Return true if the given stack call argument is
2943/// already available in the same position (relatively) of the caller's
2944/// incoming argument stack.
2945static
2946 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2947 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2948 const TargetInstrInfo *TII) {
2949 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2950 int FI = std::numeric_limits<int>::max();
2951 if (Arg.getOpcode() == ISD::CopyFromReg) {
2952 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2953 if (!VR.isVirtual())
2954 return false;
2955 MachineInstr *Def = MRI->getVRegDef(VR);
2956 if (!Def)
2957 return false;
2958 if (!Flags.isByVal()) {
2959 if (!TII->isLoadFromStackSlot(*Def, FI))
2960 return false;
2961 } else {
2962 return false;
2963 }
2964 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2965 if (Flags.isByVal())
2966 // ByVal argument is passed in as a pointer but it's now being
2967 // dereferenced. e.g.
2968 // define @foo(%struct.X* %A) {
2969 // tail call @bar(%struct.X* byval %A)
2970 // }
2971 return false;
2972 SDValue Ptr = Ld->getBasePtr();
2973 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2974 if (!FINode)
2975 return false;
2976 FI = FINode->getIndex();
2977 } else
2978 return false;
2979
2980 assert(FI != std::numeric_limits<int>::max());
2981 if (!MFI.isFixedObjectIndex(FI))
2982 return false;
2983 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2984}
2985
2986/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2987/// for tail call optimization. Targets which want to do tail call
2988/// optimization should implement this function. Note that this function also
2989/// processes musttail calls, so when this function returns false on a valid
2990/// musttail call, a fatal backend error occurs.
2991bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2992 TargetLowering::CallLoweringInfo &CLI, CCState &CCInfo,
2993 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2994 CallingConv::ID CalleeCC = CLI.CallConv;
2995 SDValue Callee = CLI.Callee;
2996 bool isVarArg = CLI.IsVarArg;
2997 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2998 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2999 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3000 const SelectionDAG &DAG = CLI.DAG;
3001 MachineFunction &MF = DAG.getMachineFunction();
3002 const Function &CallerF = MF.getFunction();
3003 CallingConv::ID CallerCC = CallerF.getCallingConv();
3004
3005 assert(Subtarget->supportsTailCall());
3006
3007 // Indirect tail calls cannot be optimized for Thumb1 if the args
3008 // to the call take up r0-r3. The reason is that there are no legal registers
3009 // left to hold the pointer to the function to be called.
3010 // Similarly, if the function uses return address sign and authentication,
3011 // r12 is needed to hold the PAC and is not available to hold the callee
3012 // address.
3013 if (Outs.size() >= 4 &&
3014 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
3015 if (Subtarget->isThumb1Only())
3016 return false;
3017 // Conservatively assume the function spills LR.
3018 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
3019 return false;
3020 }
3021
3022 // Look for obvious safe cases to perform tail call optimization that do not
3023 // require ABI changes. This is what gcc calls sibcall.
3024
3025 // Exception-handling functions need a special set of instructions to indicate
3026 // a return to the hardware. Tail-calling another function would probably
3027 // break this.
3028 if (CallerF.hasFnAttribute("interrupt"))
3029 return false;
3030
3031 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3032 return CalleeCC == CallerCC;
3033
3034 // Also avoid sibcall optimization if either caller or callee uses struct
3035 // return semantics.
3036 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
3037 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
3038 if (isCalleeStructRet || isCallerStructRet)
3039 return false;
3040
3041 // Externally-defined functions with weak linkage should not be
3042 // tail-called on ARM when the OS does not support dynamic
3043 // pre-emption of symbols, as the AAELF spec requires normal calls
3044 // to undefined weak functions to be replaced with a NOP or jump to the
3045 // next instruction. The behaviour of branch instructions in this
3046 // situation (as used for tail calls) is implementation-defined, so we
3047 // cannot rely on the linker replacing the tail call with a return.
3048 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3049 const GlobalValue *GV = G->getGlobal();
3050 const Triple &TT = getTargetMachine().getTargetTriple();
3051 if (GV->hasExternalWeakLinkage() &&
3052 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3053 return false;
3054 }
3055
3056 // Check that the call results are passed in the same way.
3057 LLVMContext &C = *DAG.getContext();
3058 if (!CCState::resultsCompatible(
3059 getEffectiveCallingConv(CalleeCC, isVarArg),
3060 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3061 CCAssignFnForReturn(CalleeCC, isVarArg),
3062 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3063 return false;
3064 // The callee has to preserve all registers the caller needs to preserve.
3065 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3066 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3067 if (CalleeCC != CallerCC) {
3068 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3069 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3070 return false;
3071 }
3072
3073 // If Caller's vararg or byval argument has been split between registers and
3074 // stack, do not perform tail call, since part of the argument is in caller's
3075 // local frame.
3076 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3077 if (AFI_Caller->getArgRegsSaveSize())
3078 return false;
3079
3080 // If the callee takes no arguments then go on to check the results of the
3081 // call.
3082 if (!Outs.empty()) {
3083 if (CCInfo.getStackSize()) {
3084 // Check if the arguments are already laid out in the right way as
3085 // the caller's fixed stack objects.
3086 MachineFrameInfo &MFI = MF.getFrameInfo();
3087 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3088 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3089 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3090 i != e;
3091 ++i, ++realArgIdx) {
3092 CCValAssign &VA = ArgLocs[i];
3093 EVT RegVT = VA.getLocVT();
3094 SDValue Arg = OutVals[realArgIdx];
3095 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3096 if (VA.getLocInfo() == CCValAssign::Indirect)
3097 return false;
3098 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3099 // f64 and vector types are split into multiple registers or
3100 // register/stack-slot combinations. The types will not match
3101 // the registers; give up on memory f64 refs until we figure
3102 // out what to do about this.
3103 if (!VA.isRegLoc())
3104 return false;
3105 if (!ArgLocs[++i].isRegLoc())
3106 return false;
3107 if (RegVT == MVT::v2f64) {
3108 if (!ArgLocs[++i].isRegLoc())
3109 return false;
3110 if (!ArgLocs[++i].isRegLoc())
3111 return false;
3112 }
3113 } else if (!VA.isRegLoc()) {
3114 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3115 MFI, MRI, TII))
3116 return false;
3117 }
3118 }
3119 }
3120
3121 const MachineRegisterInfo &MRI = MF.getRegInfo();
3122 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3123 return false;
3124 }
3125
3126 return true;
3127}
3128
3129bool
3130ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3131 MachineFunction &MF, bool isVarArg,
3132 const SmallVectorImpl<ISD::OutputArg> &Outs,
3133 LLVMContext &Context) const {
3134 SmallVector<CCValAssign, 16> RVLocs;
3135 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3136 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3137}
3138
3139 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3140 const SDLoc &DL, SelectionDAG &DAG) {
3141 const MachineFunction &MF = DAG.getMachineFunction();
3142 const Function &F = MF.getFunction();
3143
3144 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3145
3146 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3147 // version of the "preferred return address". These offsets affect the return
3148 // instruction if this is a return from PL1 without hypervisor extensions.
3149 // IRQ/FIQ: +4 "subs pc, lr, #4"
3150 // SWI: 0 "subs pc, lr, #0"
3151 // ABORT: +4 "subs pc, lr, #4"
3152 // UNDEF: +4/+2 "subs pc, lr, #0"
3153 // UNDEF varies depending on where the exception came from ARM or Thumb
3154 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3155
3156 int64_t LROffset;
3157 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3158 IntKind == "ABORT")
3159 LROffset = 4;
3160 else if (IntKind == "SWI" || IntKind == "UNDEF")
3161 LROffset = 0;
3162 else
3163 report_fatal_error("Unsupported interrupt attribute. If present, value "
3164 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3165
3166 RetOps.insert(RetOps.begin() + 1,
3167 DAG.getConstant(LROffset, DL, MVT::i32, false));
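// The LR offset is inserted as operand #1 of the return node and becomes the
// immediate of the "subs pc, lr, #N"-style exception return.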
3168
3169 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3170}
3171
3172SDValue
3173ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3174 bool isVarArg,
3176 const SmallVectorImpl<SDValue> &OutVals,
3177 const SDLoc &dl, SelectionDAG &DAG) const {
3178 // CCValAssign - represent the assignment of the return value to a location.
3179 SmallVector<CCValAssign, 16> RVLocs;
3180
3181 // CCState - Info about the registers and stack slots.
3182 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3183 *DAG.getContext());
3184
3185 // Analyze outgoing return values.
3186 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3187
3188 SDValue Glue;
3189 SmallVector<SDValue, 4> RetOps;
3190 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3191 bool isLittleEndian = Subtarget->isLittle();
3192
3193 MachineFunction &MF = DAG.getMachineFunction();
3194 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3195 AFI->setReturnRegsCount(RVLocs.size());
3196
3197 // Report error if cmse entry function returns structure through first ptr arg.
3198 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3199 // Note: using an empty SDLoc(), as the first line of the function is a
3200 // better place to report than the last line.
3203 "secure entry function would return value through pointer",
3204 SDLoc().getDebugLoc());
3205 DAG.getContext()->diagnose(Diag);
3206 }
3207
3208 // Copy the result values into the output registers.
3209 for (unsigned i = 0, realRVLocIdx = 0;
3210 i != RVLocs.size();
3211 ++i, ++realRVLocIdx) {
3212 CCValAssign &VA = RVLocs[i];
3213 assert(VA.isRegLoc() && "Can only return in registers!");
3214
3215 SDValue Arg = OutVals[realRVLocIdx];
3216 bool ReturnF16 = false;
3217
3218 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3219 // Half-precision return values can be returned like this:
3220 //
3221 // t11 f16 = fadd ...
3222 // t12: i16 = bitcast t11
3223 // t13: i32 = zero_extend t12
3224 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3225 //
3226 // to avoid code generation for bitcasts, we simply set Arg to the node
3227 // that produces the f16 value, t11 in this case.
3228 //
3229 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3230 SDValue ZE = Arg.getOperand(0);
3231 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3232 SDValue BC = ZE.getOperand(0);
3233 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3234 Arg = BC.getOperand(0);
3235 ReturnF16 = true;
3236 }
3237 }
3238 }
3239 }
3240
3241 switch (VA.getLocInfo()) {
3242 default: llvm_unreachable("Unknown loc info!");
3243 case CCValAssign::Full: break;
3244 case CCValAssign::BCvt:
3245 if (!ReturnF16)
3246 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3247 break;
3248 }
3249
3250 // Mask f16 arguments if this is a CMSE nonsecure entry.
3251 auto RetVT = Outs[realRVLocIdx].ArgVT;
3252 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3253 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3254 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3255 } else {
3256 auto LocBits = VA.getLocVT().getSizeInBits();
3257 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3258 SDValue Mask =
3259 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3260 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3261 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3262 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3263 }
3264 }
3265
3266 if (VA.needsCustom() &&
3267 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3268 if (VA.getLocVT() == MVT::v2f64) {
3269 // Extract the first half and return it in two registers.
3270 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3271 DAG.getConstant(0, dl, MVT::i32));
3272 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3273 DAG.getVTList(MVT::i32, MVT::i32), Half);
3274
3275 Chain =
3276 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3277 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3278 Glue = Chain.getValue(1);
3279 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3280 VA = RVLocs[++i]; // skip ahead to next loc
3281 Chain =
3282 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3283 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3284 Glue = Chain.getValue(1);
3285 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3286 VA = RVLocs[++i]; // skip ahead to next loc
3287
3288 // Extract the 2nd half and fall through to handle it as an f64 value.
3289 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3290 DAG.getConstant(1, dl, MVT::i32));
3291 }
3292 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3293 // available.
3294 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3295 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3296 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3297 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3298 Glue = Chain.getValue(1);
3299 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3300 VA = RVLocs[++i]; // skip ahead to next loc
3301 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3302 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3303 } else
3304 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3305
3306 // Guarantee that all emitted copies are
3307 // stuck together, avoiding something bad.
3308 Glue = Chain.getValue(1);
3309 RetOps.push_back(DAG.getRegister(
3310 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3311 }
3312 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3313 const MCPhysReg *I =
3314 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3315 if (I) {
3316 for (; *I; ++I) {
3317 if (ARM::GPRRegClass.contains(*I))
3318 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3319 else if (ARM::DPRRegClass.contains(*I))
3320 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3321 else
3322 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3323 }
3324 }
3325
3326 // Update chain and glue.
3327 RetOps[0] = Chain;
3328 if (Glue.getNode())
3329 RetOps.push_back(Glue);
3330
3331 // CPUs which aren't M-class use a special sequence to return from
3332 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3333 // though we use "subs pc, lr, #N").
3334 //
3335 // M-class CPUs actually use a normal return sequence with a special
3336 // (hardware-provided) value in LR, so the normal code path works.
3337 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3338 !Subtarget->isMClass()) {
3339 if (Subtarget->isThumb1Only())
3340 report_fatal_error("interrupt attribute is not supported in Thumb1");
3341 return LowerInterruptReturn(RetOps, dl, DAG);
3342 }
3343
3344 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ?
3345 ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3346 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3347}
3348
3349bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3350 if (N->getNumValues() != 1)
3351 return false;
3352 if (!N->hasNUsesOfValue(1, 0))
3353 return false;
3354
3355 SDValue TCChain = Chain;
3356 SDNode *Copy = *N->use_begin();
3357 if (Copy->getOpcode() == ISD::CopyToReg) {
3358 // If the copy has a glue operand, we conservatively assume it isn't safe to
3359 // perform a tail call.
3360 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3361 return false;
3362 TCChain = Copy->getOperand(0);
3363 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3364 SDNode *VMov = Copy;
3365 // f64 returned in a pair of GPRs.
3366 SmallPtrSet<SDNode*, 2> Copies;
3367 for (SDNode *U : VMov->uses()) {
3368 if (U->getOpcode() != ISD::CopyToReg)
3369 return false;
3370 Copies.insert(U);
3371 }
3372 if (Copies.size() > 2)
3373 return false;
3374
3375 for (SDNode *U : VMov->uses()) {
3376 SDValue UseChain = U->getOperand(0);
3377 if (Copies.count(UseChain.getNode()))
3378 // Second CopyToReg
3379 Copy = U;
3380 else {
3381 // We are at the top of this chain.
3382 // If the copy has a glue operand, we conservatively assume it
3383 // isn't safe to perform a tail call.
3384 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3385 return false;
3386 // First CopyToReg
3387 TCChain = UseChain;
3388 }
3389 }
3390 } else if (Copy->getOpcode() == ISD::BITCAST) {
3391 // f32 returned in a single GPR.
3392 if (!Copy->hasOneUse())
3393 return false;
3394 Copy = *Copy->use_begin();
3395 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3396 return false;
3397 // If the copy has a glue operand, we conservatively assume it isn't safe to
3398 // perform a tail call.
3399 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3400 return false;
3401 TCChain = Copy->getOperand(0);
3402 } else {
3403 return false;
3404 }
3405
3406 bool HasRet = false;
3407 for (const SDNode *U : Copy->uses()) {
3408 if (U->getOpcode() != ARMISD::RET_GLUE &&
3409 U->getOpcode() != ARMISD::INTRET_GLUE)
3410 return false;
3411 HasRet = true;
3412 }
3413
3414 if (!HasRet)
3415 return false;
3416
3417 Chain = TCChain;
3418 return true;
3419}
3420
3421bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3422 if (!Subtarget->supportsTailCall())
3423 return false;
3424
3425 if (!CI->isTailCall())
3426 return false;
3427
3428 return true;
3429}
3430
3431 // Writing a 64-bit value, so we need to split it into two 32-bit values
3432 // first and pass the low and high parts through.
3433 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
3434 SDLoc DL(Op);
3435 SDValue WriteValue = Op->getOperand(2);
3436
3437 // This function is only supposed to be called for i64 type argument.
3438 assert(WriteValue.getValueType() == MVT::i64
3439 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3440
3441 SDValue Lo, Hi;
3442 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
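// Rebuild the WRITE_REGISTER node with the low and high i32 halves as
// separate operands.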
3443 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3444 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3445}
3446
3447// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3448// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3449// one of the above mentioned nodes. It has to be wrapped because otherwise
3450// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3451// be used to form addressing mode. These wrapped nodes will be selected
3452// into MOVi.
3453SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3454 SelectionDAG &DAG) const {
3455 EVT PtrVT = Op.getValueType();
3456 // FIXME there is no actual debug info here
3457 SDLoc dl(Op);
3458 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3459 SDValue Res;
3460
3461 // When generating execute-only code Constant Pools must be promoted to the
3462 // global data section. It's a bit ugly that we can't share them across basic
3463 // blocks, but this way we guarantee that execute-only behaves correctly with
3464 // position-independent addressing modes.
3465 if (Subtarget->genExecuteOnly()) {
3466 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3467 auto T = const_cast<Type*>(CP->getType());
3468 auto C = const_cast<Constant*>(CP->getConstVal());
3469 auto M = const_cast<Module*>(DAG.getMachineFunction().
3470 getFunction().getParent());
3471 auto GV = new GlobalVariable(
3472 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3473 Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
3474 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3475 Twine(AFI->createPICLabelUId())
3476 );
3477 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3478 dl, PtrVT);
3479 return LowerGlobalAddress(GA, DAG);
3480 }
3481
3482 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3483 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3484 Align CPAlign = CP->getAlign();
3485 if (Subtarget->isThumb1Only())
3486 CPAlign = std::max(CPAlign, Align(4));
3487 if (CP->isMachineConstantPoolEntry())
3488 Res =
3489 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3490 else
3491 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3492 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3493}
3494
3495 unsigned ARMTargetLowering::getJumpTableEncoding() const {
3496 // If we don't have a 32-bit pc-relative branch instruction then the jump
3497 // table consists of block addresses. Usually this is inline, but for
3498 // execute-only it must be placed out-of-line.
3499 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3500 return MachineJumpTableInfo::EK_BlockAddress;
3501 return MachineJumpTableInfo::EK_Inline;
3502}
3503
3504SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3505 SelectionDAG &DAG) const {
3506 MachineFunction &MF = DAG.getMachineFunction();
3507 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3508 unsigned ARMPCLabelIndex = 0;
3509 SDLoc DL(Op);
3510 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3511 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3512 SDValue CPAddr;
3513 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3514 if (!IsPositionIndependent) {
3515 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3516 } else {
3517 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3518 ARMPCLabelIndex = AFI->createPICLabelUId();
3519 ARMConstantPoolValue *CPV =
3520 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3521 ARMCP::CPBlockAddress, PCAdj);
3522 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3523 }
3524 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3525 SDValue Result = DAG.getLoad(
3526 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3527 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
3528 if (!IsPositionIndependent)
3529 return Result;
3530 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3531 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3532}
3533
3534/// Convert a TLS address reference into the correct sequence of loads
3535/// and calls to compute the variable's address for Darwin, and return an
3536/// SDValue containing the final node.
3537
3538/// Darwin only has one TLS scheme which must be capable of dealing with the
3539/// fully general situation, in the worst case. This means:
3540/// + "extern __thread" declaration.
3541/// + Defined in a possibly unknown dynamic library.
3542///
3543/// The general system is that each __thread variable has a [3 x i32] descriptor
3544/// which contains information used by the runtime to calculate the address. The
3545/// only part of this the compiler needs to know about is the first word, which
3546/// contains a function pointer that must be called with the address of the
3547/// entire descriptor in "r0".
3548///
3549/// Since this descriptor may be in a different unit, in general access must
3550/// proceed along the usual ARM rules. A common sequence to produce is:
3551///
3552/// movw rT1, :lower16:_var$non_lazy_ptr
3553/// movt rT1, :upper16:_var$non_lazy_ptr
3554/// ldr r0, [rT1]
3555/// ldr rT2, [r0]
3556/// blx rT2
3557/// [...address now in r0...]
3558SDValue
3559ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3560 SelectionDAG &DAG) const {
3561 assert(Subtarget->isTargetDarwin() &&
3562 "This function expects a Darwin target");
3563 SDLoc DL(Op);
3564
3565 // First step is to get the address of the actual global symbol. This is where
3566 // the TLS descriptor lives.
3567 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3568
3569 // The first entry in the descriptor is a function pointer that we must call
3570 // to obtain the address of the variable.
3571 SDValue Chain = DAG.getEntryNode();
3572 SDValue FuncTLVGet = DAG.getLoad(
3573 MVT::i32, DL, Chain, DescAddr,
3574 MachinePointerInfo::getGOT(DAG.getMachineFunction()), Align(4),
3575 MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
3576 MachineMemOperand::MOInvariant);
3577 Chain = FuncTLVGet.getValue(1);
3578
3579 MachineFunction &F = DAG.getMachineFunction();
3580 MachineFrameInfo &MFI = F.getFrameInfo();
3581 MFI.setAdjustsStack(true);
3582
3583 // TLS calls preserve all registers except those that absolutely must be
3584 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3585 // silly).
3586 auto TRI =
3587 getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
3588 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3589 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3590
3591 // Finally, we can make the call. This is just a degenerate version of a
3592 // normal AArch64 call node: r0 takes the address of the descriptor, and
3593 // returns the address of the variable in this thread.
3594 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3595 Chain =
3596 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3597 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3598 DAG.getRegisterMask(Mask), Chain.getValue(1));
3599 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3600}
3601
3602SDValue
3603ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3604 SelectionDAG &DAG) const {
3605 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3606
3607 SDValue Chain = DAG.getEntryNode();
3608 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3609 SDLoc DL(Op);
3610
3611 // Load the current TEB (thread environment block)
3612 SDValue Ops[] = {Chain,
3613 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3614 DAG.getTargetConstant(15, DL, MVT::i32),
3615 DAG.getTargetConstant(0, DL, MVT::i32),
3616 DAG.getTargetConstant(13, DL, MVT::i32),
3617 DAG.getTargetConstant(0, DL, MVT::i32),
3618 DAG.getTargetConstant(2, DL, MVT::i32)};
3619 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3620 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3621
3622 SDValue TEB = CurrentTEB.getValue(0);
3623 Chain = CurrentTEB.getValue(1);
3624
3625 // Load the ThreadLocalStoragePointer from the TEB
3626 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3627 SDValue TLSArray =
3628 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3629 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3630
3631 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3632 // offset into the TLSArray.
3633
3634 // Load the TLS index from the C runtime
3635 SDValue TLSIndex =
3636 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3637 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3638 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3639