ARMISelLowering.cpp (LLVM 17.0.0git)
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
65#include "llvm/IR/Attributes.h"
66#include "llvm/IR/CallingConv.h"
67#include "llvm/IR/Constant.h"
68#include "llvm/IR/Constants.h"
69#include "llvm/IR/DataLayout.h"
70#include "llvm/IR/DebugLoc.h"
72#include "llvm/IR/Function.h"
73#include "llvm/IR/GlobalAlias.h"
74#include "llvm/IR/GlobalValue.h"
76#include "llvm/IR/IRBuilder.h"
77#include "llvm/IR/InlineAsm.h"
78#include "llvm/IR/Instruction.h"
81#include "llvm/IR/Intrinsics.h"
82#include "llvm/IR/IntrinsicsARM.h"
83#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
91#include "llvm/MC/MCSchedule.h"
98#include "llvm/Support/Debug.h"
107#include <algorithm>
108#include <cassert>
109#include <cstdint>
110#include <cstdlib>
111#include <iterator>
112#include <limits>
113#include <optional>
114#include <string>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
153
154// The APCS parameter registers.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
158
159void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
160 if (VT != PromotedLdStVT) {
162 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
163
165 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
166 }
167
168 MVT ElemTy = VT.getVectorElementType();
169 if (ElemTy != MVT::f64)
173 if (ElemTy == MVT::i32) {
178 } else {
183 }
192 if (VT.isInteger()) {
196 }
197
198 // Neon does not support vector divide/remainder operations.
207
208 if (!VT.isFloatingPoint() &&
209 VT != MVT::v2i64 && VT != MVT::v1i64)
210 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
211 setOperationAction(Opcode, VT, Legal);
212 if (!VT.isFloatingPoint())
213 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
214 setOperationAction(Opcode, VT, Legal);
215}
216
217void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
218 addRegisterClass(VT, &ARM::DPRRegClass);
219 addTypeForNEON(VT, MVT::f64);
220}
221
222void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
223 addRegisterClass(VT, &ARM::DPairRegClass);
224 addTypeForNEON(VT, MVT::v2f64);
225}
226
227void ARMTargetLowering::setAllExpand(MVT VT) {
228 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
229 setOperationAction(Opc, VT, Expand);
230
231 // We support these really simple operations even on types where all
232 // the actual arithmetic has to be broken down into simpler
233 // operations or turned into library calls.
238}
239
240void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
241 LegalizeAction Action) {
242 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
243 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
244 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
245}
246
247void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
248 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
249
250 for (auto VT : IntTypes) {
251 addRegisterClass(VT, &ARM::MQPRRegClass);
281
282 // No native support for these.
292
293 // Vector reductions
303
304 if (!HasMVEFP) {
309 } else {
312 }
313
314 // Pre and Post inc are supported on loads and stores
315 for (unsigned im = (unsigned)ISD::PRE_INC;
321 }
322 }
323
324 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
325 for (auto VT : FloatTypes) {
326 addRegisterClass(VT, &ARM::MQPRRegClass);
327 if (!HasMVEFP)
328 setAllExpand(VT);
329
330 // These are legal or custom whether we have MVE.fp or not
343
344 // Pre and Post inc are supported on loads and stores
345 for (unsigned im = (unsigned)ISD::PRE_INC;
351 }
352
353 if (HasMVEFP) {
361
362 // No native support for these.
375 }
376 }
377
378 // Custom-expand smaller-than-legal vector reductions to prevent false zero
379 // items from being added.
388
389 // We 'support' these types up to bitcast/load/store level, regardless of
390 // MVE integer-only / float support. Only FP data processing on the FP
391 // vector types is inhibited at the integer-only level.
392 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
393 for (auto VT : LongTypes) {
394 addRegisterClass(VT, &ARM::MQPRRegClass);
395 setAllExpand(VT);
401 }
403
404 // We can do bitwise operations on v2i64 vectors
408
409 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
410 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
411 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
412 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
413
414 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
420
421 // Some truncating stores are legal too.
425
426 // Pre and Post inc on these are legal, given the correct extends
427 for (unsigned im = (unsigned)ISD::PRE_INC;
429 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
434 }
435 }
436
437 // Predicate types
438 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
439 for (auto VT : pTypes) {
440 addRegisterClass(VT, &ARM::VCCRRegClass);
455
456 if (!HasMVEFP) {
461 }
462 }
472
481}
482
484 const ARMSubtarget &STI)
485 : TargetLowering(TM), Subtarget(&STI) {
486 RegInfo = Subtarget->getRegisterInfo();
487 Itins = Subtarget->getInstrItineraryData();
488
491
492 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
493 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
494 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
495 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
496 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
497 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
499 }
500
501 if (Subtarget->isTargetMachO()) {
502 // Uses VFP for Thumb libfuncs if available.
503 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
504 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
505 static const struct {
506 const RTLIB::Libcall Op;
507 const char * const Name;
508 const ISD::CondCode Cond;
509 } LibraryCalls[] = {
510 // Single-precision floating-point arithmetic.
511 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
512 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
513 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
514 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
515
516 // Double-precision floating-point arithmetic.
517 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
518 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
519 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
520 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
521
522 // Single-precision comparisons.
523 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
524 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
525 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
526 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
527 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
528 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
529 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
530
531 // Double-precision comparisons.
532 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
533 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
534 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
535 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
536 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
537 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
538 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
539
540 // Floating-point to integer conversions.
541 // i64 conversions are done via library routines even when generating VFP
542 // instructions, so use the same ones.
543 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
544 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
545 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
546 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
547
548 // Conversions between floating types.
549 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
550 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
551
552 // Integer to floating-point conversions.
553 // i64 conversions are done via library routines even when generating VFP
554 // instructions, so use the same ones.
555 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
556 // e.g., __floatunsidf vs. __floatunssidfvfp.
557 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
558 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
559 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
560 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
561 };
562
563 for (const auto &LC : LibraryCalls) {
564 setLibcallName(LC.Op, LC.Name);
565 if (LC.Cond != ISD::SETCC_INVALID)
566 setCmpLibcallCC(LC.Op, LC.Cond);
567 }
568 }
569 }
570
571 // These libcalls are not available in 32-bit.
572 setLibcallName(RTLIB::SHL_I128, nullptr);
573 setLibcallName(RTLIB::SRL_I128, nullptr);
574 setLibcallName(RTLIB::SRA_I128, nullptr);
575 setLibcallName(RTLIB::MUL_I128, nullptr);
576 setLibcallName(RTLIB::MULO_I64, nullptr);
577 setLibcallName(RTLIB::MULO_I128, nullptr);
578
579 // RTLIB
580 if (Subtarget->isAAPCS_ABI() &&
581 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
582 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
583 static const struct {
584 const RTLIB::Libcall Op;
585 const char * const Name;
586 const CallingConv::ID CC;
587 const ISD::CondCode Cond;
588 } LibraryCalls[] = {
589 // Double-precision floating-point arithmetic helper functions
590 // RTABI chapter 4.1.2, Table 2
591 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
592 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
593 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
594 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595
596 // Double-precision floating-point comparison helper functions
597 // RTABI chapter 4.1.2, Table 3
598 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
599 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
600 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
601 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
602 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
603 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
604 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
605
606 // Single-precision floating-point arithmetic helper functions
607 // RTABI chapter 4.1.2, Table 4
608 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
609 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
610 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
611 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612
613 // Single-precision floating-point comparison helper functions
614 // RTABI chapter 4.1.2, Table 5
615 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
616 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
617 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
618 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
619 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
620 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
621 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
622
623 // Floating-point to integer conversions.
624 // RTABI chapter 4.1.2, Table 6
625 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
626 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
627 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
628 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
629 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633
634 // Conversions between floating types.
635 // RTABI chapter 4.1.2, Table 7
636 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
637 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
638 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
639
640 // Integer to floating-point conversions.
641 // RTABI chapter 4.1.2, Table 8
642 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
643 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
644 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
645 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
646 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
647 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
648 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
649 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650
651 // Long long helper functions
652 // RTABI chapter 4.2, Table 9
653 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
654 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
655 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657
658 // Integer division functions
659 // RTABI chapter 4.3.1
660 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
661 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
662 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
663 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
664 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
665 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
666 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 };
669
670 for (const auto &LC : LibraryCalls) {
671 setLibcallName(LC.Op, LC.Name);
672 setLibcallCallingConv(LC.Op, LC.CC);
673 if (LC.Cond != ISD::SETCC_INVALID)
674 setCmpLibcallCC(LC.Op, LC.Cond);
675 }
676
677 // EABI dependent RTLIB
678 if (TM.Options.EABIVersion == EABI::EABI4 ||
679 TM.Options.EABIVersion == EABI::EABI5) {
680 static const struct {
681 const RTLIB::Libcall Op;
682 const char *const Name;
683 const CallingConv::ID CC;
684 const ISD::CondCode Cond;
685 } MemOpsLibraryCalls[] = {
686 // Memory operations
687 // RTABI chapter 4.3.4
688 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
689 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
690 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
691 };
692
693 for (const auto &LC : MemOpsLibraryCalls) {
694 setLibcallName(LC.Op, LC.Name);
695 setLibcallCallingConv(LC.Op, LC.CC);
696 if (LC.Cond != ISD::SETCC_INVALID)
697 setCmpLibcallCC(LC.Op, LC.Cond);
698 }
699 }
700 }
701
702 if (Subtarget->isTargetWindows()) {
703 static const struct {
704 const RTLIB::Libcall Op;
705 const char * const Name;
706 const CallingConv::ID CC;
707 } LibraryCalls[] = {
708 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
709 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
710 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
711 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
712 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
713 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
714 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
715 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
716 };
717
718 for (const auto &LC : LibraryCalls) {
719 setLibcallName(LC.Op, LC.Name);
720 setLibcallCallingConv(LC.Op, LC.CC);
721 }
722 }
723
724 // Use divmod compiler-rt calls for iOS 5.0 and later.
725 if (Subtarget->isTargetMachO() &&
726 !(Subtarget->isTargetIOS() &&
727 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
728 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
729 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
730 }
731
732 // The half <-> float conversion functions are always soft-float on
733 // non-watchos platforms, but are needed for some targets which use a
734 // hard-float calling convention by default.
735 if (!Subtarget->isTargetWatchABI()) {
736 if (Subtarget->isAAPCS_ABI()) {
737 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
738 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
739 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
740 } else {
741 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
742 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
743 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
744 }
745 }
746
747 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
748 // a __gnu_ prefix (which is the default).
749 if (Subtarget->isTargetAEABI()) {
750 static const struct {
751 const RTLIB::Libcall Op;
752 const char * const Name;
753 const CallingConv::ID CC;
754 } LibraryCalls[] = {
755 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
756 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
757 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
758 };
759
760 for (const auto &LC : LibraryCalls) {
761 setLibcallName(LC.Op, LC.Name);
762 setLibcallCallingConv(LC.Op, LC.CC);
763 }
764 }
765
766 if (Subtarget->isThumb1Only())
767 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
768 else
769 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
770
771 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
772 Subtarget->hasFPRegs()) {
773 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
774 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
775
780
781 if (!Subtarget->hasVFP2Base())
782 setAllExpand(MVT::f32);
783 if (!Subtarget->hasFP64())
784 setAllExpand(MVT::f64);
785 }
786
787 if (Subtarget->hasFullFP16()) {
788 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
791
794 }
795
796 if (Subtarget->hasBF16()) {
797 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
798 setAllExpand(MVT::bf16);
799 if (!Subtarget->hasFullFP16())
801 }
802
804 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
805 setTruncStoreAction(VT, InnerVT, Expand);
806 addAllExtLoads(VT, InnerVT, Expand);
807 }
808
811
813 }
814
817
820
821 if (Subtarget->hasMVEIntegerOps())
822 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
823
824 // Combine low-overhead loop intrinsics so that we can lower i1 types.
825 if (Subtarget->hasLOB()) {
827 }
828
829 if (Subtarget->hasNEON()) {
830 addDRTypeForNEON(MVT::v2f32);
831 addDRTypeForNEON(MVT::v8i8);
832 addDRTypeForNEON(MVT::v4i16);
833 addDRTypeForNEON(MVT::v2i32);
834 addDRTypeForNEON(MVT::v1i64);
835
836 addQRTypeForNEON(MVT::v4f32);
837 addQRTypeForNEON(MVT::v2f64);
838 addQRTypeForNEON(MVT::v16i8);
839 addQRTypeForNEON(MVT::v8i16);
840 addQRTypeForNEON(MVT::v4i32);
841 addQRTypeForNEON(MVT::v2i64);
842
843 if (Subtarget->hasFullFP16()) {
844 addQRTypeForNEON(MVT::v8f16);
845 addDRTypeForNEON(MVT::v4f16);
846 }
847
848 if (Subtarget->hasBF16()) {
849 addQRTypeForNEON(MVT::v8bf16);
850 addDRTypeForNEON(MVT::v4bf16);
851 }
852 }
853
854 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
855 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
856 // none of Neon, MVE or VFP supports any arithmetic operations on it.
860 // FIXME: Code duplication: FDIV and FREM are expanded always, see
861 // ARMTargetLowering::addTypeForNEON method for details.
864 // FIXME: Create unittest.
865 // In other words, find a case where "copysign" appears in the DAG with
866 // vector operands.
868 // FIXME: Code duplication: SETCC has custom operation action, see
869 // ARMTargetLowering::addTypeForNEON method for details.
871 // FIXME: Create unittest for FNEG and for FABS.
883 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
890 }
891
892 if (Subtarget->hasNEON()) {
893 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
894 // supported for v4f32.
909
910 // Mark v2f32 intrinsics.
925
926 // Neon does not support some operations on v1i64 and v2i64 types.
928 // Custom handling for some quad-vector types to detect VMULL.
932 // Custom handling for some vector types to avoid expensive expansions
937 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
938 // a destination type that is wider than the source, nor does it have
939 // an FP_TO_[SU]INT instruction with a narrower destination than the
940 // source.
949
952
953 // NEON does not have single instruction CTPOP for vectors with element
954 // types wider than 8-bits. However, custom lowering can leverage the
955 // v8i8/v16i8 vcnt instruction.
962
965
966 // NEON does not have single instruction CTTZ for vectors.
971
976
981
986
990 }
991
992 // NEON only has FMA instructions as of VFP4.
993 if (!Subtarget->hasVFP4Base()) {
996 }
997
1000
1001 // It is legal to extload from v4i8 to v4i16 or v4i32.
1003 MVT::v2i32}) {
1008 }
1009 }
1010 }
1011
1012 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1019 }
1020 if (Subtarget->hasMVEIntegerOps()) {
1023 ISD::SETCC});
1024 }
1025 if (Subtarget->hasMVEFloatOps()) {
1027 }
1028
1029 if (!Subtarget->hasFP64()) {
1030 // When targeting a floating-point unit with only single-precision
1031 // operations, f64 is legal for the few double-precision instructions which
1032 // are present. However, no double-precision operations other than moves,
1033 // loads and stores are provided by the hardware.
1070 }
1071
1072 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1075 if (Subtarget->hasFullFP16()) {
1078 }
1079 }
1080
1081 if (!Subtarget->hasFP16()) {
1084 }
1085
1087
1088 // ARM does not have floating-point extending loads.
1089 for (MVT VT : MVT::fp_valuetypes()) {
1092 }
1093
1094 // ... or truncating stores
1098
1099 // ARM does not have i1 sign extending load.
1100 for (MVT VT : MVT::integer_valuetypes())
1102
1103 // ARM supports all 4 flavors of integer indexed load / store.
1104 if (!Subtarget->isThumb1Only()) {
1105 for (unsigned im = (unsigned)ISD::PRE_INC;
1115 }
1116 } else {
1117 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1120 }
1121
1126
1129 if (Subtarget->hasDSP()) {
1138 }
1139 if (Subtarget->hasBaseDSP()) {
1142 }
1143
1144 // i64 operation support.
1147 if (Subtarget->isThumb1Only()) {
1150 }
1151 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1152 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1154
1164
1165 // MVE lowers 64-bit shifts to lsll and lsrl,
1166 // assuming that ISD::SRL and SRA of i64 are already marked custom
1167 if (Subtarget->hasMVEIntegerOps())
1169
1170 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1171 if (Subtarget->isThumb1Only()) {
1175 }
1176
1177 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1179
1180 // ARM does not have ROTL.
1185 }
1188 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1191 }
1192
1193 // @llvm.readcyclecounter requires the Performance Monitors extension.
1194 // Default to the 0 expansion on unsupported platforms.
1195 // FIXME: Technically there are older ARM CPUs that have
1196 // implementation-specific ways of obtaining this information.
1197 if (Subtarget->hasPerfMon())
1199
1200 // Only ARMv6 has BSWAP.
1201 if (!Subtarget->hasV6Ops())
1203
1204 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1205 : Subtarget->hasDivideInARMMode();
1206 if (!hasDivide) {
1207 // These are expanded into libcalls if the cpu doesn't have HW divider.
1210 }
1211
1212 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1215
1218 }
1219
1222
1223 // Register based DivRem for AEABI (RTABI 4.2)
1224 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1225 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1226 Subtarget->isTargetWindows()) {
1229 HasStandaloneRem = false;
1230
1231 if (Subtarget->isTargetWindows()) {
1232 const struct {
1233 const RTLIB::Libcall Op;
1234 const char * const Name;
1235 const CallingConv::ID CC;
1236 } LibraryCalls[] = {
1237 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1238 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1239 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1240 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1241
1242 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1243 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1244 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1245 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1246 };
1247
1248 for (const auto &LC : LibraryCalls) {
1249 setLibcallName(LC.Op, LC.Name);
1250 setLibcallCallingConv(LC.Op, LC.CC);
1251 }
1252 } else {
1253 const struct {
1254 const RTLIB::Libcall Op;
1255 const char * const Name;
1256 const CallingConv::ID CC;
1257 } LibraryCalls[] = {
1258 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1259 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1260 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1261 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1262
1263 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1264 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1265 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1266 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1267 };
1268
1269 for (const auto &LC : LibraryCalls) {
1270 setLibcallName(LC.Op, LC.Name);
1271 setLibcallCallingConv(LC.Op, LC.CC);
1272 }
1273 }
1274
1279 } else {
1282 }
1283
1284 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1285 // MSVCRT doesn't have powi; fall back to pow
1286 setLibcallName(RTLIB::POWI_F32, nullptr);
1287 setLibcallName(RTLIB::POWI_F64, nullptr);
1288 }
1289
1294
1297
1298 // Use the default implementation.
1305
1306 if (Subtarget->isTargetWindows())
1308 else
1310
1311 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1312 // the default expansion.
1313 InsertFencesForAtomic = false;
1314 if (Subtarget->hasAnyDataBarrier() &&
1315 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1316 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1317 // to ldrex/strex loops already.
1319 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1321
1322 // On v8, we have particularly efficient implementations of atomic fences
1323 // if they can be combined with nearby atomic loads and stores.
1324 if (!Subtarget->hasAcquireRelease() ||
1325 getTargetMachine().getOptLevel() == 0) {
1326 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1327 InsertFencesForAtomic = true;
1328 }
1329 } else {
1330 // If there's anything we can use as a barrier, go through custom lowering
1331 // for ATOMIC_FENCE.
1332 // If target has DMB in thumb, Fences can be inserted.
1333 if (Subtarget->hasDataBarrier())
1334 InsertFencesForAtomic = true;
1335
1337 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1338
1339 // Set them all for expansion, which will force libcalls.
1352 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1353 // Unordered/Monotonic case.
1354 if (!InsertFencesForAtomic) {
1357 }
1358 }
1359
1360 // Compute supported atomic widths.
1361 if (Subtarget->isTargetLinux() ||
1362 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1363 // For targets where __sync_* routines are reliably available, we use them
1364 // if necessary.
1365 //
1366 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1367 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1368 //
1369 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1370 // such targets should provide __sync_* routines, which use the ARM mode
1371 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1372 // encoding; see ARMISD::MEMBARRIER_MCR.)
1374 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1375 Subtarget->hasForced32BitAtomics()) {
1376 // Cortex-M cores (other than Cortex-M0) have 32-bit atomics.
1378 } else {
1379 // We can't assume anything about other targets; just use libatomic
1380 // routines.
1382 }
1383
1385
1387
1388 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1389 if (!Subtarget->hasV6Ops()) {
1392 }
1394
1395 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1396 !Subtarget->isThumb1Only()) {
1397 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1398 // iff target supports vfp2.
1402 }
1403
1404 // We want to custom lower some of our intrinsics.
1409 if (Subtarget->useSjLjEH())
1410 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1411
1421 if (Subtarget->hasFullFP16()) {
1425 }
1426
1428
1431 if (Subtarget->hasFullFP16())
1436
1437 // We don't support sin/cos/fmod/copysign/pow
1446 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1447 !Subtarget->isThumb1Only()) {
1450 }
1453
1454 if (!Subtarget->hasVFP4Base()) {
1457 }
1458
1459 // Various VFP goodness
1460 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1461 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1462 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1465 }
1466
1467 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1468 if (!Subtarget->hasFP16()) {
1471 }
1472
1473 // Strict floating-point comparisons need custom lowering.
1480 }
1481
1482 // Use __sincos_stret if available.
1483 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1484 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1487 }
1488
1489 // FP-ARMv8 implements a lot of rounding-like FP operations.
1490 if (Subtarget->hasFPARMv8Base()) {
1499 if (Subtarget->hasNEON()) {
1504 }
1505
1506 if (Subtarget->hasFP64()) {
1515 }
1516 }
1517
1518 // FP16 operations often need to be promoted to call library functions
1519 if (Subtarget->hasFullFP16()) {
1532
1534 }
1535
1536 if (Subtarget->hasNEON()) {
1537 // vmin and vmax aren't available in a scalar form, so we can use
1538 // a NEON instruction with an undef lane instead. This has a performance
1539 // penalty on some cores, so we don't do this unless we have been
1540 // asked to by the core tuning model.
1541 if (Subtarget->useNEONForSinglePrecisionFP()) {
1546 }
1551
1552 if (Subtarget->hasFullFP16()) {
1557
1562 }
1563 }
1564
1565 // We have target-specific dag combine patterns for the following nodes:
1566 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1569
1570 if (Subtarget->hasMVEIntegerOps())
1572
1573 if (Subtarget->hasV6Ops())
1575 if (Subtarget->isThumb1Only())
1577 // Attempt to lower smin/smax to ssat/usat
1578 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1579 Subtarget->isThumb2()) {
1581 }
1582
1584
1585 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1586 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1588 else
1590
1591 //// temporary - rewrite interface to use type
1594 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1596 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1598
1599 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1600 // are at least 4 bytes aligned.
1602
1603 // Prefer likely predicted branches to selects on out-of-order cores.
1604 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1605
1606 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1607
1608 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1609
1610 if (Subtarget->isThumb() || Subtarget->isThumb2())
1612}
1613
1615 return Subtarget->useSoftFloat();
1616}
1617
1618// FIXME: It might make sense to define the representative register class as the
1619// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1620 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1621// SPR's representative would be DPR_VFP2. This should work well if register
1622// pressure tracking were modified such that a register use would increment the
1623 // pressure of the register class's representative and all of its super
1624// classes' representatives transitively. We have not implemented this because
1625// of the difficulty prior to coalescing of modeling operand register classes
1626// due to the common occurrence of cross class copies and subregister insertions
1627// and extractions.
1628std::pair<const TargetRegisterClass *, uint8_t>
1630 MVT VT) const {
1631 const TargetRegisterClass *RRC = nullptr;
1632 uint8_t Cost = 1;
1633 switch (VT.SimpleTy) {
1634 default:
1636 // Use DPR as representative register class for all floating point
1637 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1638 // the cost is 1 for both f32 and f64.
1639 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1640 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1641 RRC = &ARM::DPRRegClass;
1642 // When NEON is used for SP, only half of the register file is available
1643 // because operations that define both SP and DP results will be constrained
1644 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1645 // coalescing by double-counting the SP regs. See the FIXME above.
1646 if (Subtarget->useNEONForSinglePrecisionFP())
1647 Cost = 2;
1648 break;
1649 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1650 case MVT::v4f32: case MVT::v2f64:
1651 RRC = &ARM::DPRRegClass;
1652 Cost = 2;
1653 break;
1654 case MVT::v4i64:
1655 RRC = &ARM::DPRRegClass;
1656 Cost = 4;
1657 break;
1658 case MVT::v8i64:
1659 RRC = &ARM::DPRRegClass;
1660 Cost = 8;
1661 break;
1662 }
1663 return std::make_pair(RRC, Cost);
1664}
1665
1666const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1667#define MAKE_CASE(V) \
1668 case V: \
1669 return #V;
1670 switch ((ARMISD::NodeType)Opcode) {
1672 break;
1876#undef MAKE_CASE
1877 }
1878 return nullptr;
1879}
1880
1882 EVT VT) const {
1883 if (!VT.isVector())
1884 return getPointerTy(DL);
1885
1886 // MVE has a predicate register.
1887 if ((Subtarget->hasMVEIntegerOps() &&
1888 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1889 VT == MVT::v16i8)) ||
1890 (Subtarget->hasMVEFloatOps() &&
1891 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1894}
1895
1896/// getRegClassFor - Return the register class that should be used for the
1897/// specified value type.
1898const TargetRegisterClass *
1899ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1900 (void)isDivergent;
1901 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1902 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1903 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1904 // MVE Q registers.
1905 if (Subtarget->hasNEON()) {
1906 if (VT == MVT::v4i64)
1907 return &ARM::QQPRRegClass;
1908 if (VT == MVT::v8i64)
1909 return &ARM::QQQQPRRegClass;
1910 }
1911 if (Subtarget->hasMVEIntegerOps()) {
1912 if (VT == MVT::v4i64)
1913 return &ARM::MQQPRRegClass;
1914 if (VT == MVT::v8i64)
1915 return &ARM::MQQQQPRRegClass;
1916 }
1918}
1919
1920 // memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1921// source/dest is aligned and the copy size is large enough. We therefore want
1922// to align such objects passed to memory intrinsics.
1924 Align &PrefAlign) const {
1925 if (!isa<MemIntrinsic>(CI))
1926 return false;
1927 MinSize = 8;
1928 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1929 // cycle faster than 4-byte aligned LDM.
1930 PrefAlign =
1931 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1932 return true;
1933}
1934
1935// Create a fast isel object.
1936FastISel *
1938 const TargetLibraryInfo *libInfo) const {
1939 return ARM::createFastISel(funcInfo, libInfo);
1940}
1941
1943 unsigned NumVals = N->getNumValues();
1944 if (!NumVals)
1945 return Sched::RegPressure;
1946
1947 for (unsigned i = 0; i != NumVals; ++i) {
1948 EVT VT = N->getValueType(i);
1949 if (VT == MVT::Glue || VT == MVT::Other)
1950 continue;
1951 if (VT.isFloatingPoint() || VT.isVector())
1952 return Sched::ILP;
1953 }
1954
1955 if (!N->isMachineOpcode())
1956 return Sched::RegPressure;
1957
1958 // Loads are scheduled for latency even if the instruction itinerary
1959 // is not available.
1960 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1961 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1962
1963 if (MCID.getNumDefs() == 0)
1964 return Sched::RegPressure;
1965 if (!Itins->isEmpty() &&
1966 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1967 return Sched::ILP;
1968
1969 return Sched::RegPressure;
1970}
1971
1972//===----------------------------------------------------------------------===//
1973// Lowering Code
1974//===----------------------------------------------------------------------===//
1975
1976static bool isSRL16(const SDValue &Op) {
1977 if (Op.getOpcode() != ISD::SRL)
1978 return false;
1979 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1980 return Const->getZExtValue() == 16;
1981 return false;
1982}
1983
1984static bool isSRA16(const SDValue &Op) {
1985 if (Op.getOpcode() != ISD::SRA)
1986 return false;
1987 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1988 return Const->getZExtValue() == 16;
1989 return false;
1990}
1991
1992static bool isSHL16(const SDValue &Op) {
1993 if (Op.getOpcode() != ISD::SHL)
1994 return false;
1995 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1996 return Const->getZExtValue() == 16;
1997 return false;
1998}
1999
2000 // Check for a signed 16-bit value. We special-case SRA because it makes it
2001 // simpler when also looking for SRAs that aren't sign-extending a
2002 // smaller value. Without the check, we'd need to take extra care with
2003// checking order for some operations.
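// (In a 32-bit value, having at least 17 known sign bits means bits [31:15] are
// all copies of the sign bit, i.e. the value is a sign-extended 16-bit quantity;
// that is the property the ComputeNumSignBits check below relies on.)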
2004static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2005 if (isSRA16(Op))
2006 return isSHL16(Op.getOperand(0));
2007 return DAG.ComputeNumSignBits(Op) == 17;
2008}
2009
2010/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2012 switch (CC) {
2013 default: llvm_unreachable("Unknown condition code!");
2014 case ISD::SETNE: return ARMCC::NE;
2015 case ISD::SETEQ: return ARMCC::EQ;
2016 case ISD::SETGT: return ARMCC::GT;
2017 case ISD::SETGE: return ARMCC::GE;
2018 case ISD::SETLT: return ARMCC::LT;
2019 case ISD::SETLE: return ARMCC::LE;
2020 case ISD::SETUGT: return ARMCC::HI;
2021 case ISD::SETUGE: return ARMCC::HS;
2022 case ISD::SETULT: return ARMCC::LO;
2023 case ISD::SETULE: return ARMCC::LS;
2024 }
2025}
2026
2027/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2029 ARMCC::CondCodes &CondCode2) {
2030 CondCode2 = ARMCC::AL;
2031 switch (CC) {
2032 default: llvm_unreachable("Unknown FP condition!");
2033 case ISD::SETEQ:
2034 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2035 case ISD::SETGT:
2036 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2037 case ISD::SETGE:
2038 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2039 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2040 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2041 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2042 case ISD::SETO: CondCode = ARMCC::VC; break;
2043 case ISD::SETUO: CondCode = ARMCC::VS; break;
2044 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2045 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2046 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2047 case ISD::SETLT:
2048 case ISD::SETULT: CondCode = ARMCC::LT; break;
2049 case ISD::SETLE:
2050 case ISD::SETULE: CondCode = ARMCC::LE; break;
2051 case ISD::SETNE:
2052 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2053 }
2054}
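// A few ordered/unordered FP predicates have no single ARM condition code, so a
// second code is returned above: e.g. SETONE is tested as MI (less than) or GT
// (greater than), and SETUEQ as EQ or VS (unordered).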
2055
2056//===----------------------------------------------------------------------===//
2057// Calling Convention Implementation
2058//===----------------------------------------------------------------------===//
2059
2060/// getEffectiveCallingConv - Get the effective calling convention, taking into
2061/// account presence of floating point hardware and calling convention
2062/// limitations, such as support for variadic functions.
2064ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2065 bool isVarArg) const {
2066 switch (CC) {
2067 default:
2068 report_fatal_error("Unsupported calling convention");
2071 case CallingConv::GHC:
2073 return CC;
2077 case CallingConv::Swift:
2080 case CallingConv::C:
2081 case CallingConv::Tail:
2082 if (!Subtarget->isAAPCS_ABI())
2083 return CallingConv::ARM_APCS;
2084 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2085 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2086 !isVarArg)
2088 else
2090 case CallingConv::Fast:
2092 if (!Subtarget->isAAPCS_ABI()) {
2093 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2094 return CallingConv::Fast;
2095 return CallingConv::ARM_APCS;
2096 } else if (Subtarget->hasVFP2Base() &&
2097 !Subtarget->isThumb1Only() && !isVarArg)
2099 else
2101 }
2102}
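// Note that even on a hard-float AAPCS target, a variadic call falls back to
// plain ARM_AAPCS above, because the VFP-register variant is not defined for
// variadic functions (FP arguments then travel in core registers / on the stack).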
2103
2105 bool isVarArg) const {
2106 return CCAssignFnForNode(CC, false, isVarArg);
2107}
2108
2110 bool isVarArg) const {
2111 return CCAssignFnForNode(CC, true, isVarArg);
2112}
2113
2114/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2115/// CallingConvention.
2116CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2117 bool Return,
2118 bool isVarArg) const {
2119 switch (getEffectiveCallingConv(CC, isVarArg)) {
2120 default:
2121 report_fatal_error("Unsupported calling convention");
2123 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2125 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2127 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2128 case CallingConv::Fast:
2129 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2130 case CallingConv::GHC:
2131 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2133 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2135 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2136 }
2137}
2138
2139SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2140 MVT LocVT, MVT ValVT, SDValue Val) const {
2141 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2142 Val);
2143 if (Subtarget->hasFullFP16()) {
2144 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2145 } else {
2146 Val = DAG.getNode(ISD::TRUNCATE, dl,
2147 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2148 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2149 }
2150 return Val;
2151}
2152
2153SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2154 MVT LocVT, MVT ValVT,
2155 SDValue Val) const {
2156 if (Subtarget->hasFullFP16()) {
2157 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2158 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2159 } else {
2160 Val = DAG.getNode(ISD::BITCAST, dl,
2161 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2162 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2163 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2164 }
2165 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2166}
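// MoveToHPR / MoveFromHPR convert between a half-precision value and the 32-bit
// location it is passed in: with +fullfp16 a single VMOVhr / VMOVrh move
// suffices, otherwise the value is shuffled through an integer truncate /
// zero-extend and bitcasts.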
2167
2168/// LowerCallResult - Lower the result values of a call into the
2169/// appropriate copies out of appropriate physical registers.
2170SDValue ARMTargetLowering::LowerCallResult(
2171 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2172 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2173 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2174 SDValue ThisVal) const {
2175 // Assign locations to each value returned by this call.
2177 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2178 *DAG.getContext());
2179 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2180
2181 // Copy all of the result registers out of their specified physreg.
2182 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2183 CCValAssign VA = RVLocs[i];
2184
2185 // Pass 'this' value directly from the argument to return value, to avoid
2186 // reg unit interference
2187 if (i == 0 && isThisReturn) {
2188 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2189 "unexpected return calling convention register assignment");
2190 InVals.push_back(ThisVal);
2191 continue;
2192 }
2193
2194 SDValue Val;
2195 if (VA.needsCustom() &&
2196 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2197 // Handle f64 or half of a v2f64.
2198 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2199 InFlag);
2200 Chain = Lo.getValue(1);
2201 InFlag = Lo.getValue(2);
2202 VA = RVLocs[++i]; // skip ahead to next loc
2203 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2204 InFlag);
2205 Chain = Hi.getValue(1);
2206 InFlag = Hi.getValue(2);
2207 if (!Subtarget->isLittle())
2208 std::swap (Lo, Hi);
2209 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2210
2211 if (VA.getLocVT() == MVT::v2f64) {
2212 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2213 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2214 DAG.getConstant(0, dl, MVT::i32));
2215
2216 VA = RVLocs[++i]; // skip ahead to next loc
2217 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2218 Chain = Lo.getValue(1);
2219 InFlag = Lo.getValue(2);
2220 VA = RVLocs[++i]; // skip ahead to next loc
2221 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
2222 Chain = Hi.getValue(1);
2223 InFlag = Hi.getValue(2);
2224 if (!Subtarget->isLittle())
2225 std::swap (Lo, Hi);
2226 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2227 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2228 DAG.getConstant(1, dl, MVT::i32));
2229 }
2230 } else {
2231 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2232 InFlag);
2233 Chain = Val.getValue(1);
2234 InFlag = Val.getValue(2);
2235 }
2236
2237 switch (VA.getLocInfo()) {
2238 default: llvm_unreachable("Unknown loc info!");
2239 case CCValAssign::Full: break;
2240 case CCValAssign::BCvt:
2241 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2242 break;
2243 }
2244
2245 // f16 arguments have their size extended to 4 bytes and passed as if they
2246 // had been copied to the LSBs of a 32-bit register.
2247 // For that, they're passed extended to i32 (soft ABI) or to f32 (hard ABI)
2248 if (VA.needsCustom() &&
2249 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2250 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2251
2252 InVals.push_back(Val);
2253 }
2254
2255 return Chain;
2256}
2257
2258std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2259 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2260 bool IsTailCall, int SPDiff) const {
2261 SDValue DstAddr;
2262 MachinePointerInfo DstInfo;
2263 int32_t Offset = VA.getLocMemOffset();
2265
2266 if (IsTailCall) {
2267 Offset += SPDiff;
2268 auto PtrVT = getPointerTy(DAG.getDataLayout());
2269 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2270 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2271 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2272 DstInfo =
2274 } else {
2275 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2276 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2277 StackPtr, PtrOff);
2278 DstInfo =
2280 }
2281
2282 return std::make_pair(DstAddr, DstInfo);
2283}
2284
2285void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2286 SDValue Chain, SDValue &Arg,
2287 RegsToPassVector &RegsToPass,
2288 CCValAssign &VA, CCValAssign &NextVA,
2289 SDValue &StackPtr,
2290 SmallVectorImpl<SDValue> &MemOpChains,
2291 bool IsTailCall,
2292 int SPDiff) const {
2293 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2295 unsigned id = Subtarget->isLittle() ? 0 : 1;
2296 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2297
2298 if (NextVA.isRegLoc())
2299 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2300 else {
2301 assert(NextVA.isMemLoc());
2302 if (!StackPtr.getNode())
2303 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2305
2306 SDValue DstAddr;
2307 MachinePointerInfo DstInfo;
2308 std::tie(DstAddr, DstInfo) =
2309 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2310 MemOpChains.push_back(
2311 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2312 }
2313}
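// PassF64ArgInRegs splits an f64 with VMOVRRD into two i32 halves: the first
// half goes in VA's register and the second in NextVA's register or stack slot
// (the halves are swapped on big-endian targets).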
2314
2315static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2316 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2318}
2319
2320/// LowerCall - Lowering a call into a callseq_start <-
2321 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2322/// nodes.
2323SDValue
2324ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2325 SmallVectorImpl<SDValue> &InVals) const {
2326 SelectionDAG &DAG = CLI.DAG;
2327 SDLoc &dl = CLI.DL;
2329 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2331 SDValue Chain = CLI.Chain;
2332 SDValue Callee = CLI.Callee;
2333 bool &isTailCall = CLI.IsTailCall;
2334 CallingConv::ID CallConv = CLI.CallConv;
2335 bool doesNotRet = CLI.DoesNotReturn;
2336 bool isVarArg = CLI.IsVarArg;
2337
2341 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2342 bool isThisReturn = false;
2343 bool isCmseNSCall = false;
2344 bool isSibCall = false;
2345 bool PreferIndirect = false;
2346 bool GuardWithBTI = false;
2347
2348 // Lower 'returns_twice' calls to a pseudo-instruction.
2349 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2350 !Subtarget->noBTIAtReturnTwice())
2351 GuardWithBTI = AFI->branchTargetEnforcement();
2352
2353 // Determine whether this is a non-secure function call.
2354 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2355 isCmseNSCall = true;
2356
2357 // Disable tail calls if they're not supported.
2358 if (!Subtarget->supportsTailCall())
2359 isTailCall = false;
2360
2361 // For both the non-secure calls and the returns from a CMSE entry function,
2362 // the function needs to do some extra work after the call, or before the
2363 // return, respectively; thus it cannot end with a tail call
2364 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2365 isTailCall = false;
2366
2367 if (isa<GlobalAddressSDNode>(Callee)) {
2368 // If we're optimizing for minimum size and the function is called three or
2369 // more times in this block, we can improve codesize by calling indirectly
2370 // as BLXr has a 16-bit encoding.
2371 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2372 if (CLI.CB) {
2373 auto *BB = CLI.CB->getParent();
2374 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2375 count_if(GV->users(), [&BB](const User *U) {
2376 return isa<Instruction>(U) &&
2377 cast<Instruction>(U)->getParent() == BB;
2378 }) > 2;
2379 }
2380 }
2381 if (isTailCall) {
2382 // Check if it's really possible to do a tail call.
2383 isTailCall = IsEligibleForTailCallOptimization(
2384 Callee, CallConv, isVarArg, isStructRet,
2385 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2386 PreferIndirect);
2387
2388 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2389 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2390 isSibCall = true;
2391
2392 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2393 // detected sibcalls.
2394 if (isTailCall)
2395 ++NumTailCalls;
2396 }
2397
2398 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2399 report_fatal_error("failed to perform tail call elimination on a call "
2400 "site marked musttail");
2401 // Analyze operands of the call, assigning locations to each operand.
2403 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2404 *DAG.getContext());
2405 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2406
2407 // Get a count of how many bytes are to be pushed on the stack.
2408 unsigned NumBytes = CCInfo.getNextStackOffset();
2409
2410 // SPDiff is the byte offset of the call's argument area from the callee's.
2411 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2412 // by this amount for a tail call. In a sibling call it must be 0 because the
2413 // caller will deallocate the entire stack and the callee still expects its
2414 // arguments to begin at SP+0. Completely unused for non-tail calls.
2415 int SPDiff = 0;
2416
2417 if (isTailCall && !isSibCall) {
2418 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2419 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2420
2421 // Since callee will pop argument stack as a tail call, we must keep the
2422 // popped size 16-byte aligned.
2424 NumBytes = alignTo(NumBytes, StackAlign);
2425
2426 // SPDiff will be negative if this tail call requires more space than we
2427 // would automatically have in our incoming argument space. Positive if we
2428 // can actually shrink the stack.
2429 SPDiff = NumReusableBytes - NumBytes;
2430
2431 // If this call requires more stack than we have available from
2432 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2433 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2434 AFI->setArgRegsSaveSize(-SPDiff);
2435 }
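// Editorial note (illustrative, not part of the original source): assuming a
// caller that was entered with 16 bytes of stack-passed arguments and a tail
// call whose outgoing arguments need 32 bytes, the bookkeeping above gives
//   NumReusableBytes = 16, NumBytes = alignTo(32, StackAlign) = 32,
//   SPDiff = 16 - 32 = -16,
// so ArgRegsSaveSize is raised to 16 and FrameLowering reserves the extra
// space the tail call needs beyond the incoming argument area.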
2436
2437 if (isSibCall) {
2438 // For sibling tail calls, memory operands are available in our caller's stack.
2439 NumBytes = 0;
2440 } else {
2441 // Adjust the stack pointer for the new arguments...
2442 // These operations are automatically eliminated by the prolog/epilog pass
2443 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2444 }
2445
2446 SDValue StackPtr =
2447 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2448
2449 RegsToPassVector RegsToPass;
2450 SmallVector<SDValue, 8> MemOpChains;
2451
2452 // During a tail call, stores to the argument area must happen after all of
2453 // the function's incoming arguments have been loaded because they may alias.
2454 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2455 // there's no point in doing so repeatedly so this tracks whether that's
2456 // happened yet.
2457 bool AfterFormalArgLoads = false;
2458
2459 // Walk the register/memloc assignments, inserting copies/loads. In the case
2460 // of tail call optimization, arguments are handled later.
2461 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2462 i != e;
2463 ++i, ++realArgIdx) {
2464 CCValAssign &VA = ArgLocs[i];
2465 SDValue Arg = OutVals[realArgIdx];
2466 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2467 bool isByVal = Flags.isByVal();
2468
2469 // Promote the value if needed.
2470 switch (VA.getLocInfo()) {
2471 default: llvm_unreachable("Unknown loc info!");
2472 case CCValAssign::Full: break;
2473 case CCValAssign::SExt:
2474 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2475 break;
2476 case CCValAssign::ZExt:
2477 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2478 break;
2479 case CCValAssign::AExt:
2480 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2481 break;
2482 case CCValAssign::BCvt:
2483 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2484 break;
2485 }
2486
2487 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2488 Chain = DAG.getStackArgumentTokenFactor(Chain);
2489 AfterFormalArgLoads = true;
2490 }
2491
2492 // f16 arguments have their size extended to 4 bytes and passed as if they
2493 // had been copied to the LSBs of a 32-bit register.
2494 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2495 if (VA.needsCustom() &&
2496 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2497 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2498 } else {
2499 // f16 arguments could have been extended prior to argument lowering.
2500 // Mask such arguments if this is a CMSE nonsecure call.
2501 auto ArgVT = Outs[realArgIdx].ArgVT;
2502 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2503 auto LocBits = VA.getLocVT().getSizeInBits();
2504 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2505 SDValue Mask =
2506 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2507 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2508 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2509 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2510 }
2511 }
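// Illustration (added for exposition, not from the original source) of the
// CMSE masking above for a soft-float f16 argument in a 32-bit location,
// i.e. LocBits == 32 and ArgVT.getSizeInBits() == 16, so MaskValue == 0xFFFF:
//   t1: i32 = bitcast Arg
//   t2: i32 = and t1, Constant:i32<0x0000FFFF>
//   Arg: f32 = bitcast t2
// This clears the upper half-word so no stale register bits are passed across
// the security boundary to the non-secure callee.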
2512
2513 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2514 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2515 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2516 DAG.getConstant(0, dl, MVT::i32));
2517 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2518 DAG.getConstant(1, dl, MVT::i32));
2519
2520 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2521 StackPtr, MemOpChains, isTailCall, SPDiff);
2522
2523 VA = ArgLocs[++i]; // skip ahead to next loc
2524 if (VA.isRegLoc()) {
2525 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2526 StackPtr, MemOpChains, isTailCall, SPDiff);
2527 } else {
2528 assert(VA.isMemLoc());
2529 SDValue DstAddr;
2530 MachinePointerInfo DstInfo;
2531 std::tie(DstAddr, DstInfo) =
2532 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2533 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2534 }
2535 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2536 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2537 StackPtr, MemOpChains, isTailCall, SPDiff);
2538 } else if (VA.isRegLoc()) {
2539 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2540 Outs[0].VT == MVT::i32) {
2541 assert(VA.getLocVT() == MVT::i32 &&
2542 "unexpected calling convention register assignment");
2543 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2544 "unexpected use of 'returned'");
2545 isThisReturn = true;
2546 }
2547 const TargetOptions &Options = DAG.getTarget().Options;
2548 if (Options.EmitCallSiteInfo)
2549 CSInfo.emplace_back(VA.getLocReg(), i);
2550 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2551 } else if (isByVal) {
2552 assert(VA.isMemLoc());
2553 unsigned offset = 0;
2554
2555 // True if this byval aggregate will be split between registers
2556 // and memory.
2557 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2558 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2559
2560 if (CurByValIdx < ByValArgsCount) {
2561
2562 unsigned RegBegin, RegEnd;
2563 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2564
2565 EVT PtrVT =
2566 DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
2567 unsigned int i, j;
2568 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2569 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2570 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2571 SDValue Load =
2572 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2573 DAG.InferPtrAlign(AddArg));
2574 MemOpChains.push_back(Load.getValue(1));
2575 RegsToPass.push_back(std::make_pair(j, Load));
2576 }
2577
2578 // If the parameter size extends beyond the register area, the "offset"
2579 // value helps us calculate the stack slot for the remaining part properly.
2580 offset = RegEnd - RegBegin;
2581
2582 CCInfo.nextInRegsParam();
2583 }
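// Worked example (hypothetical, added for exposition): a 20-byte byval
// aggregate whose in-regs portion is [R0, R4) produces four i32 loads at byte
// offsets 0, 4, 8 and 12 that are appended to RegsToPass, "offset" becomes 4,
// and the remaining 20 - 16 = 4 bytes are copied onto the stack by the
// ARMISD::COPY_STRUCT_BYVAL node emitted just below.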
2584
2585 if (Flags.getByValSize() > 4*offset) {
2586 auto PtrVT = getPointerTy(DAG.getDataLayout());
2587 SDValue Dst;
2588 MachinePointerInfo DstInfo;
2589 std::tie(Dst, DstInfo) =
2590 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2591 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2592 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2593 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2594 MVT::i32);
2595 SDValue AlignNode =
2596 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2597
2598 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2599 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2600 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2601 Ops));
2602 }
2603 } else {
2604 assert(VA.isMemLoc());
2605 SDValue DstAddr;
2606 MachinePointerInfo DstInfo;
2607 std::tie(DstAddr, DstInfo) =
2608 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2609
2610 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2611 MemOpChains.push_back(Store);
2612 }
2613 }
2614
2615 if (!MemOpChains.empty())
2616 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2617
2618 // Build a sequence of copy-to-reg nodes chained together with token chain
2619 // and flag operands which copy the outgoing args into the appropriate regs.
2620 SDValue InFlag;
2621 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2622 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2623 RegsToPass[i].second, InFlag);
2624 InFlag = Chain.getValue(1);
2625 }
2626
2627 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2628 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2629 // node so that legalize doesn't hack it.
2630 bool isDirect = false;
2631
2633 const Module *Mod = MF.getFunction().getParent();
2634 const GlobalValue *GVal = nullptr;
2635 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2636 GVal = G->getGlobal();
2637 bool isStub =
2638 !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
2639
2640 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2641 bool isLocalARMFunc = false;
2642 auto PtrVt = getPointerTy(DAG.getDataLayout());
2643
2644 if (Subtarget->genLongCalls()) {
2645 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2646 "long-calls codegen is not position independent!");
2647 // Handle a global address or an external symbol. If it's not one of
2648 // those, the target's already in a register, so we don't need to do
2649 // anything extra.
2650 if (isa<GlobalAddressSDNode>(Callee)) {
2651 // When generating execute-only code we use movw movt pair.
2652 // Currently execute-only is only available for architectures that
2653 // support movw movt, so we are safe to assume that.
2654 if (Subtarget->genExecuteOnly()) {
2655 assert(Subtarget->useMovt() &&
2656 "long-calls with execute-only requires movt and movw!");
2657 ++NumMovwMovt;
2658 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2659 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2660 } else {
2661 // Create a constant pool entry for the callee address
2662 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2663 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2664 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2665
2666 // Get the address of the callee into a register
2667 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2668 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2669 Callee = DAG.getLoad(
2670 PtrVt, dl, DAG.getEntryNode(), Addr,
2671 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2672 }
2673 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2674 const char *Sym = S->getSymbol();
2675
2676 // When generating execute-only code we use movw movt pair.
2677 // Currently execute-only is only available for architectures that
2678 // support movw movt, so we are safe to assume that.
2679 if (Subtarget->genExecuteOnly()) {
2680 assert(Subtarget->useMovt() &&
2681 "long-calls with execute-only requires movt and movw!");
2682 ++NumMovwMovt;
2683 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2684 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2685 } else {
2686 // Create a constant pool entry for the callee address
2687 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2688 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2689 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2690
2691 // Get the address of the callee into a register
2692 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2693 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2694 Callee = DAG.getLoad(
2695 PtrVt, dl, DAG.getEntryNode(), Addr,
2696 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2697 }
2698 }
2699 } else if (isa<GlobalAddressSDNode>(Callee)) {
2700 if (!PreferIndirect) {
2701 isDirect = true;
2702 bool isDef = GVal->isStrongDefinitionForLinker();
2703
2704 // ARM call to a local ARM function is predicable.
2705 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2706 // tBX takes a register source operand.
2707 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2708 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2709 Callee = DAG.getNode(
2710 ARMISD::WrapperPIC, dl, PtrVt,
2711 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2712 Callee = DAG.getLoad(
2713 PtrVt, dl, DAG.getEntryNode(), Callee,
2714 MachinePointerInfo::getGOT(DAG.getMachineFunction()), MaybeAlign(),
2715 MachineMemOperand::MODereferenceable |
2716 MachineMemOperand::MOInvariant);
2717 } else if (Subtarget->isTargetCOFF()) {
2718 assert(Subtarget->isTargetWindows() &&
2719 "Windows is the only supported COFF target");
2720 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2721 if (GVal->hasDLLImportStorageClass())
2722 TargetFlags = ARMII::MO_DLLIMPORT;
2723 else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
2724 TargetFlags = ARMII::MO_COFFSTUB;
2725 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2726 TargetFlags);
2727 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2728 Callee =
2729 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2730 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2731 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
2732 } else {
2733 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2734 }
2735 }
2736 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2737 isDirect = true;
2738 // tBX takes a register source operand.
2739 const char *Sym = S->getSymbol();
2740 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2741 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2742 ARMConstantPoolValue *CPV =
2743 ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
2744 ARMPCLabelIndex, 4);
2745 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2746 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2747 Callee = DAG.getLoad(
2748 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2749 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
2750 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2751 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2752 } else {
2753 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2754 }
2755 }
2756
2757 if (isCmseNSCall) {
2758 assert(!isARMFunc && !isDirect &&
2759 "Cannot handle call to ARM function or direct call");
2760 if (NumBytes > 0) {
2762 "call to non-secure function would "
2763 "require passing arguments on stack",
2764 dl.getDebugLoc());
2765 DAG.getContext()->diagnose(Diag);
2766 }
2767 if (isStructRet) {
2770 "call to non-secure function would return value through pointer",
2771 dl.getDebugLoc());
2772 DAG.getContext()->diagnose(Diag);
2773 }
2774 }
2775
2776 // FIXME: handle tail calls differently.
2777 unsigned CallOpc;
2778 if (Subtarget->isThumb()) {
2779 if (GuardWithBTI)
2780 CallOpc = ARMISD::t2CALL_BTI;
2781 else if (isCmseNSCall)
2782 CallOpc = ARMISD::tSECALL;
2783 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2784 CallOpc = ARMISD::CALL_NOLINK;
2785 else
2786 CallOpc = ARMISD::CALL;
2787 } else {
2788 if (!isDirect && !Subtarget->hasV5TOps())
2789 CallOpc = ARMISD::CALL_NOLINK;
2790 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2791 // Emit regular call when code size is the priority
2792 !Subtarget->hasMinSize())
2793 // "mov lr, pc; b _foo" to avoid confusing the RSP
2794 CallOpc = ARMISD::CALL_NOLINK;
2795 else
2796 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2797 }
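// Rough editorial summary of the opcode selection above (illustrative only):
//   returns_twice call guarded with BTI      -> ARMISD::t2CALL_BTI
//   Thumb cmse_nonsecure_call                -> ARMISD::tSECALL (BLXNS)
//   indirect call without v5T (no BLX)       -> ARMISD::CALL_NOLINK
//   direct call to a local ARM function      -> ARMISD::CALL_PRED (predicable)
//   everything else                          -> ARMISD::CALL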
2798
2799 // We don't usually want to end the call-sequence here because we would tidy
2800 // the frame up *after* the call, however in the ABI-changing tail-call case
2801 // we've carefully laid out the parameters so that when sp is reset they'll be
2802 // in the correct location.
2803 if (isTailCall && !isSibCall) {
2804 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InFlag, dl);
2805 InFlag = Chain.getValue(1);
2806 }
2807
2808 std::vector<SDValue> Ops;
2809 Ops.push_back(Chain);
2810 Ops.push_back(Callee);
2811
2812 if (isTailCall) {
2813 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2814 }
2815
2816 // Add argument registers to the end of the list so that they are known live
2817 // into the call.
2818 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2819 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2820 RegsToPass[i].second.getValueType()));
2821
2822 // Add a register mask operand representing the call-preserved registers.
2823 const uint32_t *Mask;
2824 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2825 if (isThisReturn) {
2826 // For 'this' returns, use the R0-preserving mask if applicable
2827 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2828 if (!Mask) {
2829 // Set isThisReturn to false if the calling convention is not one that
2830 // allows 'returned' to be modeled in this way, so LowerCallResult does
2831 // not try to pass 'this' straight through
2832 isThisReturn = false;
2833 Mask = ARI->getCallPreservedMask(MF, CallConv);
2834 }
2835 } else
2836 Mask = ARI->getCallPreservedMask(MF, CallConv);
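// Editorial illustration of the 'this'-return path above: for IR such as
//   %p2 = call ptr @setX(ptr returned %p, i32 1)
// the R0-preserving mask lets later passes assume R0 still holds %p after the
// call, so the caller can keep using 'this' without reloading it.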
2837
2838 assert(Mask && "Missing call preserved mask for calling convention");
2839 Ops.push_back(DAG.getRegisterMask(Mask));
2840
2841 if (InFlag.getNode())
2842 Ops.push_back(InFlag);
2843
2844 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2845 if (isTailCall) {
2846 MF.getFrameInfo().setHasTailCall();
2847 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2848 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2849 return Ret;
2850 }
2851
2852 // Returns a chain and a flag for retval copy to use.
2853 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2854 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2855 InFlag = Chain.getValue(1);
2856 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2857
2858 // If we're guaranteeing tail-calls will be honoured, the callee must
2859 // pop its own argument stack on return. But this call is *not* a tail call so
2860 // we need to undo that after it returns to restore the status-quo.
2861 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2862 uint64_t CalleePopBytes =
2863 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2864
2865 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InFlag, dl);
2866 if (!Ins.empty())
2867 InFlag = Chain.getValue(1);
2868
2869 // Handle result values, copying them out of physregs into vregs that we
2870 // return.
2871 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
2872 InVals, isThisReturn,
2873 isThisReturn ? OutVals[0] : SDValue());
2874}
2875
2876/// HandleByVal - Every parameter *after* a byval parameter is passed
2877/// on the stack. Remember the next parameter register to allocate,
2878 /// and then confiscate the rest of the parameter registers to ensure
2879/// this.
2880void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2881 Align Alignment) const {
2882 // Byval (as with any stack) slots are always at least 4 byte aligned.
2883 Alignment = std::max(Alignment, Align(4));
2884
2885 unsigned Reg = State->AllocateReg(GPRArgRegs);
2886 if (!Reg)
2887 return;
2888
2889 unsigned AlignInRegs = Alignment.value() / 4;
2890 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2891 for (unsigned i = 0; i < Waste; ++i)
2892 Reg = State->AllocateReg(GPRArgRegs);
2893
2894 if (!Reg)
2895 return;
2896
2897 unsigned Excess = 4 * (ARM::R4 - Reg);
2898
2899 // Special case when NSAA != SP and the parameter size is greater than the
2900 // size of all remaining GPR registers. In that case we cannot split the
2901 // parameter; it must be sent to the stack. We must also set the NCRN to R4,
2902 // wasting all remaining registers.
2903 const unsigned NSAAOffset = State->getNextStackOffset();
2904 if (NSAAOffset != 0 && Size > Excess) {
2905 while (State->AllocateReg(GPRArgRegs))
2906 ;
2907 return;
2908 }
2909
2910 // The first register for the byval parameter is the first register that
2911 // wasn't allocated before this method was called, i.e. "Reg".
2912 // If the parameter is small enough to fit in the range [Reg, R4), the end
2913 // (one past the last) register is Reg + param-size-in-regs; otherwise the
2914 // parameter is split between registers and the stack, and the end register
2915 // is R4.
2916 unsigned ByValRegBegin = Reg;
2917 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2918 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2919 // Note: the first register was already allocated at the beginning of this
2920 // function; allocate the remaining registers we need.
2921 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2922 State->AllocateReg(GPRArgRegs);
2923 // A byval parameter that is split between registers and memory needs its
2924 // size truncated here.
2925 // In the case where the entire structure fits in registers, we set the
2926 // size in memory to zero.
2927 Size = std::max<int>(Size - Excess, 0);
2928}
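// Worked example (hypothetical) for HandleByVal: a 20-byte byval parameter
// with 8-byte alignment arriving when R0 is already taken and NSAAOffset == 0:
//   AlignInRegs = 2, the first AllocateReg() returns R1, Waste = 3 % 2 = 1,
//   so R1 is skipped and the parameter starts at R2; Excess = 4 * 2 = 8,
//   the in-regs range becomes [R2, R4), and Size is truncated to 20 - 8 = 12,
//   i.e. 8 bytes travel in R2/R3 and the remaining 12 bytes go on the stack.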
2929
2930/// MatchingStackOffset - Return true if the given stack call argument is
2931/// already available in the same position (relatively) of the caller's
2932/// incoming argument stack.
2933static
2934 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2935 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2936 const TargetInstrInfo *TII) {
2937 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2938 int FI = std::numeric_limits<int>::max();
2939 if (Arg.getOpcode() == ISD::CopyFromReg) {
2940 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2941 if (!VR.isVirtual())
2942 return false;
2943 MachineInstr *Def = MRI->getVRegDef(VR);
2944 if (!Def)
2945 return false;
2946 if (!Flags.isByVal()) {
2947 if (!TII->isLoadFromStackSlot(*Def, FI))
2948 return false;
2949 } else {
2950 return false;
2951 }
2952 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2953 if (Flags.isByVal())
2954 // ByVal argument is passed in as a pointer but it's now being
2955 // dereferenced. e.g.
2956 // define @foo(%struct.X* %A) {
2957 // tail call @bar(%struct.X* byval %A)
2958 // }
2959 return false;
2960 SDValue Ptr = Ld->getBasePtr();
2961 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2962 if (!FINode)
2963 return false;
2964 FI = FINode->getIndex();
2965 } else
2966 return false;
2967
2968 assert(FI != std::numeric_limits<int>::max());
2969 if (!MFI.isFixedObjectIndex(FI))
2970 return false;
2971 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2972}
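// Editorial illustration for MatchingStackOffset: in a sibling call such as
//   int f(int a, int b, int c, int d, int e) { return g(a, b, c, d, e); }
// the fifth argument is loaded from the caller's fixed stack slot and would be
// stored back at the same offset and size, so the helper reports a match and
// no copy is required for that operand.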
2973
2974/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2975/// for tail call optimization. Targets which want to do tail call
2976/// optimization should implement this function.
2977bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2978 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2979 bool isCalleeStructRet, bool isCallerStructRet,
2980 const SmallVectorImpl<ISD::OutputArg> &Outs,
2981 const SmallVectorImpl<SDValue> &OutVals,
2982 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG,
2983 const bool isIndirect) const {
2984 MachineFunction &MF = DAG.getMachineFunction();
2985 const Function &CallerF = MF.getFunction();
2986 CallingConv::ID CallerCC = CallerF.getCallingConv();
2987
2988 assert(Subtarget->supportsTailCall());
2989
2990 // Indirect tail calls cannot be optimized for Thumb1 if the args
2991 // to the call take up r0-r3. The reason is that there are no legal registers
2992 // left to hold the pointer to the function to be called.
2993 // Similarly, if the function uses return address sign and authentication,
2994 // r12 is needed to hold the PAC and is not available to hold the callee
2995 // address.
2996 if (Outs.size() >= 4 &&
2997 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
2998 if (Subtarget->isThumb1Only())
2999 return false;
3000 // Conservatively assume the function spills LR.
3001 if (MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true))
3002 return false;
3003 }
3004
3005 // Look for obvious safe cases to perform tail call optimization that do not
3006 // require ABI changes. This is what gcc calls sibcall.
3007
3008 // Exception-handling functions need a special set of instructions to indicate
3009 // a return to the hardware. Tail-calling another function would probably
3010 // break this.
3011 if (CallerF.hasFnAttribute("interrupt"))
3012 return false;
3013
3014 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3015 return CalleeCC == CallerCC;
3016
3017 // Also avoid sibcall optimization if either caller or callee uses struct
3018 // return semantics.
3019 if (isCalleeStructRet || isCallerStructRet)
3020 return false;
3021
3022 // Externally-defined functions with weak linkage should not be
3023 // tail-called on ARM when the OS does not support dynamic
3024 // pre-emption of symbols, as the AAELF spec requires normal calls
3025 // to undefined weak functions to be replaced with a NOP or jump to the
3026 // next instruction. The behaviour of branch instructions in this
3027 // situation (as used for tail calls) is implementation-defined, so we
3028 // cannot rely on the linker replacing the tail call with a return.
3029 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3030 const GlobalValue *GV = G->getGlobal();
3031 const Triple &TT = getTargetMachine().getTargetTriple();
3032 if (GV->hasExternalWeakLinkage() &&
3033 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3034 return false;
3035 }
3036
3037 // Check that the call results are passed in the same way.
3038 LLVMContext &C = *DAG.getContext();
3039 if (!CCState::resultsCompatible(
3040 getEffectiveCallingConv(CalleeCC, isVarArg),
3041 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3042 CCAssignFnForReturn(CalleeCC, isVarArg),
3043 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3044 return false;
3045 // The callee has to preserve all registers the caller needs to preserve.
3046 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3047 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3048 if (CalleeCC != CallerCC) {
3049 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3050 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3051 return false;
3052 }
3053
3054 // If Caller's vararg or byval argument has been split between registers and
3055 // stack, do not perform tail call, since part of the argument is in caller's
3056 // local frame.
3057 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3058 if (AFI_Caller->getArgRegsSaveSize())
3059 return false;
3060
3061 // If the callee takes no arguments then go on to check the results of the
3062 // call.
3063 if (!Outs.empty()) {
3064 // Check if stack adjustment is needed. For now, do not do this if any
3065 // argument is passed on the stack.
3066 SmallVector<CCValAssign, 16> ArgLocs;
3067 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3068 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3069 if (CCInfo.getNextStackOffset()) {
3070 // Check if the arguments are already laid out in the right way as
3071 // the caller's fixed stack objects.
3072 MachineFrameInfo &MFI = MF.getFrameInfo();
3073 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3074 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3075 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3076 i != e;
3077 ++i, ++realArgIdx) {
3078 CCValAssign &VA = ArgLocs[i];
3079 EVT RegVT = VA.getLocVT();
3080 SDValue Arg = OutVals[realArgIdx];
3081 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3082 if (VA.getLocInfo() == CCValAssign::Indirect)
3083 return false;
3084 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3085 // f64 and vector types are split into multiple registers or
3086 // register/stack-slot combinations. The types will not match
3087 // the registers; give up on memory f64 refs until we figure
3088 // out what to do about this.
3089 if (!VA.isRegLoc())
3090 return false;
3091 if (!ArgLocs[++i].isRegLoc())
3092 return false;
3093 if (RegVT == MVT::v2f64) {
3094 if (!ArgLocs[++i].isRegLoc())
3095 return false;
3096 if (!ArgLocs[++i].isRegLoc())
3097 return false;
3098 }
3099 } else if (!VA.isRegLoc()) {
3100 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3101 MFI, MRI, TII))
3102 return false;
3103 }
3104 }
3105 }
3106
3107 const MachineRegisterInfo &MRI = MF.getRegInfo();
3108 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3109 return false;
3110 }
3111
3112 return true;
3113}
3114
3115bool
3116ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3117 MachineFunction &MF, bool isVarArg,
3118 const SmallVectorImpl<ISD::OutputArg> &Outs,
3119 LLVMContext &Context) const {
3120 SmallVector<CCValAssign, 16> RVLocs;
3121 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3122 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3123}
3124
3125 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3126 const SDLoc &DL, SelectionDAG &DAG) {
3127 const MachineFunction &MF = DAG.getMachineFunction();
3128 const Function &F = MF.getFunction();
3129
3130 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3131
3132 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3133 // version of the "preferred return address". These offsets affect the return
3134 // instruction if this is a return from PL1 without hypervisor extensions.
3135 // IRQ/FIQ: +4 "subs pc, lr, #4"
3136 // SWI: 0 "subs pc, lr, #0"
3137 // ABORT: +4 "subs pc, lr, #4"
3138 // UNDEF: +4/+2 "subs pc, lr, #0"
3139 // UNDEF varies depending on whether the exception came from ARM or Thumb
3140 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3141
3142 int64_t LROffset;
3143 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3144 IntKind == "ABORT")
3145 LROffset = 4;
3146 else if (IntKind == "SWI" || IntKind == "UNDEF")
3147 LROffset = 0;
3148 else
3149 report_fatal_error("Unsupported interrupt attribute. If present, value "
3150 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3151
3152 RetOps.insert(RetOps.begin() + 1,
3153 DAG.getConstant(LROffset, DL, MVT::i32, false));
3154
3155 return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
3156}
3157
3158SDValue
3159ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3160 bool isVarArg,
3161 const SmallVectorImpl<ISD::OutputArg> &Outs,
3162 const SmallVectorImpl<SDValue> &OutVals,
3163 const SDLoc &dl, SelectionDAG &DAG) const {
3164 // CCValAssign - represent the assignment of the return value to a location.
3165 SmallVector<CCValAssign, 16> RVLocs;
3166
3167 // CCState - Info about the registers and stack slots.
3168 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3169 *DAG.getContext());
3170
3171 // Analyze outgoing return values.
3172 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3173
3174 SDValue Flag;
3175 SmallVector<SDValue, 4> RetOps;
3176 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3177 bool isLittleEndian = Subtarget->isLittle();
3178
3179 MachineFunction &MF = DAG.getMachineFunction();
3180 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3181 AFI->setReturnRegsCount(RVLocs.size());
3182
3183 // Report error if cmse entry function returns structure through first ptr arg.
3184 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3185 // Note: using an empty SDLoc(), as the first line of the function is a
3186 // better place to report than the last line.
3189 "secure entry function would return value through pointer",
3190 SDLoc().getDebugLoc());
3191 DAG.getContext()->diagnose(Diag);
3192 }
3193
3194 // Copy the result values into the output registers.
3195 for (unsigned i = 0, realRVLocIdx = 0;
3196 i != RVLocs.size();
3197 ++i, ++realRVLocIdx) {
3198 CCValAssign &VA = RVLocs[i];
3199 assert(VA.isRegLoc() && "Can only return in registers!");
3200
3201 SDValue Arg = OutVals[realRVLocIdx];
3202 bool ReturnF16 = false;
3203
3204 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3205 // Half-precision return values can be returned like this:
3206 //
3207 // t11 f16 = fadd ...
3208 // t12: i16 = bitcast t11
3209 // t13: i32 = zero_extend t12
3210 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3211 //
3212 // to avoid code generation for bitcasts, we simply set Arg to the node
3213 // that produces the f16 value, t11 in this case.
3214 //
3215 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3216 SDValue ZE = Arg.getOperand(0);
3217 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3218 SDValue BC = ZE.getOperand(0);
3219 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3220 Arg = BC.getOperand(0);
3221 ReturnF16 = true;
3222 }
3223 }
3224 }
3225 }
3226
3227 switch (VA.getLocInfo()) {
3228 default: llvm_unreachable("Unknown loc info!");
3229 case CCValAssign::Full: break;
3230 case CCValAssign::BCvt:
3231 if (!ReturnF16)
3232 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3233 break;
3234 }
3235
3236 // Mask f16 arguments if this is a CMSE nonsecure entry.
3237 auto RetVT = Outs[realRVLocIdx].ArgVT;
3238 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3239 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3240 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3241 } else {
3242 auto LocBits = VA.getLocVT().getSizeInBits();
3243 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3244 SDValue Mask =
3245 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3246 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3247 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3248 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3249 }
3250 }
3251
3252 if (VA.needsCustom() &&
3253 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3254 if (VA.getLocVT() == MVT::v2f64) {
3255 // Extract the first half and return it in two registers.
3256 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3257 DAG.getConstant(0, dl, MVT::i32));
3258 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3259 DAG.getVTList(MVT::i32, MVT::i32), Half);
3260
3261 Chain =
3262 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3263 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
3264 Flag = Chain.getValue(1);
3265 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3266 VA = RVLocs[++i]; // skip ahead to next loc
3267 Chain =
3268 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3269 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
3270 Flag = Chain.getValue(1);
3271 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3272 VA = RVLocs[++i]; // skip ahead to next loc
3273
3274 // Extract the 2nd half and fall through to handle it as an f64 value.
3275 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3276 DAG.getConstant(1, dl, MVT::i32));
3277 }
3278 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3279 // available.
3280 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3282 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3283 fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
3284 Flag = Chain.getValue(1);
3285 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3286 VA = RVLocs[++i]; // skip ahead to next loc
3287 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3288 fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
3289 } else
3290 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
3291
3292 // Guarantee that all emitted copies are
3293 // stuck together, avoiding something bad.
3294 Flag = Chain.getValue(1);
3295 RetOps.push_back(DAG.getRegister(
3296 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3297 }
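// Editorial note (illustrative, little-endian soft-float AAPCS): an f64 return
// value is split by ARMISD::VMOVRRD into two i32 halves returned in R0 (low
// word) and R1 (high word); a v2f64 return uses the same mechanism twice and
// occupies R0-R3.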
3298 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3299 const MCPhysReg *I =
3300 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3301 if (I) {
3302 for (; *I; ++I) {
3303 if (ARM::GPRRegClass.contains(*I))
3304 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3305 else if (ARM::DPRRegClass.contains(*I))
3306 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
3307 else
3308 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3309 }
3310 }
3311
3312 // Update chain and glue.
3313 RetOps[0] = Chain;
3314 if (Flag.getNode())
3315 RetOps.push_back(Flag);
3316
3317 // CPUs which aren't M-class use a special sequence to return from
3318 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3319 // though we use "subs pc, lr, #N").
3320 //
3321 // M-class CPUs actually use a normal return sequence with a special
3322 // (hardware-provided) value in LR, so the normal code path works.
3323 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3324 !Subtarget->isMClass()) {
3325 if (Subtarget->isThumb1Only())
3326 report_fatal_error("interrupt attribute is not supported in Thumb1");
3327 return LowerInterruptReturn(RetOps, dl, DAG);
3328 }
3329
3330 ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
3331 ARMISD::RET_FLAG;
3332 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3333}
3334
3335bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3336 if (N->getNumValues() != 1)
3337 return false;
3338 if (!N->hasNUsesOfValue(1, 0))
3339 return false;
3340
3341 SDValue TCChain = Chain;
3342 SDNode *Copy = *N->use_begin();
3343 if (Copy->getOpcode() == ISD::CopyToReg) {
3344 // If the copy has a glue operand, we conservatively assume it isn't safe to
3345 // perform a tail call.
3346 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3347 return false;
3348 TCChain = Copy->getOperand(0);
3349 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3350 SDNode *VMov = Copy;
3351 // f64 returned in a pair of GPRs.
3352 SmallPtrSet<SDNode*, 2> Copies;
3353 for (SDNode *U : VMov->uses()) {
3354 if (U->getOpcode() != ISD::CopyToReg)
3355 return false;
3356 Copies.insert(U);
3357 }
3358 if (Copies.size() > 2)
3359 return false;
3360
3361 for (SDNode *U : VMov->uses()) {
3362 SDValue UseChain = U->getOperand(0);
3363 if (Copies.count(UseChain.getNode()))
3364 // Second CopyToReg
3365 Copy = U;
3366 else {
3367 // We are at the top of this chain.
3368 // If the copy has a glue operand, we conservatively assume it
3369 // isn't safe to perform a tail call.
3370 if (U->getOperand(U->getNumOperands() - 1).ge