1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
67#include "llvm/IR/Attributes.h"
68#include "llvm/IR/CallingConv.h"
69#include "llvm/IR/Constant.h"
70#include "llvm/IR/Constants.h"
71#include "llvm/IR/DataLayout.h"
72#include "llvm/IR/DebugLoc.h"
74#include "llvm/IR/Function.h"
75#include "llvm/IR/GlobalAlias.h"
76#include "llvm/IR/GlobalValue.h"
78#include "llvm/IR/IRBuilder.h"
79#include "llvm/IR/InlineAsm.h"
80#include "llvm/IR/Instruction.h"
83#include "llvm/IR/Intrinsics.h"
84#include "llvm/IR/IntrinsicsARM.h"
85#include "llvm/IR/Module.h"
87#include "llvm/IR/Type.h"
88#include "llvm/IR/User.h"
89#include "llvm/IR/Value.h"
90#include "llvm/MC/MCInstrDesc.h"
93#include "llvm/MC/MCSchedule.h"
100#include "llvm/Support/Debug.h"
108#include <algorithm>
109#include <cassert>
110#include <cstdint>
111#include <cstdlib>
112#include <iterator>
113#include <limits>
114#include <optional>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
153
154// The APCS parameter registers.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
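// Under both APCS and AAPCS the first four 32-bit arguments are passed in
// r0-r3; anything beyond that goes on the stack.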
158
159void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
160 if (VT != PromotedLdStVT) {
162 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
163
165 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
166 }
167
168 MVT ElemTy = VT.getVectorElementType();
169 if (ElemTy != MVT::f64)
173 if (ElemTy == MVT::i32) {
178 } else {
183 }
192 if (VT.isInteger()) {
196 }
197
198 // Neon does not support vector divide/remainder operations.
207
208 if (!VT.isFloatingPoint() &&
209 VT != MVT::v2i64 && VT != MVT::v1i64)
210 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
211 setOperationAction(Opcode, VT, Legal);
212 if (!VT.isFloatingPoint())
213 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
214 setOperationAction(Opcode, VT, Legal);
215}
216
217void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
218 addRegisterClass(VT, &ARM::DPRRegClass);
219 addTypeForNEON(VT, MVT::f64);
220}
221
222void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
223 addRegisterClass(VT, &ARM::DPairRegClass);
224 addTypeForNEON(VT, MVT::v2f64);
225}
226
227void ARMTargetLowering::setAllExpand(MVT VT) {
228 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
229 setOperationAction(Opc, VT, Expand);
230
231 // We support these really simple operations even on types where all
232 // the actual arithmetic has to be broken down into simpler
233 // operations or turned into library calls.
238}
239
240void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
241 LegalizeAction Action) {
242 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
243 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
244 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
245}
246
247void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
248 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
249
250 for (auto VT : IntTypes) {
251 addRegisterClass(VT, &ARM::MQPRRegClass);
281
282 // No native support for these.
292
293 // Vector reductions
303
304 if (!HasMVEFP) {
309 } else {
312 }
313
314 // Pre and Post inc are supported on loads and stores
315 for (unsigned im = (unsigned)ISD::PRE_INC;
321 }
322 }
323
324 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
325 for (auto VT : FloatTypes) {
326 addRegisterClass(VT, &ARM::MQPRRegClass);
327 if (!HasMVEFP)
328 setAllExpand(VT);
329
330 // These are legal or custom whether we have MVE.fp or not
343
344 // Pre and Post inc are supported on loads and stores
345 for (unsigned im = (unsigned)ISD::PRE_INC;
351 }
352
353 if (HasMVEFP) {
361
362 // No native support for these.
376 }
377 }
378
 379 // Custom-expand smaller-than-legal vector reductions to prevent false zero
 380 // items from being added.
389
390 // We 'support' these types up to bitcast/load/store level, regardless of
391 // MVE integer-only / float support. Only doing FP data processing on the FP
392 // vector types is inhibited at integer-only level.
393 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
394 for (auto VT : LongTypes) {
395 addRegisterClass(VT, &ARM::MQPRRegClass);
396 setAllExpand(VT);
402 }
404
405 // We can do bitwise operations on v2i64 vectors
406 setOperationAction(ISD::AND, MVT::v2i64, Legal);
407 setOperationAction(ISD::OR, MVT::v2i64, Legal);
408 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
409
410 // It is legal to extload from v4i8 to v4i16 or v4i32.
411 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
412 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
413 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
414
415 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
421
422 // Some truncating stores are legal too.
423 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
424 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
425 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
426
427 // Pre and Post inc on these are legal, given the correct extends
428 for (unsigned im = (unsigned)ISD::PRE_INC;
430 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
435 }
436 }
437
438 // Predicate types
439 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
440 for (auto VT : pTypes) {
441 addRegisterClass(VT, &ARM::VCCRRegClass);
456
457 if (!HasMVEFP) {
462 }
463 }
467 setOperationAction(ISD::OR, MVT::v2i1, Expand);
473
482}
483
485 const ARMSubtarget &STI)
486 : TargetLowering(TM), Subtarget(&STI) {
487 RegInfo = Subtarget->getRegisterInfo();
488 Itins = Subtarget->getInstrItineraryData();
489
492
493 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
494 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
495 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
496 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
497 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
498 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
500 }
501
502 if (Subtarget->isTargetMachO()) {
503 // Uses VFP for Thumb libfuncs if available.
504 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
505 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
506 static const struct {
507 const RTLIB::Libcall Op;
508 const char * const Name;
509 const ISD::CondCode Cond;
510 } LibraryCalls[] = {
511 // Single-precision floating-point arithmetic.
512 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
513 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
514 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
515 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
516
517 // Double-precision floating-point arithmetic.
518 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
519 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
520 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
521 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
522
523 // Single-precision comparisons.
524 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
525 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
526 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
527 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
528 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
529 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
530 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
531
532 // Double-precision comparisons.
533 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
534 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
535 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
536 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
537 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
538 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
539 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
540
541 // Floating-point to integer conversions.
542 // i64 conversions are done via library routines even when generating VFP
543 // instructions, so use the same ones.
544 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
545 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
546 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
547 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
548
549 // Conversions between floating types.
550 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
551 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
552
553 // Integer to floating-point conversions.
554 // i64 conversions are done via library routines even when generating VFP
555 // instructions, so use the same ones.
556 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
557 // e.g., __floatunsidf vs. __floatunssidfvfp.
558 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
559 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
560 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
561 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
562 };
563
564 for (const auto &LC : LibraryCalls) {
565 setLibcallName(LC.Op, LC.Name);
566 if (LC.Cond != ISD::SETCC_INVALID)
567 setCmpLibcallCC(LC.Op, LC.Cond);
568 }
569 }
570 }
571
572 // These libcalls are not available in 32-bit.
573 setLibcallName(RTLIB::SHL_I128, nullptr);
574 setLibcallName(RTLIB::SRL_I128, nullptr);
575 setLibcallName(RTLIB::SRA_I128, nullptr);
576 setLibcallName(RTLIB::MUL_I128, nullptr);
577 setLibcallName(RTLIB::MULO_I64, nullptr);
578 setLibcallName(RTLIB::MULO_I128, nullptr);
579
580 // RTLIB
581 if (Subtarget->isAAPCS_ABI() &&
582 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
583 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
584 static const struct {
585 const RTLIB::Libcall Op;
586 const char * const Name;
587 const CallingConv::ID CC;
588 const ISD::CondCode Cond;
589 } LibraryCalls[] = {
590 // Double-precision floating-point arithmetic helper functions
591 // RTABI chapter 4.1.2, Table 2
592 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
593 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
594 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
596
597 // Double-precision floating-point comparison helper functions
598 // RTABI chapter 4.1.2, Table 3
599 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
600 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
601 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
602 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
603 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
604 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
605 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
606
607 // Single-precision floating-point arithmetic helper functions
608 // RTABI chapter 4.1.2, Table 4
609 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
610 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
611 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
613
614 // Single-precision floating-point comparison helper functions
615 // RTABI chapter 4.1.2, Table 5
616 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
617 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
618 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
619 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
620 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
621 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
622 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
623
624 // Floating-point to integer conversions.
625 // RTABI chapter 4.1.2, Table 6
626 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
627 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
628 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
629 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634
635 // Conversions between floating types.
636 // RTABI chapter 4.1.2, Table 7
637 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
638 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
639 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
640
641 // Integer to floating-point conversions.
642 // RTABI chapter 4.1.2, Table 8
643 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
644 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
645 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
646 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
647 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
648 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
649 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
651
652 // Long long helper functions
653 // RTABI chapter 4.2, Table 9
654 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
655 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658
659 // Integer division functions
660 // RTABI chapter 4.3.1
661 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
662 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
663 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
664 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
665 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
666 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
669 };
670
671 for (const auto &LC : LibraryCalls) {
672 setLibcallName(LC.Op, LC.Name);
673 setLibcallCallingConv(LC.Op, LC.CC);
674 if (LC.Cond != ISD::SETCC_INVALID)
675 setCmpLibcallCC(LC.Op, LC.Cond);
676 }
677
678 // EABI dependent RTLIB
679 if (TM.Options.EABIVersion == EABI::EABI4 ||
680 TM.Options.EABIVersion == EABI::EABI5) {
681 static const struct {
682 const RTLIB::Libcall Op;
683 const char *const Name;
684 const CallingConv::ID CC;
685 const ISD::CondCode Cond;
686 } MemOpsLibraryCalls[] = {
687 // Memory operations
688 // RTABI chapter 4.3.4
689 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
690 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
691 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
692 };
693
694 for (const auto &LC : MemOpsLibraryCalls) {
695 setLibcallName(LC.Op, LC.Name);
696 setLibcallCallingConv(LC.Op, LC.CC);
697 if (LC.Cond != ISD::SETCC_INVALID)
698 setCmpLibcallCC(LC.Op, LC.Cond);
699 }
700 }
701 }
702
703 if (Subtarget->isTargetWindows()) {
704 static const struct {
705 const RTLIB::Libcall Op;
706 const char * const Name;
707 const CallingConv::ID CC;
708 } LibraryCalls[] = {
709 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
710 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
711 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
712 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
713 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
714 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
715 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
716 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
717 };
718
719 for (const auto &LC : LibraryCalls) {
720 setLibcallName(LC.Op, LC.Name);
721 setLibcallCallingConv(LC.Op, LC.CC);
722 }
723 }
724
725 // Use divmod compiler-rt calls for iOS 5.0 and later.
726 if (Subtarget->isTargetMachO() &&
727 !(Subtarget->isTargetIOS() &&
728 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
729 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
730 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
731 }
732
733 // The half <-> float conversion functions are always soft-float on
 734 // non-watchOS platforms, but are needed for some targets which use a
735 // hard-float calling convention by default.
736 if (!Subtarget->isTargetWatchABI()) {
737 if (Subtarget->isAAPCS_ABI()) {
738 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
739 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
740 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
741 } else {
742 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
743 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
744 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
745 }
746 }
747
748 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
749 // a __gnu_ prefix (which is the default).
750 if (Subtarget->isTargetAEABI()) {
751 static const struct {
752 const RTLIB::Libcall Op;
753 const char * const Name;
754 const CallingConv::ID CC;
755 } LibraryCalls[] = {
756 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
757 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
758 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
759 };
760
761 for (const auto &LC : LibraryCalls) {
762 setLibcallName(LC.Op, LC.Name);
763 setLibcallCallingConv(LC.Op, LC.CC);
764 }
765 }
766
767 if (Subtarget->isThumb1Only())
768 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
769 else
770 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
771
772 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
773 Subtarget->hasFPRegs()) {
774 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
775 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
776
781
782 if (!Subtarget->hasVFP2Base())
783 setAllExpand(MVT::f32);
784 if (!Subtarget->hasFP64())
785 setAllExpand(MVT::f64);
786 }
787
788 if (Subtarget->hasFullFP16()) {
789 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
792
795 }
796
797 if (Subtarget->hasBF16()) {
798 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
799 setAllExpand(MVT::bf16);
800 if (!Subtarget->hasFullFP16())
802 }
803
805 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
806 setTruncStoreAction(VT, InnerVT, Expand);
807 addAllExtLoads(VT, InnerVT, Expand);
808 }
809
812
814 }
815
818
821
822 if (Subtarget->hasMVEIntegerOps())
823 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
824
825 // Combine low-overhead loop intrinsics so that we can lower i1 types.
826 if (Subtarget->hasLOB()) {
828 }
829
830 if (Subtarget->hasNEON()) {
831 addDRTypeForNEON(MVT::v2f32);
832 addDRTypeForNEON(MVT::v8i8);
833 addDRTypeForNEON(MVT::v4i16);
834 addDRTypeForNEON(MVT::v2i32);
835 addDRTypeForNEON(MVT::v1i64);
836
837 addQRTypeForNEON(MVT::v4f32);
838 addQRTypeForNEON(MVT::v2f64);
839 addQRTypeForNEON(MVT::v16i8);
840 addQRTypeForNEON(MVT::v8i16);
841 addQRTypeForNEON(MVT::v4i32);
842 addQRTypeForNEON(MVT::v2i64);
843
844 if (Subtarget->hasFullFP16()) {
845 addQRTypeForNEON(MVT::v8f16);
846 addDRTypeForNEON(MVT::v4f16);
847 }
848
849 if (Subtarget->hasBF16()) {
850 addQRTypeForNEON(MVT::v8bf16);
851 addDRTypeForNEON(MVT::v4bf16);
852 }
853 }
854
855 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
856 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
857 // none of Neon, MVE or VFP supports any arithmetic operations on it.
858 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
859 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
860 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
861 // FIXME: Code duplication: FDIV and FREM are expanded always, see
862 // ARMTargetLowering::addTypeForNEON method for details.
863 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
864 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
865 // FIXME: Create unittest.
 866 // In other words, find a case in which "copysign" appears in the DAG with
 867 // vector operands.
869 // FIXME: Code duplication: SETCC has custom operation action, see
870 // ARMTargetLowering::addTypeForNEON method for details.
872 // FIXME: Create unittest for FNEG and for FABS.
873 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
874 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
876 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
877 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
878 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
879 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
882 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
885 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
891 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
892 }
893
894 if (Subtarget->hasNEON()) {
 895 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul are
 896 // natively supported for v4f32.
898 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
899 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
900 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
901 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
904 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
912
913 // Mark v2f32 intrinsics.
915 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
916 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
917 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
918 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
921 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
929
930 // Neon does not support some operations on v1i64 and v2i64 types.
931 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
932 // Custom handling for some quad-vector types to detect VMULL.
933 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
934 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
935 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
936 // Custom handling for some vector types to avoid expensive expansions
937 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
939 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
 941 // Neon does not have single-instruction SINT_TO_FP and UINT_TO_FP with
 942 // a destination type that is wider than the source, nor does
 943 // it have an FP_TO_[SU]INT instruction with a narrower destination than
 944 // source.
953
956
 957 // NEON does not have a single-instruction CTPOP for vectors with element
 958 // types wider than 8 bits. However, custom lowering can leverage the
959 // v8i8/v16i8 vcnt instruction.
966
967 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
968 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
969
 970 // NEON does not have a single-instruction CTTZ for vectors.
972 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
973 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
974 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
975
976 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
977 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
978 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
979 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
980
985
990
994 }
995
996 // NEON only has FMA instructions as of VFP4.
997 if (!Subtarget->hasVFP4Base()) {
998 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
999 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
1000 }
1001
1004
1005 // It is legal to extload from v4i8 to v4i16 or v4i32.
1006 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
1007 MVT::v2i32}) {
1012 }
1013 }
1014
1015 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1016 MVT::v4i32}) {
1021 }
1022 }
1023
1024 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1031 }
1032 if (Subtarget->hasMVEIntegerOps()) {
1035 ISD::SETCC});
1036 }
1037 if (Subtarget->hasMVEFloatOps()) {
1039 }
1040
1041 if (!Subtarget->hasFP64()) {
1042 // When targeting a floating-point unit with only single-precision
1043 // operations, f64 is legal for the few double-precision instructions which
 1044 // are present. However, no double-precision operations other than moves,
1045 // loads and stores are provided by the hardware.
1083 }
1084
1085 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1088 if (Subtarget->hasFullFP16()) {
1091 }
1092 }
1093
1094 if (!Subtarget->hasFP16()) {
1097 }
1098
1100
1101 // ARM does not have floating-point extending loads.
1102 for (MVT VT : MVT::fp_valuetypes()) {
1103 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1104 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1105 }
1106
1107 // ... or truncating stores
1108 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1109 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1110 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1111
 1112 // ARM does not have i1 sign-extending loads.
1113 for (MVT VT : MVT::integer_valuetypes())
1114 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1115
1116 // ARM supports all 4 flavors of integer indexed load / store.
1117 if (!Subtarget->isThumb1Only()) {
1118 for (unsigned im = (unsigned)ISD::PRE_INC;
1120 setIndexedLoadAction(im, MVT::i1, Legal);
1121 setIndexedLoadAction(im, MVT::i8, Legal);
1122 setIndexedLoadAction(im, MVT::i16, Legal);
1123 setIndexedLoadAction(im, MVT::i32, Legal);
1124 setIndexedStoreAction(im, MVT::i1, Legal);
1125 setIndexedStoreAction(im, MVT::i8, Legal);
1126 setIndexedStoreAction(im, MVT::i16, Legal);
1127 setIndexedStoreAction(im, MVT::i32, Legal);
1128 }
1129 } else {
1130 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1133 }
1134
1139
1142 if (Subtarget->hasDSP()) {
1151 }
1152 if (Subtarget->hasBaseDSP()) {
1155 }
1156
1157 // i64 operation support.
1160 if (Subtarget->isThumb1Only()) {
1163 }
1164 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1165 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1167
1177
 1178 // MVE lowers 64-bit shifts to lsll and lsrl,
 1179 // assuming that ISD::SRL and SRA of i64 are already marked custom.
1180 if (Subtarget->hasMVEIntegerOps())
1182
1183 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1184 if (Subtarget->isThumb1Only()) {
1188 }
1189
1190 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1192
1193 // ARM does not have ROTL.
1198 }
1201 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1204 }
1205
1206 // @llvm.readcyclecounter requires the Performance Monitors extension.
1207 // Default to the 0 expansion on unsupported platforms.
1208 // FIXME: Technically there are older ARM CPUs that have
1209 // implementation-specific ways of obtaining this information.
1210 if (Subtarget->hasPerfMon())
1212
1213 // Only ARMv6 has BSWAP.
1214 if (!Subtarget->hasV6Ops())
1216
1217 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1218 : Subtarget->hasDivideInARMMode();
1219 if (!hasDivide) {
 1220 // These are expanded into libcalls if the CPU doesn't have a HW divider.
1223 }
1224
1225 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1228
1231 }
1232
1235
1236 // Register based DivRem for AEABI (RTABI 4.2)
1237 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1238 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1239 Subtarget->isTargetWindows()) {
1242 HasStandaloneRem = false;
1243
1244 if (Subtarget->isTargetWindows()) {
1245 const struct {
1246 const RTLIB::Libcall Op;
1247 const char * const Name;
1248 const CallingConv::ID CC;
1249 } LibraryCalls[] = {
1250 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1251 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1252 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1253 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1254
1255 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1256 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1257 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1258 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1259 };
1260
1261 for (const auto &LC : LibraryCalls) {
1262 setLibcallName(LC.Op, LC.Name);
1263 setLibcallCallingConv(LC.Op, LC.CC);
1264 }
1265 } else {
1266 const struct {
1267 const RTLIB::Libcall Op;
1268 const char * const Name;
1269 const CallingConv::ID CC;
1270 } LibraryCalls[] = {
1271 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1272 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1273 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1274 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1275
1276 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1277 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1278 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1279 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1280 };
1281
1282 for (const auto &LC : LibraryCalls) {
1283 setLibcallName(LC.Op, LC.Name);
1284 setLibcallCallingConv(LC.Op, LC.CC);
1285 }
1286 }
1287
1292 } else {
1295 }
1296
1297 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1298 // MSVCRT doesn't have powi; fall back to pow
1299 setLibcallName(RTLIB::POWI_F32, nullptr);
1300 setLibcallName(RTLIB::POWI_F64, nullptr);
1301 }
1302
1307
1308 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1310
1311 // Use the default implementation.
1313 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1315 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1318
1319 if (Subtarget->isTargetWindows())
1321 else
1323
1324 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1325 // the default expansion.
1326 InsertFencesForAtomic = false;
1327 if (Subtarget->hasAnyDataBarrier() &&
1328 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1329 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1330 // to ldrex/strex loops already.
1332 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1334
1335 // On v8, we have particularly efficient implementations of atomic fences
1336 // if they can be combined with nearby atomic loads and stores.
1337 if (!Subtarget->hasAcquireRelease() ||
1338 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1339 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1340 InsertFencesForAtomic = true;
1341 }
1342 } else {
1343 // If there's anything we can use as a barrier, go through custom lowering
1344 // for ATOMIC_FENCE.
 1345 // If the target has DMB in Thumb, fences can be inserted.
1346 if (Subtarget->hasDataBarrier())
1347 InsertFencesForAtomic = true;
1348
1350 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1351
1352 // Set them all for libcall, which will force libcalls.
1365 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1366 // Unordered/Monotonic case.
1367 if (!InsertFencesForAtomic) {
1370 }
1371 }
1372
1373 // Compute supported atomic widths.
1374 if (Subtarget->isTargetLinux() ||
1375 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1376 // For targets where __sync_* routines are reliably available, we use them
1377 // if necessary.
1378 //
1379 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1380 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1381 //
1382 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1383 // such targets should provide __sync_* routines, which use the ARM mode
1384 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1385 // encoding; see ARMISD::MEMBARRIER_MCR.)
1387 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1388 Subtarget->hasForced32BitAtomics()) {
 1389 // Cortex-M cores (other than Cortex-M0) have 32-bit atomics.
1391 } else {
1392 // We can't assume anything about other targets; just use libatomic
1393 // routines.
1395 }
1396
1398
1400
1401 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1402 if (!Subtarget->hasV6Ops()) {
1405 }
1407
1408 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1409 !Subtarget->isThumb1Only()) {
 1410 // Turn f64 -> i64 into VMOVRRD, and i64 -> f64 into VMOVDRR,
 1411 // iff the target supports VFP2.
1421 }
1422
1423 // We want to custom lower some of our intrinsics.
1428 if (Subtarget->useSjLjEH())
1429 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1430
1440 if (Subtarget->hasFullFP16()) {
1444 }
1445
1447
1450 if (Subtarget->hasFullFP16())
1454 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1455
1456 // We don't support sin/cos/fmod/copysign/pow
1465 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1466 !Subtarget->isThumb1Only()) {
1469 }
1472
1473 if (!Subtarget->hasVFP4Base()) {
1476 }
1477
1478 // Various VFP goodness
1479 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1480 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1481 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1484 }
1485
1486 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1487 if (!Subtarget->hasFP16()) {
1490 }
1491
1492 // Strict floating-point comparisons need custom lowering.
1499 }
1500
1501 // Use __sincos_stret if available.
1502 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1503 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1506 }
1507
1508 // FP-ARMv8 implements a lot of rounding-like FP operations.
1509 if (Subtarget->hasFPARMv8Base()) {
1518 if (Subtarget->hasNEON()) {
1523 }
1524
1525 if (Subtarget->hasFP64()) {
1534 }
1535 }
1536
 1537 // FP16 often needs to be promoted to call lib functions.
1538 if (Subtarget->hasFullFP16()) {
1552
1554 }
1555
1556 if (Subtarget->hasNEON()) {
1557 // vmin and vmax aren't available in a scalar form, so we can use
1558 // a NEON instruction with an undef lane instead. This has a performance
1559 // penalty on some cores, so we don't do this unless we have been
1560 // asked to by the core tuning model.
1561 if (Subtarget->useNEONForSinglePrecisionFP()) {
1566 }
1571
1572 if (Subtarget->hasFullFP16()) {
1577
1582 }
1583 }
1584
1585 // We have target-specific dag combine patterns for the following nodes:
1586 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1589
1590 if (Subtarget->hasMVEIntegerOps())
1592
1593 if (Subtarget->hasV6Ops())
1595 if (Subtarget->isThumb1Only())
1597 // Attempt to lower smin/smax to ssat/usat
1598 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1599 Subtarget->isThumb2()) {
1601 }
1602
1604
1605 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1606 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1608 else
1610
1611 //// temporary - rewrite interface to use type
1614 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1616 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1618
 1619 // On ARM, arguments smaller than 4 bytes are extended, so all arguments
 1620 // are at least 4-byte aligned.
1622
1623 // Prefer likely predicted branches to selects on out-of-order cores.
1624 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1625
1626 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1628
1629 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1630
1631 if (Subtarget->isThumb() || Subtarget->isThumb2())
1633}
1634
1636 return Subtarget->useSoftFloat();
1637}
1638
1639// FIXME: It might make sense to define the representative register class as the
1640// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1641// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1642// SPR's representative would be DPR_VFP2. This should work well if register
1643// pressure tracking were modified such that a register use would increment the
1644// pressure of the register class's representative and all of its super
1645// classes' representatives transitively. We have not implemented this because
1646// of the difficulty prior to coalescing of modeling operand register classes
1647// due to the common occurrence of cross class copies and subregister insertions
1648// and extractions.
1649std::pair<const TargetRegisterClass *, uint8_t>
1651 MVT VT) const {
1652 const TargetRegisterClass *RRC = nullptr;
1653 uint8_t Cost = 1;
1654 switch (VT.SimpleTy) {
1655 default:
 1657 // Use DPR as the representative register class for all floating-point
 1658 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
 1659 // the cost is 1 for both f32 and f64.
1660 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1661 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1662 RRC = &ARM::DPRRegClass;
1663 // When NEON is used for SP, only half of the register file is available
1664 // because operations that define both SP and DP results will be constrained
1665 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1666 // coalescing by double-counting the SP regs. See the FIXME above.
1667 if (Subtarget->useNEONForSinglePrecisionFP())
1668 Cost = 2;
1669 break;
1670 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1671 case MVT::v4f32: case MVT::v2f64:
1672 RRC = &ARM::DPRRegClass;
1673 Cost = 2;
1674 break;
1675 case MVT::v4i64:
1676 RRC = &ARM::DPRRegClass;
1677 Cost = 4;
1678 break;
1679 case MVT::v8i64:
1680 RRC = &ARM::DPRRegClass;
1681 Cost = 8;
1682 break;
1683 }
1684 return std::make_pair(RRC, Cost);
1685}
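// For example, under this scheme a v4f32 value lives in a Q register (a pair
// of D registers), so it reports the DPR class with a cost of 2, while v8i64
// spans eight consecutive D registers and reports a cost of 8.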
1686
1687const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1688#define MAKE_CASE(V) \
1689 case V: \
1690 return #V;
1691 switch ((ARMISD::NodeType)Opcode) {
1693 break;
1897#undef MAKE_CASE
1898 }
1899 return nullptr;
1900}
1901
1903 EVT VT) const {
1904 if (!VT.isVector())
1905 return getPointerTy(DL);
1906
1907 // MVE has a predicate register.
1908 if ((Subtarget->hasMVEIntegerOps() &&
1909 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1910 VT == MVT::v16i8)) ||
1911 (Subtarget->hasMVEFloatOps() &&
1912 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1913 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1915}
1916
1917/// getRegClassFor - Return the register class that should be used for the
1918/// specified value type.
1919const TargetRegisterClass *
1920ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1921 (void)isDivergent;
1922 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1923 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1924 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1925 // MVE Q registers.
1926 if (Subtarget->hasNEON()) {
1927 if (VT == MVT::v4i64)
1928 return &ARM::QQPRRegClass;
1929 if (VT == MVT::v8i64)
1930 return &ARM::QQQQPRRegClass;
1931 }
1932 if (Subtarget->hasMVEIntegerOps()) {
1933 if (VT == MVT::v4i64)
1934 return &ARM::MQQPRRegClass;
1935 if (VT == MVT::v8i64)
1936 return &ARM::MQQQQPRRegClass;
1937 }
1939}
1940
1941// memcpy, and other memory intrinsics, typically try to use LDM/STM if the
1942// source/dest is aligned and the copy size is large enough. We therefore want
1943// to align such objects passed to memory intrinsics.
1945 Align &PrefAlign) const {
1946 if (!isa<MemIntrinsic>(CI))
1947 return false;
1948 MinSize = 8;
1949 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1950 // cycle faster than 4-byte aligned LDM.
1951 PrefAlign =
1952 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1953 return true;
1954}
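// In effect: objects of at least 8 bytes passed to memory intrinsics are given
// a preferred alignment of 8 bytes (4 on M-class and pre-v6 cores) so the
// inlined copy can use the faster aligned LDM/STM forms.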
1955
1956// Create a fast isel object.
1957FastISel *
1959 const TargetLibraryInfo *libInfo) const {
1960 return ARM::createFastISel(funcInfo, libInfo);
1961}
1962
1964 unsigned NumVals = N->getNumValues();
1965 if (!NumVals)
1966 return Sched::RegPressure;
1967
1968 for (unsigned i = 0; i != NumVals; ++i) {
1969 EVT VT = N->getValueType(i);
1970 if (VT == MVT::Glue || VT == MVT::Other)
1971 continue;
1972 if (VT.isFloatingPoint() || VT.isVector())
1973 return Sched::ILP;
1974 }
1975
1976 if (!N->isMachineOpcode())
1977 return Sched::RegPressure;
1978
 1979 // Loads are scheduled for latency even if the instruction itinerary
1980 // is not available.
1981 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1982 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1983
1984 if (MCID.getNumDefs() == 0)
1985 return Sched::RegPressure;
1986 if (!Itins->isEmpty() &&
1987 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1988 return Sched::ILP;
1989
1990 return Sched::RegPressure;
1991}
1992
1993//===----------------------------------------------------------------------===//
1994// Lowering Code
1995//===----------------------------------------------------------------------===//
1996
1997static bool isSRL16(const SDValue &Op) {
1998 if (Op.getOpcode() != ISD::SRL)
1999 return false;
2000 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2001 return Const->getZExtValue() == 16;
2002 return false;
2003}
2004
2005static bool isSRA16(const SDValue &Op) {
2006 if (Op.getOpcode() != ISD::SRA)
2007 return false;
2008 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2009 return Const->getZExtValue() == 16;
2010 return false;
2011}
2012
2013static bool isSHL16(const SDValue &Op) {
2014 if (Op.getOpcode() != ISD::SHL)
2015 return false;
2016 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2017 return Const->getZExtValue() == 16;
2018 return false;
2019}
2020
2021// Check for a signed 16-bit value. We special-case SRA because it makes it
2022// simpler when also looking for SRAs that aren't sign-extending a
2023// smaller value. Without the check, we'd need to take extra care with
2024// checking order for some operations.
2025static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2026 if (isSRA16(Op))
2027 return isSHL16(Op.getOperand(0));
2028 return DAG.ComputeNumSignBits(Op) == 17;
2029}
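// E.g. a 32-bit value that fits in a signed 16-bit range has bits [31:15] all
// equal to the sign bit, i.e. 17 identical leading sign bits, which is what
// ComputeNumSignBits counts.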
2030
2031/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2033 switch (CC) {
2034 default: llvm_unreachable("Unknown condition code!");
2035 case ISD::SETNE: return ARMCC::NE;
2036 case ISD::SETEQ: return ARMCC::EQ;
2037 case ISD::SETGT: return ARMCC::GT;
2038 case ISD::SETGE: return ARMCC::GE;
2039 case ISD::SETLT: return ARMCC::LT;
2040 case ISD::SETLE: return ARMCC::LE;
2041 case ISD::SETUGT: return ARMCC::HI;
2042 case ISD::SETUGE: return ARMCC::HS;
2043 case ISD::SETULT: return ARMCC::LO;
2044 case ISD::SETULE: return ARMCC::LS;
2045 }
2046}
2047
2048/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2050 ARMCC::CondCodes &CondCode2) {
2051 CondCode2 = ARMCC::AL;
2052 switch (CC) {
2053 default: llvm_unreachable("Unknown FP condition!");
2054 case ISD::SETEQ:
2055 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2056 case ISD::SETGT:
2057 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2058 case ISD::SETGE:
2059 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2060 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2061 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2062 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2063 case ISD::SETO: CondCode = ARMCC::VC; break;
2064 case ISD::SETUO: CondCode = ARMCC::VS; break;
2065 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2066 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2067 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2068 case ISD::SETLT:
2069 case ISD::SETULT: CondCode = ARMCC::LT; break;
2070 case ISD::SETLE:
2071 case ISD::SETULE: CondCode = ARMCC::LE; break;
2072 case ISD::SETNE:
2073 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2074 }
2075}
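// Note that some FP conditions need two ARM condition checks: e.g. SETONE
// ("ordered and not equal") is true when the compare result is either less
// than (MI) or greater than (GT), and SETUEQ needs EQ or unordered (VS).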
2076
2077//===----------------------------------------------------------------------===//
2078// Calling Convention Implementation
2079//===----------------------------------------------------------------------===//
2080
2081/// getEffectiveCallingConv - Get the effective calling convention, taking into
2082/// account the presence of floating-point hardware and calling convention
2083/// limitations, such as support for variadic functions.
2085ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2086 bool isVarArg) const {
2087 switch (CC) {
2088 default:
2089 report_fatal_error("Unsupported calling convention");
2092 case CallingConv::GHC:
2094 return CC;
2100 case CallingConv::Swift:
2103 case CallingConv::C:
2104 case CallingConv::Tail:
2105 if (!Subtarget->isAAPCS_ABI())
2106 return CallingConv::ARM_APCS;
2107 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2108 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2109 !isVarArg)
2111 else
2113 case CallingConv::Fast:
2115 if (!Subtarget->isAAPCS_ABI()) {
2116 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2117 return CallingConv::Fast;
2118 return CallingConv::ARM_APCS;
2119 } else if (Subtarget->hasVFP2Base() &&
2120 !Subtarget->isThumb1Only() && !isVarArg)
2122 else
2124 }
2125}
2126
2128 bool isVarArg) const {
2129 return CCAssignFnForNode(CC, false, isVarArg);
2130}
2131
2133 bool isVarArg) const {
2134 return CCAssignFnForNode(CC, true, isVarArg);
2135}
2136
2137/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2138/// CallingConvention.
2139CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2140 bool Return,
2141 bool isVarArg) const {
2142 switch (getEffectiveCallingConv(CC, isVarArg)) {
2143 default:
2144 report_fatal_error("Unsupported calling convention");
2146 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2148 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2150 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2151 case CallingConv::Fast:
2152 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2153 case CallingConv::GHC:
2154 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2156 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2158 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2160 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2161 }
2162}
2163
2164SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2165 MVT LocVT, MVT ValVT, SDValue Val) const {
2166 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2167 Val);
2168 if (Subtarget->hasFullFP16()) {
2169 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2170 } else {
2171 Val = DAG.getNode(ISD::TRUNCATE, dl,
2172 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2173 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2174 }
2175 return Val;
2176}
2177
2178SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2179 MVT LocVT, MVT ValVT,
2180 SDValue Val) const {
2181 if (Subtarget->hasFullFP16()) {
2182 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2183 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2184 } else {
2185 Val = DAG.getNode(ISD::BITCAST, dl,
2186 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2187 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2188 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2189 }
2190 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2191}
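// Together, MoveToHPR and MoveFromHPR model the ABI rule that an f16/bf16
// value occupies the low 16 bits of a 32-bit argument/return location: with
// full FP16 support the VMOVhr/VMOVrh nodes move it directly, otherwise the
// move is expressed as an integer truncate/zero-extend plus bitcasts.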
2192
2193/// LowerCallResult - Lower the result values of a call into the
2194/// appropriate copies out of appropriate physical registers.
2195SDValue ARMTargetLowering::LowerCallResult(
2196 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2197 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2198 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2199 SDValue ThisVal) const {
2200 // Assign locations to each value returned by this call.
2202 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2203 *DAG.getContext());
2204 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2205
2206 // Copy all of the result registers out of their specified physreg.
2207 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2208 CCValAssign VA = RVLocs[i];
2209
2210 // Pass 'this' value directly from the argument to return value, to avoid
2211 // reg unit interference
2212 if (i == 0 && isThisReturn) {
2213 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2214 "unexpected return calling convention register assignment");
2215 InVals.push_back(ThisVal);
2216 continue;
2217 }
2218
2219 SDValue Val;
2220 if (VA.needsCustom() &&
2221 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2222 // Handle f64 or half of a v2f64.
2223 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2224 InGlue);
2225 Chain = Lo.getValue(1);
2226 InGlue = Lo.getValue(2);
2227 VA = RVLocs[++i]; // skip ahead to next loc
2228 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2229 InGlue);
2230 Chain = Hi.getValue(1);
2231 InGlue = Hi.getValue(2);
2232 if (!Subtarget->isLittle())
2233 std::swap (Lo, Hi);
2234 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2235
2236 if (VA.getLocVT() == MVT::v2f64) {
2237 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2238 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2239 DAG.getConstant(0, dl, MVT::i32));
2240
2241 VA = RVLocs[++i]; // skip ahead to next loc
2242 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2243 Chain = Lo.getValue(1);
2244 InGlue = Lo.getValue(2);
2245 VA = RVLocs[++i]; // skip ahead to next loc
2246 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2247 Chain = Hi.getValue(1);
2248 InGlue = Hi.getValue(2);
2249 if (!Subtarget->isLittle())
2250 std::swap (Lo, Hi);
2251 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2252 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2253 DAG.getConstant(1, dl, MVT::i32));
2254 }
2255 } else {
2256 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2257 InGlue);
2258 Chain = Val.getValue(1);
2259 InGlue = Val.getValue(2);
2260 }
2261
2262 switch (VA.getLocInfo()) {
2263 default: llvm_unreachable("Unknown loc info!");
2264 case CCValAssign::Full: break;
2265 case CCValAssign::BCvt:
2266 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2267 break;
2268 }
2269
 2270 // f16 arguments have their size extended to 4 bytes and are passed as if they
 2271 // had been copied to the LSBs of a 32-bit register.
 2272 // For that, the value is passed extended to i32 (soft ABI) or to f32 (hard ABI).
2273 if (VA.needsCustom() &&
2274 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2275 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2276
2277 InVals.push_back(Val);
2278 }
2279
2280 return Chain;
2281}
2282
2283std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2284 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2285 bool IsTailCall, int SPDiff) const {
2286 SDValue DstAddr;
2287 MachinePointerInfo DstInfo;
2288 int32_t Offset = VA.getLocMemOffset();
2290
2291 if (IsTailCall) {
2292 Offset += SPDiff;
2293 auto PtrVT = getPointerTy(DAG.getDataLayout());
2294 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2295 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2296 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2297 DstInfo =
2299 } else {
2300 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2301 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2302 StackPtr, PtrOff);
2303 DstInfo =
2305 }
2306
2307 return std::make_pair(DstAddr, DstInfo);
2308}
2309
2310void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2311 SDValue Chain, SDValue &Arg,
2312 RegsToPassVector &RegsToPass,
2313 CCValAssign &VA, CCValAssign &NextVA,
2314 SDValue &StackPtr,
2315 SmallVectorImpl<SDValue> &MemOpChains,
2316 bool IsTailCall,
2317 int SPDiff) const {
2318 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2319 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2320 unsigned id = Subtarget->isLittle() ? 0 : 1;
2321 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2322
2323 if (NextVA.isRegLoc())
2324 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2325 else {
2326 assert(NextVA.isMemLoc());
2327 if (!StackPtr.getNode())
2328 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2330
2331 SDValue DstAddr;
2332 MachinePointerInfo DstInfo;
2333 std::tie(DstAddr, DstInfo) =
2334 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2335 MemOpChains.push_back(
2336 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2337 }
2338}
2339
2340static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2341 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2343}
2344
2345/// LowerCall - Lowering a call into a callseq_start <-
2346/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2347/// nodes.
2348SDValue
2349ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2350 SmallVectorImpl<SDValue> &InVals) const {
2351 SelectionDAG &DAG = CLI.DAG;
2352 SDLoc &dl = CLI.DL;
2354 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2356 SDValue Chain = CLI.Chain;
2357 SDValue Callee = CLI.Callee;
2358 bool &isTailCall = CLI.IsTailCall;
2359 CallingConv::ID CallConv = CLI.CallConv;
2360 bool doesNotRet = CLI.DoesNotReturn;
2361 bool isVarArg = CLI.IsVarArg;
2362
2366 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2367 bool isThisReturn = false;
2368 bool isCmseNSCall = false;
2369 bool isSibCall = false;
2370 bool PreferIndirect = false;
2371 bool GuardWithBTI = false;
2372
2373 // Lower 'returns_twice' calls to a pseudo-instruction.
2374 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2375 !Subtarget->noBTIAtReturnTwice())
2376 GuardWithBTI = AFI->branchTargetEnforcement();
2377
2378 // Determine whether this is a non-secure function call.
2379 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2380 isCmseNSCall = true;
2381
2382 // Disable tail calls if they're not supported.
2383 if (!Subtarget->supportsTailCall())
2384 isTailCall = false;
2385
2386 // For both the non-secure calls and the returns from a CMSE entry function,
2387 // the function needs to do some extra work after the call, or before the
2388 // return, respectively, thus it cannot end with a tail call.
2389 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2390 isTailCall = false;
2391
2392 if (isa<GlobalAddressSDNode>(Callee)) {
2393 // If we're optimizing for minimum size and the function is called three or
2394 // more times in this block, we can improve codesize by calling indirectly
2395 // as BLXr has a 16-bit encoding.
2396 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2397 if (CLI.CB) {
2398 auto *BB = CLI.CB->getParent();
2399 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2400 count_if(GV->users(), [&BB](const User *U) {
2401 return isa<Instruction>(U) &&
2402 cast<Instruction>(U)->getParent() == BB;
2403 }) > 2;
2404 }
2405 }
2406 if (isTailCall) {
2407 // Check if it's really possible to do a tail call.
2408 isTailCall = IsEligibleForTailCallOptimization(
2409 Callee, CallConv, isVarArg, isStructRet,
2410 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2411 PreferIndirect);
2412
2413 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2414 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2415 isSibCall = true;
2416
2417 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2418 // detected sibcalls.
2419 if (isTailCall)
2420 ++NumTailCalls;
2421 }
2422
2423 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2424 report_fatal_error("failed to perform tail call elimination on a call "
2425 "site marked musttail");
2426 // Analyze operands of the call, assigning locations to each operand.
2428 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2429 *DAG.getContext());
2430 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2431
2432 // Get a count of how many bytes are to be pushed on the stack.
2433 unsigned NumBytes = CCInfo.getStackSize();
2434
2435 // SPDiff is the byte offset of the call's argument area from the callee's.
2436 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2437 // by this amount for a tail call. In a sibling call it must be 0 because the
2438 // caller will deallocate the entire stack and the callee still expects its
2439 // arguments to begin at SP+0. Completely unused for non-tail calls.
2440 int SPDiff = 0;
2441
2442 if (isTailCall && !isSibCall) {
2443 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2444 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2445
2446 // Since callee will pop argument stack as a tail call, we must keep the
2447 // popped size 16-byte aligned.
2448 Align StackAlign = DAG.getDataLayout().getStackAlignment();
2449 NumBytes = alignTo(NumBytes, StackAlign);
2450
2451 // SPDiff will be negative if this tail call requires more space than we
2452 // would automatically have in our incoming argument space. Positive if we
2453 // can actually shrink the stack.
2454 SPDiff = NumReusableBytes - NumBytes;
2455
2456 // If this call requires more stack than we have available from
2457 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2458 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2459 AFI->setArgRegsSaveSize(-SPDiff);
2460 }
2461
2462 if (isSibCall) {
2463 // For sibling tail calls, memory operands are available in our caller's stack.
2464 NumBytes = 0;
2465 } else {
2466 // Adjust the stack pointer for the new arguments...
2467 // These operations are automatically eliminated by the prolog/epilog pass
2468 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2469 }
2470
2471 SDValue StackPtr =
2472 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2473
2474 RegsToPassVector RegsToPass;
2475 SmallVector<SDValue, 8> MemOpChains;
2476
2477 // During a tail call, stores to the argument area must happen after all of
2478 // the function's incoming arguments have been loaded because they may alias.
2479 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2480 // there's no point in doing so repeatedly so this tracks whether that's
2481 // happened yet.
2482 bool AfterFormalArgLoads = false;
2483
2484 // Walk the register/memloc assignments, inserting copies/loads. In the case
2485 // of tail call optimization, arguments are handled later.
2486 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2487 i != e;
2488 ++i, ++realArgIdx) {
2489 CCValAssign &VA = ArgLocs[i];
2490 SDValue Arg = OutVals[realArgIdx];
2491 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2492 bool isByVal = Flags.isByVal();
2493
2494 // Promote the value if needed.
2495 switch (VA.getLocInfo()) {
2496 default: llvm_unreachable("Unknown loc info!");
2497 case CCValAssign::Full: break;
2498 case CCValAssign::SExt:
2499 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2500 break;
2501 case CCValAssign::ZExt:
2502 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2503 break;
2504 case CCValAssign::AExt:
2505 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2506 break;
2507 case CCValAssign::BCvt:
2508 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2509 break;
2510 }
2511
2512 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2513 Chain = DAG.getStackArgumentTokenFactor(Chain);
2514 AfterFormalArgLoads = true;
2515 }
2516
2517 // f16 arguments have their size extended to 4 bytes and passed as if they
2518 // had been copied to the LSBs of a 32-bit register.
2519 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2520 if (VA.needsCustom() &&
2521 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2522 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2523 } else {
2524 // f16 arguments could have been extended prior to argument lowering.
2525 // Mask such arguments if this is a CMSE nonsecure call.
2526 auto ArgVT = Outs[realArgIdx].ArgVT;
2527 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2528 auto LocBits = VA.getLocVT().getSizeInBits();
2529 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2530 SDValue Mask =
2531 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2532 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2533 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2534 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2535 }
2536 }
2537
2538 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2539 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2540 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2541 DAG.getConstant(0, dl, MVT::i32));
2542 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2543 DAG.getConstant(1, dl, MVT::i32));
2544
2545 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2546 StackPtr, MemOpChains, isTailCall, SPDiff);
2547
2548 VA = ArgLocs[++i]; // skip ahead to next loc
2549 if (VA.isRegLoc()) {
2550 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2551 StackPtr, MemOpChains, isTailCall, SPDiff);
2552 } else {
2553 assert(VA.isMemLoc());
2554 SDValue DstAddr;
2555 MachinePointerInfo DstInfo;
2556 std::tie(DstAddr, DstInfo) =
2557 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2558 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2559 }
2560 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2561 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2562 StackPtr, MemOpChains, isTailCall, SPDiff);
2563 } else if (VA.isRegLoc()) {
2564 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2565 Outs[0].VT == MVT::i32) {
2566 assert(VA.getLocVT() == MVT::i32 &&
2567 "unexpected calling convention register assignment");
2568 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2569 "unexpected use of 'returned'");
2570 isThisReturn = true;
2571 }
2572 const TargetOptions &Options = DAG.getTarget().Options;
2573 if (Options.EmitCallSiteInfo)
2574 CSInfo.emplace_back(VA.getLocReg(), i);
2575 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2576 } else if (isByVal) {
2577 assert(VA.isMemLoc());
2578 unsigned offset = 0;
2579
2580 // True if this byval aggregate will be split between registers
2581 // and memory.
2582 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2583 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2584
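// The leading part of the byval aggregate is passed in registers; load each
// 4-byte word from the source and record it in RegsToPass.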
2585 if (CurByValIdx < ByValArgsCount) {
2586
2587 unsigned RegBegin, RegEnd;
2588 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2589
2590 EVT PtrVT =
2592 unsigned int i, j;
2593 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2594 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2595 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2596 SDValue Load =
2597 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2598 DAG.InferPtrAlign(AddArg));
2599 MemOpChains.push_back(Load.getValue(1));
2600 RegsToPass.push_back(std::make_pair(j, Load));
2601 }
2602
2603 // If the parameter size exceeds the register area, the "offset" value
2604 // helps us compute the stack slot for the remaining part properly.
2605 offset = RegEnd - RegBegin;
2606
2607 CCInfo.nextInRegsParam();
2608 }
2609
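// Any remainder of the byval aggregate that did not fit in registers is
// copied to its stack slot with an ARMISD::COPY_STRUCT_BYVAL node.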
2610 if (Flags.getByValSize() > 4*offset) {
2611 auto PtrVT = getPointerTy(DAG.getDataLayout());
2612 SDValue Dst;
2613 MachinePointerInfo DstInfo;
2614 std::tie(Dst, DstInfo) =
2615 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2616 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2617 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2618 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2619 MVT::i32);
2620 SDValue AlignNode =
2621 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2622
2623 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2624 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2625 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2626 Ops));
2627 }
2628 } else {
2629 assert(VA.isMemLoc());
2630 SDValue DstAddr;
2631 MachinePointerInfo DstInfo;
2632 std::tie(DstAddr, DstInfo) =
2633 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2634
2635 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2636 MemOpChains.push_back(Store);
2637 }
2638 }
2639
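// Chain all of the outgoing argument stores together so they are emitted
// before the call.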
2640 if (!MemOpChains.empty())
2641 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2642
2643 // Build a sequence of copy-to-reg nodes chained together with token chain
2644 // and flag operands which copy the outgoing args into the appropriate regs.
2645 SDValue InGlue;
2646 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2647 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2648 RegsToPass[i].second, InGlue);
2649 InGlue = Chain.getValue(1);
2650 }
2651
2652 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2653 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2654 // node so that legalize doesn't hack it.
2655 bool isDirect = false;
2656
2658 const Module *Mod = MF.getFunction().getParent();
2659 const GlobalValue *GVal = nullptr;
2660 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2661 GVal = G->getGlobal();
2662 bool isStub =
2663 !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
2664
2665 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2666 bool isLocalARMFunc = false;
2667 auto PtrVt = getPointerTy(DAG.getDataLayout());
2668
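// When long calls are enabled, materialize the callee address into a
// register: movw/movt for execute-only code, a constant-pool load otherwise.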
2669 if (Subtarget->genLongCalls()) {
2670 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2671 "long-calls codegen is not position independent!");
2672 // Handle a global address or an external symbol. If it's not one of
2673 // those, the target's already in a register, so we don't need to do
2674 // anything extra.
2675 if (isa<GlobalAddressSDNode>(Callee)) {
2676 if (Subtarget->genExecuteOnly()) {
2677 if (Subtarget->useMovt())
2678 ++NumMovwMovt;
2679 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2680 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2681 } else {
2682 // Create a constant pool entry for the callee address
2683 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2685 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2686
2687 // Get the address of the callee into a register
2688 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2689 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2690 Callee = DAG.getLoad(
2691 PtrVt, dl, DAG.getEntryNode(), Addr,
2693 }
2694 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2695 const char *Sym = S->getSymbol();
2696
2697 if (Subtarget->genExecuteOnly()) {
2698 if (Subtarget->useMovt())
2699 ++NumMovwMovt;
2700 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2701 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2702 } else {
2703 // Create a constant pool entry for the callee address
2704 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2706 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2707
2708 // Get the address of the callee into a register
2709 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2710 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2711 Callee = DAG.getLoad(
2712 PtrVt, dl, DAG.getEntryNode(), Addr,
2714 }
2715 }
2716 } else if (isa<GlobalAddressSDNode>(Callee)) {
2717 if (!PreferIndirect) {
2718 isDirect = true;
2719 bool isDef = GVal->isStrongDefinitionForLinker();
2720
2721 // ARM call to a local ARM function is predicable.
2722 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2723 // tBX takes a register source operand.
2724 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2725 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2726 Callee = DAG.getNode(
2727 ARMISD::WrapperPIC, dl, PtrVt,
2728 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2729 Callee = DAG.getLoad(
2730 PtrVt, dl, DAG.getEntryNode(), Callee,
2734 } else if (Subtarget->isTargetCOFF()) {
2735 assert(Subtarget->isTargetWindows() &&
2736 "Windows is the only supported COFF target");
2737 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2738 if (GVal->hasDLLImportStorageClass())
2739 TargetFlags = ARMII::MO_DLLIMPORT;
2740 else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
2741 TargetFlags = ARMII::MO_COFFSTUB;
2742 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2743 TargetFlags);
2744 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2745 Callee =
2746 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2747 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2749 } else {
2750 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2751 }
2752 }
2753 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2754 isDirect = true;
2755 // tBX takes a register source operand.
2756 const char *Sym = S->getSymbol();
2757 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2758 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2761 ARMPCLabelIndex, 4);
2762 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2763 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2764 Callee = DAG.getLoad(
2765 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2767 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2768 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2769 } else {
2770 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2771 }
2772 }
2773
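// CMSE non-secure calls can neither pass arguments on the stack nor return a
// value indirectly through a pointer; diagnose both cases.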
2774 if (isCmseNSCall) {
2775 assert(!isARMFunc && !isDirect &&
2776 "Cannot handle call to ARM function or direct call");
2777 if (NumBytes > 0) {
2779 "call to non-secure function would "
2780 "require passing arguments on stack",
2781 dl.getDebugLoc());
2782 DAG.getContext()->diagnose(Diag);
2783 }
2784 if (isStructRet) {
2787 "call to non-secure function would return value through pointer",
2788 dl.getDebugLoc());
2789 DAG.getContext()->diagnose(Diag);
2790 }
2791 }
2792
2793 // FIXME: handle tail calls differently.
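// Select the call opcode: BTI-guarded and CMSE non-secure calls get their own
// opcodes, and targets without BLX (pre-v5T) must use CALL_NOLINK for
// indirect calls.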
2794 unsigned CallOpc;
2795 if (Subtarget->isThumb()) {
2796 if (GuardWithBTI)
2797 CallOpc = ARMISD::t2CALL_BTI;
2798 else if (isCmseNSCall)
2799 CallOpc = ARMISD::tSECALL;
2800 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2801 CallOpc = ARMISD::CALL_NOLINK;
2802 else
2803 CallOpc = ARMISD::CALL;
2804 } else {
2805 if (!isDirect && !Subtarget->hasV5TOps())
2806 CallOpc = ARMISD::CALL_NOLINK;
2807 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2808 // Emit regular call when code size is the priority
2809 !Subtarget->hasMinSize())
2810 // "mov lr, pc; b _foo" to avoid confusing the RSP
2811 CallOpc = ARMISD::CALL_NOLINK;
2812 else
2813 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2814 }
2815
2816 // We don't usually want to end the call-sequence here because we would tidy
2817 // the frame up *after* the call, however in the ABI-changing tail-call case
2818 // we've carefully laid out the parameters so that when sp is reset they'll be
2819 // in the correct location.
2820 if (isTailCall && !isSibCall) {
2821 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2822 InGlue = Chain.getValue(1);
2823 }
2824
2825 std::vector<SDValue> Ops;
2826 Ops.push_back(Chain);
2827 Ops.push_back(Callee);
2828
2829 if (isTailCall) {
2830 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2831 }
2832
2833 // Add argument registers to the end of the list so that they are known live
2834 // into the call.
2835 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2836 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2837 RegsToPass[i].second.getValueType()));
2838
2839 // Add a register mask operand representing the call-preserved registers.
2840 const uint32_t *Mask;
2841 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2842 if (isThisReturn) {
2843 // For 'this' returns, use the R0-preserving mask if applicable
2844 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2845 if (!Mask) {
2846 // Set isThisReturn to false if the calling convention is not one that
2847 // allows 'returned' to be modeled in this way, so LowerCallResult does
2848 // not try to pass 'this' straight through
2849 isThisReturn = false;
2850 Mask = ARI->getCallPreservedMask(MF, CallConv);
2851 }
2852 } else
2853 Mask = ARI->getCallPreservedMask(MF, CallConv);
2854
2855 assert(Mask && "Missing call preserved mask for calling convention");
2856 Ops.push_back(DAG.getRegisterMask(Mask));
2857
2858 if (InGlue.getNode())
2859 Ops.push_back(InGlue);
2860
2861 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2862 if (isTailCall) {
2864 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2865 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2866 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2867 return Ret;
2868 }
2869
2870 // Returns a chain and a flag for retval copy to use.
2871 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2872 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2873 InGlue = Chain.getValue(1);
2874 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2875
2876 // If we're guaranteeing tail-calls will be honoured, the callee must
2877 // pop its own argument stack on return. But this call is *not* a tail call so
2878 // we need to undo that after it returns to restore the status-quo.
2879 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2880 uint64_t CalleePopBytes =
2881 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2882
2883 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2884 if (!Ins.empty())
2885 InGlue = Chain.getValue(1);
2886
2887 // Handle result values, copying them out of physregs into vregs that we
2888 // return.
2889 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2890 InVals, isThisReturn,
2891 isThisReturn ? OutVals[0] : SDValue());
2892}
2893
2894/// HandleByVal - Every parameter *after* a byval parameter is passed
2895/// on the stack. Remember the next parameter register to allocate,
2896 /// and then confiscate the rest of the parameter registers to ensure
2897/// this.
2898void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2899 Align Alignment) const {
2900 // Byval (as with any stack) slots are always at least 4 byte aligned.
2901 Alignment = std::max(Alignment, Align(4));
2902
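// Try to allocate the first GPR for the byval argument; if no registers are
// left, the whole argument is passed on the stack.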
2903 unsigned Reg = State->AllocateReg(GPRArgRegs);
2904 if (!Reg)
2905 return;
2906
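// Skip registers as needed so the byval argument starts at a register that
// satisfies its alignment.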
2907 unsigned AlignInRegs = Alignment.value() / 4;
2908 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2909 for (unsigned i = 0; i < Waste; ++i)
2910 Reg = State->AllocateReg(GPRArgRegs);
2911
2912 if (!Reg)
2913 return;
2914
2915 unsigned Excess = 4 * (ARM::R4 - Reg);
2916
2917 // Special case when NSAA != SP and the parameter size is greater than the
2918 // size of all remaining GPR regs. In that case we can't split the
2919 // parameter, we must send it to the stack. We also must set the NCRN to R4,
2920 // so all remaining registers are wasted.
2921 const unsigned NSAAOffset = State->getStackSize();
2922 if (NSAAOffset != 0 && Size > Excess) {
2923 while (State->AllocateReg(GPRArgRegs))
2924 ;
2925 return;
2926 }
2927
2928 // The first register for a byval parameter is the first register that
2929 // wasn't allocated before this method call, i.e. "Reg".
2930 // If the parameter is small enough to fit in the range [Reg, R4), the end
2931 // (first-past-last) register is Reg + param-size-in-regs; otherwise the
2932 // parameter is split between registers and the stack, and the end register
2933 // is R4 in that case.
2934 unsigned ByValRegBegin = Reg;
2935 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2936 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2937 // Note: the first register was already allocated above, so allocate only
2938 // the remaining registers we need.
2939 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2940 State->AllocateReg(GPRArgRegs);
2941 // A byval parameter that is split between registers and memory needs its
2942 // size truncated here.
2943 // In the case where the entire structure fits in registers, we set the
2944 // size in memory to zero.
2945 Size = std::max<int>(Size - Excess, 0);
2946}
2947
2948/// MatchingStackOffset - Return true if the given stack call argument is
2949/// already available in the same position (relatively) of the caller's
2950/// incoming argument stack.
2951static
2954 const TargetInstrInfo *TII) {
2955 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2956 int FI = std::numeric_limits<int>::max();
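// Determine the frame index the argument value comes from: either a
// CopyFromReg of a vreg defined by a load from a stack slot, or a direct
// load from a frame index.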
2957 if (Arg.getOpcode() == ISD::CopyFromReg) {
2958 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2959 if (!VR.isVirtual())
2960 return false;
2961 MachineInstr *Def = MRI->getVRegDef(VR);
2962 if (!Def)
2963 return false;
2964 if (!Flags.isByVal()) {
2965 if (!TII->isLoadFromStackSlot(*Def, FI))
2966 return false;
2967 } else {
2968 return false;
2969 }
2970 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2971 if (Flags.isByVal())
2972 // ByVal argument is passed in as a pointer but it's now being
2973 // dereferenced. e.g.
2974 // define @foo(%struct.X* %A) {
2975 // tail call @bar(%struct.X* byval %A)
2976 // }
2977 return false;
2978 SDValue Ptr = Ld->getBasePtr();
2979 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2980 if (!FINode)
2981 return false;
2982 FI = FINode->getIndex();
2983 } else
2984 return false;
2985
2986 assert(FI != std::numeric_limits<int>::max());
2987 if (!MFI.isFixedObjectIndex(FI))
2988 return false;
2989 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2990}
2991
2992/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2993/// for tail call optimization. Targets which want to do tail call
2994/// optimization should implement this function.
2995bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2996 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2997 bool isCalleeStructRet, bool isCallerStructRet,
2999 const SmallVectorImpl<SDValue> &OutVals,
3001 const bool isIndirect) const {
3003 const Function &CallerF = MF.getFunction();
3004 CallingConv::ID CallerCC = CallerF.getCallingConv();
3005
3006 assert(Subtarget->supportsTailCall());
3007
3008 // Indirect tail calls cannot be optimized for Thumb1 if the args
3009 // to the call take up r0-r3. The reason is that there are no legal registers
3010 // left to hold the pointer to the function to be called.
3011 // Similarly, if the function uses return address sign and authentication,
3012 // r12 is needed to hold the PAC and is not available to hold the callee
3013 // address.
3014 if (Outs.size() >= 4 &&
3015 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
3016 if (Subtarget->isThumb1Only())
3017 return false;
3018 // Conservatively assume the function spills LR.
3020 return false;
3021 }
3022
3023 // Look for obvious safe cases to perform tail call optimization that do not
3024 // require ABI changes. This is what gcc calls sibcall.
3025
3026 // Exception-handling functions need a special set of instructions to indicate
3027 // a return to the hardware. Tail-calling another function would probably
3028 // break this.
3029 if (CallerF.hasFnAttribute("interrupt"))
3030 return false;
3031
3032 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3033 return CalleeCC == CallerCC;
3034
3035 // Also avoid sibcall optimization if either caller or callee uses struct
3036 // return semantics.
3037 if (isCalleeStructRet || isCallerStructRet)
3038 return false;
3039
3040 // Externally-defined functions with weak linkage should not be
3041 // tail-called on ARM when the OS does not support dynamic
3042 // pre-emption of symbols, as the AAELF spec requires normal calls
3043 // to undefined weak functions to be replaced with a NOP or jump to the
3044 // next instruction. The behaviour of branch instructions in this
3045 // situation (as used for tail calls) is implementation-defined, so we
3046 // cannot rely on the linker replacing the tail call with a return.
3047 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3048 const GlobalValue *GV = G->getGlobal();
3050 if (GV->hasExternalWeakLinkage() &&
3051 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3052 return false;
3053 }
3054
3055 // Check that the call results are passed in the same way.
3056 LLVMContext &C = *DAG.getContext();
3058 getEffectiveCallingConv(CalleeCC, isVarArg),
3059 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3060 CCAssignFnForReturn(CalleeCC, isVarArg),
3061 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3062 return false;
3063 // The callee has to preserve all registers the caller needs to preserve.
3064 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3065 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3066 if (CalleeCC != CallerCC) {
3067 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3068 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3069 return false;
3070 }
3071
3072 // If Caller's vararg or byval argument has been split between registers and
3073 // stack, do not perform tail call, since part of the argument is in caller's
3074 // local frame.
3075 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3076 if (AFI_Caller->getArgRegsSaveSize())
3077 return false;
3078
3079 // If the callee takes no arguments then go on to check the results of the
3080 // call.
3081 if (!Outs.empty()) {
3082 // Check if stack adjustment is needed. For now, do not do this if any
3083 // argument is passed on the stack.
3085 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3086 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3087 if (CCInfo.getStackSize()) {
3088 // Check if the arguments are already laid out in the right way as
3089 // the caller's fixed stack objects.
3090 MachineFrameInfo &MFI = MF.getFrameInfo();
3091 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3092 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3093 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3094 i != e;
3095 ++i, ++realArgIdx) {
3096 CCValAssign &VA = ArgLocs[i];
3097 EVT RegVT = VA.getLocVT();
3098 SDValue Arg = OutVals[realArgIdx];
3099 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3101 return false;
3102 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3103 // f64 and vector types are split into multiple registers or
3104 // register/stack-slot combinations. The types will not match
3105 // the registers; give up on memory f64 refs until we figure
3106 // out what to do about this.
3107 if (!VA.isRegLoc())
3108 return false;
3109 if (!ArgLocs[++i].isRegLoc())
3110 return false;
3111 if (RegVT == MVT::v2f64) {
3112 if (!ArgLocs[++i].isRegLoc())
3113 return false;
3114 if (!ArgLocs[++i].isRegLoc())
3115 return false;
3116 }
3117 } else if (!VA.isRegLoc()) {
3118 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3119 MFI, MRI, TII))
3120 return false;
3121 }
3122 }
3123 }
3124
3125 const MachineRegisterInfo &MRI = MF.getRegInfo();
3126 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3127 return false;
3128 }
3129
3130 return true;
3131}
3132
3133bool
3134ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3135 MachineFunction &MF, bool isVarArg,
3137 LLVMContext &Context) const {
3139 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3140 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3141}
3142
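// Build an ARMISD::INTRET_GLUE return, inserting the LR offset required by
// the interrupt kind as the second operand.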
3143 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
3144 const SDLoc &DL, SelectionDAG &DAG) {
3145 const MachineFunction &MF = DAG.getMachineFunction();
3146 const Function &F = MF.getFunction();
3147
3148 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3149
3150 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3151 // version of the "preferred return address". These offsets affect the return
3152 // instruction if this is a return from PL1 without hypervisor extensions.
3153 // IRQ/FIQ: +4 "subs pc, lr, #4"
3154 // SWI: 0 "subs pc, lr, #0"
3155 // ABORT: +4 "subs pc, lr, #4"
3156 // UNDEF: +4/+2 "subs pc, lr, #0"
3157 // UNDEF varies depending on where the exception came from ARM or Thumb
3158 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3159
3160 int64_t LROffset;
3161 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3162 IntKind == "ABORT")
3163 LROffset = 4;
3164 else if (IntKind == "SWI" || IntKind == "UNDEF")
3165 LROffset = 0;
3166 else
3167 report_fatal_error("Unsupported interrupt attribute. If present, value "
3168 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3169
3170 RetOps.insert(RetOps.begin() + 1,
3171 DAG.getConstant(LROffset, DL, MVT::i32, false));
3172
3173 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3174}
3175
3176SDValue
3177ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3178 bool isVarArg,
3180 const SmallVectorImpl<SDValue> &OutVals,
3181 const SDLoc &dl, SelectionDAG &DAG) const {
3182 // CCValAssign - represent the assignment of the return value to a location.
3184
3185 // CCState - Info about the registers and stack slots.
3186 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3187 *DAG.getContext());
3188
3189 // Analyze outgoing return values.
3190 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3191
3192 SDValue Glue;
3194 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3195 bool isLittleEndian = Subtarget->isLittle();
3196
3199 AFI->setReturnRegsCount(RVLocs.size());
3200
3201 // Report error if cmse entry function returns structure through first ptr arg.
3202 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3203 // Note: using an empty SDLoc(), as the first line of the function is a
3204 // better place to report than the last line.
3207 "secure entry function would return value through pointer",
3208 SDLoc().getDebugLoc());
3209 DAG.getContext()->diagnose(Diag);
3210 }
3211
3212 // Copy the result values into the output registers.
3213 for (unsigned i = 0, realRVLocIdx = 0;
3214 i != RVLocs.size();
3215 ++i, ++realRVLocIdx) {
3216 CCValAssign &VA = RVLocs[i];
3217 assert(VA.isRegLoc() && "Can only return in registers!");
3218
3219 SDValue Arg = OutVals[realRVLocIdx];
3220 bool ReturnF16 = false;
3221
3222 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3223 // Half-precision return values can be returned like this:
3224 //
3225 // t11 f16 = fadd ...
3226 // t12: i16 = bitcast t11
3227 // t13: i32 = zero_extend t12
3228 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3229 //
3230 // to avoid code generation for bitcasts, we simply set Arg to the node
3231 // that produces the f16 value, t11 in this case.
3232 //
3233 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3234 SDValue ZE = Arg.getOperand(0);
3235 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3236 SDValue BC = ZE.getOperand(0);
3237 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3238 Arg = BC.getOperand(0);
3239 ReturnF16 = true;
3240 }
3241 }
3242 }
3243 }
3244
3245 switch (VA.getLocInfo()) {
3246 default: llvm_unreachable("Unknown loc info!");
3247 case CCValAssign::Full: break;
3248 case CCValAssign::BCvt:
3249 if (!ReturnF16)
3250 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3251 break;
3252 }
3253
3254 // Mask f16 arguments if this is a CMSE nonsecure entry.
3255 auto RetVT = Outs[realRVLocIdx].ArgVT;
3256 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3257 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3258 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3259 } else {
3260 auto LocBits = VA.getLocVT().getSizeInBits();
3261 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3262 SDValue Mask =
3263 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3264 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3265 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3266 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3267 }
3268 }
3269
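// f64 and v2f64 results are returned in pairs of i32 GPRs via VMOVRRD; a
// v2f64 result is first split into its two f64 halves.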
3270 if (VA.needsCustom() &&
3271 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3272 if (VA.getLocVT() == MVT::v2f64) {
3273 // Extract the first half and return it in two registers.
3274 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3275 DAG.getConstant(0, dl, MVT::i32));
3276 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3277 DAG.getVTList(MVT::i32, MVT::i32), Half);
3278
3279 Chain =
3280 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3281 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3282 Glue = Chain.getValue(1);
3283 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3284 VA = RVLocs[++i]; // skip ahead to next loc
3285 Chain =
3286 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3287 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3288 Glue = Chain.getValue(1);
3289 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3290 VA = RVLocs[++i]; // skip ahead to next loc
3291
3292 // Extract the 2nd half and fall through to handle it as an f64 value.
3293 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3294 DAG.getConstant(1, dl, MVT::i32));
3295 }
3296 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3297 // available.
3298 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3299 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3300 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3301 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3302 Glue = Chain.getValue(1);
3303 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3304 VA = RVLocs[++i]; // skip ahead to next loc
3305 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3306 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3307 } else
3308 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3309
3310 // Guarantee that all emitted copies are
3311 // stuck together, avoiding something bad.
3312 Glue = Chain.getValue(1);
3313 RetOps.push_back(DAG.getRegister(
3314 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3315 }
3316 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3317 const MCPhysReg *I =
3318 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3319 if (I) {
3320 for (; *I; ++I) {
3321 if (ARM::GPRRegClass.contains(*I))
3322 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3323 else if (ARM::DPRRegClass.contains(*I))
3325 else
3326 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3327 }
3328 }
3329
3330 // Update chain and glue.
3331 RetOps[0] = Chain;
3332 if (Glue.getNode())
3333 RetOps.push_back(Glue);
3334
3335 // CPUs which aren't M-class use a special sequence to return from
3336 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3337 // though we use "subs pc, lr, #N").
3338 //
3339 // M-class CPUs actually use a normal return sequence with a special
3340 // (hardware-provided) value in LR, so the normal code path works.
3341 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3342 !Subtarget->isMClass()) {
3343 if (Subtarget->isThumb1Only())
3344 report_fatal_error("interrupt attribute is not supported in Thumb1");
3345 return LowerInterruptReturn(RetOps, dl, DAG);
3346 }
3347
3350 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3351}
3352
3353bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3354 if (N->getNumValues() != 1)
3355 return false;
3356 if (!N->hasNUsesOfValue(1, 0))
3357 return false;
3358
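// Walk from the candidate value through the CopyToReg (or VMOVRRD / BITCAST)
// that consumes it, requiring every use to end in a RET_GLUE or INTRET_GLUE.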
3359 SDValue TCChain = Chain;
3360 SDNode *Copy = *N->use_begin();
3361 if (Copy->getOpcode() == ISD::CopyToReg) {
3362 // If the copy has a glue operand, we conservatively assume it isn't safe to
3363 // perform a tail call.
3364 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3365 return false;
3366 TCChain = Copy->getOperand(0);
3367 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3368 SDNode *VMov = Copy;
3369 // f64 returned in a pair of GPRs.
3371 for (SDNode *U : VMov->uses()) {
3372 if (U->getOpcode() != ISD::CopyToReg)
3373 return false;
3374 Copies.insert(U);
3375 }
3376 if (Copies.size() > 2)
3377 return false;
3378
3379 for (SDNode *U : VMov->uses()) {
3380 SDValue UseChain = U->getOperand(0);
3381 if (Copies.count(UseChain.getNode()))
3382 // Second CopyToReg
3383 Copy = U;
3384 else {
3385 // We are at the top of this chain.
3386 // If the copy has a glue operand, we conservatively assume it
3387 // isn't safe to perform a tail call.
3388 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3389 return false;
3390 // First CopyToReg
3391 TCChain = UseChain;
3392 }
3393 }
3394 } else if (Copy->getOpcode() == ISD::BITCAST) {
3395 // f32 returned in a single GPR.
3396 if (!Copy->hasOneUse())
3397 return false;
3398 Copy = *Copy->use_begin();
3399 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3400 return false;
3401 // If the copy has a glue operand, we conservatively assume it isn't safe to
3402 // perform a tail call.
3403 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3404 return false;
3405 TCChain = Copy->getOperand(0);
3406 } else {
3407 return false;
3408 }
3409
3410 bool HasRet = false;
3411 for (const SDNode *U : Copy->uses()) {
3412 if (U->getOpcode() != ARMISD::RET_GLUE &&
3413 U->getOpcode() != ARMISD::INTRET_GLUE)
3414 return false;
3415 HasRet = true;
3416 }
3417
3418 if (!HasRet)
3419 return false;
3420
3421 Chain = TCChain;
3422 return true;
3423}
3424
3425bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3426 if (!Subtarget->supportsTailCall())
3427 return false;
3428
3429 if (!CI->isTailCall())
3430 return false;
3431
3432 return true;
3433}
3434
3435 // Writing a 64-bit value, so we need to split it into two 32-bit values
3436 // first and pass the low and high parts through.
3438 SDLoc DL(Op);
3439 SDValue WriteValue = Op->getOperand(2);
3440
3441 // This function is only supposed to be called for i64 type argument.
3442 assert(WriteValue.getValueType() == MVT::i64
3443 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3444
3445 SDValue Lo, Hi;
3446 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3447 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3448 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3449}
3450
3451// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3452// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3453// one of the above mentioned nodes. It has to be wrapped because otherwise
3454// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3455// be used to form addressing mode. These wrapped nodes will be selected
3456// into MOVi.
3457SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3458 SelectionDAG &DAG) const {
3459 EVT PtrVT = Op.getValueType();
3460 // FIXME there is no actual debug info here
3461 SDLoc dl(Op);
3462 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3463 SDValue Res;
3464
3465 // When generating execute-only code Constant Pools must be promoted to the
3466 // global data section. It's a bit ugly that we can't share them across basic
3467 // blocks, but this way we guarantee that execute-only behaves correctly with
3468 // position-independent addressing modes.
3469 if (Subtarget->genExecuteOnly()) {
3470 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3471 auto T = const_cast<Type*>(CP->getType());
3472 auto C = const_cast<Constant*>(CP->getConstVal());
3473 auto M = const_cast<Module*>(DAG.getMachineFunction().
3475 auto GV = new GlobalVariable(
3476 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3479 Twine(AFI->createPICLabelUId())
3480 );
3481 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3482 dl, PtrVT);
3483 return LowerGlobalAddress(GA, DAG);
3484 }
3485
3486 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3487 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3488 Align CPAlign = CP->getAlign();
3489 if (Subtarget->isThumb1Only())
3490 CPAlign = std::max(CPAlign, Align(4));
3491 if (CP->isMachineConstantPoolEntry())
3492 Res =
3493 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3494 else
3495 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3496 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3497}
3498
3500 // If we don't have a 32-bit pc-relative branch instruction then the jump
3501 // table consists of block addresses. Usually this is inline, but for
3502 // execute-only it must be placed out-of-line.
3503 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3506}
3507
3508SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3509 SelectionDAG &DAG) const {
3512 unsigned ARMPCLabelIndex = 0;
3513 SDLoc DL(Op);
3514 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3515 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3516 SDValue CPAddr;
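// Without position independence a plain constant-pool load suffices; for
// PIC/ROPI the block address is loaded from a PIC constant-pool entry and
// adjusted with PIC_ADD.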
3517 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3518 if (!IsPositionIndependent) {
3519 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3520 } else {
3521 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3522 ARMPCLabelIndex = AFI->createPICLabelUId();
3524 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3525 ARMCP::CPBlockAddress, PCAdj);
3526 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3527 }
3528 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3529 SDValue Result = DAG.getLoad(
3530 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3532 if (!IsPositionIndependent)
3533 return Result;
3534 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3535 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3536}
3537
3538/// Convert a TLS address reference into the correct sequence of loads
3539/// and calls to compute the variable's address for Darwin, and return an
3540/// SDValue containing the final node.
3541
3542/// Darwin only has one TLS scheme which must be capable of dealing with the
3543/// fully general situation, in the worst case. This means:
3544/// + "extern __thread" declaration.
3545/// + Defined in a possibly unknown dynamic library.
3546///
3547/// The general system is that each __thread variable has a [3 x i32] descriptor
3548/// which contains information used by the runtime to calculate the address. The
3549/// only part of this the compiler needs to know about is the first word, which
3550/// contains a function pointer that must be called with the address of the
3551/// entire descriptor in "r0".
3552///
3553/// Since this descriptor may be in a different unit, in general access must
3554/// proceed along the usual ARM rules. A common sequence to produce is:
3555///
3556/// movw rT1, :lower16:_var$non_lazy_ptr
3557/// movt rT1, :upper16:_var$non_lazy_ptr
3558/// ldr r0, [rT1]
3559/// ldr rT2, [r0]
3560/// blx rT2
3561/// [...address now in r0...]
3562SDValue
3563ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3564 SelectionDAG &DAG) const {
3565 assert(Subtarget->isTargetDarwin() &&
3566 "This function expects a Darwin target");
3567 SDLoc DL(Op);
3568
3569 // First step is to get the address of the actual global symbol. This is where
3570 // the TLS descriptor lives.
3571 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3572
3573 // The first entry in the descriptor is a function pointer that we must call
3574 // to obtain the address of the variable.
3575 SDValue Chain = DAG.getEntryNode();
3576 SDValue FuncTLVGet = DAG.getLoad(
3577 MVT::i32, DL, Chain, DescAddr,
3581 Chain = FuncTLVGet.getValue(1);
3582
3584 MachineFrameInfo &MFI = F.getFrameInfo();
3585 MFI.setAdjustsStack(true);
3586
3587 // TLS calls preserve all registers except those that absolutely must be
3588 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3589 // silly).
3590 auto TRI =
3592 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3594
3595 // Finally, we can make the call. This is just a degenerate version of a
3596 // normal call node: r0 takes the address of the descriptor, and the call
3597 // returns the address of the variable in this thread.
3598 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3599 Chain =
3600 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3601 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3602 DAG.getRegisterMask(Mask), Chain.getValue(1));
3603 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3604}
3605
3606SDValue
3607ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3608 SelectionDAG &DAG) const {
3609 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3610
3611 SDValue Chain = DAG.getEntryNode();
3612 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3613 SDLoc DL(Op);
3614
3615 // Load the current TEB (thread environment block)
3616 SDValue Ops[] = {Chain,
3617 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3618 DAG.getTargetConstant(15, DL, MVT::i32),
3619 DAG.getTargetConstant(0, DL, MVT::i32),
3620 DAG.getTargetConstant(13, DL, MVT::i32),
3621 DAG.getTargetConstant(0, DL, MVT::i32),
3622 DAG.getTargetConstant(2, DL, MVT::i32)};
3623 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3624 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3625
3626 SDValue TEB = CurrentTEB.getValue(0);
3627 Chain = CurrentTEB.getValue(1);
3628
3629 // Load the ThreadLocalStoragePointer from the TEB
3630 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3631 SDValue TLSArray =
3632 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3633 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3634
3635 // The pointer to the thread's TLS data area is found by using the TLS index,
3636 // scaled by 4, as an offset into the TLSArray.
3637
3638 // Load the TLS index from the C runtime
3639 SDValue TLSIndex =
3640 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3641 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3642 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3643
3644 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3645 DAG.getConstant(2, DL, MVT::i32));
3646 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3647 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3649
3650 // Get the offset of the start of the .tls section (section base)
3651 const auto *GA = cast<GlobalAddressSDNode>(