1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
86#include "llvm/IR/Type.h"
87#include "llvm/IR/User.h"
88#include "llvm/IR/Value.h"
89#include "llvm/MC/MCInstrDesc.h"
92#include "llvm/MC/MCSchedule.h"
99#include "llvm/Support/Debug.h"
107#include <algorithm>
108#include <cassert>
109#include <cstdint>
110#include <cstdlib>
111#include <iterator>
112#include <limits>
113#include <optional>
114#include <string>
115#include <tuple>
116#include <utility>
117#include <vector>
118
119using namespace llvm;
120using namespace llvm::PatternMatch;
121
122#define DEBUG_TYPE "arm-isel"
123
124STATISTIC(NumTailCalls, "Number of tail calls");
125STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
126STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
127STATISTIC(NumConstpoolPromoted,
128 "Number of constants with their storage promoted into constant pools");
129
130static cl::opt<bool>
131ARMInterworking("arm-interworking", cl::Hidden,
132 cl::desc("Enable / disable ARM interworking (for debugging only)"),
133 cl::init(true));
134
136 "arm-promote-constant", cl::Hidden,
137 cl::desc("Enable / disable promotion of unnamed_addr constants into "
138 "constant pools"),
139 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
141 "arm-promote-constant-max-size", cl::Hidden,
142 cl::desc("Maximum size of constant to promote into a constant pool"),
143 cl::init(64));
145 "arm-promote-constant-max-total", cl::Hidden,
146 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
147 cl::init(128));
148
150MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
151 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
152 cl::init(2));
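// With the limit at its default of 2, interleaved accesses are only lowered to
// the two-way VLD2/VST2 forms; raising it would presumably also permit the
// four-way VLD4/VST4 forms.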
153
154// The APCS parameter registers.
155static const MCPhysReg GPRArgRegs[] = {
156 ARM::R0, ARM::R1, ARM::R2, ARM::R3
157};
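// Both APCS and AAPCS pass the first four integer-sized arguments in r0-r3;
// for example, in f(int a, int b, int c, int d, int e) the first four land in
// r0-r3 and e is passed on the stack.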
158
159void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
160 if (VT != PromotedLdStVT) {
162 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
163
165 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
166 }
167
168 MVT ElemTy = VT.getVectorElementType();
169 if (ElemTy != MVT::f64)
173 if (ElemTy == MVT::i32) {
178 } else {
183 }
192 if (VT.isInteger()) {
196 }
197
198 // Neon does not support vector divide/remainder operations.
207
208 if (!VT.isFloatingPoint() &&
209 VT != MVT::v2i64 && VT != MVT::v1i64)
210 for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
211 setOperationAction(Opcode, VT, Legal);
212 if (!VT.isFloatingPoint())
213 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
214 setOperationAction(Opcode, VT, Legal);
215}
216
217void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
218 addRegisterClass(VT, &ARM::DPRRegClass);
219 addTypeForNEON(VT, MVT::f64);
220}
221
222void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
223 addRegisterClass(VT, &ARM::DPairRegClass);
224 addTypeForNEON(VT, MVT::v2f64);
225}
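// As an illustration of the promotion above: addDRTypeForNEON(MVT::v8i8) keeps
// v8i8 values in 64-bit D registers and promotes their loads/stores to f64,
// while addQRTypeForNEON promotes the 128-bit Q-register types to v2f64, so
// the memory accesses are selected as plain D/Q-register loads and stores.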
226
227void ARMTargetLowering::setAllExpand(MVT VT) {
228 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
229 setOperationAction(Opc, VT, Expand);
230
231 // We support these really simple operations even on types where all
232 // the actual arithmetic has to be broken down into simpler
233 // operations or turned into library calls.
238}
239
240void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
241 LegalizeAction Action) {
242 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
243 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
244 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
245}
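// For example, addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal) marks any-, zero-
// and sign-extending loads from v8i8 to v8i16 as legal, so an extend of a
// v8i8 load can be matched as a single widening load.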
246
247void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
248 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
249
250 for (auto VT : IntTypes) {
251 addRegisterClass(VT, &ARM::MQPRRegClass);
281
282 // No native support for these.
292
293 // Vector reductions
303
304 if (!HasMVEFP) {
309 } else {
312 }
313
314 // Pre and Post inc are supported on loads and stores
315 for (unsigned im = (unsigned)ISD::PRE_INC;
321 }
322 }
323
324 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
325 for (auto VT : FloatTypes) {
326 addRegisterClass(VT, &ARM::MQPRRegClass);
327 if (!HasMVEFP)
328 setAllExpand(VT);
329
330 // These are legal or custom whether we have MVE.fp or not
343
344 // Pre and Post inc are supported on loads and stores
345 for (unsigned im = (unsigned)ISD::PRE_INC;
351 }
352
353 if (HasMVEFP) {
361
362 // No native support for these.
376 }
377 }
378
379 // Custom expand vector reductions narrower than the legal types to prevent
380 // false zero items being added.
389
390 // We 'support' these types up to bitcast/load/store level, regardless of
391 // MVE integer-only / float support. Only FP data processing on the FP
392 // vector types is inhibited at the integer-only level.
393 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
394 for (auto VT : LongTypes) {
395 addRegisterClass(VT, &ARM::MQPRRegClass);
396 setAllExpand(VT);
402 }
404
405 // We can do bitwise operations on v2i64 vectors
406 setOperationAction(ISD::AND, MVT::v2i64, Legal);
407 setOperationAction(ISD::OR, MVT::v2i64, Legal);
408 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
409
410 // It is legal to extload from v4i8 to v4i16 or v4i32.
411 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
412 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
413 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
414
415 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
421
422 // Some truncating stores are legal too.
423 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
424 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
425 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
426
427 // Pre and Post inc on these are legal, given the correct extends
428 for (unsigned im = (unsigned)ISD::PRE_INC;
430 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
435 }
436 }
437
438 // Predicate types
439 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
440 for (auto VT : pTypes) {
441 addRegisterClass(VT, &ARM::VCCRRegClass);
456
457 if (!HasMVEFP) {
462 }
463 }
467 setOperationAction(ISD::OR, MVT::v2i1, Expand);
473
482}
483
485 const ARMSubtarget &STI)
486 : TargetLowering(TM), Subtarget(&STI) {
487 RegInfo = Subtarget->getRegisterInfo();
488 Itins = Subtarget->getInstrItineraryData();
489
492
493 if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
494 !Subtarget->isTargetWatchOS() && !Subtarget->isTargetDriverKit()) {
495 bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
496 for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
497 setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
498 IsHFTarget ? CallingConv::ARM_AAPCS_VFP
500 }
501
502 if (Subtarget->isTargetMachO()) {
503 // Uses VFP for Thumb libfuncs if available.
504 if (Subtarget->isThumb() && Subtarget->hasVFP2Base() &&
505 Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
506 static const struct {
507 const RTLIB::Libcall Op;
508 const char * const Name;
509 const ISD::CondCode Cond;
510 } LibraryCalls[] = {
511 // Single-precision floating-point arithmetic.
512 { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
513 { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
514 { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
515 { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
516
517 // Double-precision floating-point arithmetic.
518 { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
519 { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
520 { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
521 { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
522
523 // Single-precision comparisons.
524 { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE },
525 { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE },
526 { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE },
527 { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE },
528 { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE },
529 { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE },
530 { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE },
531
532 // Double-precision comparisons.
533 { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE },
534 { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE },
535 { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE },
536 { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE },
537 { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE },
538 { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE },
539 { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE },
540
541 // Floating-point to integer conversions.
542 // i64 conversions are done via library routines even when generating VFP
543 // instructions, so use the same ones.
544 { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID },
545 { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
546 { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID },
547 { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
548
549 // Conversions between floating types.
550 { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID },
551 { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID },
552
553 // Integer to floating-point conversions.
554 // i64 conversions are done via library routines even when generating VFP
555 // instructions, so use the same ones.
556 // FIXME: There appears to be some naming inconsistency in ARM libgcc:
557 // e.g., __floatunsidf vs. __floatunssidfvfp.
558 { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID },
559 { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
560 { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID },
561 { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
562 };
563
564 for (const auto &LC : LibraryCalls) {
565 setLibcallName(LC.Op, LC.Name);
566 if (LC.Cond != ISD::SETCC_INVALID)
567 setCmpLibcallCC(LC.Op, LC.Cond);
568 }
569 }
570 }
571
572 // These libcalls are not available in 32-bit.
573 setLibcallName(RTLIB::SHL_I128, nullptr);
574 setLibcallName(RTLIB::SRL_I128, nullptr);
575 setLibcallName(RTLIB::SRA_I128, nullptr);
576 setLibcallName(RTLIB::MUL_I128, nullptr);
577 setLibcallName(RTLIB::MULO_I64, nullptr);
578 setLibcallName(RTLIB::MULO_I128, nullptr);
579
580 // RTLIB
581 if (Subtarget->isAAPCS_ABI() &&
582 (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
583 Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
584 static const struct {
585 const RTLIB::Libcall Op;
586 const char * const Name;
587 const CallingConv::ID CC;
588 const ISD::CondCode Cond;
589 } LibraryCalls[] = {
590 // Double-precision floating-point arithmetic helper functions
591 // RTABI chapter 4.1.2, Table 2
592 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
593 { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
594 { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
595 { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
596
597 // Double-precision floating-point comparison helper functions
598 // RTABI chapter 4.1.2, Table 3
599 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
600 { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
601 { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
602 { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
603 { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
604 { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
605 { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
606
607 // Single-precision floating-point arithmetic helper functions
608 // RTABI chapter 4.1.2, Table 4
609 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
610 { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
611 { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
612 { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
613
614 // Single-precision floating-point comparison helper functions
615 // RTABI chapter 4.1.2, Table 5
616 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
617 { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
618 { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
619 { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
620 { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
621 { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
622 { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
623
624 // Floating-point to integer conversions.
625 // RTABI chapter 4.1.2, Table 6
626 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
627 { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
628 { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
629 { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
630 { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
631 { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
632 { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
633 { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
634
635 // Conversions between floating types.
636 // RTABI chapter 4.1.2, Table 7
637 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
638 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
639 { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
640
641 // Integer to floating-point conversions.
642 // RTABI chapter 4.1.2, Table 8
643 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
644 { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
645 { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
646 { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
647 { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
648 { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
649 { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
650 { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
651
652 // Long long helper functions
653 // RTABI chapter 4.2, Table 9
654 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
655 { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
656 { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
657 { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
658
659 // Integer division functions
660 // RTABI chapter 4.3.1
661 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
662 { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
663 { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
664 { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
665 { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
666 { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
667 { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
668 { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
669 };
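// Note that the i8/i16 division entries above reuse __aeabi_idiv and
// __aeabi_uidiv: the AEABI helpers operate on 32-bit values, with narrower
// operands extended before the call.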
670
671 for (const auto &LC : LibraryCalls) {
672 setLibcallName(LC.Op, LC.Name);
673 setLibcallCallingConv(LC.Op, LC.CC);
674 if (LC.Cond != ISD::SETCC_INVALID)
675 setCmpLibcallCC(LC.Op, LC.Cond);
676 }
677
678 // EABI dependent RTLIB
679 if (TM.Options.EABIVersion == EABI::EABI4 ||
680 TM.Options.EABIVersion == EABI::EABI5) {
681 static const struct {
682 const RTLIB::Libcall Op;
683 const char *const Name;
684 const CallingConv::ID CC;
685 const ISD::CondCode Cond;
686 } MemOpsLibraryCalls[] = {
687 // Memory operations
688 // RTABI chapter 4.3.4
689 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
690 { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
691 { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
692 };
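// A sketch of the RTABI memory helpers assumed here: unlike the C library
// versions, __aeabi_memcpy(void *dest, const void *src, size_t n) and friends
// are not required to return the destination pointer.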
693
694 for (const auto &LC : MemOpsLibraryCalls) {
695 setLibcallName(LC.Op, LC.Name);
696 setLibcallCallingConv(LC.Op, LC.CC);
697 if (LC.Cond != ISD::SETCC_INVALID)
698 setCmpLibcallCC(LC.Op, LC.Cond);
699 }
700 }
701 }
702
703 if (Subtarget->isTargetWindows()) {
704 static const struct {
705 const RTLIB::Libcall Op;
706 const char * const Name;
707 const CallingConv::ID CC;
708 } LibraryCalls[] = {
709 { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
710 { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
711 { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
712 { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
713 { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
714 { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
715 { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
716 { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
717 };
718
719 for (const auto &LC : LibraryCalls) {
720 setLibcallName(LC.Op, LC.Name);
721 setLibcallCallingConv(LC.Op, LC.CC);
722 }
723 }
724
725 // Use divmod compiler-rt calls for iOS 5.0 and later.
726 if (Subtarget->isTargetMachO() &&
727 !(Subtarget->isTargetIOS() &&
728 Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
729 setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
730 setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
731 }
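// A sketch of the compiler-rt interface assumed here:
//   int __divmodsi4(int a, int b, int *rem);  // returns the quotient
// so a combined sdiv+srem pair can be lowered to a single call.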
732
733 // The half <-> float conversion functions are always soft-float on
734 // non-watchos platforms, but are needed for some targets which use a
735 // hard-float calling convention by default.
736 if (!Subtarget->isTargetWatchABI()) {
737 if (Subtarget->isAAPCS_ABI()) {
738 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
739 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
740 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
741 } else {
742 setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
743 setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
744 setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
745 }
746 }
747
748 // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
749 // a __gnu_ prefix (which is the default).
750 if (Subtarget->isTargetAEABI()) {
751 static const struct {
752 const RTLIB::Libcall Op;
753 const char * const Name;
754 const CallingConv::ID CC;
755 } LibraryCalls[] = {
756 { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
757 { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
758 { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
759 };
760
761 for (const auto &LC : LibraryCalls) {
762 setLibcallName(LC.Op, LC.Name);
763 setLibcallCallingConv(LC.Op, LC.CC);
764 }
765 }
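// On GNUEABI the defaults are left as the __gnu_-prefixed names (e.g.
// __gnu_f2h_ieee and __gnu_h2f_ieee), so only the AEABI case needs to be
// overridden here.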
766
767 if (Subtarget->isThumb1Only())
768 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
769 else
770 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
771
772 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
773 Subtarget->hasFPRegs()) {
774 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
775 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
776
781
782 if (!Subtarget->hasVFP2Base())
783 setAllExpand(MVT::f32);
784 if (!Subtarget->hasFP64())
785 setAllExpand(MVT::f64);
786 }
787
788 if (Subtarget->hasFullFP16()) {
789 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
792
795 }
796
797 if (Subtarget->hasBF16()) {
798 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
799 setAllExpand(MVT::bf16);
800 if (!Subtarget->hasFullFP16())
802 }
803
805 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
806 setTruncStoreAction(VT, InnerVT, Expand);
807 addAllExtLoads(VT, InnerVT, Expand);
808 }
809
812
814 }
815
818
821
822 if (Subtarget->hasMVEIntegerOps())
823 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
824
825 // Combine low-overhead loop intrinsics so that we can lower i1 types.
826 if (Subtarget->hasLOB()) {
828 }
829
830 if (Subtarget->hasNEON()) {
831 addDRTypeForNEON(MVT::v2f32);
832 addDRTypeForNEON(MVT::v8i8);
833 addDRTypeForNEON(MVT::v4i16);
834 addDRTypeForNEON(MVT::v2i32);
835 addDRTypeForNEON(MVT::v1i64);
836
837 addQRTypeForNEON(MVT::v4f32);
838 addQRTypeForNEON(MVT::v2f64);
839 addQRTypeForNEON(MVT::v16i8);
840 addQRTypeForNEON(MVT::v8i16);
841 addQRTypeForNEON(MVT::v4i32);
842 addQRTypeForNEON(MVT::v2i64);
843
844 if (Subtarget->hasFullFP16()) {
845 addQRTypeForNEON(MVT::v8f16);
846 addDRTypeForNEON(MVT::v4f16);
847 }
848
849 if (Subtarget->hasBF16()) {
850 addQRTypeForNEON(MVT::v8bf16);
851 addDRTypeForNEON(MVT::v4bf16);
852 }
853 }
854
855 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
856 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
857 // none of Neon, MVE or VFP supports any arithmetic operations on it.
858 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
859 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
860 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
861 // FIXME: Code duplication: FDIV and FREM are expanded always, see
862 // ARMTargetLowering::addTypeForNEON method for details.
863 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
864 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
865 // FIXME: Create unittest.
866 // In other words, find a way to make "copysign" appear in the DAG with
867 // vector operands.
869 // FIXME: Code duplication: SETCC has custom operation action, see
870 // ARMTargetLowering::addTypeForNEON method for details.
872 // FIXME: Create unittest for FNEG and for FABS.
873 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
874 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
876 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
877 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
878 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
879 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
882 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
885 // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
891 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
892 }
893
894 if (Subtarget->hasNEON()) {
895 // The same applies to v4f32, but keep in mind that vadd, vsub and vmul
896 // are natively supported for v4f32.
898 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
899 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
900 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
901 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
904 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
912
913 // Mark v2f32 intrinsics.
915 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
916 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
917 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
918 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
921 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
929
930 // Neon does not support some operations on v1i64 and v2i64 types.
931 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
932 // Custom handling for some quad-vector types to detect VMULL.
933 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
934 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
935 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
936 // Custom handling for some vector types to avoid expensive expansions
937 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
939 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
941 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
942 // a destination type that is wider than the source, nor does
943 // it have a FP_TO_[SU]INT instruction with a narrower destination than
944 // source.
953
956
957 // NEON does not have single instruction CTPOP for vectors with element
958 // types wider than 8-bits. However, custom lowering can leverage the
959 // v8i8/v16i8 vcnt instruction.
966
967 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
968 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
969
970 // NEON does not have single instruction CTTZ for vectors.
972 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
973 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
974 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
975
976 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
977 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
978 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
979 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
980
985
990
994 }
995
996 // NEON only has FMA instructions as of VFP4.
997 if (!Subtarget->hasVFP4Base()) {
998 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
999 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
1000 }
1001
1004
1005 // It is legal to extload from v4i8 to v4i16 or v4i32.
1006 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
1007 MVT::v2i32}) {
1012 }
1013 }
1014
1015 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1016 MVT::v4i32}) {
1021 }
1022 }
1023
1024 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
1031 }
1032 if (Subtarget->hasMVEIntegerOps()) {
1035 ISD::SETCC});
1036 }
1037 if (Subtarget->hasMVEFloatOps()) {
1039 }
1040
1041 if (!Subtarget->hasFP64()) {
1042 // When targeting a floating-point unit with only single-precision
1043 // operations, f64 is legal for the few double-precision instructions which
1044 // are present. However, no double-precision operations other than moves,
1045 // loads and stores are provided by the hardware.
1083 }
1084
1085 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
1088 if (Subtarget->hasFullFP16()) {
1091 }
1092 }
1093
1094 if (!Subtarget->hasFP16()) {
1097 }
1098
1100
1101 // ARM does not have floating-point extending loads.
1102 for (MVT VT : MVT::fp_valuetypes()) {
1103 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
1104 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
1105 }
1106
1107 // ... or truncating stores
1108 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
1109 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
1110 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
1111
1112 // ARM does not have i1 sign extending load.
1113 for (MVT VT : MVT::integer_valuetypes())
1114 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
1115
1116 // ARM supports all 4 flavors of integer indexed load / store.
1117 if (!Subtarget->isThumb1Only()) {
1118 for (unsigned im = (unsigned)ISD::PRE_INC;
1120 setIndexedLoadAction(im, MVT::i1, Legal);
1121 setIndexedLoadAction(im, MVT::i8, Legal);
1122 setIndexedLoadAction(im, MVT::i16, Legal);
1123 setIndexedLoadAction(im, MVT::i32, Legal);
1124 setIndexedStoreAction(im, MVT::i1, Legal);
1125 setIndexedStoreAction(im, MVT::i8, Legal);
1126 setIndexedStoreAction(im, MVT::i16, Legal);
1127 setIndexedStoreAction(im, MVT::i32, Legal);
1128 }
1129 } else {
1130 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
1133 }
1134
1139
1142 if (Subtarget->hasDSP()) {
1151 }
1152 if (Subtarget->hasBaseDSP()) {
1155 }
1156
1157 // i64 operation support.
1160 if (Subtarget->isThumb1Only()) {
1163 }
1164 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
1165 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
1167
1177
1178 // MVE lowers 64 bit shifts to lsll and lsrl
1179 // assuming that ISD::SRL and SRA of i64 are already marked custom
1180 if (Subtarget->hasMVEIntegerOps())
1182
1183 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1184 if (Subtarget->isThumb1Only()) {
1188 }
1189
1190 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1192
1193 // ARM does not have ROTL.
1198 }
1201 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1204 }
1205
1206 // @llvm.readcyclecounter requires the Performance Monitors extension.
1207 // Default to the 0 expansion on unsupported platforms.
1208 // FIXME: Technically there are older ARM CPUs that have
1209 // implementation-specific ways of obtaining this information.
1210 if (Subtarget->hasPerfMon())
1212
1213 // Only ARMv6 and later can lower BSWAP natively (via REV).
1214 if (!Subtarget->hasV6Ops())
1216
1217 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1218 : Subtarget->hasDivideInARMMode();
1219 if (!hasDivide) {
1220 // These are expanded into libcalls if the cpu doesn't have HW divider.
1223 }
1224
1225 if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
1228
1231 }
1232
1235
1236 // Register based DivRem for AEABI (RTABI 4.2)
1237 if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
1238 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
1239 Subtarget->isTargetWindows()) {
1242 HasStandaloneRem = false;
1243
1244 if (Subtarget->isTargetWindows()) {
1245 const struct {
1246 const RTLIB::Libcall Op;
1247 const char * const Name;
1248 const CallingConv::ID CC;
1249 } LibraryCalls[] = {
1250 { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
1251 { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
1252 { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
1253 { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
1254
1255 { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
1256 { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
1257 { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
1258 { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
1259 };
1260
1261 for (const auto &LC : LibraryCalls) {
1262 setLibcallName(LC.Op, LC.Name);
1263 setLibcallCallingConv(LC.Op, LC.CC);
1264 }
1265 } else {
1266 const struct {
1267 const RTLIB::Libcall Op;
1268 const char * const Name;
1269 const CallingConv::ID CC;
1270 } LibraryCalls[] = {
1271 { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1272 { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1273 { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
1274 { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
1275
1276 { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1277 { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1278 { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
1279 { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
1280 };
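// Conceptually the AEABI divmod helpers return both results in registers:
//   typedef struct { int quot; int rem; } idiv_return;
//   idiv_return __aeabi_idivmod(int numerator, int denominator);
// with quot in r0 and rem in r1, which is why HasStandaloneRem is cleared
// above: a lone remainder is computed via the combined call.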
1281
1282 for (const auto &LC : LibraryCalls) {
1283 setLibcallName(LC.Op, LC.Name);
1284 setLibcallCallingConv(LC.Op, LC.CC);
1285 }
1286 }
1287
1292 } else {
1295 }
1296
1297 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
1298 // MSVCRT doesn't have powi; fall back to pow
1299 setLibcallName(RTLIB::POWI_F32, nullptr);
1300 setLibcallName(RTLIB::POWI_F64, nullptr);
1301 }
1302
1307
1308 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1310
1311 // Use the default implementation.
1313 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1315 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1318
1319 if (Subtarget->isTargetWindows())
1321 else
1323
1324 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1325 // the default expansion.
1326 InsertFencesForAtomic = false;
1327 if (Subtarget->hasAnyDataBarrier() &&
1328 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1329 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1330 // to ldrex/strex loops already.
1332 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1334
1335 // On v8, we have particularly efficient implementations of atomic fences
1336 // if they can be combined with nearby atomic loads and stores.
1337 if (!Subtarget->hasAcquireRelease() ||
1338 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1339 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1340 InsertFencesForAtomic = true;
1341 }
1342 } else {
1343 // If there's anything we can use as a barrier, go through custom lowering
1344 // for ATOMIC_FENCE.
1345 // If target has DMB in thumb, Fences can be inserted.
1346 if (Subtarget->hasDataBarrier())
1347 InsertFencesForAtomic = true;
1348
1350 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1351
1352 // Set them all for libcall, which will force libcalls.
1365 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1366 // Unordered/Monotonic case.
1367 if (!InsertFencesForAtomic) {
1370 }
1371 }
1372
1373 // Compute supported atomic widths.
1374 if (Subtarget->isTargetLinux() ||
1375 (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1376 // For targets where __sync_* routines are reliably available, we use them
1377 // if necessary.
1378 //
1379 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1380 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1381 //
1382 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1383 // such targets should provide __sync_* routines, which use the ARM mode
1384 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1385 // encoding; see ARMISD::MEMBARRIER_MCR.)
1387 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1388 Subtarget->hasForced32BitAtomics()) {
1389 // Cortex-M cores (other than Cortex-M0) have 32-bit atomics.
1391 } else {
1392 // We can't assume anything about other targets; just use libatomic
1393 // routines.
1395 }
1396
1398
1400
1401 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1402 if (!Subtarget->hasV6Ops()) {
1405 }
1407
1408 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1409 !Subtarget->isThumb1Only()) {
1410 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1411 // iff target supports vfp2.
1415 }
1416
1417 // We want to custom lower some of our intrinsics.
1422 if (Subtarget->useSjLjEH())
1423 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
1424
1434 if (Subtarget->hasFullFP16()) {
1438 }
1439
1441
1444 if (Subtarget->hasFullFP16())
1448 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1449
1450 // We don't support sin/cos/fmod/copysign/pow
1459 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1460 !Subtarget->isThumb1Only()) {
1463 }
1466
1467 if (!Subtarget->hasVFP4Base()) {
1470 }
1471
1472 // Various VFP goodness
1473 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1474 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1475 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1478 }
1479
1480 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1481 if (!Subtarget->hasFP16()) {
1484 }
1485
1486 // Strict floating-point comparisons need custom lowering.
1493 }
1494
1495 // Use __sincos_stret if available.
1496 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1497 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1500 }
1501
1502 // FP-ARMv8 implements a lot of rounding-like FP operations.
1503 if (Subtarget->hasFPARMv8Base()) {
1512 if (Subtarget->hasNEON()) {
1517 }
1518
1519 if (Subtarget->hasFP64()) {
1528 }
1529 }
1530
1531 // FP16 operations often need to be promoted to call library functions
1532 if (Subtarget->hasFullFP16()) {
1546
1548 }
1549
1550 if (Subtarget->hasNEON()) {
1551 // vmin and vmax aren't available in a scalar form, so we can use
1552 // a NEON instruction with an undef lane instead. This has a performance
1553 // penalty on some cores, so we don't do this unless we have been
1554 // asked to by the core tuning model.
1555 if (Subtarget->useNEONForSinglePrecisionFP()) {
1560 }
1565
1566 if (Subtarget->hasFullFP16()) {
1571
1576 }
1577 }
1578
1579 // We have target-specific dag combine patterns for the following nodes:
1580 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1583
1584 if (Subtarget->hasMVEIntegerOps())
1586
1587 if (Subtarget->hasV6Ops())
1589 if (Subtarget->isThumb1Only())
1591 // Attempt to lower smin/smax to ssat/usat
1592 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1593 Subtarget->isThumb2()) {
1595 }
1596
1598
1599 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1600 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1602 else
1604
1605 //// temporary - rewrite interface to use type
1608 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1610 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1612
1613 // On ARM, arguments smaller than 4 bytes are extended, so all arguments
1614 // are at least 4 bytes aligned.
1616
1617 // Prefer likely predicted branches to selects on out-of-order cores.
1618 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1619
1620 setPrefLoopAlignment(Align(1ULL << Subtarget->getPrefLoopLogAlignment()));
1622
1623 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1624
1625 if (Subtarget->isThumb() || Subtarget->isThumb2())
1627}
1628
1630 return Subtarget->useSoftFloat();
1631}
1632
1633// FIXME: It might make sense to define the representative register class as the
1634// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1635// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1636// SPR's representative would be DPR_VFP2. This should work well if register
1637// pressure tracking were modified such that a register use would increment the
1638// pressure of the register class's representative and all of its super
1639// classes' representatives transitively. We have not implemented this because
1640// of the difficulty prior to coalescing of modeling operand register classes
1641// due to the common occurrence of cross class copies and subregister insertions
1642// and extractions.
1643std::pair<const TargetRegisterClass *, uint8_t>
1645 MVT VT) const {
1646 const TargetRegisterClass *RRC = nullptr;
1647 uint8_t Cost = 1;
1648 switch (VT.SimpleTy) {
1649 default:
1651 // Use DPR as representative register class for all floating point
1652 // and vector types. Since there are 32 SPR registers and 32 DPR registers,
1653 // the cost is 1 for both f32 and f64.
1654 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1655 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1656 RRC = &ARM::DPRRegClass;
1657 // When NEON is used for SP, only half of the register file is available
1658 // because operations that define both SP and DP results will be constrained
1659 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1660 // coalescing by double-counting the SP regs. See the FIXME above.
1661 if (Subtarget->useNEONForSinglePrecisionFP())
1662 Cost = 2;
1663 break;
1664 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1665 case MVT::v4f32: case MVT::v2f64:
1666 RRC = &ARM::DPRRegClass;
1667 Cost = 2;
1668 break;
1669 case MVT::v4i64:
1670 RRC = &ARM::DPRRegClass;
1671 Cost = 4;
1672 break;
1673 case MVT::v8i64:
1674 RRC = &ARM::DPRRegClass;
1675 Cost = 8;
1676 break;
1677 }
1678 return std::make_pair(RRC, Cost);
1679}
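// For example, the mapping above represents a v4f32 (one Q register) as DPR
// with a cost of 2, so register-pressure tracking counts it as two D
// registers.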
1680
1681const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
1682#define MAKE_CASE(V) \
1683 case V: \
1684 return #V;
1685 switch ((ARMISD::NodeType)Opcode) {
1687 break;
1891#undef MAKE_CASE
1892 }
1893 return nullptr;
1894}
1895
1897 EVT VT) const {
1898 if (!VT.isVector())
1899 return getPointerTy(DL);
1900
1901 // MVE has a predicate register.
1902 if ((Subtarget->hasMVEIntegerOps() &&
1903 (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
1904 VT == MVT::v16i8)) ||
1905 (Subtarget->hasMVEFloatOps() &&
1906 (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16)))
1907 return MVT::getVectorVT(MVT::i1, VT.getVectorElementCount());
1909}
1910
1911/// getRegClassFor - Return the register class that should be used for the
1912/// specified value type.
1913const TargetRegisterClass *
1914ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1915 (void)isDivergent;
1916 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1917 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1918 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1919 // MVE Q registers.
1920 if (Subtarget->hasNEON()) {
1921 if (VT == MVT::v4i64)
1922 return &ARM::QQPRRegClass;
1923 if (VT == MVT::v8i64)
1924 return &ARM::QQQQPRRegClass;
1925 }
1926 if (Subtarget->hasMVEIntegerOps()) {
1927 if (VT == MVT::v4i64)
1928 return &ARM::MQQPRRegClass;
1929 if (VT == MVT::v8i64)
1930 return &ARM::MQQQQPRRegClass;
1931 }
1933}
1934
1935// memcpy and other memory intrinsics typically try to use LDM/STM if the
1936// source/dest is aligned and the copy size is large enough. We therefore want
1937// to align such objects passed to memory intrinsics.
1939 Align &PrefAlign) const {
1940 if (!isa<MemIntrinsic>(CI))
1941 return false;
1942 MinSize = 8;
1943 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1944 // cycle faster than 4-byte aligned LDM.
1945 PrefAlign =
1946 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1947 return true;
1948}
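// In effect, objects of 8 bytes or more passed to memcpy/memmove/memset are
// given 8-byte alignment on v6 and later A/R-class cores, and 4-byte
// alignment elsewhere.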
1949
1950// Create a fast isel object.
1951FastISel *
1953 const TargetLibraryInfo *libInfo) const {
1954 return ARM::createFastISel(funcInfo, libInfo);
1955}
1956
1958 unsigned NumVals = N->getNumValues();
1959 if (!NumVals)
1960 return Sched::RegPressure;
1961
1962 for (unsigned i = 0; i != NumVals; ++i) {
1963 EVT VT = N->getValueType(i);
1964 if (VT == MVT::Glue || VT == MVT::Other)
1965 continue;
1966 if (VT.isFloatingPoint() || VT.isVector())
1967 return Sched::ILP;
1968 }
1969
1970 if (!N->isMachineOpcode())
1971 return Sched::RegPressure;
1972
1973 // Loads are scheduled for latency even if the instruction itinerary
1974 // is not available.
1975 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1976 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1977
1978 if (MCID.getNumDefs() == 0)
1979 return Sched::RegPressure;
1980 if (!Itins->isEmpty() &&
1981 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2)
1982 return Sched::ILP;
1983
1984 return Sched::RegPressure;
1985}
1986
1987//===----------------------------------------------------------------------===//
1988// Lowering Code
1989//===----------------------------------------------------------------------===//
1990
1991static bool isSRL16(const SDValue &Op) {
1992 if (Op.getOpcode() != ISD::SRL)
1993 return false;
1994 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1995 return Const->getZExtValue() == 16;
1996 return false;
1997}
1998
1999static bool isSRA16(const SDValue &Op) {
2000 if (Op.getOpcode() != ISD::SRA)
2001 return false;
2002 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2003 return Const->getZExtValue() == 16;
2004 return false;
2005}
2006
2007static bool isSHL16(const SDValue &Op) {
2008 if (Op.getOpcode() != ISD::SHL)
2009 return false;
2010 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
2011 return Const->getZExtValue() == 16;
2012 return false;
2013}
2014
2015// Check for a signed 16-bit value. We special case SRA because it makes it
2016// simpler when also looking for SRAs that aren't sign extending a
2017// smaller value. Without the check, we'd need to take extra care with
2018// checking order for some operations.
2019static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
2020 if (isSRA16(Op))
2021 return isSHL16(Op.getOperand(0));
2022 return DAG.ComputeNumSignBits(Op) == 17;
2023}
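// ComputeNumSignBits(Op) == 17 means the top 17 bits are known to be copies of
// the sign bit, i.e. the value behaves as a sign-extended i16.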
2024
2025/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
2027 switch (CC) {
2028 default: llvm_unreachable("Unknown condition code!");
2029 case ISD::SETNE: return ARMCC::NE;
2030 case ISD::SETEQ: return ARMCC::EQ;
2031 case ISD::SETGT: return ARMCC::GT;
2032 case ISD::SETGE: return ARMCC::GE;
2033 case ISD::SETLT: return ARMCC::LT;
2034 case ISD::SETLE: return ARMCC::LE;
2035 case ISD::SETUGT: return ARMCC::HI;
2036 case ISD::SETUGE: return ARMCC::HS;
2037 case ISD::SETULT: return ARMCC::LO;
2038 case ISD::SETULE: return ARMCC::LS;
2039 }
2040}
2041
2042/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
2044 ARMCC::CondCodes &CondCode2) {
2045 CondCode2 = ARMCC::AL;
2046 switch (CC) {
2047 default: llvm_unreachable("Unknown FP condition!");
2048 case ISD::SETEQ:
2049 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
2050 case ISD::SETGT:
2051 case ISD::SETOGT: CondCode = ARMCC::GT; break;
2052 case ISD::SETGE:
2053 case ISD::SETOGE: CondCode = ARMCC::GE; break;
2054 case ISD::SETOLT: CondCode = ARMCC::MI; break;
2055 case ISD::SETOLE: CondCode = ARMCC::LS; break;
2056 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
2057 case ISD::SETO: CondCode = ARMCC::VC; break;
2058 case ISD::SETUO: CondCode = ARMCC::VS; break;
2059 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
2060 case ISD::SETUGT: CondCode = ARMCC::HI; break;
2061 case ISD::SETUGE: CondCode = ARMCC::PL; break;
2062 case ISD::SETLT:
2063 case ISD::SETULT: CondCode = ARMCC::LT; break;
2064 case ISD::SETLE:
2065 case ISD::SETULE: CondCode = ARMCC::LE; break;
2066 case ISD::SETNE:
2067 case ISD::SETUNE: CondCode = ARMCC::NE; break;
2068 }
2069}
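// Some ordered/unordered comparisons need two ARM conditions, as set up above:
// e.g. SETONE is tested as MI or GT, and SETUEQ as EQ or VS.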
2070
2071//===----------------------------------------------------------------------===//
2072// Calling Convention Implementation
2073//===----------------------------------------------------------------------===//
2074
2075/// getEffectiveCallingConv - Get the effective calling convention, taking into
2076/// account presence of floating point hardware and calling convention
2077/// limitations, such as support for variadic functions.
2079ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
2080 bool isVarArg) const {
2081 switch (CC) {
2082 default:
2083 report_fatal_error("Unsupported calling convention");
2086 case CallingConv::GHC:
2088 return CC;
2094 case CallingConv::Swift:
2097 case CallingConv::C:
2098 case CallingConv::Tail:
2099 if (!Subtarget->isAAPCS_ABI())
2100 return CallingConv::ARM_APCS;
2101 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
2102 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
2103 !isVarArg)
2105 else
2107 case CallingConv::Fast:
2109 if (!Subtarget->isAAPCS_ABI()) {
2110 if (Subtarget->hasVFP2Base() && !Subtarget->isThumb1Only() && !isVarArg)
2111 return CallingConv::Fast;
2112 return CallingConv::ARM_APCS;
2113 } else if (Subtarget->hasVFP2Base() &&
2114 !Subtarget->isThumb1Only() && !isVarArg)
2116 else
2118 }
2119}
2120
2122 bool isVarArg) const {
2123 return CCAssignFnForNode(CC, false, isVarArg);
2124}
2125
2127 bool isVarArg) const {
2128 return CCAssignFnForNode(CC, true, isVarArg);
2129}
2130
2131/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
2132/// CallingConvention.
2133CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
2134 bool Return,
2135 bool isVarArg) const {
2136 switch (getEffectiveCallingConv(CC, isVarArg)) {
2137 default:
2138 report_fatal_error("Unsupported calling convention");
2140 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
2142 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2144 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
2145 case CallingConv::Fast:
2146 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
2147 case CallingConv::GHC:
2148 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
2150 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2152 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
2154 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
2155 }
2156}
2157
2158SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
2159 MVT LocVT, MVT ValVT, SDValue Val) const {
2160 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
2161 Val);
2162 if (Subtarget->hasFullFP16()) {
2163 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
2164 } else {
2165 Val = DAG.getNode(ISD::TRUNCATE, dl,
2166 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2167 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
2168 }
2169 return Val;
2170}
2171
2172SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
2173 MVT LocVT, MVT ValVT,
2174 SDValue Val) const {
2175 if (Subtarget->hasFullFP16()) {
2176 Val = DAG.getNode(ARMISD::VMOVrh, dl,
2177 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2178 } else {
2179 Val = DAG.getNode(ISD::BITCAST, dl,
2180 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
2181 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
2182 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
2183 }
2184 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
2185}
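// MoveToHPR/MoveFromHPR implement the f16/bf16 passing rule described further
// below: the half value travels in the low 16 bits of a 32-bit location, moved
// with VMOVhr/VMOVrh when fullfp16 is available and with integer
// truncate/zero-extend plus bitcasts otherwise.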
2186
2187/// LowerCallResult - Lower the result values of a call into the
2188/// appropriate copies out of appropriate physical registers.
2189SDValue ARMTargetLowering::LowerCallResult(
2190 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
2191 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2192 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
2193 SDValue ThisVal) const {
2194 // Assign locations to each value returned by this call.
2196 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2197 *DAG.getContext());
2198 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
2199
2200 // Copy all of the result registers out of their specified physreg.
2201 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2202 CCValAssign VA = RVLocs[i];
2203
2204 // Pass 'this' value directly from the argument to return value, to avoid
2205 // reg unit interference
2206 if (i == 0 && isThisReturn) {
2207 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
2208 "unexpected return calling convention register assignment");
2209 InVals.push_back(ThisVal);
2210 continue;
2211 }
2212
2213 SDValue Val;
2214 if (VA.needsCustom() &&
2215 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
2216 // Handle f64 or half of a v2f64.
2217 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2218 InGlue);
2219 Chain = Lo.getValue(1);
2220 InGlue = Lo.getValue(2);
2221 VA = RVLocs[++i]; // skip ahead to next loc
2222 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
2223 InGlue);
2224 Chain = Hi.getValue(1);
2225 InGlue = Hi.getValue(2);
2226 if (!Subtarget->isLittle())
2227 std::swap (Lo, Hi);
2228 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2229
2230 if (VA.getLocVT() == MVT::v2f64) {
2231 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
2232 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2233 DAG.getConstant(0, dl, MVT::i32));
2234
2235 VA = RVLocs[++i]; // skip ahead to next loc
2236 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2237 Chain = Lo.getValue(1);
2238 InGlue = Lo.getValue(2);
2239 VA = RVLocs[++i]; // skip ahead to next loc
2240 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
2241 Chain = Hi.getValue(1);
2242 InGlue = Hi.getValue(2);
2243 if (!Subtarget->isLittle())
2244 std::swap (Lo, Hi);
2245 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
2246 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
2247 DAG.getConstant(1, dl, MVT::i32));
2248 }
2249 } else {
2250 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
2251 InGlue);
2252 Chain = Val.getValue(1);
2253 InGlue = Val.getValue(2);
2254 }
2255
2256 switch (VA.getLocInfo()) {
2257 default: llvm_unreachable("Unknown loc info!");
2258 case CCValAssign::Full: break;
2259 case CCValAssign::BCvt:
2260 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
2261 break;
2262 }
2263
2264 // f16 arguments have their size extended to 4 bytes and passed as if they
2265 // had been copied to the LSBs of a 32-bit register.
2266 // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2267 if (VA.needsCustom() &&
2268 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
2269 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
2270
2271 InVals.push_back(Val);
2272 }
2273
2274 return Chain;
2275}
2276
2277std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
2278 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
2279 bool IsTailCall, int SPDiff) const {
2280 SDValue DstAddr;
2281 MachinePointerInfo DstInfo;
2282 int32_t Offset = VA.getLocMemOffset();
2284
2285 if (IsTailCall) {
2286 Offset += SPDiff;
2287 auto PtrVT = getPointerTy(DAG.getDataLayout());
2288 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
2289 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
2290 DstAddr = DAG.getFrameIndex(FI, PtrVT);
2291 DstInfo =
2293 } else {
2294 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
2295 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
2296 StackPtr, PtrOff);
2297 DstInfo =
2299 }
2300
2301 return std::make_pair(DstAddr, DstInfo);
2302}
2303
2304void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
2305 SDValue Chain, SDValue &Arg,
2306 RegsToPassVector &RegsToPass,
2307 CCValAssign &VA, CCValAssign &NextVA,
2308 SDValue &StackPtr,
2309 SmallVectorImpl<SDValue> &MemOpChains,
2310 bool IsTailCall,
2311 int SPDiff) const {
2312 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
2313 DAG.getVTList(MVT::i32, MVT::i32), Arg);
2314 unsigned id = Subtarget->isLittle() ? 0 : 1;
2315 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
2316
2317 if (NextVA.isRegLoc())
2318 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
2319 else {
2320 assert(NextVA.isMemLoc());
2321 if (!StackPtr.getNode())
2322 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
2324
2325 SDValue DstAddr;
2326 MachinePointerInfo DstInfo;
2327 std::tie(DstAddr, DstInfo) =
2328 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
2329 MemOpChains.push_back(
2330 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
2331 }
2332}
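// VMOVRRD above splits the f64 into two i32 halves; `id` selects which half
// goes into the first register so the pair is ordered correctly for the
// target's endianness, and a half without a register is stored to the
// argument's stack slot instead.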
2333
2334static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2335 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2337}
2338
2339/// LowerCall - Lower a call into a callseq_start <-
2340/// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
2341/// nodes.
2342SDValue
2343ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2344 SmallVectorImpl<SDValue> &InVals) const {
2345 SelectionDAG &DAG = CLI.DAG;
2346 SDLoc &dl = CLI.DL;
2348 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2350 SDValue Chain = CLI.Chain;
2351 SDValue Callee = CLI.Callee;
2352 bool &isTailCall = CLI.IsTailCall;
2353 CallingConv::ID CallConv = CLI.CallConv;
2354 bool doesNotRet = CLI.DoesNotReturn;
2355 bool isVarArg = CLI.IsVarArg;
2356
2360 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2361 bool isThisReturn = false;
2362 bool isCmseNSCall = false;
2363 bool isSibCall = false;
2364 bool PreferIndirect = false;
2365 bool GuardWithBTI = false;
2366
2367 // Lower 'returns_twice' calls to a pseudo-instruction.
2368 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2369 !Subtarget->noBTIAtReturnTwice())
2370 GuardWithBTI = AFI->branchTargetEnforcement();
2371
2372 // Determine whether this is a non-secure function call.
2373 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2374 isCmseNSCall = true;
2375
2376 // Disable tail calls if they're not supported.
2377 if (!Subtarget->supportsTailCall())
2378 isTailCall = false;
2379
2380   // For both non-secure calls and returns from a CMSE entry function,
2381   // the function needs to do some extra work after the call, or before the
2382   // return, respectively, so it cannot end with a tail call.
2383 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2384 isTailCall = false;
2385
2386 if (isa<GlobalAddressSDNode>(Callee)) {
2387 // If we're optimizing for minimum size and the function is called three or
2388 // more times in this block, we can improve codesize by calling indirectly
2389 // as BLXr has a 16-bit encoding.
2390 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2391 if (CLI.CB) {
2392 auto *BB = CLI.CB->getParent();
2393 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2394 count_if(GV->users(), [&BB](const User *U) {
2395 return isa<Instruction>(U) &&
2396 cast<Instruction>(U)->getParent() == BB;
2397 }) > 2;
2398 }
2399 }
2400 if (isTailCall) {
2401 // Check if it's really possible to do a tail call.
2402 isTailCall = IsEligibleForTailCallOptimization(
2403 Callee, CallConv, isVarArg, isStructRet,
2404 MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
2405 PreferIndirect);
2406
2407 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2408 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2409 isSibCall = true;
2410
2411 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2412 // detected sibcalls.
2413 if (isTailCall)
2414 ++NumTailCalls;
2415 }
2416
2417 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2418 report_fatal_error("failed to perform tail call elimination on a call "
2419 "site marked musttail");
2420 // Analyze operands of the call, assigning locations to each operand.
2422 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2423 *DAG.getContext());
2424 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2425
2426 // Get a count of how many bytes are to be pushed on the stack.
2427 unsigned NumBytes = CCInfo.getStackSize();
2428
2429 // SPDiff is the byte offset of the call's argument area from the callee's.
2430 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2431 // by this amount for a tail call. In a sibling call it must be 0 because the
2432 // caller will deallocate the entire stack and the callee still expects its
2433 // arguments to begin at SP+0. Completely unused for non-tail calls.
2434 int SPDiff = 0;
2435
2436 if (isTailCall && !isSibCall) {
2437 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2438 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2439
2440 // Since callee will pop argument stack as a tail call, we must keep the
2441 // popped size 16-byte aligned.
2443 NumBytes = alignTo(NumBytes, StackAlign);
2444
2445 // SPDiff will be negative if this tail call requires more space than we
2446 // would automatically have in our incoming argument space. Positive if we
2447 // can actually shrink the stack.
2448 SPDiff = NumReusableBytes - NumBytes;
2449
2450 // If this call requires more stack than we have available from
2451 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2452 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2453 AFI->setArgRegsSaveSize(-SPDiff);
2454 }
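  // Worked example (sketch): if the caller itself received 8 bytes of stack
  // arguments (NumReusableBytes == 8) but this tail call needs 16 bytes after
  // alignment (NumBytes == 16), then SPDiff == -8 and the ArgRegsSaveSize is
  // raised to 8 so that FrameLowering reserves the extra space.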
2455
2456 if (isSibCall) {
2457 // For sibling tail calls, memory operands are available in our caller's stack.
2458 NumBytes = 0;
2459 } else {
2460 // Adjust the stack pointer for the new arguments...
2461 // These operations are automatically eliminated by the prolog/epilog pass
2462 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2463 }
2464
2466 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2467
2468 RegsToPassVector RegsToPass;
2469 SmallVector<SDValue, 8> MemOpChains;
2470
2471 // During a tail call, stores to the argument area must happen after all of
2472 // the function's incoming arguments have been loaded because they may alias.
2473 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2474 // there's no point in doing so repeatedly so this tracks whether that's
2475 // happened yet.
2476 bool AfterFormalArgLoads = false;
2477
2478 // Walk the register/memloc assignments, inserting copies/loads. In the case
2479 // of tail call optimization, arguments are handled later.
2480 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2481 i != e;
2482 ++i, ++realArgIdx) {
2483 CCValAssign &VA = ArgLocs[i];
2484 SDValue Arg = OutVals[realArgIdx];
2485 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2486 bool isByVal = Flags.isByVal();
2487
2488 // Promote the value if needed.
2489 switch (VA.getLocInfo()) {
2490 default: llvm_unreachable("Unknown loc info!");
2491 case CCValAssign::Full: break;
2492 case CCValAssign::SExt:
2493 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2494 break;
2495 case CCValAssign::ZExt:
2496 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2497 break;
2498 case CCValAssign::AExt:
2499 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2500 break;
2501 case CCValAssign::BCvt:
2502 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2503 break;
2504 }
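    // For example (sketch): an i8 or i16 argument is widened to its i32
    // location type here, using SIGN_EXTEND or ZERO_EXTEND according to its
    // signext/zeroext attributes as recorded by the calling-convention
    // analysis above.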
2505
2506 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2507 Chain = DAG.getStackArgumentTokenFactor(Chain);
2508 AfterFormalArgLoads = true;
2509 }
2510
2511     // f16 arguments have their size extended to 4 bytes and are passed as if
2512     // they had been copied to the LSBs of a 32-bit register.
2513     // For that, they are passed extended to i32 (soft ABI) or to f32 (hard ABI).
2514 if (VA.needsCustom() &&
2515 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2516 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2517 } else {
2518 // f16 arguments could have been extended prior to argument lowering.
2519       // Mask the arguments if this is a CMSE nonsecure call.
2520 auto ArgVT = Outs[realArgIdx].ArgVT;
2521 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2522 auto LocBits = VA.getLocVT().getSizeInBits();
2523 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2524 SDValue Mask =
2525 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2526 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2527 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2528 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2529 }
2530 }
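    // Sketch: for a non-secure call taking an f16 in r0, LocBits == 32 and the
    // mask is 0xFFFF, so the value is bitcast to i32, ANDed with 0xFFFF, and
    // bitcast back; this keeps stale secure-state data in the top half-word
    // from leaking to the non-secure callee.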
2531
2532 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2533 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2534 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2535 DAG.getConstant(0, dl, MVT::i32));
2536 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2537 DAG.getConstant(1, dl, MVT::i32));
2538
2539 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2540 StackPtr, MemOpChains, isTailCall, SPDiff);
2541
2542 VA = ArgLocs[++i]; // skip ahead to next loc
2543 if (VA.isRegLoc()) {
2544 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2545 StackPtr, MemOpChains, isTailCall, SPDiff);
2546 } else {
2547 assert(VA.isMemLoc());
2548 SDValue DstAddr;
2549 MachinePointerInfo DstInfo;
2550 std::tie(DstAddr, DstInfo) =
2551 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2552 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2553 }
2554 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2555 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2556 StackPtr, MemOpChains, isTailCall, SPDiff);
2557 } else if (VA.isRegLoc()) {
2558 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2559 Outs[0].VT == MVT::i32) {
2560 assert(VA.getLocVT() == MVT::i32 &&
2561 "unexpected calling convention register assignment");
2562 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2563 "unexpected use of 'returned'");
2564 isThisReturn = true;
2565 }
2566 const TargetOptions &Options = DAG.getTarget().Options;
2567 if (Options.EmitCallSiteInfo)
2568 CSInfo.emplace_back(VA.getLocReg(), i);
2569 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2570 } else if (isByVal) {
2571 assert(VA.isMemLoc());
2572 unsigned offset = 0;
2573
2574 // True if this byval aggregate will be split between registers
2575 // and memory.
2576 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2577 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2578
2579 if (CurByValIdx < ByValArgsCount) {
2580
2581 unsigned RegBegin, RegEnd;
2582 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2583
2584 EVT PtrVT =
2586 unsigned int i, j;
2587 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2588 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2589 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2590 SDValue Load =
2591 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2592 DAG.InferPtrAlign(AddArg));
2593 MemOpChains.push_back(Load.getValue(1));
2594 RegsToPass.push_back(std::make_pair(j, Load));
2595 }
2596
2597       // If the parameter size exceeds the register area, the "offset" value
2598       // helps us calculate the stack slot for the remaining part properly.
2599 offset = RegEnd - RegBegin;
2600
2601 CCInfo.nextInRegsParam();
2602 }
2603
2604 if (Flags.getByValSize() > 4*offset) {
2605 auto PtrVT = getPointerTy(DAG.getDataLayout());
2606 SDValue Dst;
2607 MachinePointerInfo DstInfo;
2608 std::tie(Dst, DstInfo) =
2609 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2610 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2611 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2612 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2613 MVT::i32);
2614 SDValue AlignNode =
2615 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2616
2617 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2618 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2619 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2620 Ops));
2621 }
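      // Sketch: a 16-byte byval whose in-register part starts at r2 loads its
      // first 8 bytes into r2/r3 above (offset == 2) and copies the remaining
      // 8 bytes to its stack slot with the COPY_STRUCT_BYVAL node.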
2622 } else {
2623 assert(VA.isMemLoc());
2624 SDValue DstAddr;
2625 MachinePointerInfo DstInfo;
2626 std::tie(DstAddr, DstInfo) =
2627 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2628
2629 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2630 MemOpChains.push_back(Store);
2631 }
2632 }
2633
2634 if (!MemOpChains.empty())
2635 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2636
2637 // Build a sequence of copy-to-reg nodes chained together with token chain
2638 // and flag operands which copy the outgoing args into the appropriate regs.
2639 SDValue InGlue;
2640 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
2641 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
2642 RegsToPass[i].second, InGlue);
2643 InGlue = Chain.getValue(1);
2644 }
2645
2646 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2647 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2648 // node so that legalize doesn't hack it.
2649 bool isDirect = false;
2650
2652 const Module *Mod = MF.getFunction().getParent();
2653 const GlobalValue *GVal = nullptr;
2654 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2655 GVal = G->getGlobal();
2656 bool isStub =
2657 !TM.shouldAssumeDSOLocal(*Mod, GVal) && Subtarget->isTargetMachO();
2658
2659 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2660 bool isLocalARMFunc = false;
2661 auto PtrVt = getPointerTy(DAG.getDataLayout());
2662
2663 if (Subtarget->genLongCalls()) {
2664 assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
2665 "long-calls codegen is not position independent!");
2666 // Handle a global address or an external symbol. If it's not one of
2667 // those, the target's already in a register, so we don't need to do
2668 // anything extra.
2669 if (isa<GlobalAddressSDNode>(Callee)) {
2670 if (Subtarget->genExecuteOnly()) {
2671 if (Subtarget->useMovt())
2672 ++NumMovwMovt;
2673 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2674 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2675 } else {
2676 // Create a constant pool entry for the callee address
2677 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2679 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2680
2681 // Get the address of the callee into a register
2682 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2683 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2684 Callee = DAG.getLoad(
2685 PtrVt, dl, DAG.getEntryNode(), Addr,
2687 }
2688 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2689 const char *Sym = S->getSymbol();
2690
2691 if (Subtarget->genExecuteOnly()) {
2692 if (Subtarget->useMovt())
2693 ++NumMovwMovt;
2694 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2695 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2696 } else {
2697 // Create a constant pool entry for the callee address
2698 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2700 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2701
2702 // Get the address of the callee into a register
2703 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2704 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2705 Callee = DAG.getLoad(
2706 PtrVt, dl, DAG.getEntryNode(), Addr,
2708 }
2709 }
2710 } else if (isa<GlobalAddressSDNode>(Callee)) {
2711 if (!PreferIndirect) {
2712 isDirect = true;
2713 bool isDef = GVal->isStrongDefinitionForLinker();
2714
2715 // ARM call to a local ARM function is predicable.
2716 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2717 // tBX takes a register source operand.
2718 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2719 assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
2720 Callee = DAG.getNode(
2721 ARMISD::WrapperPIC, dl, PtrVt,
2722 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2723 Callee = DAG.getLoad(
2724 PtrVt, dl, DAG.getEntryNode(), Callee,
2728 } else if (Subtarget->isTargetCOFF()) {
2729 assert(Subtarget->isTargetWindows() &&
2730 "Windows is the only supported COFF target");
2731 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2732 if (GVal->hasDLLImportStorageClass())
2733 TargetFlags = ARMII::MO_DLLIMPORT;
2734 else if (!TM.shouldAssumeDSOLocal(*GVal->getParent(), GVal))
2735 TargetFlags = ARMII::MO_COFFSTUB;
2736 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2737 TargetFlags);
2738 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2739 Callee =
2740 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2741 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2743 } else {
2744 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2745 }
2746 }
2747 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2748 isDirect = true;
2749 // tBX takes a register source operand.
2750 const char *Sym = S->getSymbol();
2751 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2752 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2755 ARMPCLabelIndex, 4);
2756 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2757 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2758 Callee = DAG.getLoad(
2759 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2761 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2762 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2763 } else {
2764 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2765 }
2766 }
2767
2768 if (isCmseNSCall) {
2769 assert(!isARMFunc && !isDirect &&
2770 "Cannot handle call to ARM function or direct call");
2771 if (NumBytes > 0) {
2773 "call to non-secure function would "
2774 "require passing arguments on stack",
2775 dl.getDebugLoc());
2776 DAG.getContext()->diagnose(Diag);
2777 }
2778 if (isStructRet) {
2781 "call to non-secure function would return value through pointer",
2782 dl.getDebugLoc());
2783 DAG.getContext()->diagnose(Diag);
2784 }
2785 }
2786
2787 // FIXME: handle tail calls differently.
2788 unsigned CallOpc;
2789 if (Subtarget->isThumb()) {
2790 if (GuardWithBTI)
2791 CallOpc = ARMISD::t2CALL_BTI;
2792 else if (isCmseNSCall)
2793 CallOpc = ARMISD::tSECALL;
2794 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2795 CallOpc = ARMISD::CALL_NOLINK;
2796 else
2797 CallOpc = ARMISD::CALL;
2798 } else {
2799 if (!isDirect && !Subtarget->hasV5TOps())
2800 CallOpc = ARMISD::CALL_NOLINK;
2801 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2802 // Emit regular call when code size is the priority
2803 !Subtarget->hasMinSize())
2804       // "mov lr, pc; b _foo" to avoid confusing the return stack predictor (RSP)
2805 CallOpc = ARMISD::CALL_NOLINK;
2806 else
2807 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2808 }
2809
2810 // We don't usually want to end the call-sequence here because we would tidy
2811 // the frame up *after* the call, however in the ABI-changing tail-call case
2812 // we've carefully laid out the parameters so that when sp is reset they'll be
2813 // in the correct location.
2814 if (isTailCall && !isSibCall) {
2815 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2816 InGlue = Chain.getValue(1);
2817 }
2818
2819 std::vector<SDValue> Ops;
2820 Ops.push_back(Chain);
2821 Ops.push_back(Callee);
2822
2823 if (isTailCall) {
2824 Ops.push_back(DAG.getTargetConstant(SPDiff, dl, MVT::i32));
2825 }
2826
2827 // Add argument registers to the end of the list so that they are known live
2828 // into the call.
2829 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
2830 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
2831 RegsToPass[i].second.getValueType()));
2832
2833 // Add a register mask operand representing the call-preserved registers.
2834 const uint32_t *Mask;
2835 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2836 if (isThisReturn) {
2837 // For 'this' returns, use the R0-preserving mask if applicable
2838 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2839 if (!Mask) {
2840 // Set isThisReturn to false if the calling convention is not one that
2841 // allows 'returned' to be modeled in this way, so LowerCallResult does
2842 // not try to pass 'this' straight through
2843 isThisReturn = false;
2844 Mask = ARI->getCallPreservedMask(MF, CallConv);
2845 }
2846 } else
2847 Mask = ARI->getCallPreservedMask(MF, CallConv);
2848
2849 assert(Mask && "Missing call preserved mask for calling convention");
2850 Ops.push_back(DAG.getRegisterMask(Mask));
2851
2852 if (InGlue.getNode())
2853 Ops.push_back(InGlue);
2854
2855 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2856 if (isTailCall) {
2858 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
2859 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2860 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2861 return Ret;
2862 }
2863
2864 // Returns a chain and a flag for retval copy to use.
2865 Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
2866 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2867 InGlue = Chain.getValue(1);
2868 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2869
2870 // If we're guaranteeing tail-calls will be honoured, the callee must
2871 // pop its own argument stack on return. But this call is *not* a tail call so
2872 // we need to undo that after it returns to restore the status-quo.
2873 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2874 uint64_t CalleePopBytes =
2875 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1ULL;
2876
2877 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2878 if (!Ins.empty())
2879 InGlue = Chain.getValue(1);
2880
2881 // Handle result values, copying them out of physregs into vregs that we
2882 // return.
2883 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2884 InVals, isThisReturn,
2885 isThisReturn ? OutVals[0] : SDValue());
2886}
2887
2888/// HandleByVal - Every parameter *after* a byval parameter is passed
2889/// on the stack. Remember the next parameter register to allocate,
2890 /// and then confiscate the rest of the parameter registers to ensure
2891/// this.
2892void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2893 Align Alignment) const {
2894 // Byval (as with any stack) slots are always at least 4 byte aligned.
2895 Alignment = std::max(Alignment, Align(4));
2896
2897 unsigned Reg = State->AllocateReg(GPRArgRegs);
2898 if (!Reg)
2899 return;
2900
2901 unsigned AlignInRegs = Alignment.value() / 4;
2902 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2903 for (unsigned i = 0; i < Waste; ++i)
2904 Reg = State->AllocateReg(GPRArgRegs);
2905
2906 if (!Reg)
2907 return;
2908
2909 unsigned Excess = 4 * (ARM::R4 - Reg);
2910
2911   // Special case when NSAA != SP and the parameter size is greater than the
2912   // size of all remaining GPR regs. In that case we can't split the parameter;
2913   // we must send it all to the stack and set the NCRN to R4, wasting all
2914   // remaining registers.
2915 const unsigned NSAAOffset = State->getStackSize();
2916 if (NSAAOffset != 0 && Size > Excess) {
2917 while (State->AllocateReg(GPRArgRegs))
2918 ;
2919 return;
2920 }
2921
2922   // The first register for the byval parameter is the first register that
2923   // wasn't allocated before this method call, i.e. "Reg".
2924   // If the parameter is small enough to fit in the range [Reg, r4), then the
2925   // end (one past the last) register is Reg + param-size-in-regs; otherwise
2926   // the parameter is split between registers and the stack, and the end
2927   // register is r4 in that case.
2928 unsigned ByValRegBegin = Reg;
2929 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2930 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2931   // Note, the first register was already allocated at the beginning of this
2932   // method; allocate the remaining registers we need.
2933 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2934 State->AllocateReg(GPRArgRegs);
2935 // A byval parameter that is split between registers and memory needs its
2936 // size truncated here.
2937 // In the case where the entire structure fits in registers, we set the
2938 // size in memory to zero.
2939 Size = std::max<int>(Size - Excess, 0);
2940}
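// Worked example (sketch): a 12-byte byval with 8-byte alignment whose first
// free GPR is r1 (and with NSAA == SP): r1 is wasted to satisfy the alignment,
// the byval occupies r2-r3, and Size is reduced to the 4 bytes that spill to
// the stack.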
2941
2942/// MatchingStackOffset - Return true if the given stack call argument is
2943 /// already available in the same relative position in the caller's
2944 /// incoming argument stack.
2945static
2948 const TargetInstrInfo *TII) {
2949 unsigned Bytes = Arg.getValueSizeInBits() / 8;
2950 int FI = std::numeric_limits<int>::max();
2951 if (Arg.getOpcode() == ISD::CopyFromReg) {
2952 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2953 if (!VR.isVirtual())
2954 return false;
2955 MachineInstr *Def = MRI->getVRegDef(VR);
2956 if (!Def)
2957 return false;
2958 if (!Flags.isByVal()) {
2959 if (!TII->isLoadFromStackSlot(*Def, FI))
2960 return false;
2961 } else {
2962 return false;
2963 }
2964 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2965 if (Flags.isByVal())
2966 // ByVal argument is passed in as a pointer but it's now being
2967 // dereferenced. e.g.
2968 // define @foo(%struct.X* %A) {
2969 // tail call @bar(%struct.X* byval %A)
2970 // }
2971 return false;
2972 SDValue Ptr = Ld->getBasePtr();
2973 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2974 if (!FINode)
2975 return false;
2976 FI = FINode->getIndex();
2977 } else
2978 return false;
2979
2980 assert(FI != std::numeric_limits<int>::max());
2981 if (!MFI.isFixedObjectIndex(FI))
2982 return false;
2983 return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
2984}
2985
2986/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2987/// for tail call optimization. Targets which want to do tail call
2988/// optimization should implement this function.
2989bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2990 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
2991 bool isCalleeStructRet, bool isCallerStructRet,
2993 const SmallVectorImpl<SDValue> &OutVals,
2995 const bool isIndirect) const {
2997 const Function &CallerF = MF.getFunction();
2998 CallingConv::ID CallerCC = CallerF.getCallingConv();
2999
3000 assert(Subtarget->supportsTailCall());
3001
3002 // Indirect tail calls cannot be optimized for Thumb1 if the args
3003 // to the call take up r0-r3. The reason is that there are no legal registers
3004 // left to hold the pointer to the function to be called.
3005 // Similarly, if the function uses return address sign and authentication,
3006 // r12 is needed to hold the PAC and is not available to hold the callee
3007 // address.
3008 if (Outs.size() >= 4 &&
3009 (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect)) {
3010 if (Subtarget->isThumb1Only())
3011 return false;
3012 // Conservatively assume the function spills LR.
3014 return false;
3015 }
3016
3017 // Look for obvious safe cases to perform tail call optimization that do not
3018 // require ABI changes. This is what gcc calls sibcall.
3019
3020 // Exception-handling functions need a special set of instructions to indicate
3021 // a return to the hardware. Tail-calling another function would probably
3022 // break this.
3023 if (CallerF.hasFnAttribute("interrupt"))
3024 return false;
3025
3026 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
3027 return CalleeCC == CallerCC;
3028
3029 // Also avoid sibcall optimization if either caller or callee uses struct
3030 // return semantics.
3031 if (isCalleeStructRet || isCallerStructRet)
3032 return false;
3033
3034 // Externally-defined functions with weak linkage should not be
3035 // tail-called on ARM when the OS does not support dynamic
3036 // pre-emption of symbols, as the AAELF spec requires normal calls
3037 // to undefined weak functions to be replaced with a NOP or jump to the
3038 // next instruction. The behaviour of branch instructions in this
3039 // situation (as used for tail calls) is implementation-defined, so we
3040 // cannot rely on the linker replacing the tail call with a return.
3041 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
3042 const GlobalValue *GV = G->getGlobal();
3044 if (GV->hasExternalWeakLinkage() &&
3045 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
3046 return false;
3047 }
3048
3049 // Check that the call results are passed in the same way.
3050 LLVMContext &C = *DAG.getContext();
3052 getEffectiveCallingConv(CalleeCC, isVarArg),
3053 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
3054 CCAssignFnForReturn(CalleeCC, isVarArg),
3055 CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
3056 return false;
3057 // The callee has to preserve all registers the caller needs to preserve.
3058 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3059 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3060 if (CalleeCC != CallerCC) {
3061 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3062 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3063 return false;
3064 }
3065
3066 // If Caller's vararg or byval argument has been split between registers and
3067 // stack, do not perform tail call, since part of the argument is in caller's
3068 // local frame.
3069 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3070 if (AFI_Caller->getArgRegsSaveSize())
3071 return false;
3072
3073 // If the callee takes no arguments then go on to check the results of the
3074 // call.
3075 if (!Outs.empty()) {
3076 // Check if stack adjustment is needed. For now, do not do this if any
3077 // argument is passed on the stack.
3079 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3080 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
3081 if (CCInfo.getStackSize()) {
3082 // Check if the arguments are already laid out in the right way as
3083 // the caller's fixed stack objects.
3084 MachineFrameInfo &MFI = MF.getFrameInfo();
3085 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3086 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3087 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3088 i != e;
3089 ++i, ++realArgIdx) {
3090 CCValAssign &VA = ArgLocs[i];
3091 EVT RegVT = VA.getLocVT();
3092 SDValue Arg = OutVals[realArgIdx];
3093 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3095 return false;
3096 if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3097 // f64 and vector types are split into multiple registers or
3098 // register/stack-slot combinations. The types will not match
3099 // the registers; give up on memory f64 refs until we figure
3100 // out what to do about this.
3101 if (!VA.isRegLoc())
3102 return false;
3103 if (!ArgLocs[++i].isRegLoc())
3104 return false;
3105 if (RegVT == MVT::v2f64) {
3106 if (!ArgLocs[++i].isRegLoc())
3107 return false;
3108 if (!ArgLocs[++i].isRegLoc())
3109 return false;
3110 }
3111 } else if (!VA.isRegLoc()) {
3112 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3113 MFI, MRI, TII))
3114 return false;
3115 }
3116 }
3117 }
3118
3119 const MachineRegisterInfo &MRI = MF.getRegInfo();
3120 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
3121 return false;
3122 }
3123
3124 return true;
3125}
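// A simple case that passes the checks above (sketch):
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @callee(i32 %x)
//     ret i32 %r
//   }
// Same calling convention, no sret, the argument fits in r0, and nothing has
// been split between registers and the caller's stack.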
3126
3127bool
3128ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
3129 MachineFunction &MF, bool isVarArg,
3131 LLVMContext &Context) const {
3133 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3134 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3135}
3136
3138 const SDLoc &DL, SelectionDAG &DAG) {
3139 const MachineFunction &MF = DAG.getMachineFunction();
3140 const Function &F = MF.getFunction();
3141
3142 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
3143
3144 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
3145 // version of the "preferred return address". These offsets affect the return
3146 // instruction if this is a return from PL1 without hypervisor extensions.
3147 // IRQ/FIQ: +4 "subs pc, lr, #4"
3148 // SWI: 0 "subs pc, lr, #0"
3149 // ABORT: +4 "subs pc, lr, #4"
3150 // UNDEF: +4/+2 "subs pc, lr, #0"
3151   // UNDEF varies depending on whether the exception came from ARM or Thumb
3152   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
3153
3154 int64_t LROffset;
3155 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
3156 IntKind == "ABORT")
3157 LROffset = 4;
3158 else if (IntKind == "SWI" || IntKind == "UNDEF")
3159 LROffset = 0;
3160 else
3161 report_fatal_error("Unsupported interrupt attribute. If present, value "
3162 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
3163
3164 RetOps.insert(RetOps.begin() + 1,
3165 DAG.getConstant(LROffset, DL, MVT::i32, false));
3166
3167 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
3168}
3169
3170SDValue
3171ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3172 bool isVarArg,
3174 const SmallVectorImpl<SDValue> &OutVals,
3175 const SDLoc &dl, SelectionDAG &DAG) const {
3176 // CCValAssign - represent the assignment of the return value to a location.
3178
3179 // CCState - Info about the registers and stack slots.
3180 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3181 *DAG.getContext());
3182
3183 // Analyze outgoing return values.
3184 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3185
3186 SDValue Glue;
3188 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3189 bool isLittleEndian = Subtarget->isLittle();
3190
3193 AFI->setReturnRegsCount(RVLocs.size());
3194
3195 // Report error if cmse entry function returns structure through first ptr arg.
3196 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
3197 // Note: using an empty SDLoc(), as the first line of the function is a
3198 // better place to report than the last line.
3201 "secure entry function would return value through pointer",
3202 SDLoc().getDebugLoc());
3203 DAG.getContext()->diagnose(Diag);
3204 }
3205
3206 // Copy the result values into the output registers.
3207 for (unsigned i = 0, realRVLocIdx = 0;
3208 i != RVLocs.size();
3209 ++i, ++realRVLocIdx) {
3210 CCValAssign &VA = RVLocs[i];
3211 assert(VA.isRegLoc() && "Can only return in registers!");
3212
3213 SDValue Arg = OutVals[realRVLocIdx];
3214 bool ReturnF16 = false;
3215
3216 if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
3217 // Half-precision return values can be returned like this:
3218 //
3219 // t11 f16 = fadd ...
3220 // t12: i16 = bitcast t11
3221 // t13: i32 = zero_extend t12
3222 // t14: f32 = bitcast t13 <~~~~~~~ Arg
3223 //
3224 // to avoid code generation for bitcasts, we simply set Arg to the node
3225 // that produces the f16 value, t11 in this case.
3226 //
3227 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
3228 SDValue ZE = Arg.getOperand(0);
3229 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
3230 SDValue BC = ZE.getOperand(0);
3231 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
3232 Arg = BC.getOperand(0);
3233 ReturnF16 = true;
3234 }
3235 }
3236 }
3237 }
3238
3239 switch (VA.getLocInfo()) {
3240 default: llvm_unreachable("Unknown loc info!");
3241 case CCValAssign::Full: break;
3242 case CCValAssign::BCvt:
3243 if (!ReturnF16)
3244 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3245 break;
3246 }
3247
3248 // Mask f16 arguments if this is a CMSE nonsecure entry.
3249 auto RetVT = Outs[realRVLocIdx].ArgVT;
3250 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
3251 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
3252 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
3253 } else {
3254 auto LocBits = VA.getLocVT().getSizeInBits();
3255 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
3256 SDValue Mask =
3257 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
3258 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
3259 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
3260 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
3261 }
3262 }
3263
3264 if (VA.needsCustom() &&
3265 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
3266 if (VA.getLocVT() == MVT::v2f64) {
3267 // Extract the first half and return it in two registers.
3268 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3269 DAG.getConstant(0, dl, MVT::i32));
3270 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3271 DAG.getVTList(MVT::i32, MVT::i32), Half);
3272
3273 Chain =
3274 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3275 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3276 Glue = Chain.getValue(1);
3277 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3278 VA = RVLocs[++i]; // skip ahead to next loc
3279 Chain =
3280 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3281 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3282 Glue = Chain.getValue(1);
3283 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3284 VA = RVLocs[++i]; // skip ahead to next loc
3285
3286 // Extract the 2nd half and fall through to handle it as an f64 value.
3287 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3288 DAG.getConstant(1, dl, MVT::i32));
3289 }
3290 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3291 // available.
3292 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3293 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3294 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3295 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3296 Glue = Chain.getValue(1);
3297 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3298 VA = RVLocs[++i]; // skip ahead to next loc
3299 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3300 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3301 } else
3302 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3303
3304     // Guarantee that all emitted copies are glued together, so they cannot
3305     // be scheduled apart later.
3306 Glue = Chain.getValue(1);
3307 RetOps.push_back(DAG.getRegister(
3308 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3309 }
3310 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3311 const MCPhysReg *I =
3312 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3313 if (I) {
3314 for (; *I; ++I) {
3315 if (ARM::GPRRegClass.contains(*I))
3316 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3317 else if (ARM::DPRRegClass.contains(*I))
3319 else
3320 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3321 }
3322 }
3323
3324 // Update chain and glue.
3325 RetOps[0] = Chain;
3326 if (Glue.getNode())
3327 RetOps.push_back(Glue);
3328
3329 // CPUs which aren't M-class use a special sequence to return from
3330 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3331 // though we use "subs pc, lr, #N").
3332 //
3333 // M-class CPUs actually use a normal return sequence with a special
3334 // (hardware-provided) value in LR, so the normal code path works.
3335 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3336 !Subtarget->isMClass()) {
3337 if (Subtarget->isThumb1Only())
3338 report_fatal_error("interrupt attribute is not supported in Thumb1");
3339 return LowerInterruptReturn(RetOps, dl, DAG);
3340 }
3341
3344 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3345}
3346
3347bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3348 if (N->getNumValues() != 1)
3349 return false;
3350 if (!N->hasNUsesOfValue(1, 0))
3351 return false;
3352
3353 SDValue TCChain = Chain;
3354 SDNode *Copy = *N->use_begin();
3355 if (Copy->getOpcode() == ISD::CopyToReg) {
3356 // If the copy has a glue operand, we conservatively assume it isn't safe to
3357 // perform a tail call.
3358 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3359 return false;
3360 TCChain = Copy->getOperand(0);
3361 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3362 SDNode *VMov = Copy;
3363 // f64 returned in a pair of GPRs.
3365 for (SDNode *U : VMov->uses()) {
3366 if (U->getOpcode() != ISD::CopyToReg)
3367 return false;
3368 Copies.insert(U);
3369 }
3370 if (Copies.size() > 2)
3371 return false;
3372
3373 for (SDNode *U : VMov->uses()) {
3374 SDValue UseChain = U->getOperand(0);
3375 if (Copies.count(UseChain.getNode()))
3376 // Second CopyToReg
3377 Copy = U;
3378 else {
3379 // We are at the top of this chain.
3380 // If the copy has a glue operand, we conservatively assume it
3381 // isn't safe to perform a tail call.
3382 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3383 return false;
3384 // First CopyToReg
3385 TCChain = UseChain;
3386 }
3387 }
3388 } else if (Copy->getOpcode() == ISD::BITCAST) {
3389 // f32 returned in a single GPR.
3390 if (!Copy->hasOneUse())
3391 return false;
3392 Copy = *Copy->use_begin();
3393 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3394 return false;
3395 // If the copy has a glue operand, we conservatively assume it isn't safe to
3396 // perform a tail call.
3397 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3398 return false;
3399 TCChain = Copy->getOperand(0);
3400 } else {
3401 return false;
3402 }
3403
3404 bool HasRet = false;
3405 for (const SDNode *U : Copy->uses()) {
3406 if (U->getOpcode() != ARMISD::RET_GLUE &&
3407 U->getOpcode() != ARMISD::INTRET_GLUE)
3408 return false;
3409 HasRet = true;
3410 }
3411
3412 if (!HasRet)
3413 return false;
3414
3415 Chain = TCChain;
3416 return true;
3417}
3418
3419bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3420 if (!Subtarget->supportsTailCall())
3421 return false;
3422
3423 if (!CI->isTailCall())
3424 return false;
3425
3426 return true;
3427}
3428
3429 // We are trying to write a 64-bit value, so we need to split it into two
3430 // 32-bit values first and pass the low and high parts through.
3432 SDLoc DL(Op);
3433 SDValue WriteValue = Op->getOperand(2);
3434
3435 // This function is only supposed to be called for i64 type argument.
3436 assert(WriteValue.getValueType() == MVT::i64
3437 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3438
3439 SDValue Lo, Hi;
3440 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3441 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3442 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3443}
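// Sketch: for an i64 llvm.write_register call, the value %v is split so that
// Lo == %v[31:0] and Hi == %v[63:32] become two separate i32 operands of the
// rebuilt WRITE_REGISTER node.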
3444
3445// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3446// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3447// one of the above mentioned nodes. It has to be wrapped because otherwise
3448// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3449// be used to form addressing mode. These wrapped nodes will be selected
3450// into MOVi.
3451SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3452 SelectionDAG &DAG) const {
3453 EVT PtrVT = Op.getValueType();
3454 // FIXME there is no actual debug info here
3455 SDLoc dl(Op);
3456 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3457 SDValue Res;
3458
3459 // When generating execute-only code Constant Pools must be promoted to the
3460 // global data section. It's a bit ugly that we can't share them across basic
3461   // blocks, but this way we guarantee that execute-only behaves correctly with
3462 // position-independent addressing modes.
3463 if (Subtarget->genExecuteOnly()) {
3464 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3465 auto T = const_cast<Type*>(CP->getType());
3466 auto C = const_cast<Constant*>(CP->getConstVal());
3467 auto M = const_cast<Module*>(DAG.getMachineFunction().
3469 auto GV = new GlobalVariable(
3470 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3473 Twine(AFI->createPICLabelUId())
3474 );
3475 SDValue GA = DAG.getTargetGlobalAddress(dyn_cast<GlobalValue>(GV),
3476 dl, PtrVT);
3477 return LowerGlobalAddress(GA, DAG);
3478 }
3479
3480 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3481 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3482 Align CPAlign = CP->getAlign();
3483 if (Subtarget->isThumb1Only())
3484 CPAlign = std::max(CPAlign, Align(4));
3485 if (CP->isMachineConstantPoolEntry())
3486 Res =
3487 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3488 else
3489 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3490 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3491}
3492
3494 // If we don't have a 32-bit pc-relative branch instruction then the jump
3495 // table consists of block addresses. Usually this is inline, but for
3496 // execute-only it must be placed out-of-line.
3497 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3500}
3501
3502SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3503 SelectionDAG &DAG) const {
3506 unsigned ARMPCLabelIndex = 0;
3507 SDLoc DL(Op);
3508 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3509 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3510 SDValue CPAddr;
3511 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3512 if (!IsPositionIndependent) {
3513 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3514 } else {
3515 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3516 ARMPCLabelIndex = AFI->createPICLabelUId();
3518 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3519 ARMCP::CPBlockAddress, PCAdj);
3520 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3521 }
3522 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3523 SDValue Result = DAG.getLoad(
3524 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3526 if (!IsPositionIndependent)
3527 return Result;
3528 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3529 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3530}
3531
3532/// Convert a TLS address reference into the correct sequence of loads
3533/// and calls to compute the variable's address for Darwin, and return an
3534/// SDValue containing the final node.
3535
3536/// Darwin only has one TLS scheme which must be capable of dealing with the
3537/// fully general situation, in the worst case. This means:
3538/// + "extern __thread" declaration.
3539/// + Defined in a possibly unknown dynamic library.
3540///
3541/// The general system is that each __thread variable has a [3 x i32] descriptor
3542/// which contains information used by the runtime to calculate the address. The
3543/// only part of this the compiler needs to know about is the first word, which
3544/// contains a function pointer that must be called with the address of the
3545/// entire descriptor in "r0".
3546///
3547/// Since this descriptor may be in a different unit, in general access must
3548/// proceed along the usual ARM rules. A common sequence to produce is:
3549///
3550/// movw rT1, :lower16:_var$non_lazy_ptr
3551/// movt rT1, :upper16:_var$non_lazy_ptr
3552/// ldr r0, [rT1]
3553/// ldr rT2, [r0]
3554/// blx rT2
3555/// [...address now in r0...]
3556SDValue
3557ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3558 SelectionDAG &DAG) const {
3559 assert(Subtarget->isTargetDarwin() &&
3560 "This function expects a Darwin target");
3561 SDLoc DL(Op);
3562
3563   // The first step is to get the address of the actual global symbol. This is
3564 // the TLS descriptor lives.
3565 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3566
3567 // The first entry in the descriptor is a function pointer that we must call
3568 // to obtain the address of the variable.
3569 SDValue Chain = DAG.getEntryNode();
3570 SDValue FuncTLVGet = DAG.getLoad(
3571 MVT::i32, DL, Chain, DescAddr,
3575 Chain = FuncTLVGet.getValue(1);
3576
3578 MachineFrameInfo &MFI = F.getFrameInfo();
3579 MFI.setAdjustsStack(true);
3580
3581 // TLS calls preserve all registers except those that absolutely must be
3582 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3583 // silly).
3584 auto TRI =
3586 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3588
3589 // Finally, we can make the call. This is just a degenerate version of a
3590   // normal ARM call node: r0 takes the address of the descriptor, and
3591 // returns the address of the variable in this thread.
3592 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3593 Chain =
3594 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3595 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3596 DAG.getRegisterMask(Mask), Chain.getValue(1));
3597 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3598}
3599
3600SDValue
3601ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3602 SelectionDAG &DAG) const {
3603 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
3604
3605 SDValue Chain = DAG.getEntryNode();
3606 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3607 SDLoc DL(Op);
3608
3609 // Load the current TEB (thread environment block)
3610 SDValue Ops[] = {Chain,
3611 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3612 DAG.getTargetConstant(15, DL, MVT::i32),
3613 DAG.getTargetConstant(0, DL, MVT::i32),
3614 DAG.getTargetConstant(13, DL, MVT::i32),
3615 DAG.getTargetConstant(0, DL, MVT::i32),
3616 DAG.getTargetConstant(2, DL, MVT::i32)};
3617 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3618 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3619
3620 SDValue TEB = CurrentTEB.getValue(0);
3621 Chain = CurrentTEB.getValue(1);
3622
3623 // Load the ThreadLocalStoragePointer from the TEB
3624 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3625 SDValue TLSArray =
3626 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3627 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3628
3629   // The pointer to the thread's TLS data area is found at the TLS index,
3630   // scaled by 4, into the TLSArray.
3631
3632 // Load the TLS index from the C runtime
3633 SDValue TLSIndex =
3634 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3635 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3636 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3637
3638 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3639 DAG.getConstant(2, DL, MVT::i32));
3640 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3641 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3643
3644 // Get the offset of the start of the .tls section (section base)
3645 const auto *GA = cast<GlobalAddressSDNode>(Op);
3646 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3647 SDValue Offset = DAG.getLoad(
3648 PtrVT, DL, Chain,
3649 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3650 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3652
3653 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3654}
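// Rough shape of the access computed above (sketch):
//   TEB      = mrc p15, #0, <rt>, c13, c0, #2
//   TLSArray = load(TEB + 0x2c)
//   TLSBlock = load(TLSArray + _tls_index * 4)
//   result   = TLSBlock + <SECREL offset of the variable>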