1//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file contains the AArch64 implementation of TargetFrameLowering class.
10//
11// On AArch64, stack frames are structured as follows:
12//
13// The stack grows downward.
14//
15// All of the individual frame areas on the frame below are optional, i.e. it's
16// possible to create a function so that the particular area isn't present
17// in the frame.
18//
19// At function entry, the "frame" looks as follows:
20//
21// | | Higher address
22// |-----------------------------------|
23// | |
24// | arguments passed on the stack |
25// | |
26// |-----------------------------------| <- sp
27// | | Lower address
28//
29//
30// After the prologue has run, the frame has the following general structure.
31// Note that this doesn't depict the case where a red-zone is used. Also,
32// technically the last frame area (VLAs) doesn't get created until the
33// main function body, after the prologue has run. However, it's depicted here
34// for completeness.
35//
36// | | Higher address
37// |-----------------------------------|
38// | |
39// | arguments passed on the stack |
40// | |
41// |-----------------------------------|
42// | |
43// | (Win64 only) varargs from reg |
44// | |
45// |-----------------------------------|
46// | |
47// | callee-saved gpr registers | <--.
48// | | | On Darwin platforms these
49// |- - - - - - - - - - - - - - - - - -| | callee saves are swapped,
50// | prev_lr | | (frame record first)
51// | prev_fp | <--'
52// | async context if needed |
53// | (a.k.a. "frame record") |
54// |-----------------------------------| <- fp(=x29)
55// | <hazard padding> |
56// |-----------------------------------|
57// | |
58// | callee-saved fp/simd/SVE regs |
59// | |
60// |-----------------------------------|
61// | |
62// | SVE stack objects |
63// | |
64// |-----------------------------------|
65// |.empty.space.to.make.part.below....|
66// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
67// |.the.standard.16-byte.alignment....| compile time; if present)
68// |-----------------------------------|
69// | local variables of fixed size |
70// | including spill slots |
71// | <FPR> |
72// | <hazard padding> |
73// | <GPR> |
74// |-----------------------------------| <- bp(not defined by ABI,
75// |.variable-sized.local.variables....| LLVM chooses X19)
76// |.(VLAs)............................| (size of this area is unknown at
77// |...................................| compile time)
78// |-----------------------------------| <- sp
79// | | Lower address
80//
81//
82// To access data in a frame, a constant offset from one of the pointers
83// (fp, bp, sp) to that data must be computable at compile time. The sizes
84// of the areas with a dotted background cannot be computed at compile time
85// if those areas are present, so all three of fp, bp and sp must be set up
86// in order to access all contents of the frame areas, assuming all of the
87// frame areas are non-empty.
88//
89// For most functions, some of the frame areas are empty. For those functions,
90// it may not be necessary to set up fp or bp:
91// * A base pointer is definitely needed when there are both VLAs and local
92// variables with more-than-default alignment requirements.
93// * A frame pointer is definitely needed when there are local variables with
94// more-than-default alignment requirements.
95//
96// For Darwin platforms the frame-record (fp, lr) is stored at the top of the
97// callee-saved area, since the unwind encoding does not allow for encoding
98// this dynamically and existing tools depend on this layout. For other
99// platforms, the frame-record is stored at the bottom of the (gpr) callee-saved
100// area to allow SVE stack objects (allocated directly below the callee-saves,
101// if available) to be accessed directly from the framepointer.
102// The SVE spill/fill instructions have VL-scaled addressing modes such
103// as:
104// ldr z8, [fp, #-7 mul vl]
105// For SVE the size of the vector length (VL) is not known at compile-time, so
106// '#-7 mul vl' is an offset that can only be evaluated at runtime. With this
107// layout, we don't need to add an unscaled offset to the framepointer before
108// accessing the SVE object in the frame.
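// As an illustrative sketch (the register numbers and offsets are made up and
// not necessarily what the compiler emits), with the frame record at the
// bottom of the GPR callee-saves an SVE callee-save is reachable with a single
// VL-scaled instruction:
//   ldr z9, [fp, #-2 mul vl]
// whereas if fixed-size data sat between fp and the SVE area, an extra
// unscaled adjustment would be needed first:
//   sub x8, fp, #16
//   ldr z9, [x8, #-2 mul vl]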
109//
110// In some cases when a base pointer is not strictly needed, it is generated
111// anyway when offsets from the frame pointer to access local variables become
112// so large that the offset can't be encoded in the immediate fields of loads
113// or stores.
114//
115// Outgoing function arguments must be at the bottom of the stack frame when
116// calling another function. If we do not have variable-sized stack objects, we
117// can allocate a "reserved call frame" area at the bottom of the local
118// variable area, large enough for all outgoing calls. If we do have VLAs, then
119// the stack pointer must be decremented and incremented around each call to
120// make space for the arguments below the VLAs.
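// As a rough sketch (sizes, registers and the callee name are illustrative
// only), with a reserved call frame the prologue allocates the outgoing
// argument area once:
//   sub sp, sp, #<locals + largest outgoing call frame>
//   ...
//   str x0, [sp]              // outgoing stack argument written directly
//   bl  callee
// With VLAs present, each call site brackets the call instead:
//   sub sp, sp, #16           // create the argument area below the VLAs
//   str x0, [sp]
//   bl  callee
//   add sp, sp, #16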
121//
122// FIXME: also explain the redzone concept.
123//
124// About stack hazards: Under some SME contexts, a coprocessor with its own
125// separate cache can be used for FP operations. This can create hazards if the CPU
126// and the SME unit try to access the same area of memory, including if the
127// access is to an area of the stack. To try to alleviate this we attempt to
128// introduce extra padding into the stack frame between FP and GPR accesses,
129// controlled by the aarch64-stack-hazard-size option. Without changing the
130// layout of the stack frame in the diagram above, a stack object of size
131// aarch64-stack-hazard-size is added between GPR and FPR CSRs. Another is added
132// to the stack objects section, and stack objects are sorted so that FPR >
133// Hazard padding slot > GPRs (where possible). Unfortunately some things are
134// not handled well (VLA area, arguments on the stack, objects with both GPR and
135// FPR accesses), but if those are controlled by the user then the entire stack
136// frame becomes GPR at the start/end with FPR in the middle, surrounded by
137// Hazard padding.
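// For example, the padding described above can be requested when running the
// backend directly, e.g. "llc -aarch64-stack-hazard-size=1024", or from clang
// via "-mllvm -aarch64-stack-hazard-size=1024" (illustrative invocations; the
// exact driver spelling may vary).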
138//
139// An example of the prologue:
140//
141// .globl __foo
142// .align 2
143// __foo:
144// Ltmp0:
145// .cfi_startproc
146// .cfi_personality 155, ___gxx_personality_v0
147// Leh_func_begin:
148// .cfi_lsda 16, Lexception33
149//
150//     stp  x<a>, x<b>, [sp, #-offset]!
151// ...
152// stp x28, x27, [sp, #offset-32]
153// stp fp, lr, [sp, #offset-16]
154// add fp, sp, #offset - 16
155// sub sp, sp, #1360
156//
157// The Stack:
158// +-------------------------------------------+
159// 10000 | ........ | ........ | ........ | ........ |
160// 10004 | ........ | ........ | ........ | ........ |
161// +-------------------------------------------+
162// 10008 | ........ | ........ | ........ | ........ |
163// 1000c | ........ | ........ | ........ | ........ |
164// +===========================================+
165// 10010 | X28 Register |
166// 10014 | X28 Register |
167// +-------------------------------------------+
168// 10018 | X27 Register |
169// 1001c | X27 Register |
170// +===========================================+
171// 10020 | Frame Pointer |
172// 10024 | Frame Pointer |
173// +-------------------------------------------+
174// 10028 | Link Register |
175// 1002c | Link Register |
176// +===========================================+
177// 10030 | ........ | ........ | ........ | ........ |
178// 10034 | ........ | ........ | ........ | ........ |
179// +-------------------------------------------+
180// 10038 | ........ | ........ | ........ | ........ |
181// 1003c | ........ | ........ | ........ | ........ |
182// +-------------------------------------------+
183//
184// [sp] = 10030 :: >>initial value<<
185// sp = 10020 :: stp fp, lr, [sp, #-16]!
186// fp = sp == 10020 :: mov fp, sp
187// [sp] == 10020 :: stp x28, x27, [sp, #-16]!
188// sp == 10010 :: >>final value<<
189//
190// The frame pointer (w29) points to address 10020. If we use an offset of
191// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
192// for w27, and -32 for w28:
193//
194// Ltmp1:
195// .cfi_def_cfa w29, 16
196// Ltmp2:
197// .cfi_offset w30, -8
198// Ltmp3:
199// .cfi_offset w29, -16
200// Ltmp4:
201// .cfi_offset w27, -24
202// Ltmp5:
203// .cfi_offset w28, -32
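// (These offsets follow from the picture above: the CFA is w29 + 16 = 10030;
// the link register is saved at 10028 = CFA - 8, the frame pointer at
// 10020 = CFA - 16, x27 at 10018 = CFA - 24 and x28 at 10010 = CFA - 32.)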
204//
205//===----------------------------------------------------------------------===//
206
207#include "AArch64FrameLowering.h"
208#include "AArch64InstrInfo.h"
210#include "AArch64RegisterInfo.h"
211#include "AArch64Subtarget.h"
215#include "llvm/ADT/ScopeExit.h"
216#include "llvm/ADT/SmallVector.h"
217#include "llvm/ADT/Statistic.h"
234#include "llvm/IR/Attributes.h"
235#include "llvm/IR/CallingConv.h"
236#include "llvm/IR/DataLayout.h"
237#include "llvm/IR/DebugLoc.h"
238#include "llvm/IR/Function.h"
239#include "llvm/MC/MCAsmInfo.h"
240#include "llvm/MC/MCDwarf.h"
242#include "llvm/Support/Debug.h"
249#include <cassert>
250#include <cstdint>
251#include <iterator>
252#include <optional>
253#include <vector>
254
255using namespace llvm;
256
257#define DEBUG_TYPE "frame-info"
258
259static cl::opt<bool> EnableRedZone("aarch64-redzone",
260 cl::desc("enable use of redzone on AArch64"),
261 cl::init(false), cl::Hidden);
262
264 "stack-tagging-merge-settag",
265 cl::desc("merge settag instruction in function epilog"), cl::init(true),
266 cl::Hidden);
267
268static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
269 cl::desc("sort stack allocations"),
270 cl::init(true), cl::Hidden);
271
273 "homogeneous-prolog-epilog", cl::Hidden,
274 cl::desc("Emit homogeneous prologue and epilogue for the size "
275 "optimization (default = off)"));
276
277// Stack hazard size for analysis remarks. StackHazardSize takes precedence.
279 StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0),
280 cl::Hidden);
281// Whether to insert padding into non-streaming functions (for testing).
282static cl::opt<bool>
283 StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
284 cl::init(false), cl::Hidden);
285
287 "aarch64-disable-multivector-spill-fill",
288 cl::desc("Disable use of LD/ST pairs for SME2 or SVE2p1"), cl::init(false),
289 cl::Hidden);
290
291STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
292
293/// Returns how much of the incoming argument stack area (in bytes) we should
294/// clean up in an epilogue. For the C calling convention this will be 0, for
295/// guaranteed tail call conventions it can be positive (a normal return or a
296/// tail call to a function that uses less stack space for arguments) or
297/// negative (for a tail call to a function that needs more stack space than us
298/// for arguments).
303 bool IsTailCallReturn = (MBB.end() != MBBI)
305 : false;
306
307 int64_t ArgumentPopSize = 0;
308 if (IsTailCallReturn) {
309 MachineOperand &StackAdjust = MBBI->getOperand(1);
310
311 // For a tail-call in a callee-pops-arguments environment, some or all of
312 // the stack may actually be in use for the call's arguments; this is
313 // calculated during LowerCall and consumed here...
314 ArgumentPopSize = StackAdjust.getImm();
315 } else {
316 // ... otherwise the amount to pop is *all* of the argument space,
317 // conveniently stored in the MachineFunctionInfo by
318 // LowerFormalArguments. This will, of course, be zero for the C calling
319 // convention.
320 ArgumentPopSize = AFI->getArgumentStackToRestore();
321 }
322
323 return ArgumentPopSize;
324}
325
327static bool needsWinCFI(const MachineFunction &MF);
330
331/// Returns true if homogeneous prolog or epilog code can be emitted
332/// for the size optimization. If possible, a frame helper call is injected.
333/// When an Exit block is given, this check is for the epilog.
334bool AArch64FrameLowering::homogeneousPrologEpilog(
335 MachineFunction &MF, MachineBasicBlock *Exit) const {
336 if (!MF.getFunction().hasMinSize())
337 return false;
339 return false;
340 if (EnableRedZone)
341 return false;
342
343 // TODO: Windows is not supported yet.
344 if (needsWinCFI(MF))
345 return false;
346 // TODO: SVE is not supported yet.
347 if (getSVEStackSize(MF))
348 return false;
349
350 // Bail on stack adjustment needed on return for simplicity.
351 const MachineFrameInfo &MFI = MF.getFrameInfo();
353 if (MFI.hasVarSizedObjects() || RegInfo->hasStackRealignment(MF))
354 return false;
355 if (Exit && getArgumentStackToRestore(MF, *Exit))
356 return false;
357
358 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
359 if (AFI->hasSwiftAsyncContext() || AFI->hasStreamingModeChanges())
360 return false;
361
362 // If there is an odd number of GPRs before LR and FP in the CSRs list,
363 // they will not be paired into one RegPairInfo, which is incompatible with
364 // the assumption made by the homogeneous prolog epilog pass.
365 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
366 unsigned NumGPRs = 0;
367 for (unsigned I = 0; CSRegs[I]; ++I) {
368 Register Reg = CSRegs[I];
369 if (Reg == AArch64::LR) {
370 assert(CSRegs[I + 1] == AArch64::FP);
371 if (NumGPRs % 2 != 0)
372 return false;
373 break;
374 }
375 if (AArch64::GPR64RegClass.contains(Reg))
376 ++NumGPRs;
377 }
378
379 return true;
380}
381
382/// Returns true if CSRs should be paired.
383bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
384 return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
385}
386
387/// This is the biggest offset to the stack pointer we can encode in aarch64
388/// instructions (without using a separate calculation and a temp register).
389/// Note that the exceptions here are vector stores/loads, which cannot encode any
390/// displacements (see estimateRSStackSizeLimit(), isAArch64FrameOffsetLegal()).
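/// (255 corresponds to the signed 9-bit immediate range of the unscaled
/// LDUR/STUR addressing forms, i.e. offsets in [-256, 255].)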
391static const unsigned DefaultSafeSPDisplacement = 255;
392
393/// Look at each instruction that references stack frames and return the stack
394/// size limit beyond which some of these instructions will require a scratch
395/// register during their expansion later.
397 // FIXME: For now, just conservatively guesstimate based on unscaled indexing
398 // range. We'll end up allocating an unnecessary spill slot a lot, but
399 // realistically that's not a big deal at this stage of the game.
400 for (MachineBasicBlock &MBB : MF) {
401 for (MachineInstr &MI : MBB) {
402 if (MI.isDebugInstr() || MI.isPseudo() ||
403 MI.getOpcode() == AArch64::ADDXri ||
404 MI.getOpcode() == AArch64::ADDSXri)
405 continue;
406
407 for (const MachineOperand &MO : MI.operands()) {
408 if (!MO.isFI())
409 continue;
410
412 if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
414 return 0;
415 }
416 }
417 }
419}
420
424}
425
426/// Returns the size of the fixed object area (allocated next to sp on entry)
427/// On Win64 this may include a var args area and an UnwindHelp object for EH.
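/// For example (illustrative numbers only): a Win64 function that saved 56
/// bytes of varargs registers and contains EH funclets gets
/// alignTo(56 + 8, 16) = 64 bytes here, plus any tail-call reserved stack.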
428static unsigned getFixedObjectSize(const MachineFunction &MF,
429 const AArch64FunctionInfo *AFI, bool IsWin64,
430 bool IsFunclet) {
431 if (!IsWin64 || IsFunclet) {
432 return AFI->getTailCallReservedStack();
433 } else {
434 if (AFI->getTailCallReservedStack() != 0 &&
436 Attribute::SwiftAsync))
437 report_fatal_error("cannot generate ABI-changing tail call for Win64");
438 // Var args are stored here in the primary function.
439 const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
440 // To support EH funclets we allocate an UnwindHelp object
441 const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
442 return AFI->getTailCallReservedStack() +
443 alignTo(VarArgsArea + UnwindHelpObject, 16);
444 }
445}
446
447/// Returns the size of the entire SVE stackframe (calleesaves + spills).
450 return StackOffset::getScalable((int64_t)AFI->getStackSizeSVE());
451}
452
454 if (!EnableRedZone)
455 return false;
456
457 // Don't use the red zone if the function explicitly asks us not to.
458 // This is typically used for kernel code.
459 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
460 const unsigned RedZoneSize =
462 if (!RedZoneSize)
463 return false;
464
465 const MachineFrameInfo &MFI = MF.getFrameInfo();
467 uint64_t NumBytes = AFI->getLocalStackSize();
468
469 // If neither NEON nor SVE is available, a COPY from one Q-reg to
470 // another requires a spill -> reload sequence. We can do that
471 // using a pre-decrementing store/post-decrementing load, but
472 // if we do so, we can't use the Red Zone.
473 bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
474 !Subtarget.isNeonAvailable() &&
475 !Subtarget.hasSVE();
476
477 return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
478 getSVEStackSize(MF) || LowerQRegCopyThroughMem);
479}
480
481/// hasFPImpl - Return true if the specified function should have a dedicated
482/// frame pointer register.
484 const MachineFrameInfo &MFI = MF.getFrameInfo();
485 const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
486
487 // Win64 EH requires a frame pointer if funclets are present, as the locals
488 // are accessed off the frame pointer in both the parent function and the
489 // funclets.
490 if (MF.hasEHFunclets())
491 return true;
492 // Retain behavior of always omitting the FP for leaf functions when possible.
494 return true;
495 if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
496 MFI.hasStackMap() || MFI.hasPatchPoint() ||
497 RegInfo->hasStackRealignment(MF))
498 return true;
499 // With large callframes around we may need to use FP to access the scavenging
500 // emergency spillslot.
501 //
502 // Unfortunately some calls to hasFP() like machine verifier ->
503 // getReservedReg() -> hasFP in the middle of global isel are too early
504 // to know the max call frame size. Hopefully conservatively returning "true"
505 // in those cases is fine.
506 // DefaultSafeSPDisplacement is fine as we only emergency spill GP regs.
507 if (!MFI.isMaxCallFrameSizeComputed() ||
509 return true;
510
511 return false;
512}
513
514/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
515/// not required, we reserve argument space for call sites in the function
516/// immediately on entry to the current function. This eliminates the need for
517/// add/sub sp brackets around call sites. Returns true if the call frame is
518/// included as part of the stack frame.
520 const MachineFunction &MF) const {
521 // The stack probing code for the dynamically allocated outgoing arguments
522 // area assumes that the stack is probed at the top - either by the prologue
523 // code, which issues a probe if `hasVarSizedObjects` returns true, or by the
524 // most recent variable-sized object allocation. Changing the condition here
525 // may need to be followed up by changes to the probe issuing logic.
526 return !MF.getFrameInfo().hasVarSizedObjects();
527}
528
532 const AArch64InstrInfo *TII =
533 static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
534 const AArch64TargetLowering *TLI =
535 MF.getSubtarget<AArch64Subtarget>().getTargetLowering();
536 [[maybe_unused]] MachineFrameInfo &MFI = MF.getFrameInfo();
537 DebugLoc DL = I->getDebugLoc();
538 unsigned Opc = I->getOpcode();
539 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
540 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
541
542 if (!hasReservedCallFrame(MF)) {
543 int64_t Amount = I->getOperand(0).getImm();
544 Amount = alignTo(Amount, getStackAlign());
545 if (!IsDestroy)
546 Amount = -Amount;
547
548 // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
549 // doesn't have to pop anything), then the first operand will be zero too so
550 // this adjustment is a no-op.
551 if (CalleePopAmount == 0) {
552 // FIXME: in-function stack adjustment for calls is limited to 24-bits
553 // because there's no guaranteed temporary register available.
554 //
555 // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
556 // 1) For offset <= 12-bit, we use LSL #0
557 // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
558 // LSL #0, and the other uses LSL #12.
559 //
560 // Most call frames will be allocated at the start of a function so
561 // this is OK, but it is a limitation that needs dealing with.
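      // As an illustrative example, an adjustment of 0x123456 bytes would be
      // materialized as two instructions:
      //   sub sp, sp, #0x123, lsl #12
      //   sub sp, sp, #0x456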
562 assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
563
564 if (TLI->hasInlineStackProbe(MF) &&
566 // When stack probing is enabled, the decrement of SP may need to be
567 // probed. We only need to do this if the call site needs 1024 bytes of
568 // space or more, because a region smaller than that is allowed to be
569 // unprobed at an ABI boundary. We rely on the fact that SP has been
570 // probed exactly at this point, either by the prologue or most recent
571 // dynamic allocation.
573 "non-reserved call frame without var sized objects?");
574 Register ScratchReg =
575 MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
576 inlineStackProbeFixed(I, ScratchReg, -Amount, StackOffset::get(0, 0));
577 } else {
578 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
579 StackOffset::getFixed(Amount), TII);
580 }
581 }
582 } else if (CalleePopAmount != 0) {
583 // If the calling convention demands that the callee pops arguments from the
584 // stack, we want to add it back if we have a reserved call frame.
585 assert(CalleePopAmount < 0xffffff && "call frame too large");
586 emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
587 StackOffset::getFixed(-(int64_t)CalleePopAmount), TII);
588 }
589 return MBB.erase(I);
590}
591
592void AArch64FrameLowering::emitCalleeSavedGPRLocations(
595 MachineFrameInfo &MFI = MF.getFrameInfo();
597 SMEAttrs Attrs(MF.getFunction());
598 bool LocallyStreaming =
599 Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface();
600
601 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
602 if (CSI.empty())
603 return;
604
605 const TargetSubtargetInfo &STI = MF.getSubtarget();
606 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
607 const TargetInstrInfo &TII = *STI.getInstrInfo();
609
610 for (const auto &Info : CSI) {
611 unsigned FrameIdx = Info.getFrameIdx();
612 if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector)
613 continue;
614
615 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
616 int64_t DwarfReg = TRI.getDwarfRegNum(Info.getReg(), true);
617 int64_t Offset = MFI.getObjectOffset(FrameIdx) - getOffsetOfLocalArea();
618
619 // The location of VG will be emitted before each streaming-mode change in
620 // the function. Only locally-streaming functions require emitting the
621 // non-streaming VG location here.
622 if ((LocallyStreaming && FrameIdx == AFI->getStreamingVGIdx()) ||
623 (!LocallyStreaming &&
624 DwarfReg == TRI.getDwarfRegNum(AArch64::VG, true)))
625 continue;
626
627 unsigned CFIIndex = MF.addFrameInst(
628 MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
629 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
630 .addCFIIndex(CFIIndex)
632 }
633}
634
635void AArch64FrameLowering::emitCalleeSavedSVELocations(
638 MachineFrameInfo &MFI = MF.getFrameInfo();
639
640 // Add callee saved registers to move list.
641 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
642 if (CSI.empty())
643 return;
644
645 const TargetSubtargetInfo &STI = MF.getSubtarget();
646 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
647 const TargetInstrInfo &TII = *STI.getInstrInfo();
650
651 for (const auto &Info : CSI) {
652 if (!(MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
653 continue;
654
655 // Not all unwinders may know about SVE registers, so assume the lowest
656 // common denominator.
657 assert(!Info.isSpilledToReg() && "Spilling to registers not implemented");
658 unsigned Reg = Info.getReg();
659 if (!static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
660 continue;
661
663 StackOffset::getScalable(MFI.getObjectOffset(Info.getFrameIdx())) -
665
666 unsigned CFIIndex = MF.addFrameInst(createCFAOffset(TRI, Reg, Offset));
667 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
668 .addCFIIndex(CFIIndex)
670 }
671}
672
676 unsigned DwarfReg) {
677 unsigned CFIIndex =
678 MF.addFrameInst(MCCFIInstruction::createSameValue(nullptr, DwarfReg));
679 BuildMI(MBB, InsertPt, DebugLoc(), Desc).addCFIIndex(CFIIndex);
680}
681
683 MachineBasicBlock &MBB) const {
684
686 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
687 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
688 const auto &TRI =
689 static_cast<const AArch64RegisterInfo &>(*Subtarget.getRegisterInfo());
690 const auto &MFI = *MF.getInfo<AArch64FunctionInfo>();
691
692 const MCInstrDesc &CFIDesc = TII.get(TargetOpcode::CFI_INSTRUCTION);
693 DebugLoc DL;
694
695 // Reset the CFA to `SP + 0`.
697 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
698 nullptr, TRI.getDwarfRegNum(AArch64::SP, true), 0));
699 BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
700
701 // Flip the RA sign state.
702 if (MFI.shouldSignReturnAddress(MF)) {
703 auto CFIInst = MFI.branchProtectionPAuthLR()
706 CFIIndex = MF.addFrameInst(CFIInst);
707 BuildMI(MBB, InsertPt, DL, CFIDesc).addCFIIndex(CFIIndex);
708 }
709
710 // Shadow call stack uses X18, reset it.
711 if (MFI.needsShadowCallStackPrologueEpilogue(MF))
712 insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
713 TRI.getDwarfRegNum(AArch64::X18, true));
714
715 // Emit .cfi_same_value for callee-saved registers.
716 const std::vector<CalleeSavedInfo> &CSI =
718 for (const auto &Info : CSI) {
719 unsigned Reg = Info.getReg();
720 if (!TRI.regNeedsCFI(Reg, Reg))
721 continue;
722 insertCFISameValue(CFIDesc, MF, MBB, InsertPt,
723 TRI.getDwarfRegNum(Reg, true));
724 }
725}
726
729 bool SVE) {
731 MachineFrameInfo &MFI = MF.getFrameInfo();
732
733 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
734 if (CSI.empty())
735 return;
736
737 const TargetSubtargetInfo &STI = MF.getSubtarget();
738 const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
739 const TargetInstrInfo &TII = *STI.getInstrInfo();
741
742 for (const auto &Info : CSI) {
743 if (SVE !=
744 (MFI.getStackID(Info.getFrameIdx()) == TargetStackID::ScalableVector))
745 continue;
746
747 unsigned Reg = Info.getReg();
748 if (SVE &&
749 !static_cast<const AArch64RegisterInfo &>(TRI).regNeedsCFI(Reg, Reg))
750 continue;
751
752 if (!Info.isRestored())
753 continue;
754
755 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createRestore(
756 nullptr, TRI.getDwarfRegNum(Info.getReg(), true)));
757 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
758 .addCFIIndex(CFIIndex)
760 }
761}
762
763void AArch64FrameLowering::emitCalleeSavedGPRRestores(
766}
767
768void AArch64FrameLowering::emitCalleeSavedSVERestores(
771}
772
773// Return the maximum possible number of bytes for `Size` due to the
774// architectural limit on the size of an SVE register.
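// For example, a StackOffset of {Fixed = 32, Scalable = 48} occupies at most
// 48 * 16 + 32 = 800 bytes, since the largest architectural SVE vector length
// (2048 bits) is 16 times the 128-bit granule that scalable bytes are measured
// against.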
775static int64_t upperBound(StackOffset Size) {
776 static const int64_t MAX_BYTES_PER_SCALABLE_BYTE = 16;
777 return Size.getScalable() * MAX_BYTES_PER_SCALABLE_BYTE + Size.getFixed();
778}
779
780void AArch64FrameLowering::allocateStackSpace(
782 int64_t RealignmentPadding, StackOffset AllocSize, bool NeedsWinCFI,
783 bool *HasWinCFI, bool EmitCFI, StackOffset InitialOffset,
784 bool FollowupAllocs) const {
785
786 if (!AllocSize)
787 return;
788
789 DebugLoc DL;
791 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
792 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
794 const MachineFrameInfo &MFI = MF.getFrameInfo();
795
796 const int64_t MaxAlign = MFI.getMaxAlign().value();
797 const uint64_t AndMask = ~(MaxAlign - 1);
798
799 if (!Subtarget.getTargetLowering()->hasInlineStackProbe(MF)) {
800 Register TargetReg = RealignmentPadding
802 : AArch64::SP;
803 // SUB Xd/SP, SP, AllocSize
804 emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
805 MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
806 EmitCFI, InitialOffset);
807
808 if (RealignmentPadding) {
809 // AND SP, X9, 0b11111...0000
810 BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
811 .addReg(TargetReg, RegState::Kill)
814 AFI.setStackRealigned(true);
815
816 // No need for SEH instructions here; if we're realigning the stack,
817 // we've set a frame pointer and already finished the SEH prologue.
818 assert(!NeedsWinCFI);
819 }
820 return;
821 }
822
823 //
824 // Stack probing allocation.
825 //
826
827 // Fixed length allocation. If we don't need to re-align the stack and don't
828 // have SVE objects, we can use a more efficient sequence for stack probing.
829 if (AllocSize.getScalable() == 0 && RealignmentPadding == 0) {
831 assert(ScratchReg != AArch64::NoRegister);
832 BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC))
833 .addDef(ScratchReg)
834 .addImm(AllocSize.getFixed())
835 .addImm(InitialOffset.getFixed())
836 .addImm(InitialOffset.getScalable());
837 // The fixed allocation may leave unprobed bytes at the top of the
838 // stack. If we have a subsequent allocation (e.g. if we have variable-sized
839 // objects), we need to issue an extra probe, so these allocations start in
840 // a known state.
841 if (FollowupAllocs) {
842 // STR XZR, [SP]
843 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
844 .addReg(AArch64::XZR)
845 .addReg(AArch64::SP)
846 .addImm(0)
848 }
849
850 return;
851 }
852
853 // Variable length allocation.
854
855 // If the (unknown) allocation size cannot exceed the probe size, decrement
856 // the stack pointer right away.
857 int64_t ProbeSize = AFI.getStackProbeSize();
858 if (upperBound(AllocSize) + RealignmentPadding <= ProbeSize) {
859 Register ScratchReg = RealignmentPadding
861 : AArch64::SP;
862 assert(ScratchReg != AArch64::NoRegister);
863 // SUB Xd, SP, AllocSize
864 emitFrameOffset(MBB, MBBI, DL, ScratchReg, AArch64::SP, -AllocSize, &TII,
865 MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
866 EmitCFI, InitialOffset);
867 if (RealignmentPadding) {
868 // AND SP, Xn, 0b11111...0000
869 BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), AArch64::SP)
870 .addReg(ScratchReg, RegState::Kill)
873 AFI.setStackRealigned(true);
874 }
875 if (FollowupAllocs || upperBound(AllocSize) + RealignmentPadding >
877 // STR XZR, [SP]
878 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXui))
879 .addReg(AArch64::XZR)
880 .addReg(AArch64::SP)
881 .addImm(0)
883 }
884 return;
885 }
886
887 // Emit a variable-length allocation probing loop.
888 // TODO: As an optimisation, the loop can be "unrolled" into a few parts,
889 // each of them guaranteed to adjust the stack by less than the probe size.
891 assert(TargetReg != AArch64::NoRegister);
892 // SUB Xd, SP, AllocSize
893 emitFrameOffset(MBB, MBBI, DL, TargetReg, AArch64::SP, -AllocSize, &TII,
894 MachineInstr::FrameSetup, false, NeedsWinCFI, HasWinCFI,
895 EmitCFI, InitialOffset);
896 if (RealignmentPadding) {
897 // AND Xn, Xn, 0b11111...0000
898 BuildMI(MBB, MBBI, DL, TII.get(AArch64::ANDXri), TargetReg)
899 .addReg(TargetReg, RegState::Kill)
902 }
903
904 BuildMI(MBB, MBBI, DL, TII.get(AArch64::PROBED_STACKALLOC_VAR))
905 .addReg(TargetReg);
906 if (EmitCFI) {
907 // Set the CFA register back to SP.
908 unsigned Reg =
909 Subtarget.getRegisterInfo()->getDwarfRegNum(AArch64::SP, true);
910 unsigned CFIIndex =
912 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
913 .addCFIIndex(CFIIndex)
915 }
916 if (RealignmentPadding)
917 AFI.setStackRealigned(true);
918}
919
920static MCRegister getRegisterOrZero(MCRegister Reg, bool HasSVE) {
921 switch (Reg.id()) {
922 default:
923 // The called routine is expected to preserve r19-r28.
924 // r29 and r30 are used as the frame pointer and link register, respectively.
925 return 0;
926
927 // GPRs
928#define CASE(n) \
929 case AArch64::W##n: \
930 case AArch64::X##n: \
931 return AArch64::X##n
932 CASE(0);
933 CASE(1);
934 CASE(2);
935 CASE(3);
936 CASE(4);
937 CASE(5);
938 CASE(6);
939 CASE(7);
940 CASE(8);
941 CASE(9);
942 CASE(10);
943 CASE(11);
944 CASE(12);
945 CASE(13);
946 CASE(14);
947 CASE(15);
948 CASE(16);
949 CASE(17);
950 CASE(18);
951#undef CASE
952
953 // FPRs
954#define CASE(n) \
955 case AArch64::B##n: \
956 case AArch64::H##n: \
957 case AArch64::S##n: \
958 case AArch64::D##n: \
959 case AArch64::Q##n: \
960 return HasSVE ? AArch64::Z##n : AArch64::Q##n
961 CASE(0);
962 CASE(1);
963 CASE(2);
964 CASE(3);
965 CASE(4);
966 CASE(5);
967 CASE(6);
968 CASE(7);
969 CASE(8);
970 CASE(9);
971 CASE(10);
972 CASE(11);
973 CASE(12);
974 CASE(13);
975 CASE(14);
976 CASE(15);
977 CASE(16);
978 CASE(17);
979 CASE(18);
980 CASE(19);
981 CASE(20);
982 CASE(21);
983 CASE(22);
984 CASE(23);
985 CASE(24);
986 CASE(25);
987 CASE(26);
988 CASE(27);
989 CASE(28);
990 CASE(29);
991 CASE(30);
992 CASE(31);
993#undef CASE
994 }
995}
996
997void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
998 MachineBasicBlock &MBB) const {
999 // Insertion point.
1001
1002 // Fake a debug loc.
1003 DebugLoc DL;
1004 if (MBBI != MBB.end())
1005 DL = MBBI->getDebugLoc();
1006
1007 const MachineFunction &MF = *MBB.getParent();
1009 const AArch64RegisterInfo &TRI = *STI.getRegisterInfo();
1010
1011 BitVector GPRsToZero(TRI.getNumRegs());
1012 BitVector FPRsToZero(TRI.getNumRegs());
1013 bool HasSVE = STI.isSVEorStreamingSVEAvailable();
1014 for (MCRegister Reg : RegsToZero.set_bits()) {
1015 if (TRI.isGeneralPurposeRegister(MF, Reg)) {
1016 // For GPRs, we only care to clear out the 64-bit register.
1017 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
1018 GPRsToZero.set(XReg);
1019 } else if (AArch64InstrInfo::isFpOrNEON(Reg)) {
1020 // For FPRs, zero the widest register (the Z register if SVE is available, otherwise Q).
1021 if (MCRegister XReg = getRegisterOrZero(Reg, HasSVE))
1022 FPRsToZero.set(XReg);
1023 }
1024 }
1025
1026 const AArch64InstrInfo &TII = *STI.getInstrInfo();
1027
1028 // Zero out GPRs.
1029 for (MCRegister Reg : GPRsToZero.set_bits())
1030 TII.buildClearRegister(Reg, MBB, MBBI, DL);
1031
1032 // Zero out FP/vector registers.
1033 for (MCRegister Reg : FPRsToZero.set_bits())
1034 TII.buildClearRegister(Reg, MBB, MBBI, DL);
1035
1036 if (HasSVE) {
1037 for (MCRegister PReg :
1038 {AArch64::P0, AArch64::P1, AArch64::P2, AArch64::P3, AArch64::P4,
1039 AArch64::P5, AArch64::P6, AArch64::P7, AArch64::P8, AArch64::P9,
1040 AArch64::P10, AArch64::P11, AArch64::P12, AArch64::P13, AArch64::P14,
1041 AArch64::P15}) {
1042 if (RegsToZero[PReg])
1043 BuildMI(MBB, MBBI, DL, TII.get(AArch64::PFALSE), PReg);
1044 }
1045 }
1046}
1047
1049 const MachineBasicBlock &MBB) {
1050 const MachineFunction *MF = MBB.getParent();
1051 LiveRegs.addLiveIns(MBB);
1052 // Mark callee saved registers as used so we will not choose them.
1053 const MCPhysReg *CSRegs = MF->getRegInfo().getCalleeSavedRegs();
1054 for (unsigned i = 0; CSRegs[i]; ++i)
1055 LiveRegs.addReg(CSRegs[i]);
1056}
1057
1058// Find a scratch register that we can use at the start of the prologue to
1059// re-align the stack pointer. We avoid using callee-save registers since they
1060// may appear to be free when this is called from canUseAsPrologue (during
1061// shrink wrapping), but then no longer be free when this is called from
1062// emitPrologue.
1063//
1064// FIXME: This is a bit conservative, since in the above case we could use one
1065// of the callee-save registers as a scratch temp to re-align the stack pointer,
1066// but we would then have to make sure that we were in fact saving at least one
1067// callee-save register in the prologue, which is additional complexity that
1068// doesn't seem worth the benefit.
1070 MachineFunction *MF = MBB->getParent();
1071
1072 // If MBB is an entry block, use X9 as the scratch register, unless this is a
1073 // preserve_none function: such functions may be using X9 to pass arguments,
1074 // so prefer to pick an available register below.
1075 if (&MF->front() == MBB &&
1077 return AArch64::X9;
1078
1079 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1080 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1081 LivePhysRegs LiveRegs(TRI);
1082 getLiveRegsForEntryMBB(LiveRegs, *MBB);
1083
1084 // Prefer X9 since it was historically used for the prologue scratch reg.
1085 const MachineRegisterInfo &MRI = MF->getRegInfo();
1086 if (LiveRegs.available(MRI, AArch64::X9))
1087 return AArch64::X9;
1088
1089 for (unsigned Reg : AArch64::GPR64RegClass) {
1090 if (LiveRegs.available(MRI, Reg))
1091 return Reg;
1092 }
1093 return AArch64::NoRegister;
1094}
1095
1097 const MachineBasicBlock &MBB) const {
1098 const MachineFunction *MF = MBB.getParent();
1099 MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
1100 const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
1101 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1102 const AArch64TargetLowering *TLI = Subtarget.getTargetLowering();
1104
1105 if (AFI->hasSwiftAsyncContext()) {
1106 const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
1107 const MachineRegisterInfo &MRI = MF->getRegInfo();
1108 LivePhysRegs LiveRegs(TRI);
1109 getLiveRegsForEntryMBB(LiveRegs, MBB);
1110 // The StoreSwiftAsyncContext clobbers X16 and X17. Make sure they are
1111 // available.
1112 if (!LiveRegs.available(MRI, AArch64::X16) ||
1113 !LiveRegs.available(MRI, AArch64::X17))
1114 return false;
1115 }
1116
1117 // Certain stack probing sequences might clobber flags, so we can't use
1118 // the block as a prologue if the flags register is a live-in.
1120 MBB.isLiveIn(AArch64::NZCV))
1121 return false;
1122
1123 // Don't need a scratch register if we're not going to re-align the stack or
1124 // emit stack probes.
1125 if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
1126 return true;
1127 // Otherwise, we can use any block as long as it has a scratch register
1128 // available.
1129 return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
1130}
1131
1133 uint64_t StackSizeInBytes) {
1134 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1136 // TODO: When implementing stack protectors, take that into account
1137 // for the probe threshold.
1138 return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
1139 StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
1140}
1141
1142static bool needsWinCFI(const MachineFunction &MF) {
1143 const Function &F = MF.getFunction();
1144 return MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
1145 F.needsUnwindTableEntry();
1146}
1147
1148bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
1149 MachineFunction &MF, uint64_t StackBumpBytes) const {
1151 const MachineFrameInfo &MFI = MF.getFrameInfo();
1152 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1153 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1154 if (homogeneousPrologEpilog(MF))
1155 return false;
1156
1157 if (AFI->getLocalStackSize() == 0)
1158 return false;
1159
1160 // For WinCFI, if optimizing for size, prefer to not combine the stack bump
1161 // (to force a stp with predecrement) to match the packed unwind format,
1162 // provided that there actually are any callee saved registers to merge the
1163 // decrement with.
1164 // This is potentially marginally slower, but allows using the packed
1165 // unwind format for functions that both have a local area and callee saved
1166 // registers. Using the packed unwind format notably reduces the size of
1167 // the unwind info.
1168 if (needsWinCFI(MF) && AFI->getCalleeSavedStackSize() > 0 &&
1169 MF.getFunction().hasOptSize())
1170 return false;
1171
1172 // 512 is the maximum immediate for stp/ldp that will be used for
1173 // callee-save save/restores
1174 if (StackBumpBytes >= 512 || windowsRequiresStackProbe(MF, StackBumpBytes))
1175 return false;
1176
1177 if (MFI.hasVarSizedObjects())
1178 return false;
1179
1180 if (RegInfo->hasStackRealignment(MF))
1181 return false;
1182
1183 // This isn't strictly necessary, but it simplifies things a bit since the
1184 // current RedZone handling code assumes the SP is adjusted by the
1185 // callee-save save/restore code.
1186 if (canUseRedZone(MF))
1187 return false;
1188
1189 // When there is an SVE area on the stack, always allocate the
1190 // callee-saves and spills/locals separately.
1191 if (getSVEStackSize(MF))
1192 return false;
1193
1194 return true;
1195}
1196
1197bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
1198 MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
1199 if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
1200 return false;
1201
1202 if (MBB.empty())
1203 return true;
1204
1205 // Disable combined SP bump if the last instruction is an MTE tag store. It
1206 // is almost always better to merge SP adjustment into those instructions.
1209 while (LastI != Begin) {
1210 --LastI;
1211 if (LastI->isTransient())
1212 continue;
1213 if (!LastI->getFlag(MachineInstr::FrameDestroy))
1214 break;
1215 }
1216 switch (LastI->getOpcode()) {
1217 case AArch64::STGloop:
1218 case AArch64::STZGloop:
1219 case AArch64::STGi:
1220 case AArch64::STZGi:
1221 case AArch64::ST2Gi:
1222 case AArch64::STZ2Gi:
1223 return false;
1224 default:
1225 return true;
1226 }
1227 llvm_unreachable("unreachable");
1228}
1229
1230// Given a load or a store instruction, generate the appropriate SEH unwind
1231// code on Windows.
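// For example (a sketch, not necessarily the exact emitted sequence), the
// prologue store
//   stp x19, x20, [sp, #-32]!
// is followed by an unwind pseudo that prints as
//   .seh_save_regp_x x19, 32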
1233 const TargetInstrInfo &TII,
1234 MachineInstr::MIFlag Flag) {
1235 unsigned Opc = MBBI->getOpcode();
1237 MachineFunction &MF = *MBB->getParent();
1238 DebugLoc DL = MBBI->getDebugLoc();
1239 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1240 int Imm = MBBI->getOperand(ImmIdx).getImm();
1242 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1243 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1244
1245 switch (Opc) {
1246 default:
1247 llvm_unreachable("No SEH Opcode for this instruction");
1248 case AArch64::LDPDpost:
1249 Imm = -Imm;
1250 [[fallthrough]];
1251 case AArch64::STPDpre: {
1252 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1253 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1254 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP_X))
1255 .addImm(Reg0)
1256 .addImm(Reg1)
1257 .addImm(Imm * 8)
1258 .setMIFlag(Flag);
1259 break;
1260 }
1261 case AArch64::LDPXpost:
1262 Imm = -Imm;
1263 [[fallthrough]];
1264 case AArch64::STPXpre: {
1265 Register Reg0 = MBBI->getOperand(1).getReg();
1266 Register Reg1 = MBBI->getOperand(2).getReg();
1267 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1268 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR_X))
1269 .addImm(Imm * 8)
1270 .setMIFlag(Flag);
1271 else
1272 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP_X))
1273 .addImm(RegInfo->getSEHRegNum(Reg0))
1274 .addImm(RegInfo->getSEHRegNum(Reg1))
1275 .addImm(Imm * 8)
1276 .setMIFlag(Flag);
1277 break;
1278 }
1279 case AArch64::LDRDpost:
1280 Imm = -Imm;
1281 [[fallthrough]];
1282 case AArch64::STRDpre: {
1283 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1284 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg_X))
1285 .addImm(Reg)
1286 .addImm(Imm)
1287 .setMIFlag(Flag);
1288 break;
1289 }
1290 case AArch64::LDRXpost:
1291 Imm = -Imm;
1292 [[fallthrough]];
1293 case AArch64::STRXpre: {
1294 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1295 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg_X))
1296 .addImm(Reg)
1297 .addImm(Imm)
1298 .setMIFlag(Flag);
1299 break;
1300 }
1301 case AArch64::STPDi:
1302 case AArch64::LDPDi: {
1303 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1304 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1305 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFRegP))
1306 .addImm(Reg0)
1307 .addImm(Reg1)
1308 .addImm(Imm * 8)
1309 .setMIFlag(Flag);
1310 break;
1311 }
1312 case AArch64::STPXi:
1313 case AArch64::LDPXi: {
1314 Register Reg0 = MBBI->getOperand(0).getReg();
1315 Register Reg1 = MBBI->getOperand(1).getReg();
1316 if (Reg0 == AArch64::FP && Reg1 == AArch64::LR)
1317 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFPLR))
1318 .addImm(Imm * 8)
1319 .setMIFlag(Flag);
1320 else
1321 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveRegP))
1322 .addImm(RegInfo->getSEHRegNum(Reg0))
1323 .addImm(RegInfo->getSEHRegNum(Reg1))
1324 .addImm(Imm * 8)
1325 .setMIFlag(Flag);
1326 break;
1327 }
1328 case AArch64::STRXui:
1329 case AArch64::LDRXui: {
1330 int Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1331 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveReg))
1332 .addImm(Reg)
1333 .addImm(Imm * 8)
1334 .setMIFlag(Flag);
1335 break;
1336 }
1337 case AArch64::STRDui:
1338 case AArch64::LDRDui: {
1339 unsigned Reg = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1340 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveFReg))
1341 .addImm(Reg)
1342 .addImm(Imm * 8)
1343 .setMIFlag(Flag);
1344 break;
1345 }
1346 case AArch64::STPQi:
1347 case AArch64::LDPQi: {
1348 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg());
1349 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1350 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQP))
1351 .addImm(Reg0)
1352 .addImm(Reg1)
1353 .addImm(Imm * 16)
1354 .setMIFlag(Flag);
1355 break;
1356 }
1357 case AArch64::LDPQpost:
1358 Imm = -Imm;
1359 [[fallthrough]];
1360 case AArch64::STPQpre: {
1361 unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg());
1362 unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg());
1363 MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQPX))
1364 .addImm(Reg0)
1365 .addImm(Reg1)
1366 .addImm(Imm * 16)
1367 .setMIFlag(Flag);
1368 break;
1369 }
1370 }
1371 auto I = MBB->insertAfter(MBBI, MIB);
1372 return I;
1373}
1374
1375// Fix up the SEH opcode associated with the save/restore instruction.
1377 unsigned LocalStackSize) {
1378 MachineOperand *ImmOpnd = nullptr;
1379 unsigned ImmIdx = MBBI->getNumOperands() - 1;
1380 switch (MBBI->getOpcode()) {
1381 default:
1382 llvm_unreachable("Fix the offset in the SEH instruction");
1383 case AArch64::SEH_SaveFPLR:
1384 case AArch64::SEH_SaveRegP:
1385 case AArch64::SEH_SaveReg:
1386 case AArch64::SEH_SaveFRegP:
1387 case AArch64::SEH_SaveFReg:
1388 case AArch64::SEH_SaveAnyRegQP:
1389 case AArch64::SEH_SaveAnyRegQPX:
1390 ImmOpnd = &MBBI->getOperand(ImmIdx);
1391 break;
1392 }
1393 if (ImmOpnd)
1394 ImmOpnd->setImm(ImmOpnd->getImm() + LocalStackSize);
1395}
1396
1399 return AFI->hasStreamingModeChanges() &&
1400 !MF.getSubtarget<AArch64Subtarget>().hasSVE();
1401}
1402
1405 // For Darwin platforms we don't save VG for non-SVE functions, even if SME
1406 // is enabled with streaming mode changes.
1407 if (!AFI->hasStreamingModeChanges())
1408 return false;
1409 auto &ST = MF.getSubtarget<AArch64Subtarget>();
1410 if (ST.isTargetDarwin())
1411 return ST.hasSVE();
1412 return true;
1413}
1414
1416 unsigned Opc = MBBI->getOpcode();
1417 if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI ||
1418 Opc == AArch64::UBFMXri)
1419 return true;
1420
1421 if (requiresGetVGCall(*MBBI->getMF())) {
1422 if (Opc == AArch64::ORRXrr)
1423 return true;
1424
1425 if (Opc == AArch64::BL) {
1426 auto Op1 = MBBI->getOperand(0);
1427 return Op1.isSymbol() &&
1428 (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg");
1429 }
1430 }
1431
1432 return false;
1433}
1434
1435// Convert a callee-save register save/restore instruction into a stack pointer
1436// decrement/increment that allocates/deallocates the callee-save stack area, by
1437// converting the store/load to use the pre/post-increment version.
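// For example (a sketch): when this is used to allocate the callee-save area,
// the first callee-save store
//   stp x29, x30, [sp, #0]
// is rewritten into the pre-incrementing form
//   stp x29, x30, [sp, #-<callee-save size>]!
// so that no separate "sub sp, sp, #<callee-save size>" is needed.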
1440 const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc,
1441 bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
1443 int CFAOffset = 0) {
1444 unsigned NewOpc;
1445
1446 // If the function contains streaming mode changes, we expect instructions
1447 // to calculate the value of VG before spilling. For locally-streaming
1448 // functions, we need to do this for both the streaming and non-streaming
1449 // vector length. Move past these instructions if necessary.
1450 MachineFunction &MF = *MBB.getParent();
1451 if (requiresSaveVG(MF))
1452 while (isVGInstruction(MBBI))
1453 ++MBBI;
1454
1455 switch (MBBI->getOpcode()) {
1456 default:
1457 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1458 case AArch64::STPXi:
1459 NewOpc = AArch64::STPXpre;
1460 break;
1461 case AArch64::STPDi:
1462 NewOpc = AArch64::STPDpre;
1463 break;
1464 case AArch64::STPQi:
1465 NewOpc = AArch64::STPQpre;
1466 break;
1467 case AArch64::STRXui:
1468 NewOpc = AArch64::STRXpre;
1469 break;
1470 case AArch64::STRDui:
1471 NewOpc = AArch64::STRDpre;
1472 break;
1473 case AArch64::STRQui:
1474 NewOpc = AArch64::STRQpre;
1475 break;
1476 case AArch64::LDPXi:
1477 NewOpc = AArch64::LDPXpost;
1478 break;
1479 case AArch64::LDPDi:
1480 NewOpc = AArch64::LDPDpost;
1481 break;
1482 case AArch64::LDPQi:
1483 NewOpc = AArch64::LDPQpost;
1484 break;
1485 case AArch64::LDRXui:
1486 NewOpc = AArch64::LDRXpost;
1487 break;
1488 case AArch64::LDRDui:
1489 NewOpc = AArch64::LDRDpost;
1490 break;
1491 case AArch64::LDRQui:
1492 NewOpc = AArch64::LDRQpost;
1493 break;
1494 }
1495 // Get rid of the SEH code associated with the old instruction.
1496 if (NeedsWinCFI) {
1497 auto SEH = std::next(MBBI);
1499 SEH->eraseFromParent();
1500 }
1501
1502 TypeSize Scale = TypeSize::getFixed(1), Width = TypeSize::getFixed(0);
1503 int64_t MinOffset, MaxOffset;
1504 bool Success = static_cast<const AArch64InstrInfo *>(TII)->getMemOpInfo(
1505 NewOpc, Scale, Width, MinOffset, MaxOffset);
1506 (void)Success;
1507 assert(Success && "unknown load/store opcode");
1508
1509 // If the first store isn't right where we want SP then we can't fold the
1510 // update in, so create a normal arithmetic instruction instead.
1511 if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
1512 CSStackSizeInc < MinOffset * (int64_t)Scale.getFixedValue() ||
1513 CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) {
1514 // If we are destroying the frame, make sure we add the increment after the
1515 // last frame operation.
1516 if (FrameFlag == MachineInstr::FrameDestroy)
1517 ++MBBI;
1518 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1519 StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
1520 false, false, nullptr, EmitCFI,
1521 StackOffset::getFixed(CFAOffset));
1522
1523 return std::prev(MBBI);
1524 }
1525
1526 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc));
1527 MIB.addReg(AArch64::SP, RegState::Define);
1528
1529 // Copy all operands other than the immediate offset.
1530 unsigned OpndIdx = 0;
1531 for (unsigned OpndEnd = MBBI->getNumOperands() - 1; OpndIdx < OpndEnd;
1532 ++OpndIdx)
1533 MIB.add(MBBI->getOperand(OpndIdx));
1534
1535 assert(MBBI->getOperand(OpndIdx).getImm() == 0 &&
1536 "Unexpected immediate offset in first/last callee-save save/restore "
1537 "instruction!");
1538 assert(MBBI->getOperand(OpndIdx - 1).getReg() == AArch64::SP &&
1539 "Unexpected base register in callee-save save/restore instruction!");
1540 assert(CSStackSizeInc % Scale == 0);
1541 MIB.addImm(CSStackSizeInc / (int)Scale);
1542
1543 MIB.setMIFlags(MBBI->getFlags());
1544 MIB.setMemRefs(MBBI->memoperands());
1545
1546 // Generate a new SEH code that corresponds to the new instruction.
1547 if (NeedsWinCFI) {
1548 *HasWinCFI = true;
1549 InsertSEH(*MIB, *TII, FrameFlag);
1550 }
1551
1552 if (EmitCFI) {
1553 unsigned CFIIndex = MF.addFrameInst(
1554 MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset - CSStackSizeInc));
1555 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1556 .addCFIIndex(CFIIndex)
1557 .setMIFlags(FrameFlag);
1558 }
1559
1560 return std::prev(MBB.erase(MBBI));
1561}
1562
1563// Fix up callee-save register save/restore instructions to take the combined
1564// SP bump into account by adding the local stack size to the stack offsets.
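// For example (illustrative numbers): if a 64-byte local area is folded into
// the callee-save SP decrement, a save that previously addressed
//   stp x29, x30, [sp, #16]
// must now address
//   stp x29, x30, [sp, #80]
// (the immediate in the MI is scaled, so the operand changes from 2 to 10).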
1566 uint64_t LocalStackSize,
1567 bool NeedsWinCFI,
1568 bool *HasWinCFI) {
1570 return;
1571
1572 unsigned Opc = MI.getOpcode();
1573 unsigned Scale;
1574 switch (Opc) {
1575 case AArch64::STPXi:
1576 case AArch64::STRXui:
1577 case AArch64::STPDi:
1578 case AArch64::STRDui:
1579 case AArch64::LDPXi:
1580 case AArch64::LDRXui:
1581 case AArch64::LDPDi:
1582 case AArch64::LDRDui:
1583 Scale = 8;
1584 break;
1585 case AArch64::STPQi:
1586 case AArch64::STRQui:
1587 case AArch64::LDPQi:
1588 case AArch64::LDRQui:
1589 Scale = 16;
1590 break;
1591 default:
1592 llvm_unreachable("Unexpected callee-save save/restore opcode!");
1593 }
1594
1595 unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
1596 assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
1597 "Unexpected base register in callee-save save/restore instruction!");
1598 // Last operand is immediate offset that needs fixing.
1599 MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
1600 // All generated opcodes have scaled offsets.
1601 assert(LocalStackSize % Scale == 0);
1602 OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / Scale);
1603
1604 if (NeedsWinCFI) {
1605 *HasWinCFI = true;
1606 auto MBBI = std::next(MachineBasicBlock::iterator(MI));
1607 assert(MBBI != MI.getParent()->end() && "Expecting a valid instruction");
1609 "Expecting a SEH instruction");
1610 fixupSEHOpcode(MBBI, LocalStackSize);
1611 }
1612}
1613
1614static bool isTargetWindows(const MachineFunction &MF) {
1616}
1617
1618static unsigned getStackHazardSize(const MachineFunction &MF) {
1619 return MF.getSubtarget<AArch64Subtarget>().getStreamingHazardSize();
1620}
1621
1622// Convenience function to determine whether I is an SVE callee save.
1624 switch (I->getOpcode()) {
1625 default:
1626 return false;
1627 case AArch64::PTRUE_C_B:
1628 case AArch64::LD1B_2Z_IMM:
1629 case AArch64::ST1B_2Z_IMM:
1630 case AArch64::STR_ZXI:
1631 case AArch64::STR_PXI:
1632 case AArch64::LDR_ZXI:
1633 case AArch64::LDR_PXI:
1634 return I->getFlag(MachineInstr::FrameSetup) ||
1635 I->getFlag(MachineInstr::FrameDestroy);
1636 }
1637}
1638
1640 MachineFunction &MF,
1643 const DebugLoc &DL, bool NeedsWinCFI,
1644 bool NeedsUnwindInfo) {
1645 // Shadow call stack prolog: str x30, [x18], #8
1646 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STRXpost))
1647 .addReg(AArch64::X18, RegState::Define)
1648 .addReg(AArch64::LR)
1649 .addReg(AArch64::X18)
1650 .addImm(8)
1652
1653 // This instruction also makes x18 live-in to the entry block.
1654 MBB.addLiveIn(AArch64::X18);
1655
1656 if (NeedsWinCFI)
1657 BuildMI(MBB, MBBI, DL, TII.get(AArch64::SEH_Nop))
1659
1660 if (NeedsUnwindInfo) {
1661 // Emit a CFI instruction that causes 8 to be subtracted from the value of
1662 // x18 when unwinding past this frame.
1663 static const char CFIInst[] = {
1664 dwarf::DW_CFA_val_expression,
1665 18, // register
1666 2, // length
1667 static_cast<char>(unsigned(dwarf::DW_OP_breg18)),
1668 static_cast<char>(-8) & 0x7f, // addend (sleb128)
1669 };
1670 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createEscape(
1671 nullptr, StringRef(CFIInst, sizeof(CFIInst))));
1672 BuildMI(MBB, MBBI, DL, TII.get(AArch64::CFI_INSTRUCTION))
1673 .addCFIIndex(CFIIndex)
1675 }
1676}
1677
1679 MachineFunction &MF,
1682 const DebugLoc &DL) {
1683 // Shadow call stack epilog: ldr x30, [x18, #-8]!
1684 BuildMI(MBB, MBBI, DL, TII.get(AArch64::LDRXpre))
1685 .addReg(AArch64::X18, RegState::Define)
1686 .addReg(AArch64::LR, RegState::Define)
1687 .addReg(AArch64::X18)
1688 .addImm(-8)
1690
1692 unsigned CFIIndex =
1694 BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
1695 .addCFIIndex(CFIIndex)
1697 }
1698}
1699
1700// Define the current CFA rule to use the provided FP.
1703 const DebugLoc &DL, unsigned FixedObject) {
1706 const TargetInstrInfo *TII = STI.getInstrInfo();
1708
1709 const int OffsetToFirstCalleeSaveFromFP =
1712 Register FramePtr = TRI->getFrameRegister(MF);
1713 unsigned Reg = TRI->getDwarfRegNum(FramePtr, true);
1714 unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
1715 nullptr, Reg, FixedObject - OffsetToFirstCalleeSaveFromFP));
1716 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1717 .addCFIIndex(CFIIndex)
1719}
1720
1721#ifndef NDEBUG
1722/// Collect live registers from the end of \p MI's parent up to (including) \p
1723/// MI in \p LiveRegs.
1725 LivePhysRegs &LiveRegs) {
1726
1727 MachineBasicBlock &MBB = *MI.getParent();
1728 LiveRegs.addLiveOuts(MBB);
1729 for (const MachineInstr &MI :
1730 reverse(make_range(MI.getIterator(), MBB.instr_end())))
1731 LiveRegs.stepBackward(MI);
1732}
1733#endif
1734
1735 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
1736 MachineBasicBlock &MBB) const {
1738 const MachineFrameInfo &MFI = MF.getFrameInfo();
1739 const Function &F = MF.getFunction();
1740 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1741 const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
1742 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
1743
1745 bool EmitCFI = AFI->needsDwarfUnwindInfo(MF);
1746 bool EmitAsyncCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
1747 bool HasFP = hasFP(MF);
1748 bool NeedsWinCFI = needsWinCFI(MF);
1749 bool HasWinCFI = false;
1750 auto Cleanup = make_scope_exit([&]() { MF.setHasWinCFI(HasWinCFI); });
1751
1753#ifndef NDEBUG
1755 // Collect live registers from the end of MBB up to the start of the existing
1756 // frame setup instructions.
1757 MachineBasicBlock::iterator NonFrameStart = MBB.begin();
1758 while (NonFrameStart != End &&
1759 NonFrameStart->getFlag(MachineInstr::FrameSetup))
1760 ++NonFrameStart;
1761
1762 LivePhysRegs LiveRegs(*TRI);
1763 if (NonFrameStart != MBB.end()) {
1764 getLivePhysRegsUpTo(*NonFrameStart, *TRI, LiveRegs);
1765 // Ignore registers used for stack management for now.
1766 LiveRegs.removeReg(AArch64::SP);
1767 LiveRegs.removeReg(AArch64::X19);
1768 LiveRegs.removeReg(AArch64::FP);
1769 LiveRegs.removeReg(AArch64::LR);
1770
1771 // X0 will be clobbered by a call to __arm_get_current_vg in the prologue.
1772 // This is necessary to spill VG if required where SVE is unavailable, but
1773 // X0 is preserved around this call.
1774 if (requiresGetVGCall(MF))
1775 LiveRegs.removeReg(AArch64::X0);
1776 }
1777
1778 auto VerifyClobberOnExit = make_scope_exit([&]() {
1779 if (NonFrameStart == MBB.end())
1780 return;
1781 // Check if any of the newly inserted instructions clobber any of the live registers.
1782 for (MachineInstr &MI :
1783 make_range(MBB.instr_begin(), NonFrameStart->getIterator())) {
1784 for (auto &Op : MI.operands())
1785 if (Op.isReg() && Op.isDef())
1786 assert(!LiveRegs.contains(Op.getReg()) &&
1787 "live register clobbered by inserted prologue instructions");
1788 }
1789 });
1790#endif
1791
1792 bool IsFunclet = MBB.isEHFuncletEntry();
1793
1794 // At this point, we're going to decide whether or not the function uses a
1795 // redzone. In most cases, the function doesn't have a redzone so let's
1796 // assume that's false and set it to true in the case that there's a redzone.
1797 AFI->setHasRedZone(false);
1798
1799 // Debug location must be unknown since the first debug location is used
1800 // to determine the end of the prologue.
1801 DebugLoc DL;
1802
1803 const auto &MFnI = *MF.getInfo<AArch64FunctionInfo>();
1804 if (MFnI.needsShadowCallStackPrologueEpilogue(MF))
1805 emitShadowCallStackPrologue(*TII, MF, MBB, MBBI, DL, NeedsWinCFI,
1806 MFnI.needsDwarfUnwindInfo(MF));
1807
1808 if (MFnI.shouldSignReturnAddress(MF)) {
1809 BuildMI(MBB, MBBI, DL, TII->get(AArch64::PAUTH_PROLOGUE))
1811 if (NeedsWinCFI)
1812 HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
1813 }
1814
1815 if (EmitCFI && MFnI.isMTETagged()) {
1816 BuildMI(MBB, MBBI, DL, TII->get(AArch64::EMITMTETAGGED))
1818 }
1819
1820 // We signal the presence of a Swift extended frame to external tools by
1821 // storing FP with 0b0001 in bits 63:60. In normal userland operation a simple
1822 // ORR is sufficient; it is assumed a Swift kernel would initialize the TBI
1823 // bits so that this still holds.
1824 if (HasFP && AFI->hasSwiftAsyncContext()) {
1825   switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
1826   case SwiftAsyncFramePointerMode::DeploymentBased:
1827 if (Subtarget.swiftAsyncContextIsDynamicallySet()) {
1828 // The special symbol below is absolute and has a *value* that can be
1829 // combined with the frame pointer to signal an extended frame.
1830 BuildMI(MBB, MBBI, DL, TII->get(AArch64::LOADgot), AArch64::X16)
1831 .addExternalSymbol("swift_async_extendedFramePointerFlags",
1833 if (NeedsWinCFI) {
1834 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1836 HasWinCFI = true;
1837 }
1838 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrs), AArch64::FP)
1839 .addUse(AArch64::FP)
1840 .addUse(AArch64::X16)
1841 .addImm(Subtarget.isTargetILP32() ? 32 : 0);
1842 if (NeedsWinCFI) {
1843 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1845 HasWinCFI = true;
1846 }
1847 break;
1848 }
1849 [[fallthrough]];
1850
1851   case SwiftAsyncFramePointerMode::Always:
1852 // ORR x29, x29, #0x1000_0000_0000_0000
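      // (0x1100 below is the encoded AArch64 logical immediate for that mask,
      // i.e. a single set bit rotated into position 60.)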
1853 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXri), AArch64::FP)
1854 .addUse(AArch64::FP)
1855 .addImm(0x1100)
1857 if (NeedsWinCFI) {
1858 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1860 HasWinCFI = true;
1861 }
1862 break;
1863
1864   case SwiftAsyncFramePointerMode::Never:
1865 break;
1866 }
1867 }
1868
1869 // All calls are tail calls in GHC calling conv, and functions have no
1870 // prologue/epilogue.
1872 return;
1873
1874 // Set tagged base pointer to the requested stack slot.
1875 // Ideally it should match SP value after prologue.
1876 std::optional<int> TBPI = AFI->getTaggedBasePointerIndex();
1877 if (TBPI)
1879 else
1881
1882 const StackOffset &SVEStackSize = getSVEStackSize(MF);
1883
1884 // getStackSize() includes all the locals in its size calculation. We don't
1885 // include these locals when computing the stack size of a funclet, as they
1886 // are allocated in the parent's stack frame and accessed via the frame
1887 // pointer from the funclet. We only save the callee saved registers in the
1888 // funclet, which are really the callee saved registers of the parent
1889 // function, including the funclet.
1890 int64_t NumBytes =
1891 IsFunclet ? getWinEHFuncletFrameSize(MF) : MFI.getStackSize();
1892 if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
1893 assert(!HasFP && "unexpected function without stack frame but with FP");
1894 assert(!SVEStackSize &&
1895 "unexpected function without stack frame but with SVE objects");
1896 // All of the stack allocation is for locals.
1897 AFI->setLocalStackSize(NumBytes);
1898 if (!NumBytes)
1899 return;
1900 // REDZONE: If the stack size is less than 128 bytes, we don't need
1901 // to actually allocate.
1902 if (canUseRedZone(MF)) {
1903 AFI->setHasRedZone(true);
1904 ++NumRedZoneFunctions;
1905 } else {
1906 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1907 StackOffset::getFixed(-NumBytes), TII,
1908 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
1909 if (EmitCFI) {
1910 // Label used to tie together the PROLOG_LABEL and the MachineMoves.
1911 MCSymbol *FrameLabel = MF.getContext().createTempSymbol();
1912 // Encode the stack size of the leaf function.
1913 unsigned CFIIndex = MF.addFrameInst(
1914 MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
1915 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
1916 .addCFIIndex(CFIIndex)
1918 }
1919 }
1920
1921 if (NeedsWinCFI) {
1922 HasWinCFI = true;
1923 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
1925 }
1926
1927 return;
1928 }
1929
1930 bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
1931 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
1932
1933 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
1934 // All of the remaining stack allocations are for locals.
1935 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
1936 bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
1937 bool HomPrologEpilog = homogeneousPrologEpilog(MF);
1938 if (CombineSPBump) {
1939 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
1940 emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
1941 StackOffset::getFixed(-NumBytes), TII,
1942 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI,
1943 EmitAsyncCFI);
1944 NumBytes = 0;
1945 } else if (HomPrologEpilog) {
1946 // Stack has been already adjusted.
1947 NumBytes -= PrologueSaveSize;
1948 } else if (PrologueSaveSize != 0) {
1950 MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI,
1951 EmitAsyncCFI);
1952 NumBytes -= PrologueSaveSize;
1953 }
1954 assert(NumBytes >= 0 && "Negative stack allocation size!?");
1955
1956 // Move past the saves of the callee-saved registers, fixing up the offsets
1957 // and pre-inc if we decided to combine the callee-save and local stack
1958 // pointer bump above.
1959 while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
1961 if (CombineSPBump &&
1962 // Only fix-up frame-setup load/store instructions.
1965 NeedsWinCFI, &HasWinCFI);
1966 ++MBBI;
1967 }
1968
1969 // For funclets the FP belongs to the containing function.
1970 if (!IsFunclet && HasFP) {
1971 // Only set up FP if we actually need to.
1972 int64_t FPOffset = AFI->getCalleeSaveBaseToFrameRecordOffset();
1973
1974 if (CombineSPBump)
1975 FPOffset += AFI->getLocalStackSize();
1976
1977 if (AFI->hasSwiftAsyncContext()) {
1978 // Before we update the live FP we have to ensure there's a valid (or
1979 // null) asynchronous context in its slot just before FP in the frame
1980 // record, so store it now.
1981 const auto &Attrs = MF.getFunction().getAttributes();
1982 bool HaveInitialContext = Attrs.hasAttrSomewhere(Attribute::SwiftAsync);
1983 if (HaveInitialContext)
1984 MBB.addLiveIn(AArch64::X22);
1985 Register Reg = HaveInitialContext ? AArch64::X22 : AArch64::XZR;
1986 BuildMI(MBB, MBBI, DL, TII->get(AArch64::StoreSwiftAsyncContext))
1987 .addUse(Reg)
1988 .addUse(AArch64::SP)
1989 .addImm(FPOffset - 8)
1991 if (NeedsWinCFI) {
1992 // WinCFI and arm64e, where StoreSwiftAsyncContext is expanded
1993 // to multiple instructions, should be mutually-exclusive.
1994 assert(Subtarget.getTargetTriple().getArchName() != "arm64e");
1995 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
1997 HasWinCFI = true;
1998 }
1999 }
2000
2001 if (HomPrologEpilog) {
2002 auto Prolog = MBBI;
2003 --Prolog;
2004 assert(Prolog->getOpcode() == AArch64::HOM_Prolog);
2005 Prolog->addOperand(MachineOperand::CreateImm(FPOffset));
2006 } else {
2007 // Issue sub fp, sp, FPOffset or
2008 // mov fp,sp when FPOffset is zero.
2009 // Note: All stores of callee-saved registers are marked as "FrameSetup".
2010 // This code marks the instruction(s) that set the FP also.
2011 emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP,
2012 StackOffset::getFixed(FPOffset), TII,
2013 MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI);
2014 if (NeedsWinCFI && HasWinCFI) {
2015 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
2017 // After setting up the FP, the rest of the prolog doesn't need to be
2018 // included in the SEH unwind info.
2019 NeedsWinCFI = false;
2020 }
2021 }
2022 if (EmitAsyncCFI)
2023 emitDefineCFAWithFP(MF, MBB, MBBI, DL, FixedObject);
2024 }
2025
2026 // Now emit the moves for whatever callee saved regs we have (including FP,
2027 // LR if those are saved). Frame instructions for SVE registers are emitted
2028 // later, after the instructions which actually save the SVE regs.
2029 if (EmitAsyncCFI)
2030 emitCalleeSavedGPRLocations(MBB, MBBI);
2031
2032 // Alignment is required for the parent frame, not the funclet
2033 const bool NeedsRealignment =
2034 NumBytes && !IsFunclet && RegInfo->hasStackRealignment(MF);
2035 const int64_t RealignmentPadding =
2036 (NeedsRealignment && MFI.getMaxAlign() > Align(16))
2037 ? MFI.getMaxAlign().value() - 16
2038 : 0;
2039
2040 if (windowsRequiresStackProbe(MF, NumBytes + RealignmentPadding)) {
2041 uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
2042 if (NeedsWinCFI) {
2043 HasWinCFI = true;
2044 // alloc_l can hold at most 256MB, so assume that NumBytes doesn't
2045 // exceed this amount. We need to move at most 2^24 - 1 into x15.
2046 // This is at most two instructions, MOVZ followed by MOVK.
2047 // TODO: Fix to use multiple stack alloc unwind codes for stacks
2048 // exceeding 256MB in size.
2049 if (NumBytes >= (1 << 28))
2050 report_fatal_error("Stack size cannot exceed 256MB for stack "
2051 "unwinding purposes");
2052
2053 uint32_t LowNumWords = NumWords & 0xFFFF;
2054 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVZXi), AArch64::X15)
2055 .addImm(LowNumWords)
2058 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2060 if ((NumWords & 0xFFFF0000) != 0) {
2061 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVKXi), AArch64::X15)
2062 .addReg(AArch64::X15)
2063 .addImm((NumWords & 0xFFFF0000) >> 16) // High half
2066 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2068 }
2069 } else {
2070 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), AArch64::X15)
2071 .addImm(NumWords)
2073 }
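    // Either way x15 now holds the allocation size in 16-byte units. As a
    // rough example, a 1 MiB frame gives NumWords = 0x10000, i.e.
    //   movz x15, #0x0000
    //   movk x15, #0x1, lsl #16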
2074
2075 const char *ChkStk = Subtarget.getChkStkName();
2076 switch (MF.getTarget().getCodeModel()) {
2077 case CodeModel::Tiny:
2078 case CodeModel::Small:
2079 case CodeModel::Medium:
2080 case CodeModel::Kernel:
2081 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
2082 .addExternalSymbol(ChkStk)
2083 .addReg(AArch64::X15, RegState::Implicit)
2088 if (NeedsWinCFI) {
2089 HasWinCFI = true;
2090 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2092 }
2093 break;
2094 case CodeModel::Large:
2095 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVaddrEXT))
2096 .addReg(AArch64::X16, RegState::Define)
2097 .addExternalSymbol(ChkStk)
2098 .addExternalSymbol(ChkStk)
2100 if (NeedsWinCFI) {
2101 HasWinCFI = true;
2102 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2104 }
2105
2106 BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
2107 .addReg(AArch64::X16, RegState::Kill)
2113 if (NeedsWinCFI) {
2114 HasWinCFI = true;
2115 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2117 }
2118 break;
2119 }
2120
2121 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
2122 .addReg(AArch64::SP, RegState::Kill)
2123 .addReg(AArch64::X15, RegState::Kill)
2126 if (NeedsWinCFI) {
2127 HasWinCFI = true;
2128 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_StackAlloc))
2129 .addImm(NumBytes)
2131 }
2132 NumBytes = 0;
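    // A sketch of the net effect of the sequence above: x15 = frame size / 16,
    // "bl __chkstk" probes the pages (x15 is preserved across the call), and
    // "sub sp, sp, x15, lsl #4" then performs the actual allocation.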
2133
2134 if (RealignmentPadding > 0) {
2135 if (RealignmentPadding >= 4096) {
2136 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm))
2137 .addReg(AArch64::X16, RegState::Define)
2138 .addImm(RealignmentPadding)
2140 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXrx64), AArch64::X15)
2141 .addReg(AArch64::SP)
2142 .addReg(AArch64::X16, RegState::Kill)
2145 } else {
2146 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), AArch64::X15)
2147 .addReg(AArch64::SP)
2148 .addImm(RealignmentPadding)
2149 .addImm(0)
2151 }
2152
2153 uint64_t AndMask = ~(MFI.getMaxAlign().value() - 1);
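      // E.g. with a 32-byte max alignment this amounts to roughly:
      //   add x15, sp, #16
      //   and sp, x15, #0xffffffffffffffe0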
2154 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
2155 .addReg(AArch64::X15, RegState::Kill)
2157 AFI->setStackRealigned(true);
2158
2159 // No need for SEH instructions here; if we're realigning the stack,
2160 // we've set a frame pointer and already finished the SEH prologue.
2161 assert(!NeedsWinCFI);
2162 }
2163 }
2164
2165 StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
2166 MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
2167
2168 // Process the SVE callee-saves to determine what space needs to be
2169 // allocated.
2170 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2171 LLVM_DEBUG(dbgs() << "SVECalleeSavedStackSize = " << CalleeSavedSize
2172 << "\n");
2173 // Find callee save instructions in frame.
2174 CalleeSavesBegin = MBBI;
2175 assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
2177 ++MBBI;
2178 CalleeSavesEnd = MBBI;
2179
2180 SVECalleeSavesSize = StackOffset::getScalable(CalleeSavedSize);
2181 SVELocalsSize = SVEStackSize - SVECalleeSavesSize;
2182 }
2183
2184 // Allocate space for the callee saves (if any).
2185 StackOffset CFAOffset =
2186 StackOffset::getFixed((int64_t)MFI.getStackSize() - NumBytes);
2187 StackOffset LocalsSize = SVELocalsSize + StackOffset::getFixed(NumBytes);
2188 allocateStackSpace(MBB, CalleeSavesBegin, 0, SVECalleeSavesSize, false,
2189 nullptr, EmitAsyncCFI && !HasFP, CFAOffset,
2190 MFI.hasVarSizedObjects() || LocalsSize);
2191 CFAOffset += SVECalleeSavesSize;
2192
2193 if (EmitAsyncCFI)
2194 emitCalleeSavedSVELocations(MBB, CalleeSavesEnd);
2195
2196 // Allocate space for the rest of the frame including SVE locals. Align the
2197 // stack as necessary.
2198 assert(!(canUseRedZone(MF) && NeedsRealignment) &&
2199 "Cannot use redzone with stack realignment");
2200 if (!canUseRedZone(MF)) {
2201 // FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
2202 // the correct value here, as NumBytes also includes padding bytes,
2203 // which shouldn't be counted here.
2204 allocateStackSpace(MBB, CalleeSavesEnd, RealignmentPadding,
2205 SVELocalsSize + StackOffset::getFixed(NumBytes),
2206 NeedsWinCFI, &HasWinCFI, EmitAsyncCFI && !HasFP,
2207 CFAOffset, MFI.hasVarSizedObjects());
2208 }
2209
2210 // If we need a base pointer, set it up here. It's whatever the value of the
2211 // stack pointer is at this point. Any variable size objects will be allocated
2212 // after this, so we can still use the base pointer to reference locals.
2213 //
2214 // FIXME: Clarify FrameSetup flags here.
2215 // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
2216 // needed.
2217 // For funclets the BP belongs to the containing function.
2218 if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
2219 TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
2220 false);
2221 if (NeedsWinCFI) {
2222 HasWinCFI = true;
2223 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2225 }
2226 }
2227
2228 // The very last FrameSetup instruction indicates the end of prologue. Emit a
2229 // SEH opcode indicating the prologue end.
2230 if (NeedsWinCFI && HasWinCFI) {
2231 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
2233 }
2234
2235 // SEH funclets are passed the frame pointer in X1. If the parent
2236 // function uses the base register, then the base register is used
2237 // directly, and is not retrieved from X1.
2238 if (IsFunclet && F.hasPersonalityFn()) {
2239 EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
2240 if (isAsynchronousEHPersonality(Per)) {
2241 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
2242 .addReg(AArch64::X1)
2244 MBB.addLiveIn(AArch64::X1);
2245 }
2246 }
2247
2248 if (EmitCFI && !EmitAsyncCFI) {
2249 if (HasFP) {
2250 emitDefineCFAWithFP(MF, MBB, MBBI, DL, FixedObject);
2251 } else {
2252 StackOffset TotalSize =
2253 SVEStackSize + StackOffset::getFixed((int64_t)MFI.getStackSize());
2254 unsigned CFIIndex = MF.addFrameInst(createDefCFA(
2255 *RegInfo, /*FrameReg=*/AArch64::SP, /*Reg=*/AArch64::SP, TotalSize,
2256 /*LastAdjustmentWasScalable=*/false));
2257 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2258 .addCFIIndex(CFIIndex)
2260 }
2261 emitCalleeSavedGPRLocations(MBB, MBBI);
2262 emitCalleeSavedSVELocations(MBB, MBBI);
2263 }
2264}
2265
2266 static bool isFuncletReturnInstr(const MachineInstr &MI) {
2267 switch (MI.getOpcode()) {
2268 default:
2269 return false;
2270 case AArch64::CATCHRET:
2271 case AArch64::CLEANUPRET:
2272 return true;
2273 }
2274}
2275
2276 void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
2277 MachineBasicBlock &MBB) const {
2279 MachineFrameInfo &MFI = MF.getFrameInfo();
2281 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2282 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
2283 DebugLoc DL;
2284 bool NeedsWinCFI = needsWinCFI(MF);
2285 bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
2286 bool HasWinCFI = false;
2287 bool IsFunclet = false;
2288
2289 if (MBB.end() != MBBI) {
2290 DL = MBBI->getDebugLoc();
2291 IsFunclet = isFuncletReturnInstr(*MBBI);
2292 }
2293
2294 MachineBasicBlock::iterator EpilogStartI = MBB.end();
2295
2296 auto FinishingTouches = make_scope_exit([&]() {
2297 if (AFI->shouldSignReturnAddress(MF)) {
2298 BuildMI(MBB, MBB.getFirstTerminator(), DL,
2299 TII->get(AArch64::PAUTH_EPILOGUE))
2300 .setMIFlag(MachineInstr::FrameDestroy);
2301 if (NeedsWinCFI)
2302 HasWinCFI = true; // AArch64PointerAuth pass will insert SEH_PACSignLR
2303 }
2306 if (EmitCFI)
2307 emitCalleeSavedGPRRestores(MBB, MBB.getFirstTerminator());
2308 if (HasWinCFI) {
2310 TII->get(AArch64::SEH_EpilogEnd))
2312 if (!MF.hasWinCFI())
2313 MF.setHasWinCFI(true);
2314 }
2315 if (NeedsWinCFI) {
2316 assert(EpilogStartI != MBB.end());
2317 if (!HasWinCFI)
2318 MBB.erase(EpilogStartI);
2319 }
2320 });
2321
2322 int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
2323 : MFI.getStackSize();
2324
2325 // All calls are tail calls in GHC calling conv, and functions have no
2326 // prologue/epilogue.
2328 return;
2329
2330 // How much of the stack used by incoming arguments this function is expected
2331 // to restore in this particular epilogue.
2332 int64_t ArgumentStackToRestore = getArgumentStackToRestore(MF, MBB);
2333 bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
2334 MF.getFunction().isVarArg());
2335 unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
2336
2337 int64_t AfterCSRPopSize = ArgumentStackToRestore;
2338 auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
2339 // We cannot rely on the local stack size set in emitPrologue if the function
2340 // has funclets, as funclets have different local stack size requirements, and
2341 // the current value set in emitPrologue may be that of the containing
2342 // function.
2343 if (MF.hasEHFunclets())
2344 AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
2345 if (homogeneousPrologEpilog(MF, &MBB)) {
2346 assert(!NeedsWinCFI);
2347 auto LastPopI = MBB.getFirstTerminator();
2348 if (LastPopI != MBB.begin()) {
2349 auto HomogeneousEpilog = std::prev(LastPopI);
2350 if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
2351 LastPopI = HomogeneousEpilog;
2352 }
2353
2354 // Adjust local stack
2355 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2357 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2358
2359 // SP has been already adjusted while restoring callee save regs.
2360 // We've already bailed out of the case that adjusts SP for arguments.
2361 assert(AfterCSRPopSize == 0);
2362 return;
2363 }
2364 bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
2365 // Assume we can't combine the last pop with the sp restore.
2366
2367 bool CombineAfterCSRBump = false;
2368 if (!CombineSPBump && PrologueSaveSize != 0) {
2370 while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
2372 Pop = std::prev(Pop);
2373 // Converting the last ldp to a post-index ldp is valid only if the last
2374 // ldp's offset is 0.
2375 const MachineOperand &OffsetOp = Pop->getOperand(Pop->getNumOperands() - 1);
2376 // If the offset is 0 and the AfterCSR pop is not actually trying to
2377 // allocate more stack for arguments (in space that an untimely interrupt
2378 // may clobber), convert it to a post-index ldp.
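    // E.g. "ldp x29, x30, [sp]" becomes "ldp x29, x30, [sp], #<PrologueSaveSize>",
    // folding the final SP adjustment into the restore itself.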
2379 if (OffsetOp.getImm() == 0 && AfterCSRPopSize >= 0) {
2381 MBB, Pop, DL, TII, PrologueSaveSize, NeedsWinCFI, &HasWinCFI, EmitCFI,
2382 MachineInstr::FrameDestroy, PrologueSaveSize);
2383 } else {
2384 // If not, make sure to emit an add after the last ldp.
2385 // We're doing this by transferring the size to be restored from the
2386 // adjustment *before* the CSR pops to the adjustment *after* the CSR
2387 // pops.
2388 AfterCSRPopSize += PrologueSaveSize;
2389 CombineAfterCSRBump = true;
2390 }
2391 }
2392
2393 // Move past the restores of the callee-saved registers.
2394 // If we plan on combining the sp bump of the local stack size and the callee
2395 // save stack size, we might need to adjust the CSR save and restore offsets.
2398 while (LastPopI != Begin) {
2399 --LastPopI;
2400 if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
2401 IsSVECalleeSave(LastPopI)) {
2402 ++LastPopI;
2403 break;
2404 } else if (CombineSPBump)
2406 NeedsWinCFI, &HasWinCFI);
2407 }
2408
2409 if (NeedsWinCFI) {
2410 // Note that there are cases where we insert SEH opcodes in the
2411 // epilogue when we had no SEH opcodes in the prologue. For
2412 // example, when there is no stack frame but there are stack
2413 // arguments. Insert the SEH_EpilogStart and remove it later if it
2414 // we didn't emit any SEH opcodes to avoid generating WinCFI for
2415 // functions that don't need it.
2416 BuildMI(MBB, LastPopI, DL, TII->get(AArch64::SEH_EpilogStart))
2418 EpilogStartI = LastPopI;
2419 --EpilogStartI;
2420 }
2421
2422 if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
2423   switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
2424   case SwiftAsyncFramePointerMode::DeploymentBased:
2425 // Avoid the reload as it is GOT relative, and instead fall back to the
2426 // hardcoded value below. This allows a mismatch between the OS and
2427 // application without immediately terminating on the difference.
2428 [[fallthrough]];
2429   case SwiftAsyncFramePointerMode::Always:
2430 // We need to reset FP to its untagged state on return. Bit 60 is
2431 // currently used to show the presence of an extended frame.
2432
2433 // BIC x29, x29, #0x1000_0000_0000_0000
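      // (0x10fe below is the encoded logical immediate for the complementary
      // mask, i.e. all bits set except bit 60, so the AND clears only the
      // extended-frame bit.)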
2434 BuildMI(MBB, MBB.getFirstTerminator(), DL, TII->get(AArch64::ANDXri),
2435 AArch64::FP)
2436 .addUse(AArch64::FP)
2437 .addImm(0x10fe)
2439 if (NeedsWinCFI) {
2440 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_Nop))
2442 HasWinCFI = true;
2443 }
2444 break;
2445
2446   case SwiftAsyncFramePointerMode::Never:
2447 break;
2448 }
2449 }
2450
2451 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2452
2453 // If there is a single SP update, insert it before the ret and we're done.
2454 if (CombineSPBump) {
2455 assert(!SVEStackSize && "Cannot combine SP bump with SVE");
2456
2457 // When we are about to restore the CSRs, the CFA register is SP again.
2458 if (EmitCFI && hasFP(MF)) {
2459 const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2460 unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2461 unsigned CFIIndex =
2462 MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, NumBytes));
2463 BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2464 .addCFIIndex(CFIIndex)
2466 }
2467
2468 emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2469 StackOffset::getFixed(NumBytes + (int64_t)AfterCSRPopSize),
2470 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI,
2471 &HasWinCFI, EmitCFI, StackOffset::getFixed(NumBytes));
2472 return;
2473 }
2474
2475 NumBytes -= PrologueSaveSize;
2476 assert(NumBytes >= 0 && "Negative stack allocation size!?");
2477
2478 // Process the SVE callee-saves to determine what space needs to be
2479 // deallocated.
2480 StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
2481 MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
2482 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2483 RestoreBegin = std::prev(RestoreEnd);
2484 while (RestoreBegin != MBB.begin() &&
2485 IsSVECalleeSave(std::prev(RestoreBegin)))
2486 --RestoreBegin;
2487
2488 assert(IsSVECalleeSave(RestoreBegin) &&
2489 IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
2490
2491 StackOffset CalleeSavedSizeAsOffset =
2492 StackOffset::getScalable(CalleeSavedSize);
2493 DeallocateBefore = SVEStackSize - CalleeSavedSizeAsOffset;
2494 DeallocateAfter = CalleeSavedSizeAsOffset;
2495 }
2496
2497 // Deallocate the SVE area.
2498 if (SVEStackSize) {
2499 // If we have stack realignment or variable sized objects on the stack,
2500 // restore the stack pointer from the frame pointer prior to SVE CSR
2501 // restoration.
2502 if (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) {
2503 if (int64_t CalleeSavedSize = AFI->getSVECalleeSavedStackSize()) {
2504 // Set SP to start of SVE callee-save area from which they can
2505 // be reloaded. The code below will deallocate the stack
2506 // space by moving FP -> SP.
2507 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
2508 StackOffset::getScalable(-CalleeSavedSize), TII,
2510 }
2511 } else {
2512 if (AFI->getSVECalleeSavedStackSize()) {
2513 // Deallocate the non-SVE locals first before we can deallocate (and
2514 // restore callee saves) from the SVE area.
2516 MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2518 false, false, nullptr, EmitCFI && !hasFP(MF),
2519 SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize));
2520 NumBytes = 0;
2521 }
2522
2523 emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
2524 DeallocateBefore, TII, MachineInstr::FrameDestroy, false,
2525 false, nullptr, EmitCFI && !hasFP(MF),
2526 SVEStackSize +
2527 StackOffset::getFixed(NumBytes + PrologueSaveSize));
2528
2529 emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
2530 DeallocateAfter, TII, MachineInstr::FrameDestroy, false,
2531 false, nullptr, EmitCFI && !hasFP(MF),
2532 DeallocateAfter +
2533 StackOffset::getFixed(NumBytes + PrologueSaveSize));
2534 }
2535 if (EmitCFI)
2536 emitCalleeSavedSVERestores(MBB, RestoreEnd);
2537 }
2538
2539 if (!hasFP(MF)) {
2540 bool RedZone = canUseRedZone(MF);
2541 // If this was a redzone leaf function, we don't need to restore the
2542 // stack pointer (but we may need to pop stack args for fastcc).
2543 if (RedZone && AfterCSRPopSize == 0)
2544 return;
2545
2546 // Pop the local variables off the stack. If there are no callee-saved
2547 // registers, it means we are actually positioned at the terminator and can
2548 // combine stack increment for the locals and the stack increment for
2549 // callee-popped arguments into (possibly) a single instruction and be done.
2550 bool NoCalleeSaveRestore = PrologueSaveSize == 0;
2551 int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
2552 if (NoCalleeSaveRestore)
2553 StackRestoreBytes += AfterCSRPopSize;
2554
2556 MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2557 StackOffset::getFixed(StackRestoreBytes), TII,
2558 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2559 StackOffset::getFixed((RedZone ? 0 : NumBytes) + PrologueSaveSize));
2560
2561 // If we were able to combine the local stack pop with the argument pop,
2562 // then we're done.
2563 if (NoCalleeSaveRestore || AfterCSRPopSize == 0) {
2564 return;
2565 }
2566
2567 NumBytes = 0;
2568 }
2569
2570 // Restore the original stack pointer.
2571 // FIXME: Rather than doing the math here, we should instead just use
2572 // non-post-indexed loads for the restores if we aren't actually going to
2573 // be able to save any instructions.
2574 if (!IsFunclet && (MFI.hasVarSizedObjects() || AFI->isStackRealigned())) {
2576 MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
2578 TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2579 } else if (NumBytes)
2580 emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
2581 StackOffset::getFixed(NumBytes), TII,
2582 MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
2583
2584 // When we are about to restore the CSRs, the CFA register is SP again.
2585 if (EmitCFI && hasFP(MF)) {
2586 const AArch64RegisterInfo &RegInfo = *Subtarget.getRegisterInfo();
2587 unsigned Reg = RegInfo.getDwarfRegNum(AArch64::SP, true);
2588 unsigned CFIIndex = MF.addFrameInst(
2589 MCCFIInstruction::cfiDefCfa(nullptr, Reg, PrologueSaveSize));
2590 BuildMI(MBB, LastPopI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2591 .addCFIIndex(CFIIndex)
2593 }
2594
2595 // This must be placed after the callee-save restore code because that code
2596 // assumes the SP is at the same location as it was after the callee-save save
2597 // code in the prologue.
2598 if (AfterCSRPopSize) {
2599 assert(AfterCSRPopSize > 0 && "attempting to reallocate arg stack that an "
2600 "interrupt may have clobbered");
2601
2603 MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
2605 false, NeedsWinCFI, &HasWinCFI, EmitCFI,
2606 StackOffset::getFixed(CombineAfterCSRBump ? PrologueSaveSize : 0));
2607 }
2608}
2609
2612 MF.getInfo<AArch64FunctionInfo>()->needsAsyncDwarfUnwindInfo(MF);
2613}
2614
2615/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
2616/// debug info. It's the same as what we use for resolving the code-gen
2617/// references for now. FIXME: This can go wrong when references are
2618/// SP-relative and simple call frames aren't used.
2621 Register &FrameReg) const {
2622  return resolveFrameIndexReference(
2623 MF, FI, FrameReg,
2624 /*PreferFP=*/
2625 MF.getFunction().hasFnAttribute(Attribute::SanitizeHWAddress) ||
2626 MF.getFunction().hasFnAttribute(Attribute::SanitizeMemTag),
2627 /*ForSimm=*/false);
2628}
2629
2632 int FI) const {
2633 // This function serves to provide a comparable offset from a single reference
2634 // point (the value of SP at function entry) that can be used for analysis,
2635 // e.g. the stack-frame-layout analysis pass. It is not guaranteed to be
2636 // correct for all objects in the presence of VLA-area objects or dynamic
2637 // stack re-alignment.
2638
2639 const auto &MFI = MF.getFrameInfo();
2640
2641 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2642 StackOffset SVEStackSize = getSVEStackSize(MF);
2643
2644 // For VLA-area objects, just emit an offset at the end of the stack frame.
2645 // Whilst not quite correct, these objects do live at the end of the frame and
2646 // so it is more useful for analysis for the offset to reflect this.
2647 if (MFI.isVariableSizedObjectIndex(FI)) {
2648 return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize;
2649 }
2650
2651 // This is correct in the absence of any SVE stack objects.
2652 if (!SVEStackSize)
2653 return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea());
2654
2655 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2656 if (MFI.getStackID(FI) == TargetStackID::ScalableVector) {
2657 return StackOffset::get(-((int64_t)AFI->getCalleeSavedStackSize()),
2658 ObjectOffset);
2659 }
2660
2661 bool IsFixed = MFI.isFixedObjectIndex(FI);
2662 bool IsCSR =
2663 !IsFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2664
2665 StackOffset ScalableOffset = {};
2666 if (!IsFixed && !IsCSR)
2667 ScalableOffset = -SVEStackSize;
2668
2669 return StackOffset::getFixed(ObjectOffset) + ScalableOffset;
2670}
2671
2674 int FI) const {
2676}
2677
2678 static StackOffset getFPOffset(const MachineFunction &MF,
2679 int64_t ObjectOffset) {
2680 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2681 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2682 const Function &F = MF.getFunction();
2683 bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv(), F.isVarArg());
2684 unsigned FixedObject =
2685 getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
2686 int64_t CalleeSaveSize = AFI->getCalleeSavedStackSize(MF.getFrameInfo());
2687 int64_t FPAdjust =
2688 CalleeSaveSize - AFI->getCalleeSaveBaseToFrameRecordOffset();
2689 return StackOffset::getFixed(ObjectOffset + FixedObject + FPAdjust);
2690}
2691
2692 static StackOffset getStackOffset(const MachineFunction &MF,
2693 int64_t ObjectOffset) {
2694 const auto &MFI = MF.getFrameInfo();
2695 return StackOffset::getFixed(ObjectOffset + (int64_t)MFI.getStackSize());
2696}
2697
2698// TODO: This function currently does not work for scalable vectors.
2700 int FI) const {
2701 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2703 int ObjectOffset = MF.getFrameInfo().getObjectOffset(FI);
2704 return RegInfo->getLocalAddressRegister(MF) == AArch64::FP
2705 ? getFPOffset(MF, ObjectOffset).getFixed()
2706 : getStackOffset(MF, ObjectOffset).getFixed();
2707}
2708
2709 StackOffset AArch64FrameLowering::resolveFrameIndexReference(
2710 const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
2711 bool ForSimm) const {
2712 const auto &MFI = MF.getFrameInfo();
2713 int64_t ObjectOffset = MFI.getObjectOffset(FI);
2714 bool isFixed = MFI.isFixedObjectIndex(FI);
2715 bool isSVE = MFI.getStackID(FI) == TargetStackID::ScalableVector;
2716 return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
2717 PreferFP, ForSimm);
2718}
2719
2720 StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
2721 const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
2722 Register &FrameReg, bool PreferFP, bool ForSimm) const {
2723 const auto &MFI = MF.getFrameInfo();
2724 const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
2726 const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
2727 const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2728
2729 int64_t FPOffset = getFPOffset(MF, ObjectOffset).getFixed();
2730 int64_t Offset = getStackOffset(MF, ObjectOffset).getFixed();
2731 bool isCSR =
2732 !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
2733
2734 const StackOffset &SVEStackSize = getSVEStackSize(MF);
2735
2736 // Use frame pointer to reference fixed objects. Use it for locals if
2737 // there are VLAs or a dynamically realigned SP (and thus the SP isn't
2738 // reliable as a base). Make sure useFPForScavengingIndex() does the
2739 // right thing for the emergency spill slot.
2740 bool UseFP = false;
2741 if (AFI->hasStackFrame() && !isSVE) {
2742 // We shouldn't prefer using the FP to access fixed-sized stack objects when
2743 // there are scalable (SVE) objects in between the FP and the fixed-sized
2744 // objects.
2745 PreferFP &= !SVEStackSize;
2746
2747 // Note: Keeping the following as multiple 'if' statements rather than
2748 // merging to a single expression for readability.
2749 //
2750 // Argument access should always use the FP.
2751 if (isFixed) {
2752 UseFP = hasFP(MF);
2753 } else if (isCSR && RegInfo->hasStackRealignment(MF)) {
2754 // References to the CSR area must use FP if we're re-aligning the stack
2755 // since the dynamically-sized alignment padding is between the SP/BP and
2756 // the CSR area.
2757 assert(hasFP(MF) && "Re-aligned stack must have frame pointer");
2758 UseFP = true;
2759 } else if (hasFP(MF) && !RegInfo->hasStackRealignment(MF)) {
2760 // If the FPOffset is negative and we're producing a signed immediate, we
2761 // have to keep in mind that the available offset range for negative
2762 // offsets is smaller than for positive ones. If an offset is available
2763 // via the FP and the SP, use whichever is closest.
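      // (Rough example, other conditions aside: an object at FP-1040 / SP+8
      // is outside the signed 9-bit [-256, 255] FP range, so with ForSimm the
      // SP/BP form is used; an object at FP-32 / SP+1016 resolves to the FP.)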
2764 bool FPOffsetFits = !ForSimm || FPOffset >= -256;
2765 PreferFP |= Offset > -FPOffset && !SVEStackSize;
2766
2767 if (FPOffset >= 0) {
2768 // If the FPOffset is positive, that'll always be best, as the SP/BP
2769 // will be even further away.
2770 UseFP = true;
2771 } else if (MFI.hasVarSizedObjects()) {
2772 // If we have variable sized objects, we can use either FP or BP, as the
2773 // SP offset is unknown. We can use the base pointer if we have one and
2774 // FP is not preferred. If not, we're stuck with using FP.
2775 bool CanUseBP = RegInfo->hasBasePointer(MF);
2776 if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
2777 UseFP = PreferFP;
2778 else if (!CanUseBP) // Can't use BP. Forced to use FP.
2779 UseFP = true;
2780 // else we can use BP and FP, but the offset from FP won't fit.
2781 // That will make us scavenge registers which we can probably avoid by
2782 // using BP. If it won't fit for BP either, we'll scavenge anyway.
2783 } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) {
2784 // Funclets access the locals contained in the parent's stack frame
2785 // via the frame pointer, so we have to use the FP in the parent
2786 // function.
2787 (void) Subtarget;
2788 assert(Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv(),
2789 MF.getFunction().isVarArg()) &&
2790 "Funclets should only be present on Win64");
2791 UseFP = true;
2792 } else {
2793 // We have the choice between FP and (SP or BP).
2794 if (FPOffsetFits && PreferFP) // If FP is the best fit, use it.
2795 UseFP = true;
2796 }
2797 }
2798 }
2799
2800 assert(
2801 ((isFixed || isCSR) || !RegInfo->hasStackRealignment(MF) || !UseFP) &&
2802 "In the presence of dynamic stack pointer realignment, "
2803 "non-argument/CSR objects cannot be accessed through the frame pointer");
2804
2805 if (isSVE) {
2806 StackOffset FPOffset =
2808 StackOffset SPOffset =
2809 SVEStackSize +
2810 StackOffset::get(MFI.getStackSize() - AFI->getCalleeSavedStackSize(),
2811 ObjectOffset);
2812 // Always use the FP for SVE spills if available and beneficial.
2813 if (hasFP(MF) && (SPOffset.getFixed() ||
2814 FPOffset.getScalable() < SPOffset.getScalable() ||
2815 RegInfo->hasStackRealignment(MF))) {
2816 FrameReg = RegInfo->getFrameRegister(MF);
2817 return FPOffset;
2818 }
2819
2820 FrameReg = RegInfo->hasBasePointer(MF) ? RegInfo->getBaseRegister()
2821 : (unsigned)AArch64::SP;
2822 return SPOffset;
2823 }
2824
2825 StackOffset ScalableOffset = {};
2826 if (UseFP && !(isFixed || isCSR))
2827 ScalableOffset = -SVEStackSize;
2828 if (!UseFP && (isFixed || isCSR))
2829 ScalableOffset = SVEStackSize;
2830
2831 if (UseFP) {
2832 FrameReg = RegInfo->getFrameRegister(MF);
2833 return StackOffset::getFixed(FPOffset) + ScalableOffset;
2834 }
2835
2836 // Use the base pointer if we have one.
2837 if (RegInfo->hasBasePointer(MF))
2838 FrameReg = RegInfo->getBaseRegister();
2839 else {
2840 assert(!MFI.hasVarSizedObjects() &&
2841 "Can't use SP when we have var sized objects.");
2842 FrameReg = AArch64::SP;
2843 // If we're using the red zone for this function, the SP won't actually
2844 // be adjusted, so the offsets will be negative. They're also all
2845 // within range of the signed 9-bit immediate instructions.
2846 if (canUseRedZone(MF))
2847 Offset -= AFI->getLocalStackSize();
2848 }
2849
2850 return StackOffset::getFixed(Offset) + ScalableOffset;
2851}
2852
2853static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
2854 // Do not set a kill flag on values that are also marked as live-in. This
2855 // happens with the @llvm.returnaddress intrinsic and with arguments passed in
2856 // callee saved registers.
2857 // Omitting the kill flags is conservatively correct even if the live-in
2858 // is not used after all.
2859 bool IsLiveIn = MF.getRegInfo().isLiveIn(Reg);
2860 return getKillRegState(!IsLiveIn);
2861}
2862
2864 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
2867 return Subtarget.isTargetMachO() &&
2868 !(Subtarget.getTargetLowering()->supportSwiftError() &&
2869 Attrs.hasAttrSomewhere(Attribute::SwiftError)) &&
2871 !requiresSaveVG(MF) && AFI->getSVECalleeSavedStackSize() == 0;
2872}
2873
2874static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
2875 bool NeedsWinCFI, bool IsFirst,
2876 const TargetRegisterInfo *TRI) {
2877 // If we are generating register pairs for a Windows function that requires
2878 // EH support, then pair consecutive registers only. There are no unwind
2879 // opcodes for saves/restores of non-consecutive register pairs.
2880 // The unwind opcodes are save_regp, save_regp_x, save_fregp, save_fregp_x,
2881 // save_lrpair.
2882 // https://docs.microsoft.com/en-us/cpp/build/arm64-exception-handling
2883
2884 if (Reg2 == AArch64::FP)
2885 return true;
2886 if (!NeedsWinCFI)
2887 return false;
2888 if (TRI->getEncodingValue(Reg2) == TRI->getEncodingValue(Reg1) + 1)
2889 return false;
2890 // If pairing a GPR with LR, the pair can be described by the save_lrpair
2891 // opcode. If this is the first register pair, it would end up with a
2892 // predecrement, but there's no save_lrpair_x opcode, so we can only do this
2893 // if LR is paired with something other than the first register.
2894 // The save_lrpair opcode requires the first register to be an odd one.
2895 if (Reg1 >= AArch64::X19 && Reg1 <= AArch64::X27 &&
2896 (Reg1 - AArch64::X19) % 2 == 0 && Reg2 == AArch64::LR && !IsFirst)
2897 return false;
2898 return true;
2899}
2900
2901/// Returns true if Reg1 and Reg2 cannot be paired using a ldp/stp instruction.
2902/// WindowsCFI requires that only consecutive registers can be paired.
2903/// LR and FP need to be allocated together when the frame needs to save
2904/// the frame-record. This means any other register pairing with LR is invalid.
2905static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
2906 bool UsesWinAAPCS, bool NeedsWinCFI,
2907 bool NeedsFrameRecord, bool IsFirst,
2908 const TargetRegisterInfo *TRI) {
2909 if (UsesWinAAPCS)
2910 return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI, IsFirst,
2911 TRI);
2912
2913 // If we need to store the frame record, don't pair any register
2914 // with LR other than FP.
2915 if (NeedsFrameRecord)
2916 return Reg2 == AArch64::LR;
2917
2918 return false;
2919}
2920
2921namespace {
2922
2923struct RegPairInfo {
2924 unsigned Reg1 = AArch64::NoRegister;
2925 unsigned Reg2 = AArch64::NoRegister;
2926 int FrameIdx;
2927 int Offset;
2928 enum RegType { GPR, FPR64, FPR128, PPR, ZPR, VG } Type;
2929
2930 RegPairInfo() = default;
2931
2932 bool isPaired() const { return Reg2 != AArch64::NoRegister; }
2933
2934 unsigned getScale() const {
2935 switch (Type) {
2936 case PPR:
2937 return 2;
2938 case GPR:
2939 case FPR64:
2940 case VG:
2941 return 8;
2942 case ZPR:
2943 case FPR128:
2944 return 16;
2945 }
2946 llvm_unreachable("Unsupported type");
2947 }
2948
2949 bool isScalable() const { return Type == PPR || Type == ZPR; }
2950};
2951
2952} // end anonymous namespace
2953
2954unsigned findFreePredicateReg(BitVector &SavedRegs) {
2955 for (unsigned PReg = AArch64::P8; PReg <= AArch64::P15; ++PReg) {
2956 if (SavedRegs.test(PReg)) {
2957 unsigned PNReg = PReg - AArch64::P0 + AArch64::PN0;
2958 return PNReg;
2959 }
2960 }
2961 return AArch64::NoRegister;
2962}
2963
2964 // The multivector LD/ST instructions are available only on SME or SVE2p1 targets.
2965 static bool enableMultiVectorSpillFill(const AArch64Subtarget &Subtarget,
2966 MachineFunction &MF) {
2968 return false;
2969
2970 SMEAttrs FuncAttrs(MF.getFunction());
2971 bool IsLocallyStreaming =
2972 FuncAttrs.hasStreamingBody() && !FuncAttrs.hasStreamingInterface();
2973
2974 // SME2 instructions can be used safely only when actually in streaming mode.
2975 // It is not safe to use SME2 instructions when in streaming-compatible or
2976 // locally-streaming mode.
2977 return Subtarget.hasSVE2p1() ||
2978 (Subtarget.hasSME2() &&
2979 (!IsLocallyStreaming && Subtarget.isStreaming()));
2980}
2981
2982 static void computeCalleeSaveRegisterPairs(
2983     MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
2984     const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
2985 bool NeedsFrameRecord) {
2986
2987 if (CSI.empty())
2988 return;
2989
2990 bool IsWindows = isTargetWindows(MF);
2991 bool NeedsWinCFI = needsWinCFI(MF);
2993 unsigned StackHazardSize = getStackHazardSize(MF);
2994 MachineFrameInfo &MFI = MF.getFrameInfo();
2996 unsigned Count = CSI.size();
2997 (void)CC;
2998 // MachO's compact unwind format relies on all registers being stored in
2999 // pairs.
3002 CC == CallingConv::Win64 || (Count & 1) == 0) &&
3003 "Odd number of callee-saved regs to spill!");
3004 int ByteOffset = AFI->getCalleeSavedStackSize();
3005 int StackFillDir = -1;
3006 int RegInc = 1;
3007 unsigned FirstReg = 0;
3008 if (NeedsWinCFI) {
3009 // For WinCFI, fill the stack from the bottom up.
3010 ByteOffset = 0;
3011 StackFillDir = 1;
3012 // As the CSI array is reversed to match PrologEpilogInserter, iterate
3013 // backwards, to pair up registers starting from lower numbered registers.
3014 RegInc = -1;
3015 FirstReg = Count - 1;
3016 }
3017 int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
3018 bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
3019 Register LastReg = 0;
3020
3021 // When iterating backwards, the loop condition relies on unsigned wraparound.
3022 for (unsigned i = FirstReg; i < Count; i += RegInc) {
3023 RegPairInfo RPI;
3024 RPI.Reg1 = CSI[i].getReg();
3025
3026 if (AArch64::GPR64RegClass.contains(RPI.Reg1))
3027 RPI.Type = RegPairInfo::GPR;
3028 else if (AArch64::FPR64RegClass.contains(RPI.Reg1))
3029 RPI.Type = RegPairInfo::FPR64;
3030 else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
3031 RPI.Type = RegPairInfo::FPR128;
3032 else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
3033 RPI.Type = RegPairInfo::ZPR;
3034 else if (AArch64::PPRRegClass.contains(RPI.Reg1))
3035 RPI.Type = RegPairInfo::PPR;
3036 else if (RPI.Reg1 == AArch64::VG)
3037 RPI.Type = RegPairInfo::VG;
3038 else
3039 llvm_unreachable("Unsupported register class.");
3040
3041 // Add the stack hazard size as we transition from GPR->FPR CSRs.
3042 if (AFI->hasStackHazardSlotIndex() &&
3043 (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
3045 ByteOffset += StackFillDir * StackHazardSize;
3046 LastReg = RPI.Reg1;
3047
3048 int Scale = RPI.getScale();
3049 // Add the next reg to the pair if it is in the same register class.
3050 if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
3051 Register NextReg = CSI[i + RegInc].getReg();
3052 bool IsFirst = i == FirstReg;
3053 switch (RPI.Type) {
3054 case RegPairInfo::GPR:
3055 if (AArch64::GPR64RegClass.contains(NextReg) &&
3056 !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows,
3057 NeedsWinCFI, NeedsFrameRecord, IsFirst,
3058 TRI))
3059 RPI.Reg2 = NextReg;
3060 break;
3061 case RegPairInfo::FPR64:
3062 if (AArch64::FPR64RegClass.contains(NextReg) &&
3063 !invalidateWindowsRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
3064 IsFirst, TRI))
3065 RPI.Reg2 = NextReg;
3066 break;
3067 case RegPairInfo::FPR128:
3068 if (AArch64::FPR128RegClass.contains(NextReg))
3069 RPI.Reg2 = NextReg;
3070 break;
3071 case RegPairInfo::PPR:
3072 break;
3073 case RegPairInfo::ZPR:
3074 if (AFI->getPredicateRegForFillSpill() != 0 &&
3075 ((RPI.Reg1 - AArch64::Z0) & 1) == 0 && (NextReg == RPI.Reg1 + 1)) {
3076 // Calculate offset of register pair to see if pair instruction can be
3077 // used.
3078 int Offset = (ScalableByteOffset + StackFillDir * 2 * Scale) / Scale;
3079 if ((-16 <= Offset && Offset <= 14) && (Offset % 2 == 0))
3080 RPI.Reg2 = NextReg;
3081 }
3082 break;
3083 case RegPairInfo::VG:
3084 break;
3085 }
3086 }
3087
3088 // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
3089 // list to come in sorted by frame index so that we can issue the store
3090 // pair instructions directly. Assert if we see anything otherwise.
3091 //
3092 // The order of the registers in the list is controlled by
3093 // getCalleeSavedRegs(), so they will always be in-order, as well.
3094 assert((!RPI.isPaired() ||
3095 (CSI[i].getFrameIdx() + RegInc == CSI[i + RegInc].getFrameIdx())) &&
3096 "Out of order callee saved regs!");
3097
3098 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg2 != AArch64::FP ||
3099 RPI.Reg1 == AArch64::LR) &&
3100 "FrameRecord must be allocated together with LR");
3101
3102 // Windows AAPCS has FP and LR reversed.
3103 assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
3104 RPI.Reg2 == AArch64::LR) &&
3105 "FrameRecord must be allocated together with LR");
3106
3107 // MachO's compact unwind format relies on all registers being stored in
3108 // adjacent register pairs.
3112 (RPI.isPaired() &&
3113 ((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
3114 RPI.Reg1 + 1 == RPI.Reg2))) &&
3115 "Callee-save registers not saved as adjacent register pair!");
3116
3117 RPI.FrameIdx = CSI[i].getFrameIdx();
3118 if (NeedsWinCFI &&
3119 RPI.isPaired()) // RPI.FrameIdx must be the lower index of the pair
3120 RPI.FrameIdx = CSI[i + RegInc].getFrameIdx();
3121
3122 int OffsetPre = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
3123 assert(OffsetPre % Scale == 0);
3124
3125 if (RPI.isScalable())
3126 ScalableByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
3127 else
3128 ByteOffset += StackFillDir * (RPI.isPaired() ? 2 * Scale : Scale);
3129
3130 // Swift's async context is directly before FP, so allocate an extra
3131 // 8 bytes for it.
3132 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
3133 ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
3134 (IsWindows && RPI.Reg2 == AArch64::LR)))
3135 ByteOffset += StackFillDir * 8;
3136
3137 // Round up size of non-pair to pair size if we need to pad the
3138 // callee-save area to ensure 16-byte alignment.
3139 if (NeedGapToAlignStack && !NeedsWinCFI && !RPI.isScalable() &&
3140 RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired() &&
3141 ByteOffset % 16 != 0) {
3142 ByteOffset += 8 * StackFillDir;
3143 assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
3144 // A stack frame with a gap looks like this, bottom up:
3145 // d9, d8. x21, gap, x20, x19.
3146 // Set extra alignment on the x21 object to create the gap above it.
3147 MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
3148 NeedGapToAlignStack = false;
3149 }
3150
3151 int OffsetPost = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
3152 assert(OffsetPost % Scale == 0);
3153 // If filling top down (default), we want the offset after incrementing it.
3154 // If filling bottom up (WinCFI) we need the original offset.
3155 int Offset = NeedsWinCFI ? OffsetPre : OffsetPost;
3156
3157 // The FP, LR pair goes 8 bytes into our expanded 24-byte slot so that the
3158 // Swift context can directly precede FP.
3159 if (NeedsFrameRecord && AFI->hasSwiftAsyncContext() &&
3160 ((!IsWindows && RPI.Reg2 == AArch64::FP) ||
3161 (IsWindows && RPI.Reg2 == AArch64::LR)))
3162 Offset += 8;
3163 RPI.Offset = Offset / Scale;
3164
3165 assert((!RPI.isPaired() ||
3166 (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
3167 (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
3168 "Offset out of bounds for LDP/STP immediate");
3169
3170 auto isFrameRecord = [&] {
3171 if (RPI.isPaired())
3172 return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
3173 : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
3174 // Otherwise, look for the frame record as two unpaired registers. This is
3175 // needed for -aarch64-stack-hazard-size=<val>, which disables register
3176 // pairing (as the padding may be too large for the LDP/STP offset). Note:
3177 // On Windows, this check works out as current reg == FP, next reg == LR,
3178 // and on other platforms current reg == FP, previous reg == LR. This
3179 // works out as the correct pre-increment or post-increment offsets
3180 // respectively.
3181 return i > 0 && RPI.Reg1 == AArch64::FP &&
3182 CSI[i - 1].getReg() == AArch64::LR;
3183 };
3184
3185 // Save the offset to frame record so that the FP register can point to the
3186 // innermost frame record (spilled FP and LR registers).
3187 if (NeedsFrameRecord && isFrameRecord())
3189
3190 RegPairs.push_back(RPI);
3191 if (RPI.isPaired())
3192 i += RegInc;
3193 }
3194 if (NeedsWinCFI) {
3195 // If we need an alignment gap in the stack, align the topmost stack
3196 // object. A stack frame with a gap looks like this, bottom up:
3197 // x19, d8. d9, gap.
3198 // Set extra alignment on the topmost stack object (the first element in
3199 // CSI, which goes top down), to create the gap above it.
3200 if (AFI->hasCalleeSaveStackFreeSpace())
3201 MFI.setObjectAlignment(CSI[0].getFrameIdx(), Align(16));
3202 // We iterated bottom up over the registers; flip RegPairs back to top
3203 // down order.
3204 std::reverse(RegPairs.begin(), RegPairs.end());
3205 }
3206}
3207
3208 bool AArch64FrameLowering::spillCalleeSavedRegisters(
3209     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
3210     ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3211 MachineFunction &MF = *MBB.getParent();
3214 bool NeedsWinCFI = needsWinCFI(MF);
3215 DebugLoc DL;
3217
3218 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
3219
3221 // Refresh the reserved regs in case there are any potential changes since the
3222 // last freeze.
3223 MRI.freezeReservedRegs();
3224
3225 if (homogeneousPrologEpilog(MF)) {
3226 auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
3228
3229 for (auto &RPI : RegPairs) {
3230 MIB.addReg(RPI.Reg1);
3231 MIB.addReg(RPI.Reg2);
3232
3233 // Update register live in.
3234 if (!MRI.isReserved(RPI.Reg1))
3235 MBB.addLiveIn(RPI.Reg1);
3236 if (RPI.isPaired() && !MRI.isReserved(RPI.Reg2))
3237 MBB.addLiveIn(RPI.Reg2);
3238 }
3239 return true;
3240 }
3241 bool PTrueCreated = false;
3242 for (const RegPairInfo &RPI : llvm::reverse(RegPairs)) {
3243 unsigned Reg1 = RPI.Reg1;
3244 unsigned Reg2 = RPI.Reg2;
3245 unsigned StrOpc;
3246
3247 // Issue sequence of spills for cs regs. The first spill may be converted
3248 // to a pre-decrement store later by emitPrologue if the callee-save stack
3249 // area allocation can't be combined with the local stack area allocation.
3250 // For example:
3251 // stp x22, x21, [sp, #0] // addImm(+0)
3252 // stp x20, x19, [sp, #16] // addImm(+2)
3253 // stp fp, lr, [sp, #32] // addImm(+4)
3254 // Rationale: This sequence saves uop updates compared to a sequence of
3255 // pre-increment spills like stp xi,xj,[sp,#-16]!
3256 // Note: Similar rationale and sequence for restores in epilog.
3257 unsigned Size;
3258 Align Alignment;
3259 switch (RPI.Type) {
3260 case RegPairInfo::GPR:
3261 StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
3262 Size = 8;
3263 Alignment = Align(8);
3264 break;
3265 case RegPairInfo::FPR64:
3266 StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
3267 Size = 8;
3268 Alignment = Align(8);
3269 break;
3270 case RegPairInfo::FPR128:
3271 StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
3272 Size = 16;
3273 Alignment = Align(16);
3274 break;
3275 case RegPairInfo::ZPR:
3276 StrOpc = RPI.isPaired() ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
3277 Size = 16;
3278 Alignment = Align(16);
3279 break;
3280 case RegPairInfo::PPR:
3281 StrOpc = AArch64::STR_PXI;
3282 Size = 2;
3283 Alignment = Align(2);
3284 break;
3285 case RegPairInfo::VG:
3286 StrOpc = AArch64::STRXui;
3287 Size = 8;
3288 Alignment = Align(8);
3289 break;
3290 }
3291
3292 unsigned X0Scratch = AArch64::NoRegister;
3293 if (Reg1 == AArch64::VG) {
3294 // Find an available register to store the value of VG to.
3295 Reg1 = findScratchNonCalleeSaveRegister(&MBB);
3296 assert(Reg1 != AArch64::NoRegister);
3297 SMEAttrs Attrs(MF.getFunction());
3298
3299 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface() &&
3300 AFI->getStreamingVGIdx() == std::numeric_limits<int>::max()) {
3301 // For locally-streaming functions, we need to store both the streaming
3302 // & non-streaming VG. Spill the streaming value first.
3303 BuildMI(MBB, MI, DL, TII.get(AArch64::RDSVLI_XI), Reg1)
3304 .addImm(1)
3305 .setMIFlag(MachineInstr::FrameSetup);
3306 BuildMI(MBB, MI, DL, TII.get(AArch64::UBFMXri), Reg1)
3307 .addReg(Reg1)
3308 .addImm(3)
3309 .addImm(63)
3310 .setMIFlag(MachineInstr::FrameSetup);
3311
3312 AFI->setStreamingVGIdx(RPI.FrameIdx);
3313 } else if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
3314 BuildMI(MBB, MI, DL, TII.get(AArch64::CNTD_XPiI), Reg1)
3315 .addImm(31)
3316 .addImm(1)
3317 .setMIFlag(MachineInstr::FrameSetup);
3318 AFI->setVGIdx(RPI.FrameIdx);
3319 } else {
3320 const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
3321 if (llvm::any_of(
3322 MBB.liveins(),
3323 [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
3324 return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
3325 AArch64::X0, LiveIn.PhysReg);
3326 }))
3327 X0Scratch = Reg1;
3328
3329 if (X0Scratch != AArch64::NoRegister)
3330 BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), Reg1)
3331 .addReg(AArch64::XZR)
3332 .addReg(AArch64::X0, RegState::Undef)
3333 .addReg(AArch64::X0, RegState::Implicit)
3334 .setMIFlag(MachineInstr::FrameSetup);
3335
3336 const uint32_t *RegMask = TRI->getCallPreservedMask(
3337 MF,
3338 CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1);
3339 BuildMI(MBB, MI, DL, TII.get(AArch64::BL))
3340 .addExternalSymbol("__arm_get_current_vg")
3341 .addRegMask(RegMask)
3342 .addReg(AArch64::X0, RegState::ImplicitDefine)
3343 .setMIFlag(MachineInstr::FrameSetup);
3344 Reg1 = AArch64::X0;
3345 AFI->setVGIdx(RPI.FrameIdx);
3346 }
3347 }
3348
3349 LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
3350 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3351 dbgs() << ") -> fi#(" << RPI.FrameIdx;
3352 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3353 dbgs() << ")\n");
3354
3355 assert((!NeedsWinCFI || !(Reg1 == AArch64::LR && Reg2 == AArch64::FP)) &&
3356 "Windows unwinding requires a consecutive (FP,LR) pair");
3357 // Windows unwind codes require consecutive registers if registers are
3358 // paired. Make the switch here, so that the code below will save (x,x+1)
3359 // and not (x+1,x).
3360 unsigned FrameIdxReg1 = RPI.FrameIdx;
3361 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3362 if (NeedsWinCFI && RPI.isPaired()) {
3363 std::swap(Reg1, Reg2);
3364 std::swap(FrameIdxReg1, FrameIdxReg2);
3365 }
3366
3367 if (RPI.isPaired() && RPI.isScalable()) {
3368 [[maybe_unused]] const AArch64Subtarget &Subtarget =
3369 MF.getSubtarget<AArch64Subtarget>();
3371 unsigned PnReg = AFI->getPredicateRegForFillSpill();
3372 assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
3373 "Expects SVE2.1 or SME2 target and a predicate register");
3374#ifdef EXPENSIVE_CHECKS
3375 auto IsPPR = [](const RegPairInfo &c) {
3376 return c.Reg1 == RegPairInfo::PPR;
3377 };
3378 auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
3379 auto IsZPR = [](const RegPairInfo &c) {
3380 return c.Type == RegPairInfo::ZPR;
3381 };
3382 auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
3383 assert(!(PPRBegin < ZPRBegin) &&
3384 "Expected callee save predicate to be handled first");
3385#endif
3386 if (!PTrueCreated) {
3387 PTrueCreated = true;
3388 BuildMI(MBB, MI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
3389 .setMIFlags(MachineInstr::FrameSetup);
3390 }
3391 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
3392 if (!MRI.isReserved(Reg1))
3393 MBB.addLiveIn(Reg1);
3394 if (!MRI.isReserved(Reg2))
3395 MBB.addLiveIn(Reg2);
3396 MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0));
3397 MIB.addMemOperand(MF.getMachineMemOperand(
3398 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3399 MachineMemOperand::MOStore, Size, Alignment));
3400 MIB.addReg(PnReg);
3401 MIB.addReg(AArch64::SP)
3402 .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale],
3403 // where 2*vscale is implicit
3404 .setMIFlag(MachineInstr::FrameSetup);
3405 MIB.addMemOperand(MF.getMachineMemOperand(
3406 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3407 MachineMemOperand::MOStore, Size, Alignment));
3408 if (NeedsWinCFI)
3409 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
3410 } else { // The case when a pair of ZRegs is not present
3411 MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
3412 if (!MRI.isReserved(Reg1))
3413 MBB.addLiveIn(Reg1);
3414 if (RPI.isPaired()) {
3415 if (!MRI.isReserved(Reg2))
3416 MBB.addLiveIn(Reg2);
3417 MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
3418 MIB.addMemOperand(MF.getMachineMemOperand(
3419 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3420 MachineMemOperand::MOStore, Size, Alignment));
3421 }
3422 MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
3423 .addReg(AArch64::SP)
3424 .addImm(RPI.Offset) // [sp, #offset*vscale],
3425 // where factor*vscale is implicit
3426 .setMIFlag(MachineInstr::FrameSetup);
3427 MIB.addMemOperand(MF.getMachineMemOperand(
3428 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3429 MachineMemOperand::MOStore, Size, Alignment));
3430 if (NeedsWinCFI)
3431 InsertSEH(MIB, TII, MachineInstr::FrameSetup);
3432 }
3433 // Update the StackIDs of the SVE stack slots.
3434 MachineFrameInfo &MFI = MF.getFrameInfo();
3435 if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) {
3436 MFI.setStackID(FrameIdxReg1, TargetStackID::ScalableVector);
3437 if (RPI.isPaired())
3438 MFI.setStackID(FrameIdxReg2, TargetStackID::ScalableVector);
3439 }
3440
3441 if (X0Scratch != AArch64::NoRegister)
3442 BuildMI(MBB, MI, DL, TII.get(AArch64::ORRXrr), AArch64::X0)
3443 .addReg(AArch64::XZR)
3444 .addReg(X0Scratch, RegState::Undef)
3445 .addReg(X0Scratch, RegState::Undef)
3446 .addReg(X0Scratch, RegState::Implicit)
3447 .setMIFlag(MachineInstr::FrameSetup);
3448 return true;
3449}
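// Illustrative note (editorial, not from the original source): RPI.Offset is
// already scaled to the access size chosen above. For a GPR pair the second
// pair slot yields addImm(+2), printing as `stp x20, x19, [sp, #16]` as in the
// example comment above; for a paired ZPR spill via ST1B_2Z_IMM the immediate
// counts register *pairs* of scalable 16-byte granules, which is why the code
// passes RPI.Offset / 2.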
3450
3451 bool AArch64FrameLowering::restoreCalleeSavedRegisters(
3452 MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
3453 MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
3454 MachineFunction &MF = *MBB.getParent();
3455 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
3456 DebugLoc DL;
3457 SmallVector<RegPairInfo, 8> RegPairs;
3458 bool NeedsWinCFI = needsWinCFI(MF);
3459
3460 if (MBBI != MBB.end())
3461 DL = MBBI->getDebugLoc();
3462
3463 computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
3464 if (homogeneousPrologEpilog(MF, &MBB)) {
3465 auto MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::HOM_Epilog))
3466 .setMIFlag(MachineInstr::FrameDestroy);
3467 for (auto &RPI : RegPairs) {
3468 MIB.addReg(RPI.Reg1, RegState::Define);
3469 MIB.addReg(RPI.Reg2, RegState::Define);
3470 }
3471 return true;
3472 }
3473
3474 // For performance reasons, restore SVE registers in increasing order
3475 auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
3476 auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);
3477 auto PPREnd = std::find_if_not(PPRBegin, RegPairs.end(), IsPPR);
3478 std::reverse(PPRBegin, PPREnd);
3479 auto IsZPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::ZPR; };
3480 auto ZPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsZPR);
3481 auto ZPREnd = std::find_if_not(ZPRBegin, RegPairs.end(), IsZPR);
3482 std::reverse(ZPRBegin, ZPREnd);
3483
3484 bool PTrueCreated = false;
3485 for (const RegPairInfo &RPI : RegPairs) {
3486 unsigned Reg1 = RPI.Reg1;
3487 unsigned Reg2 = RPI.Reg2;
3488
3489 // Issue sequence of restores for cs regs. The last restore may be converted
3490 // to a post-increment load later by emitEpilogue if the callee-save stack
3491 // area allocation can't be combined with the local stack area allocation.
3492 // For example:
3493 // ldp fp, lr, [sp, #32] // addImm(+4)
3494 // ldp x20, x19, [sp, #16] // addImm(+2)
3495 // ldp x22, x21, [sp, #0] // addImm(+0)
3496 // Note: see comment in spillCalleeSavedRegisters()
3497 unsigned LdrOpc;
3498 unsigned Size;
3499 Align Alignment;
3500 switch (RPI.Type) {
3501 case RegPairInfo::GPR:
3502 LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
3503 Size = 8;
3504 Alignment = Align(8);
3505 break;
3506 case RegPairInfo::FPR64:
3507 LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
3508 Size = 8;
3509 Alignment = Align(8);
3510 break;
3511 case RegPairInfo::FPR128:
3512 LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
3513 Size = 16;
3514 Alignment = Align(16);
3515 break;
3516 case RegPairInfo::ZPR:
3517 LdrOpc = RPI.isPaired() ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
3518 Size = 16;
3519 Alignment = Align(16);
3520 break;
3521 case RegPairInfo::PPR:
3522 LdrOpc = AArch64::LDR_PXI;
3523 Size = 2;
3524 Alignment = Align(2);
3525 break;
3526 case RegPairInfo::VG:
3527 continue;
3528 }
3529 LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
3530 if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
3531 dbgs() << ") -> fi#(" << RPI.FrameIdx;
3532 if (RPI.isPaired()) dbgs() << ", " << RPI.FrameIdx + 1;
3533 dbgs() << ")\n");
3534
3535 // Windows unwind codes require consecutive registers if registers are
3536 // paired. Make the switch here, so that the code below will restore (x,x+1)
3537 // and not (x+1,x).
3538 unsigned FrameIdxReg1 = RPI.FrameIdx;
3539 unsigned FrameIdxReg2 = RPI.FrameIdx + 1;
3540 if (NeedsWinCFI && RPI.isPaired()) {
3541 std::swap(Reg1, Reg2);
3542 std::swap(FrameIdxReg1, FrameIdxReg2);
3543 }
3544
3545 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3546 if (RPI.isPaired() && RPI.isScalable()) {
3547 [[maybe_unused]] const AArch64Subtarget &Subtarget =
3548 MF.getSubtarget<AArch64Subtarget>();
3549 unsigned PnReg = AFI->getPredicateRegForFillSpill();
3550 assert((PnReg != 0 && enableMultiVectorSpillFill(Subtarget, MF)) &&
3551 "Expects SVE2.1 or SME2 target and a predicate register");
3552#ifdef EXPENSIVE_CHECKS
3553 assert(!(PPRBegin < ZPRBegin) &&
3554 "Expected callee save predicate to be handled first");
3555#endif
3556 if (!PTrueCreated) {
3557 PTrueCreated = true;
3558 BuildMI(MBB, MBBI, DL, TII.get(AArch64::PTRUE_C_B), PnReg)
3559 .setMIFlags(MachineInstr::FrameDestroy);
3560 }
3561 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
3562 MIB.addReg(/*PairRegs*/ AArch64::Z0_Z1 + (RPI.Reg1 - AArch64::Z0),
3563 getDefRegState(true));
3564 MIB.addMemOperand(MF.getMachineMemOperand(
3565 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3566 MachineMemOperand::MOLoad, Size, Alignment));
3567 MIB.addReg(PnReg);
3568 MIB.addReg(AArch64::SP)
3569 .addImm(RPI.Offset / 2) // [sp, #imm*2*vscale]
3570 // where 2*vscale is implicit
3571 .setMIFlag(MachineInstr::FrameDestroy);
3572 MIB.addMemOperand(MF.getMachineMemOperand(
3573 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3574 MachineMemOperand::MOLoad, Size, Alignment));
3575 if (NeedsWinCFI)
3576 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
3577 } else {
3578 MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get(LdrOpc));
3579 if (RPI.isPaired()) {
3580 MIB.addReg(Reg2, getDefRegState(true));
3581 MIB.addMemOperand(MF.getMachineMemOperand(
3582 MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
3583 MachineMemOperand::MOLoad, Size, Alignment));
3584 }
3585 MIB.addReg(Reg1, getDefRegState(true));
3586 MIB.addReg(AArch64::SP)
3587 .addImm(RPI.Offset) // [sp, #offset*vscale]
3588 // where factor*vscale is implicit
3589 .setMIFlag(MachineInstr::FrameDestroy);
3590 MIB.addMemOperand(MF.getMachineMemOperand(
3591 MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
3592 MachineMemOperand::MOLoad, Size, Alignment));
3593 if (NeedsWinCFI)
3594 InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
3595 }
3596 }
3597 return true;
3598}
3599
3600// Return the FrameID for a MMO.
3601static std::optional<int> getMMOFrameID(MachineMemOperand *MMO,
3602 const MachineFrameInfo &MFI) {
3603 auto *PSV =
3604 dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
3605 if (PSV)
3606 return std::optional<int>(PSV->getFrameIndex());
3607
3608 if (MMO->getValue()) {
3609 if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
3610 for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
3611 FI++)
3612 if (MFI.getObjectAllocation(FI) == Al)
3613 return FI;
3614 }
3615 }
3616
3617 return std::nullopt;
3618}
3619
3620// Return the FrameID for a Load/Store instruction by looking at the first MMO.
3621static std::optional<int> getLdStFrameID(const MachineInstr &MI,
3622 const MachineFrameInfo &MFI) {
3623 if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
3624 return std::nullopt;
3625
3626 return getMMOFrameID(*MI.memoperands_begin(), MFI);
3627}
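// Illustrative sketch (editorial, not from the original source): one way the
// two helpers above can be combined to ask whether a load/store touches a
// scalable-vector stack slot. The helper name is hypothetical.
[[maybe_unused]] static bool accessesScalableStackSlot(const MachineInstr &MI,
                                                       const MachineFrameInfo &MFI) {
  // getLdStFrameID looks through the first memory operand to a frame index.
  std::optional<int> FI = getLdStFrameID(MI, MFI);
  return FI && *FI >= 0 &&
         MFI.getStackID(*FI) == TargetStackID::ScalableVector;
}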
3628
3629// Check if a Hazard slot is needed for the current function, and if so create
3630// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
3631// which can be used to determine if any hazard padding is needed.
3632void AArch64FrameLowering::determineStackHazardSlot(
3633 MachineFunction &MF, BitVector &SavedRegs) const {
3634 unsigned StackHazardSize = getStackHazardSize(MF);
3635 if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
3636 MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
3637 return;
3638
3639 // Stack hazards are only needed in streaming functions.
3640 SMEAttrs Attrs(MF.getFunction());
3641 if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
3642 return;
3643
3644 MachineFrameInfo &MFI = MF.getFrameInfo();
3645
3646 // Add a hazard slot if there are any CSR FPR registers, or any FP-only
3647 // stack objects.
3648 bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
3649 return AArch64::FPR64RegClass.contains(Reg) ||
3650 AArch64::FPR128RegClass.contains(Reg) ||
3651 AArch64::ZPRRegClass.contains(Reg) ||
3652 AArch64::PPRRegClass.contains(Reg);
3653 });
3654 bool HasFPRStackObjects = false;
3655 if (!HasFPRCSRs) {
3656 std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
3657 for (auto &MBB : MF) {
3658 for (auto &MI : MBB) {
3659 std::optional<int> FI = getLdStFrameID(MI, MFI);
3660 if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
3661 if (MFI.getStackID(*FI) == TargetStackID::ScalableVector ||
3662 AArch64InstrInfo::isFpOrNEON(MI))
3663 FrameObjects[*FI] |= 2;
3664 else
3665 FrameObjects[*FI] |= 1;
3666 }
3667 }
3668 }
3669 HasFPRStackObjects =
3670 any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
3671 }
3672
3673 if (HasFPRCSRs || HasFPRStackObjects) {
3674 int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
3675 LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
3676 << StackHazardSize << "\n");
3677 MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
3678 }
3679}
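// Illustrative note (editorial, not from the original source): with a nonzero
// hazard size configured, a function with a streaming body that saves an FPR
// callee-save such as d8 gets a 16-byte-aligned hazard object of
// StackHazardSize bytes here; assignCalleeSavedSpillSlots() later inserts a
// matching padding slot where the CSR list switches from GPRs to FPRs, so GPR
// and FPR/SVE accesses end up in separate SP-relative regions.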
3680
3681 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
3682 BitVector &SavedRegs,
3683 RegScavenger *RS) const {
3684 // All calls are tail calls in GHC calling conv, and functions have no
3685 // prologue/epilogue.
3686 if (MF.getFunction().getCallingConv() == CallingConv::GHC)
3687 return;
3688
3689 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
3690 const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
3691 MF.getSubtarget().getRegisterInfo());
3692 const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
3693 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
3694 unsigned UnspilledCSGPR = AArch64::NoRegister;
3695 unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
3696
3697 MachineFrameInfo &MFI = MF.getFrameInfo();
3698 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
3699
3700 unsigned BasePointerReg = RegInfo->hasBasePointer(MF)
3701 ? RegInfo->getBaseRegister()
3702 : (unsigned)AArch64::NoRegister;
3703
3704 unsigned ExtraCSSpill = 0;
3705 bool HasUnpairedGPR64 = false;
3706 bool HasPairZReg = false;
3707 // Figure out which callee-saved registers to save/restore.
3708 for (unsigned i = 0; CSRegs[i]; ++i) {
3709 const unsigned Reg = CSRegs[i];
3710
3711 // Add the base pointer register to SavedRegs if it is callee-save.
3712 if (Reg == BasePointerReg)
3713 SavedRegs.set(Reg);
3714
3715 bool RegUsed = SavedRegs.test(Reg);
3716 unsigned PairedReg = AArch64::NoRegister;
3717 const bool RegIsGPR64 = AArch64::GPR64RegClass.contains(Reg);
3718 if (RegIsGPR64 || AArch64::FPR64RegClass.contains(Reg) ||
3719 AArch64::FPR128RegClass.contains(Reg)) {
3720 // Compensate for odd numbers of GP CSRs.
3721 // For now, all the known cases of odd number of CSRs are of GPRs.
3722 if (HasUnpairedGPR64)
3723 PairedReg = CSRegs[i % 2 == 0 ? i - 1 : i + 1];
3724 else
3725 PairedReg = CSRegs[i ^ 1];
3726 }
3727
3728 // If the function requires all the GP registers to save (SavedRegs),
3729 // and there are an odd number of GP CSRs at the same time (CSRegs),
3730 // PairedReg could be in a different register class from Reg, which would
3731 // lead to a FPR (usually D8) accidentally being marked saved.
3732 if (RegIsGPR64 && !AArch64::GPR64RegClass.contains(PairedReg)) {
3733 PairedReg = AArch64::NoRegister;
3734 HasUnpairedGPR64 = true;
3735 }
3736 assert(PairedReg == AArch64::NoRegister ||
3737 AArch64::GPR64RegClass.contains(Reg, PairedReg) ||
3738 AArch64::FPR64RegClass.contains(Reg, PairedReg) ||
3739 AArch64::FPR128RegClass.contains(Reg, PairedReg));
3740
3741 if (!RegUsed) {
3742 if (AArch64::GPR64RegClass.contains(Reg) &&
3743 !RegInfo->isReservedReg(MF, Reg)) {
3744 UnspilledCSGPR = Reg;
3745 UnspilledCSGPRPaired = PairedReg;
3746 }
3747 continue;
3748 }
3749
3750 // MachO's compact unwind format relies on all registers being stored in
3751 // pairs.
3752 // FIXME: the usual format is actually better if unwinding isn't needed.
3753 if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
3754 !SavedRegs.test(PairedReg)) {
3755 SavedRegs.set(PairedReg);
3756 if (AArch64::GPR64RegClass.contains(PairedReg) &&
3757 !RegInfo->isReservedReg(MF, PairedReg))
3758 ExtraCSSpill = PairedReg;
3759 }
3760 // Check if there is a pair of ZRegs, so we can select a PReg for spill/fill
3761 HasPairZReg |= (AArch64::ZPRRegClass.contains(Reg, CSRegs[i ^ 1]) &&
3762 SavedRegs.test(CSRegs[i ^ 1]));
3763 }
3764
3765 if (HasPairZReg && enableMultiVectorSpillFill(Subtarget, MF)) {
3767 // Find a suitable predicate register for the multi-vector spill/fill
3768 // instructions.
3769 unsigned PnReg = findFreePredicateReg(SavedRegs);
3770 if (PnReg != AArch64::NoRegister)
3771 AFI->setPredicateRegForFillSpill(PnReg);
3772 // If no free callee-save register has been found, assign one.
3773 if (!AFI->getPredicateRegForFillSpill() &&
3774 MF.getFunction().getCallingConv() ==
3775 CallingConv::AArch64_SVE_VectorCall) {
3776 SavedRegs.set(AArch64::P8);
3777 AFI->setPredicateRegForFillSpill(AArch64::PN8);
3778 }
3779
3780 assert(!RegInfo->isReservedReg(MF, AFI->getPredicateRegForFillSpill()) &&
3781 "Predicate cannot be a reserved register");
3782 }
3783
3784 if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
3785 !Subtarget.isTargetWindows()) {
3786 // For Windows calling convention on a non-windows OS, where X18 is treated
3787 // as reserved, back up X18 when entering non-windows code (marked with the
3788 // Windows calling convention) and restore when returning regardless of
3789 // whether the individual function uses it - it might call other functions
3790 // that clobber it.
3791 SavedRegs.set(AArch64::X18);
3792 }
3793
3794 // Calculates the callee saved stack size.
3795 unsigned CSStackSize = 0;
3796 unsigned SVECSStackSize = 0;
3797 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3798 const MachineRegisterInfo &MRI = MF.getRegInfo();
3799 for (unsigned Reg : SavedRegs.set_bits()) {
3800 auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
3801 if (AArch64::PPRRegClass.contains(Reg) ||
3802 AArch64::ZPRRegClass.contains(Reg))
3803 SVECSStackSize += RegSize;
3804 else
3805 CSStackSize += RegSize;
3806 }
3807
3808 // Increase the callee-saved stack size if the function has streaming mode
3809 // changes, as we will need to spill the value of the VG register.
3810 // For locally streaming functions, we spill both the streaming and
3811 // non-streaming VG value.
3812 const Function &F = MF.getFunction();
3813 SMEAttrs Attrs(F);
3814 if (requiresSaveVG(MF)) {
3815 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
3816 CSStackSize += 16;
3817 else
3818 CSStackSize += 8;
3819 }
3820
3821 // Determine if a Hazard slot should be used, and increase the CSStackSize by
3822 // StackHazardSize if so.
3823 determineStackHazardSlot(MF, SavedRegs);
3824 if (AFI->hasStackHazardSlotIndex())
3825 CSStackSize += getStackHazardSize(MF);
3826
3827 // Save number of saved regs, so we can easily update CSStackSize later.
3828 unsigned NumSavedRegs = SavedRegs.count();
3829
3830 // The frame record needs to be created by saving the appropriate registers
3831 uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
3832 if (hasFP(MF) ||
3833 windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
3834 SavedRegs.set(AArch64::FP);
3835 SavedRegs.set(AArch64::LR);
3836 }
3837
3838 LLVM_DEBUG({
3839 dbgs() << "*** determineCalleeSaves\nSaved CSRs:";
3840 for (unsigned Reg : SavedRegs.set_bits())
3841 dbgs() << ' ' << printReg(Reg, RegInfo);
3842 dbgs() << "\n";
3843 });
3844
3845 // If any callee-saved registers are used, the frame cannot be eliminated.
3846 int64_t SVEStackSize =
3847 alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
3848 bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
3849
3850 // The CSR spill slots have not been allocated yet, so estimateStackSize
3851 // won't include them.
3852 unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
3853
3854 // We may address some of the stack above the canonical frame address, either
3855 // for our own arguments or during a call. Include that in calculating whether
3856 // we have complicated addressing concerns.
3857 int64_t CalleeStackUsed = 0;
3858 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
3859 int64_t FixedOff = MFI.getObjectOffset(I);
3860 if (FixedOff > CalleeStackUsed)
3861 CalleeStackUsed = FixedOff;
3862 }
3863
3864 // Conservatively always assume BigStack when there are SVE spills.
3865 bool BigStack = SVEStackSize || (EstimatedStackSize + CSStackSize +
3866 CalleeStackUsed) > EstimatedStackSizeLimit;
3867 if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
3868 AFI->setHasStackFrame(true);
3869
3870 // Estimate if we might need to scavenge a register at some point in order
3871 // to materialize a stack offset. If so, either spill one additional
3872 // callee-saved register or reserve a special spill slot to facilitate
3873 // register scavenging. If we already spilled an extra callee-saved register
3874 // above to keep the number of spills even, we don't need to do anything else
3875 // here.
3876 if (BigStack) {
3877 if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
3878 LLVM_DEBUG(dbgs() << "Spilling " << printReg(UnspilledCSGPR, RegInfo)
3879 << " to get a scratch register.\n");
3880 SavedRegs.set(UnspilledCSGPR);
3881 ExtraCSSpill = UnspilledCSGPR;
3882
3883 // MachO's compact unwind format relies on all registers being stored in
3884 // pairs, so if we need to spill one extra for BigStack, then we need to
3885 // store the pair.
3886 if (producePairRegisters(MF)) {
3887 if (UnspilledCSGPRPaired == AArch64::NoRegister) {
3888 // Failed to make a pair for compact unwind format, revert spilling.
3889 if (produceCompactUnwindFrame(MF)) {
3890 SavedRegs.reset(UnspilledCSGPR);
3891 ExtraCSSpill = AArch64::NoRegister;
3892 }
3893 } else
3894 SavedRegs.set(UnspilledCSGPRPaired);
3895 }
3896 }
3897
3898 // If we didn't find an extra callee-saved register to spill, create
3899 // an emergency spill slot.
3900 if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
3902 const TargetRegisterClass &RC = AArch64::GPR64RegClass;
3903 unsigned Size = TRI->getSpillSize(RC);
3904 Align Alignment = TRI->getSpillAlign(RC);
3905 int FI = MFI.CreateStackObject(Size, Alignment, false);
3906 RS->addScavengingFrameIndex(FI);
3907 LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
3908 << " as the emergency spill slot.\n");
3909 }
3910 }
3911
3912 // Adding the size of additional 64bit GPR saves.
3913 CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
3914
3915 // A Swift asynchronous context extends the frame record with a pointer
3916 // directly before FP.
3917 if (hasFP(MF) && AFI->hasSwiftAsyncContext())
3918 CSStackSize += 8;
3919
3920 uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
3921 LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
3922 << EstimatedStackSize + AlignedCSStackSize << " bytes.\n");
3923
3924 assert((!MFI.isCalleeSavedInfoValid() ||
3925 AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
3926 "Should not invalidate callee saved info");
3927
3928 // Round up to register pair alignment to avoid additional SP adjustment
3929 // instructions.
3930 AFI->setCalleeSavedStackSize(AlignedCSStackSize);
3931 AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
3932 AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
3933}
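// Illustrative worked example (editorial, not from the original source): a
// function saving {x19, x20, fp, lr} has CSStackSize = 4 * 8 = 32 bytes. If it
// also needs a single VG save for streaming-mode changes, CSStackSize becomes
// 40, AlignedCSStackSize rounds up to 48, and CalleeSaveStackHasFreeSpace is
// recorded because the two values differ.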
3934
3935 bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
3936 MachineFunction &MF, const TargetRegisterInfo *RegInfo,
3937 std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
3938 unsigned &MaxCSFrameIndex) const {
3939 bool NeedsWinCFI = needsWinCFI(MF);
3940 unsigned StackHazardSize = getStackHazardSize(MF);
3941 // To match the canonical windows frame layout, reverse the list of
3942 // callee saved registers to get them laid out by PrologEpilogInserter
3943 // in the right order. (PrologEpilogInserter allocates stack objects top
3944 // down. Windows canonical prologs store higher numbered registers at
3945 // the top, thus have the CSI array start from the highest registers.)
3946 if (NeedsWinCFI)
3947 std::reverse(CSI.begin(), CSI.end());
3948
3949 if (CSI.empty())
3950 return true; // Early exit if no callee saved registers are modified!
3951
3952 // Now that we know which registers need to be saved and restored, allocate
3953 // stack slots for them.
3954 MachineFrameInfo &MFI = MF.getFrameInfo();
3955 auto *AFI = MF.getInfo<AArch64FunctionInfo>();
3956
3957 bool UsesWinAAPCS = isTargetWindows(MF);
3958 if (UsesWinAAPCS && hasFP(MF) && AFI->hasSwiftAsyncContext()) {
3959 int FrameIdx = MFI.CreateStackObject(8, Align(16), true);
3960 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
3961 if ((unsigned)FrameIdx < MinCSFrameIndex)
3962 MinCSFrameIndex = FrameIdx;
3963 if ((unsigned)FrameIdx > MaxCSFrameIndex)
3964 MaxCSFrameIndex = FrameIdx;
3965 }
3966
3967 // Insert VG into the list of CSRs, immediately before LR if saved.
3968 if (requiresSaveVG(MF)) {
3969 std::vector<CalleeSavedInfo> VGSaves;
3970 SMEAttrs Attrs(MF.getFunction());
3971
3972 auto VGInfo = CalleeSavedInfo(AArch64::VG);
3973 VGInfo.setRestored(false);
3974 VGSaves.push_back(VGInfo);
3975
3976 // Add VG again if the function is locally-streaming, as we will spill two
3977 // values.
3978 if (Attrs.hasStreamingBody() && !Attrs.hasStreamingInterface())
3979 VGSaves.push_back(VGInfo);
3980
3981 bool InsertBeforeLR = false;
3982
3983 for (unsigned I = 0; I < CSI.size(); I++)
3984 if (CSI[I].getReg() == AArch64::LR) {
3985 InsertBeforeLR = true;
3986 CSI.insert(CSI.begin() + I, VGSaves.begin(), VGSaves.end());
3987 break;
3988 }
3989
3990 if (!InsertBeforeLR)
3991 CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
3992 }
3993
3994 Register LastReg = 0;
3995 int HazardSlotIndex = std::numeric_limits<int>::max();
3996 for (auto &CS : CSI) {
3997 Register Reg = CS.getReg();
3998 const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
3999
4000 // Create a hazard slot as we switch between GPR and FPR CSRs.
4001 if (AFI->hasStackHazardSlotIndex() &&
4002 (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
4003 AArch64InstrInfo::isFpOrNEON(Reg)) {
4004 assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
4005 "Unexpected register order for hazard slot");
4006 HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
4007 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4008 << "\n");
4009 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4010 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4011 MinCSFrameIndex = HazardSlotIndex;
4012 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4013 MaxCSFrameIndex = HazardSlotIndex;
4014 }
4015
4016 unsigned Size = RegInfo->getSpillSize(*RC);
4017 Align Alignment(RegInfo->getSpillAlign(*RC));
4018 int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
4019 CS.setFrameIdx(FrameIdx);
4020
4021 if ((unsigned)FrameIdx < MinCSFrameIndex)
4022 MinCSFrameIndex = FrameIdx;
4023 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4024 MaxCSFrameIndex = FrameIdx;
4025
4026 // Grab 8 bytes below FP for the extended asynchronous frame info.
4027 if (hasFP(MF) && AFI->hasSwiftAsyncContext() && !UsesWinAAPCS &&
4028 Reg == AArch64::FP) {
4029 FrameIdx = MFI.CreateStackObject(8, Alignment, true);
4030 AFI->setSwiftAsyncContextFrameIdx(FrameIdx);
4031 if ((unsigned)FrameIdx < MinCSFrameIndex)
4032 MinCSFrameIndex = FrameIdx;
4033 if ((unsigned)FrameIdx > MaxCSFrameIndex)
4034 MaxCSFrameIndex = FrameIdx;
4035 }
4036 LastReg = Reg;
4037 }
4038
4039 // Add hazard slot in the case where no FPR CSRs are present.
4040 if (AFI->hasStackHazardSlotIndex() &&
4041 HazardSlotIndex == std::numeric_limits<int>::max()) {
4042 HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
4043 LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
4044 << "\n");
4045 AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
4046 if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
4047 MinCSFrameIndex = HazardSlotIndex;
4048 if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
4049 MaxCSFrameIndex = HazardSlotIndex;
4050 }
4051
4052 return true;
4053}
4054
4055 bool AArch64FrameLowering::enableStackSlotScavenging(
4056 const MachineFunction &MF) const {
4057 const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4058 // If the function has streaming-mode changes, don't scavenge a
4059 // spillslot in the callee-save area, as that might require an
4060 // 'addvl' in the streaming-mode-changing call-sequence when the
4061 // function doesn't use a FP.
4062 if (AFI->hasStreamingModeChanges() && !hasFP(MF))
4063 return false;
4064 // Don't allow register salvaging with hazard slots, in case it moves objects
4065 // into the wrong place.
4066 if (AFI->hasStackHazardSlotIndex())
4067 return false;
4068 return AFI->hasCalleeSaveStackFreeSpace();
4069}
4070
4071 /// Returns true if there are any SVE callee saves.
4072 static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
4073 int &Min, int &Max) {
4074 Min = std::numeric_limits<int>::max();
4075 Max = std::numeric_limits<int>::min();
4076
4077 if (!MFI.isCalleeSavedInfoValid())
4078 return false;
4079
4080 const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
4081 for (auto &CS : CSI) {
4082 if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
4083 AArch64::PPRRegClass.contains(CS.getReg())) {
4084 assert((Max == std::numeric_limits<int>::min() ||
4085 Max + 1 == CS.getFrameIdx()) &&
4086 "SVE CalleeSaves are not consecutive");
4087
4088 Min = std::min(Min, CS.getFrameIdx());
4089 Max = std::max(Max, CS.getFrameIdx());
4090 }
4091 }
4092 return Min != std::numeric_limits<int>::max();
4093}
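// Illustrative usage (editorial, not from the original source):
//   int MinCSFI, MaxCSFI;
//   if (getSVECalleeSaveSlotRange(MFI, MinCSFI, MaxCSFI))
//     ... // [MinCSFI, MaxCSFI] is the contiguous block of Z/P callee-save slots
// The assertion above depends on those slots having been created back-to-back
// when the callee-save spill slots were assigned.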
4094
4095// Process all the SVE stack objects and determine offsets for each
4096// object. If AssignOffsets is true, the offsets get assigned.
4097// Fills in the first and last callee-saved frame indices into
4098// Min/MaxCSFrameIndex, respectively.
4099// Returns the size of the stack.
4100 static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
4101 int &MinCSFrameIndex,
4102 int &MaxCSFrameIndex,
4103 bool AssignOffsets) {
4104#ifndef NDEBUG
4105 // First process all fixed stack objects.
4106 for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
4107 assert(MFI.getStackID(I) != TargetStackID::ScalableVector &&
4108 "SVE vectors should never be passed on the stack by value, only by "
4109 "reference.");
4110#endif
4111
4112 auto Assign = [&MFI](int FI, int64_t Offset) {
4113 LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
4114 MFI.setObjectOffset(FI, Offset);
4115 };
4116
4117 int64_t Offset = 0;
4118
4119 // Then process all callee saved slots.
4120 if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
4121 // Assign offsets to the callee save slots.
4122 for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
4123 Offset += MFI.getObjectSize(I);
4124 Offset = alignTo(Offset, MFI.getObjectAlign(I));
4125 if (AssignOffsets)
4126 Assign(I, -Offset);
4127 }
4128 }
4129
4130 // Ensure that the callee-save area is aligned to 16 bytes.
4131 Offset = alignTo(Offset, Align(16U));
4132
4133 // Create a buffer of SVE objects to allocate and sort it.
4134 SmallVector<int, 8> ObjectsToAllocate;
4135 // If we have a stack protector, and we've previously decided that we have SVE
4136 // objects on the stack and thus need it to go in the SVE stack area, then it
4137 // needs to go first.
4138 int StackProtectorFI = -1;
4139 if (MFI.hasStackProtectorIndex()) {
4140 StackProtectorFI = MFI.getStackProtectorIndex();
4141 if (MFI.getStackID(StackProtectorFI) == TargetStackID::ScalableVector)
4142 ObjectsToAllocate.push_back(StackProtectorFI);
4143 }
4144 for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
4145 unsigned StackID = MFI.getStackID(I);
4146 if (StackID != TargetStackID::ScalableVector)
4147 continue;
4148 if (I == StackProtectorFI)
4149 continue;
4150 if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
4151 continue;
4152 if (MFI.isDeadObjectIndex(I))
4153 continue;
4154
4155 ObjectsToAllocate.push_back(I);
4156 }
4157
4158 // Allocate all SVE locals and spills
4159 for (unsigned FI : ObjectsToAllocate) {
4160 Align Alignment = MFI.getObjectAlign(FI);
4161 // FIXME: Given that the length of SVE vectors is not necessarily a power of
4162 // two, we'd need to align every object dynamically at runtime if the
4163 // alignment is larger than 16. This is not yet supported.
4164 if (Alignment > Align(16))
4165 report_fatal_error(
4166 "Alignment of scalable vectors > 16 bytes is not yet supported");
4167
4168 Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
4169 if (AssignOffsets)
4170 Assign(FI, -Offset);
4171 }
4172
4173 return Offset;
4174}
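// Illustrative worked example (editorial, not from the original source): with
// two 16-byte SVE callee-save slots and one 32-byte, 16-byte-aligned SVE
// local, the callee saves are assigned scalable offsets -16 and -32, the
// callee-save area is padded to 32, the local is placed at -64, and the
// function returns 64 (i.e. 64 * vscale bytes of SVE stack).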
4175
4176int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
4177 MachineFrameInfo &MFI) const {
4178 int MinCSFrameIndex, MaxCSFrameIndex;
4179 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
4180}
4181
4182int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
4183 MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
4184 return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
4185 true);
4186}
4187
4188 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
4189 MachineFunction &MF, RegScavenger *RS) const {
4190 MachineFrameInfo &MFI = MF.getFrameInfo();
4191
4192 assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
4193 "Upwards growing stack unsupported");
4194
4195 int MinCSFrameIndex, MaxCSFrameIndex;
4196 int64_t SVEStackSize =
4197 assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
4198
4199 AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
4200 AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
4201 AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
4202
4203 // If this function isn't doing Win64-style C++ EH, we don't need to do
4204 // anything.
4205 if (!MF.hasEHFunclets())
4206 return;
4207 const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
4208 WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo();
4209
4210 MachineBasicBlock &MBB = MF.front();
4211 auto MBBI = MBB.begin();
4212 while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
4213 ++MBBI;
4214
4215 // Create an UnwindHelp object.
4216 // The UnwindHelp object is allocated at the start of the fixed object area
4217 int64_t FixedObject =
4218 getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
4219 int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
4220 /*SPOffset*/ -FixedObject,
4221 /*IsImmutable=*/false);
4222 EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
4223
4224 // We need to store -2 into the UnwindHelp object at the start of the
4225 // function.
4226 DebugLoc DL;
4227 RS->enterBasicBlockEnd(MBB);
4228 RS->backward(MBBI);
4229 Register DstReg = RS->FindUnusedReg(&AArch64::GPR64commonRegClass);
4230 assert(DstReg && "There must be a free register after frame setup");
4231 BuildMI(MBB, MBBI, DL, TII.get(AArch64::MOVi64imm), DstReg).addImm(-2);
4232 BuildMI(MBB, MBBI, DL, TII.get(AArch64::STURXi))
4233 .addReg(DstReg, getKillRegState(true))
4234 .addFrameIndex(UnwindHelpFI)
4235 .addImm(0);
4236}
4237
4238namespace {
4239struct TagStoreInstr {
4240 MachineInstr *MI;
4241 int64_t Offset, Size;
4242 explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
4243 : MI(MI), Offset(Offset), Size(Size) {}
4244};
4245
4246class TagStoreEdit {
4247 MachineFunction *MF;
4248 MachineBasicBlock *MBB;
4249 MachineRegisterInfo *MRI;
4250 // Tag store instructions that are being replaced.
4251 SmallVector<TagStoreInstr, 8> TagStores;
4252 // Combined memref arguments of the above instructions.
4253 SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
4254
4255 // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
4256 // FrameRegOffset + Size) with the address tag of SP.
4257 Register FrameReg;
4258 StackOffset FrameRegOffset;
4259 int64_t Size;
4260 // If not std::nullopt, move FrameReg to (FrameReg + FrameRegUpdate) at the
4261 // end.
4262 std::optional<int64_t> FrameRegUpdate;
4263 // MIFlags for any FrameReg updating instructions.
4264 unsigned FrameRegUpdateFlags;
4265
4266 // Use zeroing instruction variants.
4267 bool ZeroData;
4268 DebugLoc DL;
4269
4270 void emitUnrolled(MachineBasicBlock::iterator InsertI);
4271 void emitLoop(MachineBasicBlock::iterator InsertI);
4272
4273public:
4274 TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
4275 : MBB(MBB), ZeroData(ZeroData) {
4276 MF = MBB->getParent();
4277 MRI = &MF->getRegInfo();
4278 }
4279 // Add an instruction to be replaced. Instructions must be added in the
4280 // ascending order of Offset, and have to be adjacent.
4281 void addInstruction(TagStoreInstr I) {
4282 assert((TagStores.empty() ||
4283 TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
4284 "Non-adjacent tag store instructions.");
4285 TagStores.push_back(I);
4286 }
4287 void clear() { TagStores.clear(); }
4288 // Emit equivalent code at the given location, and erase the current set of
4289 // instructions. May skip if the replacement is not profitable. May invalidate
4290 // the input iterator and replace it with a valid one.
4291 void emitCode(MachineBasicBlock::iterator &InsertI,
4292 const AArch64FrameLowering *TFI, bool TryMergeSPUpdate);
4293};
4294
4295void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
4296 const AArch64InstrInfo *TII =
4297 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4298
4299 const int64_t kMinOffset = -256 * 16;
4300 const int64_t kMaxOffset = 255 * 16;
4301
4302 Register BaseReg = FrameReg;
4303 int64_t BaseRegOffsetBytes = FrameRegOffset.getFixed();
4304 if (BaseRegOffsetBytes < kMinOffset ||
4305 BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset ||
4306 // BaseReg can be FP, which is not necessarily aligned to 16-bytes. In
4307 // that case, BaseRegOffsetBytes will not be aligned to 16 bytes, which
4308 // is required for the offset of ST2G.
4309 BaseRegOffsetBytes % 16 != 0) {
4310 Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4311 emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
4312 StackOffset::getFixed(BaseRegOffsetBytes), TII);
4313 BaseReg = ScratchReg;
4314 BaseRegOffsetBytes = 0;
4315 }
4316
4317 MachineInstr *LastI = nullptr;
4318 while (Size) {
4319 int64_t InstrSize = (Size > 16) ? 32 : 16;
4320 unsigned Opcode =
4321 InstrSize == 16
4322 ? (ZeroData ? AArch64::STZGi : AArch64::STGi)
4323 : (ZeroData ? AArch64::STZ2Gi : AArch64::ST2Gi);
4324 assert(BaseRegOffsetBytes % 16 == 0);
4325 MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
4326 .addReg(AArch64::SP)
4327 .addReg(BaseReg)
4328 .addImm(BaseRegOffsetBytes / 16)
4329 .setMemRefs(CombinedMemRefs);
4330 // A store to [BaseReg, #0] should go last for an opportunity to fold the
4331 // final SP adjustment in the epilogue.
4332 if (BaseRegOffsetBytes == 0)
4333 LastI = I;
4334 BaseRegOffsetBytes += InstrSize;
4335 Size -= InstrSize;
4336 }
4337
4338 if (LastI)
4339 MBB->splice(InsertI, MBB, LastI);
4340}
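// Illustrative note (editorial, not from the original source): for a 48-byte
// region starting at offset 0 this emits one ST2G covering [0, 32) and one STG
// covering [32, 48); the store at offset 0 is spliced to the end of the
// sequence so that a following SP adjustment in the epilogue can be folded
// into it.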
4341
4342void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
4343 const AArch64InstrInfo *TII =
4344 MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
4345
4346 Register BaseReg = FrameRegUpdate
4347 ? FrameReg
4348 : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4349 Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
4350
4351 emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
4352
4353 int64_t LoopSize = Size;
4354 // If the loop size is not a multiple of 32, split off one 16-byte store at
4355 // the end to fold BaseReg update into.
4356 if (FrameRegUpdate && *FrameRegUpdate)
4357 LoopSize -= LoopSize % 32;
4358 MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
4359 TII->get(ZeroData ? AArch64::STZGloop_wback
4360 : AArch64::STGloop_wback))
4361 .addDef(SizeReg)
4362 .addDef(BaseReg)
4363 .addImm(LoopSize)
4364 .addReg(BaseReg)
4365 .setMemRefs(CombinedMemRefs);
4366 if (FrameRegUpdate)
4367 LoopI->setFlags(FrameRegUpdateFlags);
4368
4369 int64_t ExtraBaseRegUpdate =
4370 FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getFixed() - Size) : 0;
4371 LLVM_DEBUG(dbgs() << "TagStoreEdit::emitLoop: LoopSize=" << LoopSize
4372 << ", Size=" << Size
4373 << ", ExtraBaseRegUpdate=" << ExtraBaseRegUpdate
4374 << ", FrameRegUpdate=" << FrameRegUpdate
4375 << ", FrameRegOffset.getFixed()="
4376 << FrameRegOffset.getFixed() << "\n");
4377 if (LoopSize < Size) {
4378 assert(FrameRegUpdate);
4379 assert(Size - LoopSize == 16);
4380 // Tag 16 more bytes at BaseReg and update BaseReg.
4381 int64_t STGOffset = ExtraBaseRegUpdate + 16;
4382 assert(STGOffset % 16 == 0 && STGOffset >= -4096 && STGOffset <= 4080 &&
4383 "STG immediate out of range");
4384 BuildMI(*MBB, InsertI, DL,
4385 TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
4386 .addDef(BaseReg)
4387 .addReg(BaseReg)
4388 .addReg(BaseReg)
4389 .addImm(STGOffset / 16)
4390 .setMemRefs(CombinedMemRefs)
4391 .setMIFlags(FrameRegUpdateFlags);
4392 } else if (ExtraBaseRegUpdate) {
4393 // Update BaseReg.
4394 int64_t AddSubOffset = std::abs(ExtraBaseRegUpdate);
4395 assert(AddSubOffset <= 4095 && "ADD/SUB immediate out of range");
4396 BuildMI(
4397 *MBB, InsertI, DL,
4398 TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
4399 .addDef(BaseReg)
4400 .addReg(BaseReg)
4401 .addImm(AddSubOffset)
4402 .addImm(0)
4403 .setMIFlags(FrameRegUpdateFlags);
4404 }
4405}
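// Illustrative note (editorial, not from the original source): when a nonzero
// base-register update is being folded (FrameRegUpdate is set) and Size is,
// say, 80 bytes, LoopSize is rounded down to 64 and the loop tags only those
// bytes; the post-indexed STG/STZG above then tags the final 16 bytes and
// applies ExtraBaseRegUpdate + 16 to BaseReg in the same instruction.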
4406
4407// Check if *II is a register update that can be merged into STGloop that ends
4408// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the
4409// end of the loop.
4410bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
4411 int64_t Size, int64_t *TotalOffset) {
4412 MachineInstr &MI = *II;
4413 if ((MI.getOpcode() == AArch64::ADDXri ||
4414 MI.getOpcode() == AArch64::SUBXri) &&
4415 MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
4416 unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
4417 int64_t Offset = MI.getOperand(2).getImm() << Shift;
4418 if (MI.getOpcode() == AArch64::SUBXri)
4419 Offset = -Offset;
4420 int64_t PostOffset = Offset - Size;
4421 // TagStoreEdit::emitLoop might emit either an ADD/SUB after the loop, or
4422 // an STGPostIndex which does the last 16 bytes of tag write. Which one is
4423 // chosen depends on the alignment of the loop size, but the difference
4424 // between the valid ranges for the two instructions is small, so we
4425 // conservatively assume that it could be either case here.
4426 //
4427 // Max offset of STGPostIndex, minus the 16 byte tag write folded into that
4428 // instruction.
4429 const int64_t kMaxOffset = 4080 - 16;
4430 // Max offset of SUBXri.
4431 const int64_t kMinOffset = -4095;
4432 if (PostOffset <= kMaxOffset && PostOffset >= kMinOffset &&
4433 PostOffset % 16 == 0) {
4434 *TotalOffset = Offset;
4435 return true;
4436 }
4437 }
4438 return false;
4439}
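// Illustrative worked example (editorial, not from the original source):
// suppose the epilogue contains `add sp, sp, #80` immediately after tag stores
// covering Size = 64 bytes at a fixed frame offset of 0. Then Offset = 80,
// PostOffset = 80 - 64 = 16, which is 16-byte aligned and within
// [kMinOffset, kMaxOffset], so the ADD can be deleted and the remaining
// 16-byte adjustment folded into the tag-store sequence.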
4440
4441void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
4442 SmallVectorImpl<MachineMemOperand *> &MemRefs) {
4443 MemRefs.clear();
4444 for (auto &TS : TSE) {
4445 MachineInstr *MI = TS.MI;
4446 // An instruction without memory operands may access anything. Be
4447 // conservative and return an empty list.
4448 if (MI->memoperands_empty()) {
4449 MemRefs.clear();
4450 return;
4451 }
4452 MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
4453 }
4454}
4455
4456void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
4457 const AArch64FrameLowering *TFI,
4458 bool TryMergeSPUpdate) {
4459 if (TagStores.empty())
4460 return;
4461 TagStoreInstr &FirstTagStore = TagStores[0];
4462 TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
4463 Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
4464 DL = TagStores[0].MI->getDebugLoc();
4465
4466 Register Reg;
4467 FrameRegOffset = TFI->resolveFrameOffsetReference(
4468 *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
4469 /*PreferFP=*/false, /*ForSimm=*/true);
4470 FrameReg = Reg;
4471 FrameRegUpdate = std::nullopt;
4472
4473 mergeMemRefs(TagStores, CombinedMemRefs);
4474
4475 LLVM_DEBUG({
4476 dbgs() << "Replacing adjacent STG instructions:\n";
4477 for (const auto &Instr : TagStores) {
4478 dbgs() << " " << *Instr.MI;
4479 }
4480 });
4481
4482 // Size threshold where a loop becomes shorter than a linear sequence of
4483 // tagging instructions.
4484 const int kSetTagLoopThreshold = 176;
4485 if (Size < kSetTagLoopThreshold) {
4486 if (TagStores.size() < 2)
4487 return;
4488 emitUnrolled(InsertI);
4489 } else {
4490 MachineInstr *UpdateInstr = nullptr;
4491 int64_t TotalOffset = 0;
4492 if (TryMergeSPUpdate) {
4493 // See if we can merge base register update into the STGloop.
4494 // This is done in AArch64LoadStoreOptimizer for "normal" stores,
4495 // but STGloop is way too unusual for that, and also it only
4496 // realistically happens in function epilogue. Also, STGloop is expanded
4497 // before that pass.
4498 if (InsertI != MBB->end() &&
4499 canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getFixed() + Size,
4500 &TotalOffset)) {
4501 UpdateInstr = &*InsertI++;
4502 LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
4503 << *UpdateInstr);
4504 }
4505 }
4506
4507 if (!UpdateInstr && TagStores.size() < 2)
4508 return;
4509
4510 if (UpdateInstr) {
4511 FrameRegUpdate = TotalOffset;
4512 FrameRegUpdateFlags = UpdateInstr->getFlags();
4513 }
4514 emitLoop(InsertI);
4515 if (UpdateInstr)
4516 UpdateInstr->eraseFromParent();
4517 }
4518
4519 for (auto &TS : TagStores)
4520 TS.MI->eraseFromParent();
4521}
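// Illustrative note (editorial, not from the original source):
// kSetTagLoopThreshold separates the two strategies above: a 64-byte region is
// emitted as a short unrolled STG/ST2G sequence, while a 256-byte region goes
// through the STGloop path, optionally folding a trailing SP update as checked
// by canMergeRegUpdate().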
4522
4523bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
4524 int64_t &Size, bool &ZeroData) {
4525 MachineFunction &MF = *MI.getParent()->getParent();
4526 const MachineFrameInfo &MFI = MF.getFrameInfo();
4527
4528 unsigned Opcode = MI.getOpcode();
4529 ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGi ||
4530 Opcode == AArch64::STZ2Gi);
4531
4532 if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
4533 if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
4534 return false;
4535 if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
4536 return false;
4537 Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
4538 Size = MI.getOperand(2).getImm();
4539 return true;
4540 }
4541
4542 if (Opcode == AArch64::STGi || Opcode == AArch64::STZGi)
4543 Size = 16;
4544 else if (Opcode == AArch64::ST2Gi || Opcode == AArch64::STZ2Gi)
4545 Size = 32;
4546 else
4547 return false;
4548
4549 if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
4550 return false;
4551
4552 Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
4553 16 * MI.getOperand(2).getImm();
4554 return true;
4555}
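// Illustrative note (editorial, not from the original source): a plain
// `STGi $sp, <fi#N>, 2` is reported here as Size = 16 and
// Offset = ObjectOffset(fi#N) + 32, i.e. the frame-relative byte range the tag
// write covers, which the merging code below uses to detect adjacent runs.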
4556
4557// Detect a run of memory tagging instructions for adjacent stack frame slots,
4558// and replace them with a shorter instruction sequence:
4559// * replace STG + STG with ST2G
4560// * replace STGloop + STGloop with STGloop
4561// This code needs to run when stack slot offsets are already known, but before
4562// FrameIndex operands in STG instructions are eliminated.
4563 MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
4564 const AArch64FrameLowering *TFI,
4565 RegScavenger *RS) {
4566 bool FirstZeroData;
4567 int64_t Size, Offset;
4568 MachineInstr &MI = *II;
4569 MachineBasicBlock *MBB = MI.getParent();
4571 if (&MI == &MBB->instr_back())
4572 return II;
4573 if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
4574 return II;
4575
4576 SmallVector<TagStoreInstr, 8> Instrs;
4577 Instrs.emplace_back(&MI, Offset, Size);
4578
4579 constexpr int kScanLimit = 10;
4580 int Count = 0;
4581 for (MachineBasicBlock::iterator NextI = ++II, E = MBB->end();
4582 NextI != E && Count < kScanLimit; ++NextI) {
4583 MachineInstr &MI = *NextI;
4584 bool ZeroData;
4585 int64_t Size, Offset;
4586 // Collect instructions that update memory tags with a FrameIndex operand
4587 // and (when applicable) constant size, and whose output registers are dead
4588 // (the latter is almost always the case in practice). Since these
4589 // instructions effectively have no inputs or outputs, we are free to skip
4590 // any non-aliasing instructions in between without tracking used registers.