LLVM 22.0.0git
MachineSMEABIPass.cpp
Go to the documentation of this file.
1//===- MachineSMEABIPass.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the SME ABI requirements for ZA state. This includes
10// implementing the lazy (and agnostic) ZA state save schemes around calls.
11//
12//===----------------------------------------------------------------------===//
13//
14// This pass works by collecting instructions that require ZA to be in a
15// specific state (e.g., "ACTIVE" or "SAVED") and inserting the necessary state
16// transitions to ensure ZA is in the required state before instructions. State
17// transitions represent actions such as setting up or restoring a lazy save.
18// Certain points within a function may also have predefined states independent
19// of any instructions, for example, a "shared_za" function is always entered
20// and exited in the "ACTIVE" state.
21//
22// To handle ZA state across control flow, we make use of edge bundling. This
23// assigns each block an "incoming" and "outgoing" edge bundle (representing
24// incoming and outgoing edges). Initially, these are unique to each block;
25// then, in the process of forming bundles, the outgoing block of a block is
26// joined with the incoming bundle of all successors. The result is that each
27// bundle can be assigned a single ZA state, which ensures the state required by
28// all of a block's successors is the same, and that each basic block will always
29// be entered with the same ZA state. This eliminates the need for splitting
30// edges to insert state transitions or "phi" nodes for ZA states.
31//
32// See below for a simple example of edge bundling.
33//
34// The following shows a conditionally executed basic block (BB1):
35//
36// if (cond)
37// BB1
38// BB2
39//
40// Initial Bundles Joined Bundles
41//
42// ┌──0──┐ ┌──0──┐
43// │ BB0 │ │ BB0 │
44// └──1──┘ └──1──┘
45// ├───────┐ ├───────┐
46// ▼ │ ▼ │
47// ┌──2──┐ │ ─────► ┌──1──┐ │
48// │ BB1 │ ▼ │ BB1 │ ▼
49// └──3──┘ ┌──4──┐ └──1──┘ ┌──1──┐
50// └───►4 BB2 │ └───►1 BB2 │
51// └──5──┘ └──2──┘
52//
53// On the left are the initial per-block bundles, and on the right are the
54// joined bundles (which are the result of the EdgeBundles analysis).
55
56#include "AArch64InstrInfo.h"
58#include "AArch64Subtarget.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "aarch64-machine-sme-abi"
72
73namespace {
74
/// The states ZA may be in at a program point. The order is significant:
/// enumerator values index the per-state counters used when picking a state
/// for an edge bundle.
enum ZAState {
  // Any/unknown state (not valid as a concrete state).
  ANY = 0,

  // ZA is in use and active (i.e. within the accumulator).
  ACTIVE,

  // A ZA save has been set up or committed (i.e. ZA is dormant or off).
  LOCAL_SAVED,

  // ZA is off or a lazy save has been set up by the caller.
  CALLER_DORMANT,

  // ZA is off.
  OFF,

  // The number of ZA states (not a valid state).
  NUM_ZA_STATE
};
94
95/// A bitmask enum to record live physical registers that the "emit*" routines
96/// may need to preserve. Note: This only tracks registers we may clobber.
97enum LiveRegs : uint8_t {
98 None = 0,
99 NZCV = 1 << 0,
100 W0 = 1 << 1,
101 W0_HI = 1 << 2,
102 X0 = W0 | W0_HI,
103 LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ W0_HI)
104};
105
106/// Holds the virtual registers live physical registers have been saved to.
107struct PhysRegSave {
// Which physical registers were live (and thus saved) at the save point.
108 LiveRegs PhysLiveRegs;
// Virtual register holding saved NZCV flags; NoRegister when NZCV was not
// live (see createPhysRegSave).
109 Register StatusFlags = AArch64::NoRegister;
// Virtual register holding the saved X0/W0 value; NoRegister when W0 was not
// live (see createPhysRegSave).
110 Register X0Save = AArch64::NoRegister;
111};
112
113/// Contains the needed ZA state (and live registers) at an instruction. That is
114/// the state ZA must be in _before_ "InsertPt".
115struct InstInfo {
// ZA state required before this instruction; ANY means no requirement.
116 ZAState NeededState{ZAState::ANY};
// NOTE(review): the extraction dropped source line 117 here (the "InsertPt"
// iterator member referenced by the comment above and used by
// insertStateChanges) -- verify against the upstream file.
118 LiveRegs PhysLiveRegs = LiveRegs::None;
119};
120
121/// Contains the needed ZA state for each instruction in a block. Instructions
122/// that do not require a ZA state are not recorded.
123struct BlockInfo {
// Fixed state on entry to this block (CALLER_DORMANT for the entry block of
// a private-ZA function, LOCAL_SAVED for EH pads -- see
// collectNeededZAStates); ANY when the entry state is not fixed.
124 ZAState FixedEntryState{ZAState::ANY};
// NOTE(review): the extraction dropped source line 125 here (the "Insts"
// vector of InstInfo used throughout this pass) -- verify against upstream.
126 LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
127 LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
128};
129
130/// Contains the needed ZA state information for all blocks within a function.
131struct FunctionInfo {
// NOTE(review): the extraction dropped source line 132 here (the per-block
// "Blocks" container indexed by MBB number, moved into this struct by
// collectNeededZAStates) -- verify against upstream.
// Point just after the SMEStateAllocPseudo marker (if present) -- a safe
// place to insert save-buffer setup (see collectNeededZAStates).
133 std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
134 LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
135};
136
137/// State/helpers that is only needed when emitting code to handle
138/// saving/restoring ZA.
139class EmitContext {
140public:
141 EmitContext() = default;
142
143 /// Get or create a TPIDR2 block in \p MF.
144 int getTPIDR2Block(MachineFunction &MF) {
145 if (TPIDR2BlockFI)
146 return *TPIDR2BlockFI;
147 MachineFrameInfo &MFI = MF.getFrameInfo();
148 TPIDR2BlockFI = MFI.CreateStackObject(16, Align(16), false);
149 return *TPIDR2BlockFI;
150 }
151
152 /// Get or create agnostic ZA buffer pointer in \p MF.
153 Register getAgnosticZABufferPtr(MachineFunction &MF) {
154 if (AgnosticZABufferPtr != AArch64::NoRegister)
155 return AgnosticZABufferPtr;
156 Register BufferPtr =
157 MF.getInfo<AArch64FunctionInfo>()->getEarlyAllocSMESaveBuffer();
158 AgnosticZABufferPtr =
159 BufferPtr != AArch64::NoRegister
160 ? BufferPtr
161 : MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
162 return AgnosticZABufferPtr;
163 }
164
165 /// Returns true if the function must allocate a ZA save buffer on entry. This
166 /// will be the case if, at any point in the function, a ZA save was emitted.
167 bool needsSaveBuffer() const {
168 assert(!(TPIDR2BlockFI && AgnosticZABufferPtr) &&
169 "Cannot have both a TPIDR2 block and agnostic ZA buffer");
170 return TPIDR2BlockFI || AgnosticZABufferPtr != AArch64::NoRegister;
171 }
172
173private:
174 std::optional<int> TPIDR2BlockFI;
175 Register AgnosticZABufferPtr = AArch64::NoRegister;
176};
177
178static bool isLegalEdgeBundleZAState(ZAState State) {
179 switch (State) {
180 case ZAState::ACTIVE:
181 case ZAState::LOCAL_SAVED:
182 return true;
183 default:
184 return false;
185 }
186}
187
188StringRef getZAStateString(ZAState State) {
189#define MAKE_CASE(V) \
190 case V: \
191 return #V;
192 switch (State) {
193 MAKE_CASE(ZAState::ANY)
194 MAKE_CASE(ZAState::ACTIVE)
195 MAKE_CASE(ZAState::LOCAL_SAVED)
196 MAKE_CASE(ZAState::CALLER_DORMANT)
197 MAKE_CASE(ZAState::OFF)
198 default:
199 llvm_unreachable("Unexpected ZAState");
200 }
201#undef MAKE_CASE
202}
203
204static bool isZAorZTRegOp(const TargetRegisterInfo &TRI,
205 const MachineOperand &MO) {
206 if (!MO.isReg() || !MO.getReg().isPhysical())
207 return false;
208 return any_of(TRI.subregs_inclusive(MO.getReg()), [](const MCPhysReg &SR) {
209 return AArch64::MPR128RegClass.contains(SR) ||
210 AArch64::ZTRRegClass.contains(SR);
211 });
212}
213
214/// Returns the required ZA state needed before \p MI and an iterator pointing
215/// to where any code required to change the ZA state should be inserted.
216static std::pair<ZAState, MachineBasicBlock::iterator>
217getZAStateBeforeInst(const TargetRegisterInfo &TRI, MachineInstr &MI,
218 bool ZAOffAtReturn) {
// NOTE(review): the extraction dropped source line 219 here (the declaration
// of "InsertPt", used throughout the body) -- verify against upstream.
220
221 if (MI.getOpcode() == AArch64::InOutZAUsePseudo)
222 return {ZAState::ACTIVE, std::prev(InsertPt)};
223
224 if (MI.getOpcode() == AArch64::RequiresZASavePseudo)
225 return {ZAState::LOCAL_SAVED, std::prev(InsertPt)};
226
// Returns need ZA off for private-ZA functions, active otherwise.
227 if (MI.isReturn())
228 return {ZAOffAtReturn ? ZAState::OFF : ZAState::ACTIVE, InsertPt};
229
// Any direct use of ZA/ZT0 registers requires ZA to be active.
230 for (auto &MO : MI.operands()) {
231 if (isZAorZTRegOp(TRI, MO))
232 return {ZAState::ACTIVE, InsertPt};
233 }
234
// No requirement for this instruction.
235 return {ZAState::ANY, InsertPt};
236}
237
// Legacy machine-function pass implementing the SME ABI for ZA state.
// NOTE(review): several declaration continuation lines in this struct were
// dropped by the web extraction (visible as jumps in the embedded source line
// numbers, e.g. parameter lists of the emit* declarations and the analysis
// requirements in getAnalysisUsage) -- verify against the upstream file.
238struct MachineSMEABI : public MachineFunctionPass {
239 inline static char ID = 0;
240
241 MachineSMEABI() : MachineFunctionPass(ID) {}
242
243 bool runOnMachineFunction(MachineFunction &MF) override;
244
245 StringRef getPassName() const override { return "Machine SME ABI pass"; }
246
247 void getAnalysisUsage(AnalysisUsage &AU) const override {
248 AU.setPreservesCFG();
// NOTE(review): source lines 249-252 missing -- presumably the required
// analyses (runOnMachineFunction uses EdgeBundlesWrapperLegacy) -- confirm.
253 }
254
255 /// Collects the needed ZA state (and live registers) before each instruction
256 /// within the machine function.
257 FunctionInfo collectNeededZAStates(SMEAttrs SMEFnAttrs);
258
259 /// Assigns each edge bundle a ZA state based on the needed states of blocks
260 /// that have incoming or outgoing edges in that bundle.
261 SmallVector<ZAState> assignBundleZAStates(const EdgeBundles &Bundles,
262 const FunctionInfo &FnInfo);
263
264 /// Inserts code to handle changes between ZA states within the function.
265 /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
266 void insertStateChanges(EmitContext &, const FunctionInfo &FnInfo,
267 const EdgeBundles &Bundles,
268 ArrayRef<ZAState> BundleStates);
269
270 // Emission routines for private and shared ZA functions (using lazy saves).
271 void emitNewZAPrologue(MachineBasicBlock &MBB,
273 void emitRestoreLazySave(EmitContext &, MachineBasicBlock &MBB,
275 LiveRegs PhysLiveRegs);
276 void emitSetupLazySave(EmitContext &, MachineBasicBlock &MBB,
278 void emitAllocateLazySaveBuffer(EmitContext &, MachineBasicBlock &MBB,
281 bool ClearTPIDR2);
282
283 // Emission routines for agnostic ZA functions.
284 void emitSetupFullZASave(MachineBasicBlock &MBB,
286 LiveRegs PhysLiveRegs);
287 // Emit a "full" ZA save or restore. It is "full" in the sense that this
288 // function will emit a call to __arm_sme_save or __arm_sme_restore, which
289 // handles saving and restoring both ZA and ZT0.
290 void emitFullZASaveRestore(EmitContext &, MachineBasicBlock &MBB,
292 LiveRegs PhysLiveRegs, bool IsSave);
293 void emitAllocateFullZASaveBuffer(EmitContext &, MachineBasicBlock &MBB,
295 LiveRegs PhysLiveRegs);
296
297 void emitStateChange(EmitContext &, MachineBasicBlock &MBB,
298 MachineBasicBlock::iterator MBBI, ZAState From,
299 ZAState To, LiveRegs PhysLiveRegs);
300
301 // Helpers for switching between lazy/full ZA save/restore routines.
// Agnostic-ZA functions use the full (__arm_sme_save) scheme; everything
// else uses the lazy-save (TPIDR2 block) scheme.
302 void emitZASave(EmitContext &Context, MachineBasicBlock &MBB,
304 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
305 return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs,
306 /*IsSave=*/true);
307 return emitSetupLazySave(Context, MBB, MBBI);
308 }
309 void emitZARestore(EmitContext &Context, MachineBasicBlock &MBB,
311 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
312 return emitFullZASaveRestore(Context, MBB, MBBI, PhysLiveRegs,
313 /*IsSave=*/false);
314 return emitRestoreLazySave(Context, MBB, MBBI, PhysLiveRegs);
315 }
316 void emitAllocateZASaveBuffer(EmitContext &Context, MachineBasicBlock &MBB,
318 LiveRegs PhysLiveRegs) {
319 if (AFI->getSMEFnAttrs().hasAgnosticZAInterface())
320 return emitAllocateFullZASaveBuffer(Context, MBB, MBBI, PhysLiveRegs);
321 return emitAllocateLazySaveBuffer(Context, MBB, MBBI);
322 }
323
324 /// Save live physical registers to virtual registers.
325 PhysRegSave createPhysRegSave(LiveRegs PhysLiveRegs, MachineBasicBlock &MBB,
327 /// Restore physical registers from a save of their previous values.
328 void restorePhyRegSave(const PhysRegSave &RegSave, MachineBasicBlock &MBB,
330
// Per-function state cached by runOnMachineFunction for the helpers above.
331private:
332 MachineFunction *MF = nullptr;
333 const AArch64Subtarget *Subtarget = nullptr;
334 const AArch64RegisterInfo *TRI = nullptr;
335 const AArch64FunctionInfo *AFI = nullptr;
336 const TargetInstrInfo *TII = nullptr;
337 MachineRegisterInfo *MRI = nullptr;
338};
339
// Walks every block backwards (for liveness), recording for each instruction
// the ZA state it needs and which clobber-able physical registers (NZCV,
// W0/W0_HI) are live at that point.
340FunctionInfo MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
341 assert((SMEFnAttrs.hasAgnosticZAInterface() || SMEFnAttrs.hasZT0State() ||
342 SMEFnAttrs.hasZAState()) &&
343 "Expected function to have ZA/ZT0 state!");
344
// NOTE(review): the extraction dropped source line 345 here (the declaration
// of the per-block "Blocks" container used below and moved into the returned
// FunctionInfo) -- verify against upstream.
346 LiveRegs PhysLiveRegsAfterSMEPrologue = LiveRegs::None;
347 std::optional<MachineBasicBlock::iterator> AfterSMEProloguePt;
348
349 for (MachineBasicBlock &MBB : *MF) {
350 BlockInfo &Block = Blocks[MBB.getNumber()];
351
352 if (MBB.isEntryBlock()) {
353 // Entry block:
354 Block.FixedEntryState = SMEFnAttrs.hasPrivateZAInterface()
355 ? ZAState::CALLER_DORMANT
356 : ZAState::ACTIVE;
357 } else if (MBB.isEHPad()) {
358 // EH entry block:
359 Block.FixedEntryState = ZAState::LOCAL_SAVED;
360 }
361
362 LiveRegUnits LiveUnits(*TRI);
363 LiveUnits.addLiveOuts(MBB);
364
// Snapshot of the tracked (clobber-able) live registers as a LiveRegs mask.
365 auto GetPhysLiveRegs = [&] {
366 LiveRegs PhysLiveRegs = LiveRegs::None;
367 if (!LiveUnits.available(AArch64::NZCV))
368 PhysLiveRegs |= LiveRegs::NZCV;
369 // We have to track W0 and X0 separately as otherwise things can get
370 // confused if we attempt to preserve X0 but only W0 was defined.
371 if (!LiveUnits.available(AArch64::W0))
372 PhysLiveRegs |= LiveRegs::W0;
373 if (!LiveUnits.available(AArch64::W0_HI))
374 PhysLiveRegs |= LiveRegs::W0_HI;
375 return PhysLiveRegs;
376 };
377
378 Block.PhysLiveRegsAtExit = GetPhysLiveRegs();
379 auto FirstTerminatorInsertPt = MBB.getFirstTerminator();
380 auto FirstNonPhiInsertPt = MBB.getFirstNonPHI();
381 for (MachineInstr &MI : reverse(MBB)) {
// NOTE(review): the extraction dropped source line 382 here (the declaration
// of "MBBI", used below) -- verify against upstream.
383 LiveUnits.stepBackward(MI);
384 LiveRegs PhysLiveRegs = GetPhysLiveRegs();
385 // The SMEStateAllocPseudo marker is added to a function if the save
386 // buffer was allocated in SelectionDAG. It marks the end of the
387 // allocation -- which is a safe point for this pass to insert any TPIDR2
388 // block setup.
389 if (MI.getOpcode() == AArch64::SMEStateAllocPseudo) {
390 AfterSMEProloguePt = MBBI;
391 PhysLiveRegsAfterSMEPrologue = PhysLiveRegs;
392 }
393 // Note: We treat Agnostic ZA as inout_za with an alternate save/restore.
394 auto [NeededState, InsertPt] = getZAStateBeforeInst(
395 *TRI, MI, /*ZAOffAtReturn=*/SMEFnAttrs.hasPrivateZAInterface());
396 assert((InsertPt == MBBI ||
397 InsertPt->getOpcode() == AArch64::ADJCALLSTACKDOWN) &&
398 "Unexpected state change insertion point!");
399 // TODO: Do something to avoid state changes where NZCV is live.
400 if (MBBI == FirstTerminatorInsertPt)
401 Block.PhysLiveRegsAtExit = PhysLiveRegs;
402 if (MBBI == FirstNonPhiInsertPt)
403 Block.PhysLiveRegsAtEntry = PhysLiveRegs;
404 if (NeededState != ZAState::ANY)
405 Block.Insts.push_back({NeededState, InsertPt, PhysLiveRegs});
406 }
407
408 // Reverse vector (as we had to iterate backwards for liveness).
409 std::reverse(Block.Insts.begin(), Block.Insts.end());
410 }
411
412 return FunctionInfo{std::move(Blocks), AfterSMEProloguePt,
413 PhysLiveRegsAfterSMEPrologue};
414}
415
416/// Assigns each edge bundle a ZA state based on the needed states of blocks
417/// that have incoming or outgoing edges in that bundle.
// NOTE(review): the extraction dropped source line 418 here (the return type
// of this member function, "SmallVector<ZAState>" per its declaration at
// source line 261) -- verify against upstream.
419MachineSMEABI::assignBundleZAStates(const EdgeBundles &Bundles,
420 const FunctionInfo &FnInfo) {
421 SmallVector<ZAState> BundleStates(Bundles.getNumBundles());
422 for (unsigned I = 0, E = Bundles.getNumBundles(); I != E; ++I) {
423 LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
424
425 // Attempt to assign a ZA state for this bundle that minimizes state
426 // transitions. Edges within loops are given a higher weight as we assume
427 // they will be executed more than once.
428 // TODO: We should propagate desired incoming/outgoing states through blocks
429 // that have the "ANY" state first to make better global decisions.
// Vote count per legal state, indexed by ZAState enumerator value.
430 int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
431 for (unsigned BlockID : Bundles.getBlocks(I)) {
432 LLVM_DEBUG(dbgs() << "- bb." << BlockID);
433
434 const BlockInfo &Block = FnInfo.Blocks[BlockID];
435 if (Block.Insts.empty()) {
436 LLVM_DEBUG(dbgs() << " (no state preference)\n");
437 continue;
438 }
// A block may touch this bundle via its incoming edge, its outgoing edge,
// or both; only count the side(s) actually connected to bundle I.
439 bool InEdge = Bundles.getBundle(BlockID, /*Out=*/false) == I;
440 bool OutEdge = Bundles.getBundle(BlockID, /*Out=*/true) == I;
441
// The state needed by the block's first instruction is its preferred
// incoming state; the last instruction's state is the preferred outgoing.
442 ZAState DesiredIncomingState = Block.Insts.front().NeededState;
443 if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
444 EdgeStateCounts[DesiredIncomingState]++;
445 LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
446 << getZAStateString(DesiredIncomingState));
447 }
448 ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
449 if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
450 EdgeStateCounts[DesiredOutgoingState]++;
451 LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
452 << getZAStateString(DesiredOutgoingState));
453 }
454 LLVM_DEBUG(dbgs() << '\n');
455 }
456
// Pick the most-voted state (ties resolve to the lowest enumerator).
457 ZAState BundleState =
458 ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
459
460 // Force ZA to be active in bundles that don't have a preferred state.
461 // TODO: Something better here (to avoid extra mode switches).
462 if (BundleState == ZAState::ANY)
463 BundleState = ZAState::ACTIVE;
464
465 LLVM_DEBUG({
466 dbgs() << "Chosen ZA state: " << getZAStateString(BundleState) << '\n'
467 << "Edge counts:";
468 for (auto [State, Count] : enumerate(EdgeStateCounts))
469 dbgs() << " " << getZAStateString(ZAState(State)) << ": " << Count;
470 dbgs() << "\n\n";
471 });
472
473 BundleStates[I] = BundleState;
474 }
475
476 return BundleStates;
477}
478
479void MachineSMEABI::insertStateChanges(EmitContext &Context,
480 const FunctionInfo &FnInfo,
481 const EdgeBundles &Bundles,
482 ArrayRef<ZAState> BundleStates) {
483 for (MachineBasicBlock &MBB : *MF) {
484 const BlockInfo &Block = FnInfo.Blocks[MBB.getNumber()];
485 ZAState InState = BundleStates[Bundles.getBundle(MBB.getNumber(),
486 /*Out=*/false)];
487
488 ZAState CurrentState = Block.FixedEntryState;
489 if (CurrentState == ZAState::ANY)
490 CurrentState = InState;
491
492 for (auto &Inst : Block.Insts) {
493 if (CurrentState != Inst.NeededState)
494 emitStateChange(Context, MBB, Inst.InsertPt, CurrentState,
495 Inst.NeededState, Inst.PhysLiveRegs);
496 CurrentState = Inst.NeededState;
497 }
498
499 if (MBB.succ_empty())
500 continue;
501
502 ZAState OutState =
503 BundleStates[Bundles.getBundle(MBB.getNumber(), /*Out=*/true)];
504 if (CurrentState != OutState)
505 emitStateChange(Context, MBB, MBB.getFirstTerminator(), CurrentState,
506 OutState, Block.PhysLiveRegsAtExit);
507 }
508}
509
// NOTE(review): the extraction dropped source lines 509-511 here (the header
// of a small static helper; from its body it takes an MBB and an iterator
// MBBI and returns a DebugLoc) -- verify against upstream. It yields the
// debug location of the instruction at MBBI, or an empty DebugLoc when MBBI
// is the block end.
512 if (MBBI != MBB.end())
513 return MBBI->getDebugLoc();
514 return DebugLoc();
515}
516
// Sets TPIDR2_EL0 to point at this function's TPIDR2 block, putting ZA into
// the dormant state expected by the lazy-save scheme.
517void MachineSMEABI::emitSetupLazySave(EmitContext &Context,
// NOTE(review): the extraction dropped source lines 518-520 here (the
// remaining parameters -- per the call sites, the MBB and insertion iterator
// -- and the DebugLoc setup) -- verify against upstream.
521
522 // Get pointer to TPIDR2 block.
523 Register TPIDR2 = MRI->createVirtualRegister(&AArch64::GPR64spRegClass);
524 Register TPIDR2Ptr = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
525 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
526 .addFrameIndex(Context.getTPIDR2Block(*MF))
527 .addImm(0)
528 .addImm(0);
529 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), TPIDR2Ptr)
530 .addReg(TPIDR2);
531 // Set TPIDR2_EL0 to point to TPIDR2 block.
532 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
533 .addImm(AArch64SysReg::TPIDR2_EL0)
534 .addReg(TPIDR2Ptr);
535}
536
// Copies the live-and-clobberable physical registers (NZCV, X0/W0) into fresh
// virtual registers so the emit* routines may clobber them; paired with
// restorePhyRegSave.
537PhysRegSave MachineSMEABI::createPhysRegSave(LiveRegs PhysLiveRegs,
// NOTE(review): the extraction dropped source lines 538-539 here (the MBB and
// insertion-iterator parameters) -- verify against upstream.
540 DebugLoc DL) {
541 PhysRegSave RegSave{PhysLiveRegs};
// Save NZCV via MRS if the flags are live.
542 if (PhysLiveRegs & LiveRegs::NZCV) {
543 RegSave.StatusFlags = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
544 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), RegSave.StatusFlags)
545 .addImm(AArch64SysReg::NZCV)
546 .addReg(AArch64::NZCV, RegState::Implicit);
547 }
548 // Note: Preserving X0 is "free" as this is before register allocation, so
549 // the register allocator is still able to optimize these copies.
550 if (PhysLiveRegs & LiveRegs::W0) {
// Save the full X0 when its high half is live too, otherwise just W0.
551 RegSave.X0Save = MRI->createVirtualRegister(PhysLiveRegs & LiveRegs::W0_HI
552 ? &AArch64::GPR64RegClass
553 : &AArch64::GPR32RegClass);
554 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), RegSave.X0Save)
555 .addReg(PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0);
556 }
557 return RegSave;
558}
559
// Restores NZCV and X0/W0 from the virtual registers captured by
// createPhysRegSave (skipping any that were not live/saved).
560void MachineSMEABI::restorePhyRegSave(const PhysRegSave &RegSave,
// NOTE(review): the extraction dropped source lines 561-562 here (the MBB and
// insertion-iterator parameters) -- verify against upstream.
563 DebugLoc DL) {
564 if (RegSave.StatusFlags != AArch64::NoRegister)
565 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
566 .addImm(AArch64SysReg::NZCV)
567 .addReg(RegSave.StatusFlags)
568 .addReg(AArch64::NZCV, RegState::ImplicitDefine);
569
570 if (RegSave.X0Save != AArch64::NoRegister)
571 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY),
572 RegSave.PhysLiveRegs & LiveRegs::W0_HI ? AArch64::X0 : AArch64::W0)
573 .addReg(RegSave.X0Save);
574}
575
// Restores ZA from a lazy save: enables ZA, conditionally calls
// __arm_tpidr2_restore (via RestoreZAPseudo) if the save was committed, then
// clears TPIDR2_EL0.
576void MachineSMEABI::emitRestoreLazySave(EmitContext &Context,
// NOTE(review): the extraction dropped source lines 577-578 (the MBB and
// insertion-iterator parameters) and 581 (the DebugLoc setup) -- verify
// against upstream.
579 LiveRegs PhysLiveRegs) {
580 auto *TLI = Subtarget->getTargetLowering();
582 Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
// The support-routine calling convention takes the TPIDR2 block in X0.
583 Register TPIDR2 = AArch64::X0;
584
585 // TODO: Emit these within the restore MBB to prevent unnecessary saves.
586 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
587
588 // Enable ZA.
589 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
590 .addImm(AArch64SVCR::SVCRZA)
591 .addImm(1);
592 // Get current TPIDR2_EL0.
593 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS), TPIDR2EL0)
594 .addImm(AArch64SysReg::TPIDR2_EL0);
595 // Get pointer to TPIDR2 block.
596 BuildMI(MBB, MBBI, DL, TII->get(AArch64::ADDXri), TPIDR2)
597 .addFrameIndex(Context.getTPIDR2Block(*MF))
598 .addImm(0)
599 .addImm(0);
600 // (Conditionally) restore ZA state.
601 BuildMI(MBB, MBBI, DL, TII->get(AArch64::RestoreZAPseudo))
602 .addReg(TPIDR2EL0)
603 .addReg(TPIDR2)
604 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_RESTORE))
605 .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
606 // Zero TPIDR2_EL0.
607 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
608 .addImm(AArch64SysReg::TPIDR2_EL0)
609 .addReg(AArch64::XZR);
610
611 restorePhyRegSave(RegSave, MBB, MBBI, DL);
612}
613
// Turns ZA off, optionally clearing TPIDR2_EL0 first (needed when leaving the
// LOCAL_SAVED state, so no stale lazy-save block remains published).
614void MachineSMEABI::emitZAOff(MachineBasicBlock &MBB,
// NOTE(review): the extraction dropped source lines 615 (the
// insertion-iterator parameter) and 617 (the DebugLoc setup) -- verify
// against upstream.
616 bool ClearTPIDR2) {
618
619 if (ClearTPIDR2)
620 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSR))
621 .addImm(AArch64SysReg::TPIDR2_EL0)
622 .addReg(AArch64::XZR);
623
624 // Disable ZA.
625 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
626 .addImm(AArch64SVCR::SVCRZA)
627 .addImm(0);
628}
629
// Allocates the SVL x SVL lazy-save buffer on the stack (unless SelectionDAG
// already did) and initializes the TPIDR2 block with the buffer pointer and
// the number of save slices.
630void MachineSMEABI::emitAllocateLazySaveBuffer(
631 EmitContext &Context, MachineBasicBlock &MBB,
// NOTE(review): the extraction dropped source lines 632 (the
// insertion-iterator parameter) and 634 (the DebugLoc setup) -- verify
// against upstream.
633 MachineFrameInfo &MFI = MF->getFrameInfo();
635 Register SP = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
636 Register SVL = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
637 Register Buffer = AFI->getEarlyAllocSMESaveBuffer();
638
639 // Calculate SVL.
640 BuildMI(MBB, MBBI, DL, TII->get(AArch64::RDSVLI_XI), SVL).addImm(1);
641
642 // 1. Allocate the lazy save buffer.
643 if (Buffer == AArch64::NoRegister) {
644 // TODO: On Windows, we allocate the lazy save buffer in SelectionDAG (so
645 // Buffer != AArch64::NoRegister). This is done to reuse the existing
646 // expansions (which can insert stack checks). This works, but it means we
647 // will always allocate the lazy save buffer (even if the function contains
648 // no lazy saves). If we want to handle Windows here, we'll need to
649 // implement something similar to LowerWindowsDYNAMIC_STACKALLOC.
650 assert(!Subtarget->isTargetWindows() &&
651 "Lazy ZA save is not yet supported on Windows");
652 Buffer = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
653 // Get original stack pointer.
654 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), SP)
655 .addReg(AArch64::SP);
656 // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
657 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSUBXrrr), Buffer)
658 .addReg(SVL)
659 .addReg(SVL)
660 .addReg(SP);
661 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::SP)
662 .addReg(Buffer);
663 // We have just allocated a variable sized object, tell this to PEI.
664 MFI.CreateVariableSizedObject(Align(16), nullptr);
665 }
666
667 // 2. Setup the TPIDR2 block.
668 {
669 // Note: This case just needs to do `SVL << 48`. It is not implemented as we
670 // generally don't support big-endian SVE/SME.
671 if (!Subtarget->isLittleEndian())
// NOTE(review): the extraction dropped source line 672 here (presumably the
// error-reporting call this string argument belongs to) -- verify against
// upstream.
673 "TPIDR2 block initialization is not supported on big-endian targets");
674
675 // Store buffer pointer and num_za_save_slices.
676 // Bytes 10-15 are implicitly zeroed.
677 BuildMI(MBB, MBBI, DL, TII->get(AArch64::STPXi))
678 .addReg(Buffer)
679 .addReg(SVL)
680 .addFrameIndex(Context.getTPIDR2Block(*MF))
681 .addImm(0);
682 }
683}
684
// Entry prologue for a "new ZA" function: commits any caller lazy save
// (calling __arm_tpidr2_save via CommitZASavePseudo), optionally zeroing ZA,
// then enables ZA.
685void MachineSMEABI::emitNewZAPrologue(MachineBasicBlock &MBB,
// NOTE(review): the extraction dropped source lines 686 (the
// insertion-iterator parameter) and 688 (the DebugLoc setup) -- verify
// against upstream.
687 auto *TLI = Subtarget->getTargetLowering();
689
690 // Get current TPIDR2_EL0.
691 Register TPIDR2EL0 = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
692 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MRS))
693 .addReg(TPIDR2EL0, RegState::Define)
694 .addImm(AArch64SysReg::TPIDR2_EL0);
695 // If TPIDR2_EL0 is non-zero, commit the lazy save.
696 // NOTE: Functions that only use ZT0 don't need to zero ZA.
697 bool ZeroZA = AFI->getSMEFnAttrs().hasZAState();
698 auto CommitZASave =
699 BuildMI(MBB, MBBI, DL, TII->get(AArch64::CommitZASavePseudo))
700 .addReg(TPIDR2EL0)
701 .addImm(ZeroZA ? 1 : 0)
702 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_TPIDR2_SAVE))
703 .addRegMask(TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
704 if (ZeroZA)
705 CommitZASave.addDef(AArch64::ZAB0, RegState::ImplicitDefine);
706 // Enable ZA (as ZA could have previously been in the OFF state).
707 BuildMI(MBB, MBBI, DL, TII->get(AArch64::MSRpstatesvcrImm1))
708 .addImm(AArch64SVCR::SVCRZA)
709 .addImm(1);
710}
711
// Emits a call to __arm_sme_save or __arm_sme_restore (which handle both ZA
// and ZT0), passing the agnostic-ZA buffer pointer in X0.
712void MachineSMEABI::emitFullZASaveRestore(EmitContext &Context,
// NOTE(review): the extraction dropped source lines 713-714 (the MBB and
// insertion-iterator parameters) and 717 (the DebugLoc setup) -- verify
// against upstream.
715 LiveRegs PhysLiveRegs, bool IsSave) {
716 auto *TLI = Subtarget->getTargetLowering();
718 Register BufferPtr = AArch64::X0;
719
720 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
721
722 // Copy the buffer pointer into X0.
723 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
724 .addReg(Context.getAgnosticZABufferPtr(*MF));
725
726 // Call __arm_sme_save/__arm_sme_restore.
727 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
728 .addReg(BufferPtr, RegState::Implicit)
729 .addExternalSymbol(TLI->getLibcallName(
730 IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE))
731 .addRegMask(TRI->getCallPreservedMask(
732 *MF,
// NOTE(review): the extraction dropped source line 733 here (the calling
// convention argument of getCallPreservedMask) -- verify against upstream.
734
735 restorePhyRegSave(RegSave, MBB, MBBI, DL);
736}
737
// Allocates the agnostic-ZA save buffer: calls __arm_sme_state_size for the
// required size, then bumps SP by that amount (a variable-sized stack
// object).
738void MachineSMEABI::emitAllocateFullZASaveBuffer(
739 EmitContext &Context, MachineBasicBlock &MBB,
// NOTE(review): the extraction dropped source lines 740 (remaining
// parameters), 742 (the early-exit condition for the SelectionDAG-allocated
// buffer), and 745 (the DebugLoc setup) -- verify against upstream.
741 // Buffer already allocated in SelectionDAG.
743 return;
744
746 Register BufferPtr = Context.getAgnosticZABufferPtr(*MF);
747 Register BufferSize = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
748
749 PhysRegSave RegSave = createPhysRegSave(PhysLiveRegs, MBB, MBBI, DL);
750
751 // Calculate the SME state size.
752 {
753 auto *TLI = Subtarget->getTargetLowering();
754 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
755 BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
756 .addExternalSymbol(TLI->getLibcallName(RTLIB::SMEABI_SME_STATE_SIZE))
757 .addReg(AArch64::X0, RegState::ImplicitDefine)
758 .addRegMask(TRI->getCallPreservedMask(
759 *MF, CallingConv::
// NOTE(review): the extraction dropped source line 760 here (the rest of the
// calling-convention name and closing parentheses) -- verify against
// upstream.
761 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferSize)
762 .addReg(AArch64::X0);
763 }
764
765 // Allocate a buffer object of the size given __arm_sme_state_size.
766 {
767 MachineFrameInfo &MFI = MF->getFrameInfo();
768 BuildMI(MBB, MBBI, DL, TII->get(AArch64::SUBXrx64), AArch64::SP)
769 .addReg(AArch64::SP)
770 .addReg(BufferSize)
// NOTE(review): the extraction dropped source line 771 here (the extend/shift
// operand of SUBXrx64 and the statement terminator) -- verify against
// upstream.
772 BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), BufferPtr)
773 .addReg(AArch64::SP);
774
775 // We have just allocated a variable sized object, tell this to PEI.
776 MFI.CreateVariableSizedObject(Align(16), nullptr);
777 }
778
779 restorePhyRegSave(RegSave, MBB, MBBI, DL);
780}
781
// Emits the code for a single ZA state transition (From -> To) at the given
// insertion point. Unhandled transitions are a fatal error.
782void MachineSMEABI::emitStateChange(EmitContext &Context,
// NOTE(review): the extraction dropped source lines 783-784 here (the MBB and
// insertion-point parameters; the body uses "InsertPt") -- verify against
// upstream.
785 ZAState From, ZAState To,
786 LiveRegs PhysLiveRegs) {
787 // ZA not used.
788 if (From == ZAState::ANY || To == ZAState::ANY)
789 return;
790
791 // If we're exiting from the CALLER_DORMANT state that means this new ZA
792 // function did not touch ZA (so ZA was never turned on).
793 if (From == ZAState::CALLER_DORMANT && To == ZAState::OFF)
794 return;
795
796 // TODO: Avoid setting up the save buffer if there's no transition to
797 // LOCAL_SAVED.
798 if (From == ZAState::CALLER_DORMANT) {
// NOTE(review): the extraction dropped source line 799 here (the assert this
// string argument belongs to) -- verify against upstream.
800 "CALLER_DORMANT state requires private ZA interface");
801 assert(&MBB == &MBB.getParent()->front() &&
802 "CALLER_DORMANT state only valid in entry block");
803 emitNewZAPrologue(MBB, MBB.getFirstNonPHI());
804 if (To == ZAState::ACTIVE)
805 return; // Nothing more to do (ZA is active after the prologue).
806
807 // Note: "emitNewZAPrologue" zeros ZA, so we may need to setup a lazy save
808 // if "To" is "ZAState::LOCAL_SAVED". It may be possible to improve this
809 // case by changing the placement of the zero instruction.
810 From = ZAState::ACTIVE;
811 }
812
813 if (From == ZAState::ACTIVE && To == ZAState::LOCAL_SAVED)
814 emitZASave(Context, MBB, InsertPt, PhysLiveRegs);
815 else if (From == ZAState::LOCAL_SAVED && To == ZAState::ACTIVE)
816 emitZARestore(Context, MBB, InsertPt, PhysLiveRegs);
817 else if (To == ZAState::OFF) {
818 assert(From != ZAState::CALLER_DORMANT &&
819 "CALLER_DORMANT to OFF should have already been handled");
// NOTE(review): the extraction dropped source line 820 here (the assert this
// string argument belongs to) -- verify against upstream.
821 "Should not turn ZA off in agnostic ZA function");
// Clear TPIDR2_EL0 only if a local lazy save had been published.
822 emitZAOff(MBB, InsertPt, /*ClearTPIDR2=*/From == ZAState::LOCAL_SAVED);
823 } else {
824 dbgs() << "Error: Transition from " << getZAStateString(From) << " to "
825 << getZAStateString(To) << '\n';
826 llvm_unreachable("Unimplemented state transition");
827 }
828}
829
830} // end anonymous namespace
831
// Register the pass with the legacy pass manager under the command-line name
// "aarch64-machine-sme-abi" (does not preserve CFG analysis flags here; the
// pass itself reports setPreservesCFG).
832INITIALIZE_PASS(MachineSMEABI, "aarch64-machine-sme-abi", "Machine SME ABI",
833 false, false)
834
835bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
836 if (!MF.getSubtarget<AArch64Subtarget>().hasSME())
837 return false;
838
839 AFI = MF.getInfo<AArch64FunctionInfo>();
840 SMEAttrs SMEFnAttrs = AFI->getSMEFnAttrs();
841 if (!SMEFnAttrs.hasZAState() && !SMEFnAttrs.hasZT0State() &&
842 !SMEFnAttrs.hasAgnosticZAInterface())
843 return false;
844
845 assert(MF.getRegInfo().isSSA() && "Expected to be run on SSA form!");
846
847 this->MF = &MF;
848 Subtarget = &MF.getSubtarget<AArch64Subtarget>();
849 TII = Subtarget->getInstrInfo();
850 TRI = Subtarget->getRegisterInfo();
851 MRI = &MF.getRegInfo();
852
853 const EdgeBundles &Bundles =
854 getAnalysis<EdgeBundlesWrapperLegacy>().getEdgeBundles();
855
856 FunctionInfo FnInfo = collectNeededZAStates(SMEFnAttrs);
857 SmallVector<ZAState> BundleStates = assignBundleZAStates(Bundles, FnInfo);
858
859 EmitContext Context;
860 insertStateChanges(Context, FnInfo, Bundles, BundleStates);
861
862 if (Context.needsSaveBuffer()) {
863 if (FnInfo.AfterSMEProloguePt) {
864 // Note: With inline stack probes the AfterSMEProloguePt may not be in the
865 // entry block (due to the probing loop).
866 MachineBasicBlock::iterator MBBI = *FnInfo.AfterSMEProloguePt;
867 emitAllocateZASaveBuffer(Context, *MBBI->getParent(), MBBI,
868 FnInfo.PhysLiveRegsAfterSMEPrologue);
869 } else {
870 MachineBasicBlock &EntryBlock = MF.front();
871 emitAllocateZASaveBuffer(
872 Context, EntryBlock, EntryBlock.getFirstNonPHI(),
873 FnInfo.Blocks[EntryBlock.getNumber()].PhysLiveRegsAtEntry);
874 }
875 }
876
877 return true;
878}
879
880FunctionPass *llvm::createMachineSMEABIPass() { return new MachineSMEABI(); }
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This file implements the LivePhysRegs utility for tracking liveness of physical registers.
#define I(x, y, z)
Definition MD5.cpp:58
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
#define MAKE_CASE(V)
Register const TargetRegisterInfo * TRI
if(PassOpts->AAPipeline)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
AArch64FunctionInfo - This class is derived from MachineFunctionInfo and contains private AArch64-spe...
const AArch64RegisterInfo * getRegisterInfo() const override
const AArch64TargetLowering * getTargetLowering() const override
Represent the analysis usage information of a pass.
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
A debug info location.
Definition DebugLoc.h:124
ArrayRef< unsigned > getBlocks(unsigned Bundle) const
getBlocks - Return an array of blocks that are connected to Bundle.
Definition EdgeBundles.h:53
unsigned getBundle(unsigned N, bool Out) const
getBundle - Return the ingoing (Out = false) or outgoing (Out = true) bundle number for basic block N
Definition EdgeBundles.h:47
unsigned getNumBundles() const
getNumBundles - Return the total number of bundles in the CFG.
Definition EdgeBundles.h:50
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
LLVM_ABI void stepBackward(const MachineInstr &MI)
Updates liveness when stepping backwards over the instruction MI.
LLVM_ABI void addLiveOuts(const MachineBasicBlock &MBB)
Adds registers living out of block MBB.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
LLVM_ABI iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
LLVM_ABI int CreateVariableSizedObject(Align Alignment, const AllocaInst *Alloca)
Notify the MachineFrameInfo object that a variable sized object has been created.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
unsigned getNumBlockIDs() const
getNumBlockIDs - Return the number of MBB ID's allocated.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Representation of each machine instruction.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Wrapper class representing virtual and physical registers.
Definition Register.h:19
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasAgnosticZAInterface() const
bool hasPrivateZAInterface() const
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetInstrInfo - Interface to description of machine instruction set.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET, unsigned Imm)
getArithExtendImm - Encode the extend type and shift amount for an arithmetic instruction: imm: 3-bit...
CallingConv Namespace - This namespace contains an enum with a value for the well-known calling conve...
Definition CallingConv.h:21
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1
Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
FunctionPass * createMachineSMEABIPass()
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2454
LLVM_ABI char & MachineDominatorsID
MachineDominators - This pass is a machine dominators analysis pass.
LLVM_ABI void reportFatalInternalError(Error Err)
Report a fatal error that indicates a bug in LLVM.
Definition Error.cpp:177
LLVM_ABI char & MachineLoopInfoID
MachineLoopInfo - This pass is a loop analysis pass.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1714
auto reverse(ContainerTy &&C)
Definition STLExtras.h:400
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2012
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39