LLVM 23.0.0git
SIFrameLowering.cpp
Go to the documentation of this file.
1//===----------------------- SIFrameLowering.cpp --------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8
9#include "SIFrameLowering.h"
10#include "AMDGPU.h"
11#include "AMDGPULaneMaskUtils.h"
12#include "GCNSubtarget.h"
15#include "SISpillUtils.h"
21#include "llvm/Support/LEB128.h"
23
24using namespace llvm;
25
26#define DEBUG_TYPE "frame-info"
27
29 "amdgpu-spill-vgpr-to-agpr",
30 cl::desc("Enable spilling VGPRs to AGPRs"),
32 cl::init(true));
33
34static constexpr unsigned SGPRBitSize = 32;
35static constexpr unsigned SGPRByteSize = SGPRBitSize / 8;
36static constexpr unsigned VGPRLaneBitSize = 32;
37
38// Find a register matching \p RC from \p LiveUnits which is unused and
39// available throughout the function. On failure, returns AMDGPU::NoRegister.
40// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
41// MCRegisters. This should reduce the number of iterations and avoid redundant
42// checking.
44 const LiveRegUnits &LiveUnits,
45 const TargetRegisterClass &RC) {
46 for (MCRegister Reg : RC) {
47 if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
48 !MRI.isReserved(Reg))
49 return Reg;
50 }
51 return MCRegister();
52}
53
54static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) {
55 assert(DwarfReg >= 0);
56 if (DwarfReg < 32) {
57 OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg);
58 } else {
59 OS << uint8_t(dwarf::DW_OP_regx);
60 encodeULEB128(DwarfReg, OS);
61 }
62}
63
66 MCRegister DwarfStackPtrReg) {
67 assert(ST.enableFlatScratch());
68
69 // When flat scratch is enabled, the stack pointer is an address in the
70 // private_lane DWARF address space (i.e. swizzled), but in order to
71 // accurately and efficiently describe things like masked spills of vector
72 // registers we want to define the CFA to be an address in the private_wave
73 // DWARF address space (i.e. unswizzled). To achieve this we scale the stack
74 // pointer by the wavefront size, implemented as (SP << wave_size_log2).
75 const unsigned WavefrontSizeLog2 = ST.getWavefrontSizeLog2();
76 assert(WavefrontSizeLog2 < 32);
77
80 encodeDwarfRegisterLocation(DwarfStackPtrReg, OSBlock);
81 OSBlock << uint8_t(dwarf::DW_OP_deref_size) << uint8_t(SGPRByteSize)
82 << uint8_t(dwarf::DW_OP_lit0 + WavefrontSizeLog2)
83 << uint8_t(dwarf::DW_OP_shl)
84 << uint8_t(dwarf::DW_OP_lit0 +
85 dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave)
86 << uint8_t(dwarf::DW_OP_LLVM_user)
87 << uint8_t(dwarf::DW_OP_LLVM_form_aspace_address);
88
89 SmallString<20> CFIInst;
90 raw_svector_ostream OSCFIInst(CFIInst);
91 OSCFIInst << uint8_t(dwarf::DW_CFA_def_cfa_expression);
92 encodeULEB128(Block.size(), OSCFIInst);
93 OSCFIInst << Block;
94
95 return MCCFIInstruction::createEscape(nullptr, OSCFIInst.str());
96}
97
// Emit a CFI instruction that defines the CFA in terms of \p StackPtrReg.
//
// \p StackPtrReg is first translated to its DWARF register number. The kind of
// CFI instruction then depends on the subtarget and on \p AspaceAlreadyDefined:
//  - flat scratch enabled: the escape expression built by
//    createScaledCFAInPrivateWave();
//  - otherwise, \p AspaceAlreadyDefined set: an LLVM def_aspace_cfa on the
//    stack pointer with offset 0 in the private_wave address space;
//  - otherwise: a plain def_cfa_register on the stack pointer.
// The result is inserted through buildCFI() carrying \p Flags.
//
// NOTE(review): the flag name reads as if the aspace form were needed when the
// address space is *not* yet defined; the selection below is the opposite.
// Confirm the intended meaning against the callers.
98void SIFrameLowering::emitDefCFA(MachineBasicBlock &MBB,
100 DebugLoc const &DL, MCRegister StackPtrReg,
101 bool AspaceAlreadyDefined,
102 MachineInstr::MIFlag Flags) const {
103 MachineFunction &MF = *MBB.getParent();
104 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
105 const SIRegisterInfo *TRI = ST.getRegisterInfo();
106
// The CFI instructions below take DWARF register numbers, not MC registers.
107 MCRegister DwarfStackPtrReg = TRI->getDwarfRegNum(StackPtrReg, false);
108 MCCFIInstruction CFIInst =
109 ST.enableFlatScratch()
110 ? createScaledCFAInPrivateWave(ST, DwarfStackPtrReg)
111 : (AspaceAlreadyDefined
112 ? MCCFIInstruction::createLLVMDefAspaceCfa(
113 nullptr, DwarfStackPtrReg, 0,
114 dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave, SMLoc())
115 : MCCFIInstruction::createDefCfaRegister(nullptr,
116 DwarfStackPtrReg));
117 buildCFI(MBB, MBBI, DL, CFIInst, Flags);
118}
119
120// Find a scratch register that we can use in the prologue. We avoid using
121// callee-save registers since they may appear to be free when this is called
122// from canUseAsPrologue (during shrink wrapping), but then no longer be free
123// when this is called from emitPrologue.
125 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
126 const TargetRegisterClass &RC, bool Unused = false) {
127 // Mark callee saved registers as used so we will not choose them.
128 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
129 for (unsigned i = 0; CSRegs[i]; ++i)
130 LiveUnits.addReg(CSRegs[i]);
131
132 // We are looking for a register that can be used throughout the entire
133 // function, so any use is unacceptable.
134 if (Unused)
135 return findUnusedRegister(MRI, LiveUnits, RC);
136
137 for (MCRegister Reg : RC) {
138 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
139 return Reg;
140 }
141
142 return MCRegister();
143}
144
145/// Query target location for spilling SGPRs
146/// \p IncludeScratchCopy : Also look for free scratch SGPRs
148 MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
149 const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
150 bool IncludeScratchCopy = true) {
152 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
153
154 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
155 const SIRegisterInfo *TRI = ST.getRegisterInfo();
156 unsigned Size = TRI->getSpillSize(RC);
157 Align Alignment = TRI->getSpillAlign(RC);
158
159 // We need to save and restore the given SGPR.
160
161 Register ScratchSGPR;
162 // 1: Try to save the given register into an unused scratch SGPR. The
163 // LiveUnits should have all the callee saved registers marked as used. For
164 // certain cases we skip copy to scratch SGPR.
165 if (IncludeScratchCopy)
166 ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
167
168 if (!ScratchSGPR) {
169 int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
171
172 if (TRI->spillSGPRToVGPR() &&
173 MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
174 /*IsPrologEpilog=*/true)) {
175 // 2: There's no free lane to spill, and no free register to save the
176 // SGPR, so we're forced to take another VGPR to use for the spill.
180
181 LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
182 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
183 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
184 << '\n';);
185 } else {
186 // Remove dead <FI> index
188 // 3: If all else fails, spill the register to memory.
189 FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
191 SGPR,
193 LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
194 << printReg(SGPR, TRI) << '\n');
195 }
196 } else {
200 LiveUnits.addReg(ScratchSGPR);
201 LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
202 << printReg(ScratchSGPR, TRI) << '\n');
203 }
204}
205
206// We need to specially emit stack operations here because a different frame
207// register is used than in the rest of the function, as getFrameRegister would
208// use.
//
// Stores \p SpillReg to frame index \p FI, addressed from \p FrameReg. The
// opcode is a scratch store when flat scratch is enabled and a buffer store
// otherwise. \p DwordOff is forwarded unchanged to buildSpillLoadStore(); the
// callers in this file advance it by 4 for each 32-bit part they store.
// \p LiveUnits is handed to buildSpillLoadStore() and is updated here so that
// \p SpillReg counts as live while the store is built.
209static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
210 const SIMachineFunctionInfo &FuncInfo,
211 LiveRegUnits &LiveUnits, MachineFunction &MF,
214 Register SpillReg, int FI, Register FrameReg,
215 int64_t DwordOff = 0) {
216 unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
217 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
218
219 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
222 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
223 FrameInfo.getObjectAlign(FI));
// Keep the value being stored marked live while the store is constructed.
224 LiveUnits.addReg(SpillReg);
// The store is the last use of SpillReg unless the block lists it as a live-in.
225 bool IsKill = !MBB.isLiveIn(SpillReg);
226 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
227 DwordOff, MMO, nullptr, &LiveUnits);
// Once killed by the store, the register is free again.
228 if (IsKill)
229 LiveUnits.removeReg(SpillReg);
230}
231
// Counterpart of buildPrologSpill(): loads frame index \p FI, addressed from
// \p FrameReg, into \p SpillReg. The opcode is a scratch load when flat scratch
// is enabled and a buffer load otherwise. \p DwordOff is forwarded unchanged to
// buildSpillLoadStore(); the callers in this file advance it by 4 for each
// 32-bit part they reload. The kill flag passed to buildSpillLoadStore() is
// always false here.
232static void buildEpilogRestore(const GCNSubtarget &ST,
233 const SIRegisterInfo &TRI,
234 const SIMachineFunctionInfo &FuncInfo,
235 LiveRegUnits &LiveUnits, MachineFunction &MF,
238 const DebugLoc &DL, Register SpillReg, int FI,
239 Register FrameReg, int64_t DwordOff = 0) {
240 unsigned Opc = ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
241 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
242
243 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
246 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
247 FrameInfo.getObjectAlign(FI));
248 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
249 DwordOff, MMO, nullptr, &LiveUnits);
250}
251
253 const DebugLoc &DL, const SIInstrInfo *TII,
254 Register TargetReg) {
255 MachineFunction *MF = MBB.getParent();
257 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
258 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
259 Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
260 Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);
261
262 if (MFI->getGITPtrHigh() != 0xffffffff) {
263 BuildMI(MBB, I, DL, SMovB32, TargetHi)
264 .addImm(MFI->getGITPtrHigh())
265 .addReg(TargetReg, RegState::ImplicitDefine);
266 } else {
267 const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
268 BuildMI(MBB, I, DL, GetPC64, TargetReg);
269 }
270 Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
271 MF->getRegInfo().addLiveIn(GitPtrLo);
272 MBB.addLiveIn(GitPtrLo);
273 BuildMI(MBB, I, DL, SMovB32, TargetLo)
274 .addReg(GitPtrLo);
275}
276
// Lazily set up \p LiveUnits for prolog/epilog emission. Does nothing when
// \p LiveUnits already holds registers. Otherwise it is initialized for
// \p TRI and seeded either with the live-ins of MBB (\p IsProlog) or with the
// live-outs of MBB stepped backward across the instruction at \p MBBI (epilog).
277static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
278 const SIMachineFunctionInfo *FuncInfo,
280 MachineBasicBlock::iterator MBBI, bool IsProlog) {
281 if (LiveUnits.empty()) {
282 LiveUnits.init(TRI);
283 if (IsProlog) {
284 LiveUnits.addLiveIns(MBB);
285 } else {
286 // In epilog.
287 LiveUnits.addLiveOuts(MBB);
288 LiveUnits.stepBackward(*MBBI);
289 }
290 }
291}
292
293namespace llvm {
294
295// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
296// BP, etc. These spills are delayed until the current function's frame is
297// finalized. For a given register, the builder uses the
298// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
302 MachineFunction &MF;
303 const GCNSubtarget &ST;
304 MachineFrameInfo &MFI;
305 SIMachineFunctionInfo *FuncInfo;
306 const SIInstrInfo *TII;
307 const SIRegisterInfo &TRI;
308 const MCRegisterInfo *MCRI;
309 const SIFrameLowering *TFI;
310 Register SuperReg;
312 LiveRegUnits &LiveUnits;
313 const DebugLoc &DL;
314 Register FrameReg;
315 ArrayRef<int16_t> SplitParts;
316 unsigned NumSubRegs;
317 unsigned EltSize = 4;
318 bool IsFramePtrPrologSpill;
319 bool NeedsFrameMoves;
320
321 static bool isExec(Register Reg) {
322 return Reg == AMDGPU::EXEC_LO || Reg == AMDGPU::EXEC;
323 }
324
325 /// If this builder requires SuperReg-based CFI, which is emitted after all
326 /// SubRegs are actually spilled, return the Register which should be used
327 /// as input to getDwarfRegNum. Otherwise, CFI should be generated per-SubReg.
328 ///
329 /// Note: Most spills handled by this builder generate CFI after each
330 /// SubReg spill, as each SubReg maps directly to a CFI register via
331 /// getDwarfRegNum(SubReg, false). All other cases currently currently
332 /// correspond to the SuperReg directly.
333 MCRegister getCFISuperReg() const {
334 if (IsFramePtrPrologSpill)
335 return FuncInfo->getFrameOffsetReg();
336 // FIXME: CFI for EXEC needs a fix by accurately computing the spill
337 // offset for both the low and high components.
338 if (isExec(SuperReg))
339 return AMDGPU::EXEC;
340 return {};
341 }
342
// Save SuperReg to the stack object \p FI, one 32-bit part at a time.
//
// Each part is first moved into a temporary VGPR with V_MOV_B32 and then
// stored with buildPrologSpill() at a running offset that advances by 4 per
// part. Aborts with a fatal error when no temporary VGPR is found. When frame
// moves are needed, CFI is emitted either after every part (getCFISuperReg()
// returned no register) or once after the loop for the CFI super register.
343 void saveToMemory(const int FI) const {
344 MachineRegisterInfo &MRI = MF.getRegInfo();
345 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
346 assert(!MFI.isDeadObjectIndex(FI));
347
348 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
349
351 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
352 if (!TmpVGPR)
353 report_fatal_error("failed to find free scratch register");
354
// Records that Reg was saved at the stack object's offset scaled by the
// wavefront size.
// NOTE(review): the offset does not include the per-part DwordOff, so every
// part reported through this lambda gets the same offset - confirm intended.
355 auto BuildCFI = [&](Register Reg) {
356 TFI->buildCFI(MBB, MI, DL,
358 nullptr, MCRI->getDwarfRegNum(Reg, false),
359 MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
360 };
361 MCRegister CFISuperReg = getCFISuperReg();
362 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
// A register with a single part is stored as itself rather than via a subreg.
363 Register SubReg = NumSubRegs == 1
364 ? SuperReg
365 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
366 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
367 .addReg(SubReg);
368
369 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
370 FI, FrameReg, DwordOff);
371 if (NeedsFrameMoves && !CFISuperReg)
372 BuildCFI(SubReg);
373 DwordOff += 4;
374 }
375 if (NeedsFrameMoves && CFISuperReg)
376 BuildCFI(CFISuperReg);
377 }
378
// Save SuperReg into the physical VGPR lanes recorded for frame index \p FI.
//
// \p FI must be a live object with the SGPRSpill stack ID, and the recorded
// lane list must contain exactly one entry per 32-bit part. Each part is
// written with SI_SPILL_S32_TO_VGPR into its assigned VGPR and lane. When frame
// moves are needed, CFI is emitted either after every part (getCFISuperReg()
// returned no register) or once after the loop for the CFI super register.
379 void saveToVGPRLane(const int FI) const {
380 assert(!MFI.isDeadObjectIndex(FI));
381
382 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
384 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
385 assert(Spill.size() == NumSubRegs);
386
387 MCRegister CFISuperReg = getCFISuperReg();
388 for (unsigned I = 0; I < NumSubRegs; ++I) {
// A register with a single part is spilled as itself rather than via a subreg.
389 Register SubReg = NumSubRegs == 1
390 ? SuperReg
391 : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
392 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
393 Spill[I].VGPR)
394 .addReg(SubReg)
395 .addImm(Spill[I].Lane)
396 .addReg(Spill[I].VGPR, RegState::Undef);
397 if (NeedsFrameMoves && !CFISuperReg)
398 TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, SubReg, Spill[I].VGPR,
399 Spill[I].Lane);
400 }
401 if (NeedsFrameMoves && CFISuperReg)
402 TFI->buildCFIForSGPRToVGPRSpill(MBB, MI, DL, CFISuperReg, Spill);
403 }
404
// Save SuperReg by copying it into the scratch register \p DstReg.
//
// When frame moves are needed, CFI recording the register-to-register save is
// emitted in one of three shapes: a single entry for a one-part register, the
// dedicated SGPR-pair form for EXEC, or one entry per 32-bit part otherwise.
// \p DstReg must split into the same number of parts as SuperReg.
405 void copyToScratchSGPR(Register DstReg) const {
406 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
407 .addReg(SuperReg)
409 if (NeedsFrameMoves) {
410 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(DstReg);
411 ArrayRef<int16_t> DstSplitParts = TRI.getRegSplitParts(RC, EltSize);
412 assert(NumSubRegs == (DstSplitParts.empty() ? 1 : DstSplitParts.size()));
413 MCRegister CFISuperReg = getCFISuperReg();
414 if (NumSubRegs == 1) {
// One part: describe the CFI super register if there is one, else SuperReg.
415 TFI->buildCFI(
416 MBB, MI, DL,
418 nullptr,
419 MCRI->getDwarfRegNum(
420 CFISuperReg ? CFISuperReg : SuperReg.asMCReg(), false),
421 MCRI->getDwarfRegNum(DstReg, false)));
422 } else if (isExec(CFISuperReg)) {
// EXEC held in an SGPR pair is described as a whole, not per part.
423 assert(NumSubRegs == 2 && "EXEC larger than 64-bit");
424 TFI->buildCFIForRegToSGPRPairSpill(MBB, MI, DL, CFISuperReg, DstReg);
425 } else {
// Otherwise pair up source and destination parts index by index.
426 for (unsigned I = 0; I < NumSubRegs; ++I) {
427 MCRegister SrcSubReg = TRI.getSubReg(SuperReg, SplitParts[I]);
428 MCRegister DstSubReg = TRI.getSubReg(DstReg, DstSplitParts[I]);
429 TFI->buildCFI(MBB, MI, DL,
431 nullptr, MCRI->getDwarfRegNum(SrcSubReg, false),
432 MCRI->getDwarfRegNum(DstSubReg, false)));
433 }
434 }
435 }
436 }
437
// Restore SuperReg from the stack object \p FI, one 32-bit part at a time.
//
// Each part is loaded into a temporary VGPR with buildEpilogRestore() at a
// running offset that advances by 4 per part, and is then moved into the
// destination part with V_READFIRSTLANE_B32. Aborts with a fatal error when no
// temporary VGPR is found.
438 void restoreFromMemory(const int FI) {
439 MachineRegisterInfo &MRI = MF.getRegInfo();
440 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
441
442 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
444 MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
445 if (!TmpVGPR)
446 report_fatal_error("failed to find free scratch register");
447
448 for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
// A register with a single part is restored as itself rather than via a subreg.
449 MCRegister SubReg = NumSubRegs == 1
450 ? SuperReg.asMCReg()
451 : TRI.getSubReg(SuperReg, SplitParts[I]);
452
453 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
454 TmpVGPR, FI, FrameReg, DwordOff);
455 assert(SubReg.isPhysical());
456
// The temporary is dead after this read, hence the kill flag.
457 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
458 .addReg(TmpVGPR, RegState::Kill);
459 DwordOff += 4;
460 }
461 }
462
// Restore SuperReg from the physical VGPR lanes recorded for frame index
// \p FI. \p FI must have the SGPRSpill stack ID and the recorded lane list must
// contain exactly one entry per 32-bit part; each part is read back with
// SI_RESTORE_S32_FROM_VGPR from its assigned VGPR and lane.
463 void restoreFromVGPRLane(const int FI) {
464 assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
466 FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
467 assert(Spill.size() == NumSubRegs);
468
469 for (unsigned I = 0; I < NumSubRegs; ++I) {
// A register with a single part is restored as itself rather than via a subreg.
470 MCRegister SubReg = NumSubRegs == 1
471 ? SuperReg.asMCReg()
472 : TRI.getSubReg(SuperReg, SplitParts[I]);
473 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
474 .addReg(Spill[I].VGPR)
475 .addImm(Spill[I].Lane);
476 }
477 }
478
// Restore SuperReg by copying it back from the scratch register \p SrcReg.
// No CFI is emitted on this path.
479 void copyFromScratchSGPR(Register SrcReg) const {
480 BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
481 .addReg(SrcReg)
483 }
484
485public:
490 const DebugLoc &DL, const SIInstrInfo *TII,
491 const SIRegisterInfo &TRI,
492 LiveRegUnits &LiveUnits, Register FrameReg,
493 bool IsFramePtrPrologSpill = false)
494 : MI(MI), MBB(MBB), MF(*MBB.getParent()),
495 ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
496 FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
497 MCRI(MF.getContext().getRegisterInfo()), TFI(ST.getFrameLowering()),
498 SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL), FrameReg(FrameReg),
499 IsFramePtrPrologSpill(IsFramePtrPrologSpill),
500 NeedsFrameMoves(MF.needsFrameMoves()) {
501 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
502 SplitParts = TRI.getRegSplitParts(RC, EltSize);
503 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
504
505 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
506 }
507
// Emit the save of SuperReg using the method recorded in SI: a store to the
// stack object SI.getIndex(), a write to the VGPR lanes of SI.getIndex(), or a
// copy into the scratch register SI.getReg().
508 void save() {
509 switch (SI.getKind()) {
511 return saveToMemory(SI.getIndex());
513 return saveToVGPRLane(SI.getIndex());
515 return copyToScratchSGPR(SI.getReg());
516 }
517 }
518
// Emit the restore of SuperReg using the method recorded in SI: a load from
// the stack object SI.getIndex(), a read from the VGPR lanes of SI.getIndex(),
// or a copy from the scratch register SI.getReg().
519 void restore() {
520 switch (SI.getKind()) {
522 return restoreFromMemory(SI.getIndex());
524 return restoreFromVGPRLane(SI.getIndex());
526 return copyFromScratchSGPR(SI.getReg());
527 }
528 }
529};
530
531} // namespace llvm
532
533// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
//
// The code is inserted into MBB before I, using \p DL for every instruction.
// It has two phases:
//  1. Obtain the 64-bit flat scratch init value as a lo/hi register pair. On
//     AMDPAL it is loaded from the GIT into a free SGPR pair; otherwise the
//     flat scratch init input register is used and marked live-in.
//  2. Combine it with \p ScratchWaveOffsetReg and program the flat scratch
//     registers, in one of three ways depending on the subtarget:
//     - flat scratch is a pointer, GFX10 or later: 64-bit add in place, then
//       S_SETREG_B32 to the FLAT_SCR_LO / FLAT_SCR_HI hardware registers;
//     - flat scratch is a pointer, older (GFX9): 64-bit add directly into
//       FLAT_SCR_LO / FLAT_SCR_HI;
//     - otherwise (asserted to be before GFX9): FLAT_SCR_LO receives the hi
//       half, FLAT_SCR_HI receives (lo + wave offset) >> 8.
534void SIFrameLowering::emitEntryFunctionFlatScratchInit(
536 const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
537 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
538 const SIInstrInfo *TII = ST.getInstrInfo();
539 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
540 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
541
542 // We don't need this if we only have spills since there is no user facing
543 // scratch.
544
545 // TODO: If we know we don't have flat instructions earlier, we can omit
546 // this from the input registers.
547 //
548 // TODO: We only need to know if we access scratch space through a flat
549 // pointer. Because we only detect if flat instructions are used at all,
550 // this will be used more often than necessary on VI.
551
552 Register FlatScrInitLo;
553 Register FlatScrInitHi;
554
555 if (ST.isAmdPalOS()) {
556 // Extract the scratch offset from the descriptor in the GIT
557 LiveRegUnits LiveUnits;
558 LiveUnits.init(*TRI);
559 LiveUnits.addLiveIns(MBB);
560
561 // Find unused reg to load flat scratch init into
562 MachineRegisterInfo &MRI = MF.getRegInfo();
563 Register FlatScrInit = AMDGPU::NoRegister;
564 ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
// Skip the leading pairs that cover the preloaded SGPRs (rounded up).
565 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
566 AllSGPR64s = AllSGPR64s.slice(
567 std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
568 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
// Take the first pair that is free at block entry, not reserved, allocatable,
// and does not contain the GIT pointer low register.
569 for (MCPhysReg Reg : AllSGPR64s) {
570 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
571 MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
572 FlatScrInit = Reg;
573 break;
574 }
575 }
576 assert(FlatScrInit && "Failed to find free register for scratch init");
577
578 FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
579 FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);
580
581 buildGitPtr(MBB, I, DL, TII, FlatScrInit);
582
583 // We now have the GIT ptr - now get the scratch descriptor from the entry
584 // at offset 0 (or offset 16 for a compute shader).
585 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
586 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
587 auto *MMO = MF.getMachineMemOperand(
588 PtrInfo,
591 8, Align(4));
592 unsigned Offset =
594 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
595 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
// The pair holding the GIT pointer is overwritten with the loaded value.
596 BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
597 .addReg(FlatScrInit)
598 .addImm(EncodedOffset) // offset
599 .addImm(0) // cpol
600 .addMemOperand(MMO);
601
602 // Mask the offset in [47:0] of the descriptor
603 const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
604 auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
605 .addReg(FlatScrInitHi)
606 .addImm(0xffff);
607 And->getOperand(3).setIsDead(); // Mark SCC as dead.
608 } else {
609 Register FlatScratchInitReg =
611 assert(FlatScratchInitReg);
612
// The input register is read below, so it must be live into the function and
// into this block.
613 MachineRegisterInfo &MRI = MF.getRegInfo();
614 MRI.addLiveIn(FlatScratchInitReg);
615 MBB.addLiveIn(FlatScratchInitReg);
616
617 FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
618 FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
619 }
620
621 // Do a 64-bit pointer add.
622 if (ST.flatScratchIsPointer()) {
623 if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
// Add in place, then write both halves to the hardware registers with
// S_SETREG_B32.
624 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
625 .addReg(FlatScrInitLo)
626 .addReg(ScratchWaveOffsetReg);
627 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
628 FlatScrInitHi)
629 .addReg(FlatScrInitHi)
630 .addImm(0);
631 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
632
633 using namespace AMDGPU::Hwreg;
634 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
635 .addReg(FlatScrInitLo)
636 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
637 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
638 .addReg(FlatScrInitHi)
639 .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
640 return;
641 }
642
643 // For GFX9.
644 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
645 .addReg(FlatScrInitLo)
646 .addReg(ScratchWaveOffsetReg);
647 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
648 AMDGPU::FLAT_SCR_HI)
649 .addReg(FlatScrInitHi)
650 .addImm(0);
651 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
652
653 return;
654 }
655
656 assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);
657
658 // Copy the size in bytes.
659 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
660 .addReg(FlatScrInitHi, RegState::Kill);
661
662 // Add wave offset in bytes to private base offset.
663 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
664 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
665 .addReg(FlatScrInitLo)
666 .addReg(ScratchWaveOffsetReg);
667
668 // Convert offset to 256-byte units.
669 auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
670 AMDGPU::FLAT_SCR_HI)
671 .addReg(FlatScrInitLo, RegState::Kill)
672 .addImm(8);
673 LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
674}
675
676// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
677// memory. They should have been removed by now.
679 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
680 I != E; ++I) {
681 if (!MFI.isDeadObjectIndex(I))
682 return false;
683 }
684
685 return true;
686}
687
688// Shift down registers reserved for the scratch RSRC.
//
// Only valid for entry functions. Returns:
//  - an invalid Register when there is no scratch RSRC register, or when it is
//    unused and the additional condition of the first check holds;
//  - the current scratch RSRC register unchanged when the subtarget has the
//    SGPR init bug or the register is not the default reserved private segment
//    buffer register;
//  - otherwise the first suitable 128-bit SGPR tuple after the preloaded
//    SGPRs, after replacing all uses of the old register with it and reserving
//    it; or the current register again when no such tuple exists.
689Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
690 MachineFunction &MF) const {
691
692 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
693 const SIInstrInfo *TII = ST.getInstrInfo();
694 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
695 MachineRegisterInfo &MRI = MF.getRegInfo();
696 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
697
698 assert(MFI->isEntryFunction());
699
700 Register ScratchRsrcReg = MFI->getScratchRSrcReg();
701
702 if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
704 return Register();
705
// Leave the register where it is if it must stay fixed or was not the default
// reservation in the first place.
706 if (ST.hasSGPRInitBug() ||
707 ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
708 return ScratchRsrcReg;
709
710 // We reserved the last registers for this. Shift it down to the end of those
711 // which were actually used.
712 //
713 // FIXME: It might be safer to use a pseudoregister before replacement.
714
715 // FIXME: We should be able to eliminate unused input registers. We only
716 // cannot do this for the resources required for scratch access. For now we
717 // skip over user SGPRs and may leave unused holes.
718
// Number of leading 128-bit tuples covered by the preloaded SGPRs (rounded up).
719 unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
720 ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
721 AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
722
723 // Skip the last N reserved elements because they should have already been
724 // reserved for VCC etc.
725 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
726 for (MCPhysReg Reg : AllSGPR128s) {
727 // Pick the first unallocated one. Make sure we don't clobber the other
728 // reserved input we needed. Also for PAL, make sure we don't clobber
729 // the GIT pointer passed in SGPR0 or SGPR8.
730 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
731 (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
732 MRI.replaceRegWith(ScratchRsrcReg, Reg);
734 MRI.reserveReg(Reg, TRI);
735 return Reg;
736 }
737 }
738
// No better tuple was found; keep the original reservation.
739 return ScratchRsrcReg;
740}
741
742static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
743 return ST.hasFlatScratchEnabled() ? 1 : ST.getWavefrontSize();
744}
745
747 MachineBasicBlock &MBB) const {
748 assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
749
750 // FIXME: If we only have SGPR spills, we won't actually be using scratch
751 // memory since these spill to VGPRs. We should be cleaning up these unused
752 // SGPR spill frame indices somewhere.
753
754 // FIXME: We still have implicit uses on SGPR spill instructions in case they
755 // need to spill to vector memory. It's likely that will not happen, but at
756 // this point it appears we need the setup. This part of the prolog should be
757 // emitted after frame indices are eliminated.
758
759 // FIXME: Remove all of the isPhysRegUsed checks
760
762 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
763 const SIInstrInfo *TII = ST.getInstrInfo();
764 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
766 const Function &F = MF.getFunction();
767 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
768
769 assert(MFI->isEntryFunction());
770
771 // Debug location must be unknown since the first debug location is used to
772 // determine the end of the prologue.
773 DebugLoc DL;
775
776 if (MF.needsFrameMoves()) {
777 // On entry the SP/FP are not set up, so we need to define the CFA in terms
778 // of a literal location expression.
779 static const char CFAEncodedInstUserOpsArr[] = {
780 dwarf::DW_CFA_def_cfa_expression,
781 4, // length
782 static_cast<char>(dwarf::DW_OP_lit0),
783 static_cast<char>(dwarf::DW_OP_lit0 +
784 dwarf::DW_ASPACE_LLVM_AMDGPU_private_wave),
785 static_cast<char>(dwarf::DW_OP_LLVM_user),
786 static_cast<char>(dwarf::DW_OP_LLVM_form_aspace_address)};
787 static StringRef CFAEncodedInstUserOps =
788 StringRef(CFAEncodedInstUserOpsArr, sizeof(CFAEncodedInstUserOpsArr));
789 buildCFI(MBB, I, DL,
790 MCCFIInstruction::createEscape(nullptr, CFAEncodedInstUserOps,
791 SMLoc(),
792 "CFA is 0 in private_wave aspace"));
793 // Unwinding halts when the return address (PC) is undefined.
794 buildCFI(MBB, I, DL,
796 nullptr, TRI->getDwarfRegNum(AMDGPU::PC_REG, false)));
797 }
798
799 Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
801
802 // We need to do the replacement of the private segment buffer register even
803 // if there are no stack objects. There could be stores to undef or a
804 // constant without an associated object.
805 //
806 // This will return `Register()` in cases where there are no actual
807 // uses of the SRSRC.
808 Register ScratchRsrcReg;
809 if (!ST.hasFlatScratchEnabled())
810 ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
811
812 // Make the selected register live throughout the function.
813 if (ScratchRsrcReg) {
814 for (MachineBasicBlock &OtherBB : MF) {
815 if (&OtherBB != &MBB) {
816 OtherBB.addLiveIn(ScratchRsrcReg);
817 }
818 }
819 }
820
821 // Now that we have fixed the reserved SRSRC we need to locate the
822 // (potentially) preloaded SRSRC.
823 Register PreloadedScratchRsrcReg;
824 if (ST.isAmdHsaOrMesa(F)) {
825 PreloadedScratchRsrcReg =
827 if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
828 // We added live-ins during argument lowering, but since they were not
829 // used they were deleted. We're adding the uses now, so add them back.
830 MRI.addLiveIn(PreloadedScratchRsrcReg);
831 MBB.addLiveIn(PreloadedScratchRsrcReg);
832 }
833 }
834
835 // We found the SRSRC first because it needs four registers and has an
836 // alignment requirement. If the SRSRC that we found overlaps with
837 // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
838 // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
839 // wave offset to a free SGPR.
840 Register ScratchWaveOffsetReg;
841 if (PreloadedScratchWaveOffsetReg &&
842 TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
843 ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
844 unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
845 AllSGPRs = AllSGPRs.slice(
846 std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
847 Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
848 for (MCPhysReg Reg : AllSGPRs) {
849 if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
850 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
851 ScratchWaveOffsetReg = Reg;
852 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
853 .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
854 break;
855 }
856 }
857
858 // FIXME: We can spill incoming arguments and restore at the end of the
859 // prolog.
860 if (!ScratchWaveOffsetReg)
862 "could not find temporary scratch offset register in prolog");
863 } else {
864 ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
865 }
866 assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
867
868 unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
869 if (!mayReserveScratchForCWSR(MF)) {
870 if (hasFP(MF)) {
872 assert(FPReg != AMDGPU::FP_REG);
873 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
874 }
875
878 assert(SPReg != AMDGPU::SP_REG);
879 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
880 }
881 } else {
882 // We need to check if we're on a compute queue - if we are, then the CWSR
883 // trap handler may need to store some VGPRs on the stack. The first VGPR
884 // block is saved separately, so we only need to allocate space for any
885 // additional VGPR blocks used. For now, we will make sure there's enough
886 // room for the theoretical maximum number of VGPRs that can be allocated.
887 // FIXME: Figure out if the shader uses fewer VGPRs in practice.
888 assert(hasFP(MF));
890 assert(FPReg != AMDGPU::FP_REG);
891 unsigned VGPRSize = llvm::alignTo(
892 (ST.getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize()) -
894 MFI->getDynamicVGPRBlockSize())) *
895 4,
896 FrameInfo.getMaxAlign());
898
899 BuildMI(MBB, I, DL, TII->get(AMDGPU::GET_STACK_BASE), FPReg);
902 assert(SPReg != AMDGPU::SP_REG);
903
904 // If at least one of the constants can be inlined, then we can use
905 // s_cselect. Otherwise, use a mov and cmovk.
906 if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
908 ST.hasInv2PiInlineImm())) {
909 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
910 .addImm(Offset + VGPRSize)
911 .addImm(Offset);
912 } else {
913 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
914 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
915 .addImm(Offset + VGPRSize);
916 }
917 }
918 }
919
920 bool NeedsFlatScratchInit =
922 (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
923 (!allStackObjectsAreDead(FrameInfo) && ST.hasFlatScratchEnabled()));
924
925 if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
926 PreloadedScratchWaveOffsetReg && !ST.hasArchitectedFlatScratch()) {
927 MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
928 MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
929 }
930
931 if (NeedsFlatScratchInit) {
932 emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
933 }
934
935 if (ScratchRsrcReg) {
936 emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
937 PreloadedScratchRsrcReg,
938 ScratchRsrcReg, ScratchWaveOffsetReg);
939 }
940
941 if (ST.hasWaitXcnt()) {
942 // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
943 // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
944 // insertion logic, which assumes multi-group mode by default.
945 unsigned RegEncoding =
947 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
948 .addImm(1)
949 .addImm(RegEncoding);
950 }
951}
952
953// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
//
// All instructions are inserted before `I` in `MBB`. One of three setups is
// selected, after which the scratch wave offset is added into the low two
// dwords of the descriptor:
//  * ST.isAmdPalOS(): form the GIT pointer in sub0_sub1 and load the four
//    descriptor dwords through it (byte offset 16 for the AMDGPU_CS calling
//    convention, otherwise 0).
//  * ST.isMesaGfxShader(Fn) or no preloaded descriptor: build the descriptor
//    with S_MOV/S_LOAD instructions plus TII->getScratchRsrcWords23().
//  * ST.isAmdHsaOrMesa(Fn): copy the preloaded descriptor when it lives in a
//    different register than ScratchRsrcReg.
//
// NOTE(review): part of the parameter list and a few statements of this
// definition are not present in this copy of the file (for example the
// declarations of `MF`, `MBB`, `I`, and of `PtrInfo` in the PAL branch). The
// comments here describe only the code that is visible.
954void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
956 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
957 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
958
959 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
960 const SIInstrInfo *TII = ST.getInstrInfo();
961 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
963 const Function &Fn = MF.getFunction();
964
965 if (ST.isAmdPalOS()) {
966 // The pointer to the GIT is formed from the offset passed in and either
967 // the amdgpu-git-ptr-high function attribute or the top part of the PC
968 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
969 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
970
971 buildGitPtr(MBB, I, DL, TII, Rsrc01);
972
973 // We now have the GIT ptr - now get the scratch descriptor from the entry
974 // at offset 0 (or offset 16 for a compute shader).
976 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
977 auto *MMO = MF.getMachineMemOperand(
978 PtrInfo,
981 16, Align(4));
982 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
 // `Subtarget` is obtained exactly like `ST` above; the byte offset is
 // converted to the subtarget's SMRD offset units before being encoded.
983 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
984 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
 // The load reads its address from Rsrc01 (sub0_sub1 of the destination) and
 // writes the whole of ScratchRsrcReg.
985 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
986 .addReg(Rsrc01)
987 .addImm(EncodedOffset) // offset
988 .addImm(0) // cpol
989 .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
990 .addMemOperand(MMO);
991
992 // The driver will always set the SRD for wave 64 (bits 118:117 of
993 // descriptor / bits 22:21 of third sub-reg will be 0b11)
994 // If the shader is actually wave32 we have to modify the const_index_stride
995 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
996 // reason the driver does this is that there can be cases where it presents
997 // 2 shaders with different wave size (e.g. VsFs).
998 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
999 if (ST.isWave32()) {
 // Clearing bit 21 of sub3 turns the 0b11 field into 0b10.
1000 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
1001 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
1002 .addImm(21)
1003 .addReg(Rsrc03);
1004 }
1005 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
1006 assert(!ST.isAmdHsaOrMesa(Fn));
1007 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
1008
1009 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
1010 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
1011
1012 // Use relocations to get the pointer, and setup the other bits manually.
1013 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
1014
 // NOTE(review): the conditions guarding the nested branches below are not
 // present in this copy. The visible code fills sub0_sub1 either with a
 // 64-bit move, a two-dword load, or (in the final else) with two 32-bit
 // moves of the SCRATCH_RSRC_DWORD0/1 external symbols.
1016 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
1017
1019 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
1020
1021 BuildMI(MBB, I, DL, Mov64, Rsrc01)
1023 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1024 } else {
1025 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
1026
1027 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1028 auto *MMO = MF.getMachineMemOperand(
1029 PtrInfo,
1032 8, Align(4));
1033 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
1035 .addImm(0) // offset
1036 .addImm(0) // cpol
1037 .addMemOperand(MMO)
1038 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1039
1042 }
1043 } else {
1044 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
1045 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
1046
1047 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
1048 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
1049 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1050
1051 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
1052 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
1053 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1054 }
1055
 // Words 2 and 3 are the low and high halves of getScratchRsrcWords23().
1056 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
1057 .addImm(Lo_32(Rsrc23))
1058 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1059
1060 BuildMI(MBB, I, DL, SMovB32, Rsrc3)
1061 .addImm(Hi_32(Rsrc23))
1062 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1063 } else if (ST.isAmdHsaOrMesa(Fn)) {
1064 assert(PreloadedScratchRsrcReg);
1065
 // Only a copy is needed, and only if the descriptor was preloaded into a
 // register other than the one chosen as ScratchRsrcReg.
1066 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
1067 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
1068 .addReg(PreloadedScratchRsrcReg, RegState::Kill);
1069 }
1070 }
1071
1072 // Add the scratch wave offset into the scratch RSRC.
1073 //
1074 // We only want to update the first 48 bits, which is the base address
1075 // pointer, without touching the adjacent 16 bits of flags. We know this add
1076 // cannot carry-out from bit 47, otherwise the scratch allocation would be
1077 // impossible to fit in the 48-bit global address space.
1078 //
1079 // TODO: Evaluate if it is better to just construct an SRD using the flat
1080 // scratch init and some constants rather than update the one we are passed.
1081 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
1082 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
1083
1084 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
1085 // the kernel body via inreg arguments.
 // 64-bit add done as S_ADD_U32 on sub0 followed by S_ADDC_U32 of 0 on sub1.
1086 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
1087 .addReg(ScratchRsrcSub0)
1088 .addReg(ScratchWaveOffsetReg)
1089 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1090 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
1091 .addReg(ScratchRsrcSub1)
1092 .addImm(0)
1093 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
1094 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
1095}
1096
// NOTE(review): the signature line and the `case` labels of this definition
// are not present in this copy of the file; presumably this is
// SIFrameLowering::isSupportedStackID -- confirm against the original source.
// Visible behaviour: a switch over `ID` whose visible arms return true and
// false; falling out of the switch is reported as unreachable because `ID`
// would then not be a valid TargetStackID::Value.
1098 switch (ID) {
1102 return true;
1106 return false;
1107 }
1108 llvm_unreachable("Invalid TargetStackID::Value");
1109}
1110
// Emits the call-frame information that is inserted at `MBBI` in `MBB`:
//  * a CFA definition based on the function's stack pointer offset register,
//  * a rule locating PC_REG in the return-address register returned by
//    TRI.getReturnAddressReg(MF),
//  * an "undefined" rule for every arch VGPR, AGPR (only with MAI
//    instructions) and SGPR that is modified by the function and is not in
//    the callee-saved list.
// NOTE(review): one line of the parameter list (declaring `MBBI`) and the tail
// of the emitDefCFA call are not present in this copy of the file.
1111void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
1113 const DebugLoc &DL) const {
1114 const MachineFunction &MF = *MBB.getParent();
1115 const MachineRegisterInfo &MRI = MF.getRegInfo();
1116 const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
1117 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1118 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
1119 MCRegister StackPtrReg =
1120 MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();
1121
1122 emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/true,
1124
1125 buildCFIForRegToSGPRPairSpill(MBB, MBBI, DL, AMDGPU::PC_REG,
1126 TRI.getReturnAddressReg(MF));
1127
 // Build a bit set of the callee-saved registers (the list returned by
 // getCalleeSavedRegs() is terminated by a zero entry) for fast lookup below.
1128 BitVector IsCalleeSaved(TRI.getNumRegs());
1129 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
1130 for (unsigned I = 0; CSRegs[I]; ++I) {
1131 IsCalleeSaved.set(CSRegs[I]);
1132 }
 // Emits an "undefined" CFI for Reg unless it is VCC (or one of its halves),
 // callee saved, or never modified in this function.
1133 auto ProcessReg = [&](MCPhysReg Reg) {
1134 // VCC is not preserved across calls.
1135 if (Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI)
1136 return;
1137 if (IsCalleeSaved.test(Reg) || !MRI.isPhysRegModified(Reg))
1138 return;
1139 MCRegister DwarfReg = MCRI->getDwarfRegNum(Reg, false);
1140 buildCFI(MBB, MBBI, DL,
1141 MCCFIInstruction::createUndefined(nullptr, DwarfReg));
1142 };
1143
1144 // Emit CFI rules for caller saved Arch VGPRs which are clobbered
 // Only the first 256 (or 1024, when the subtarget has that many addressable
 // VGPRs) registers of the VGPR_32 class are visited.
1145 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
1146 for_each(AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs),
1147 ProcessReg);
1148
1149 // Emit CFI rules for caller saved Accum VGPRs which are clobbered
1150 if (ST.hasMAIInsts()) {
1151 for_each(AMDGPU::AGPR_32RegClass.getRegisters(), ProcessReg);
1152 }
1153
1154 // Emit CFI rules for caller saved SGPRs which are clobbered
1155 for_each(AMDGPU::SGPR_32RegClass.getRegisters(), ProcessReg);
1156}
1157
1158// Activate only the inactive lanes when \p EnableInactiveLanes is true.
1159// Otherwise, activate all lanes. It returns the saved exec.
//
// The saved EXEC lives in the returned register: for whole wave functions this
// is the register defined by the whole-wave setup instruction, otherwise a
// free register of the wave-mask class (a fatal error is reported when none
// can be found). The register is added to \p LiveUnits.
// NOTE(review): the first signature line(s) and the declaration of `FuncInfo`
// are not present in this copy of the file; the name buildScratchExecCopy is
// taken from the call sites elsewhere in this file.
1161 MachineFunction &MF,
1164 const DebugLoc &DL, bool IsProlog,
1165 bool EnableInactiveLanes) {
1166 Register ScratchExecCopy;
1167 MachineRegisterInfo &MRI = MF.getRegInfo();
1168 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1169 const SIInstrInfo *TII = ST.getInstrInfo();
1170 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1172
1173 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
1174
1175 if (FuncInfo->isWholeWaveFunction()) {
1176 // Whole wave functions already have a copy of the original EXEC mask that
1177 // we can use.
1178 assert(IsProlog && "Epilog should look at return, not setup");
1179 ScratchExecCopy =
1180 TII->getWholeWaveFunctionSetup(MF)->getOperand(0).getReg();
1181 assert(ScratchExecCopy && "Couldn't find copy of EXEC");
1182 } else {
1183 ScratchExecCopy = findScratchNonCalleeSaveRegister(
1184 MRI, LiveUnits, *TRI.getWaveMaskRegClass());
1185 }
1186
1187 if (!ScratchExecCopy)
1188 report_fatal_error("failed to find free scratch register");
1189
1190 LiveUnits.addReg(ScratchExecCopy);
1191
 // XOR with -1 enables exactly the lanes that were inactive; OR with -1
 // enables every lane. The 32- or 64-bit form follows the wave size.
1192 const unsigned SaveExecOpc =
1193 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
1194 : AMDGPU::S_OR_SAVEEXEC_B32)
1195 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
1196 : AMDGPU::S_OR_SAVEEXEC_B64);
1197 auto SaveExec =
1198 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
1199 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
1200
1201 return ScratchExecCopy;
1202}
1203
// Emits the prologue spill stores before `MBBI` in `MBB`, addressing the spill
// slots relative to `FrameReg` (the callers in this file pass either the stack
// pointer or the frame pointer):
//  1. Whole-wave-mode (WWM) VGPRs: scratch registers are stored with only the
//     inactive lanes enabled, callee-saved ones with all lanes enabled.
//  2. The SGPR spills recorded in getPrologEpilogSGPRSpills(); the frame
//     pointer entry is stored from `FramePtrRegScratchCopy` and is skipped
//     when that register is null.
//  3. Registers chosen as scratch-SGPR copy destinations are added as live-ins
//     of every basic block of the function.
// When `NeedsFrameMoves` is set, a CFI rule is emitted for each stored WWM
// VGPR.
// NOTE(review): the opening signature line(s) and the declarations of
// `FuncInfo` and `LMC` are not present in this copy of the file; the name
// emitCSRSpillStores is taken from the call sites elsewhere in this file.
1207 LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy,
1208 const bool NeedsFrameMoves) const {
1210 MachineFrameInfo &MFI = MF.getFrameInfo();
1211 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1212 const SIInstrInfo *TII = ST.getInstrInfo();
1213 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1214 const MCRegisterInfo *MCRI = MF.getContext().getRegisterInfo();
1215 MachineRegisterInfo &MRI = MF.getRegInfo();
1217
1218 // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
1219 // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
1220 // might end up flipping the EXEC bits twice.
1221 Register ScratchExecCopy;
1222 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1223 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
1224 if (!WWMScratchRegs.empty())
1225 ScratchExecCopy =
1226 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1227 /*IsProlog*/ true, /*EnableInactiveLanes*/ true);
1228
 // Stores every (VGPR, frame index) pair of the given list and, when frame
 // moves are needed, emits the matching CFI rule.
1229 auto StoreWWMRegisters =
1231 for (const auto &Reg : WWMRegs) {
1232 Register VGPR = Reg.first;
1233 int FI = Reg.second;
1234 buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
1235 VGPR, FI, FrameReg);
1236 if (NeedsFrameMoves) {
1237 // We spill the entire VGPR, so we can get away with just cfi_offset
1238 buildCFI(MBB, MBBI, DL,
1240 nullptr, MCRI->getDwarfRegNum(VGPR, false),
1241 MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
1242 }
1243 }
1244 };
1245
 // The WWM scratch registers are read by the stores below, so they must be
 // live into the function and this block (reserved registers are skipped).
1246 for (const Register Reg : make_first_range(WWMScratchRegs)) {
1247 if (!MRI.isReserved(Reg)) {
1248 MRI.addLiveIn(Reg);
1249 MBB.addLiveIn(Reg);
1250 }
1251 }
1252 StoreWWMRegisters(WWMScratchRegs);
1253
 // Sets EXEC to all ones without saving the previous value.
1254 auto EnableAllLanes = [&]() {
1255 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
1256 };
1257
1258 if (!WWMCalleeSavedRegs.empty()) {
 // If EXEC was already saved for the scratch registers, just widen it;
 // otherwise save it now while enabling all lanes.
1259 if (ScratchExecCopy) {
1260 EnableAllLanes();
1261 } else {
1262 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1263 /*IsProlog*/ true,
1264 /*EnableInactiveLanes*/ false);
1265 }
1266 }
1267
1268 StoreWWMRegisters(WWMCalleeSavedRegs);
1269 if (FuncInfo->isWholeWaveFunction()) {
1270 // If we have already saved some WWM CSR registers, then the EXEC is already
1271 // -1 and we don't need to do anything else. Otherwise, set EXEC to -1 here.
1272 if (!ScratchExecCopy)
1273 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL, /*IsProlog*/ true,
1274 /*EnableInactiveLanes*/ true);
1275 else if (WWMCalleeSavedRegs.empty())
1276 EnableAllLanes();
1277 } else if (ScratchExecCopy) {
1278 // FIXME: Split block and make terminator.
 // Restore the EXEC mask that was saved before the WWM stores.
1279 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
1280 .addReg(ScratchExecCopy, RegState::Kill);
1281 LiveUnits.addReg(ScratchExecCopy);
1282 }
1283
1284 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1285
1286 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1287 // Special handling for the FP spill:
1288 // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
1289 // Otherwise, FP has been moved to a temporary register, so spill that
1290 // register instead.
1291 bool IsFramePtrPrologSpill = Spill.first == FramePtrReg;
1292 Register Reg = IsFramePtrPrologSpill ? FramePtrRegScratchCopy : Spill.first;
1293 if (!Reg)
1294 continue;
1295
1296 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1297 LiveUnits, FrameReg, IsFramePtrPrologSpill);
1298 SB.save();
1299 }
1300
1301 // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
1302 // such scratch registers live throughout the function.
1303 SmallVector<Register, 1> ScratchSGPRs;
1304 FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
1305 if (!ScratchSGPRs.empty()) {
 // Note: the loop variable `MBB` shadows the enclosing block within the loop.
1306 for (MachineBasicBlock &MBB : MF) {
1307 for (MCPhysReg Reg : ScratchSGPRs)
1308 MBB.addLiveIn(Reg);
1309
1310 MBB.sortUniqueLiveIns();
1311 }
 // LiveUnits is only updated if it has already been populated.
1312 if (!LiveUnits.empty()) {
1313 for (MCPhysReg Reg : ScratchSGPRs)
1314 LiveUnits.addReg(Reg);
1315 }
1316 }
1317
1318 // Remove the spill entry created for EXEC. It is needed only for CFISaves in
1319 // the prologue.
1320 if (TRI.isCFISavedRegsSpillEnabled())
1321 FuncInfo->removePrologEpilogSGPRSpillEntry(TRI.getExec());
1322}
1323
// Emits the epilogue counterpart of the prologue spill stores before `MBBI` in
// `MBB`, addressing the slots relative to `FrameReg`:
//  1. The SGPR spills recorded in getPrologEpilogSGPRSpills() are restored;
//     the frame pointer entry is restored into `FramePtrRegScratchCopy` and is
//     skipped when that register is null.
//  2. Whole-wave-mode (WWM) VGPRs are reloaded, scratch registers with only
//     the inactive lanes enabled and callee-saved ones with all lanes enabled.
// For whole wave functions the return instruction is additionally rewritten to
// its ordinary form after the original EXEC has been restored from its first
// operand.
// NOTE(review): the opening signature line(s) and the declaration of `LMC` are
// not present in this copy of the file; the name emitCSRSpillRestores is taken
// from the call sites elsewhere in this file.
1327 LiveRegUnits &LiveUnits, Register FrameReg,
1328 Register FramePtrRegScratchCopy) const {
1329 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1330 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1331 const SIInstrInfo *TII = ST.getInstrInfo();
1332 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1334 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1335
1336 for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
1337 // Special handling for the FP restore:
1338 // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
1339 // the FP value to a temporary register. The frame pointer should be
1340 // overwritten only at the end when all other spills are restored from
1341 // current frame.
1342 Register Reg =
1343 Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
1344 if (!Reg)
1345 continue;
1346
1347 PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
1348 LiveUnits, FrameReg);
1349 SB.restore();
1350 }
1351
1352 // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
1353 // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
1354 // this, we might end up flipping the EXEC bits twice.
1355 Register ScratchExecCopy;
1356 SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
1357 FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
 // Reloads every (VGPR, frame index) pair of the given list.
1358 auto RestoreWWMRegisters =
1360 for (const auto &Reg : WWMRegs) {
1361 Register VGPR = Reg.first;
1362 int FI = Reg.second;
1363 buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
1364 VGPR, FI, FrameReg);
1365 }
1366 };
1367
1368 if (FuncInfo->isWholeWaveFunction()) {
1369 // For whole wave functions, the EXEC is already -1 at this point.
1370 // Therefore, we can restore the CSR WWM registers right away.
1371 RestoreWWMRegisters(WWMCalleeSavedRegs);
1372
1373 // The original EXEC is the first operand of the return instruction.
1374 MachineInstr &Return = MBB.instr_back();
1375 unsigned Opcode = Return.getOpcode();
 // Map the whole-wave return pseudo to the opcode it is rewritten to below.
1376 switch (Opcode) {
1377 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN:
1378 Opcode = AMDGPU::SI_RETURN;
1379 break;
1380 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
1381 Opcode = AMDGPU::SI_TCRETURN_GFX;
1382 break;
1383 default:
1384 llvm_unreachable("Unexpected return inst");
1385 }
1386 Register OrigExec = Return.getOperand(0).getReg();
1387
1388 if (!WWMScratchRegs.empty()) {
 // EXEC = OrigExec ^ -1, i.e. only the originally inactive lanes.
1389 BuildMI(MBB, MBBI, DL, TII->get(LMC.XorOpc), LMC.ExecReg)
1390 .addReg(OrigExec)
1391 .addImm(-1);
1392 RestoreWWMRegisters(WWMScratchRegs);
1393 }
1394
1395 // Restore original EXEC.
1396 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addReg(OrigExec);
1397
1398 // Drop the first operand and update the opcode.
1399 Return.removeOperand(0);
1400 Return.setDesc(TII->get(Opcode));
1401
1402 return;
1403 }
1404
1405 if (!WWMScratchRegs.empty()) {
1406 ScratchExecCopy =
1407 buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1408 /*IsProlog=*/false, /*EnableInactiveLanes=*/true);
1409 }
1410 RestoreWWMRegisters(WWMScratchRegs);
1411 if (!WWMCalleeSavedRegs.empty()) {
 // If EXEC was already saved for the scratch registers, just enable all
 // lanes; otherwise save it now while enabling all lanes.
1412 if (ScratchExecCopy) {
1413 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg).addImm(-1);
1414 } else {
1415 ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
1416 /*IsProlog*/ false,
1417 /*EnableInactiveLanes*/ false);
1418 }
1419 }
1420
1421 RestoreWWMRegisters(WWMCalleeSavedRegs);
1422 if (ScratchExecCopy) {
1423 // FIXME: Split block and make terminator.
 // Put back the EXEC mask that was saved before the WWM reloads.
1424 BuildMI(MBB, MBBI, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
1425 .addReg(ScratchExecCopy, RegState::Kill);
1426 }
1427}
1428
// Emits the function prologue into `MBB`. Entry functions return early; for
// all other functions the visible steps are, in order:
//  1. optionally emit the entry CFI (when frame moves are needed),
//  2. preserve the incoming frame pointer, either through its dedicated
//     scratch SGPR or through a temporary copy that is spilled later,
//  3. set up the frame pointer (re-aligned from, or copied from, the stack
//     pointer) and emit the CSR spill stores relative to FP or SP,
//  4. set up the base pointer if one is required,
//  5. redefine the CFA in terms of FP and bump SP by the (rounded) frame size,
//  6. erase the whole-wave setup pseudo for whole wave functions.
// NOTE(review): the opening signature line, the declarations of `FuncInfo` and
// `MBBI`, the statement inside the entry-function branch and the trailing
// operands/flags of several BuildMI chains are not present in this copy of the
// file; presumably this is SIFrameLowering::emitPrologue -- confirm.
1430 MachineBasicBlock &MBB) const {
1432 if (FuncInfo->isEntryFunction()) {
1434 return;
1435 }
1436
1437 MachineFrameInfo &MFI = MF.getFrameInfo();
1438 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1439 const SIInstrInfo *TII = ST.getInstrInfo();
1440 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1441 MachineRegisterInfo &MRI = MF.getRegInfo();
1442
1443 Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1444 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1445 Register BasePtrReg =
1446 TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
1447 LiveRegUnits LiveUnits;
1448
1450 // DebugLoc must be unknown since the first instruction with DebugLoc is used
1451 // to determine the end of the prologue.
1452 DebugLoc DL;
1453
1454 bool HasFP = false;
1455 bool HasBP = false;
1456 uint32_t NumBytes = MFI.getStackSize();
1457 uint32_t RoundedSize = NumBytes;
1458
1459 // Functions that never return don't need to save and restore the FP or BP.
1460 const Function &F = MF.getFunction();
1461 bool SavesStackRegs =
1462 !F.hasFnAttribute(Attribute::NoReturn) && !FuncInfo->isChainFunction();
1463
1464 const bool NeedsFrameMoves = MF.needsFrameMoves();
1465
1466 if (NeedsFrameMoves)
1467 emitPrologueEntryCFI(MBB, MBBI, DL);
1468
 // Stack realignment always forces a frame pointer.
1469 if (TRI.hasStackRealignment(MF))
1470 HasFP = true;
1471
1472 Register FramePtrRegScratchCopy;
1473 if (!HasFP && !hasFP(MF)) {
1474 // Emit the CSR spill stores with SP base register.
1475 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1476 FramePtrRegScratchCopy, NeedsFrameMoves);
1477 } else if (SavesStackRegs) {
1478 // CSR spill stores will use FP as base register.
1479 Register SGPRForFPSaveRestoreCopy =
1480 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1481
1482 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
1483 if (SGPRForFPSaveRestoreCopy) {
1484 // Copy FP to the scratch register now and emit the CFI entry. It avoids
1485 // the extra FP copy needed in the other two cases when FP is spilled to
1486 // memory or to a VGPR lane.
1488 FramePtrReg,
1489 FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
1490 DL, TII, TRI, LiveUnits, FramePtrReg,
1491 /*IsFramePtrPrologSpill*/ true);
1492 SB.save();
1493 LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1494 } else {
1495 // Copy FP into a new scratch register so that its previous value can be
1496 // spilled after setting up the new frame.
1497 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1498 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1499 if (!FramePtrRegScratchCopy)
1500 report_fatal_error("failed to find free scratch register");
1501
1502 LiveUnits.addReg(FramePtrRegScratchCopy);
1503 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
1504 .addReg(FramePtrReg);
1505 }
1506 }
1507
 // At this point HasFP is true only when stack realignment is required.
1508 if (HasFP) {
1509 const unsigned Alignment = MFI.getMaxAlign().value();
1510
 // Reserve room for the worst-case padding introduced by the realignment.
1511 RoundedSize += Alignment;
1512 if (LiveUnits.empty()) {
1513 LiveUnits.init(TRI);
1514 LiveUnits.addLiveIns(MBB);
1515 }
1516
1517 // s_add_i32 s33, s32, NumBytes
1518 // s_and_b32 s33, s33, 0b111...0000
1519 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
1520 .addReg(StackPtrReg)
1521 .addImm((Alignment - 1) * getScratchScaleFactor(ST))
1523 auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
1524 .addReg(FramePtrReg, RegState::Kill)
1525 .addImm(-Alignment * getScratchScaleFactor(ST))
1527 And->getOperand(3).setIsDead(); // Mark SCC as dead.
1528 FuncInfo->setIsStackRealigned(true);
1529 } else if ((HasFP = hasFP(MF))) {
 // No realignment needed: FP is simply the incoming SP.
1530 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1531 .addReg(StackPtrReg)
1533 }
1534
1535 // If FP is used, emit the CSR spills with FP base register.
1536 if (HasFP) {
1537 emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1538 FramePtrRegScratchCopy, NeedsFrameMoves);
 // The temporary FP copy is no longer tracked once the spills are emitted.
1539 if (FramePtrRegScratchCopy)
1540 LiveUnits.removeReg(FramePtrRegScratchCopy);
1541 }
1542
1543 // If we need a base pointer, set it up here. It's whatever the value of
1544 // the stack pointer is at this point. Any variable size objects will be
1545 // allocated after this, so we can still use the base pointer to reference
1546 // the incoming arguments.
1547 if ((HasBP = TRI.hasBasePointer(MF))) {
1548 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
1549 .addReg(StackPtrReg)
1551 }
1552
1553 if (HasFP) {
1554 if (NeedsFrameMoves)
1555 emitDefCFA(MBB, MBBI, DL, FramePtrReg, /*AspaceAlreadyDefined=*/false,
1557 }
1558
 // Allocate the frame by advancing SP; the byte size is scaled by
 // getScratchScaleFactor(ST).
1559 if (HasFP && RoundedSize != 0) {
1560 auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
1561 .addReg(StackPtrReg)
1562 .addImm(RoundedSize * getScratchScaleFactor(ST))
1564 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1565 }
1566
 // Consistency checks between the FP/BP decisions made here and the spill
 // entries recorded in the function info; (void) casts avoid unused-variable
 // warnings when asserts are compiled out.
1567 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1568 (void)FPSaved;
1569 assert((!HasFP || FPSaved || !SavesStackRegs) &&
1570 "Needed to save FP but didn't save it anywhere");
1571
1572 // If we allow spilling to AGPRs we may have saved FP but then spill
1573 // everything into AGPRs instead of the stack.
1574 assert((HasFP || !FPSaved || !SavesStackRegs || EnableSpillVGPRToAGPR) &&
1575 "Saved FP but didn't need it");
1576
1577 bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
1578 (void)BPSaved;
1579 assert((!HasBP || BPSaved || !SavesStackRegs) &&
1580 "Needed to save BP but didn't save it anywhere");
1581
1582 assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
1583
1584 if (FuncInfo->isWholeWaveFunction()) {
1585 // SI_WHOLE_WAVE_FUNC_SETUP has outlived its purpose.
1586 TII->getWholeWaveFunctionSetup(MF)->eraseFromParent();
1587 }
1588}
1589
// Emits the function epilogue into `MBB`, before its first terminator. Nothing
// is emitted for entry functions, nor for chain functions without a tail call.
// Otherwise the visible steps are: reset SP from the base or frame pointer
// (when the frame is non-empty), restore the CSR spills relative to FP (if FP
// was saved) or SP, redefine the CFA in terms of SP, and finally restore FP.
// NOTE(review): the opening signature line, the declaration of `MBBI` and the
// trailing operands/flags of several BuildMI chains are not present in this
// copy of the file; presumably this is SIFrameLowering::emitEpilogue --
// confirm.
1591 MachineBasicBlock &MBB) const {
1592 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1593 if (FuncInfo->isEntryFunction())
1594 return;
1595
1596 const MachineFrameInfo &MFI = MF.getFrameInfo();
1597 if (FuncInfo->isChainFunction() && !MFI.hasTailCall())
1598 return;
1599
1600 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1601 const SIInstrInfo *TII = ST.getInstrInfo();
1602 const SIRegisterInfo &TRI = TII->getRegisterInfo();
1603 MachineRegisterInfo &MRI = MF.getRegInfo();
1604 LiveRegUnits LiveUnits;
1605 // Get the insert location for the epilogue. If there were no terminators in
1606 // the block, get the last instruction.
1608 DebugLoc DL;
1609 if (!MBB.empty()) {
 // The debug location comes from the last non-debug instruction, while the
 // insertion point is the first terminator.
1610 MBBI = MBB.getLastNonDebugInstr();
1611 if (MBBI != MBB.end())
1612 DL = MBBI->getDebugLoc();
1613
1614 MBBI = MBB.getFirstTerminator();
1615 }
1616
 // Mirror of the prologue: a realigned stack reserved getMaxAlign() extra
 // bytes on top of the stack size.
1617 uint32_t NumBytes = MFI.getStackSize();
1618 uint32_t RoundedSize = FuncInfo->isStackRealigned()
1619 ? NumBytes + MFI.getMaxAlign().value()
1620 : NumBytes;
1621 const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
1622 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1623 bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
1624
1625 if (RoundedSize != 0) {
 // Release the frame by resetting SP; the base pointer is preferred over the
 // frame pointer when both exist.
1626 if (TRI.hasBasePointer(MF)) {
1627 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1628 .addReg(TRI.getBaseRegister())
1630 } else if (hasFP(MF)) {
1631 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg)
1632 .addReg(FramePtrReg)
1634 }
1635 }
1636
1637 Register FramePtrRegScratchCopy;
1638 Register SGPRForFPSaveRestoreCopy =
1639 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1640 if (FPSaved) {
1641 // CSR spill restores should use FP as base register. If
1642 // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
1643 // into a new scratch register and copy to FP later when other registers are
1644 // restored from the current stack frame.
1645 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
1646 if (SGPRForFPSaveRestoreCopy) {
1647 LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
1648 } else {
1649 FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
1650 MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
1651 if (!FramePtrRegScratchCopy)
1652 report_fatal_error("failed to find free scratch register");
1653
1654 LiveUnits.addReg(FramePtrRegScratchCopy);
1655 }
1656
1657 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
1658 FramePtrRegScratchCopy);
1659 }
1660
1661 if (hasFP(MF) && MF.needsFrameMoves()) {
1662 emitDefCFA(MBB, MBBI, DL, StackPtrReg, /*AspaceAlreadyDefined=*/false,
1664 }
1665
1666 if (FPSaved) {
1667 // Insert the copy to restore FP.
 // The source is the dedicated scratch SGPR when one exists, otherwise the
 // temporary register the old FP value was restored into above.
1668 Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
1669 : FramePtrRegScratchCopy;
1671 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
1672 .addReg(SrcReg);
1673 if (SGPRForFPSaveRestoreCopy)
1675 } else {
1676 // Insert the CSR spill restores with SP as the base register.
1677 emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
1678 FramePtrRegScratchCopy);
1679 }
1680}
1681
// Debug-only helper (compiled only when NDEBUG is not defined). It walks every
// frame object index and returns false as soon as a live (non-dead) object
// also satisfies a second condition; it returns true if no such object exists.
// NOTE(review): the signature line and the second half of the condition are
// not present in this copy of the file; presumably this is
// allSGPRSpillsAreDead(const MachineFunction &MF), checking for live SGPR
// spill slots -- confirm against the original source.
1682#ifndef NDEBUG
1684 const MachineFrameInfo &MFI = MF.getFrameInfo();
1685 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1686 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1687 I != E; ++I) {
1688 if (!MFI.isDeadObjectIndex(I) &&
1691 return false;
1692 }
1693 }
1694
1695 return true;
1696}
1697#endif
1698
// Reports, through the out-parameter `FrameReg`, the register that frame index
// `FI` is addressed from: the frame register chosen by SIRegisterInfo for this
// function.
// NOTE(review): the signature line and the return statement are not present in
// this copy of the file; presumably this is
// SIFrameLowering::getFrameIndexReference -- confirm against the original.
1700 int FI,
1701 Register &FrameReg) const {
1702 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1703
1704 FrameReg = RI->getFrameRegister(MF);
1706}
1707
// Frame-related clean-up done on the whole function `MF`:
//  1. When VGPR<->AGPR spilling applies, every VGPR spill whose frame index
//     can be assigned to a register is lowered immediately (using `RS`), the
//     frame indices that are no longer needed are marked dead, and the
//     registers used for those spills become live-ins of every block.
//  2. Dead frame indices are removed and remaining SGPR spill stack IDs are
//     reset to the default stack.
//  3. If any stack object is still live, emergency scavenging slot(s) are
//     registered with `RS`.
// NOTE(review): the signature line, the declaration of `FuncInfo`, the inner
// instruction loop header and parts of two conditions are not present in this
// copy of the file; presumably this is
// SIFrameLowering::processFunctionBeforeFrameFinalized -- confirm.
1709 MachineFunction &MF,
1710 RegScavenger *RS) const {
1711 MachineFrameInfo &MFI = MF.getFrameInfo();
1712
1713 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1714 const SIInstrInfo *TII = ST.getInstrInfo();
1715 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1716 MachineRegisterInfo &MRI = MF.getRegInfo();
1718
1719 const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
1721
1722 if (SpillVGPRToAGPR) {
1723 // To track the spill frame indices handled in this pass.
1724 BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
1725 BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);
1726
1727 bool SeenDbgInstr = false;
1728
1729 for (MachineBasicBlock &MBB : MF) {
1731 int FrameIndex;
1732 if (MI.isDebugInstr())
1733 SeenDbgInstr = true;
1734
1735 if (TII->isVGPRSpill(MI)) {
1736 // Try to eliminate stack used by VGPR spills before frame
1737 // finalization.
1738 unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
1739 AMDGPU::OpName::vaddr);
1740 int FI = MI.getOperand(FIOp).getIndex();
1741 Register VReg =
1742 TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
1743 if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
1744 TRI->isAGPR(MRI, VReg))) {
 // The scavenger is positioned just after MI, scanning backwards from
 // the end of the block, before the frame index is eliminated.
1745 assert(RS != nullptr);
1746 RS->enterBasicBlockEnd(MBB);
1747 RS->backward(std::next(MI.getIterator()));
1748 TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
1749 SpillFIs.set(FI);
1750 continue;
1751 }
1752 } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
1753 TII->isLoadFromStackSlot(MI, FrameIndex))
 // Remember non-fixed slots that are still accessed by other loads or
 // stores, so they are not declared dead below.
1754 if (!MFI.isFixedObjectIndex(FrameIndex))
1755 NonVGPRSpillFIs.set(FrameIndex);
1756 }
1757 }
1758
1759 // Stack slot coloring may assign different objects to the same stack slot.
1760 // If not, then the VGPR to AGPR spill slot is dead.
1761 for (unsigned FI : SpillFIs.set_bits())
1762 if (!NonVGPRSpillFIs.test(FI))
1763 FuncInfo->setVGPRToAGPRSpillDead(FI);
1764
1765 for (MachineBasicBlock &MBB : MF) {
1766 for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
1767 MBB.addLiveIn(Reg);
1768
1769 for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
1770 MBB.addLiveIn(Reg);
1771
1772 MBB.sortUniqueLiveIns();
1773
 // Debug info is only revisited if a debug instruction was seen above.
1774 if (!SpillFIs.empty() && SeenDbgInstr)
1775 clearDebugInfoForSpillFIs(MFI, MBB, SpillFIs);
1776 }
1777 }
1778
1779 // At this point we've already allocated all spilled SGPRs to VGPRs if we
1780 // can. Any remaining SGPR spills will go to memory, so move them back to the
1781 // default stack.
1782 bool HaveSGPRToVMemSpill =
1783 FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
1785 "SGPR spill should have been removed in SILowerSGPRSpills");
1786
1787 // FIXME: The other checks should be redundant with allStackObjectsAreDead,
1788 // but currently hasNonSpillStackObjects is set only from source
1789 // allocas. Stack temps produced from legalization are not counted currently.
1790 if (!allStackObjectsAreDead(MFI)) {
1791 assert(RS && "RegScavenger required if spilling");
1792
1793 // Add an emergency spill slot
1794 RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));
1795
1796 // If we are spilling SGPRs to memory with a large frame, we may need a
1797 // second VGPR emergency frame index.
1798 if (HaveSGPRToVMemSpill &&
1800 RS->addScavengingFrameIndex(MFI.CreateSpillStackObject(4, Align(4)));
1801 }
1802 }
1803}
1804
1806 MachineFunction &MF, RegScavenger *RS) const {
1807 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1809 MachineRegisterInfo &MRI = MF.getRegInfo();
1811
1812 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
1813 // On gfx908, we had initially reserved highest available VGPR for AGPR
1814 // copy. Now since we are done with RA, check if there exist an unused VGPR
1815 // which is lower than the eariler reserved VGPR before RA. If one exist,
1816 // use it for AGPR copy instead of one reserved before RA.
1817 Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
1818 Register UnusedLowVGPR =
1819 TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
1820 if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
1821 TRI->getHWRegIndex(VGPRForAGPRCopy))) {
1822 // Reserve this newly identified VGPR (for AGPR copy).
1823 // Reserved registers should already be frozen at this point,
1824 // so we can avoid calling MRI.freezeReservedRegs and just use
1825 // MRI.reserveReg.
1826 FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
1827 MRI.reserveReg(UnusedLowVGPR, TRI);
1828 }
1829 }
1830 // We initially reserved the highest available SGPR pair for long branches.
1831 // Now, after RA, we shift down to a lower unused one if one exists.
1832 Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
1833 Register UnusedLowSGPR =
1834 TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
1835 // If LongBranchReservedReg is null, then we didn't find a long branch
1836 // and never reserved a register to begin with, so there is nothing to
1837 // shift down. If UnusedLowSGPR is null, there is no lower register
1838 // available to use, so just keep the original one we set.
1839 if (LongBranchReservedReg && UnusedLowSGPR) {
1840 FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
1841 MRI.reserveReg(UnusedLowSGPR, TRI);
1842 }
1843}
1844
1845// The special SGPR spills, like the ones needed for FP, BP or any reserved
1846// registers, are delayed until frame lowering.
1848 MachineFunction &MF, BitVector &SavedVGPRs,
1849 bool NeedExecCopyReservedReg) const {
1850 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1851 MachineRegisterInfo &MRI = MF.getRegInfo();
1853 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1854 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1855 LiveRegUnits LiveUnits;
1856 LiveUnits.init(*TRI);
1857 // Initially mark callee saved registers as used so we will not choose them
1858 // while looking for scratch SGPRs.
1859 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
1860 for (unsigned I = 0; CSRegs[I]; ++I)
1861 LiveUnits.addReg(CSRegs[I]);
1862
1863 const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
1864
1865 Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
1866 if (NeedExecCopyReservedReg ||
1867 (ReservedRegForExecCopy &&
1868 MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
1869 MRI.reserveReg(ReservedRegForExecCopy, TRI);
1870 Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
1871 if (UnusedScratchReg) {
1872 // If found any unused scratch SGPR, reserve the register itself for Exec
1873 // copy and there is no need for any spills in that case.
1874 MFI->setSGPRForEXECCopy(UnusedScratchReg);
1875 MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
1876 LiveUnits.addReg(UnusedScratchReg);
1877 } else {
1878 // Needs spill.
1879 assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
1880 "Re-reserving spill slot for EXEC copy register");
1881 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
1882 /*IncludeScratchCopy=*/false);
1883 }
1884 } else if (ReservedRegForExecCopy) {
1885 // Reset it at this point. There are no whole-wave copies and spills
1886 // encountered.
1887 MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
1888 }
1889
1890 if (TRI->isCFISavedRegsSpillEnabled()) {
1891 Register Exec = TRI->getExec();
1893 "Re-reserving spill slot for EXEC");
1894 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, Exec, RC);
1895 }
1896
1897 // Functions that don't return to the caller don't need to preserve
1898 // the FP and BP.
1899 const Function &F = MF.getFunction();
1900 if (F.hasFnAttribute(Attribute::NoReturn) ||
1901 AMDGPU::isChainCC(F.getCallingConv()))
1902 return;
1903
1904 // hasFP only knows about stack objects that already exist. We're now
1905 // determining the stack slots that will be created, so we have to predict
1906 // them. Stack objects force FP usage with calls.
1907 //
1908 // Note a new VGPR CSR may be introduced if one is used for the spill, but we
1909 // don't want to report it here.
1910 //
1911 // FIXME: Is this really hasReservedCallFrame?
1912 const bool WillHaveFP =
1913 FrameInfo.hasCalls() &&
1914 (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
1915
1916 if (WillHaveFP || hasFP(MF)) {
1917 Register FramePtrReg = MFI->getFrameOffsetReg();
1918 assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
1919 "Re-reserving spill slot for FP");
1920 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
1921 }
1922
1923 if (TRI->hasBasePointer(MF)) {
1924 Register BasePtrReg = TRI->getBaseRegister();
1925 assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
1926 "Re-reserving spill slot for BP");
1927 getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
1928 }
1929}
1930
1931// Only report VGPRs to generic code.
1933 BitVector &SavedVGPRs,
1934 RegScavenger *RS) const {
1936
1937 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1938 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1939 // we don't need to save and restore anything.
1940 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1941 return;
1942
1944
1945 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1946 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1947 const SIInstrInfo *TII = ST.getInstrInfo();
1948 bool NeedExecCopyReservedReg = false;
1949
1950 MachineInstr *ReturnMI = nullptr;
1951 for (MachineBasicBlock &MBB : MF) {
1952 for (MachineInstr &MI : MBB) {
1953 // TODO: Walking through all MBBs here would be a bad heuristic. Better
1954 // handle them elsewhere.
1955 if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1956 NeedExecCopyReservedReg = true;
1957 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1958 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1959 MI.getOpcode() == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
1960 (MFI->isChainFunction() &&
1961 TII->isChainCallOpcode(MI.getOpcode()))) {
1962 // We expect all returns to be the same size.
1963 assert(!ReturnMI ||
1964 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1965 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1966 ReturnMI = &MI;
1967 }
1968 }
1969 }
1970
1971 SmallVector<Register> SortedWWMVGPRs;
1972 for (Register Reg : MFI->getWWMReservedRegs()) {
1973 // The shift-back is needed only for the VGPRs used for SGPR spills and they
1974 // are of 32-bit size. SIPreAllocateWWMRegs pass can add tuples into WWM
1975 // reserved registers.
1976 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
1977 if (TRI->getRegSizeInBits(*RC) != 32)
1978 continue;
1979 SortedWWMVGPRs.push_back(Reg);
1980 }
1981
1982 sort(SortedWWMVGPRs, std::greater<Register>());
1983 MFI->shiftWwmVGPRsToLowestRange(MF, SortedWWMVGPRs, SavedVGPRs);
1984
1985 if (MFI->isEntryFunction())
1986 return;
1987
1988 if (MFI->isWholeWaveFunction()) {
1989 // In practice, all the VGPRs are WWM registers, and we will need to save at
1990 // least their inactive lanes. Add them to WWMReservedRegs.
1991 assert(!NeedExecCopyReservedReg &&
1992 "Whole wave functions can use the reg mapped for their i1 argument");
1993
1994 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
1995 for (MCRegister Reg :
1996 AMDGPU::VGPR_32RegClass.getRegisters().take_front(NumArchVGPRs))
1997 if (MF.getRegInfo().isPhysRegModified(Reg)) {
1998 MFI->reserveWWMRegister(Reg);
1999 MF.begin()->addLiveIn(Reg);
2000 }
2001 MF.begin()->sortUniqueLiveIns();
2002 }
2003
2004 // Remove any VGPRs used in the return value because these do not need to be saved.
2005 // This prevents CSR restore from clobbering return VGPRs.
2006 if (ReturnMI) {
2007 for (auto &Op : ReturnMI->operands()) {
2008 if (Op.isReg())
2009 SavedVGPRs.reset(Op.getReg());
2010 }
2011 }
2012
2013 // Create the stack objects for WWM registers now.
2014 for (Register Reg : MFI->getWWMReservedRegs()) {
2015 const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
2016 MFI->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
2017 TRI->getSpillAlign(*RC));
2018 }
2019
2020 // Ignore the SGPRs the default implementation found.
2021 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
2022
2023 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
2024 // In gfx908 there were no AGPR loads and stores, and thus spilling also
2025 // requires a temporary VGPR.
2026 if (!ST.hasGFX90AInsts())
2027 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
2028
2029 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
2030
2031 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
2032 // allow the default insertion to handle them.
2033 for (auto &Reg : MFI->getWWMSpills())
2034 SavedVGPRs.reset(Reg.first);
2035}
2036
2038 BitVector &SavedRegs,
2039 RegScavenger *RS) const {
2042 if (MFI->isEntryFunction())
2043 return;
2044
2045 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2046 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2047
2048 // The SP is specifically managed and we don't want extra spills of it.
2049 SavedRegs.reset(MFI->getStackPtrOffsetReg());
2050
2051 const BitVector AllSavedRegs = SavedRegs;
2052 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
2053
2054 // We have to anticipate introducing CSR VGPR spills or spill of caller
2055 // save VGPR reserved for SGPR spills as we now always create stack entry
2056 // for it, if we don't have any stack objects already, since we require a FP
2057 // if there is a call and stack. We will allocate a VGPR for SGPR spills if
2058 // there are any SGPR spills. Whether they are CSR spills or otherwise.
2059 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
2060 const bool WillHaveFP =
2061 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
2062
2063 // FP will be specially managed like SP.
2064 if (WillHaveFP || hasFP(MF))
2065 SavedRegs.reset(MFI->getFrameOffsetReg());
2066
2067 // Return address use with return instruction is hidden through the SI_RETURN
2068 // pseudo. Given that and since the IPRA computes actual register usage and
2069 // does not use CSR list, the clobbering of return address by function calls
2070 // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
2071 // usage collection. This will ensure save/restore of return address happens
2072 // in those scenarios.
2073 const MachineRegisterInfo &MRI = MF.getRegInfo();
2074 Register RetAddrReg = TRI->getReturnAddressReg(MF);
2075 if (!MFI->isEntryFunction() &&
2076 (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
2077 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
2078 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
2079 }
2080}
2081
2083 const GCNSubtarget &ST,
2084 std::vector<CalleeSavedInfo> &CSI) {
2086 MachineFrameInfo &MFI = MF.getFrameInfo();
2087 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2088
2089 assert(
2090 llvm::is_sorted(CSI,
2091 [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
2092 return A.getReg() < B.getReg();
2093 }) &&
2094 "Callee saved registers not sorted");
2095
2096 auto CanUseBlockOps = [&](const CalleeSavedInfo &CSI) {
2097 return !CSI.isSpilledToReg() &&
2098 TRI->getPhysRegBaseClass(CSI.getReg()) == &AMDGPU::VGPR_32RegClass &&
2099 !FuncInfo->isWWMReservedRegister(CSI.getReg());
2100 };
2101
2102 auto CSEnd = CSI.end();
2103 for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
2104 Register Reg = CSIt->getReg();
2105 if (!CanUseBlockOps(*CSIt))
2106 continue;
2107
2108 // Find all the regs that will fit in a 32-bit mask starting at the current
2109 // reg and build said mask. It should have 1 for every register that's
2110 // included, with the current register as the least significant bit.
2111 uint32_t Mask = 1;
2112 CSEnd = std::remove_if(
2113 CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CSI) -> bool {
2114 if (CanUseBlockOps(CSI) && CSI.getReg() < Reg + 32) {
2115 Mask |= 1 << (CSI.getReg() - Reg);
2116 return true;
2117 } else {
2118 return false;
2119 }
2120 });
2121
2122 const TargetRegisterClass *BlockRegClass = TRI->getRegClassForBlockOp(MF);
2123 Register RegBlock =
2124 TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
2125 if (!RegBlock) {
2126 // We couldn't find a super register for the block. This can happen if
2127 // the register we started with is too high (e.g. v232 if the maximum is
2128 // v255). We therefore try to get the last register block and figure out
2129 // the mask from there.
2130 Register LastBlockStart =
2131 AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
2132 RegBlock =
2133 TRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0, BlockRegClass);
2134 assert(RegBlock && TRI->isSubRegister(RegBlock, Reg) &&
2135 "Couldn't find super register");
2136 int RegDelta = Reg - LastBlockStart;
2137 assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
2138 "Bad shift amount");
2139 Mask <<= RegDelta;
2140 }
2141
2142 FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);
2143
2144 // The stack objects can be a bit smaller than the register block if we know
2145 // some of the high bits of Mask are 0. This may happen often with calling
2146 // conventions where the caller and callee-saved VGPRs are interleaved at
2147 // a small boundary (e.g. 8 or 16).
2148 int UnusedBits = llvm::countl_zero(Mask);
2149 unsigned BlockSize = TRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
2150 int FrameIdx =
2151 MFI.CreateStackObject(BlockSize, TRI->getSpillAlign(*BlockRegClass),
2152 /*isSpillSlot=*/true);
2153 MFI.setIsCalleeSavedObjectIndex(FrameIdx, true);
2154
2155 CSIt->setFrameIdx(FrameIdx);
2156 CSIt->setReg(RegBlock);
2157 }
2158 CSI.erase(CSEnd, CSI.end());
2159}
2160
2163 std::vector<CalleeSavedInfo> &CSI) const {
2164 if (CSI.empty())
2165 return true; // Early exit if no callee saved registers are modified!
2166
2167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2168 bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
2169
2170 if (UseVGPRBlocks)
2171 assignSlotsUsingVGPRBlocks(MF, ST, CSI);
2172
2173 return assignCalleeSavedSpillSlotsImpl(MF, TRI, CSI) || UseVGPRBlocks;
2174}
2175
2178 std::vector<CalleeSavedInfo> &CSI) const {
2179 if (CSI.empty())
2180 return true; // Early exit if no callee saved registers are modified!
2181
2182 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2183 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2184 const SIRegisterInfo *RI = ST.getRegisterInfo();
2185 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
2186 Register BasePtrReg = RI->getBaseRegister();
2187 Register SGPRForFPSaveRestoreCopy =
2188 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
2189 Register SGPRForBPSaveRestoreCopy =
2190 FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
2191 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
2192 return false;
2193
2194 unsigned NumModifiedRegs = 0;
2195
2196 if (SGPRForFPSaveRestoreCopy)
2197 NumModifiedRegs++;
2198 if (SGPRForBPSaveRestoreCopy)
2199 NumModifiedRegs++;
2200
2201 for (auto &CS : CSI) {
2202 if (CS.getReg() == FramePtrReg.asMCReg() && SGPRForFPSaveRestoreCopy) {
2203 CS.setDstReg(SGPRForFPSaveRestoreCopy);
2204 if (--NumModifiedRegs)
2205 break;
2206 } else if (CS.getReg() == BasePtrReg.asMCReg() &&
2207 SGPRForBPSaveRestoreCopy) {
2208 CS.setDstReg(SGPRForBPSaveRestoreCopy);
2209 if (--NumModifiedRegs)
2210 break;
2211 }
2212 }
2213
2214 return false;
2215}
2216
2218 const MachineFunction &MF) const {
2219
2220 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2221 const MachineFrameInfo &MFI = MF.getFrameInfo();
2222 const SIInstrInfo *TII = ST.getInstrInfo();
2223 uint64_t EstStackSize = MFI.estimateStackSize(MF);
2224 uint64_t MaxOffset = EstStackSize - 1;
2225
2226 // We need the emergency stack slots to be allocated in range of the
2227 // MUBUF/flat scratch immediate offset from the base register, so assign these
2228 // first at the incoming SP position.
2229 //
2230 // TODO: We could try sorting the objects to find a hole in the first bytes
2231 // rather than allocating as close as possible. This could save a lot of space
2232 // on frames with alignment requirements.
2233 if (ST.hasFlatScratchEnabled()) {
2234 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
2236 return false;
2237 } else {
2238 if (TII->isLegalMUBUFImmOffset(MaxOffset))
2239 return false;
2240 }
2241
2242 return true;
2243}
2244
2245/// Return the set of all root registers of regunits live-in to @p MBB.
2246///
2247/// Intended to avoid using the expensive @c MCRegAliasIterator when deciding
2248/// if a register to be spilled is already live-in (see @c isAnyRootLiveIn).
2250 const SIRegisterInfo &TRI) {
2251 SparseBitVector<> LiveInRoots;
2252 for (const auto &LI : MBB.liveins()) {
2253 for (MCRegUnitMaskIterator MI(LI.PhysReg, &TRI); MI.isValid(); ++MI) {
2254 auto [Unit, UnitLaneMask] = *MI;
2255 if ((LI.LaneMask & UnitLaneMask).none())
2256 continue;
2257 for (MCRegUnitRootIterator RI(Unit, &TRI); RI.isValid(); ++RI)
2258 LiveInRoots.set(*RI);
2259 }
2260 }
2261 return LiveInRoots;
2262}
2263
2264/// Returns true iff any root of @p Reg is in @p LiveInRoots
2265/// (see @c buildLiveInRoots).
2266static bool isAnyRootLiveIn(const SparseBitVector<> &LiveInRoots,
2267 const SIRegisterInfo &TRI, MCRegister Reg) {
2268 for (MCRegUnitIterator UI(Reg, &TRI); UI.isValid(); ++UI) {
2269 for (MCRegUnitRootIterator RI(*UI, &TRI); RI.isValid(); ++RI) {
2270 if (LiveInRoots.test(*RI))
2271 return true;
2272 }
2273 }
2274 return false;
2275}
2276
2277void SIFrameLowering::spillCalleeSavedRegisterWithoutBlockOps(
2279 const CalleeSavedInfo &CS, const SIInstrInfo *TII,
2280 const SIRegisterInfo &TRI,
2281 const std::optional<SparseBitVector<>> &LiveInRoots) const {
2282 MCRegister Reg = CS.getReg();
2283
2284 // We assume a sortUniqueLiveIns later
2285 MBB.addLiveIn(Reg);
2286
2287 if (CS.isSpilledToReg()) {
2288 BuildMI(MBB, MI, DebugLoc(), TII->get(TargetOpcode::COPY), CS.getDstReg())
2289 .addReg(Reg, getKillRegState(true));
2290 } else {
2291 const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
2292 bool IsKill = true;
2293 // If this value was already livein, we probably have a direct use of
2294 // the incoming register value, so don't kill at the spill point. This
2295 // happens since we pass some special inputs (workgroup IDs) in the
2296 // callee saved range.
2297 if (LiveInRoots)
2298 IsKill = !isAnyRootLiveIn(*LiveInRoots, TRI, Reg);
2299 TII->storeRegToStackSlotCFI(MBB, MI, Reg, IsKill, CS.getFrameIdx(), RC);
2300 }
2301}
2302
2305 ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *OrigTRI) const {
2306 auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
2307 MachineFunction *MF = MBB.getParent();
2308 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2309 const SIInstrInfo *TII = ST.getInstrInfo();
2310
2311 std::optional<SparseBitVector<>> LiveInRoots;
2312 if (MBB.getParent()->getRegInfo().tracksLiveness())
2313 LiveInRoots = buildLiveInRoots(MBB, TRI);
2314
2315 if (!ST.useVGPRBlockOpsForCSR()) {
2316 for (const CalleeSavedInfo &CS : CSI)
2317 spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
2318 LiveInRoots);
2319 if (LiveInRoots)
2320 MBB.sortUniqueLiveIns();
2321 return true;
2322 }
2323
2324 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2326
2327 const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(*MF);
2328 for (const CalleeSavedInfo &CS : CSI) {
2329 Register Reg = CS.getReg();
2330 if (!BlockRegClass->contains(Reg) ||
2331 !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
2332 spillCalleeSavedRegisterWithoutBlockOps(MBB, MI, CS, TII, TRI,
2333 LiveInRoots);
2334 continue;
2335 }
2336
2337 // Build a scratch block store.
2338 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
2339 int FrameIndex = CS.getFrameIdx();
2340 MachinePointerInfo PtrInfo =
2341 MachinePointerInfo::getFixedStack(*MF, FrameIndex);
2342 MachineMemOperand *MMO =
2344 FrameInfo.getObjectSize(FrameIndex),
2345 FrameInfo.getObjectAlign(FrameIndex));
2346
2347 BuildMI(MBB, MI, MI->getDebugLoc(),
2348 TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE))
2349 .addReg(Reg, getKillRegState(false))
2350 .addFrameIndex(FrameIndex)
2351 .addReg(FuncInfo->getStackPtrOffsetReg())
2352 .addImm(0)
2353 .addImm(Mask)
2354 .addMemOperand(MMO);
2355
2356 FuncInfo->setHasSpilledVGPRs();
2357
2358 // Add the register to the liveins. This is necessary because if any of the
2359 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2360 // then the whole block will be marked as reserved and `updateLiveness` will
2361 // skip it.
2362 if (LiveInRoots)
2363 MBB.addLiveIn(Reg);
2364 }
2365 if (LiveInRoots)
2366 MBB.sortUniqueLiveIns();
2367
2368 return true;
2369}
2370
2374 const TargetRegisterInfo *OrigTRI) const {
2375 auto &TRI = *static_cast<const SIRegisterInfo *>(OrigTRI);
2376 MachineFunction *MF = MBB.getParent();
2377 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2378 if (!ST.useVGPRBlockOpsForCSR())
2379 return false;
2380
2382 MachineFrameInfo &MFI = MF->getFrameInfo();
2383 const SIInstrInfo *TII = ST.getInstrInfo();
2384 const TargetRegisterClass *BlockRegClass = TRI.getRegClassForBlockOp(*MF);
2385 for (const CalleeSavedInfo &CS : reverse(CSI)) {
2386 Register Reg = CS.getReg();
2387 if (!BlockRegClass->contains(Reg) ||
2388 !FuncInfo->hasMaskForVGPRBlockOps(Reg)) {
2390 continue;
2391 }
2392
2393 // Build a scratch block load.
2394 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
2395 int FrameIndex = CS.getFrameIdx();
2396 MachinePointerInfo PtrInfo =
2397 MachinePointerInfo::getFixedStack(*MF, FrameIndex);
2399 PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
2400 MFI.getObjectAlign(FrameIndex));
2401
2402 auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
2403 TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), Reg)
2404 .addFrameIndex(FrameIndex)
2405 .addReg(FuncInfo->getStackPtrOffsetReg())
2406 .addImm(0)
2407 .addImm(Mask)
2408 .addMemOperand(MMO);
2409 TRI.addImplicitUsesForBlockCSRLoad(MIB, Reg);
2410
2411 // Add the register to the liveins. This is necessary because if any of the
2412 // VGPRs in the register block is reserved (e.g. if it's a WWM register),
2413 // then the whole block will be marked as reserved and `updateLiveness` will
2414 // skip it.
2415 MBB.addLiveIn(Reg);
2416 }
2417
2418 MBB.sortUniqueLiveIns();
2419 return true;
2420}
2421
2423 MachineFunction &MF,
2426 int64_t Amount = I->getOperand(0).getImm();
2427 if (Amount == 0)
2428 return MBB.erase(I);
2429
2430 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2431 const SIInstrInfo *TII = ST.getInstrInfo();
2432 const DebugLoc &DL = I->getDebugLoc();
2433 unsigned Opc = I->getOpcode();
2434 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
2435 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
2436
2437 if (!hasReservedCallFrame(MF)) {
2438 Amount = alignTo(Amount, getStackAlign());
2439 assert(isUInt<32>(Amount) && "exceeded stack address space size");
2442
2443 Amount *= getScratchScaleFactor(ST);
2444 if (IsDestroy)
2445 Amount = -Amount;
2446 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
2447 .addReg(SPReg)
2448 .addImm(Amount);
2449 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
2450 } else if (CalleePopAmount != 0) {
2451 llvm_unreachable("is this used?");
2452 }
2453
2454 return MBB.erase(I);
2455}
2456
2457/// Returns true if the frame will require a reference to the stack pointer.
2458///
2459/// This is the set of conditions common to setting up the stack pointer in a
2460/// kernel, and for using a frame pointer in a callable function.
2461///
2462/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
2463/// references SP.
2465 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
2466}
2467
2468// The FP for kernels is always known 0, so we never really need to setup an
2469// explicit register for it. However, DisableFramePointerElim will force us to
2470// use a register for it.
2472 const MachineFrameInfo &MFI = MF.getFrameInfo();
2473
2474 // For entry functions we can use an immediate offset in most cases,
2475 // so the presence of calls doesn't imply we need a distinct frame pointer.
2476 if (MFI.hasCalls() &&
2478 // All offsets are unsigned, so need to be addressed in the same direction
2479 // as stack growth.
2480
2481 // FIXME: This function is pretty broken, since it can be called before the
2482 // frame layout is determined or CSR spills are inserted.
2483 return MFI.getStackSize() != 0;
2484 }
2485
2486 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
2487 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
2488 MF) ||
2491}
2492
2494 const MachineFunction &MF) const {
2495 return MF.getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() &&
2498}
2499
2500// This is essentially a reduced version of hasFP for entry functions. Since the
2501// stack pointer is known 0 on entry to kernels, we never really need an FP
2502// register. We may need to initialize the stack pointer depending on the frame
2503// properties, which logically overlaps many of the cases where an ordinary
2504// function would require an FP.
2506 const MachineFunction &MF) const {
2507 // Callable functions always require a stack pointer reference.
2509 "only expected to call this for entry points functions");
2510
2511 const MachineFrameInfo &MFI = MF.getFrameInfo();
2512
2513 // Entry points ordinarily don't need to initialize SP. We have to set it up
2514 // for callees if there are any. Also note tail calls are only possible via
2515 // the `llvm.amdgcn.cs.chain` intrinsic.
2516 if (MFI.hasCalls() || MFI.hasTailCall())
2517 return true;
2518
2519 // We still need to initialize the SP if we're doing anything weird that
2520 // references the SP, like variable sized stack objects.
2521 return frameTriviallyRequiresSP(MFI);
2522}
2523
2526 const DebugLoc &DL,
2527 const MCCFIInstruction &CFIInst,
2528 MachineInstr::MIFlag Flag) const {
2529 MachineFunction &MF = *MBB.getParent();
2530 const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
2531 return BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
2532 .addCFIIndex(MF.addFrameInst(CFIInst))
2533 .setMIFlag(Flag);
2534}
2535
2538 const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const {
2539 MachineFunction &MF = *MBB.getParent();
2540 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2541 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2542
2543 MCRegister MaskReg = MCRI.getDwarfRegNum(
2544 ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false);
2546 nullptr, MCRI.getDwarfRegNum(Reg, false),
2547 MCRI.getDwarfRegNum(RegCopy, false), VGPRLaneBitSize, MaskReg,
2548 ST.getWavefrontSize());
2549 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2550}
2551
2554 const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR,
2555 const int Lane) const {
2556 const MachineFunction &MF = *MBB.getParent();
2557 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2558
2559 int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
2560 int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
2561 assert(DwarfSGPR != -1 && DwarfVGPR != -1);
2562 assert(Lane != -1 && "Expected a lane to be present");
2563
2564 // Build a CFI instruction that represents a SGPR spilled to a single lane of
2565 // a VGPR.
2567 unsigned(Lane), VGPRLaneBitSize};
2568 auto CFIInst =
2569 MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR, {VR});
2570 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2571}
2572
2575 const DebugLoc &DL, MCRegister SGPR,
2576 ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills) const {
2577 if (VGPRSpills.size() == 1u)
2578 return buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, SGPR, VGPRSpills[0].VGPR,
2579 VGPRSpills[0].Lane);
2580 const MachineFunction &MF = *MBB.getParent();
2581 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2582
2583 int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
2584 assert(DwarfSGPR != -1);
2585
2586 // Build a CFI instruction that represents a SGPR spilled to multiple lanes of
2587 // multiple VGPRs.
2588
2590 for (SIRegisterInfo::SpilledReg Spill : VGPRSpills) {
2591 int DwarfVGPR = MCRI.getDwarfRegNum(Spill.VGPR, false);
2592 assert(DwarfVGPR != -1);
2593 assert(Spill.hasLane() && "Expected a lane to be present");
2594 VGPRs.push_back(
2595 {unsigned(DwarfVGPR), unsigned(Spill.Lane), VGPRLaneBitSize});
2596 }
2597
2598 auto CFIInst = MCCFIInstruction::createLLVMVectorRegisters(nullptr, DwarfSGPR,
2599 std::move(VGPRs));
2600 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2601}
2602
2605 const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const {
2606 MachineFunction &MF = *MBB.getParent();
2607 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2608 return buildCFI(MBB, MBBI, DL,
2610 nullptr, MCRI.getDwarfRegNum(SGPR, false), Offset));
2611}
2612
2615 const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const {
2616 const MachineFunction &MF = *MBB.getParent();
2617 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2618 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2619
2620 int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
2621 assert(DwarfVGPR != -1);
2622
2623 MCRegister MaskReg = MCRI.getDwarfRegNum(
2624 ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false);
2626 nullptr, DwarfVGPR, VGPRLaneBitSize, MaskReg, ST.getWavefrontSize(),
2627 Offset);
2628 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2629}
2630
2633 const DebugLoc &DL, const MCRegister Reg, const MCRegister SGPRPair) const {
2634 const MachineFunction &MF = *MBB.getParent();
2635 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2636 const SIRegisterInfo &TRI = *ST.getRegisterInfo();
2637
2638 MCRegister SGPR0 = TRI.getSubReg(SGPRPair, AMDGPU::sub0);
2639 MCRegister SGPR1 = TRI.getSubReg(SGPRPair, AMDGPU::sub1);
2640
2641 int DwarfReg = TRI.getDwarfRegNum(Reg, false);
2642 int DwarfSGPR0 = TRI.getDwarfRegNum(SGPR0, false);
2643 int DwarfSGPR1 = TRI.getDwarfRegNum(SGPR1, false);
2644 assert(DwarfReg != -1 && DwarfSGPR0 != -1 && DwarfSGPR1 != -1);
2645
2647 nullptr, DwarfReg, DwarfSGPR0, SGPRBitSize, DwarfSGPR1, SGPRBitSize);
2648 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2649}
2650
2653 const DebugLoc &DL, MCRegister Reg) const {
2654 const MachineFunction &MF = *MBB.getParent();
2655 const MCRegisterInfo &MCRI = *MF.getContext().getRegisterInfo();
2656 int DwarfReg = MCRI.getDwarfRegNum(Reg, /*isEH=*/false);
2657 auto CFIInst = MCCFIInstruction::createSameValue(nullptr, DwarfReg);
2658 return buildCFI(MBB, MBBI, DL, std::move(CFIInst));
2659}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains constants used for implementing Dwarf debug support.
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
A set of register units.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static constexpr MCPhysReg FPReg
static constexpr MCPhysReg SPReg
This file declares the machine register scavenger class.
static void buildEpilogRestore(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff=0)
static cl::opt< bool > EnableSpillVGPRToAGPR("amdgpu-spill-vgpr-to-agpr", cl::desc("Enable spilling VGPRs to AGPRs"), cl::ReallyHidden, cl::init(true))
static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR, const TargetRegisterClass &RC=AMDGPU::SReg_32_XM0_XEXECRegClass, bool IncludeScratchCopy=true)
Query target location for spilling SGPRs. IncludeScratchCopy: also look for free scratch SGPRs.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, const SIInstrInfo *TII, Register TargetReg)
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI)
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &FuncInfo, LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SpillReg, int FI, Register FrameReg, int64_t DwordOff=0)
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool IsProlog, bool EnableInactiveLanes)
static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS)
static constexpr unsigned SGPRBitSize
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI)
Returns true if the frame will require a reference to the stack pointer.
static SparseBitVector buildLiveInRoots(const MachineBasicBlock &MBB, const SIRegisterInfo &TRI)
Return the set of all root registers of regunits live-in to MBB.
static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI, const SIMachineFunctionInfo *FuncInfo, MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsProlog)
static constexpr unsigned VGPRLaneBitSize
static bool allSGPRSpillsAreDead(const MachineFunction &MF)
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits, const TargetRegisterClass &RC, bool Unused=false)
static MCCFIInstruction createScaledCFAInPrivateWave(const GCNSubtarget &ST, MCRegister DwarfStackPtrReg)
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI, const LiveRegUnits &LiveUnits, const TargetRegisterClass &RC)
static constexpr unsigned SGPRByteSize
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF, const GCNSubtarget &ST, std::vector< CalleeSavedInfo > &CSI)
static bool isAnyRootLiveIn(const SparseBitVector<> &LiveInRoots, const SIRegisterInfo &TRI, MCRegister Reg)
Returns true iff any root of Reg is in LiveInRoots (see buildLiveInRoots).
static unsigned getScratchScaleFactor(const GCNSubtarget &ST)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static const int BlockSize
Definition TarWriter.cpp:33
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
bool test(unsigned Idx) const
Returns true if bit Idx is set.
Definition BitVector.h:482
BitVector & reset()
Reset all bits in the bitvector.
Definition BitVector.h:409
void clearBitsNotInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
Clear a bit in this vector for every '0' bit in Mask.
Definition BitVector.h:748
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
bool any() const
Returns true if any bit is set.
Definition BitVector.h:189
void clearBitsInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
Clear any bits in this vector that are set in Mask.
Definition BitVector.h:736
iterator_range< const_set_bits_iterator > set_bits() const
Definition BitVector.h:159
bool empty() const
Returns whether there are no bits in this bitvector.
Definition BitVector.h:175
The CalleeSavedInfo class tracks the information needed to locate where a callee saved register is in t...
MCRegister getReg() const
MCRegister getDstReg() const
A debug info location.
Definition DebugLoc.h:123
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
const HexagonRegisterInfo & getRegisterInfo() const
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
void init(const TargetRegisterInfo &TRI)
Initialize and clear the set.
void addReg(MCRegister Reg)
Adds register units covered by physical register Reg.
LLVM_ABI void stepBackward(const MachineInstr &MI)
Updates liveness when stepping backwards over the instruction MI.
LLVM_ABI void addLiveOuts(const MachineBasicBlock &MBB)
Adds registers living out of block MBB.
void removeReg(MCRegister Reg)
Removes all register units covered by physical register Reg.
bool empty() const
Returns true if the set is empty.
LLVM_ABI void addLiveIns(const MachineBasicBlock &MBB)
Adds registers living into block MBB.
static MCCFIInstruction createLLVMVectorOffset(MCSymbol *L, unsigned Register, unsigned RegisterSizeInBits, unsigned MaskRegister, unsigned MaskRegisterSizeInBits, int64_t Offset, SMLoc Loc={})
.cfi_llvm_vector_offset Previous value of Register is saved at Offset from CFA.
Definition MCDwarf.h:768
static MCCFIInstruction createUndefined(MCSymbol *L, unsigned Register, SMLoc Loc={})
.cfi_undefined From now on the previous value of Register can't be restored anymore.
Definition MCDwarf.h:703
static MCCFIInstruction createLLVMVectorRegisters(MCSymbol *L, unsigned Register, ArrayRef< VectorRegisterWithLane > VectorRegisters, SMLoc Loc={})
.cfi_llvm_vector_registers Previous value of Register is saved in lanes of vector registers.
Definition MCDwarf.h:758
static MCCFIInstruction createLLVMVectorRegisterMask(MCSymbol *L, unsigned Register, unsigned SpillRegister, unsigned SpillRegisterLaneSizeInBits, unsigned MaskRegister, unsigned MaskRegisterSizeInBits, SMLoc Loc={})
.cfi_llvm_vector_register_mask Previous value of Register is saved in SpillRegister,...
Definition MCDwarf.h:779
static MCCFIInstruction createRegister(MCSymbol *L, unsigned Register1, unsigned Register2, SMLoc Loc={})
.cfi_register Previous value of Register1 is saved in register Register2.
Definition MCDwarf.h:672
static MCCFIInstruction createOffset(MCSymbol *L, unsigned Register, int64_t Offset, SMLoc Loc={})
.cfi_offset Previous value of Register is saved at offset Offset from CFA.
Definition MCDwarf.h:657
static MCCFIInstruction createLLVMRegisterPair(MCSymbol *L, unsigned Register, unsigned R1, unsigned R1SizeInBits, unsigned R2, unsigned R2SizeInBits, SMLoc Loc={})
.cfi_llvm_register_pair Previous value of Register is saved in R1:R2.
Definition MCDwarf.h:748
static MCCFIInstruction createEscape(MCSymbol *L, StringRef Vals, SMLoc Loc={}, StringRef Comment="")
.cfi_escape Allows the user to add arbitrary bytes to the unwind info.
Definition MCDwarf.h:727
static MCCFIInstruction createSameValue(MCSymbol *L, unsigned Register, SMLoc Loc={})
.cfi_same_value Current value of Register is the same as in the previous frame.
Definition MCDwarf.h:710
const MCRegisterInfo * getRegisterInfo() const
Definition MCContext.h:411
Describe properties that are true of each instruction in the target description file.
bool isValid() const
Returns true if this iterator is not yet at the end.
MCRegUnitMaskIterator enumerates a list of register units and their associated lane masks for Reg.
MCRegUnitRootIterator enumerates the root registers of a register unit.
bool isValid() const
Check if the iterator is at the end of the list.
MCRegisterInfo base class - We assume that the target defines a static array of MCRegisterDesc object...
virtual int64_t getDwarfRegNum(MCRegister Reg, bool isEH) const
Map a target register to an equivalent dwarf register number.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasVarSizedObjects() const
This method may be called any time after instruction selection is complete to determine if the stack ...
uint64_t getStackSize() const
Return the number of bytes that must be allocated to hold all of the fixed size frame objects.
bool hasCalls() const
Return true if the current function has any function calls.
bool isFrameAddressTaken() const
This method may be called any time after instruction selection is complete to determine if there is a...
Align getMaxAlign() const
Return the alignment in bytes that this function must be aligned to, which is greater than the defaul...
bool hasPatchPoint() const
This method may be called any time after instruction selection is complete to determine if there is a...
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
bool hasTailCall() const
Returns true if the function contains a tail call.
bool hasStackMap() const
This method may be called any time after instruction selection is complete to determine if there is a...
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
uint8_t getStackID(int ObjectIdx) const
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getObjectIndexBegin() const
Return the minimum frame object index.
bool isDeadObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a dead object.
unsigned addFrameInst(const MCCFIInstruction &Inst)
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
bool needsFrameMoves() const
True if this function needs frame moves for debug or exceptions.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const MachineBasicBlock & front() const
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addCFIIndex(unsigned CFIIndex) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlag(MachineInstr::MIFlag Flag) const
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
Representation of each machine instruction.
mop_range operands()
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
void setIsDead(bool Val=true)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
LLVM_ABI const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
void reserveReg(MCRegister PhysReg, const TargetRegisterInfo *TRI)
reserveReg – Mark a register as reserved so checks like isAllocatable will not suggest using it.
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI bool isPhysRegModified(MCRegister PhysReg, bool SkipNoReturnDef=false) const
Return true if the specified register is modified in this function.
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
PrologEpilogSGPRSpillBuilder(Register Reg, const PrologEpilogSGPRSaveRestoreInfo SI, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, const SIInstrInfo *TII, const SIRegisterInfo &TRI, LiveRegUnits &LiveUnits, Register FrameReg, bool IsFramePtrPrologSpill=false)
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs, bool NeedExecCopyReservedReg) const
MachineInstr * buildCFIForSGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const
Create a CFI index describing a spill of a SGPR to VMEM and build a MachineInstr around it.
void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy) const
StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override
getFrameIndexReference - This method should return the base register and offset used to reference a f...
void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS=nullptr) const override
processFunctionBeforeFrameFinalized - This method is called immediately before the specified function...
bool mayReserveScratchForCWSR(const MachineFunction &MF) const
bool allocateScavengingFrameIndexesNearIncomingSP(const MachineFunction &MF) const override
Control the placement of special register scavenging spill slots when allocating a stack frame.
bool requiresStackPointerReference(const MachineFunction &MF) const
void emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const override
This method determines which of the registers reported by TargetRegisterInfo::getCalleeSavedRegs() sh...
bool hasFPImpl(const MachineFunction &MF) const override
bool assignCalleeSavedSpillSlotsImpl(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector< CalleeSavedInfo > &CSI) const
MachineInstr * buildCFIForVRegToVRegSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const
Create a CFI index describing a spill of the VGPR/AGPR Reg to another VGPR/AGPR RegCopy and build a M...
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef< CalleeSavedInfo > CSI, const TargetRegisterInfo *TRI) const override
spillCalleeSavedRegisters - Issues instruction(s) to spill all callee saved registers and returns tru...
MachineInstr * buildCFIForRegToSGPRPairSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister Reg, MCRegister SGPRPair) const
MachineInstr * buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const
Create a CFI index describing a spill of a VGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR, const int Lane) const
Create a CFI index describing a spill of an SGPR to a single lane of a VGPR and build a MachineInstr ...
bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector< CalleeSavedInfo > &CSI) const override
assignCalleeSavedSpillSlots - Allows target to override spill slot assignment logic.
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override
MachineInstr * buildCFIForSameValue(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister Reg) const
MachineInstr * buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst, MachineInstr::MIFlag flag=MachineInstr::FrameSetup) const
Create a CFI index for CFIInst and build a MachineInstr around it.
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, LiveRegUnits &LiveUnits, Register FrameReg, Register FramePtrRegScratchCopy, const bool NeedsFrameMoves) const
void processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, RegScavenger *RS=nullptr) const override
processFunctionBeforeFrameIndicesReplaced - This method is called immediately before MO_FrameIndex op...
bool isSupportedStackID(TargetStackID::Value ID) const override
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override
emitPrologue/emitEpilogue - These methods insert prolog and epilog code into the function.
MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
This method is called during prolog/epilog code insertion to eliminate call frame setup and destroy p...
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef< CalleeSavedInfo > CSI, const TargetRegisterInfo *TRI) const override
restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee saved registers and returns...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< PrologEpilogSGPRSpill > getPrologEpilogSGPRSpills() const
const WWMSpillsMap & getWWMSpills() const
void getAllScratchSGPRCopyDstRegs(SmallVectorImpl< Register > &Regs) const
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
void removePrologEpilogSGPRSpillEntry(Register Reg)
void shiftWwmVGPRsToLowestRange(MachineFunction &MF, SmallVectorImpl< Register > &WWMVGPRs, BitVector &SavedVGPRs)
void setMaskForVGPRBlockOps(Register RegisterBlock, uint32_t Mask)
GCNUserSGPRUsageInfo & getUserSGPRInfo()
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size=4, Align Alignment=Align(4))
void setVGPRToAGPRSpillDead(int FrameIndex)
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI)
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
bool hasMaskForVGPRBlockOps(Register RegisterBlock) const
bool hasPrologEpilogSGPRSpillEntry(Register Reg) const
Register getGITPtrLoReg(const MachineFunction &MF) const
void setVGPRForAGPRCopy(Register NewVGPRForAGPRCopy)
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR)
Reserve AGPRs or VGPRs to support spilling for FrameIndex FI.
void splitWWMSpillRegisters(MachineFunction &MF, SmallVectorImpl< std::pair< Register, int > > &CalleeSavedRegs, SmallVectorImpl< std::pair< Register, int > > &ScratchRegs) const
bool isWWMReservedRegister(Register Reg) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool SpillToPhysVGPRLane=false, bool IsPrologEpilog=false)
void setLongBranchReservedReg(Register Reg)
void setHasSpilledVGPRs(bool Spill=true)
bool removeDeadFrameIndices(MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs)
If ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill to the default stack.
void setScratchReservedForDynamicVGPRs(unsigned SizeInBytes)
MCRegister getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const
bool checkIndexInPrologEpilogSGPRSpills(int FI) const
const ReservedRegSet & getWWMReservedRegs() const
const PrologEpilogSGPRSaveRestoreInfo & getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const
void setIsStackRealigned(bool Realigned=true)
void addToPrologEpilogSGPRSpills(Register Reg, PrologEpilogSGPRSaveRestoreInfo SI)
Register getScratchSGPRCopyDstReg(Register Reg) const
Register getFrameRegister(const MachineFunction &MF) const override
Represents a location in source code.
Definition SMLoc.h:22
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void set(unsigned Idx)
bool test(unsigned Idx) const
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
int64_t getFixed() const
Returns the fixed component of the stack.
Definition TypeSize.h:46
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
virtual bool hasReservedCallFrame(const MachineFunction &MF) const
hasReservedCallFrame - Under normal circumstances, when a frame pointer is not required,...
virtual void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS=nullptr) const
This method determines which of the registers reported by TargetRegisterInfo::getCalleeSavedRegs() sh...
void restoreCalleeSavedRegister(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const CalleeSavedInfo &CS, const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const
Align getStackAlign() const
getStackAlign - This method returns the number of bytes to which the stack pointer must be aligne...
TargetOptions Options
LLVM_ABI bool DisableFramePointerElim(const MachineFunction &MF) const
DisableFramePointerElim - This returns true if frame pointer elimination optimization should be disab...
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1731
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
constexpr RegState getKillRegState(bool B)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
Definition MathExtras.h:546
void clearDebugInfoForSpillFIs(MachineFrameInfo &MFI, MachineBasicBlock &MBB, const BitVector &SpillFIs)
Replace frame index operands with null registers in debug value instructions for the specified spill ...
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1635
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
auto make_first_range(ContainerTy &&c)
Given a container of pairs, return a range over the first elements.
Definition STLExtras.h:1398
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool is_sorted(R &&Range, Compare C)
Wrapper function around std::is_sorted to check if elements in a range R are sorted with respect to a...
Definition STLExtras.h:1969
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
@ And
Bitwise or logical AND of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2018
unsigned encodeULEB128(uint64_t Value, raw_ostream &OS, unsigned PadTo=0)
Utility function to encode a ULEB128 value to an output stream.
Definition LEB128.h:79
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.