//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveUnits which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
// MCRegisters. This should reduce the number of iterations and avoid redundant
// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LiveRegUnits &LiveUnits,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
        !MRI.isReserved(Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(
    MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
    const TargetRegisterClass &RC, bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveUnits.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveUnits, RC);

  for (MCRegister Reg : RC) {
    if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
      return Reg;
  }

  return MCRegister();
}

/// Query target location for spilling SGPRs
/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free scratch SGPR to save the register into, so spill
      // it to a lane of a (possibly newly reserved) VGPR instead.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      FrameInfo.RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
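
// In short, the save strategies chosen above degrade gracefully (illustrative
// summary; the code above is authoritative):
//   COPY_TO_SCRATCH_SGPR -> plain SGPR-to-SGPR copy, the cheapest option;
//   SPILL_TO_VGPR_LANE   -> a v_writelane into a lane of a reserved VGPR;
//   SPILL_TO_MEM         -> scratch memory, staged through a temporary VGPR.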

// We need to specially emit stack operations here because a different frame
// register is used here than the one getFrameRegister would return for the
// rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LiveRegUnits &LiveUnits, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveUnits.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
  if (IsKill)
    LiveUnits.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LiveRegUnits &LiveUnits, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveUnits);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}

static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
                          const SIMachineFunctionInfo *FuncInfo,
                          MachineFunction &MF, MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveUnits.empty()) {
    LiveUnits.init(TRI);
    if (IsProlog) {
      LiveUnits.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveUnits.addLiveOuts(MBB);
      LiveUnits.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the one needed for FP,
// BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    Register TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm

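// A minimal usage sketch for the builder above (illustrative only; see
// emitCSRSpillStores / emitCSRSpillRestores below for the real call sites):
//
//   for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
//     PrologEpilogSGPRSpillBuilder SB(Spill.first, Spill.second, MBB, MBBI,
//                                     DL, TII, TRI, LiveUnits, FrameReg);
//     SB.save(); // In the prologue; SB.restore() in the epilogue.
//   }
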
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo = MachinePointerInfo::getGOT(MF);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0) // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      using namespace AMDGPU::Hwreg;
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_LO, 0, 32)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(HwregEncoding::encode(ID_FLAT_SCR_HI, 0, 32)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
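
// Illustrative result (not emitted verbatim): on a GFX9 target, where flat
// scratch is a pointer, the sequence above amounts to
//   s_add_u32  flat_scratch_lo, <init_lo>, <wave_offset>
//   s_addc_u32 flat_scratch_hi, <init_hi>, 0
// i.e. FLAT_SCR = SPI-provided scratch base + this wave's scratch offset.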

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}

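// Example (illustrative): with MUBUF scratch on a wave64 target, SP and FP
// hold swizzled byte offsets scaled by the wavefront size, so a 16-byte
// per-lane allocation advances SP by 16 * 64 = 1024; with flat scratch the
// scale factor is 1 and the same allocation advances SP by just 16.
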
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

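// In outline, the entry-function prologue built above performs, in order and
// each step only when needed: copy the preloaded scratch wave offset out of
// the way of the SRSRC, materialize SP past the fixed-size stack, zero FP,
// initialize flat scratch, and build the scratch RSRC descriptor.
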
// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo = MachinePointerInfo::getGOT(MF);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0) // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of
    // descriptor / bits 22:21 of third sub-reg will be 0b11)
    // If the shader is actually wave32 we have to modify the const_index_stride
    // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
    // reason the driver does this is that there can be cases where it presents
    // 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo = MachinePointerInfo::getGOT(MF);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveUnits, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveUnits.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

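// For example (wave64, illustrative): with EnableInactiveLanes set this emits
//   s_xor_saveexec_b64 sN, -1   ; save exec, then run only the inactive lanes
// and otherwise
//   s_or_saveexec_b64  sN, -1   ; save exec, then run all lanes
// The caller restores exec afterwards with an s_mov from the returned copy.
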
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill:
    // Skip if FP is saved to a scratch SGPR; the save has already been
    // emitted. Otherwise, FP has been moved to a temporary register, so spill
    // that instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise,
    // restore the FP value into a temporary register. The frame pointer should
    // be overwritten only at the end, when all other spills are restored from
    // the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due
  // to this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}

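// Illustrative shape of the prologue emitted above for a non-entry function
// that needs stack realignment (wave64, MUBUF scratch; registers and
// constants vary):
//   s_mov_b32  sX, s33                   ; save the old FP in a scratch SGPR
//   s_add_i32  s33, s32, (Align-1)*64    ; step past the alignment padding
//   s_and_b32  s33, s33, -(Align*64)     ; round down to the realigned FP
//   ...CSR spill stores, based off s33...
//   s_add_i32  s32, s32, RoundedSize*64  ; allocate the frame
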
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not set, restore the previous value of FP
    // into a new scratch register, and copy it to FP later when the other
    // registers are restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(-static_cast<int64_t>(RoundedSize *
                                                 getScratchScaleFactor(ST)))
                   .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    assert(SrcReg);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      LiveUnits.removeReg(SGPRForFPSaveRestoreCopy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  // For chain functions, we only need to do this if we have calls to
  // llvm.amdgcn.cs.chain.
  bool IsChainWithoutCalls =
      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() &&
                               FuncInfo->hasSpilledVGPRs() &&
                               EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead update them with the
        // correct register value, but it is not clear that the register value
        // alone is enough to describe the spilled variable.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

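// For example (illustrative): on targets with MAI instructions, a VGPR spill
// that would otherwise go to scratch memory can be rewritten by the code
// above into AGPR traffic, roughly
//   v_accvgpr_write_b32 aN, vM   ; spill
//   v_accvgpr_read_b32  vM, aN   ; reload
// which is why the now-unused scratch slots can often be deleted outright.
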
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved the highest available VGPR for
    // AGPR copies. Now that we are done with RA, check whether there is an
    // unused VGPR lower than the one reserved before RA. If one exists, use
    // it for the AGPR copy instead of the one reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR (for AGPR copy).
      // Reserved registers should already be frozen at this point,
      // so we can avoid calling MRI.freezeReservedRegs and just use
      // MRI.reserveReg.
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches;
  // now, after RA, we shift down to a lower unused one if one exists.
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with, so there is nothing to
  // shift down. Then if UnusedLowSGPR is null, there isn't an available lower
  // register to use, so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}

// The special SGPR spills, like the ones needed for FP, BP or any reserved
// registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  Register ReservedRegForExecCopy = MFI->getSGPRForEXECCopy();
  if (NeedExecCopyReservedReg ||
      (ReservedRegForExecCopy &&
       MRI.isPhysRegUsed(ReservedRegForExecCopy, /*SkipRegMaskTest=*/true))) {
    MRI.reserveReg(ReservedRegForExecCopy, TRI);
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If found any unused scratch SGPR, reserve the register itself for Exec
      // copy and there is no need for any spills in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      MRI.replaceRegWith(ReservedRegForExecCopy, UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedRegForExecCopy) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedRegForExecCopy, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  } else if (ReservedRegForExecCopy) {
    // Reset it at this point. There are no whole-wave copies and spills
    // encountered.
    MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}

1558// Only report VGPRs to generic code.
1560 BitVector &SavedVGPRs,
1561 RegScavenger *RS) const {
1563
1564 // If this is a function with the amdgpu_cs_chain[_preserve] calling
1565 // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
1566 // we don't need to save and restore anything.
1567 if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
1568 return;
1569
1571
1573 if (MFI->isEntryFunction())
1574 return;
1575
1576 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1577 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1578 const SIInstrInfo *TII = ST.getInstrInfo();
1579 bool NeedExecCopyReservedReg = false;
1580
1581 MachineInstr *ReturnMI = nullptr;
1582 for (MachineBasicBlock &MBB : MF) {
1583 for (MachineInstr &MI : MBB) {
1584 // WRITELANE instructions used for SGPR spills can overwrite the inactive
1585 // lanes of VGPRs and callee must spill and restore them even if they are
1586 // marked Caller-saved.
1587
1588 // TODO: Handle this elsewhere at an early point. Walking through all MBBs
1589 // here would be a bad heuristic. A better way should be by calling
1590 // allocateWWMSpill during the regalloc pipeline whenever a physical
1591 // register is allocated for the intended virtual registers.
1592 if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
1593 MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
1594 else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
1595 MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
1596 else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
1597 NeedExecCopyReservedReg = true;
1598 else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
1599 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1600 (MFI->isChainFunction() &&
1601 TII->isChainCallOpcode(MI.getOpcode()))) {
1602 // We expect all return to be the same size.
1603 assert(!ReturnMI ||
1604 (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
1605 count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
1606 ReturnMI = &MI;
1607 }
1608 }
1609 }
1610
1611 // Remove any VGPRs used in the return value because these do not need to be saved.
1612 // This prevents CSR restore from clobbering return VGPRs.
1613 if (ReturnMI) {
1614 for (auto &Op : ReturnMI->operands()) {
1615 if (Op.isReg())
1616 SavedVGPRs.reset(Op.getReg());
1617 }
1618 }
1619
1620 // Ignore the SGPRs the default implementation found.
1621 SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());
1622
1623 // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
1624 // In gfx908 there was do AGPR loads and stores and thus spilling also
1625 // require a temporary VGPR.
1626 if (!ST.hasGFX90AInsts())
1627 SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
1628
1629 determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
1630
1631 // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
1632 // allow the default insertion to handle them.
1633 for (auto &Reg : MFI->getWWMSpills())
1634 SavedVGPRs.reset(Reg.first);
1635
1636 // Mark all lane VGPRs as BB LiveIns.
1637 for (MachineBasicBlock &MBB : MF) {
1638 for (auto &Reg : MFI->getWWMSpills())
1639 MBB.addLiveIn(Reg.first);
1640
1641 MBB.sortUniqueLiveIns();
1642 }
1643}
1644
1645void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
1646 BitVector &SavedRegs,
1647 RegScavenger *RS) const {
1648 TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
1649 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1650 if (MFI->isEntryFunction())
1651 return;
1652
1653 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1654 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1655
1656 // The SP is specifically managed and we don't want extra spills of it.
1657 SavedRegs.reset(MFI->getStackPtrOffsetReg());
1658
1659 const BitVector AllSavedRegs = SavedRegs;
1660 SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());
1661
1662 // We have to anticipate introducing CSR VGPR spills, or spills of the
1663 // caller-saved VGPR reserved for SGPR spills, since we now always create
1664 // a stack entry for it even if there are no other stack objects: an FP is
1665 // required whenever there is a call and a stack. We will allocate a VGPR
1666 // for SGPR spills if there are any SGPR spills, CSR or otherwise.
1667 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1668 const bool WillHaveFP =
1669 FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());
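 // For example (a hypothetical case): a function that makes a call and whose
 // only stack use is a spilled SGPR still anticipates an FP here, even
 // though no stack objects exist yet.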
1670
1671 // FP will be specially managed like SP.
1672 if (WillHaveFP || hasFP(MF))
1673 SavedRegs.reset(MFI->getFrameOffsetReg());
1674
1675 // The return address use by the return instruction is hidden through the
1676 // SI_RETURN pseudo. Given that, and since IPRA computes actual register
1677 // usage rather than consulting the CSR list, clobbering of the return
1678 // address by function calls (D117243) or otherwise (D120922) is not seen
1679 // by IPRA's register usage collection. The check below ensures the return
1680 // address is saved and restored in those scenarios.
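 // (On AMDGPU the return address is the SGPR pair s[30:31]; hence the two
 // 32-bit halves are set individually below.)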
1681 const MachineRegisterInfo &MRI = MF.getRegInfo();
1682 Register RetAddrReg = TRI->getReturnAddressReg(MF);
1683 if (!MFI->isEntryFunction() &&
1684 (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
1685 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
1686 SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
1687 }
1688}
1689
1690bool SIFrameLowering::assignCalleeSavedSpillSlots(
1691 MachineFunction &MF, const TargetRegisterInfo *TRI,
1692 std::vector<CalleeSavedInfo> &CSI) const {
1693 if (CSI.empty())
1694 return true; // Early exit if no callee saved registers are modified!
1695
1696 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1697 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1698 const SIRegisterInfo *RI = ST.getRegisterInfo();
1699 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1700 Register BasePtrReg = RI->getBaseRegister();
1701 Register SGPRForFPSaveRestoreCopy =
1702 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1703 Register SGPRForBPSaveRestoreCopy =
1704 FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1705 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1706 return false;
1707
1708 unsigned NumModifiedRegs = 0;
1709
1710 if (SGPRForFPSaveRestoreCopy)
1711 NumModifiedRegs++;
1712 if (SGPRForBPSaveRestoreCopy)
1713 NumModifiedRegs++;
1714
1715 for (auto &CS : CSI) {
1716 if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1717 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1718 if (--NumModifiedRegs)
1719 break;
1720 } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1721 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1722 if (--NumModifiedRegs)
1723 break;
1724 }
1725 }
1726
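 // Returning false defers spill slot assignment to the generic
 // PrologEpilogInserter; the FP/BP entries updated above are saved via
 // their scratch SGPR copies rather than to stack memory.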
1727 return false;
1728}
1729
1730bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1731 const MachineFunction &MF) const {
1732
1733 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1734 const MachineFrameInfo &MFI = MF.getFrameInfo();
1735 const SIInstrInfo *TII = ST.getInstrInfo();
1736 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1737 uint64_t MaxOffset = EstStackSize - 1;
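 // MaxOffset is the largest in-frame offset that might have to be reached
 // from the base register.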
1738
1739 // We need the emergency stack slots to be allocated in range of the
1740 // MUBUF/flat scratch immediate offset from the base register, so assign these
1741 // first at the incoming SP position.
1742 //
1743 // TODO: We could try sorting the objects to find a hole in the first bytes
1744 // rather than allocating as close as possible. This could save a lot of space
1745 // on frames with alignment requirements.
1746 if (ST.enableFlatScratch()) {
1747 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1748 SIInstrFlags::FlatScratch))
1749 return false;
1750 } else {
1751 if (TII->isLegalMUBUFImmOffset(MaxOffset))
1752 return false;
1753 }
1754
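 // As a rough worked example (offset widths vary by subtarget and are an
 // assumption here): with a 12-bit unsigned MUBUF immediate (0..4095), any
 // estimated frame larger than 4 KiB fails the check above, forcing the
 // scavenging slots to sit near the incoming SP.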
1755 return true;
1756}
1757
1758MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1759 MachineFunction &MF,
1760 MachineBasicBlock &MBB,
1761 MachineBasicBlock::iterator I) const {
1762 int64_t Amount = I->getOperand(0).getImm();
1763 if (Amount == 0)
1764 return MBB.erase(I);
1765
1766 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1767 const SIInstrInfo *TII = ST.getInstrInfo();
1768 const DebugLoc &DL = I->getDebugLoc();
1769 unsigned Opc = I->getOpcode();
1770 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1771 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1772
1773 if (!hasReservedCallFrame(MF)) {
1774 Amount = alignTo(Amount, getStackAlign());
1775 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1776 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1777 Register SPReg = MFI->getStackPtrOffsetReg();
1778
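 // Sketch of the scaling below: getScratchScaleFactor() converts a per-lane
 // byte count into SP units (the wavefront size for swizzled MUBUF scratch,
 // 1 under flat scratch).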
1779 Amount *= getScratchScaleFactor(ST);
1780 if (IsDestroy)
1781 Amount = -Amount;
1782 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1783 .addReg(SPReg)
1784 .addImm(Amount);
1785 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
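 // For illustration only (simplified, assumed MIR): a 64-byte call frame
 // setup in a wave64 MUBUF-scratch function becomes
 //   $sgpr32 = S_ADD_I32 $sgpr32, 4096, implicit-def dead $scc
 // and the matching call frame destroy emits the same add with the amount
 // negated.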
1786 } else if (CalleePopAmount != 0) {
1787 llvm_unreachable("is this used?");
1788 }
1789
1790 return MBB.erase(I);
1791}
1792
1793/// Returns true if the frame will require a reference to the stack pointer.
1794///
1795/// This is the set of conditions common to setting up the stack pointer in a
1796/// kernel, and for using a frame pointer in a callable function.
1797///
1798/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1799/// references SP.
1800static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1801 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1802}
1803
1804// The FP for kernels is always known to be 0, so we never really need to set
1805// up an explicit register for it. However, DisableFramePointerElim will force
1806// us to use a register for it.
1807bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1808 const MachineFrameInfo &MFI = MF.getFrameInfo();
1809
1810 // For entry & chain functions we can use an immediate offset in most cases,
1811 // so the presence of calls doesn't imply we need a distinct frame pointer.
1812 if (MFI.hasCalls() &&
1813 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1814 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1815 // All offsets are unsigned, so they need to be addressed in the same
1816 // direction as stack growth.
1817
1818 // FIXME: This function is pretty broken, since it can be called before the
1819 // frame layout is determined or CSR spills are inserted.
1820 return MFI.getStackSize() != 0;
1821 }
1822
1823 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1824 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1825 MF) ||
1826 MF.getTarget().Options.DisableFramePointerElim(MF);
1827}
1828
1829// This is essentially a reduced version of hasFP for entry functions. Since the
1830// stack pointer is known to be 0 on entry to kernels, we never really need an FP
1831// register. We may need to initialize the stack pointer depending on the frame
1832// properties, which logically overlaps many of the cases where an ordinary
1833// function would require an FP.
1834// Also used for chain functions. While not technically entry functions, chain
1835// functions may need to set up a stack pointer in some situations.
1836bool SIFrameLowering::requiresStackPointerReference(
1837 const MachineFunction &MF) const {
1838 // Callable functions always require a stack pointer reference.
1839 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1840 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1841 "only expected to call this for entry points and chain functions");
1842
1843 const MachineFrameInfo &MFI = MF.getFrameInfo();
1844
1845 // Entry points ordinarily don't need to initialize SP. We have to set it up
1846 // for callees if there are any. Also note tail calls are impossible/don't
1847 // make any sense for kernels.
1848 if (MFI.hasCalls())
1849 return true;
1850
1851 // We still need to initialize the SP if we're doing anything weird that
1852 // references the SP, like variable sized stack objects.
1853 return frameTriviallyRequiresSP(MFI);
1854}