LLVM 23.0.0git
SIMachineFunctionInfo.cpp
Go to the documentation of this file.
1//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AMDGPUSubtarget.h"
11#include "GCNSubtarget.h"
13#include "SIRegisterInfo.h"
21#include "llvm/IR/CallingConv.h"
23#include "llvm/IR/Function.h"
24#include <cassert>
25#include <optional>
26#include <vector>
27
28enum { MAX_LANES = 64 };
29
30using namespace llvm;
31
32// TODO -- delete this flag once we have more robust mechanisms to allocate the
33// optimal RC for Opc and Dest of MFMA. In particular, there are high RP cases
34// where it is better to produce the VGPR form (e.g. if there are VGPR users
35// of the MFMA result).
37 "amdgpu-mfma-vgpr-form",
38 cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
39 "unspecified, default to compiler heuristics"),
42
44 const SITargetLowering *TLI = STI->getTargetLowering();
45 return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
46}
47
49
51 const GCNSubtarget *STI)
52 : AMDGPUMachineFunctionInfo(F, *STI), Mode(F, *STI),
53 GWSResourcePSV(getTM(STI)), UserSGPRInfo(F, *STI), WorkGroupIDX(false),
54 WorkGroupIDY(false), WorkGroupIDZ(false), WorkGroupInfo(false),
55 LDSKernelId(false), PrivateSegmentWaveByteOffset(false),
56 WorkItemIDX(false), WorkItemIDY(false), WorkItemIDZ(false),
57 ImplicitArgPtr(false), GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0),
58 IsWholeWaveFunction(F.getCallingConv() ==
60 const GCNSubtarget &ST = *STI;
61 FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
62 WavesPerEU = ST.getWavesPerEU(F);
63 MaxNumWorkGroups = ST.getMaxNumWorkGroups(F);
64 assert(MaxNumWorkGroups.size() == 3);
65
66 // Temporarily check both the attribute and the subtarget feature, until the
67 // latter is completely removed.
68 DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
69 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
70 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
71
72 Occupancy = ST.computeOccupancy(F, getLDSSize()).second;
73 CallingConv::ID CC = F.getCallingConv();
74
75 VRegFlags.reserve(1024);
76
77 const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
79
80 if (IsKernel) {
81 WorkGroupIDX = true;
82 WorkItemIDX = true;
83 } else if (CC == CallingConv::AMDGPU_PS) {
84 PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
85 }
86
87 if (ST.hasGFX90AInsts()) {
88 // FIXME: Extract logic out of getMaxNumVectorRegs; we need to apply the
89 // allocation granule and clamping.
90 auto [MinNumAGPRAttr, MaxNumAGPRAttr] =
91 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
92 /*OnlyFirstRequired=*/true);
93 MinNumAGPRs = MinNumAGPRAttr;
94 }
95
96 if (!isEntryFunction()) {
97 if (CC != CallingConv::AMDGPU_Gfx &&
100
101 FrameOffsetReg = AMDGPU::SGPR33;
102 StackPtrOffsetReg = AMDGPU::SGPR32;
103
104 if (!ST.hasFlatScratchEnabled()) {
105 // Non-entry functions have no special inputs for now, other registers
106 // required for scratch access.
107 ScratchRSrcReg = AMDGPU::isChainCC(CC)
108 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
109 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
110
111 ArgInfo.PrivateSegmentBuffer =
112 ArgDescriptor::createRegister(ScratchRSrcReg);
113 }
114
115 if (!F.hasFnAttribute("amdgpu-no-implicitarg-ptr") &&
117 ImplicitArgPtr = true;
118 } else {
119 ImplicitArgPtr = false;
121 std::max(ST.getAlignmentForImplicitArgPtr(), MaxKernArgAlign);
122 }
123
124 if (!AMDGPU::isGraphics(CC) ||
126 ST.hasArchitectedSGPRs())) {
127 if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x") ||
128 !F.hasFnAttribute("amdgpu-no-cluster-id-x"))
129 WorkGroupIDX = true;
130
131 if (!F.hasFnAttribute("amdgpu-no-workgroup-id-y") ||
132 !F.hasFnAttribute("amdgpu-no-cluster-id-y"))
133 WorkGroupIDY = true;
134
135 if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z") ||
136 !F.hasFnAttribute("amdgpu-no-cluster-id-z"))
137 WorkGroupIDZ = true;
138 }
139
140 if (!AMDGPU::isGraphics(CC)) {
141 if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
142 WorkItemIDX = true;
143
144 if (!F.hasFnAttribute("amdgpu-no-workitem-id-y") &&
145 ST.getMaxWorkitemID(F, 1) != 0)
146 WorkItemIDY = true;
147
148 if (!F.hasFnAttribute("amdgpu-no-workitem-id-z") &&
149 ST.getMaxWorkitemID(F, 2) != 0)
150 WorkItemIDZ = true;
151
152 if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
153 LDSKernelId = true;
154 }
155
156 if (isEntryFunction()) {
157 // X, XY, and XYZ are the only supported combinations, so make sure Y is
158 // enabled if Z is.
159 if (WorkItemIDZ)
160 WorkItemIDY = true;
161
162 if (!ST.hasArchitectedFlatScratch()) {
163 PrivateSegmentWaveByteOffset = true;
164
165 // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
166 if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
168 ArgInfo.PrivateSegmentWaveByteOffset =
169 ArgDescriptor::createRegister(AMDGPU::SGPR5);
170 }
171 }
172
173 Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
174 StringRef S = A.getValueAsString();
175 if (!S.empty())
176 S.consumeInteger(0, GITPtrHigh);
177
178 A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
179 S = A.getValueAsString();
180 if (!S.empty())
181 S.consumeInteger(0, HighBitsOf32BitAddress);
182
183 MaxMemoryClusterDWords = F.getFnAttributeAsParsedInteger(
184 "amdgpu-max-memory-cluster-dwords", DefaultMemoryClusterDWordsLimit);
185
186 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
187 // VGPR available at all times. For now, reserve highest available VGPR. After
188 // RA, shift it to the lowest available unused VGPR if one exists.
189 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
190 VGPRForAGPRCopy =
191 AMDGPU::VGPR_32RegClass.getRegister(ST.getMaxNumVGPRs(F) - 1);
192 }
193
194 ClusterDims = AMDGPU::ClusterDimsAttr::get(F);
195}
196
203
206 const GCNSubtarget& ST = MF.getSubtarget<GCNSubtarget>();
207 limitOccupancy(ST.getOccupancyWithWorkGroupSizes(MF).second);
208}
209
211 const SIRegisterInfo &TRI) {
212 ArgInfo.PrivateSegmentBuffer =
213 ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
214 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
215 NumUserSGPRs += 4;
216 return ArgInfo.PrivateSegmentBuffer.getRegister();
217}
218
220 ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
221 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
222 NumUserSGPRs += 2;
223 return ArgInfo.DispatchPtr.getRegister();
224}
225
227 ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
228 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
229 NumUserSGPRs += 2;
230 return ArgInfo.QueuePtr.getRegister();
231}
232
234 ArgInfo.KernargSegmentPtr
235 = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
236 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
237 NumUserSGPRs += 2;
238 return ArgInfo.KernargSegmentPtr.getRegister();
239}
240
242 ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
243 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
244 NumUserSGPRs += 2;
245 return ArgInfo.DispatchID.getRegister();
246}
247
249 ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
250 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
251 NumUserSGPRs += 2;
252 return ArgInfo.FlatScratchInit.getRegister();
253}
254
256 ArgInfo.PrivateSegmentSize = ArgDescriptor::createRegister(getNextUserSGPR());
257 NumUserSGPRs += 1;
258 return ArgInfo.PrivateSegmentSize.getRegister();
259}
260
262 ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
263 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
264 NumUserSGPRs += 2;
265 return ArgInfo.ImplicitBufferPtr.getRegister();
266}
267
269 ArgInfo.LDSKernelId = ArgDescriptor::createRegister(getNextUserSGPR());
270 NumUserSGPRs += 1;
271 return ArgInfo.LDSKernelId.getRegister();
272}
273
275 const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
276 unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
277 auto [It, Inserted] = ArgInfo.PreloadKernArgs.try_emplace(KernArgIdx);
278 assert(Inserted && "Preload kernel argument allocated twice.");
279 NumUserSGPRs += PaddingSGPRs;
280 // If the available register tuples are aligned with the kernarg to be
281 // preloaded use that register, otherwise we need to use a set of SGPRs and
282 // merge them.
283 if (!ArgInfo.FirstKernArgPreloadReg)
284 ArgInfo.FirstKernArgPreloadReg = getNextUserSGPR();
285 Register PreloadReg =
286 TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
287 auto &Regs = It->second.Regs;
288 if (PreloadReg &&
289 (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
290 Regs.push_back(PreloadReg);
291 NumUserSGPRs += AllocSizeDWord;
292 } else {
293 Regs.reserve(AllocSizeDWord);
294 for (unsigned I = 0; I < AllocSizeDWord; ++I) {
295 Regs.push_back(getNextUserSGPR());
296 NumUserSGPRs++;
297 }
298 }
299
300 // Track the actual number of SGPRs that HW will preload to.
301 UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
302 return &Regs;
303}
304
306 uint64_t Size, Align Alignment) {
307 // Skip if it is an entry function or the register is already added.
308 if (isEntryFunction() || WWMSpills.count(VGPR))
309 return;
310
311 // Skip if this is a function with the amdgpu_cs_chain or
312 // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
313 // We never need to allocate a spill for these because we don't even need to
314 // restore the inactive lanes for them (they're scratchier than the usual
315 // scratch registers). We only need to do this if we have calls to
316 // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
317 // chain functions do not return) and the function did not contain a call to
318 // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
319 // when entering the function).
320 if (isChainFunction() &&
323 return;
324
325 WWMSpills.insert(std::make_pair(
326 VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
327}
328
329// Separate out the callee-saved and scratch registers.
331 MachineFunction &MF,
332 SmallVectorImpl<std::pair<Register, int>> &CalleeSavedRegs,
333 SmallVectorImpl<std::pair<Register, int>> &ScratchRegs) const {
334 const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
335 for (auto &Reg : WWMSpills) {
336 if (isCalleeSavedReg(CSRegs, Reg.first))
337 CalleeSavedRegs.push_back(Reg);
338 else
339 ScratchRegs.push_back(Reg);
340 }
341}
342
344 MCPhysReg Reg) const {
345 for (unsigned I = 0; CSRegs[I]; ++I) {
346 if (CSRegs[I] == Reg)
347 return true;
348 }
349
350 return false;
351}
352
355 BitVector &SavedVGPRs) {
356 const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
358 for (unsigned I = 0, E = WWMVGPRs.size(); I < E; ++I) {
359 Register Reg = WWMVGPRs[I];
360 Register NewReg =
361 TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
362 if (!NewReg || NewReg >= Reg)
363 break;
364
365 MRI.replaceRegWith(Reg, NewReg);
366
367 // Update various tables with the new VGPR.
368 WWMVGPRs[I] = NewReg;
369 WWMReservedRegs.remove(Reg);
370 WWMReservedRegs.insert(NewReg);
371 MRI.reserveReg(NewReg, TRI);
372
373 // Replace the register in SpillPhysVGPRs. This is needed to look for free
374 // lanes while spilling special SGPRs like FP, BP, etc. during PEI.
375 auto *RegItr = llvm::find(SpillPhysVGPRs, Reg);
376 if (RegItr != SpillPhysVGPRs.end()) {
377 unsigned Idx = std::distance(SpillPhysVGPRs.begin(), RegItr);
378 SpillPhysVGPRs[Idx] = NewReg;
379
380 // For replacing registers used in the CFI instructions.
381 MF.replaceFrameInstRegister(Reg, NewReg);
382 }
383
384 // The generic `determineCalleeSaves` might have set the old register if it
385 // is in the CSR range.
386 SavedVGPRs.reset(Reg);
387
388 for (MachineBasicBlock &MBB : MF) {
389 MBB.removeLiveIn(Reg);
390 MBB.sortUniqueLiveIns();
391 }
392
393 Reg = NewReg;
394 }
395}
396
/// Assign lane \p LaneIndex of a virtual VGPR to the SGPR spill at frame index
/// \p FI.
///
/// When \p LaneIndex is zero a fresh virtual register of class VGPR_32 is
/// created and appended to SpillVGPRs; otherwise the most recently appended
/// entry of SpillVGPRs is reused. The (VGPR, lane) pair is then appended to
/// SGPRSpillsToVirtualVGPRLanes[FI].
///
/// \returns true on every visible path.
// NOTE(review): MRI is used below but its declaration is not among the lines
// visible here — confirm against the full source.
397bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
398 MachineFunction &MF, int FI, unsigned LaneIndex) {
400 Register LaneVGPR;
// Lane 0 means a new VGPR has to be started.
401 if (!LaneIndex) {
402 LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
403 SpillVGPRs.push_back(LaneVGPR);
404 } else {
// Any other lane continues in the VGPR that was started last.
405 LaneVGPR = SpillVGPRs.back();
406 }
407
408 SGPRSpillsToVirtualVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
409 return true;
410}
411
/// Assign lane \p LaneIndex of a physical VGPR to the SGPR spill at frame index
/// \p FI.
///
/// When \p LaneIndex is zero a VGPR is looked up with
/// SIRegisterInfo::findUnusedRegister, reserved as a WWM register, added as a
/// live-in to every block of \p MF and appended to SpillPhysVGPRs; otherwise
/// the most recently appended entry of SpillPhysVGPRs is reused. The
/// (VGPR, lane) pair is then appended to SGPRSpillsToPhysicalVGPRLanes[FI].
///
/// \param IsPrologEpilog when true, a WWM spill is additionally allocated for
/// a newly found VGPR; its negation is passed as the last argument of
/// findUnusedRegister.
/// \returns false, after erasing the entry for \p FI from
/// SGPRSpillsToPhysicalVGPRLanes, if no unused VGPR was found; true otherwise.
412bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
413 MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
414 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
415 const SIRegisterInfo *TRI = ST.getRegisterInfo();
416 MachineRegisterInfo &MRI = MF.getRegInfo();
417 Register LaneVGPR;
// Lane 0 means a new physical VGPR has to be found.
418 if (!LaneIndex) {
419 // Find the highest available register if called before RA to ensure the
420 // lowest registers are available for allocation. The LaneVGPR, in that
421 // case, will be shifted back to the lowest range after VGPR allocation.
422 LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
423 !IsPrologEpilog);
424 if (LaneVGPR == AMDGPU::NoRegister) {
425 // We have no VGPRs left for spilling SGPRs. Reset because we will not
426 // partially spill the SGPR to VGPRs.
427 SGPRSpillsToPhysicalVGPRLanes.erase(FI);
428 return false;
429 }
430
431 if (IsPrologEpilog)
432 allocateWWMSpill(MF, LaneVGPR);
433
434 reserveWWMRegister(LaneVGPR);
// Make the chosen VGPR live into every basic block of the function.
435 for (MachineBasicBlock &MBB : MF) {
436 MBB.addLiveIn(LaneVGPR);
438 }
439 SpillPhysVGPRs.push_back(LaneVGPR);
440 } else {
// Any other lane continues in the VGPR that was started last.
441 LaneVGPR = SpillPhysVGPRs.back();
442 }
443
444 SGPRSpillsToPhysicalVGPRLanes[FI].emplace_back(LaneVGPR, LaneIndex);
445 return true;
446}
447
449 MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
450 bool IsPrologEpilog) {
451 std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
452 SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
453 : SGPRSpillsToVirtualVGPRLanes[FI];
454
455 // This has already been allocated.
456 if (!SpillLanes.empty())
457 return true;
458
459 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
460 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
461 unsigned WaveSize = ST.getWavefrontSize();
462
463 unsigned Size = FrameInfo.getObjectSize(FI);
464 unsigned NumLanes = Size / 4;
465
466 if (NumLanes > WaveSize)
467 return false;
468
469 assert(Size >= 4 && "invalid sgpr spill size");
470 assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
471 "not spilling SGPRs to VGPRs");
472
473 unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
474 : NumVirtualVGPRSpillLanes;
475
476 for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
477 unsigned LaneIndex = (NumSpillLanes % WaveSize);
478
479 bool Allocated = SpillToPhysVGPRLane
480 ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
481 IsPrologEpilog)
482 : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
483 if (!Allocated) {
484 NumSpillLanes -= I;
485 return false;
486 }
487 }
488
489 return true;
490}
491
492/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
493/// Either AGPR is spilled to VGPR or vice versa.
494/// Returns true if a \p FI can be eliminated completely.
496 int FI,
497 bool isAGPRtoVGPR) {
499 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
500 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
501
502 assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
503
504 auto &Spill = VGPRToAGPRSpills[FI];
505
506 // This has already been allocated.
507 if (!Spill.Lanes.empty())
508 return Spill.FullyAllocated;
509
510 unsigned Size = FrameInfo.getObjectSize(FI);
511 unsigned NumLanes = Size / 4;
512 Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
513
514 const TargetRegisterClass &RC =
515 isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
516 auto Regs = RC.getRegisters();
517
518 auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
519 const SIRegisterInfo *TRI = ST.getRegisterInfo();
520 Spill.FullyAllocated = true;
521
522 // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
523 // once.
524 BitVector OtherUsedRegs;
525 OtherUsedRegs.resize(TRI->getNumRegs());
526
527 const uint32_t *CSRMask =
528 TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
529 if (CSRMask)
530 OtherUsedRegs.setBitsInMask(CSRMask);
531
532 // TODO: Should include register tuples, but doesn't matter with current
533 // usage.
534 for (MCPhysReg Reg : SpillAGPR)
535 OtherUsedRegs.set(Reg);
536 for (MCPhysReg Reg : SpillVGPR)
537 OtherUsedRegs.set(Reg);
538
539 SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
540 for (int I = NumLanes - 1; I >= 0; --I) {
541 NextSpillReg = std::find_if(
542 NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
543 return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
544 !OtherUsedRegs[Reg];
545 });
546
547 if (NextSpillReg == Regs.end()) { // Registers exhausted
548 Spill.FullyAllocated = false;
549 break;
550 }
551
552 OtherUsedRegs.set(*NextSpillReg);
553 SpillRegs.push_back(*NextSpillReg);
554 MRI.reserveReg(*NextSpillReg, TRI);
555 Spill.Lanes[I] = *NextSpillReg++;
556 }
557
558 return Spill.FullyAllocated;
559}
560
562 MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
563 // Remove dead frame indices from function frame, however keep FP & BP since
564 // spills for them haven't been inserted yet. And also make sure to remove the
565 // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
566 // otherwise, it could result in an unexpected side effect and bug, in case of
567 // any re-mapping of freed frame indices by later pass(es) like "stack slot
568 // coloring".
569 for (auto &R : SGPRSpillsToVirtualVGPRLanes)
570 MFI.RemoveStackObject(R.first);
571 SGPRSpillsToVirtualVGPRLanes.clear();
572
573 // Remove the dead frame indices of CSR SGPRs which are spilled to physical
574 // VGPR lanes during SILowerSGPRSpills pass.
575 if (!ResetSGPRSpillStackIDs) {
576 for (auto &R : SGPRSpillsToPhysicalVGPRLanes)
577 MFI.RemoveStackObject(R.first);
578 SGPRSpillsToPhysicalVGPRLanes.clear();
579 }
580 bool HaveSGPRToMemory = false;
581
582 if (ResetSGPRSpillStackIDs) {
583 // All other SGPRs must be allocated on the default stack, so reset the
584 // stack ID.
585 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E;
586 ++I) {
590 HaveSGPRToMemory = true;
591 }
592 }
593 }
594 }
595
596 for (auto &R : VGPRToAGPRSpills) {
597 if (R.second.IsDead)
598 MFI.RemoveStackObject(R.first);
599 }
600
601 return HaveSGPRToMemory;
602}
603
605 const SIRegisterInfo &TRI) {
606 if (ScavengeFI)
607 return *ScavengeFI;
608
609 ScavengeFI =
610 MFI.CreateStackObject(TRI.getSpillSize(AMDGPU::SGPR_32RegClass),
611 TRI.getSpillAlign(AMDGPU::SGPR_32RegClass), false);
612 return *ScavengeFI;
613}
614
615MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
616 assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
617 return AMDGPU::SGPR0 + NumUserSGPRs;
618}
619
620MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
621 return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
622}
623
624void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
625 VRegFlags.grow(Reg);
626}
627
628void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
629 Register SrcReg) {
630 VRegFlags.grow(NewReg);
631 VRegFlags[NewReg] = VRegFlags[SrcReg];
632}
633
636 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
637 if (!ST.isAmdPalOS())
638 return Register();
639 Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
640 if (ST.hasMergedShaders()) {
641 switch (MF.getFunction().getCallingConv()) {
644 // Low GIT address is passed in s8 rather than s0 for an LS+HS or
645 // ES+GS merged shader on gfx9+.
646 GitPtrLo = AMDGPU::SGPR8;
647 return GitPtrLo;
648 default:
649 return GitPtrLo;
650 }
651 }
652 return GitPtrLo;
653}
654
656 const TargetRegisterInfo &TRI) {
658 {
659 raw_string_ostream OS(Dest.Value);
660 OS << printReg(Reg, &TRI);
661 }
662 return Dest;
663}
664
665static std::optional<yaml::SIArgumentInfo>
667 const TargetRegisterInfo &TRI) {
669
670 auto convertArg = [&](std::optional<yaml::SIArgument> &A,
671 const ArgDescriptor &Arg) {
672 if (!Arg)
673 return false;
674
675 // Create a register or stack argument.
677 if (Arg.isRegister()) {
679 OS << printReg(Arg.getRegister(), &TRI);
680 } else
681 SA.StackOffset = Arg.getStackOffset();
682 // Check and update the optional mask.
683 if (Arg.isMasked())
684 SA.Mask = Arg.getMask();
685
686 A = std::move(SA);
687 return true;
688 };
689
690 bool Any = false;
691 Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
692 Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
693 Any |= convertArg(AI.QueuePtr, ArgInfo.QueuePtr);
694 Any |= convertArg(AI.KernargSegmentPtr, ArgInfo.KernargSegmentPtr);
695 Any |= convertArg(AI.DispatchID, ArgInfo.DispatchID);
696 Any |= convertArg(AI.FlatScratchInit, ArgInfo.FlatScratchInit);
697 Any |= convertArg(AI.LDSKernelId, ArgInfo.LDSKernelId);
698 Any |= convertArg(AI.PrivateSegmentSize, ArgInfo.PrivateSegmentSize);
699 Any |= convertArg(AI.WorkGroupIDX, ArgInfo.WorkGroupIDX);
700 Any |= convertArg(AI.WorkGroupIDY, ArgInfo.WorkGroupIDY);
701 Any |= convertArg(AI.WorkGroupIDZ, ArgInfo.WorkGroupIDZ);
702 Any |= convertArg(AI.WorkGroupInfo, ArgInfo.WorkGroupInfo);
703 Any |= convertArg(AI.PrivateSegmentWaveByteOffset,
704 ArgInfo.PrivateSegmentWaveByteOffset);
705 Any |= convertArg(AI.ImplicitArgPtr, ArgInfo.ImplicitArgPtr);
706 Any |= convertArg(AI.ImplicitBufferPtr, ArgInfo.ImplicitBufferPtr);
707 Any |= convertArg(AI.WorkItemIDX, ArgInfo.WorkItemIDX);
708 Any |= convertArg(AI.WorkItemIDY, ArgInfo.WorkItemIDY);
709 Any |= convertArg(AI.WorkItemIDZ, ArgInfo.WorkItemIDZ);
710
711 // Write FirstKernArgPreloadReg separately, since it's a Register,
712 // not ArgDescriptor.
713 if (ArgInfo.FirstKernArgPreloadReg) {
714 Register Reg = ArgInfo.FirstKernArgPreloadReg;
715 assert(Reg.isPhysical() &&
716 "FirstKernArgPreloadReg must be a physical register");
717
720 OS << printReg(Reg, &TRI);
721
723 Any = true;
724 }
725
726 if (Any)
727 return AI;
728
729 return std::nullopt;
730}
731
734 const llvm::MachineFunction &MF)
735 : ExplicitKernArgSize(MFI.getExplicitKernArgSize()),
736 MaxKernArgAlign(MFI.getMaxKernArgAlign()), LDSSize(MFI.getLDSSize()),
737 GDSSize(MFI.getGDSSize()), DynLDSAlign(MFI.getDynLDSAlign()),
738 IsEntryFunction(MFI.isEntryFunction()), MemoryBound(MFI.isMemoryBound()),
739 WaveLimiter(MFI.needsWaveLimiter()),
740 HasSpilledSGPRs(MFI.hasSpilledSGPRs()),
741 HasSpilledVGPRs(MFI.hasSpilledVGPRs()),
742 NumWaveDispatchSGPRs(MFI.getNumWaveDispatchSGPRs()),
743 NumWaveDispatchVGPRs(MFI.getNumWaveDispatchVGPRs()),
744 HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
745 Occupancy(MFI.getOccupancy()),
746 ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
747 FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
748 StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
749 BytesInStackArgArea(MFI.getBytesInStackArgArea()),
750 ReturnsVoid(MFI.returnsVoid()),
751 ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
752 PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
753 MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
754 Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
755 IsWholeWaveFunction(MFI.isWholeWaveFunction()),
756 DynamicVGPRBlockSize(MFI.getDynamicVGPRBlockSize()),
757 ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()),
758 NumKernargPreloadSGPRs(MFI.getNumKernargPreloadedSGPRs()),
759 MinNumAGPRs(MFI.getMinNumAGPRs()) {
760 for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
761 SpillPhysVGPRS.push_back(regToString(Reg, TRI));
762
763 for (Register Reg : MFI.getWWMReservedRegs())
764 WWMReservedRegs.push_back(regToString(Reg, TRI));
765
766 if (MFI.getLongBranchReservedReg())
768 if (MFI.getVGPRForAGPRCopy())
770
771 if (MFI.getSGPRForEXECCopy())
773
774 auto SFI = MFI.getOptionalScavengeFI();
775 if (SFI)
777}
778
782
784 const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF,
788 LDSSize = YamlMFI.LDSSize;
789 GDSSize = YamlMFI.GDSSize;
790 DynLDSAlign = YamlMFI.DynLDSAlign;
791 PSInputAddr = YamlMFI.PSInputAddr;
792 PSInputEnable = YamlMFI.PSInputEnable;
793 MaxMemoryClusterDWords = YamlMFI.MaxMemoryClusterDWords;
794 HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
795 Occupancy = YamlMFI.Occupancy;
797 MemoryBound = YamlMFI.MemoryBound;
798 WaveLimiter = YamlMFI.WaveLimiter;
799 HasSpilledSGPRs = YamlMFI.HasSpilledSGPRs;
800 HasSpilledVGPRs = YamlMFI.HasSpilledVGPRs;
801 NumWaveDispatchSGPRs = YamlMFI.NumWaveDispatchSGPRs;
802 NumWaveDispatchVGPRs = YamlMFI.NumWaveDispatchVGPRs;
803 BytesInStackArgArea = YamlMFI.BytesInStackArgArea;
804 ReturnsVoid = YamlMFI.ReturnsVoid;
805 IsWholeWaveFunction = YamlMFI.IsWholeWaveFunction;
806 MinNumAGPRs = YamlMFI.MinNumAGPRs;
807
808 UserSGPRInfo.allocKernargPreloadSGPRs(YamlMFI.NumKernargPreloadSGPRs);
809
810 if (YamlMFI.ScavengeFI) {
811 auto FIOrErr = YamlMFI.ScavengeFI->getFI(MF.getFrameInfo());
812 if (!FIOrErr) {
813 // Create a diagnostic for the frame index.
814 const MemoryBuffer &Buffer =
815 *PFS.SM->getMemoryBuffer(PFS.SM->getMainFileID());
816
817 Error = SMDiagnostic(*PFS.SM, SMLoc(), Buffer.getBufferIdentifier(), 1, 1,
818 SourceMgr::DK_Error, toString(FIOrErr.takeError()),
819 "", {}, {});
820 SourceRange = YamlMFI.ScavengeFI->SourceRange;
821 return true;
822 }
823 ScavengeFI = *FIOrErr;
824 } else {
825 ScavengeFI = std::nullopt;
826 }
827 return false;
828}
829
831 auto [MinNumAGPR, MaxNumAGPR] =
832 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", {~0u, ~0u},
833 /*OnlyFirstRequired=*/true);
834 return MinNumAGPR != 0u;
835}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
Base class for AMDGPU specific classes of TargetSubtarget.
MachineBasicBlock & MBB
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
if(PassOpts->AAPipeline)
const GCNTargetMachine & getTM(const GCNSubtarget *STI)
static cl::opt< bool, true > MFMAVGPRFormOpt("amdgpu-mfma-vgpr-form", cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If " "unspecified, default to compiler heuristics"), cl::location(SIMachineFunctionInfo::MFMAVGPRForm), cl::init(true), cl::Hidden)
static std::optional< yaml::SIArgumentInfo > convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo, const TargetRegisterInfo &TRI)
static yaml::StringValue regToString(Register Reg, const TargetRegisterInfo &TRI)
Interface definition for SIRegisterInfo.
Align DynLDSAlign
Align for dynamic shared memory if any.
AMDGPUMachineFunctionInfo(const Function &F, const AMDGPUSubtarget &ST)
uint32_t LDSSize
Number of bytes in the LDS that are being used.
static ClusterDimsAttr get(const Function &F)
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
BitVector & reset()
Reset all bits in the bitvector.
Definition BitVector.h:409
void resize(unsigned N, bool t=false)
Grow or shrink the bitvector.
Definition BitVector.h:355
BitVector & set()
Set all bits in the bitvector.
Definition BitVector.h:366
void setBitsInMask(const uint32_t *Mask, unsigned MaskWords=~0u)
Add '1' bits from Mask to this vector.
Definition BitVector.h:724
void push_back(bool Val)
Definition BitVector.h:487
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
const SITargetLowering * getTargetLowering() const override
LLVM_ABI void sortUniqueLiveIns()
Sorts and uniques the LiveIns vector.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateStackObject(uint64_t Size, Align Alignment, bool isSpillSlot, const AllocaInst *Alloca=nullptr, uint8_t ID=0)
Create a new statically sized stack object, returning a nonnegative identifier to represent it.
LLVM_ABI int CreateSpillStackObject(uint64_t Size, Align Alignment)
Create a new statically sized stack object that represents a spill slot, returning a nonnegative iden...
void setStackID(int ObjectIdx, uint8_t ID)
bool hasTailCall() const
Returns true if the function contains a tail call.
void RemoveStackObject(int ObjectIdx)
Remove or mark dead a statically sized stack object.
int getObjectIndexEnd() const
Return one past the maximum frame object index.
uint8_t getStackID(int ObjectIdx) const
int getObjectIndexBegin() const
Return the minimum frame object index.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void replaceFrameInstRegister(MCRegister From, MCRegister To)
Replace all references to register.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * cloneInfo(const Ty &Old)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI const MCPhysReg * getCalleeSavedRegs() const
Returns list of callee saved registers.
void reserveReg(MCRegister PhysReg, const TargetRegisterInfo *TRI)
reserveReg – Mark a register as reserved so checks like isAllocatable will not suggest using it.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
This interface provides simple read-only access to a block of memory, and provides simple methods for...
virtual StringRef getBufferIdentifier() const
Return an identifier for this buffer, typically the filename it was read from.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool initializeBaseYamlFields(const yaml::SIMachineFunctionInfo &YamlMFI, const MachineFunction &MF, PerFunctionMIParsingState &PFS, SMDiagnostic &Error, SMRange &SourceRange)
void shiftWwmVGPRsToLowestRange(MachineFunction &MF, SmallVectorImpl< Register > &WWMVGPRs, BitVector &SavedVGPRs)
Register addPrivateSegmentSize(const SIRegisterInfo &TRI)
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size=4, Align Alignment=Align(4))
Register addDispatchPtr(const SIRegisterInfo &TRI)
Register addFlatScratchInit(const SIRegisterInfo &TRI)
ArrayRef< Register > getSGPRSpillPhysVGPRs() const
int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI)
Register addQueuePtr(const SIRegisterInfo &TRI)
SIMachineFunctionInfo(const SIMachineFunctionInfo &MFI)=default
Register getGITPtrLoReg(const MachineFunction &MF) const
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR)
Reserve AGPRs or VGPRs to support spilling for FrameIndex FI.
void splitWWMSpillRegisters(MachineFunction &MF, SmallVectorImpl< std::pair< Register, int > > &CalleeSavedRegs, SmallVectorImpl< std::pair< Register, int > > &ScratchRegs) const
bool mayUseAGPRs(const Function &F) const
bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) const
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, bool SpillToPhysVGPRLane=false, bool IsPrologEpilog=false)
Register addKernargSegmentPtr(const SIRegisterInfo &TRI)
Register addDispatchID(const SIRegisterInfo &TRI)
bool removeDeadFrameIndices(MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs)
If ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill to the default stack.
MachineFunctionInfo * clone(BumpPtrAllocator &Allocator, MachineFunction &DestMF, const DenseMap< MachineBasicBlock *, MachineBasicBlock * > &Src2DstMBB) const override
Make a functionally equivalent copy of this MachineFunctionInfo in MF.
bool checkIndexInPrologEpilogSGPRSpills(int FI) const
Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI)
const ReservedRegSet & getWWMReservedRegs() const
std::optional< int > getOptionalScavengeFI() const
Register addImplicitBufferPtr(const SIRegisterInfo &TRI)
void limitOccupancy(const MachineFunction &MF)
SmallVectorImpl< MCRegister > * addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC, unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs)
static bool isChainScratchRegister(Register VGPR)
Instances of this class encapsulate one diagnostic report, allowing printing to a raw_ostream as a ca...
Definition SourceMgr.h:303
Represents a location in source code.
Definition SMLoc.h:22
Represents a range in source code.
Definition SMLoc.h:47
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
typename SuperClass::const_iterator const_iterator
unsigned getMainFileID() const
Definition SourceMgr.h:151
const MemoryBuffer * getMemoryBuffer(unsigned i) const
Definition SourceMgr.h:144
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
bool consumeInteger(unsigned Radix, T &Result)
Parse the current string as an integer of the specified radix.
Definition StringRef.h:519
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
const TargetMachine & getTargetMachine() const
ArrayRef< MCPhysReg > getRegisters() const
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
A raw_ostream that writes to an std::string.
unsigned getInitialPSInputAddr(const Function &F)
unsigned getDynamicVGPRBlockSize(const Function &F)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
CallingConv Namespace - This namespace contains an enum with a value for the well-known calling conve...
Definition CallingConv.h:21
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1764
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
constexpr unsigned DefaultMemoryClusterDWordsLimit
Definition SIInstrInfo.h:40
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
Definition Allocator.h:383
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
MachineFunctionInfo - This class can be derived from and used by targets to hold private target-speci...
A serializable representation of a reference to a stack object or fixed stack object.
This class should be specialized by any type that needs to be converted to/from a YAML mapping.
Definition YAMLTraits.h:63
std::optional< SIArgument > PrivateSegmentWaveByteOffset
std::optional< SIArgument > WorkGroupIDY
std::optional< SIArgument > FlatScratchInit
std::optional< SIArgument > DispatchPtr
std::optional< SIArgument > DispatchID
std::optional< SIArgument > WorkItemIDY
std::optional< SIArgument > WorkGroupIDX
std::optional< SIArgument > ImplicitArgPtr
std::optional< SIArgument > QueuePtr
std::optional< SIArgument > WorkGroupInfo
std::optional< SIArgument > LDSKernelId
std::optional< SIArgument > ImplicitBufferPtr
std::optional< SIArgument > WorkItemIDX
std::optional< SIArgument > KernargSegmentPtr
std::optional< SIArgument > WorkItemIDZ
std::optional< SIArgument > PrivateSegmentSize
std::optional< SIArgument > PrivateSegmentBuffer
std::optional< SIArgument > FirstKernArgPreloadReg
std::optional< SIArgument > WorkGroupIDZ
std::optional< unsigned > Mask
static SIArgument createArgument(bool IsReg)
SmallVector< StringValue > WWMReservedRegs
void mappingImpl(yaml::IO &YamlIO) override
std::optional< SIArgumentInfo > ArgInfo
SmallVector< StringValue, 2 > SpillPhysVGPRS
std::optional< FrameIndex > ScavengeFI
A wrapper around std::string which contains a source range that's being set during parsing.