LLVM 23.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
39 "amdgpu-spill-cfi-saved-regs",
40 cl::desc("Enable spilling the registers required for CFI emission"),
42
43std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
44std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
45
// Maps a register width in DWORDs to a row of SubRegFromChannelTable.
// Entries are 1-based row numbers so that 0 can mean "width not supported":
// widths 1-8 select rows 1-8, width 16 selects row 9, everything else is 0.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] == 8,
// meaning index 7 in SubRegFromChannelTable.
// The table is constexpr so it is built at compile time.
static constexpr std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
52
53static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
54 const Twine &ErrMsg) {
56 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
57}
58
59namespace llvm {
60
61// A temporary struct to spill SGPRs.
62// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
63// just v_writelane and v_readlane.
64//
65// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
66// is saved to scratch (or the other way around for loads).
67// For this, a VGPR is required where the needed lanes can be clobbered. The
68// RegScavenger can provide a VGPR where currently active lanes can be
69// clobbered, but we still need to save inactive lanes.
70// The high-level steps are:
71// - Try to scavenge SGPR(s) to save exec
72// - Try to scavenge VGPR
73// - Save needed, all or inactive lanes of a TmpVGPR
74// - Spill/Restore SGPRs using TmpVGPR
75// - Restore TmpVGPR
76//
77// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
78// cannot scavenge temporary SGPRs to save exec, we use the following code:
79// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
80// s_not exec, exec
81// buffer_store_dword TmpVGPR ; save inactive lanes
82// s_not exec, exec
84 struct PerVGPRData {
85 unsigned PerVGPR;
86 unsigned NumVGPRs;
87 int64_t VGPRLanes;
88 };
89
90 // The SGPR to save
94 unsigned NumSubRegs;
95 bool IsKill;
96 const DebugLoc &DL;
97
98 /* When spilling to stack */
99 // The SGPRs are written into this VGPR, which is then written to scratch
100 // (or vice versa for loads).
101 Register TmpVGPR = AMDGPU::NoRegister;
102 // Temporary spill slot to save TmpVGPR to.
104 // If TmpVGPR is live before the spill or if it is scavenged.
105 bool TmpVGPRLive = false;
106 // Scavenged SGPR to save EXEC.
107 Register SavedExecReg = AMDGPU::NoRegister;
108 // Stack index to write the SGPRs to.
109 int Index;
110 unsigned EltSize = 4;
111
120 unsigned MovOpc;
121 unsigned NotOpc;
122
126 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
127 MI->getOperand(0).isKill(), Index, RS) {}
128
131 bool IsKill, int Index, RegScavenger *RS)
132 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
133 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
134 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
136 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
137 SplitParts = TRI.getRegSplitParts(RC, EltSize);
138 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
139
140 if (IsWave32) {
141 ExecReg = AMDGPU::EXEC_LO;
142 MovOpc = AMDGPU::S_MOV_B32;
143 NotOpc = AMDGPU::S_NOT_B32;
144 } else {
145 ExecReg = AMDGPU::EXEC;
146 MovOpc = AMDGPU::S_MOV_B64;
147 NotOpc = AMDGPU::S_NOT_B64;
148 }
149
150 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
151 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
152 SuperReg != AMDGPU::EXEC && "exec should never spill");
153 }
154
157 Data.PerVGPR = IsWave32 ? 32 : 64;
158 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
159 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
160 return Data;
161 }
162
163 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
164 // free.
165 // Writes these instructions if an SGPR can be scavenged:
166 // s_mov_b64 s[6:7], exec ; Save exec
167 // s_mov_b64 exec, 3 ; Wanted lanemask
168 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
169 //
170 // Writes these instructions if no SGPR can be scavenged:
171 // buffer_store_dword v0 ; Only if no free VGPR was found
172 // s_not_b64 exec, exec
173 // buffer_store_dword v0 ; Save inactive lanes
174 // ; exec stays inverted, it is flipped back in
175 // ; restore.
176 void prepare() {
177 // Scavenged temporary VGPR to use. It must be scavenged once for any number
178 // of spilled subregs.
179 // FIXME: The liveness analysis is limited and does not tell if a register
180 // is in use in lanes that are currently inactive. We can never be sure if
181 // a register is actually in use in another lane, so we need to save all
182 // used lanes of the chosen VGPR.
183 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
184 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
185 0, false);
186
187 // Reserve temporary stack slot
188 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
189 if (TmpVGPR) {
190 // Found a register that is dead in the currently active lanes, we only
191 // need to spill inactive lanes.
192 TmpVGPRLive = false;
193 } else {
194 // Pick v0 because it doesn't make a difference.
195 TmpVGPR = AMDGPU::VGPR0;
196 TmpVGPRLive = true;
197 }
198
199 if (TmpVGPRLive) {
200 // We need to inform the scavenger that this index is already in use until
201 // we're done with the custom emergency spill.
202 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
203 }
204
205 // We may end up recursively calling the scavenger, and don't want to re-use
206 // the same register.
207 RS->setRegUsed(TmpVGPR);
208
209 // Try to scavenge SGPRs to save exec
210 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
211 const TargetRegisterClass &RC =
212 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
213 RS->setRegUsed(SuperReg);
214 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
215
216 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
217
218 if (SavedExecReg) {
219 RS->setRegUsed(SavedExecReg);
220 // Set exec to needed lanes
222 auto I =
223 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
224 if (!TmpVGPRLive)
226 // Spill needed lanes
227 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
228 } else {
229 // The modify and restore of exec clobber SCC, which we would have to save
230 // and restore. FIXME: We probably would need to reserve a register for
231 // this.
232 if (RS->isRegUsed(AMDGPU::SCC))
233 emitUnsupportedError(MF.getFunction(), *MI,
234 "unhandled SGPR spill to memory");
235
236 // Spill active lanes
237 if (TmpVGPRLive)
238 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
239 /*IsKill*/ false);
240 // Spill inactive lanes
241 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
242 if (!TmpVGPRLive)
244 I->getOperand(2).setIsDead(); // Mark SCC as dead.
245 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
246 }
247 }
248
249 // Writes these instructions if an SGPR can be scavenged:
250 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_mov_b64 exec, s[6:7] ; Restore exec
253 //
254 // Writes these instructions if no SGPR can be scavenged:
255 // buffer_load_dword v0 ; Restore inactive lanes
256 // s_waitcnt vmcnt(0) ; If a free VGPR was found
257 // s_not_b64 exec, exec
258 // buffer_load_dword v0 ; Only if no free VGPR was found
259 void restore() {
260 if (SavedExecReg) {
261 // Restore used lanes
262 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
263 /*IsKill*/ false);
264 // Restore exec
265 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
267 // Add an implicit use of the load so it is not dead.
268 // FIXME This inserts an unnecessary waitcnt
269 if (!TmpVGPRLive) {
271 }
272 } else {
273 // Restore inactive lanes
274 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
275 /*IsKill*/ false);
276 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
277 if (!TmpVGPRLive)
279 I->getOperand(2).setIsDead(); // Mark SCC as dead.
280
281 // Restore active lanes
282 if (TmpVGPRLive)
283 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
284 }
285
286 // Inform the scavenger where we're releasing our custom scavenged register.
287 if (TmpVGPRLive) {
288 MachineBasicBlock::iterator RestorePt = std::prev(MI);
289 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
290 }
291 }
292
293 // Write TmpVGPR to memory or read TmpVGPR from memory.
294 // Either using a single buffer_load/store if exec is set to the needed mask
295 // or using
296 // buffer_load
297 // s_not exec, exec
298 // buffer_load
299 // s_not exec, exec
300 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
301 if (SavedExecReg) {
302 // Spill needed lanes
303 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
304 } else {
305 // The modify and restore of exec clobber SCC, which we would have to save
306 // and restore. FIXME: We probably would need to reserve a register for
307 // this.
308 if (RS->isRegUsed(AMDGPU::SCC))
309 emitUnsupportedError(MF.getFunction(), *MI,
310 "unhandled SGPR spill to memory");
311
312 // Spill active lanes
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
314 /*IsKill*/ false);
315 // Spill inactive lanes
316 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
317 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
318 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
319 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
320 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
321 }
322 }
323
325 assert(MBB->getParent() == &MF);
326 MI = NewMI;
327 MBB = NewMBB;
328 }
329};
330
331} // namespace llvm
332
334 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
335 ST.getAMDGPUDwarfFlavour(),
336 /*PC=*/0,
337 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
338 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
339
340 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
341 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
342 (getSubRegIndexLaneMask(AMDGPU::lo16) |
343 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
344 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
345 "getNumCoveredRegs() will not work with generated subreg masks!");
346
347 RegPressureIgnoredUnits.resize(getNumRegUnits());
348 RegPressureIgnoredUnits.set(
349 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
350 for (auto Reg : AMDGPU::VGPR_16RegClass) {
351 if (AMDGPU::isHi16Reg(Reg, *this))
352 RegPressureIgnoredUnits.set(
353 static_cast<unsigned>(*regunits(Reg).begin()));
354 }
355
356 // HACK: Until this is fully tablegen'd.
357 static llvm::once_flag InitializeRegSplitPartsFlag;
358
359 static auto InitializeRegSplitPartsOnce = [this]() {
360 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
361 unsigned Size = getSubRegIdxSize(Idx);
362 if (Size & 15)
363 continue;
364 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
365 unsigned Pos = getSubRegIdxOffset(Idx);
366 if (Pos % Size)
367 continue;
368 Pos /= Size;
369 if (Vec.empty()) {
370 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
371 Vec.resize(MaxNumParts);
372 }
373 Vec[Pos] = Idx;
374 }
375 };
376
377 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
378
379 static auto InitializeSubRegFromChannelTableOnce = [this]() {
380 for (auto &Row : SubRegFromChannelTable)
381 Row.fill(AMDGPU::NoSubRegister);
382 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
383 unsigned Width = getSubRegIdxSize(Idx) / 32;
384 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
386 Width = SubRegFromChannelTableWidthMap[Width];
387 if (Width == 0)
388 continue;
389 unsigned TableIdx = Width - 1;
390 assert(TableIdx < SubRegFromChannelTable.size());
391 assert(Offset < SubRegFromChannelTable[TableIdx].size());
392 SubRegFromChannelTable[TableIdx][Offset] = Idx;
393 }
394 };
395
396 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
397 llvm::call_once(InitializeSubRegFromChannelTableFlag,
398 InitializeSubRegFromChannelTableOnce);
399}
400
401void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
402 MCRegister Reg) const {
403 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
404 Reserved.set(*R);
405}
406
407// Forced to be here by one .inc
409 const MachineFunction *MF) const {
411 switch (CC) {
412 case CallingConv::C:
415 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
416 : CSR_AMDGPU_SaveList;
419 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
420 : CSR_AMDGPU_SI_Gfx_SaveList;
422 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
423 default: {
424 // Dummy to not crash RegisterClassInfo.
425 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
426 return &NoCalleeSavedReg;
427 }
428 }
429}
430
431const MCPhysReg *
433 return nullptr;
434}
435
437 CallingConv::ID CC) const {
438 switch (CC) {
439 case CallingConv::C:
442 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
443 : CSR_AMDGPU_RegMask;
446 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
447 : CSR_AMDGPU_SI_Gfx_RegMask;
450 // Calls to these functions never return, so we can pretend everything is
451 // preserved.
452 return AMDGPU_AllVGPRs_RegMask;
453 default:
454 return nullptr;
455 }
456}
457
459 return CSR_AMDGPU_NoRegs_RegMask;
460}
461
463 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
464}
465
468 const MachineFunction &MF) const {
469 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
470 // equivalent AV class. If used one, the verifier will crash after
471 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
472 // until Instruction selection.
473 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
474 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
475 return &AMDGPU::AV_32RegClass;
476 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
477 return &AMDGPU::AV_64RegClass;
478 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
479 RC == &AMDGPU::AReg_64_Align2RegClass)
480 return &AMDGPU::AV_64_Align2RegClass;
481 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
482 return &AMDGPU::AV_96RegClass;
483 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
484 RC == &AMDGPU::AReg_96_Align2RegClass)
485 return &AMDGPU::AV_96_Align2RegClass;
486 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
487 return &AMDGPU::AV_128RegClass;
488 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
489 RC == &AMDGPU::AReg_128_Align2RegClass)
490 return &AMDGPU::AV_128_Align2RegClass;
491 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
492 return &AMDGPU::AV_160RegClass;
493 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
494 RC == &AMDGPU::AReg_160_Align2RegClass)
495 return &AMDGPU::AV_160_Align2RegClass;
496 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
497 return &AMDGPU::AV_192RegClass;
498 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
499 RC == &AMDGPU::AReg_192_Align2RegClass)
500 return &AMDGPU::AV_192_Align2RegClass;
501 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
502 return &AMDGPU::AV_256RegClass;
503 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
504 RC == &AMDGPU::AReg_256_Align2RegClass)
505 return &AMDGPU::AV_256_Align2RegClass;
506 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
507 return &AMDGPU::AV_512RegClass;
508 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
509 RC == &AMDGPU::AReg_512_Align2RegClass)
510 return &AMDGPU::AV_512_Align2RegClass;
511 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
512 return &AMDGPU::AV_1024RegClass;
513 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
514 RC == &AMDGPU::AReg_1024_Align2RegClass)
515 return &AMDGPU::AV_1024_Align2RegClass;
516 }
517
519}
520
522 const SIFrameLowering *TFI = ST.getFrameLowering();
524
525 // During ISel lowering we always reserve the stack pointer in entry and chain
526 // functions, but never actually want to reference it when accessing our own
527 // frame. If we need a frame pointer we use it, but otherwise we can just use
528 // an immediate "0" which we represent by returning NoRegister.
529 if (FuncInfo->isBottomOfStack()) {
530 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
531 }
532 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
533 : FuncInfo->getStackPtrOffsetReg();
534}
535
537 // When we need stack realignment, we can't reference off of the
538 // stack pointer, so we reserve a base pointer.
539 return shouldRealignStack(MF);
540}
541
542Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
543
545 return AMDGPU_AllVGPRs_RegMask;
546}
547
549 return AMDGPU_AllAGPRs_RegMask;
550}
551
553 return AMDGPU_AllVectorRegs_RegMask;
554}
555
557 return AMDGPU_AllAllocatableSRegs_RegMask;
558}
559
560unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
561 unsigned NumRegs) {
562 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
563 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
564 assert(NumRegIndex && "Not implemented");
565 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
566 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
567}
568
572
575 const unsigned Align,
576 const TargetRegisterClass *RC) const {
577 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
578 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
579 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
580}
581
583 const MachineFunction &MF) const {
584 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
585}
586
588 BitVector Reserved(getNumRegs());
589 Reserved.set(AMDGPU::MODE);
590
592
593 // Reserve special purpose registers.
594 //
595 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
596 // this seems likely to result in bugs, so I'm marking them as reserved.
597 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
598 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
599
600 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
601 reserveRegisterTuples(Reserved, AMDGPU::M0);
602
603 // Reserve src_vccz, src_execz, src_scc.
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
606 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
607
608 // Reserve the memory aperture registers
609 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
610 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
611 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
613 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
614 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
615
616 // Reserve async counters pseudo registers
617 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
618 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
619
620 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
622
623 // Reserve xnack_mask registers - support is not implemented in Codegen.
624 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
625
626 // Reserve lds_direct register - support is not implemented in Codegen.
627 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
628
629 // Reserve Trap Handler registers - support is not implemented in Codegen.
630 reserveRegisterTuples(Reserved, AMDGPU::TBA);
631 reserveRegisterTuples(Reserved, AMDGPU::TMA);
632 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
633 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
634 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
635 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
636 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
637 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
638 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
639 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
640
641 // Reserve null register - it shall never be allocated
642 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
643
644 // Reserve SGPRs.
645 //
646 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
647 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
648 for (const TargetRegisterClass *RC : regclasses()) {
649 if (RC->isBaseClass() && isSGPRClass(RC)) {
650 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
651 for (MCPhysReg Reg : *RC) {
652 unsigned Index = getHWRegIndex(Reg);
653 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs &&
654 Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
655 Reg != AMDGPU::VCC)
656 Reserved.set(Reg);
657 }
658 }
659 }
660
661 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
662 if (ScratchRSrcReg != AMDGPU::NoRegister) {
663 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
664 // need to spill.
665 // TODO: May need to reserve a VGPR if doing LDS spilling.
666 reserveRegisterTuples(Reserved, ScratchRSrcReg);
667 }
668
669 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
670 if (LongBranchReservedReg)
671 reserveRegisterTuples(Reserved, LongBranchReservedReg);
672
673 // We have to assume the SP is needed in case there are calls in the function,
674 // which is detected after the function is lowered. If we aren't really going
675 // to need SP, don't bother reserving it.
676 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
677 if (StackPtrReg) {
678 reserveRegisterTuples(Reserved, StackPtrReg);
679 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
680 }
681
682 MCRegister FrameReg = MFI->getFrameOffsetReg();
683 if (FrameReg) {
684 reserveRegisterTuples(Reserved, FrameReg);
685 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
686 }
687
688 if (hasBasePointer(MF)) {
689 MCRegister BasePtrReg = getBaseRegister();
690 reserveRegisterTuples(Reserved, BasePtrReg);
691 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
692 }
693
694 // FIXME: Use same reserved register introduced in D149775
695 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
696 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
697 if (ExecCopyReg)
698 reserveRegisterTuples(Reserved, ExecCopyReg);
699
700 // Reserve VGPRs/AGPRs.
701 //
702 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
703
704 for (const TargetRegisterClass *RC : regclasses()) {
705 if (RC->isBaseClass() && isVGPRClass(RC)) {
706 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
707 for (MCPhysReg Reg : *RC) {
708 unsigned Index = getHWRegIndex(Reg);
709 if (Index + NumRegs > MaxNumVGPRs)
710 Reserved.set(Reg);
711 }
712 }
713 }
714
715 // Reserve all the AGPRs if there are no instructions to use it.
716 if (!ST.hasMAIInsts())
717 MaxNumAGPRs = 0;
718 for (const TargetRegisterClass *RC : regclasses()) {
719 if (RC->isBaseClass() && isAGPRClass(RC)) {
720 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
721 for (MCPhysReg Reg : *RC) {
722 unsigned Index = getHWRegIndex(Reg);
723 if (Index + NumRegs > MaxNumAGPRs)
724 Reserved.set(Reg);
725 }
726 }
727 }
728
729 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
730 // VGPR available at all times.
731 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
732 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
733 }
734
735 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
736 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
737 // wwm-regalloc and it would be empty otherwise.
738 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
739 if (!NonWWMRegMask.empty()) {
740 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
741 RegI < RegE; ++RegI) {
742 if (NonWWMRegMask.test(RegI))
743 reserveRegisterTuples(Reserved, RegI);
744 }
745 }
746
747 for (Register Reg : MFI->getWWMReservedRegs())
748 reserveRegisterTuples(Reserved, Reg);
749
750 // FIXME: Stop using reserved registers for this.
751 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
752 reserveRegisterTuples(Reserved, Reg);
753
754 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
755 reserveRegisterTuples(Reserved, Reg);
756
757 return Reserved;
758}
759
761 MCRegister PhysReg) const {
762 return !MF.getRegInfo().isReserved(PhysReg);
763}
764
767 // On entry or in chain functions, the base address is 0, so it can't possibly
768 // need any more alignment.
769
770 // FIXME: Should be able to specify the entry frame alignment per calling
771 // convention instead.
772 if (Info->isBottomOfStack())
773 return false;
774
776}
777
780 if (Info->isEntryFunction()) {
781 const MachineFrameInfo &MFI = Fn.getFrameInfo();
782 return MFI.hasStackObjects() || MFI.hasCalls();
783 }
784
785 // May need scavenger for dealing with callee saved registers.
786 return true;
787}
788
790 const MachineFunction &MF) const {
791 // Do not use frame virtual registers. They used to be used for SGPRs, but
792 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
793 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
794 // spill.
795 return false;
796}
797
799 const MachineFunction &MF) const {
800 const MachineFrameInfo &MFI = MF.getFrameInfo();
801 return MFI.hasStackObjects();
802}
803
805 const MachineFunction &) const {
806 // There are no special dedicated stack or frame pointers.
807 return true;
808}
809
812
813 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
814 AMDGPU::OpName::offset);
815 return MI->getOperand(OffIdx).getImm();
816}
817
819 int Idx) const {
820 switch (MI->getOpcode()) {
821 case AMDGPU::V_ADD_U32_e32:
822 case AMDGPU::V_ADD_U32_e64:
823 case AMDGPU::V_ADD_CO_U32_e32: {
824 int OtherIdx = Idx == 1 ? 2 : 1;
825 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
826 return OtherOp.isImm() ? OtherOp.getImm() : 0;
827 }
828 case AMDGPU::V_ADD_CO_U32_e64: {
829 int OtherIdx = Idx == 2 ? 3 : 2;
830 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
831 return OtherOp.isImm() ? OtherOp.getImm() : 0;
832 }
833 default:
834 break;
835 }
836
838 return 0;
839
840 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
841 AMDGPU::OpName::vaddr) ||
842 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
843 AMDGPU::OpName::saddr))) &&
844 "Should never see frame index on non-address operand");
845
847}
848
850 const MachineInstr &MI) {
851 assert(MI.getDesc().isAdd());
852 const MachineOperand &Src0 = MI.getOperand(1);
853 const MachineOperand &Src1 = MI.getOperand(2);
854
855 if (Src0.isFI()) {
856 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
857 Src1.getReg()));
858 }
859
860 if (Src1.isFI()) {
861 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
862 Src0.getReg()));
863 }
864
865 return false;
866}
867
869 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
870 switch (MI->getOpcode()) {
871 case AMDGPU::V_ADD_U32_e32: {
872 // TODO: We could handle this but it requires work to avoid violating
873 // operand restrictions.
874 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
875 !isFIPlusImmOrVGPR(*this, *MI))
876 return false;
877 [[fallthrough]];
878 }
879 case AMDGPU::V_ADD_U32_e64:
880 // FIXME: This optimization is barely profitable as-is when flat scratch is
881 // enabled.
882 //
883 // Much of the benefit with the MUBUF handling is we avoid duplicating the
884 // shift of the frame register, which isn't needed with scratch.
885 //
886 // materializeFrameBaseRegister doesn't know the register classes of the
887 // uses, and unconditionally uses an s_add_i32, which will end up using a
888 // copy for the vector uses.
889 return !ST.hasFlatScratchEnabled();
890 case AMDGPU::V_ADD_CO_U32_e32:
891 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
892 !isFIPlusImmOrVGPR(*this, *MI))
893 return false;
894 // We can't deal with the case where the carry out has a use (though this
895 // should never happen)
896 return MI->getOperand(3).isDead();
897 case AMDGPU::V_ADD_CO_U32_e64:
898 // TODO: Should we check use_empty instead?
899 return MI->getOperand(1).isDead();
900 default:
901 break;
902 }
903
905 return false;
906
907 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
908
909 const SIInstrInfo *TII = ST.getInstrInfo();
911 return !TII->isLegalMUBUFImmOffset(FullOffset);
912
913 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
915}
916
918 int FrameIdx,
919 int64_t Offset) const {
920 MachineBasicBlock::iterator Ins = MBB->begin();
921 DebugLoc DL; // Defaults to "unknown"
922
923 if (Ins != MBB->end())
924 DL = Ins->getDebugLoc();
925
926 MachineFunction *MF = MBB->getParent();
927 const SIInstrInfo *TII = ST.getInstrInfo();
928 MachineRegisterInfo &MRI = MF->getRegInfo();
929 unsigned MovOpc =
930 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
931
932 Register BaseReg = MRI.createVirtualRegister(
933 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
934 : &AMDGPU::VGPR_32RegClass);
935
936 if (Offset == 0) {
937 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
938 .addFrameIndex(FrameIdx);
939 return BaseReg;
940 }
941
942 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
943
944 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
945 ? &AMDGPU::SReg_32_XM0RegClass
946 : &AMDGPU::VGPR_32RegClass);
947
948 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
949 .addImm(Offset);
950 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
951 .addFrameIndex(FrameIdx);
952
953 if (ST.hasFlatScratchEnabled()) {
954 // FIXME: Make sure scc isn't live in.
955 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
956 .addReg(OffsetReg, RegState::Kill)
957 .addReg(FIReg)
958 .setOperandDead(3); // scc
959 return BaseReg;
960 }
961
962 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
963 .addReg(OffsetReg, RegState::Kill)
964 .addReg(FIReg)
965 .addImm(0); // clamp bit
966
967 return BaseReg;
968}
969
971 int64_t Offset) const {
// Rewrites the frame-index operand of MI so that it refers to BaseReg, and
// folds Offset into the immediate that the instruction already carries.
//
// NOTE(review): the first line of this signature (function name and the
// declarations of MI and BaseReg) is missing from this listing -- confirm
// against the original file.
972 const SIInstrInfo *TII = ST.getInstrInfo();
973
// VALU adds that use the frame index as a plain source operand are handled in
// the switch; memory instructions fall through to the code after it.
974 switch (MI.getOpcode()) {
975 case AMDGPU::V_ADD_U32_e32:
976 case AMDGPU::V_ADD_CO_U32_e32: {
// The frame index may sit in either source operand; after the swap FIOp
// points at it and ImmOp at the other source.
977 MachineOperand *FIOp = &MI.getOperand(2);
978 MachineOperand *ImmOp = &MI.getOperand(1);
979 if (!FIOp->isFI())
980 std::swap(FIOp, ImmOp);
981
// The other source is not an immediate: there is nothing to fold (Offset
// must be 0), so just substitute the register and re-legalize the operands.
982 if (!ImmOp->isImm()) {
983 assert(Offset == 0);
984 FIOp->ChangeToRegister(BaseReg, false);
985 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
986 return;
987 }
988
989 int64_t TotalOffset = ImmOp->getImm() + Offset;
// A combined offset of zero turns the add into a COPY of BaseReg: every
// operand with index greater than 1 is removed, and operand 1 becomes the
// copy source.
990 if (TotalOffset == 0) {
991 MI.setDesc(TII->get(AMDGPU::COPY));
992 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
993 MI.removeOperand(I);
994
995 MI.getOperand(1).ChangeToRegister(BaseReg, false);
996 return;
997 }
998
999 ImmOp->setImm(TotalOffset);
1000
1001 MachineBasicBlock *MBB = MI.getParent();
1002 MachineFunction *MF = MBB->getParent();
1003 MachineRegisterInfo &MRI = MF->getRegInfo();
1004
1005 // FIXME: materializeFrameBaseRegister does not know the register class of
1006 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
1007 // Emit a copy so we have a legal operand and hope the register coalescer
1008 // can clean it up.
1009 if (isSGPRReg(MRI, BaseReg)) {
1010 Register BaseRegVGPR =
1011 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1012 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1013 .addReg(BaseReg);
1014 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1015 } else {
1016 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1017 }
1018 return;
1019 }
1020 case AMDGPU::V_ADD_U32_e64:
1021 case AMDGPU::V_ADD_CO_U32_e64: {
// In the e64 forms the two sources directly follow the explicit defs.
1022 int Src0Idx = MI.getNumExplicitDefs();
1023 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1024 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1025 if (!FIOp->isFI())
1026 std::swap(FIOp, ImmOp);
1027
1028 if (!ImmOp->isImm()) {
1029 FIOp->ChangeToRegister(BaseReg, false);
1030 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1031 return;
1032 }
1033
1034 int64_t TotalOffset = ImmOp->getImm() + Offset;
// Same zero-offset simplification as in the e32 case above; otherwise
// replace the frame index with BaseReg and store the combined immediate.
1035 if (TotalOffset == 0) {
1036 MI.setDesc(TII->get(AMDGPU::COPY));
1037
1038 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1039 MI.removeOperand(I);
1040
1041 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1042 } else {
1043 FIOp->ChangeToRegister(BaseReg, false);
1044 ImmOp->setImm(TotalOffset);
1045 }
1046
1047 return;
1048 }
1049 default:
1050 break;
1051 }
1052
// From here on MI is expected to be a MUBUF or flat scratch memory
// instruction (asserted below).
1053 bool IsFlat = TII->isFLATScratch(MI);
1054
1055#ifndef NDEBUG
// Debug-only sanity check: MI must not carry more than one frame-index
// operand.
1056 // FIXME: Is it possible to be storing a frame index to itself?
1057 bool SeenFI = false;
1058 for (const MachineOperand &MO: MI.operands()) {
1059 if (MO.isFI()) {
1060 if (SeenFI)
1061 llvm_unreachable("should not see multiple frame indices");
1062
1063 SeenFI = true;
1064 }
1065 }
1066#endif
1067
// The frame index is looked up in saddr for flat scratch instructions and in
// vaddr otherwise.
1068 MachineOperand *FIOp =
1069 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1070 : AMDGPU::OpName::vaddr);
1071
1072 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1073 int64_t NewOffset = OffsetOp->getImm() + Offset;
1074
1075 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1076 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1077
// Flat scratch: swap in BaseReg and store the combined offset.
// NOTE(review): one line of the assert below is missing from this listing.
1078 if (IsFlat) {
1079 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1081 "offset should be legal");
1082 FIOp->ChangeToRegister(BaseReg, false);
1083 OffsetOp->setImm(NewOffset);
1084 return;
1085 }
1086
// MUBUF: in asserts builds, soffset is checked to be the immediate 0.
1087#ifndef NDEBUG
1088 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1089 assert(SOffset->isImm() && SOffset->getImm() == 0);
1090#endif
1091
1092 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1093
1094 FIOp->ChangeToRegister(BaseReg, false);
1095 OffsetOp->setImm(NewOffset);
1096}
1097
1099 Register BaseReg,
1100 int64_t Offset) const {
// Reports whether Offset can be folded into MI.
//
// NOTE(review): several lines of this function are missing from this listing:
// the first line of the signature (function name, declaration of MI), the
// conditions guarding the "return false" and the MUBUF return below, and the
// final argument line of the isLegalFLATOffset call. Confirm against the
// original file before relying on the notes here.
1101
// The e32 adds always accept the offset. The e64 adds accept it when the
// subtarget reports VOP3 literal support or when Offset is an inlinable
// integer literal.
1102 switch (MI->getOpcode()) {
1103 case AMDGPU::V_ADD_U32_e32:
1104 case AMDGPU::V_ADD_CO_U32_e32:
1105 return true;
1106 case AMDGPU::V_ADD_U32_e64:
1107 case AMDGPU::V_ADD_CO_U32_e64:
1108 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1109 default:
1110 break;
1111 }
1112
// NOTE(review): the condition for this early return is not visible here.
1114 return false;
1115
// Combine Offset with the offset already encoded in the instruction.
1116 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1117
1118 const SIInstrInfo *TII = ST.getInstrInfo();
// NOTE(review): the condition selecting the MUBUF check is not visible here;
// the remaining case is checked as a FLAT offset in the private address space.
1120 return TII->isLegalMUBUFImmOffset(NewOffset);
1121
1122 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1124}
1125
1126const TargetRegisterClass *
1128 // This is inaccurate. It depends on the instruction and address space. The
1129 // only place where we should hit this is for dealing with frame indexes /
1130 // private accesses, so this is correct in that case.
// Unconditionally answers with the 32-bit VGPR class.
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing.
1131 return &AMDGPU::VGPR_32RegClass;
1132}
1133
1134const TargetRegisterClass *
// SCC_CLASS is replaced by SReg_32; any other class RC is returned unchanged.
// NOTE(review): the line carrying the function name and the declaration of RC
// is missing from this listing.
1136 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1137}
1138
1140 const SIInstrInfo *TII) {
// Returns how many sub-registers the spill pseudo MI covers. For the
// fixed-width SI_SPILL_* pseudos this is the bit width in the opcode name
// divided by 32 (1024 -> 32, 512 -> 16, ..., 64 -> 2, 32 -> 1); the 16-bit
// V16 pseudos also count as 1. Block spills return the number of set bits in
// their mask operand. Any other opcode reaches llvm_unreachable.
//
// NOTE(review): the first line of this signature (function name and the
// declaration of MI) is missing from this listing.
1141
1142 unsigned Op = MI.getOpcode();
1143 switch (Op) {
1144 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1145 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
1146 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1147 // FIXME: This assumes the mask is statically known and not computed at
1148 // runtime. However, some ABIs may want to compute the mask dynamically and
1149 // this will need to be updated.
1150 return llvm::popcount(
1151 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1152 case AMDGPU::SI_SPILL_S1024_SAVE:
1153 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
1154 case AMDGPU::SI_SPILL_S1024_RESTORE:
1155 case AMDGPU::SI_SPILL_V1024_SAVE:
1156 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
1157 case AMDGPU::SI_SPILL_V1024_RESTORE:
1158 case AMDGPU::SI_SPILL_A1024_SAVE:
1159 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
1160 case AMDGPU::SI_SPILL_A1024_RESTORE:
1161 case AMDGPU::SI_SPILL_AV1024_SAVE:
1162 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
1163 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1164 return 32;
1165 case AMDGPU::SI_SPILL_S512_SAVE:
1166 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
1167 case AMDGPU::SI_SPILL_S512_RESTORE:
1168 case AMDGPU::SI_SPILL_V512_SAVE:
1169 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
1170 case AMDGPU::SI_SPILL_V512_RESTORE:
1171 case AMDGPU::SI_SPILL_A512_SAVE:
1172 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
1173 case AMDGPU::SI_SPILL_A512_RESTORE:
1174 case AMDGPU::SI_SPILL_AV512_SAVE:
1175 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
1176 case AMDGPU::SI_SPILL_AV512_RESTORE:
1177 return 16;
1178 case AMDGPU::SI_SPILL_S384_SAVE:
1179 case AMDGPU::SI_SPILL_S384_RESTORE:
1180 case AMDGPU::SI_SPILL_V384_SAVE:
1181 case AMDGPU::SI_SPILL_V384_RESTORE:
1182 case AMDGPU::SI_SPILL_A384_SAVE:
1183 case AMDGPU::SI_SPILL_A384_RESTORE:
1184 case AMDGPU::SI_SPILL_AV384_SAVE:
1185 case AMDGPU::SI_SPILL_AV384_RESTORE:
1186 return 12;
1187 case AMDGPU::SI_SPILL_S352_SAVE:
1188 case AMDGPU::SI_SPILL_S352_RESTORE:
1189 case AMDGPU::SI_SPILL_V352_SAVE:
1190 case AMDGPU::SI_SPILL_V352_RESTORE:
1191 case AMDGPU::SI_SPILL_A352_SAVE:
1192 case AMDGPU::SI_SPILL_A352_RESTORE:
1193 case AMDGPU::SI_SPILL_AV352_SAVE:
1194 case AMDGPU::SI_SPILL_AV352_RESTORE:
1195 return 11;
1196 case AMDGPU::SI_SPILL_S320_SAVE:
1197 case AMDGPU::SI_SPILL_S320_RESTORE:
1198 case AMDGPU::SI_SPILL_V320_SAVE:
1199 case AMDGPU::SI_SPILL_V320_RESTORE:
1200 case AMDGPU::SI_SPILL_A320_SAVE:
1201 case AMDGPU::SI_SPILL_A320_RESTORE:
1202 case AMDGPU::SI_SPILL_AV320_SAVE:
1203 case AMDGPU::SI_SPILL_AV320_RESTORE:
1204 return 10;
1205 case AMDGPU::SI_SPILL_S288_SAVE:
1206 case AMDGPU::SI_SPILL_S288_RESTORE:
1207 case AMDGPU::SI_SPILL_V288_SAVE:
1208 case AMDGPU::SI_SPILL_V288_RESTORE:
1209 case AMDGPU::SI_SPILL_A288_SAVE:
1210 case AMDGPU::SI_SPILL_A288_RESTORE:
1211 case AMDGPU::SI_SPILL_AV288_SAVE:
1212 case AMDGPU::SI_SPILL_AV288_RESTORE:
1213 return 9;
1214 case AMDGPU::SI_SPILL_S256_SAVE:
1215 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
1216 case AMDGPU::SI_SPILL_S256_RESTORE:
1217 case AMDGPU::SI_SPILL_V256_SAVE:
1218 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
1219 case AMDGPU::SI_SPILL_V256_RESTORE:
1220 case AMDGPU::SI_SPILL_A256_SAVE:
1221 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
1222 case AMDGPU::SI_SPILL_A256_RESTORE:
1223 case AMDGPU::SI_SPILL_AV256_SAVE:
1224 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
1225 case AMDGPU::SI_SPILL_AV256_RESTORE:
1226 return 8;
1227 case AMDGPU::SI_SPILL_S224_SAVE:
1228 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
1229 case AMDGPU::SI_SPILL_S224_RESTORE:
1230 case AMDGPU::SI_SPILL_V224_SAVE:
1231 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
1232 case AMDGPU::SI_SPILL_V224_RESTORE:
1233 case AMDGPU::SI_SPILL_A224_SAVE:
1234 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
1235 case AMDGPU::SI_SPILL_A224_RESTORE:
1236 case AMDGPU::SI_SPILL_AV224_SAVE:
1237 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
1238 case AMDGPU::SI_SPILL_AV224_RESTORE:
1239 return 7;
1240 case AMDGPU::SI_SPILL_S192_SAVE:
1241 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
1242 case AMDGPU::SI_SPILL_S192_RESTORE:
1243 case AMDGPU::SI_SPILL_V192_SAVE:
1244 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
1245 case AMDGPU::SI_SPILL_V192_RESTORE:
1246 case AMDGPU::SI_SPILL_A192_SAVE:
1247 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
1248 case AMDGPU::SI_SPILL_A192_RESTORE:
1249 case AMDGPU::SI_SPILL_AV192_SAVE:
1250 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
1251 case AMDGPU::SI_SPILL_AV192_RESTORE:
1252 return 6;
1253 case AMDGPU::SI_SPILL_S160_SAVE:
1254 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
1255 case AMDGPU::SI_SPILL_S160_RESTORE:
1256 case AMDGPU::SI_SPILL_V160_SAVE:
1257 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
1258 case AMDGPU::SI_SPILL_V160_RESTORE:
1259 case AMDGPU::SI_SPILL_A160_SAVE:
1260 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
1261 case AMDGPU::SI_SPILL_A160_RESTORE:
1262 case AMDGPU::SI_SPILL_AV160_SAVE:
1263 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
1264 case AMDGPU::SI_SPILL_AV160_RESTORE:
1265 return 5;
1266 case AMDGPU::SI_SPILL_S128_SAVE:
1267 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
1268 case AMDGPU::SI_SPILL_S128_RESTORE:
1269 case AMDGPU::SI_SPILL_V128_SAVE:
1270 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
1271 case AMDGPU::SI_SPILL_V128_RESTORE:
1272 case AMDGPU::SI_SPILL_A128_SAVE:
1273 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
1274 case AMDGPU::SI_SPILL_A128_RESTORE:
1275 case AMDGPU::SI_SPILL_AV128_SAVE:
1276 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
1277 case AMDGPU::SI_SPILL_AV128_RESTORE:
1278 return 4;
1279 case AMDGPU::SI_SPILL_S96_SAVE:
1280 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
1281 case AMDGPU::SI_SPILL_S96_RESTORE:
1282 case AMDGPU::SI_SPILL_V96_SAVE:
1283 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
1284 case AMDGPU::SI_SPILL_V96_RESTORE:
1285 case AMDGPU::SI_SPILL_A96_SAVE:
1286 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
1287 case AMDGPU::SI_SPILL_A96_RESTORE:
1288 case AMDGPU::SI_SPILL_AV96_SAVE:
1289 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
1290 case AMDGPU::SI_SPILL_AV96_RESTORE:
1291 return 3;
1292 case AMDGPU::SI_SPILL_S64_SAVE:
1293 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
1294 case AMDGPU::SI_SPILL_S64_RESTORE:
1295 case AMDGPU::SI_SPILL_V64_SAVE:
1296 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
1297 case AMDGPU::SI_SPILL_V64_RESTORE:
1298 case AMDGPU::SI_SPILL_A64_SAVE:
1299 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
1300 case AMDGPU::SI_SPILL_A64_RESTORE:
1301 case AMDGPU::SI_SPILL_AV64_SAVE:
1302 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
1303 case AMDGPU::SI_SPILL_AV64_RESTORE:
1304 return 2;
1305 case AMDGPU::SI_SPILL_S32_SAVE:
1306 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
1307 case AMDGPU::SI_SPILL_S32_RESTORE:
1308 case AMDGPU::SI_SPILL_V32_SAVE:
1309 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
1310 case AMDGPU::SI_SPILL_V32_RESTORE:
1311 case AMDGPU::SI_SPILL_A32_SAVE:
1312 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
1313 case AMDGPU::SI_SPILL_A32_RESTORE:
1314 case AMDGPU::SI_SPILL_AV32_SAVE:
1315 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
1316 case AMDGPU::SI_SPILL_AV32_RESTORE:
1317 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1318 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1319 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1320 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1321 case AMDGPU::SI_SPILL_V16_SAVE:
1322 case AMDGPU::SI_SPILL_V16_RESTORE:
1323 return 1;
1324 default: llvm_unreachable("Invalid spill opcode");
1325 }
1326}
1327
1328static int getOffsetMUBUFStore(unsigned Opc) {
1329 switch (Opc) {
1330 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1331 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1332 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1333 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1334 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1335 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1336 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1337 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1338 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1339 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1340 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1341 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1342 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1343 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1344 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1345 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1346 default:
1347 return -1;
1348 }
1349}
1350
1351static int getOffsetMUBUFLoad(unsigned Opc) {
1352 switch (Opc) {
1353 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1354 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1355 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1356 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1357 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1358 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1359 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1360 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1361 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1362 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1363 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1364 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1365 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1366 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1367 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1368 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1369 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1370 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1371 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1372 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1373 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1374 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1375 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1376 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1377 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1378 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1379 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1380 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1381 default:
1382 return -1;
1383 }
1384}
1385
1386static int getOffenMUBUFStore(unsigned Opc) {
1387 switch (Opc) {
1388 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1389 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1390 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1391 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1392 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1393 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1394 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1395 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1396 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1397 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1398 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1399 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1400 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1401 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1402 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1403 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1404 default:
1405 return -1;
1406 }
1407}
1408
1409static int getOffenMUBUFLoad(unsigned Opc) {
1410 switch (Opc) {
1411 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1412 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1413 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1414 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1415 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1416 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1417 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1418 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1419 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1420 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1421 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1422 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1423 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1424 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1425 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1426 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1427 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1428 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1429 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1430 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1431 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1432 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1433 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1434 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1435 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1436 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1437 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1438 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1439 default:
1440 return -1;
1441 }
1442}
1443
1446 MachineBasicBlock::iterator MI, int Index, unsigned Lane,
1447 unsigned ValueReg, bool IsKill, bool NeedsCFI) {
// Tries to service the spill or restore at MI with a register-to-register
// move instead of a memory access. The partner register is whatever
// getVGPRToAGPRSpill reports for (Index, Lane). If there is none, an empty
// MachineInstrBuilder is returned, so callers test getInstr() on the result.
//
// NOTE(review): the first lines of this signature (return type, name and the
// declarations of ST and MBB) and three body lines -- including the definition
// of MFI -- are missing from this listing.
1448 MachineFunction *MF = MBB.getParent();
1450 const SIInstrInfo *TII = ST.getInstrInfo();
1451 const SIFrameLowering *TFL = ST.getFrameLowering();
1452
1453 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1454
1455 if (Reg == AMDGPU::NoRegister)
1456 return MachineInstrBuilder();
1457
1458 bool IsStore = MI->mayStore();
1459 MachineRegisterInfo &MRI = MF->getRegInfo();
1460 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1461
// A store moves ValueReg into the partner register; a load moves the partner
// register back into ValueReg.
1462 unsigned Dst = IsStore ? Reg : ValueReg;
1463 unsigned Src = IsStore ? ValueReg : Reg;
1464 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1465 const DebugLoc &DL = MI->getDebugLoc();
1466 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1467 // Spiller during regalloc may restore a spilled register to its superclass.
1468 // It could result in AGPR spills restored to VGPRs or the other way around,
1469 // making the src and dst with identical regclasses at this point. It just
1470 // needs a copy in such cases.
1471 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1472 .addReg(Src, getKillRegState(IsKill));
1474 if (NeedsCFI)
1475 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1476 return CopyMIB;
1477 }
// The two registers are on different sides here. V_ACCVGPR_WRITE is used when
// Dst is the side that is not a VGPR (presumably an AGPR), and
// V_ACCVGPR_READ when Dst is the VGPR.
1478 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1479 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1480
1481 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1482 .addReg(Src, getKillRegState(IsKill));
1484 if (NeedsCFI)
1485 TFL->buildCFIForVRegToVRegSpill(MBB, MI, DL, Src, Dst);
1486 return MIB;
1487}
1488
1489// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1490// need to handle the case where an SGPR may need to be spilled while spilling.
//
// Returns false when the opcode of MI has no replacement (the lookup yields
// -1). Returns true after either the register-to-register spill path produced
// an instruction, or a new memory instruction with immediate offset Offset was
// built in front of MI.
//
// NOTE(review): three lines are missing from this listing: the first line of
// the signature (name, declaration of ST), the declaration of MI, and the
// second half of the LoadStoreOp initializer.
1492 MachineFrameInfo &MFI,
1494 int Index,
1495 int64_t Offset) {
1496 const SIInstrInfo *TII = ST.getInstrInfo();
1497 MachineBasicBlock *MBB = MI->getParent();
1498 const DebugLoc &DL = MI->getDebugLoc();
1499 bool IsStore = MI->mayStore();
1500
1501 unsigned Opc = MI->getOpcode();
1502 int LoadStoreOp = IsStore ?
1504 if (LoadStoreOp == -1)
1505 return false;
1506
// First try to satisfy the access with a register move (lane 0, no kill flag,
// no CFI); if that emitted an instruction nothing else is needed.
1507 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1508 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false, false)
1509 .getInstr())
1510 return true;
1511
// Otherwise build the replacement instruction: it reuses vdata, srsrc and
// soffset from MI, takes Offset as its immediate offset, and keeps MI's
// memory operands.
1512 MachineInstrBuilder NewMI =
1513 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1514 .add(*Reg)
1515 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1516 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1517 .addImm(Offset)
1518 .addImm(0) // cpol
1519 .addImm(0) // swz
1520 .cloneMemRefs(*MI);
1521
// Carry over the vdata_in operand when MI has one.
1522 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1523 AMDGPU::OpName::vdata_in);
1524 if (VDataIn)
1525 NewMI.add(*VDataIn);
1526 return true;
1527}
1528
1530 unsigned LoadStoreOp,
1531 unsigned EltSize) {
// Picks the flat scratch load/store opcode that moves EltSize bytes, keeping
// the direction (load or store) and the addressing form of LoadStoreOp.
// EltSize must be 4, 8, 12 or 16; any other value reaches llvm_unreachable.
//
// NOTE(review): the first line of this signature (return type, name and the
// declaration of TII) is missing from this listing.
1532 bool IsStore = TII->get(LoadStoreOp).mayStore();
1533 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
// UseST is set when the incoming opcode has neither a vaddr nor an saddr
// operand.
1534 bool UseST =
1535 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1536
1537 // Handle block load/store first.
1538 if (TII->isBlockLoadStore(LoadStoreOp))
1539 return LoadStoreOp;
1540
// Start from the SADDR form of the right width ...
1541 switch (EltSize) {
1542 case 4:
1543 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1544 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1545 break;
1546 case 8:
1547 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1548 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1549 break;
1550 case 12:
1551 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1552 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1553 break;
1554 case 16:
1555 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1556 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1557 break;
1558 default:
1559 llvm_unreachable("Unexpected spill load/store size!");
1560 }
1561
// ... then convert it back to the form the incoming opcode used: SV when it
// had a vaddr operand, ST when it had neither vaddr nor saddr.
1562 if (HasVAddr)
1563 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1564 else if (UseST)
1565 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1566
1567 return LoadStoreOp;
1568}
1569
1572 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1573 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1574 RegScavenger *RS, LiveRegUnits *LiveUnits, bool NeedsCFI) const {
1575 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1576
1577 MachineFunction *MF = MBB.getParent();
1578 const SIInstrInfo *TII = ST.getInstrInfo();
1579 const MachineFrameInfo &MFI = MF->getFrameInfo();
1580 const SIFrameLowering *TFL = ST.getFrameLowering();
1581 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1582
1583 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1584 bool IsStore = Desc->mayStore();
1585 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1586 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1587
1588 bool CanClobberSCC = false;
1589 bool Scavenged = false;
1590 MCRegister SOffset = ScratchOffsetReg;
1591
1592 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1593 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1594 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1595 unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1596
1597 // On targets with register tuple alignment requirements,
1598 // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
1599 // and spill the rest as a regular aligned tuple.
1600 // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1601 // will be spilt as:
1602 // SPILL_SCRATCH_DWORD $vgpr1
1603 // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
1604 // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
1605 bool IsRegMisaligned = false;
1606 if (!IsBlock && !IsAGPR && RegWidth > 4) {
1607 unsigned SpillOpcode =
1608 getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
1609 int VDataIdx =
1610 IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
1611 : 0; // Restore Ops have data reg as the first (output) operand.
1612 const TargetRegisterClass *ExpectedRC =
1613 TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
1614 if (!ExpectedRC->contains(ValueReg)) {
1615 unsigned NumRegs = std::min(AMDGPU::getRegBitWidth(*ExpectedRC) / 4, 4u);
1616 unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
1617 const TargetRegisterClass *MatchRC =
1618 getMatchingSuperRegClass(RC, ExpectedRC, SubIdx);
1619 if (!MatchRC || !MatchRC->contains(ValueReg))
1620 IsRegMisaligned = true;
1621 }
1622 }
1623 // The first sub-register will be spilled as a 32-bit value
1624 if (IsRegMisaligned)
1625 RegWidth -= 4u;
1626 // Always use 4 byte operations for AGPRs because we need to scavenge
1627 // a temporary VGPR.
1628 // If we're using a block operation, the element should be the whole block.
1629 unsigned EltSize = IsBlock ? RegWidth
1630 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1631 : 4u;
1632 unsigned NumSubRegs = RegWidth / EltSize;
1633 unsigned Size = NumSubRegs * EltSize;
1634 unsigned RemSize = RegWidth - Size;
1635 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1636 // An additional sub-register is needed to spill the misaligned component.
1637 if (IsRegMisaligned)
1638 NumSubRegs += 1;
1639 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1640 int64_t MaterializedOffset = Offset;
1641
1642 // Maxoffset is the starting offset for the last chunk to be spilled.
1643 // In case of non-zero remainder element, max offset will be the
1644 // last address(offset + Size) after spilling all the EltSize chunks.
1645 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1646 int64_t ScratchOffsetRegDelta = 0;
1647 int64_t AdditionalCFIOffset = 0;
1648
1649 if (IsFlat && EltSize > 4) {
1650 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1651 Desc = &TII->get(LoadStoreOp);
1652 }
1653
1654 Align Alignment = MFI.getObjectAlign(Index);
1655 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1656
1657 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1658 "unexpected VGPR spill offset");
1659
1660 // Track a VGPR to use for a constant offset we need to materialize.
1661 Register TmpOffsetVGPR;
1662
1663 // Track a VGPR to use as an intermediate value.
1664 Register TmpIntermediateVGPR;
1665 bool UseVGPROffset = false;
1666
1667 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1668 // combination.
1669 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1670 int64_t VOffset) {
1671 // We are using a VGPR offset
1672 if (IsFlat && SGPRBase) {
1673 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1674 // SGPR, so perform the add as vector.
1675 // We don't need a base SGPR in the kernel.
1676
1677 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1678 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1679 .addReg(SGPRBase)
1680 .addImm(VOffset)
1681 .addImm(0); // clamp
1682 } else {
1683 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1684 .addReg(SGPRBase);
1685 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1686 .addImm(VOffset)
1687 .addReg(TmpOffsetVGPR);
1688 }
1689 } else {
1690 assert(TmpOffsetVGPR);
1691 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1692 .addImm(VOffset);
1693 }
1694 };
1695
1696 bool IsOffsetLegal =
1697 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1699 : TII->isLegalMUBUFImmOffset(MaxOffset);
1700 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1701 SOffset = MCRegister();
1702
1703 // We don't have access to the register scavenger if this function is called
1704 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1705 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1706 // entry.
1707 if (RS) {
1708 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1709
1710 // Piggy back on the liveness scan we just did see if SCC is dead.
1711 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1712 } else if (LiveUnits) {
1713 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1714 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1715 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1716 SOffset = Reg;
1717 break;
1718 }
1719 }
1720 }
1721
1722 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1723 SOffset = Register();
1724
1725 if (!SOffset) {
1726 UseVGPROffset = true;
1727
1728 if (RS) {
1729 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1730 } else {
1731 assert(LiveUnits);
1732 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1733 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1734 TmpOffsetVGPR = Reg;
1735 break;
1736 }
1737 }
1738 }
1739
1740 assert(TmpOffsetVGPR);
1741 } else if (!SOffset && CanClobberSCC) {
1742 // There are no free SGPRs, and we are in the process of spilling
1743 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1744 // on SI/CI and on VI it is true until we implement spilling using scalar
1745 // stores), we have no way to free up an SGPR. Our solution here is to
1746 // add the offset directly to the ScratchOffset or StackPtrOffset
1747 // register, and then subtract the offset after the spill to return the
1748 // register to its original value.
1749
1750 // TODO: If we don't have to do an emergency stack slot spill, converting
1751 // to use the VGPR offset is fewer instructions.
1752 if (!ScratchOffsetReg)
1753 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1754 SOffset = ScratchOffsetReg;
1755 ScratchOffsetRegDelta = Offset;
1756 } else {
1757 Scavenged = true;
1758 }
1759
1760 AdditionalCFIOffset = Offset;
1761 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1762 // we can simplify the adjustment of Offset here to just scale with
1763 // WavefrontSize.
1764 if (!IsFlat && !UseVGPROffset)
1765 Offset *= ST.getWavefrontSize();
1766
1767 if (!UseVGPROffset && !SOffset)
1768 report_fatal_error("could not scavenge SGPR to spill in entry function");
1769
1770 if (UseVGPROffset) {
1771 // We are using a VGPR offset
1772 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1773 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1774 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1775 } else {
1776 assert(Offset != 0);
1777 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1778 .addReg(ScratchOffsetReg)
1779 .addImm(Offset);
1780 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1781 }
1782
1783 Offset = 0;
1784 }
1785
1786 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1787 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1788 && "Unexpected vaddr for flat scratch with a FI operand");
1789
1790 if (UseVGPROffset) {
1791 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1792 } else {
1793 assert(ST.hasFlatScratchSTMode());
1794 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1795 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1796 }
1797
1798 Desc = &TII->get(LoadStoreOp);
1799 }
1800
1801 // Save a copy of the original element size before it is potentially changed
1802 // for misaligned tuples.
1803 unsigned OrigEltSize = EltSize;
1804 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1805 ++i, RegOffset += EltSize) {
1806 if (IsRegMisaligned) {
1807 if (i == 0) {
1808 // For misaligned register tuples, spill only the first sub-reg in the
1809 // first iteration.
1810 EltSize = 4u;
1811 } else {
1812 // The misaligned register was spilt. Now the rest of the tuple is
1813 // properly aligned.
1814 IsRegMisaligned = false;
1815 EltSize = OrigEltSize;
1816 }
1817 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1818 }
1819 if (i == NumSubRegs) {
1820 EltSize = RemSize;
1821 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1822 }
1823 Desc = &TII->get(LoadStoreOp);
1824
1825 if (!IsFlat && UseVGPROffset) {
1826 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1827 : getOffenMUBUFLoad(LoadStoreOp);
1828 Desc = &TII->get(NewLoadStoreOp);
1829 }
1830
1831 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1832 // If we are spilling an AGPR beyond the range of the memory instruction
1833 // offset and need to use a VGPR offset, we ideally have at least 2
1834 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1835 // recycle the VGPR used for the offset which requires resetting after
1836 // each subregister.
1837
1838 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1839 }
1840
1841 unsigned NumRegs = EltSize / 4;
1842 Register SubReg = e == 1
1843 ? ValueReg
1844 : Register(getSubReg(ValueReg,
1845 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1846
1847 RegState SOffsetRegState = {};
1848 RegState SrcDstRegState = getDefRegState(!IsStore);
1849 const bool IsLastSubReg = i + 1 == e;
1850 const bool IsFirstSubReg = i == 0;
1851 if (IsLastSubReg) {
1852 SOffsetRegState |= getKillRegState(Scavenged);
1853 // The last implicit use carries the "Kill" flag.
1854 SrcDstRegState |= getKillRegState(IsKill);
1855 }
1856
1857 // Make sure the whole register is defined if there are undef components by
1858 // adding an implicit def of the super-reg on the first instruction.
1859 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1860 bool NeedSuperRegImpOperand = e > 1;
1861
1862 // Remaining element size to spill into memory after some parts of it
1863 // spilled into either AGPRs or VGPRs.
1864 unsigned RemEltSize = EltSize;
1865
1866 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1867 // starting from the last lane. In case if a register cannot be completely
1868 // spilled into another register that will ensure its alignment does not
1869 // change. For targets with VGPR alignment requirement this is important
1870 // in case of flat scratch usage as we might get a scratch_load or
1871 // scratch_store of an unaligned register otherwise.
1872 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1873 LaneE = RegOffset / 4;
1874 Lane >= LaneE; --Lane) {
1875 bool IsSubReg = e > 1 || EltSize > 4;
1876 Register Sub = IsSubReg
1877 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1878 : ValueReg;
1879 auto MIB =
1880 spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill, NeedsCFI);
1881 if (!MIB.getInstr())
1882 break;
1883 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1884 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1885 NeedSuperRegDef = false;
1886 }
1887 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1888 NeedSuperRegImpOperand = true;
1889 RegState State = SrcDstRegState;
1890 if (!IsLastSubReg || (Lane != LaneE))
1891 State &= ~RegState::Kill;
1892 if (!IsFirstSubReg || (Lane != LaneS))
1893 State &= ~RegState::Define;
1894 MIB.addReg(ValueReg, RegState::Implicit | State);
1895 }
1896 RemEltSize -= 4;
1897 }
1898
1899 if (!RemEltSize) // Fully spilled into AGPRs.
1900 continue;
1901
1902 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1903 assert(IsFlat && EltSize > 4);
1904
1905 unsigned NumRegs = RemEltSize / 4;
1906 SubReg = Register(getSubReg(ValueReg,
1907 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1908 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1909 Desc = &TII->get(Opc);
1910 }
1911
1912 unsigned FinalReg = SubReg;
1913
1914 if (IsAGPR) {
1915 assert(EltSize == 4);
1916
1917 if (!TmpIntermediateVGPR) {
1918 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1919 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1920 }
1921 if (IsStore) {
1922 auto AccRead = BuildMI(MBB, MI, DL,
1923 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1924 TmpIntermediateVGPR)
1925 .addReg(SubReg, getKillRegState(IsKill));
1926 if (NeedSuperRegDef)
1927 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1928 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1929 AccRead.addReg(ValueReg, RegState::Implicit);
1931 }
1932 SubReg = TmpIntermediateVGPR;
1933 } else if (UseVGPROffset) {
1934 if (!TmpOffsetVGPR) {
1935 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1936 MI, false, 0);
1937 RS->setRegUsed(TmpOffsetVGPR);
1938 }
1939 }
1940
1941 Register FinalValueReg = ValueReg;
1942 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1943 // If we are loading a 16-bit value with SRAMECC enabled we need a temp
1944 // 32-bit VGPR to load and extract 16-bits into the final register.
1945 ValueReg =
1946 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1947 SubReg = ValueReg;
1948 IsKill = false;
1949 }
1950
1951 // Create the MMO, additionally setting the MOThreadPrivate flag as scratch
1952 // memory used for spills will not be used outside the thread.
1953 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1955 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1956 commonAlignment(Alignment, RegOffset));
1957
1958 auto MIB =
1959 BuildMI(MBB, MI, DL, *Desc)
1960 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1961
1962 if (UseVGPROffset) {
1963 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1964 // intermediate accvgpr_write.
1965 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1966 }
1967
1968 if (!IsFlat)
1969 MIB.addReg(FuncInfo->getScratchRSrcReg());
1970
1971 if (SOffset == AMDGPU::NoRegister) {
1972 if (!IsFlat) {
1973 if (UseVGPROffset && ScratchOffsetReg) {
1974 MIB.addReg(ScratchOffsetReg);
1975 } else {
1976 assert(FuncInfo->isBottomOfStack());
1977 MIB.addImm(0);
1978 }
1979 }
1980 } else {
1981 MIB.addReg(SOffset, SOffsetRegState);
1982 }
1983
1984 MIB.addImm(Offset + RegOffset);
1985
1986 bool LastUse = MMO->getFlags() & MOLastUse;
1987 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1988
1989 if (!IsFlat)
1990 MIB.addImm(0); // swz
1991 MIB.addMemOperand(NewMMO);
1992
1993 if (FinalValueReg != ValueReg) {
1994 // Extract 16-bit from the loaded 32-bit value.
1995 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1996 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1997 .addReg(FinalValueReg, getDefRegState(true))
1998 .addImm(0)
1999 .addReg(ValueReg, getKillRegState(true))
2000 .addImm(0);
2001 ValueReg = FinalValueReg;
2002 }
2003
2004 if (IsStore && NeedsCFI) {
2005 if (TII->isBlockLoadStore(LoadStoreOp)) {
2006 assert(RegOffset == 0 &&
2007 "expected whole register block to be treated as single element");
2009 } else {
2011 MBB, MI, DebugLoc(), SubReg,
2012 (Offset + RegOffset) * ST.getWavefrontSize() + AdditionalCFIOffset);
2013 }
2014 }
2015
2016 if (!IsAGPR && NeedSuperRegDef)
2017 MIB.addReg(ValueReg, RegState::ImplicitDefine);
2018
2019 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
2020 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
2021 FinalReg)
2022 .addReg(TmpIntermediateVGPR, RegState::Kill);
2024 }
2025
2026 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
2027 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
2028 if (NeedSuperRegImpOperand &&
2029 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
2030 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
2031 if (PartialReloadCopy)
2032 MIB.addReg(ValueReg, RegState::Implicit);
2033 }
2034
2035 // The epilog restore of a wwm-scratch register can cause undesired
2036 // optimization during machine-cp post PrologEpilogInserter if the same
2037 // register was assigned for return value ABI lowering with a COPY
2038 // instruction. As given below, with the epilog reload, the earlier COPY
2039 // appeared to be dead during machine-cp.
2040 // ...
2041 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
2042 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
2043 // ...
2044 // Epilog block:
2045 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
2046 // ...
2047 // WWM spill restore to preserve the inactive lanes of v0.
2048 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
2049 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
2050 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
2051 // ...
2052 // SI_RETURN implicit $vgpr0
2053 // ...
2054 // To fix it, mark the same reg as a tied op for such restore instructions
2055 // so that it marks a usage for the preceding COPY.
2056 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
2057 MI->readsRegister(SubReg, this)) {
2058 MIB.addReg(SubReg, RegState::Implicit);
2059 MIB->tieOperands(0, MIB->getNumOperands() - 1);
2060 }
2061
2062 // If we're building a block load, we should add artificial uses for the
2063 // CSR VGPRs that are *not* being transferred. This is because liveness
2064 // analysis is not aware of the mask, so we need to somehow inform it that
2065 // those registers are not available before the load and they should not be
2066 // scavenged.
2067 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
2068 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
2069 }
2070
2071 if (ScratchOffsetRegDelta != 0) {
2072 // Subtract the offset we added to the ScratchOffset register.
2073 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
2074 .addReg(SOffset)
2075 .addImm(-ScratchOffsetRegDelta);
2076 }
2077}
2078
2080 Register BlockReg) const {
// Adds implicit use operands to MIB for VGPRs in the 32-register block that
// starts at sub0 of BlockReg. A register gets an implicit use when its bit in
// the mask returned by getMaskForVGPRBlockOps(BlockReg) is clear and
// isCalleeSavedPhysReg() reports it as callee-saved.
2081 const MachineFunction *MF = MIB->getMF();
2082 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2083 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2084 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// The scan starts at offset 1, so the base VGPR itself (mask bit 0) is never
// given an implicit use here.
// NOTE(review): `1 << RegOffset` shifts a signed int; at RegOffset == 31 this
// reaches the sign bit - consider `1u` if this is ever touched.
2085 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
2086 if (!(Mask & (1 << RegOffset)) &&
2087 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
2088 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
2089}
2090
2093 Register BlockReg,
2094 int64_t Offset) const {
// Emits CFI for each of the 32 VGPRs in the block that starts at sub0 of
// BlockReg, inserting before MBBI in MBB:
//  - mask bit set: the register is asserted to be callee-saved and gets a
//    VGPR-to-VMEM spill CFI at (Offset + RegOffset) * wavefront size;
//  - mask bit clear but callee-saved: gets a "same value" CFI (see FIXME).
// Registers that are neither in the mask nor callee-saved get no CFI.
2095 const MachineFunction *MF = MBB.getParent();
2096 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
2097 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2098 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// Unlike the implicit-use helper for block loads, this loop starts at offset
// 0, so the base VGPR is included.
2099 for (unsigned RegOffset = 0; RegOffset < 32; ++RegOffset) {
2100 Register VGPR = BaseVGPR + RegOffset;
2101 if (Mask & (1 << RegOffset)) {
2102 assert(isCalleeSavedPhysReg(VGPR, *MF));
2103 ST.getFrameLowering()->buildCFIForVGPRToVMEMSpill(
2104 MBB, MBBI, DebugLoc(), VGPR,
2105 (Offset + RegOffset) * ST.getWavefrontSize());
2106 } else if (isCalleeSavedPhysReg(VGPR, *MF)) {
2107 // FIXME: This is a workaround for the fact that FrameLowering's
2108 // emitPrologueEntryCFI considers the block load to clobber all registers
2109 // in the block.
// BaseVGPR + RegOffset is the same register as the local VGPR above.
2110 ST.getFrameLowering()->buildCFIForSameValue(MBB, MBBI, DebugLoc(),
2111 BaseVGPR + RegOffset);
2112 }
2113 }
2114}
2115
2117 int Offset, bool IsLoad,
2118 bool IsKill) const {
// Loads or stores SB.TmpVGPR from/to the stack slot Index through
// buildSpillLoadStore. Offset is scaled by SB.EltSize to form the
// instruction offset. IsKill is only forwarded on the store path; the load
// path always passes false.
2119 // Load/store VGPR
2120 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// This helper must not be used on slots tagged as SGPRSpill stack IDs.
2121 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
2122
// Fixed objects are addressed off the base register when the function has a
// base pointer; everything else uses the frame register.
2123 Register FrameReg =
2124 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
2125 ? getBaseRegister()
2126 : getFrameRegister(SB.MF);
2127
2128 Align Alignment = FrameInfo.getObjectAlign(Index);
2132 SB.EltSize, Alignment);
2133
// The opcode is the flat-scratch SADDR form when flat scratch is enabled,
// otherwise the MUBUF OFFSET form.
2134 if (IsLoad) {
2135 unsigned Opc = ST.hasFlatScratchEnabled()
2136 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2137 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2138 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
2139 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2140 } else {
2141 unsigned Opc = ST.hasFlatScratchEnabled()
2142 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2143 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2144 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2145 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2146 // This only ever adds one VGPR spill
2147 SB.MFI.addToSpilledVGPRs(1);
2148 }
2149}
2150
2152 RegScavenger *RS, SlotIndexes *Indexes,
2153 LiveIntervals *LIS, bool OnlyToVGPR,
2154 bool SpillToPhysVGPRLane, bool NeedsCFI) const {
// Expands the SGPR spill pseudo at MI for frame index Index.
//
// If VGPR lanes are recorded for Index, each 32-bit sub-register is written
// to its lane with SI_SPILL_S32_TO_VGPR. Otherwise the sub-registers are
// packed into SB.TmpVGPR and written out via SB.readWriteTmpVGPR.
//
// Returns false, without emitting anything, when OnlyToVGPR is set and no
// VGPR lanes are available. Otherwise MI is erased and true is returned.
// Indexes, when non-null, is kept in sync with the new instructions.
2155 assert(!MI->getOperand(0).isUndef() &&
2156 "undef spill should have been deleted earlier");
2157
2158 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2159
2160 ArrayRef<SpilledReg> VGPRSpills =
2161 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2163 bool SpillToVGPR = !VGPRSpills.empty();
2164 if (OnlyToVGPR && !SpillToVGPR)
2165 return false;
2166
2167 const SIFrameLowering *TFL = ST.getFrameLowering();
2168
// The stack-pointer and frame-offset registers may only be spilled to VGPR
// lanes, never through the memory path below.
2169 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2170 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2171
2172 if (SpillToVGPR) {
2173
2174 // Since stack slot coloring pass is trying to optimize SGPR spills,
2175 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
2176 // spills of different sizes. This accounts for number of VGPR lanes allotted
2177 // equal to the largest SGPR being spilled in them.
2178 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2179 "Num of SGPRs spilled should be less than or equal to num of "
2180 "the VGPR lanes.");
2181
2182 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2183 Register SubReg =
2184 SB.NumSubRegs == 1
2185 ? SB.SuperReg
2186 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2187 SpilledReg Spill = VGPRSpills[i];
2188
2189 bool IsFirstSubreg = i == 0;
2190 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the write of the last sub-register may carry the kill flag.
2191 bool UseKill = SB.IsKill && IsLastSubreg;
2192
2193
2194 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2195 // spill to this specific vgpr in the first basic block.
2196 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2197 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2198 .addReg(SubReg, getKillRegState(UseKill))
2199 .addImm(Spill.Lane)
2200 .addReg(Spill.VGPR);
2201
// For the return address register a single CFI covering all lanes is
// emitted after the last sub-register, described as PC_REG. Any other
// register gets one CFI per sub-register.
2202 MachineInstr *CFI = nullptr;
2203 if (NeedsCFI) {
2204 if (SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2205 if (i == e - 1)
2206 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(),
2207 AMDGPU::PC_REG, VGPRSpills);
2208 } else {
2209 CFI = TFL->buildCFIForSGPRToVGPRSpill(*SB.MBB, MI, DebugLoc(), SubReg,
2210 Spill.VGPR, Spill.Lane);
2211 }
2212 }
2213
// The first emitted instruction takes over MI's slot index; the rest are
// inserted as new entries.
2214 if (Indexes) {
2215 if (IsFirstSubreg)
2216 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2217 else
2218 Indexes->insertMachineInstrInMaps(*MIB);
2219
2220 if (CFI)
2221 Indexes->insertMachineInstrInMaps(*CFI);
2222 }
2223
2224 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2225 // We may be spilling a super-register which is only partially defined,
2226 // and need to ensure later spills think the value is defined.
2227 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2228 }
2229
// Only the first and last lane writes carry an implicit use of the
// super-register.
2230 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2231 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
2232
2233 // FIXME: Since this spills to another register instead of an actual
2234 // frame index, we should delete the frame index when all references to
2235 // it are fixed.
2236 }
2237 } else {
2238 SB.prepare();
2239
2240 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2241 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2242
2243 // Per VGPR helper data
2244 auto PVD = SB.getPerVGPRData();
2245
// Each outer iteration fills SB.TmpVGPR with up to PVD.PerVGPR
// sub-registers and then stores it.
2246 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each batch reads TmpVGPR as undef; the flag is
// cleared after that first write.
2247 RegState TmpVGPRFlags = RegState::Undef;
2248
2249 // Write sub registers into the VGPR
2250 for (unsigned i = Offset * PVD.PerVGPR,
2251 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2252 i < e; ++i) {
2253 Register SubReg =
2254 SB.NumSubRegs == 1
2255 ? SB.SuperReg
2256 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2257
2258 MachineInstrBuilder WriteLane =
2259 BuildMI(*SB.MBB, MI, SB.DL,
2260 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2261 .addReg(SubReg, SubKillState)
2262 .addImm(i % PVD.PerVGPR)
2263 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2264 TmpVGPRFlags = {};
2265
2266 if (Indexes) {
2267 if (i == 0)
2268 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2269 else
2270 Indexes->insertMachineInstrInMaps(*WriteLane);
2271 }
2272
2273 // There could be undef components of a spilled super register.
2274 // TODO: Can we detect this and skip the spill?
2275 if (SB.NumSubRegs > 1) {
2276 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2277 RegState SuperKillState = {};
2278 if (i + 1 == SB.NumSubRegs)
2279 SuperKillState |= getKillRegState(SB.IsKill);
2280 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2281 }
2282 }
2283
2284 // Write out VGPR
2285 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2286
2287 // TODO: Implement CFI for SpillToVMEM for all scenarios.
// On this path CFI is only produced for the return address register. The
// offset is the slot's object offset plus Offset * SB.EltSize, scaled by
// the wavefront size.
2288 MachineInstr *CFI = nullptr;
2289 if (NeedsCFI && SB.SuperReg == SB.TRI.getReturnAddressReg(SB.MF)) {
2290 int64_t CFIOffset = (Offset * SB.EltSize +
2291 SB.MF.getFrameInfo().getObjectOffset(Index)) *
2292 ST.getWavefrontSize();
2293 CFI = TFL->buildCFIForSGPRToVMEMSpill(*SB.MBB, MI, DebugLoc(),
2294 AMDGPU::PC_REG, CFIOffset);
2295 }
2296 if (Indexes && CFI)
2297 Indexes->insertMachineInstrInMaps(*CFI);
2298 }
2299
2300 SB.restore();
2301 }
2302
2303 MI->eraseFromParent();
2305
2306 if (LIS)
2308
2309 return true;
2310}
2311
2313 RegScavenger *RS, SlotIndexes *Indexes,
2314 LiveIntervals *LIS, bool OnlyToVGPR,
2315 bool SpillToPhysVGPRLane) const {
// Expands the SGPR restore pseudo at MI for frame index Index.
//
// If VGPR lanes are recorded for Index, each 32-bit sub-register is read back
// from its lane with SI_RESTORE_S32_FROM_VGPR. Otherwise SB.TmpVGPR is
// reloaded via SB.readWriteTmpVGPR and the lanes are unpacked from it.
//
// Returns false, without emitting anything, when OnlyToVGPR is set and no
// VGPR lanes are available. Otherwise MI is erased and true is returned.
2316 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2317
2318 ArrayRef<SpilledReg> VGPRSpills =
2319 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2321 bool SpillToVGPR = !VGPRSpills.empty();
2322 if (OnlyToVGPR && !SpillToVGPR)
2323 return false;
2324
2325 if (SpillToVGPR) {
2326 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2327 Register SubReg =
2328 SB.NumSubRegs == 1
2329 ? SB.SuperReg
2330 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2331
2332 SpilledReg Spill = VGPRSpills[i];
2333 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2334 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2335 .addReg(Spill.VGPR)
2336 .addImm(Spill.Lane);
2337 if (SB.NumSubRegs > 1 && i == 0)
// The last emitted instruction takes over MI's slot index (the spill side
// uses the first one instead).
2339 if (Indexes) {
2340 if (i == e - 1)
2341 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2342 else
2343 Indexes->insertMachineInstrInMaps(*MIB);
2344 }
2345 }
2346 } else {
2347 SB.prepare();
2348
2349 // Per VGPR helper data
2350 auto PVD = SB.getPerVGPRData();
2351
2352 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2353 // Load in VGPR data
2354 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2355
2356 // Unpack lanes
2357 for (unsigned i = Offset * PVD.PerVGPR,
2358 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2359 i < e; ++i) {
2360 Register SubReg =
2361 SB.NumSubRegs == 1
2362 ? SB.SuperReg
2363 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2364
// TmpVGPR is killed by the last lane read of each batch.
// NOTE(review): the lane immediate here is `i`, whereas the spill side
// writes lane `i % PVD.PerVGPR`. The two agree only while
// i < PVD.PerVGPR, i.e. when one temp VGPR holds every sub-register -
// confirm that PVD.NumVGPRs can never exceed 1.
2365 bool LastSubReg = (i + 1 == e);
2366 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2367 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2368 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2369 .addImm(i);
2370 if (SB.NumSubRegs > 1 && i == 0)
2372 if (Indexes) {
2373 if (i == e - 1)
2374 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2375 else
2376 Indexes->insertMachineInstrInMaps(*MIB);
2377 }
2378 }
2379 }
2380
2381 SB.restore();
2382 }
2383
2384 MI->eraseFromParent();
2385
2386 if (LIS)
2388
2389 return true;
2390}
2391
2393 MachineBasicBlock &RestoreMBB,
2394 Register SGPR, RegScavenger *RS) const {
// Saves SGPR into lanes of SB.TmpVGPR before MI using V_WRITELANE_B32, then
// emits the matching V_READLANE_B32 restores at the end of RestoreMBB. The
// temp VGPR is never written to or read from memory in this function.
// Always returns false.
2395 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2396 RS);
2397 SB.prepare();
2398 // Generate the spill of SGPR to SB.TmpVGPR.
// The sub-register operand itself only carries the kill flag when the
// register is not split.
2399 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2400 auto PVD = SB.getPerVGPRData();
2401 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// The first lane write of each batch reads TmpVGPR as undef.
2402 RegState TmpVGPRFlags = RegState::Undef;
2403 // Write sub registers into the VGPR
2404 for (unsigned i = Offset * PVD.PerVGPR,
2405 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2406 i < e; ++i) {
2407 Register SubReg =
2408 SB.NumSubRegs == 1
2409 ? SB.SuperReg
2410 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2411
2412 MachineInstrBuilder WriteLane =
2413 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2414 SB.TmpVGPR)
2415 .addReg(SubReg, SubKillState)
2416 .addImm(i % PVD.PerVGPR)
2417 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2418 TmpVGPRFlags = {};
2419 // There could be undef components of a spilled super register.
2420 // TODO: Can we detect this and skip the spill?
2421 if (SB.NumSubRegs > 1) {
2422 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2423 RegState SuperKillState = {};
2424 if (i + 1 == SB.NumSubRegs)
2425 SuperKillState |= getKillRegState(SB.IsKill);
2426 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2427 }
2428 }
2429 // Don't need to write VGPR out.
2430 }
2431
2432 // Restore clobbered registers in the specified restore block.
// From here on the builder inserts at the end of RestoreMBB instead of
// before the original MI.
2433 MI = RestoreMBB.end();
2434 SB.setMI(&RestoreMBB, MI);
2435 // Generate the restore of SGPR from SB.TmpVGPR.
2436 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2437 // Don't need to load VGPR in.
2438 // Unpack lanes
2439 for (unsigned i = Offset * PVD.PerVGPR,
2440 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2441 i < e; ++i) {
2442 Register SubReg =
2443 SB.NumSubRegs == 1
2444 ? SB.SuperReg
2445 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2446
2447 assert(SubReg.isPhysical());
// NOTE(review): the lane immediate here is `i`, whereas the write above
// uses `i % PVD.PerVGPR`. They agree only while i < PVD.PerVGPR -
// confirm that a single temp VGPR always suffices.
2448 bool LastSubReg = (i + 1 == e);
2449 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2450 SubReg)
2451 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2452 .addImm(i);
2453 if (SB.NumSubRegs > 1 && i == 0)
2455 }
2456 }
2457 SB.restore();
2458
2460 return false;
2461}
2462
2463 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2464 /// a VGPR and the stack slot can be safely eliminated when all other users are
2465 /// handled.
2468 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
// Dispatches on MI's opcode: save pseudos go to spillSGPR, restore pseudos
// go to restoreSGPR. Both are called with OnlyToVGPR = true, so they return
// false instead of falling back to a memory spill. Any other opcode is
// unreachable.
2469 bool NeedsCFI = false;
2470 switch (MI->getOpcode()) {
// The *_CFI_SAVE variants differ from the plain saves only in requesting CFI;
// they set the flag and fall through to the shared save handling.
2471 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2472 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2473 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2474 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2475 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2476 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2477 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2478 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2479 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2480 case AMDGPU::SI_SPILL_S32_CFI_SAVE:
2481 NeedsCFI = true;
2482 [[fallthrough]];
2483 case AMDGPU::SI_SPILL_S1024_SAVE:
2484 case AMDGPU::SI_SPILL_S512_SAVE:
2485 case AMDGPU::SI_SPILL_S384_SAVE:
2486 case AMDGPU::SI_SPILL_S352_SAVE:
2487 case AMDGPU::SI_SPILL_S320_SAVE:
2488 case AMDGPU::SI_SPILL_S288_SAVE:
2489 case AMDGPU::SI_SPILL_S256_SAVE:
2490 case AMDGPU::SI_SPILL_S224_SAVE:
2491 case AMDGPU::SI_SPILL_S192_SAVE:
2492 case AMDGPU::SI_SPILL_S160_SAVE:
2493 case AMDGPU::SI_SPILL_S128_SAVE:
2494 case AMDGPU::SI_SPILL_S96_SAVE:
2495 case AMDGPU::SI_SPILL_S64_SAVE:
2496 case AMDGPU::SI_SPILL_S32_SAVE:
2497 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane,
2498 NeedsCFI);
2499 case AMDGPU::SI_SPILL_S1024_RESTORE:
2500 case AMDGPU::SI_SPILL_S512_RESTORE:
2501 case AMDGPU::SI_SPILL_S384_RESTORE:
2502 case AMDGPU::SI_SPILL_S352_RESTORE:
2503 case AMDGPU::SI_SPILL_S320_RESTORE:
2504 case AMDGPU::SI_SPILL_S288_RESTORE:
2505 case AMDGPU::SI_SPILL_S256_RESTORE:
2506 case AMDGPU::SI_SPILL_S224_RESTORE:
2507 case AMDGPU::SI_SPILL_S192_RESTORE:
2508 case AMDGPU::SI_SPILL_S160_RESTORE:
2509 case AMDGPU::SI_SPILL_S128_RESTORE:
2510 case AMDGPU::SI_SPILL_S96_RESTORE:
2511 case AMDGPU::SI_SPILL_S64_RESTORE:
2512 case AMDGPU::SI_SPILL_S32_RESTORE:
2513 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2514 default:
2515 llvm_unreachable("not an SGPR spill instruction");
2516 }
2517}
2518
2520 int SPAdj, unsigned FIOperandNum,
2521 RegScavenger *RS) const {
2522 MachineFunction *MF = MI->getMF();
2523 MachineBasicBlock *MBB = MI->getParent();
2525 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2526 const SIInstrInfo *TII = ST.getInstrInfo();
2527 const DebugLoc &DL = MI->getDebugLoc();
2528
2529 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2530
2532 "unreserved scratch RSRC register");
2533
2534 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2535 int Index = MI->getOperand(FIOperandNum).getIndex();
2536
2537 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2538 ? getBaseRegister()
2539 : getFrameRegister(*MF);
2540
2541 bool NeedsCFI = false;
2542
2543 switch (MI->getOpcode()) {
2544 // SGPR register spill
2545 case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
2546 case AMDGPU::SI_SPILL_S512_CFI_SAVE:
2547 case AMDGPU::SI_SPILL_S256_CFI_SAVE:
2548 case AMDGPU::SI_SPILL_S224_CFI_SAVE:
2549 case AMDGPU::SI_SPILL_S192_CFI_SAVE:
2550 case AMDGPU::SI_SPILL_S160_CFI_SAVE:
2551 case AMDGPU::SI_SPILL_S128_CFI_SAVE:
2552 case AMDGPU::SI_SPILL_S96_CFI_SAVE:
2553 case AMDGPU::SI_SPILL_S64_CFI_SAVE:
2554 case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
2555 NeedsCFI = true;
2556 [[fallthrough]];
2557 }
2558 case AMDGPU::SI_SPILL_S1024_SAVE:
2559 case AMDGPU::SI_SPILL_S512_SAVE:
2560 case AMDGPU::SI_SPILL_S384_SAVE:
2561 case AMDGPU::SI_SPILL_S352_SAVE:
2562 case AMDGPU::SI_SPILL_S320_SAVE:
2563 case AMDGPU::SI_SPILL_S288_SAVE:
2564 case AMDGPU::SI_SPILL_S256_SAVE:
2565 case AMDGPU::SI_SPILL_S224_SAVE:
2566 case AMDGPU::SI_SPILL_S192_SAVE:
2567 case AMDGPU::SI_SPILL_S160_SAVE:
2568 case AMDGPU::SI_SPILL_S128_SAVE:
2569 case AMDGPU::SI_SPILL_S96_SAVE:
2570 case AMDGPU::SI_SPILL_S64_SAVE:
2571 case AMDGPU::SI_SPILL_S32_SAVE: {
2572 return spillSGPR(MI, Index, RS, nullptr, nullptr, false, false, NeedsCFI);
2573 }
2574
2575 // SGPR register restore
2576 case AMDGPU::SI_SPILL_S1024_RESTORE:
2577 case AMDGPU::SI_SPILL_S512_RESTORE:
2578 case AMDGPU::SI_SPILL_S384_RESTORE:
2579 case AMDGPU::SI_SPILL_S352_RESTORE:
2580 case AMDGPU::SI_SPILL_S320_RESTORE:
2581 case AMDGPU::SI_SPILL_S288_RESTORE:
2582 case AMDGPU::SI_SPILL_S256_RESTORE:
2583 case AMDGPU::SI_SPILL_S224_RESTORE:
2584 case AMDGPU::SI_SPILL_S192_RESTORE:
2585 case AMDGPU::SI_SPILL_S160_RESTORE:
2586 case AMDGPU::SI_SPILL_S128_RESTORE:
2587 case AMDGPU::SI_SPILL_S96_RESTORE:
2588 case AMDGPU::SI_SPILL_S64_RESTORE:
2589 case AMDGPU::SI_SPILL_S32_RESTORE: {
2590 return restoreSGPR(MI, Index, RS);
2591 }
2592
2593 // VGPR register spill
2594 case AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE:
2595 case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
2596 case AMDGPU::SI_SPILL_V512_CFI_SAVE:
2597 case AMDGPU::SI_SPILL_V256_CFI_SAVE:
2598 case AMDGPU::SI_SPILL_V224_CFI_SAVE:
2599 case AMDGPU::SI_SPILL_V192_CFI_SAVE:
2600 case AMDGPU::SI_SPILL_V160_CFI_SAVE:
2601 case AMDGPU::SI_SPILL_V128_CFI_SAVE:
2602 case AMDGPU::SI_SPILL_V96_CFI_SAVE:
2603 case AMDGPU::SI_SPILL_V64_CFI_SAVE:
2604 case AMDGPU::SI_SPILL_V32_CFI_SAVE:
2605 case AMDGPU::SI_SPILL_A1024_CFI_SAVE:
2606 case AMDGPU::SI_SPILL_A512_CFI_SAVE:
2607 case AMDGPU::SI_SPILL_A256_CFI_SAVE:
2608 case AMDGPU::SI_SPILL_A224_CFI_SAVE:
2609 case AMDGPU::SI_SPILL_A192_CFI_SAVE:
2610 case AMDGPU::SI_SPILL_A160_CFI_SAVE:
2611 case AMDGPU::SI_SPILL_A128_CFI_SAVE:
2612 case AMDGPU::SI_SPILL_A96_CFI_SAVE:
2613 case AMDGPU::SI_SPILL_A64_CFI_SAVE:
2614 case AMDGPU::SI_SPILL_A32_CFI_SAVE:
2615 case AMDGPU::SI_SPILL_AV1024_CFI_SAVE:
2616 case AMDGPU::SI_SPILL_AV512_CFI_SAVE:
2617 case AMDGPU::SI_SPILL_AV256_CFI_SAVE:
2618 case AMDGPU::SI_SPILL_AV224_CFI_SAVE:
2619 case AMDGPU::SI_SPILL_AV192_CFI_SAVE:
2620 case AMDGPU::SI_SPILL_AV160_CFI_SAVE:
2621 case AMDGPU::SI_SPILL_AV128_CFI_SAVE:
2622 case AMDGPU::SI_SPILL_AV96_CFI_SAVE:
2623 case AMDGPU::SI_SPILL_AV64_CFI_SAVE:
2624 case AMDGPU::SI_SPILL_AV32_CFI_SAVE:
2625 NeedsCFI = true;
2626 [[fallthrough]];
2627 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
2628 case AMDGPU::SI_SPILL_V1024_SAVE:
2629 case AMDGPU::SI_SPILL_V512_SAVE:
2630 case AMDGPU::SI_SPILL_V384_SAVE:
2631 case AMDGPU::SI_SPILL_V352_SAVE:
2632 case AMDGPU::SI_SPILL_V320_SAVE:
2633 case AMDGPU::SI_SPILL_V288_SAVE:
2634 case AMDGPU::SI_SPILL_V256_SAVE:
2635 case AMDGPU::SI_SPILL_V224_SAVE:
2636 case AMDGPU::SI_SPILL_V192_SAVE:
2637 case AMDGPU::SI_SPILL_V160_SAVE:
2638 case AMDGPU::SI_SPILL_V128_SAVE:
2639 case AMDGPU::SI_SPILL_V96_SAVE:
2640 case AMDGPU::SI_SPILL_V64_SAVE:
2641 case AMDGPU::SI_SPILL_V32_SAVE:
2642 case AMDGPU::SI_SPILL_V16_SAVE:
2643 case AMDGPU::SI_SPILL_A1024_SAVE:
2644 case AMDGPU::SI_SPILL_A512_SAVE:
2645 case AMDGPU::SI_SPILL_A384_SAVE:
2646 case AMDGPU::SI_SPILL_A352_SAVE:
2647 case AMDGPU::SI_SPILL_A320_SAVE:
2648 case AMDGPU::SI_SPILL_A288_SAVE:
2649 case AMDGPU::SI_SPILL_A256_SAVE:
2650 case AMDGPU::SI_SPILL_A224_SAVE:
2651 case AMDGPU::SI_SPILL_A192_SAVE:
2652 case AMDGPU::SI_SPILL_A160_SAVE:
2653 case AMDGPU::SI_SPILL_A128_SAVE:
2654 case AMDGPU::SI_SPILL_A96_SAVE:
2655 case AMDGPU::SI_SPILL_A64_SAVE:
2656 case AMDGPU::SI_SPILL_A32_SAVE:
2657 case AMDGPU::SI_SPILL_AV1024_SAVE:
2658 case AMDGPU::SI_SPILL_AV512_SAVE:
2659 case AMDGPU::SI_SPILL_AV384_SAVE:
2660 case AMDGPU::SI_SPILL_AV352_SAVE:
2661 case AMDGPU::SI_SPILL_AV320_SAVE:
2662 case AMDGPU::SI_SPILL_AV288_SAVE:
2663 case AMDGPU::SI_SPILL_AV256_SAVE:
2664 case AMDGPU::SI_SPILL_AV224_SAVE:
2665 case AMDGPU::SI_SPILL_AV192_SAVE:
2666 case AMDGPU::SI_SPILL_AV160_SAVE:
2667 case AMDGPU::SI_SPILL_AV128_SAVE:
2668 case AMDGPU::SI_SPILL_AV96_SAVE:
2669 case AMDGPU::SI_SPILL_AV64_SAVE:
2670 case AMDGPU::SI_SPILL_AV32_SAVE:
2671 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2672 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2673 assert(
2674 MI->getOpcode() != AMDGPU::SI_BLOCK_SPILL_V1024_SAVE &&
2675 "block spill does not currenty support spilling non-CSR registers");
2676
2677 if (MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE)
2678 // Put mask into M0.
2679 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2680 AMDGPU::M0)
2681 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2682
2683 const MachineOperand *VData = TII->getNamedOperand(*MI,
2684 AMDGPU::OpName::vdata);
2685 if (VData->isUndef()) {
2686 MI->eraseFromParent();
2687 return true;
2688 }
2689
2690 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2691 MFI->getStackPtrOffsetReg());
2692
2693 unsigned Opc;
2694 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2695 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2696 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2697 } else {
2698 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_CFI_SAVE
2699 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2700 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2701 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2702 }
2703
2704 auto *MBB = MI->getParent();
2705 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2706 if (IsWWMRegSpill) {
2707 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2708 RS->isRegUsed(AMDGPU::SCC));
2709 }
2711 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2712 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2713 *MI->memoperands_begin(), RS, nullptr, NeedsCFI);
2715 if (IsWWMRegSpill)
2716 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2717
2718 MI->eraseFromParent();
2719 return true;
2720 }
2721 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2722 // Put mask into M0.
2723 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2724 AMDGPU::M0)
2725 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2726 [[fallthrough]];
2727 }
2728 case AMDGPU::SI_SPILL_V16_RESTORE:
2729 case AMDGPU::SI_SPILL_V32_RESTORE:
2730 case AMDGPU::SI_SPILL_V64_RESTORE:
2731 case AMDGPU::SI_SPILL_V96_RESTORE:
2732 case AMDGPU::SI_SPILL_V128_RESTORE:
2733 case AMDGPU::SI_SPILL_V160_RESTORE:
2734 case AMDGPU::SI_SPILL_V192_RESTORE:
2735 case AMDGPU::SI_SPILL_V224_RESTORE:
2736 case AMDGPU::SI_SPILL_V256_RESTORE:
2737 case AMDGPU::SI_SPILL_V288_RESTORE:
2738 case AMDGPU::SI_SPILL_V320_RESTORE:
2739 case AMDGPU::SI_SPILL_V352_RESTORE:
2740 case AMDGPU::SI_SPILL_V384_RESTORE:
2741 case AMDGPU::SI_SPILL_V512_RESTORE:
2742 case AMDGPU::SI_SPILL_V1024_RESTORE:
2743 case AMDGPU::SI_SPILL_A32_RESTORE:
2744 case AMDGPU::SI_SPILL_A64_RESTORE:
2745 case AMDGPU::SI_SPILL_A96_RESTORE:
2746 case AMDGPU::SI_SPILL_A128_RESTORE:
2747 case AMDGPU::SI_SPILL_A160_RESTORE:
2748 case AMDGPU::SI_SPILL_A192_RESTORE:
2749 case AMDGPU::SI_SPILL_A224_RESTORE:
2750 case AMDGPU::SI_SPILL_A256_RESTORE:
2751 case AMDGPU::SI_SPILL_A288_RESTORE:
2752 case AMDGPU::SI_SPILL_A320_RESTORE:
2753 case AMDGPU::SI_SPILL_A352_RESTORE:
2754 case AMDGPU::SI_SPILL_A384_RESTORE:
2755 case AMDGPU::SI_SPILL_A512_RESTORE:
2756 case AMDGPU::SI_SPILL_A1024_RESTORE:
2757 case AMDGPU::SI_SPILL_AV32_RESTORE:
2758 case AMDGPU::SI_SPILL_AV64_RESTORE:
2759 case AMDGPU::SI_SPILL_AV96_RESTORE:
2760 case AMDGPU::SI_SPILL_AV128_RESTORE:
2761 case AMDGPU::SI_SPILL_AV160_RESTORE:
2762 case AMDGPU::SI_SPILL_AV192_RESTORE:
2763 case AMDGPU::SI_SPILL_AV224_RESTORE:
2764 case AMDGPU::SI_SPILL_AV256_RESTORE:
2765 case AMDGPU::SI_SPILL_AV288_RESTORE:
2766 case AMDGPU::SI_SPILL_AV320_RESTORE:
2767 case AMDGPU::SI_SPILL_AV352_RESTORE:
2768 case AMDGPU::SI_SPILL_AV384_RESTORE:
2769 case AMDGPU::SI_SPILL_AV512_RESTORE:
2770 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2771 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2772 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2773 const MachineOperand *VData = TII->getNamedOperand(*MI,
2774 AMDGPU::OpName::vdata);
2775 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2776 MFI->getStackPtrOffsetReg());
2777
2778 unsigned Opc;
2779 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2780 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2781 Opc = ST.d16PreservesUnusedBits()
2782 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2783 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2784 } else {
2785 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2786 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2787 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2788 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2789 }
2790
2791 auto *MBB = MI->getParent();
2792 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2793 if (IsWWMRegSpill) {
2794 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2795 RS->isRegUsed(AMDGPU::SCC));
2796 }
2797
2799 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2800 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2801 *MI->memoperands_begin(), RS);
2802
2803 if (IsWWMRegSpill)
2804 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2805
2806 MI->eraseFromParent();
2807 return true;
2808 }
2809 case AMDGPU::V_ADD_U32_e32:
2810 case AMDGPU::V_ADD_U32_e64:
2811 case AMDGPU::V_ADD_CO_U32_e32:
2812 case AMDGPU::V_ADD_CO_U32_e64: {
2813 // TODO: Handle sub, and, or.
2814 unsigned NumDefs = MI->getNumExplicitDefs();
2815 unsigned Src0Idx = NumDefs;
2816
2817 bool HasClamp = false;
2818 MachineOperand *VCCOp = nullptr;
2819
2820 switch (MI->getOpcode()) {
2821 case AMDGPU::V_ADD_U32_e32:
2822 break;
2823 case AMDGPU::V_ADD_U32_e64:
2824 HasClamp = MI->getOperand(3).getImm();
2825 break;
2826 case AMDGPU::V_ADD_CO_U32_e32:
2827 VCCOp = &MI->getOperand(3);
2828 break;
2829 case AMDGPU::V_ADD_CO_U32_e64:
2830 VCCOp = &MI->getOperand(1);
2831 HasClamp = MI->getOperand(4).getImm();
2832 break;
2833 default:
2834 break;
2835 }
2836 bool DeadVCC = !VCCOp || VCCOp->isDead();
2837 MachineOperand &DstOp = MI->getOperand(0);
2838 Register DstReg = DstOp.getReg();
2839
2840 unsigned OtherOpIdx =
2841 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2842 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2843
2844 unsigned Src1Idx = Src0Idx + 1;
2845 Register MaterializedReg = FrameReg;
2846 Register ScavengedVGPR;
2847
2848 int64_t Offset = FrameInfo.getObjectOffset(Index);
2849 // For the non-immediate case, we could fall through to the default
2850 // handling, but we do an in-place update of the result register here to
2851 // avoid scavenging another register.
2852 if (OtherOp->isImm()) {
2853 int64_t TotalOffset = OtherOp->getImm() + Offset;
2854
2855 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2856 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2857 // If we can't support a VOP3 literal in the VALU instruction, we
2858 // can't specially fold into the add.
2859 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2860 break;
2861 }
2862
2863 OtherOp->setImm(TotalOffset);
2864 Offset = 0;
2865 }
2866
2867 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2868 // We should just do an in-place update of the result register. However,
2869 // the value there may also be used by the add, in which case we need a
2870 // temporary register.
2871 //
2872 // FIXME: The scavenger is not finding the result register in the
2873 // common case where the add does not read the register.
2874
2875 ScavengedVGPR = RS->scavengeRegisterBackwards(
2876 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2877
2878 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2879 // shift.
2880 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2881 .addDef(ScavengedVGPR, RegState::Renamable)
2882 .addImm(ST.getWavefrontSizeLog2())
2883 .addReg(FrameReg);
2884 MaterializedReg = ScavengedVGPR;
2885 }
2886
2887 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2888 if (ST.hasFlatScratchEnabled() &&
2889 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2890 // We didn't need the shift above, so we have an SGPR for the frame
2891 // register, but may have a VGPR only operand.
2892 //
2893 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2894 // and use the higher constant bus restriction to avoid this copy.
2895
2896 if (!ScavengedVGPR) {
2897 ScavengedVGPR = RS->scavengeRegisterBackwards(
2898 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2899 /*SPAdj=*/0);
2900 }
2901
2902 assert(ScavengedVGPR != DstReg);
2903
2904 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2905 .addReg(MaterializedReg,
2906 getKillRegState(MaterializedReg != FrameReg));
2907 MaterializedReg = ScavengedVGPR;
2908 }
2909
2910 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2911 // is not live, we could use a scalar add + vector add instead of 2
2912 // vector adds.
2913 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2914 .addDef(DstReg, RegState::Renamable);
2915 if (NumDefs == 2)
2916 AddI32.add(MI->getOperand(1));
2917
2918 RegState MaterializedRegFlags =
2919 getKillRegState(MaterializedReg != FrameReg);
2920
2921 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2922 // If we know we have a VGPR already, it's more likely the other
2923 // operand is a legal vsrc0.
2924 AddI32
2925 .add(*OtherOp)
2926 .addReg(MaterializedReg, MaterializedRegFlags);
2927 } else {
2928 // Commute operands to avoid violating VOP2 restrictions. This will
2929 // typically happen when using scratch.
2930 AddI32
2931 .addReg(MaterializedReg, MaterializedRegFlags)
2932 .add(*OtherOp);
2933 }
2934
2935 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2936 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2937 AddI32.addImm(0); // clamp
2938
2939 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2940 AddI32.setOperandDead(3); // Dead vcc
2941
2942 MaterializedReg = DstReg;
2943
2944 OtherOp->ChangeToRegister(MaterializedReg, false);
2945 OtherOp->setIsKill(true);
2947 Offset = 0;
2948 } else if (Offset != 0) {
2949 assert(!MaterializedReg);
2951 Offset = 0;
2952 } else {
2953 if (DeadVCC && !HasClamp) {
2954 assert(Offset == 0);
2955
2956 // TODO: Losing kills and implicit operands. Just mutate to copy and
2957 // let lowerCopy deal with it?
2958 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2959 // Folded to an identity copy.
2960 MI->eraseFromParent();
2961 return true;
2962 }
2963
2964 // The immediate value should be in OtherOp
2965 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2966 MI->removeOperand(FIOperandNum);
2967
2968 unsigned NumOps = MI->getNumOperands();
2969 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2970 MI->removeOperand(I);
2971
2972 if (NumDefs == 2)
2973 MI->removeOperand(1);
2974
2975 // The code below can't deal with a mov.
2976 return true;
2977 }
2978
2979 // This folded to a constant, but we have to keep the add around for
2980 // pointless implicit defs or clamp modifier.
2981 FIOp->ChangeToImmediate(0);
2982 }
2983
2984 // Try to improve legality by commuting.
2985 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2986 std::swap(FIOp, OtherOp);
2987 std::swap(FIOperandNum, OtherOpIdx);
2988 }
2989
2990 // We need at most one mov to satisfy the operand constraints. Prefer to
2991 // move the FI operand first, as it may be a literal in a VOP3
2992 // instruction.
2993 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2994 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2995 // If commuting didn't make the operands legal, we need to materialize
2996 // in a register.
2997 // TODO: Can use SGPR on gfx10+ in some cases.
2998 if (!ScavengedVGPR) {
2999 ScavengedVGPR = RS->scavengeRegisterBackwards(
3000 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
3001 /*SPAdj=*/0);
3002 }
3003
3004 assert(ScavengedVGPR != DstReg);
3005
3006 MachineOperand &Src = MI->getOperand(SrcIdx);
3007 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
3008 .add(Src);
3009
3010 Src.ChangeToRegister(ScavengedVGPR, false);
3011 Src.setIsKill(true);
3012 break;
3013 }
3014 }
3015
3016 // Fold out add of 0 case that can appear in kernels.
3017 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
3018 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
3019 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
3020 }
3021
3022 MI->eraseFromParent();
3023 }
3024
3025 return true;
3026 }
3027 case AMDGPU::S_ADD_I32:
3028 case AMDGPU::S_ADD_U32: {
3029 // TODO: Handle s_or_b32, s_and_b32.
3030 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
3031 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
3032
3033 assert(FrameReg || MFI->isBottomOfStack());
3034
3035 MachineOperand &DstOp = MI->getOperand(0);
3036 const DebugLoc &DL = MI->getDebugLoc();
3037 Register MaterializedReg = FrameReg;
3038
3039 // Defend against live scc, which should never happen in practice.
3040 bool DeadSCC = MI->getOperand(3).isDead();
3041
3042 Register TmpReg;
3043
3044 // FIXME: Scavenger should figure out that the result register is
3045 // available. Also should do this for the v_add case.
3046 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
3047 TmpReg = DstOp.getReg();
3048
3049 if (FrameReg && !ST.hasFlatScratchEnabled()) {
3050 // FIXME: In the common case where the add does not also read its result
3051 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
3052 // available.
3053 if (!TmpReg)
3054 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3055 MI, /*RestoreAfter=*/false, 0,
3056 /*AllowSpill=*/false);
3057 if (TmpReg) {
3058 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
3059 .addDef(TmpReg, RegState::Renamable)
3060 .addReg(FrameReg)
3061 .addImm(ST.getWavefrontSizeLog2())
3062 .setOperandDead(3); // Set SCC dead
3063 }
3064 MaterializedReg = TmpReg;
3065 }
3066
3067 int64_t Offset = FrameInfo.getObjectOffset(Index);
3068
3069 // For the non-immediate case, we could fall through to the default
3070 // handling, but we do an in-place update of the result register here to
3071 // avoid scavenging another register.
3072 if (OtherOp.isImm()) {
3073 OtherOp.setImm(OtherOp.getImm() + Offset);
3074 Offset = 0;
3075
3076 if (MaterializedReg)
3077 FIOp->ChangeToRegister(MaterializedReg, false);
3078 else
3079 FIOp->ChangeToImmediate(0);
3080 } else if (MaterializedReg) {
3081 // If we can't fold the other operand, do another increment.
3082 Register DstReg = DstOp.getReg();
3083
3084 if (!TmpReg && MaterializedReg == FrameReg) {
3085 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3086 MI, /*RestoreAfter=*/false, 0,
3087 /*AllowSpill=*/false);
3088 DstReg = TmpReg;
3089 }
3090
3091 if (TmpReg) {
3092 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
3093 .addDef(DstReg, RegState::Renamable)
3094 .addReg(MaterializedReg, RegState::Kill)
3095 .add(OtherOp);
3096 if (DeadSCC)
3097 AddI32.setOperandDead(3);
3098
3099 MaterializedReg = DstReg;
3100
3101 OtherOp.ChangeToRegister(MaterializedReg, false);
3102 OtherOp.setIsKill(true);
3103 OtherOp.setIsRenamable(true);
3104 }
3106 } else {
3107 // If we don't have any other offset to apply, we can just directly
3108 // interpret the frame index as the offset.
3110 }
3111
3112 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
3113 assert(Offset == 0);
3114 MI->removeOperand(3);
3115 MI->removeOperand(OtherOpIdx);
3116 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3117 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
3118 assert(Offset == 0);
3119 MI->removeOperand(3);
3120 MI->removeOperand(FIOperandNum);
3121 MI->setDesc(
3122 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
3123 }
3124
3125 assert(!FIOp->isFI());
3126 return true;
3127 }
3128 default: {
3129 break;
3130 }
3131 }
3132
3133 int64_t Offset = FrameInfo.getObjectOffset(Index);
3134 if (ST.hasFlatScratchEnabled()) {
3135 if (TII->isFLATScratch(*MI)) {
3136 assert(
3137 (int16_t)FIOperandNum ==
3138 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
3139
3140 // The offset is always swizzled, just replace it
3141 if (FrameReg)
3142 FIOp->ChangeToRegister(FrameReg, false);
3143
3145 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
3146 int64_t NewOffset = Offset + OffsetOp->getImm();
3147 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
3149 OffsetOp->setImm(NewOffset);
3150 if (FrameReg)
3151 return false;
3152 Offset = 0;
3153 }
3154
3155 if (!Offset) {
3156 unsigned Opc = MI->getOpcode();
3157 int NewOpc = -1;
3158 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
3160 } else if (ST.hasFlatScratchSTMode()) {
3161 // On GFX10 we have ST mode to use no registers for an address.
3162 // Otherwise we need to materialize 0 into an SGPR.
3164 }
3165
3166 if (NewOpc != -1) {
3167 // removeOperand doesn't fixup tied operand indexes as it goes, so
3168 // it asserts. Untie vdst_in for now and retie them afterwards.
3169 int VDstIn =
3170 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
3171 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
3172 MI->getOperand(VDstIn).isTied();
3173 if (TiedVDst)
3174 MI->untieRegOperand(VDstIn);
3175
3176 MI->removeOperand(
3177 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
3178
3179 if (TiedVDst) {
3180 int NewVDst =
3181 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
3182 int NewVDstIn =
3183 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
3184 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
3185 MI->tieOperands(NewVDst, NewVDstIn);
3186 }
3187 MI->setDesc(TII->get(NewOpc));
3188 return false;
3189 }
3190 }
3191 }
3192
3193 if (!FrameReg) {
3195 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
3196 return false;
3197 }
3198
3199 // We need to use register here. Check if we can use an SGPR or need
3200 // a VGPR.
3201 FIOp->ChangeToRegister(AMDGPU::M0, false);
3202 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
3203
3204 if (!Offset && FrameReg && UseSGPR) {
3205 FIOp->setReg(FrameReg);
3206 return false;
3207 }
3208
3209 const TargetRegisterClass *RC =
3210 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
3211
3212 Register TmpReg =
3213 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
3214 FIOp->setReg(TmpReg);
3215 FIOp->setIsKill();
3216
3217 if ((!FrameReg || !Offset) && TmpReg) {
3218 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3219 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3220 if (FrameReg)
3221 MIB.addReg(FrameReg);
3222 else
3223 MIB.addImm(Offset);
3224
3225 return false;
3226 }
3227
3228 bool NeedSaveSCC = (RS->isRegUsed(AMDGPU::SCC) &&
3229 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr)) ||
3230 MI->readsRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3231
3232 Register TmpSReg =
3233 UseSGPR ? TmpReg
3234 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3235 MI, false, 0, !UseSGPR);
3236
3237 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
3238 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
3239 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
3240 Register TmpVGPR = RS->scavengeRegisterBackwards(
3241 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3242
3243 // Materialize the frame register.
3244 auto MIB =
3245 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3246 if (FrameReg)
3247 MIB.addReg(FrameReg);
3248 else
3249 MIB.addImm(Offset);
3250
3251 // Add the offset to the frame register.
3252 if (FrameReg && Offset)
3253 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3254 .addReg(FrameReg, RegState::Kill)
3255 .addImm(Offset);
3256
3257 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3258 .add(MI->getOperand(0)) // $vdata
3259 .addReg(TmpVGPR) // $vaddr
3260 .addImm(0) // Offset
3261 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3262 MI->eraseFromParent();
3263 return true;
3264 }
3265 report_fatal_error("Cannot scavenge register in FI elimination!");
3266 }
3267
3268 if (!TmpSReg) {
3269 // Use frame register and restore it after.
3270 TmpSReg = FrameReg;
3271 FIOp->setReg(FrameReg);
3272 FIOp->setIsKill(false);
3273 }
3274
3275 if (NeedSaveSCC) {
3276 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3277 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3278 .addReg(FrameReg)
3279 .addImm(Offset);
3280 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3281 .addReg(TmpSReg)
3282 .addImm(0);
3283 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3284 .addImm(0)
3285 .addReg(TmpSReg);
3286 } else {
3287 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3288 .addReg(FrameReg)
3289 .addImm(Offset);
3290 }
3291
3292 if (!UseSGPR)
3293 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3294 .addReg(TmpSReg, RegState::Kill);
3295
3296 if (TmpSReg == FrameReg) {
3297 // Undo frame register modification.
3298 if (NeedSaveSCC &&
3299 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3301 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3302 TmpSReg)
3303 .addReg(FrameReg)
3304 .addImm(-Offset);
3305 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3306 .addReg(TmpSReg)
3307 .addImm(0);
3308 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3309 TmpSReg)
3310 .addImm(0)
3311 .addReg(TmpSReg);
3312 } else {
3313 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3314 FrameReg)
3315 .addReg(FrameReg)
3316 .addImm(-Offset);
3317 }
3318 }
3319
3320 return false;
3321 }
3322
3323 bool IsMUBUF = TII->isMUBUF(*MI);
3324
3325 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3326 // Convert to a swizzled stack address by scaling by the wave size.
3327 // In an entry function/kernel the offset is already swizzled.
3328 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3329 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3330 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3331 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3332 ? &AMDGPU::SReg_32RegClass
3333 : &AMDGPU::VGPR_32RegClass;
3334 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3335 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3336 MI->getOpcode() == AMDGPU::S_MOV_B32;
3337 Register ResultReg =
3338 IsCopy ? MI->getOperand(0).getReg()
3339 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3340
3341 int64_t Offset = FrameInfo.getObjectOffset(Index);
3342 if (Offset == 0) {
3343 unsigned OpCode =
3344 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3345 Register TmpResultReg = ResultReg;
3346 if (IsSALU && LiveSCC) {
3347 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3348 MI, false, 0);
3349 }
3350
3351 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3352 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3353 // For V_LSHRREV, the operands are reversed (the shift count goes
3354 // first).
3355 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3356 else
3357 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3358 if (IsSALU && !LiveSCC)
3359 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3360 if (IsSALU && LiveSCC) {
3361 Register NewDest;
3362 if (IsCopy) {
3363 assert(ResultReg.isPhysical());
3364 NewDest = ResultReg;
3365 } else {
3366 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3367 Shift, false, 0);
3368 }
3369 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3370 .addReg(TmpResultReg);
3371 ResultReg = NewDest;
3372 }
3373 } else {
3375 if (!IsSALU) {
3376 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3377 nullptr) {
3378 // Reuse ResultReg in intermediate step.
3379 Register ScaledReg = ResultReg;
3380
3381 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3382 ScaledReg)
3383 .addImm(ST.getWavefrontSizeLog2())
3384 .addReg(FrameReg);
3385
3386 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3387
3388 // TODO: Fold if use instruction is another add of a constant.
3389 if (IsVOP2 ||
3390 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3391 // FIXME: This can fail
3392 MIB.addImm(Offset);
3393 MIB.addReg(ScaledReg, RegState::Kill);
3394 if (!IsVOP2)
3395 MIB.addImm(0); // clamp bit
3396 } else {
3397 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3398 "Need to reuse carry out register");
3399
3400 // Use scavenged unused carry out as offset register.
3401 Register ConstOffsetReg;
3402 if (!isWave32)
3403 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3404 else
3405 ConstOffsetReg = MIB.getReg(1);
3406
3407 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3408 ConstOffsetReg)
3409 .addImm(Offset);
3410 MIB.addReg(ConstOffsetReg, RegState::Kill);
3411 MIB.addReg(ScaledReg, RegState::Kill);
3412 MIB.addImm(0); // clamp bit
3413 }
3414 }
3415 }
3416 if (!MIB || IsSALU) {
3417 // We have to produce a carry out, and there isn't a free SGPR pair
3418 // for it. We can keep the whole computation on the SALU to avoid
3419 // clobbering an additional register at the cost of an extra mov.
3420
3421 // We may have 1 free scratch SGPR even though a carry out is
3422 // unavailable. Only one additional mov is needed.
3423 Register TmpScaledReg = IsCopy && IsSALU
3424 ? ResultReg
3425 : RS->scavengeRegisterBackwards(
3426 AMDGPU::SReg_32_XM0RegClass, MI,
3427 false, 0, /*AllowSpill=*/false);
3428 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3429 Register TmpResultReg = ScaledReg;
3430
3431 if (!LiveSCC) {
3432 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3433 .addReg(FrameReg)
3434 .addImm(ST.getWavefrontSizeLog2());
3435 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3436 .addReg(TmpResultReg, RegState::Kill)
3437 .addImm(Offset);
3438 } else {
3439 TmpResultReg = RS->scavengeRegisterBackwards(
3440 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3441
3443 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3444 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3445 TmpResultReg)
3446 .addImm(ST.getWavefrontSizeLog2())
3447 .addReg(FrameReg);
3448 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3449 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3450 .addImm(Offset);
3451 Add.addReg(ResultReg, RegState::Kill)
3452 .addReg(TmpResultReg, RegState::Kill)
3453 .addImm(0);
3454 } else
3455 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3456 } else {
3457 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3458 "offset is unsafe for v_mad_u32_u24");
3459
3460 // We start with a frame pointer with a wave space value, and
3461 // an offset in lane-space. We are materializing a lane space
3462 // value. We can either do a right shift of the frame pointer
3463 // to get to lane space, or a left shift of the offset to get
3464 // to wavespace. We can right shift after the computation to
3465 // get back to the desired per-lane value. We are using the
3466 // mad_u32_u24 primarily as an add with no carry out clobber.
3467 bool IsInlinableLiteral =
3468 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3469 if (!IsInlinableLiteral) {
3470 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3471 TmpResultReg)
3472 .addImm(Offset);
3473 }
3474
3475 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3476 TmpResultReg);
3477
3478 if (!IsInlinableLiteral) {
3479 Add.addReg(TmpResultReg, RegState::Kill);
3480 } else {
3481 // We fold the offset into mad itself if its inlinable.
3482 Add.addImm(Offset);
3483 }
3484 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3485 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3486 TmpResultReg)
3487 .addImm(ST.getWavefrontSizeLog2())
3488 .addReg(TmpResultReg);
3489 }
3490
3491 Register NewDest;
3492 if (IsCopy) {
3493 NewDest = ResultReg;
3494 } else {
3495 NewDest = RS->scavengeRegisterBackwards(
3496 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3497 /*AllowSpill=*/true);
3498 }
3499
3500 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3501 NewDest)
3502 .addReg(TmpResultReg);
3503 ResultReg = NewDest;
3504 }
3505 if (!IsSALU)
3506 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3507 .addReg(TmpResultReg, RegState::Kill);
3508 // If there were truly no free SGPRs, we need to undo everything.
3509 if (!TmpScaledReg.isValid()) {
3510 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3511 .addReg(ScaledReg, RegState::Kill)
3512 .addImm(-Offset);
3513 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3514 .addReg(FrameReg)
3515 .addImm(ST.getWavefrontSizeLog2());
3516 }
3517 }
3518 }
3519
3520 // Don't introduce an extra copy if we're just materializing in a mov.
3521 if (IsCopy) {
3522 MI->eraseFromParent();
3523 return true;
3524 }
3525 FIOp->ChangeToRegister(ResultReg, false, false, true);
3526 return false;
3527 }
3528
3529 if (IsMUBUF) {
3530 // Disable offen so we don't need a 0 vgpr base.
3531 assert(
3532 static_cast<int>(FIOperandNum) ==
3533 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3534
3535 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3536 assert((SOffset.isImm() && SOffset.getImm() == 0));
3537
3538 if (FrameReg != AMDGPU::NoRegister)
3539 SOffset.ChangeToRegister(FrameReg, false);
3540
3541 int64_t Offset = FrameInfo.getObjectOffset(Index);
3542 int64_t OldImm =
3543 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3544 int64_t NewOffset = OldImm + Offset;
3545
3546 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3547 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3548 MI->eraseFromParent();
3549 return true;
3550 }
3551 }
3552
3553 // If the offset is simply too big, don't convert to a scratch wave offset
3554 // relative index.
3555
3557 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3558 Register TmpReg =
3559 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3560 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3561 .addImm(Offset);
3562 FIOp->ChangeToRegister(TmpReg, false, false, true);
3563 }
3564
3565 return false;
3566}
3567
3571
// NOTE(review): the signature line of this function is missing from this
// listing; it is presumably SIRegisterInfo::getHWRegIndex -- confirm against
// the header before relying on the name.
// Returns the encoding value of Reg masked with
// AMDGPU::HWEncoding::REG_IDX_MASK.
3573 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3574}
3575
// NOTE(review): the signature line of this function is missing from this
// listing; it presumably takes a register class RC by reference -- confirm.
// Forwards to getRegBitWidth using the ID of RC.
3577 return getRegBitWidth(RC.getID());
3578}
3579
// Maps an exact bit width to the matching AMDGPU::VReg_<N> register class
// (the variants without the _Align2 suffix).
//
// Handled widths: 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512
// and 1024. Any other width (including 16 and 32, which are not handled
// here) yields nullptr.
//
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; presumably this is
// getAnyVGPRClassForBitWidth(unsigned BitWidth) -- confirm.
3580static const TargetRegisterClass *
3582 if (BitWidth == 64)
3583 return &AMDGPU::VReg_64RegClass;
3584 if (BitWidth == 96)
3585 return &AMDGPU::VReg_96RegClass;
3586 if (BitWidth == 128)
3587 return &AMDGPU::VReg_128RegClass;
3588 if (BitWidth == 160)
3589 return &AMDGPU::VReg_160RegClass;
3590 if (BitWidth == 192)
3591 return &AMDGPU::VReg_192RegClass;
3592 if (BitWidth == 224)
3593 return &AMDGPU::VReg_224RegClass;
3594 if (BitWidth == 256)
3595 return &AMDGPU::VReg_256RegClass;
3596 if (BitWidth == 288)
3597 return &AMDGPU::VReg_288RegClass;
3598 if (BitWidth == 320)
3599 return &AMDGPU::VReg_320RegClass;
3600 if (BitWidth == 352)
3601 return &AMDGPU::VReg_352RegClass;
3602 if (BitWidth == 384)
3603 return &AMDGPU::VReg_384RegClass;
3604 if (BitWidth == 512)
3605 return &AMDGPU::VReg_512RegClass;
3606 if (BitWidth == 1024)
3607 return &AMDGPU::VReg_1024RegClass;
3608
// No VReg class is listed for this width.
3609 return nullptr;
3610}
3611
// Maps an exact bit width to the matching AMDGPU::VReg_<N>_Align2 register
// class.
//
// Handled widths: 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512
// and 1024. Any other width yields nullptr.
//
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; the call site further down names it
// getAlignedVGPRClassForBitWidth(BitWidth).
3612static const TargetRegisterClass *
3614 if (BitWidth == 64)
3615 return &AMDGPU::VReg_64_Align2RegClass;
3616 if (BitWidth == 96)
3617 return &AMDGPU::VReg_96_Align2RegClass;
3618 if (BitWidth == 128)
3619 return &AMDGPU::VReg_128_Align2RegClass;
3620 if (BitWidth == 160)
3621 return &AMDGPU::VReg_160_Align2RegClass;
3622 if (BitWidth == 192)
3623 return &AMDGPU::VReg_192_Align2RegClass;
3624 if (BitWidth == 224)
3625 return &AMDGPU::VReg_224_Align2RegClass;
3626 if (BitWidth == 256)
3627 return &AMDGPU::VReg_256_Align2RegClass;
3628 if (BitWidth == 288)
3629 return &AMDGPU::VReg_288_Align2RegClass;
3630 if (BitWidth == 320)
3631 return &AMDGPU::VReg_320_Align2RegClass;
3632 if (BitWidth == 352)
3633 return &AMDGPU::VReg_352_Align2RegClass;
3634 if (BitWidth == 384)
3635 return &AMDGPU::VReg_384_Align2RegClass;
3636 if (BitWidth == 512)
3637 return &AMDGPU::VReg_512_Align2RegClass;
3638 if (BitWidth == 1024)
3639 return &AMDGPU::VReg_1024_Align2RegClass;
3640
// No aligned VReg class is listed for this width.
3641 return nullptr;
3642}
3643
// Returns the VGPR register class for a given bit width.
//
// Widths 1, 16 and 32 map directly to VReg_1, VGPR_16 and VGPR_32. For any
// other width the result depends on ST.needsAlignedVGPRs(): when it is true
// the lookup is delegated to getAlignedVGPRClassForBitWidth.
//
// NOTE(review): both the signature line and the false arm of the ternary are
// missing from this listing. The false arm presumably delegates to the
// unaligned VReg lookup defined above -- confirm against the real source.
3644const TargetRegisterClass *
3646 if (BitWidth == 1)
3647 return &AMDGPU::VReg_1RegClass;
3648 if (BitWidth == 16)
3649 return &AMDGPU::VGPR_16RegClass;
3650 if (BitWidth == 32)
3651 return &AMDGPU::VGPR_32RegClass;
3652 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3654}
3655
// Returns a "Lo256" VGPR register class large enough to hold BitWidth bits.
//
// Unlike the exact-match lookups elsewhere in this file, every comparison
// here is `<=`, so a width that falls between two listed sizes selects the
// next larger class (e.g. any width up to 32 selects VGPR_32_Lo256, widths
// 33..64 select VReg_64_Lo256_Align2). All classes above 32 bits are the
// _Align2 variants. Widths greater than 1024 yield nullptr.
//
// NOTE(review): the signature line is missing from this listing; presumably
// this is SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth -- confirm.
3656const TargetRegisterClass *
3658 if (BitWidth <= 32)
3659 return &AMDGPU::VGPR_32_Lo256RegClass;
3660 if (BitWidth <= 64)
3661 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3662 if (BitWidth <= 96)
3663 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3664 if (BitWidth <= 128)
3665 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3666 if (BitWidth <= 160)
3667 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3668 if (BitWidth <= 192)
3669 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3670 if (BitWidth <= 224)
3671 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3672 if (BitWidth <= 256)
3673 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3674 if (BitWidth <= 288)
3675 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3676 if (BitWidth <= 320)
3677 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3678 if (BitWidth <= 352)
3679 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3680 if (BitWidth <= 384)
3681 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3682 if (BitWidth <= 512)
3683 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3684 if (BitWidth <= 1024)
3685 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3686
// Wider than the largest listed class.
3687 return nullptr;
3688}
3689
// Maps an exact bit width to the matching AMDGPU::AReg_<N> register class
// (the variants without the _Align2 suffix).
//
// Handled widths: 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512
// and 1024. Any other width (including 16 and 32, which are not handled
// here) yields nullptr.
//
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; presumably this is
// getAnyAGPRClassForBitWidth(unsigned BitWidth) -- confirm.
3690static const TargetRegisterClass *
3692 if (BitWidth == 64)
3693 return &AMDGPU::AReg_64RegClass;
3694 if (BitWidth == 96)
3695 return &AMDGPU::AReg_96RegClass;
3696 if (BitWidth == 128)
3697 return &AMDGPU::AReg_128RegClass;
3698 if (BitWidth == 160)
3699 return &AMDGPU::AReg_160RegClass;
3700 if (BitWidth == 192)
3701 return &AMDGPU::AReg_192RegClass;
3702 if (BitWidth == 224)
3703 return &AMDGPU::AReg_224RegClass;
3704 if (BitWidth == 256)
3705 return &AMDGPU::AReg_256RegClass;
3706 if (BitWidth == 288)
3707 return &AMDGPU::AReg_288RegClass;
3708 if (BitWidth == 320)
3709 return &AMDGPU::AReg_320RegClass;
3710 if (BitWidth == 352)
3711 return &AMDGPU::AReg_352RegClass;
3712 if (BitWidth == 384)
3713 return &AMDGPU::AReg_384RegClass;
3714 if (BitWidth == 512)
3715 return &AMDGPU::AReg_512RegClass;
3716 if (BitWidth == 1024)
3717 return &AMDGPU::AReg_1024RegClass;
3718
// No AReg class is listed for this width.
3719 return nullptr;
3720}
3721
// Maps an exact bit width to the matching AMDGPU::AReg_<N>_Align2 register
// class.
//
// Handled widths: 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 512
// and 1024. Any other width yields nullptr.
//
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; the call site further down names it
// getAlignedAGPRClassForBitWidth(BitWidth).
3722static const TargetRegisterClass *
3724 if (BitWidth == 64)
3725 return &AMDGPU::AReg_64_Align2RegClass;
3726 if (BitWidth == 96)
3727 return &AMDGPU::AReg_96_Align2RegClass;
3728 if (BitWidth == 128)
3729 return &AMDGPU::AReg_128_Align2RegClass;
3730 if (BitWidth == 160)
3731 return &AMDGPU::AReg_160_Align2RegClass;
3732 if (BitWidth == 192)
3733 return &AMDGPU::AReg_192_Align2RegClass;
3734 if (BitWidth == 224)
3735 return &AMDGPU::AReg_224_Align2RegClass;
3736 if (BitWidth == 256)
3737 return &AMDGPU::AReg_256_Align2RegClass;
3738 if (BitWidth == 288)
3739 return &AMDGPU::AReg_288_Align2RegClass;
3740 if (BitWidth == 320)
3741 return &AMDGPU::AReg_320_Align2RegClass;
3742 if (BitWidth == 352)
3743 return &AMDGPU::AReg_352_Align2RegClass;
3744 if (BitWidth == 384)
3745 return &AMDGPU::AReg_384_Align2RegClass;
3746 if (BitWidth == 512)
3747 return &AMDGPU::AReg_512_Align2RegClass;
3748 if (BitWidth == 1024)
3749 return &AMDGPU::AReg_1024_Align2RegClass;
3750
// No aligned AReg class is listed for this width.
3751 return nullptr;
3752}
3753
3754const TargetRegisterClass *
3756 if (BitWidth == 16)
3757 return &AMDGPU::AGPR_LO16RegClass;
3758 if (BitWidth == 32)
3759 return &AMDGPU::AGPR_32RegClass;
3760 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3762}
3763
3764static const TargetRegisterClass *
3766 if (BitWidth == 64)
3767 return &AMDGPU::AV_64RegClass;
3768 if (BitWidth == 96)
3769 return &AMDGPU::AV_96RegClass;
3770 if (BitWidth == 128)
3771 return &AMDGPU::AV_128RegClass;
3772 if (BitWidth == 160)
3773 return &AMDGPU::AV_160RegClass;
3774 if (BitWidth == 192)
3775 return &AMDGPU::AV_192RegClass;
3776 if (BitWidth == 224)
3777 return &AMDGPU::AV_224RegClass;
3778 if (BitWidth == 256)
3779 return &AMDGPU::AV_256RegClass;
3780 if (BitWidth == 288)
3781 return &AMDGPU::AV_288RegClass;
3782 if (BitWidth == 320)
3783 return &AMDGPU::AV_320RegClass;
3784 if (BitWidth == 352)
3785 return &AMDGPU::AV_352RegClass;
3786 if (BitWidth == 384)
3787 return &AMDGPU::AV_384RegClass;
3788 if (BitWidth == 512)
3789 return &AMDGPU::AV_512RegClass;
3790 if (BitWidth == 1024)
3791 return &AMDGPU::AV_1024RegClass;
3792
3793 return nullptr;
3794}
3795
3796static const TargetRegisterClass *
3798 if (BitWidth == 64)
3799 return &AMDGPU::AV_64_Align2RegClass;
3800 if (BitWidth == 96)
3801 return &AMDGPU::AV_96_Align2RegClass;
3802 if (BitWidth == 128)
3803 return &AMDGPU::AV_128_Align2RegClass;
3804 if (BitWidth == 160)
3805 return &AMDGPU::AV_160_Align2RegClass;
3806 if (BitWidth == 192)
3807 return &AMDGPU::AV_192_Align2RegClass;
3808 if (BitWidth == 224)
3809 return &AMDGPU::AV_224_Align2RegClass;
3810 if (BitWidth == 256)
3811 return &AMDGPU::AV_256_Align2RegClass;
3812 if (BitWidth == 288)
3813 return &AMDGPU::AV_288_Align2RegClass;
3814 if (BitWidth == 320)
3815 return &AMDGPU::AV_320_Align2RegClass;
3816 if (BitWidth == 352)
3817 return &AMDGPU::AV_352_Align2RegClass;
3818 if (BitWidth == 384)
3819 return &AMDGPU::AV_384_Align2RegClass;
3820 if (BitWidth == 512)
3821 return &AMDGPU::AV_512_Align2RegClass;
3822 if (BitWidth == 1024)
3823 return &AMDGPU::AV_1024_Align2RegClass;
3824
3825 return nullptr;
3826}
3827
3828const TargetRegisterClass *
3830 if (BitWidth == 32)
3831 return &AMDGPU::AV_32RegClass;
3832 return ST.needsAlignedVGPRs()
3835}
3836
// Picks the register class to use for a BitWidth-bit vector value: when the
// subtarget has GFX90A instructions it defers to
// getVectorSuperClassForBitWidth(); the other arm of the conditional is not
// visible in this extract.
// NOTE(review): listing lines 3838 (function name and parameters) and 3845
// (the false arm of the ?:) are missing here — restore them from the upstream
// file before relying on this block.
3837const TargetRegisterClass *
3839 // TODO: In principle this should use AV classes for gfx908 too. This is
3840 // limited to 90a+ to avoid regressing special case copy optimizations which
3841 // need new handling. The core issue is that it's not possible to directly
3842 // copy between AGPRs on gfx908, and the current optimizations around that
3843 // expect to see copies to VGPR.
3844 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3846}
3847
// Maps BitWidth to an SGPR register class of that size: 16 and 32 bits both
// yield SReg_32, 64 bits yields SReg_64, and the wider tuple sizes up to 1024
// bits yield the corresponding SGPR_<N> class. Any other width returns
// nullptr.
// NOTE(review): the line carrying the function name and parameter list
// (listing line 3849) is missing from this extract — restore it from the
// upstream file.
3848const TargetRegisterClass *
3850 if (BitWidth == 16 || BitWidth == 32)
3851 return &AMDGPU::SReg_32RegClass;
3852 if (BitWidth == 64)
3853 return &AMDGPU::SReg_64RegClass;
3854 if (BitWidth == 96)
3855 return &AMDGPU::SGPR_96RegClass;
3856 if (BitWidth == 128)
3857 return &AMDGPU::SGPR_128RegClass;
3858 if (BitWidth == 160)
3859 return &AMDGPU::SGPR_160RegClass;
3860 if (BitWidth == 192)
3861 return &AMDGPU::SGPR_192RegClass;
3862 if (BitWidth == 224)
3863 return &AMDGPU::SGPR_224RegClass;
3864 if (BitWidth == 256)
3865 return &AMDGPU::SGPR_256RegClass;
3866 if (BitWidth == 288)
3867 return &AMDGPU::SGPR_288RegClass;
3868 if (BitWidth == 320)
3869 return &AMDGPU::SGPR_320RegClass;
3870 if (BitWidth == 352)
3871 return &AMDGPU::SGPR_352RegClass;
3872 if (BitWidth == 384)
3873 return &AMDGPU::SGPR_384RegClass;
3874 if (BitWidth == 512)
3875 return &AMDGPU::SGPR_512RegClass;
3876 if (BitWidth == 1024)
3877 return &AMDGPU::SGPR_1024RegClass;
3878
3879 return nullptr;
3880}
3881
// Reports whether Reg belongs to an SGPR register class. Virtual registers are
// looked up through MRI, physical ones through getPhysRegBaseClass(); a
// register for which no class is found yields false.
// NOTE(review): the opening line of this signature (listing line 3882, with
// the function name and the MRI parameter) is missing from this extract —
// restore it from the upstream file.
3883 Register Reg) const {
3884 const TargetRegisterClass *RC;
3885 if (Reg.isVirtual())
3886 RC = MRI.getRegClass(Reg);
3887 else
3888 RC = getPhysRegBaseClass(Reg);
3889 return RC && isSGPRClass(RC);
3890}
3891
3892const TargetRegisterClass *
3894 unsigned Size = getRegSizeInBits(*SRC);
3895
3896 switch (SRC->getID()) {
3897 default:
3898 break;
3899 case AMDGPU::VS_32_Lo256RegClassID:
3900 case AMDGPU::VS_64_Lo256RegClassID:
3901 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3902 }
3903
3904 const TargetRegisterClass *VRC =
3905 getAllocatableClass(getVGPRClassForBitWidth(Size));
3906 assert(VRC && "Invalid register class size");
3907 return VRC;
3908}
3909
// Returns a register class ARC chosen from the bit size of SRC, asserting that
// a class of that size was found.
// NOTE(review): listing lines 3911 (function name and parameters) and 3913
// (the statement that defines ARC from Size) are missing from this extract;
// presumably ARC is the AGPR class of the same width — confirm against the
// upstream file.
3910const TargetRegisterClass *
3912 unsigned Size = getRegSizeInBits(*SRC);
3914 assert(ARC && "Invalid register class size");
3915 return ARC;
3916}
3917
// Returns a register class ARC chosen from the bit size of SRC, asserting that
// a class of that size was found.
// NOTE(review): listing lines 3919 (function name and parameters) and 3921
// (the statement that defines ARC from Size) are missing from this extract, so
// which class family is looked up cannot be told from here — confirm against
// the upstream file.
3918const TargetRegisterClass *
3920 unsigned Size = getRegSizeInBits(*SRC);
3922 assert(ARC && "Invalid register class size");
3923 return ARC;
3924}
3925
// Returns an SGPR-side register class matching the bit size of VRC. A 32-bit
// class maps directly to SGPR_32; for other sizes the result SRC is asserted
// to be non-null before it is returned.
// NOTE(review): listing lines 3927 (function name and parameters) and 3931
// (the statement that defines SRC from Size) are missing from this extract —
// restore them from the upstream file.
3926const TargetRegisterClass *
3928 unsigned Size = getRegSizeInBits(*VRC);
3929 if (Size == 32)
3930 return &AMDGPU::SGPR_32RegClass;
3932 assert(SRC && "Invalid register class size");
3933 return SRC;
3934}
3935
3936const TargetRegisterClass *
3938 const TargetRegisterClass *SubRC,
3939 unsigned SubIdx) const {
3940 // Ensure this subregister index is aligned in the super register.
3941 const TargetRegisterClass *MatchRC =
3942 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3943 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3944}
3945
// Reports whether an operand of kind OpType may hold an inline constant. The
// general rule visible here is that OpType must lie in the
// [OPERAND_SRC_FIRST, OPERAND_SRC_LAST] range; an earlier special case instead
// answers according to ST.hasMFMAInlineLiteralBug().
// NOTE(review): listing lines 3947-3948, which hold the condition guarding the
// first return, are missing from this extract — as shown the second return
// looks unreachable, which is an artifact of the extraction.
3946bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3949 return !ST.hasMFMAInlineLiteralBug();
3950
3951 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3952 OpType <= AMDGPU::OPERAND_SRC_LAST;
3953}
3954
// Reports whether an operand of kind OpType may hold a literal constant, based
// on a range check that starts at AMDGPU::OPERAND_REG_IMM_FIRST.
// NOTE(review): listing line 3958, the right-hand side of the && (presumably
// the upper bound of the range), is missing from this extract — restore it
// from the upstream file.
3955bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3956 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3957 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3959}
3960
3961/// Returns a lowest register that is not used at any point in the function.
3962/// If all registers are used, then this function will return
3963/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3964/// highest unused register.
3966 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3967 const MachineFunction &MF, bool ReserveHighestRegister) const {
3968 // Never offer VCC as an unused register.
3969 auto isVCC = [](MCRegister Reg) {
3970 return Reg == AMDGPU::VCC || Reg == AMDGPU::VCC_LO || Reg == AMDGPU::VCC_HI;
3971 };
3972
3973 if (ReserveHighestRegister) {
3974 for (MCRegister Reg : reverse(*RC))
3975 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3976 return Reg;
3977 } else {
3978 for (MCRegister Reg : *RC)
3979 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) && !isVCC(Reg))
3980 return Reg;
3981 }
3982 return MCRegister();
3983}
3984
// Reports whether Reg is uniform according to its register bank: the bank is
// looked up through RBI, a register with no bank yields false, and otherwise
// the answer is the negation of RBI.isDivergentRegBank().
// NOTE(review): the opening line of this signature (listing line 3985, with
// the function name and the MRI parameter) is missing from this extract —
// restore it from the upstream file.
3986 const RegisterBankInfo &RBI,
3987 Register Reg) const {
3988 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3989 if (!RB)
3990 return false;
3991
3992 return !RBI.isDivergentRegBank(RB);
3993}
3994
3996 unsigned EltSize) const {
3997 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3998 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3999
4000 const unsigned RegHalves = RegBitWidth / 16;
4001 const unsigned EltHalves = EltSize / 2;
4002 assert(RegSplitParts.size() + 1 >= EltHalves);
4003
4004 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
4005 const unsigned NumParts = RegHalves / EltHalves;
4006
4007 return ArrayRef(Parts.data(), NumParts);
4008}
4009
4012 Register Reg) const {
4013 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
4014}
4015
// Returns the register class of the value a register operand actually refers
// to: the class of MO's register (via getRegClassForReg) narrowed by MO's
// subregister index through getSubRegisterClass().
// NOTE(review): listing line 4017 (function name and the MRI parameter) is
// missing from this extract — restore it from the upstream file.
4016const TargetRegisterClass *
4018 const MachineOperand &MO) const {
4019 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
4020 return getSubRegisterClass(SrcRC, MO.getSubReg());
4021}
4022
// Reports whether Reg has a register class that isVGPRClass() accepts; a
// register with no class yields false.
// NOTE(review): the opening line of this signature (listing line 4023, with
// the function name and the MRI parameter) is missing from this extract.
4024 Register Reg) const {
4025 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4026 // Registers without classes are unaddressable, SGPR-like registers.
4027 return RC && isVGPRClass(RC);
4028}
4029
// Reports whether Reg has a register class that isAGPRClass() accepts; a
// register with no class yields false.
// NOTE(review): the opening line of this signature (listing line 4030, with
// the function name and the MRI parameter) is missing from this extract.
4031 Register Reg) const {
4032 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
4033
4034 // Registers without classes are unaddressable, SGPR-like registers.
4035 return RC && isAGPRClass(RC);
4036}
4037
// Returns the register pressure limit for class RC in MF. For VGPR_32 and for
// SGPR_32 / SGPR_LO16 the limit is the smaller of the register budget at the
// function's minimum occupancy (first element of
// getOccupancyWithWorkGroupSizes) and the function's own maximum; every other
// class uses the TableGen-generated limit.
// NOTE(review): listing lines 4038 (function name and the RC parameter) and
// 4048 (the second argument of the inner getMaxNumVGPRs call) are missing from
// this extract — restore them from the upstream file.
4039 MachineFunction &MF) const {
4040 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
4041 switch (RC->getID()) {
4042 default:
4043 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
4044 case AMDGPU::VGPR_32RegClassID:
4045 return std::min(
4046 ST.getMaxNumVGPRs(
4047 MinOcc,
4049 ST.getMaxNumVGPRs(MF));
4050 case AMDGPU::SGPR_32RegClassID:
4051 case AMDGPU::SGPR_LO16RegClassID:
4052 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
4053 }
4054}
4055
// Returns the pressure limit for the register pressure set with index Idx by
// delegating to getRegPressureLimit(): the VGPR_32 and AGPR_32 sets share the
// VGPR_32 class limit, and the SReg_32 set uses the SGPR_32 class limit. Any
// other set index is treated as unreachable.
// NOTE(review): the opening line of this signature (listing line 4056, with
// the function name and the MF parameter) is missing from this extract.
4057 unsigned Idx) const {
4058 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
4059 case AMDGPU::RegisterPressureSets::VGPR_32:
4060 case AMDGPU::RegisterPressureSets::AGPR_32:
4061 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
4062 const_cast<MachineFunction &>(MF));
4063 case AMDGPU::RegisterPressureSets::SReg_32:
4064 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
4065 const_cast<MachineFunction &>(MF));
4066 }
4067
4068 llvm_unreachable("Unexpected register pressure set!");
4069}
4070
4071const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
4072 static const int Empty[] = { -1 };
4073
4074 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
4075 return Empty;
4076
4077 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
4078}
4079
// Adds target-specific register allocation hints for VirtReg, driven by the
// hint stored in MachineRegisterInfo. For AMDGPURI::Size32 and
// AMDGPURI::Size16 hints the paired register (physical, or virtual with a
// physical assignment in VRM) is translated to the overlapping 32-bit super
// register or lo16 subregister respectively and appended to Hints; both cases
// return false. Any other hint type is forwarded to
// TargetRegisterInfo::getRegAllocationHints().
// NOTE(review): listing lines 4080 (function name and first parameter) and
// 4082 (presumably the Hints output parameter) are missing from this extract —
// restore them from the upstream file.
4081 ArrayRef<MCPhysReg> Order,
4083 const MachineFunction &MF,
4084 const VirtRegMap *VRM,
4085 const LiveRegMatrix *Matrix) const {
4086
4087 const MachineRegisterInfo &MRI = MF.getRegInfo();
4088 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4089
4090 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
4091
4092 switch (Hint.first) {
4093 case AMDGPURI::Size32: {
4094 Register Paired = Hint.second;
4095 assert(Paired);
4096 Register PairedPhys;
// Resolve the paired register to the VGPR_32 that has it as its lo16 half;
// PairedPhys stays invalid when the pair has no physical assignment yet.
4097 if (Paired.isPhysical()) {
4098 PairedPhys =
4099 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
4100 } else if (VRM && VRM->hasPhys(Paired)) {
4101 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
4102 &AMDGPU::VGPR_32RegClass);
4103 }
4104
4105 // Prefer the paired physreg.
4106 if (PairedPhys)
4107 // isLo(Paired) is implicitly true here from the API of
4108 // getMatchingSuperReg.
4109 Hints.push_back(PairedPhys);
4110 return false;
4111 }
4112 case AMDGPURI::Size16: {
4113 Register Paired = Hint.second;
4114 assert(Paired);
4115 Register PairedPhys;
// Resolve the paired register to its lo16 subregister when it is physical or
// already mapped to a physical register.
4116 if (Paired.isPhysical()) {
4117 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
4118 } else if (VRM && VRM->hasPhys(Paired)) {
4119 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
4120 }
4121
4122 // First prefer the paired physreg.
4123 if (PairedPhys)
4124 Hints.push_back(PairedPhys);
4125 else {
4126 // Add all the lo16 physregs.
4127 // When the Paired operand has not yet been assigned a physreg it is
4128 // better to try putting VirtReg in a lo16 register, because possibly
4129 // later Paired can be assigned to the overlapping register and the COPY
4130 // can be eliminated.
4131 for (MCPhysReg PhysReg : Order) {
4132 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
4133 continue;
4134 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
4135 !MRI.isReserved(PhysReg))
4136 Hints.push_back(PhysReg);
4137 }
4138 }
4139 return false;
4140 }
4141 default:
// Note that Matrix is not passed on to the generic implementation here.
4142 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
4143 VRM);
4144 }
4145}
4146
4148 // Not a callee saved register.
4149 return AMDGPU::SGPR30_SGPR31;
4150}
4151
// Returns the register class to use for a value of Size bits that lives on
// register bank RB. SGPR and AGPR banks round sizes below 32 bits up to 32;
// the VCC bank only accepts Size == 1 and yields the wave mask class; the VGPR
// bank rounds up to 16 bits with real true16 instructions and to 32 bits
// otherwise. Any other bank is treated as unreachable.
// NOTE(review): listing lines 4153 (function name and the Size parameter) and
// 4157 (the call whose argument continues on listing line 4158, presumably the
// VGPR class lookup) are missing from this extract — restore from upstream.
4152const TargetRegisterClass *
4154 const RegisterBank &RB) const {
4155 switch (RB.getID()) {
4156 case AMDGPU::VGPRRegBankID:
4158 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
4159 case AMDGPU::VCCRegBankID:
4160 assert(Size == 1);
4161 return getWaveMaskRegClass();
4162 case AMDGPU::SGPRRegBankID:
4163 return getSGPRClassForBitWidth(std::max(32u, Size));
4164 case AMDGPU::AGPRRegBankID:
4165 return getAGPRClassForBitWidth(std::max(32u, Size));
4166 default:
4167 llvm_unreachable("unknown register bank");
4168 }
4169}
4170
// Returns the register class to constrain operand MO's register to. If the
// register currently carries only a register bank, the class is derived from
// the register's LLT and that bank; if it already has a register class, the
// allocatable version of that class is returned; otherwise nullptr.
// NOTE(review): listing line 4172 (function name and the MO parameter) is
// missing from this extract — restore it from the upstream file.
4171const TargetRegisterClass *
4173 const MachineRegisterInfo &MRI) const {
4174 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
4175 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
4176 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
4177
4178 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
4179 return getAllocatableClass(RC);
4180
4181 return nullptr;
4182}
4183
// Returns VCC_LO in wave32 mode and the full VCC register otherwise.
// NOTE(review): the signature line (listing line 4184) is missing from this
// extract — restore it from the upstream file.
4185 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
4186}
4187
// Returns EXEC_LO in wave32 mode and the full EXEC register otherwise.
// NOTE(review): the signature line (listing line 4188) is missing from this
// extract — restore it from the upstream file.
4189 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4190}
4191
// Returns the 64-bit VGPR tuple class for this subtarget: the Align2 variant
// when the subtarget needs aligned VGPRs, the plain VReg_64 class otherwise.
// NOTE(review): the signature line (listing line 4192) is missing from this
// extract — restore it from the upstream file.
4193 // VGPR tuples have an alignment requirement on gfx90a variants.
4194 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
4195 : &AMDGPU::VReg_64RegClass;
4196}
4197
// Returns the instruction that defines Reg as seen at Use, determined from the
// live-interval value numbers in LIS, or nullptr when no value is live at Use
// or the defining instruction does not dominate Use.
// NOTE(review): listing lines 4199-4201 (return type, function name and the
// Reg / SubReg / Use / MRI parameters) are missing from this extract — restore
// them from the upstream file.
4198// Find reaching register definition
4202 LiveIntervals *LIS) const {
4203 auto &MDT = LIS->getDomTree();
4204 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
4205 SlotIndex DefIdx;
4206
4207 if (Reg.isVirtual()) {
4208 if (!LIS->hasInterval(Reg))
4209 return nullptr;
4210 LiveInterval &LI = LIS->getInterval(Reg);
// Lanes of interest: those of SubReg when given, otherwise every lane the
// virtual register can have.
4211 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
4212 : MRI.getMaxLaneMaskForVReg(Reg);
4213 VNInfo *V = nullptr;
4214 if (LI.hasSubRanges()) {
// Use the first subrange whose lane mask covers all requested lanes.
4215 for (auto &S : LI.subranges()) {
4216 if ((S.LaneMask & SubLanes) == SubLanes) {
4217 V = S.getVNInfoAt(UseIdx);
4218 break;
4219 }
4220 }
4221 } else {
4222 V = LI.getVNInfoAt(UseIdx);
4223 }
4224 if (!V)
4225 return nullptr;
4226 DefIdx = V->def;
4227 } else {
4228 // Find last def.
// Every register unit must be live at Use; among their definitions, keep the
// one that the previously chosen definition dominates.
4229 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
4230 LiveRange &LR = LIS->getRegUnit(Unit);
4231 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
4232 if (!DefIdx.isValid() ||
4233 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
4234 LIS->getInstructionFromIndex(V->def)))
4235 DefIdx = V->def;
4236 } else {
4237 return nullptr;
4238 }
4239 }
4240 }
4241
4242 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
4243
4244 if (!Def || !MDT.dominates(Def, &Use))
4245 return nullptr;
4246
4247 assert(Def->modifiesRegister(Reg, this));
4248
4249 return Def;
4250}
4251
4253 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4254
4255 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4256 AMDGPU::SReg_32RegClass,
4257 AMDGPU::AGPR_32RegClass } ) {
4258 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4259 return Super;
4260 }
4261 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4262 &AMDGPU::VGPR_32RegClass)) {
4263 return Super;
4264 }
4265
4266 return AMDGPU::NoRegister;
4267}
4268
4270 if (!ST.needsAlignedVGPRs())
4271 return true;
4272
4273 if (isVGPRClass(&RC))
4274 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4275 if (isAGPRClass(&RC))
4276 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4277 if (isVectorSuperClass(&RC))
4278 return RC.hasSuperClassEq(
4279 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4280
4281 assert(&RC != &AMDGPU::VS_64RegClass);
4282
4283 return true;
4284}
4285
// Returns the leading SGPR_128 tuples that fit in the function's SGPR budget:
// one tuple per four SGPRs of ST.getMaxNumSGPRs(MF).
// NOTE(review): the signature lines (listing lines 4286-4287) are missing from
// this extract — restore them from the upstream file.
4288 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4289}
4290
4293 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4294}
4295
4298 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4299}
4300
// Returns an alignment, in bits, for subregister index SubReg of class RC,
// based on the register kind recorded in RC's TSFlags: the subregister size
// capped at 128 bits for SGPR classes and at 32 bits for AGPR / VGPR classes;
// 0 for any other kind.
// NOTE(review): listing lines 4302 (function name and the RC parameter) and
// 4309 (presumably one more case label ahead of the 32-bit return) are missing
// from this extract — restore them from the upstream file.
4301unsigned
4303 unsigned SubReg) const {
4304 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4305 case SIRCFlags::HasSGPR:
4306 return std::min(128u, getSubRegIdxSize(SubReg));
4307 case SIRCFlags::HasAGPR:
4308 case SIRCFlags::HasVGPR:
4310 return std::min(32u, getSubRegIdxSize(SubReg));
4311 default:
4312 break;
4313 }
4314 return 0;
4315}
4316
// Returns one more than the hardware index of the highest register of RC that
// is used in the function, or 0 if none is used. VCC_LO and VCC_HI are
// ignored, and for VGPR_32 only the first getAddressableNumArchVGPRs()
// registers are considered. Register-mask clobbers count as uses only when
// IncludeCalls is true.
// NOTE(review): listing lines 4317 (function name and the MRI parameter) and
// 4321 (the declaration of Registers) are missing from this extract — restore
// them from the upstream file.
4318 const TargetRegisterClass &RC,
4319 bool IncludeCalls) const {
4320 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
4322 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4323 ? RC.getRegisters().take_front(NumArchVGPRs)
4324 : RC.getRegisters();
// Walk from the highest register down so the first hit is the maximum.
4325 for (MCPhysReg Reg : reverse(Registers)) {
4326 if (Reg != AMDGPU::VCC_LO && Reg != AMDGPU::VCC_HI &&
4327 MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4328 return getHWRegIndex(Reg) + 1;
4329 }
4330 return 0;
4331}
4332
// Returns the names of the target flags set on Reg; the only flag reported
// here is "WWM_REG", taken from SIMachineFunctionInfo::checkFlag().
// NOTE(review): listing lines 4333-4334 (return type, function name and the
// Reg parameter) and 4336 (the declaration of RegFlags) are missing from this
// extract — restore them from the upstream file.
4335 const MachineFunction &MF) const {
4337 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4338 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4339 RegFlags.push_back("WWM_REG");
4340 return RegFlags;
4341}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill, bool NeedsCFI)
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static cl::opt< bool > EnableSpillCFISavedRegs("amdgpu-spill-cfi-saved-regs", cl::desc("Enable spilling the registers required for CFI emission"), cl::ReallyHidden, cl::init(false), cl::ZeroOrMore)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
bool test(unsigned Idx) const
Returns true if bit Idx is set.
Definition BitVector.h:482
bool empty() const
Returns whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:124
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(AsmPrinterFlagTy Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
std::pair< unsigned, Register > getRegAllocationHint(Register VReg) const
getRegAllocationHint - Return the register allocation hint for the specified virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
MachineInstr * buildCFIForSGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister SGPR, int64_t Offset) const
Create a CFI index describing a spill of a SGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForVRegToVRegSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister Reg, const MCRegister RegCopy) const
Create a CFI index describing a spill of the VGPR/AGPR Reg to another VGPR/AGPR RegCopy and build a M...
MachineInstr * buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, MCRegister VGPR, int64_t Offset) const
Create a CFI index describing a spill of a VGPR to VMEM and build a MachineInstr around it.
MachineInstr * buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCRegister SGPR, const MCRegister VGPR, const int Lane) const
Create a CFI index describing a spill of an SGPR to a single lane of a VGPR and build a MachineInstr ...
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns the lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr, bool NeedsCFI=false) const
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
void buildCFIForBlockCSRStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register BlockReg, int64_t Offset) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false, bool NeedsCFI=false) const
If OnlyToVGPR is true, this will only succeed if this manages to find a free VGPR lane to spill.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
bool isCFISavedRegsSpillEnabled() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same...
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal cal...
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physica...
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
returns true if the specified virtual register is mapped to a physical register
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:253
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:259
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:260
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:254
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:558
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1668
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:26
@ HasVGPR
Definition SIDefines.h:24
@ RegKindMask
Definition SIDefines.h:29
@ HasAGPR
Definition SIDefines.h:25
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67