LLVM 22.0.0git
SIRegisterInfo.cpp
Go to the documentation of this file.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
115 unsigned MovOpc;
116 unsigned NotOpc;
117
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register as actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Write scavenged VGPR to emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Save exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(
344 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
345 for (auto Reg : AMDGPU::VGPR_16RegClass) {
346 if (AMDGPU::isHi16Reg(Reg, *this))
347 RegPressureIgnoredUnits.set(
348 static_cast<unsigned>(*regunits(Reg).begin()));
349 }
350
351 // HACK: Until this is fully tablegen'd.
352 static llvm::once_flag InitializeRegSplitPartsFlag;
353
354 static auto InitializeRegSplitPartsOnce = [this]() {
355 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
356 unsigned Size = getSubRegIdxSize(Idx);
357 if (Size & 15)
358 continue;
359 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
360 unsigned Pos = getSubRegIdxOffset(Idx);
361 if (Pos % Size)
362 continue;
363 Pos /= Size;
364 if (Vec.empty()) {
365 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
366 Vec.resize(MaxNumParts);
367 }
368 Vec[Pos] = Idx;
369 }
370 };
371
372 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
373
374 static auto InitializeSubRegFromChannelTableOnce = [this]() {
375 for (auto &Row : SubRegFromChannelTable)
376 Row.fill(AMDGPU::NoSubRegister);
377 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
378 unsigned Width = getSubRegIdxSize(Idx) / 32;
379 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
381 Width = SubRegFromChannelTableWidthMap[Width];
382 if (Width == 0)
383 continue;
384 unsigned TableIdx = Width - 1;
385 assert(TableIdx < SubRegFromChannelTable.size());
386 assert(Offset < SubRegFromChannelTable[TableIdx].size());
387 SubRegFromChannelTable[TableIdx][Offset] = Idx;
388 }
389 };
390
391 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
392 llvm::call_once(InitializeSubRegFromChannelTableFlag,
393 InitializeSubRegFromChannelTableOnce);
394}
395
396void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
397 MCRegister Reg) const {
398 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
399 Reserved.set(*R);
400}
401
402// Forced to be here by one .inc
404 const MachineFunction *MF) const {
406 switch (CC) {
407 case CallingConv::C:
410 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
411 : CSR_AMDGPU_SaveList;
414 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
415 : CSR_AMDGPU_SI_Gfx_SaveList;
417 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
418 default: {
419 // Dummy to not crash RegisterClassInfo.
420 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
421 return &NoCalleeSavedReg;
422 }
423 }
424}
425
426const MCPhysReg *
428 return nullptr;
429}
430
432 CallingConv::ID CC) const {
433 switch (CC) {
434 case CallingConv::C:
437 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
438 : CSR_AMDGPU_RegMask;
441 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
442 : CSR_AMDGPU_SI_Gfx_RegMask;
445 // Calls to these functions never return, so we can pretend everything is
446 // preserved.
447 return AMDGPU_AllVGPRs_RegMask;
448 default:
449 return nullptr;
450 }
451}
452
454 return CSR_AMDGPU_NoRegs_RegMask;
455}
456
458 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
459}
460
463 const MachineFunction &MF) const {
464 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
465 // equivalent AV class. If used one, the verifier will crash after
466 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
467 // until Instruction selection.
468 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
469 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
470 return &AMDGPU::AV_32RegClass;
471 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
472 return &AMDGPU::AV_64RegClass;
473 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
474 RC == &AMDGPU::AReg_64_Align2RegClass)
475 return &AMDGPU::AV_64_Align2RegClass;
476 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
477 return &AMDGPU::AV_96RegClass;
478 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
479 RC == &AMDGPU::AReg_96_Align2RegClass)
480 return &AMDGPU::AV_96_Align2RegClass;
481 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
482 return &AMDGPU::AV_128RegClass;
483 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
484 RC == &AMDGPU::AReg_128_Align2RegClass)
485 return &AMDGPU::AV_128_Align2RegClass;
486 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
487 return &AMDGPU::AV_160RegClass;
488 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
489 RC == &AMDGPU::AReg_160_Align2RegClass)
490 return &AMDGPU::AV_160_Align2RegClass;
491 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
492 return &AMDGPU::AV_192RegClass;
493 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
494 RC == &AMDGPU::AReg_192_Align2RegClass)
495 return &AMDGPU::AV_192_Align2RegClass;
496 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
497 return &AMDGPU::AV_256RegClass;
498 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
499 RC == &AMDGPU::AReg_256_Align2RegClass)
500 return &AMDGPU::AV_256_Align2RegClass;
501 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
502 return &AMDGPU::AV_512RegClass;
503 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
504 RC == &AMDGPU::AReg_512_Align2RegClass)
505 return &AMDGPU::AV_512_Align2RegClass;
506 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
507 return &AMDGPU::AV_1024RegClass;
508 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
509 RC == &AMDGPU::AReg_1024_Align2RegClass)
510 return &AMDGPU::AV_1024_Align2RegClass;
511 }
512
514}
515
517 const SIFrameLowering *TFI = ST.getFrameLowering();
519
520 // During ISel lowering we always reserve the stack pointer in entry and chain
521 // functions, but never actually want to reference it when accessing our own
522 // frame. If we need a frame pointer we use it, but otherwise we can just use
523 // an immediate "0" which we represent by returning NoRegister.
524 if (FuncInfo->isBottomOfStack()) {
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
526 }
527 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
528 : FuncInfo->getStackPtrOffsetReg();
529}
530
532 // When we need stack realignment, we can't reference off of the
533 // stack pointer, so we reserve a base pointer.
534 return shouldRealignStack(MF);
535}
536
537Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
538
540 return AMDGPU_AllVGPRs_RegMask;
541}
542
544 return AMDGPU_AllAGPRs_RegMask;
545}
546
548 return AMDGPU_AllVectorRegs_RegMask;
549}
550
552 return AMDGPU_AllAllocatableSRegs_RegMask;
553}
554
555unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
556 unsigned NumRegs) {
557 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
558 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
559 assert(NumRegIndex && "Not implemented");
560 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
561 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
562}
563
566 const unsigned Align,
567 const TargetRegisterClass *RC) const {
568 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
569 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
570 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
571}
572
574 const MachineFunction &MF) const {
575 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
576}
577
579 BitVector Reserved(getNumRegs());
580 Reserved.set(AMDGPU::MODE);
581
583
584 // Reserve special purpose registers.
585 //
586 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
587 // this seems likely to result in bugs, so I'm marking them as reserved.
588 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
589 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
590
591 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
592 reserveRegisterTuples(Reserved, AMDGPU::M0);
593
594 // Reserve src_vccz, src_execz, src_scc.
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
596 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
597 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
598
599 // Reserve the memory aperture registers
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
606
607 // Reserve async counters pseudo registers
608 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
609 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
610
611 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
613
614 // Reserve xnack_mask registers - support is not implemented in Codegen.
615 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
616
617 // Reserve lds_direct register - support is not implemented in Codegen.
618 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
619
620 // Reserve Trap Handler registers - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::TBA);
622 reserveRegisterTuples(Reserved, AMDGPU::TMA);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
629 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
630 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
631
632 // Reserve null register - it shall never be allocated
633 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
634
635 // Reserve SGPRs.
636 //
637 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
638 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
639 for (const TargetRegisterClass *RC : regclasses()) {
640 if (RC->isBaseClass() && isSGPRClass(RC)) {
641 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
642 for (MCPhysReg Reg : *RC) {
643 unsigned Index = getHWRegIndex(Reg);
644 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
645 Reserved.set(Reg);
646 }
647 }
648 }
649
650 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
651 if (ScratchRSrcReg != AMDGPU::NoRegister) {
652 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
653 // need to spill.
654 // TODO: May need to reserve a VGPR if doing LDS spilling.
655 reserveRegisterTuples(Reserved, ScratchRSrcReg);
656 }
657
658 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
659 if (LongBranchReservedReg)
660 reserveRegisterTuples(Reserved, LongBranchReservedReg);
661
662 // We have to assume the SP is needed in case there are calls in the function,
663 // which is detected after the function is lowered. If we aren't really going
664 // to need SP, don't bother reserving it.
665 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
666 if (StackPtrReg) {
667 reserveRegisterTuples(Reserved, StackPtrReg);
668 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
669 }
670
671 MCRegister FrameReg = MFI->getFrameOffsetReg();
672 if (FrameReg) {
673 reserveRegisterTuples(Reserved, FrameReg);
674 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
675 }
676
677 if (hasBasePointer(MF)) {
678 MCRegister BasePtrReg = getBaseRegister();
679 reserveRegisterTuples(Reserved, BasePtrReg);
680 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
681 }
682
683 // FIXME: Use same reserved register introduced in D149775
684 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
685 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
686 if (ExecCopyReg)
687 reserveRegisterTuples(Reserved, ExecCopyReg);
688
689 // Reserve VGPRs/AGPRs.
690 //
691 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
692
693 for (const TargetRegisterClass *RC : regclasses()) {
694 if (RC->isBaseClass() && isVGPRClass(RC)) {
695 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
696 for (MCPhysReg Reg : *RC) {
697 unsigned Index = getHWRegIndex(Reg);
698 if (Index + NumRegs > MaxNumVGPRs)
699 Reserved.set(Reg);
700 }
701 }
702 }
703
704 // Reserve all the AGPRs if there are no instructions to use it.
705 if (!ST.hasMAIInsts())
706 MaxNumAGPRs = 0;
707 for (const TargetRegisterClass *RC : regclasses()) {
708 if (RC->isBaseClass() && isAGPRClass(RC)) {
709 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
710 for (MCPhysReg Reg : *RC) {
711 unsigned Index = getHWRegIndex(Reg);
712 if (Index + NumRegs > MaxNumAGPRs)
713 Reserved.set(Reg);
714 }
715 }
716 }
717
718 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
719 // VGPR available at all times.
720 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
721 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
722 }
723
724 // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
725 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
726 // wwm-regalloc and it would be empty otherwise.
727 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
728 if (!NonWWMRegMask.empty()) {
729 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
730 RegI < RegE; ++RegI) {
731 if (NonWWMRegMask.test(RegI))
732 reserveRegisterTuples(Reserved, RegI);
733 }
734 }
735
736 for (Register Reg : MFI->getWWMReservedRegs())
737 reserveRegisterTuples(Reserved, Reg);
738
739 // FIXME: Stop using reserved registers for this.
740 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
741 reserveRegisterTuples(Reserved, Reg);
742
743 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
744 reserveRegisterTuples(Reserved, Reg);
745
746 return Reserved;
747}
748
750 MCRegister PhysReg) const {
751 return !MF.getRegInfo().isReserved(PhysReg);
752}
753
756 // On entry or in chain functions, the base address is 0, so it can't possibly
757 // need any more alignment.
758
759 // FIXME: Should be able to specify the entry frame alignment per calling
760 // convention instead.
761 if (Info->isBottomOfStack())
762 return false;
763
765}
766
769 if (Info->isEntryFunction()) {
770 const MachineFrameInfo &MFI = Fn.getFrameInfo();
771 return MFI.hasStackObjects() || MFI.hasCalls();
772 }
773
774 // May need scavenger for dealing with callee saved registers.
775 return true;
776}
777
779 const MachineFunction &MF) const {
780 // Do not use frame virtual registers. They used to be used for SGPRs, but
781 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
782 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
783 // spill.
784 return false;
785}
786
788 const MachineFunction &MF) const {
789 const MachineFrameInfo &MFI = MF.getFrameInfo();
790 return MFI.hasStackObjects();
791}
792
794 const MachineFunction &) const {
795 // There are no special dedicated stack or frame pointers.
796 return true;
797}
798
801
802 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
803 AMDGPU::OpName::offset);
804 return MI->getOperand(OffIdx).getImm();
805}
806
808 int Idx) const {
809 switch (MI->getOpcode()) {
810 case AMDGPU::V_ADD_U32_e32:
811 case AMDGPU::V_ADD_U32_e64:
812 case AMDGPU::V_ADD_CO_U32_e32: {
813 int OtherIdx = Idx == 1 ? 2 : 1;
814 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
815 return OtherOp.isImm() ? OtherOp.getImm() : 0;
816 }
817 case AMDGPU::V_ADD_CO_U32_e64: {
818 int OtherIdx = Idx == 2 ? 3 : 2;
819 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
820 return OtherOp.isImm() ? OtherOp.getImm() : 0;
821 }
822 default:
823 break;
824 }
825
827 return 0;
828
829 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::vaddr) ||
831 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
832 AMDGPU::OpName::saddr))) &&
833 "Should never see frame index on non-address operand");
834
836}
837
839 const MachineInstr &MI) {
840 assert(MI.getDesc().isAdd());
841 const MachineOperand &Src0 = MI.getOperand(1);
842 const MachineOperand &Src1 = MI.getOperand(2);
843
844 if (Src0.isFI()) {
845 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
846 Src1.getReg()));
847 }
848
849 if (Src1.isFI()) {
850 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
851 Src0.getReg()));
852 }
853
854 return false;
855}
856
858 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
859 switch (MI->getOpcode()) {
860 case AMDGPU::V_ADD_U32_e32: {
861 // TODO: We could handle this but it requires work to avoid violating
862 // operand restrictions.
863 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
864 !isFIPlusImmOrVGPR(*this, *MI))
865 return false;
866 [[fallthrough]];
867 }
868 case AMDGPU::V_ADD_U32_e64:
869 // FIXME: This optimization is barely profitable enableFlatScratch as-is.
870 //
871 // Much of the benefit with the MUBUF handling is we avoid duplicating the
872 // shift of the frame register, which isn't needed with scratch.
873 //
874 // materializeFrameBaseRegister doesn't know the register classes of the
875 // uses, and unconditionally uses an s_add_i32, which will end up using a
876 // copy for the vector uses.
877 return !ST.enableFlatScratch();
878 case AMDGPU::V_ADD_CO_U32_e32:
879 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
880 !isFIPlusImmOrVGPR(*this, *MI))
881 return false;
882 // We can't deal with the case where the carry out has a use (though this
883 // should never happen)
884 return MI->getOperand(3).isDead();
885 case AMDGPU::V_ADD_CO_U32_e64:
886 // TODO: Should we check use_empty instead?
887 return MI->getOperand(1).isDead();
888 default:
889 break;
890 }
891
893 return false;
894
895 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
896
897 const SIInstrInfo *TII = ST.getInstrInfo();
899 return !TII->isLegalMUBUFImmOffset(FullOffset);
900
901 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
903}
904
906 int FrameIdx,
907 int64_t Offset) const {
908 MachineBasicBlock::iterator Ins = MBB->begin();
909 DebugLoc DL; // Defaults to "unknown"
910
911 if (Ins != MBB->end())
912 DL = Ins->getDebugLoc();
913
914 MachineFunction *MF = MBB->getParent();
915 const SIInstrInfo *TII = ST.getInstrInfo();
917 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
918 : AMDGPU::V_MOV_B32_e32;
919
920 Register BaseReg = MRI.createVirtualRegister(
921 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
922 : &AMDGPU::VGPR_32RegClass);
923
924 if (Offset == 0) {
925 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
926 .addFrameIndex(FrameIdx);
927 return BaseReg;
928 }
929
930 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
931
932 Register FIReg = MRI.createVirtualRegister(
933 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
934 : &AMDGPU::VGPR_32RegClass);
935
936 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
937 .addImm(Offset);
938 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
939 .addFrameIndex(FrameIdx);
940
941 if (ST.enableFlatScratch() ) {
942 // FIXME: Make sure scc isn't live in.
943 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
944 .addReg(OffsetReg, RegState::Kill)
945 .addReg(FIReg)
946 .setOperandDead(3); // scc
947 return BaseReg;
948 }
949
950 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
951 .addReg(OffsetReg, RegState::Kill)
952 .addReg(FIReg)
953 .addImm(0); // clamp bit
954
955 return BaseReg;
956}
957
959 int64_t Offset) const {
960 const SIInstrInfo *TII = ST.getInstrInfo();
961
962 switch (MI.getOpcode()) {
963 case AMDGPU::V_ADD_U32_e32:
964 case AMDGPU::V_ADD_CO_U32_e32: {
965 MachineOperand *FIOp = &MI.getOperand(2);
966 MachineOperand *ImmOp = &MI.getOperand(1);
967 if (!FIOp->isFI())
968 std::swap(FIOp, ImmOp);
969
970 if (!ImmOp->isImm()) {
971 assert(Offset == 0);
972 FIOp->ChangeToRegister(BaseReg, false);
973 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
974 return;
975 }
976
977 int64_t TotalOffset = ImmOp->getImm() + Offset;
978 if (TotalOffset == 0) {
979 MI.setDesc(TII->get(AMDGPU::COPY));
980 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
981 MI.removeOperand(I);
982
983 MI.getOperand(1).ChangeToRegister(BaseReg, false);
984 return;
985 }
986
987 ImmOp->setImm(TotalOffset);
988
989 MachineBasicBlock *MBB = MI.getParent();
990 MachineFunction *MF = MBB->getParent();
992
993 // FIXME: materializeFrameBaseRegister does not know the register class of
994 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
995 // a copy so we have a legal operand and hope the register coalescer can
996 // clean it up.
997 if (isSGPRReg(MRI, BaseReg)) {
998 Register BaseRegVGPR =
999 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1000 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1001 .addReg(BaseReg);
1002 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1003 } else {
1004 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1005 }
1006 return;
1007 }
1008 case AMDGPU::V_ADD_U32_e64:
1009 case AMDGPU::V_ADD_CO_U32_e64: {
1010 int Src0Idx = MI.getNumExplicitDefs();
1011 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1012 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1013 if (!FIOp->isFI())
1014 std::swap(FIOp, ImmOp);
1015
1016 if (!ImmOp->isImm()) {
1017 FIOp->ChangeToRegister(BaseReg, false);
1018 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1019 return;
1020 }
1021
1022 int64_t TotalOffset = ImmOp->getImm() + Offset;
1023 if (TotalOffset == 0) {
1024 MI.setDesc(TII->get(AMDGPU::COPY));
1025
1026 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1027 MI.removeOperand(I);
1028
1029 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1030 } else {
1031 FIOp->ChangeToRegister(BaseReg, false);
1032 ImmOp->setImm(TotalOffset);
1033 }
1034
1035 return;
1036 }
1037 default:
1038 break;
1039 }
1040
1041 bool IsFlat = TII->isFLATScratch(MI);
1042
1043#ifndef NDEBUG
1044 // FIXME: Is it possible to be storing a frame index to itself?
1045 bool SeenFI = false;
1046 for (const MachineOperand &MO: MI.operands()) {
1047 if (MO.isFI()) {
1048 if (SeenFI)
1049 llvm_unreachable("should not see multiple frame indices");
1050
1051 SeenFI = true;
1052 }
1053 }
1054#endif
1055
1056 MachineOperand *FIOp =
1057 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1058 : AMDGPU::OpName::vaddr);
1059
1060 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1061 int64_t NewOffset = OffsetOp->getImm() + Offset;
1062
1063 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1064 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1065
1066 if (IsFlat) {
1067 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1069 "offset should be legal");
1070 FIOp->ChangeToRegister(BaseReg, false);
1071 OffsetOp->setImm(NewOffset);
1072 return;
1073 }
1074
1075#ifndef NDEBUG
1076 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1077 assert(SOffset->isImm() && SOffset->getImm() == 0);
1078#endif
1079
1080 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1081
1082 FIOp->ChangeToRegister(BaseReg, false);
1083 OffsetOp->setImm(NewOffset);
1084}
1085
1087 Register BaseReg,
1088 int64_t Offset) const {
1089
1090 switch (MI->getOpcode()) {
1091 case AMDGPU::V_ADD_U32_e32:
1092 case AMDGPU::V_ADD_CO_U32_e32:
1093 return true;
1094 case AMDGPU::V_ADD_U32_e64:
1095 case AMDGPU::V_ADD_CO_U32_e64:
1096 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1097 default:
1098 break;
1099 }
1100
1102 return false;
1103
1104 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1105
1106 const SIInstrInfo *TII = ST.getInstrInfo();
1108 return TII->isLegalMUBUFImmOffset(NewOffset);
1109
1110 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1112}
1113
1114const TargetRegisterClass *
1116 // This is inaccurate. It depends on the instruction and address space. The
1117 // only place where we should hit this is for dealing with frame indexes /
1118 // private accesses, so this is correct in that case.
1119 return &AMDGPU::VGPR_32RegClass;
1120}
1121
1122const TargetRegisterClass *
1124 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1125}
1126
1128 const SIInstrInfo *TII) {
1129
1130 unsigned Op = MI.getOpcode();
1131 switch (Op) {
1132 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1133 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1134 // FIXME: This assumes the mask is statically known and not computed at
1135 // runtime. However, some ABIs may want to compute the mask dynamically and
1136 // this will need to be updated.
1137 return llvm::popcount(
1138 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1139 case AMDGPU::SI_SPILL_S1024_SAVE:
1140 case AMDGPU::SI_SPILL_S1024_RESTORE:
1141 case AMDGPU::SI_SPILL_V1024_SAVE:
1142 case AMDGPU::SI_SPILL_V1024_RESTORE:
1143 case AMDGPU::SI_SPILL_A1024_SAVE:
1144 case AMDGPU::SI_SPILL_A1024_RESTORE:
1145 case AMDGPU::SI_SPILL_AV1024_SAVE:
1146 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1147 return 32;
1148 case AMDGPU::SI_SPILL_S512_SAVE:
1149 case AMDGPU::SI_SPILL_S512_RESTORE:
1150 case AMDGPU::SI_SPILL_V512_SAVE:
1151 case AMDGPU::SI_SPILL_V512_RESTORE:
1152 case AMDGPU::SI_SPILL_A512_SAVE:
1153 case AMDGPU::SI_SPILL_A512_RESTORE:
1154 case AMDGPU::SI_SPILL_AV512_SAVE:
1155 case AMDGPU::SI_SPILL_AV512_RESTORE:
1156 return 16;
1157 case AMDGPU::SI_SPILL_S384_SAVE:
1158 case AMDGPU::SI_SPILL_S384_RESTORE:
1159 case AMDGPU::SI_SPILL_V384_SAVE:
1160 case AMDGPU::SI_SPILL_V384_RESTORE:
1161 case AMDGPU::SI_SPILL_A384_SAVE:
1162 case AMDGPU::SI_SPILL_A384_RESTORE:
1163 case AMDGPU::SI_SPILL_AV384_SAVE:
1164 case AMDGPU::SI_SPILL_AV384_RESTORE:
1165 return 12;
1166 case AMDGPU::SI_SPILL_S352_SAVE:
1167 case AMDGPU::SI_SPILL_S352_RESTORE:
1168 case AMDGPU::SI_SPILL_V352_SAVE:
1169 case AMDGPU::SI_SPILL_V352_RESTORE:
1170 case AMDGPU::SI_SPILL_A352_SAVE:
1171 case AMDGPU::SI_SPILL_A352_RESTORE:
1172 case AMDGPU::SI_SPILL_AV352_SAVE:
1173 case AMDGPU::SI_SPILL_AV352_RESTORE:
1174 return 11;
1175 case AMDGPU::SI_SPILL_S320_SAVE:
1176 case AMDGPU::SI_SPILL_S320_RESTORE:
1177 case AMDGPU::SI_SPILL_V320_SAVE:
1178 case AMDGPU::SI_SPILL_V320_RESTORE:
1179 case AMDGPU::SI_SPILL_A320_SAVE:
1180 case AMDGPU::SI_SPILL_A320_RESTORE:
1181 case AMDGPU::SI_SPILL_AV320_SAVE:
1182 case AMDGPU::SI_SPILL_AV320_RESTORE:
1183 return 10;
1184 case AMDGPU::SI_SPILL_S288_SAVE:
1185 case AMDGPU::SI_SPILL_S288_RESTORE:
1186 case AMDGPU::SI_SPILL_V288_SAVE:
1187 case AMDGPU::SI_SPILL_V288_RESTORE:
1188 case AMDGPU::SI_SPILL_A288_SAVE:
1189 case AMDGPU::SI_SPILL_A288_RESTORE:
1190 case AMDGPU::SI_SPILL_AV288_SAVE:
1191 case AMDGPU::SI_SPILL_AV288_RESTORE:
1192 return 9;
1193 case AMDGPU::SI_SPILL_S256_SAVE:
1194 case AMDGPU::SI_SPILL_S256_RESTORE:
1195 case AMDGPU::SI_SPILL_V256_SAVE:
1196 case AMDGPU::SI_SPILL_V256_RESTORE:
1197 case AMDGPU::SI_SPILL_A256_SAVE:
1198 case AMDGPU::SI_SPILL_A256_RESTORE:
1199 case AMDGPU::SI_SPILL_AV256_SAVE:
1200 case AMDGPU::SI_SPILL_AV256_RESTORE:
1201 return 8;
1202 case AMDGPU::SI_SPILL_S224_SAVE:
1203 case AMDGPU::SI_SPILL_S224_RESTORE:
1204 case AMDGPU::SI_SPILL_V224_SAVE:
1205 case AMDGPU::SI_SPILL_V224_RESTORE:
1206 case AMDGPU::SI_SPILL_A224_SAVE:
1207 case AMDGPU::SI_SPILL_A224_RESTORE:
1208 case AMDGPU::SI_SPILL_AV224_SAVE:
1209 case AMDGPU::SI_SPILL_AV224_RESTORE:
1210 return 7;
1211 case AMDGPU::SI_SPILL_S192_SAVE:
1212 case AMDGPU::SI_SPILL_S192_RESTORE:
1213 case AMDGPU::SI_SPILL_V192_SAVE:
1214 case AMDGPU::SI_SPILL_V192_RESTORE:
1215 case AMDGPU::SI_SPILL_A192_SAVE:
1216 case AMDGPU::SI_SPILL_A192_RESTORE:
1217 case AMDGPU::SI_SPILL_AV192_SAVE:
1218 case AMDGPU::SI_SPILL_AV192_RESTORE:
1219 return 6;
1220 case AMDGPU::SI_SPILL_S160_SAVE:
1221 case AMDGPU::SI_SPILL_S160_RESTORE:
1222 case AMDGPU::SI_SPILL_V160_SAVE:
1223 case AMDGPU::SI_SPILL_V160_RESTORE:
1224 case AMDGPU::SI_SPILL_A160_SAVE:
1225 case AMDGPU::SI_SPILL_A160_RESTORE:
1226 case AMDGPU::SI_SPILL_AV160_SAVE:
1227 case AMDGPU::SI_SPILL_AV160_RESTORE:
1228 return 5;
1229 case AMDGPU::SI_SPILL_S128_SAVE:
1230 case AMDGPU::SI_SPILL_S128_RESTORE:
1231 case AMDGPU::SI_SPILL_V128_SAVE:
1232 case AMDGPU::SI_SPILL_V128_RESTORE:
1233 case AMDGPU::SI_SPILL_A128_SAVE:
1234 case AMDGPU::SI_SPILL_A128_RESTORE:
1235 case AMDGPU::SI_SPILL_AV128_SAVE:
1236 case AMDGPU::SI_SPILL_AV128_RESTORE:
1237 return 4;
1238 case AMDGPU::SI_SPILL_S96_SAVE:
1239 case AMDGPU::SI_SPILL_S96_RESTORE:
1240 case AMDGPU::SI_SPILL_V96_SAVE:
1241 case AMDGPU::SI_SPILL_V96_RESTORE:
1242 case AMDGPU::SI_SPILL_A96_SAVE:
1243 case AMDGPU::SI_SPILL_A96_RESTORE:
1244 case AMDGPU::SI_SPILL_AV96_SAVE:
1245 case AMDGPU::SI_SPILL_AV96_RESTORE:
1246 return 3;
1247 case AMDGPU::SI_SPILL_S64_SAVE:
1248 case AMDGPU::SI_SPILL_S64_RESTORE:
1249 case AMDGPU::SI_SPILL_V64_SAVE:
1250 case AMDGPU::SI_SPILL_V64_RESTORE:
1251 case AMDGPU::SI_SPILL_A64_SAVE:
1252 case AMDGPU::SI_SPILL_A64_RESTORE:
1253 case AMDGPU::SI_SPILL_AV64_SAVE:
1254 case AMDGPU::SI_SPILL_AV64_RESTORE:
1255 return 2;
1256 case AMDGPU::SI_SPILL_S32_SAVE:
1257 case AMDGPU::SI_SPILL_S32_RESTORE:
1258 case AMDGPU::SI_SPILL_V32_SAVE:
1259 case AMDGPU::SI_SPILL_V32_RESTORE:
1260 case AMDGPU::SI_SPILL_A32_SAVE:
1261 case AMDGPU::SI_SPILL_A32_RESTORE:
1262 case AMDGPU::SI_SPILL_AV32_SAVE:
1263 case AMDGPU::SI_SPILL_AV32_RESTORE:
1264 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1265 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1266 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1267 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1268 case AMDGPU::SI_SPILL_V16_SAVE:
1269 case AMDGPU::SI_SPILL_V16_RESTORE:
1270 return 1;
1271 default: llvm_unreachable("Invalid spill opcode");
1272 }
1273}
1274
1275static int getOffsetMUBUFStore(unsigned Opc) {
1276 switch (Opc) {
1277 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1278 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1279 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1280 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1281 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1282 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1283 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1284 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1285 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1286 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1287 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1288 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1289 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1290 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1291 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1292 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1293 default:
1294 return -1;
1295 }
1296}
1297
1298static int getOffsetMUBUFLoad(unsigned Opc) {
1299 switch (Opc) {
1300 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1301 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1302 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1303 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1304 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1305 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1306 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1307 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1308 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1309 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1310 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1311 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1312 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1313 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1314 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1315 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1316 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1317 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1318 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1319 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1320 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1321 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1322 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1323 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1324 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1325 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1326 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1327 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1328 default:
1329 return -1;
1330 }
1331}
1332
1333static int getOffenMUBUFStore(unsigned Opc) {
1334 switch (Opc) {
1335 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1336 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1337 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1338 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1339 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1340 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1341 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1342 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1343 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1344 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1345 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1346 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1347 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1348 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1349 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1350 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1351 default:
1352 return -1;
1353 }
1354}
1355
1356static int getOffenMUBUFLoad(unsigned Opc) {
1357 switch (Opc) {
1358 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1359 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1360 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1361 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1362 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1363 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1364 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1365 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1366 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1367 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1368 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1369 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1370 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1371 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1372 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1373 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1374 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1375 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1376 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1377 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1378 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1379 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1380 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1381 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1382 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1383 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1384 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1385 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1386 default:
1387 return -1;
1388 }
1389}
1390
1394 int Index, unsigned Lane,
1395 unsigned ValueReg, bool IsKill) {
1396 MachineFunction *MF = MBB.getParent();
1398 const SIInstrInfo *TII = ST.getInstrInfo();
1399
1400 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1401
1402 if (Reg == AMDGPU::NoRegister)
1403 return MachineInstrBuilder();
1404
1405 bool IsStore = MI->mayStore();
1407 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1408
1409 unsigned Dst = IsStore ? Reg : ValueReg;
1410 unsigned Src = IsStore ? ValueReg : Reg;
1411 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1412 DebugLoc DL = MI->getDebugLoc();
1413 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1414 // Spiller during regalloc may restore a spilled register to its superclass.
1415 // It could result in AGPR spills restored to VGPRs or the other way around,
1416 // making the src and dst with identical regclasses at this point. It just
1417 // needs a copy in such cases.
1418 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1419 .addReg(Src, getKillRegState(IsKill));
1421 return CopyMIB;
1422 }
1423 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1424 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1425
1426 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1427 .addReg(Src, getKillRegState(IsKill));
1429 return MIB;
1430}
1431
1432// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1433// need to handle the case where an SGPR may need to be spilled while spilling.
1435 MachineFrameInfo &MFI,
1437 int Index,
1438 int64_t Offset) {
1439 const SIInstrInfo *TII = ST.getInstrInfo();
1440 MachineBasicBlock *MBB = MI->getParent();
1441 const DebugLoc &DL = MI->getDebugLoc();
1442 bool IsStore = MI->mayStore();
1443
1444 unsigned Opc = MI->getOpcode();
1445 int LoadStoreOp = IsStore ?
1447 if (LoadStoreOp == -1)
1448 return false;
1449
1450 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1451 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1452 return true;
1453
1454 MachineInstrBuilder NewMI =
1455 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1456 .add(*Reg)
1457 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1458 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1459 .addImm(Offset)
1460 .addImm(0) // cpol
1461 .addImm(0) // swz
1462 .cloneMemRefs(*MI);
1463
1464 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1465 AMDGPU::OpName::vdata_in);
1466 if (VDataIn)
1467 NewMI.add(*VDataIn);
1468 return true;
1469}
1470
1472 unsigned LoadStoreOp,
1473 unsigned EltSize) {
1474 bool IsStore = TII->get(LoadStoreOp).mayStore();
1475 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1476 bool UseST =
1477 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1478
1479 // Handle block load/store first.
1480 if (TII->isBlockLoadStore(LoadStoreOp))
1481 return LoadStoreOp;
1482
1483 switch (EltSize) {
1484 case 4:
1485 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1486 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1487 break;
1488 case 8:
1489 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1490 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1491 break;
1492 case 12:
1493 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1494 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1495 break;
1496 case 16:
1497 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1498 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1499 break;
1500 default:
1501 llvm_unreachable("Unexpected spill load/store size!");
1502 }
1503
1504 if (HasVAddr)
1505 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1506 else if (UseST)
1507 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1508
1509 return LoadStoreOp;
1510}
1511
1514 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1515 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1516 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1517 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1518
1519 MachineFunction *MF = MBB.getParent();
1520 const SIInstrInfo *TII = ST.getInstrInfo();
1521 const MachineFrameInfo &MFI = MF->getFrameInfo();
1522 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1523
1524 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1525 bool IsStore = Desc->mayStore();
1526 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1527 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1528
1529 bool CanClobberSCC = false;
1530 bool Scavenged = false;
1531 MCRegister SOffset = ScratchOffsetReg;
1532
1533 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1534 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1535 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1536 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1537
1538 // Always use 4 byte operations for AGPRs because we need to scavenge
1539 // a temporary VGPR.
1540 // If we're using a block operation, the element should be the whole block.
1541 unsigned EltSize = IsBlock ? RegWidth
1542 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1543 : 4u;
1544 unsigned NumSubRegs = RegWidth / EltSize;
1545 unsigned Size = NumSubRegs * EltSize;
1546 unsigned RemSize = RegWidth - Size;
1547 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1548 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1549 int64_t MaterializedOffset = Offset;
1550
1551 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1552 int64_t ScratchOffsetRegDelta = 0;
1553
1554 if (IsFlat && EltSize > 4) {
1555 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1556 Desc = &TII->get(LoadStoreOp);
1557 }
1558
1559 Align Alignment = MFI.getObjectAlign(Index);
1560 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1561
1562 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1563 "unexpected VGPR spill offset");
1564
1565 // Track a VGPR to use for a constant offset we need to materialize.
1566 Register TmpOffsetVGPR;
1567
1568 // Track a VGPR to use as an intermediate value.
1569 Register TmpIntermediateVGPR;
1570 bool UseVGPROffset = false;
1571
1572 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1573 // combination.
1574 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1575 int64_t VOffset) {
1576 // We are using a VGPR offset
1577 if (IsFlat && SGPRBase) {
1578 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1579 // SGPR, so perform the add as vector.
1580 // We don't need a base SGPR in the kernel.
1581
1582 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1583 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1584 .addReg(SGPRBase)
1585 .addImm(VOffset)
1586 .addImm(0); // clamp
1587 } else {
1588 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1589 .addReg(SGPRBase);
1590 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1591 .addImm(VOffset)
1592 .addReg(TmpOffsetVGPR);
1593 }
1594 } else {
1595 assert(TmpOffsetVGPR);
1596 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1597 .addImm(VOffset);
1598 }
1599 };
1600
1601 bool IsOffsetLegal =
1602 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1604 : TII->isLegalMUBUFImmOffset(MaxOffset);
1605 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1606 SOffset = MCRegister();
1607
1608 // We don't have access to the register scavenger if this function is called
1609 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1610 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1611 // entry.
1612 if (RS) {
1613 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1614
1615 // Piggy back on the liveness scan we just did see if SCC is dead.
1616 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1617 } else if (LiveUnits) {
1618 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1619 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1620 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1621 SOffset = Reg;
1622 break;
1623 }
1624 }
1625 }
1626
1627 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1628 SOffset = Register();
1629
1630 if (!SOffset) {
1631 UseVGPROffset = true;
1632
1633 if (RS) {
1634 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1635 } else {
1636 assert(LiveUnits);
1637 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1638 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1639 TmpOffsetVGPR = Reg;
1640 break;
1641 }
1642 }
1643 }
1644
1645 assert(TmpOffsetVGPR);
1646 } else if (!SOffset && CanClobberSCC) {
1647 // There are no free SGPRs, and since we are in the process of spilling
1648 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1649 // on SI/CI and on VI it is true until we implement spilling using scalar
1650 // stores), we have no way to free up an SGPR. Our solution here is to
1651 // add the offset directly to the ScratchOffset or StackPtrOffset
1652 // register, and then subtract the offset after the spill to return the
1653 // register to it's original value.
1654
1655 // TODO: If we don't have to do an emergency stack slot spill, converting
1656 // to use the VGPR offset is fewer instructions.
1657 if (!ScratchOffsetReg)
1658 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1659 SOffset = ScratchOffsetReg;
1660 ScratchOffsetRegDelta = Offset;
1661 } else {
1662 Scavenged = true;
1663 }
1664
1665 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1666 // we can simplify the adjustment of Offset here to just scale with
1667 // WavefrontSize.
1668 if (!IsFlat && !UseVGPROffset)
1669 Offset *= ST.getWavefrontSize();
1670
1671 if (!UseVGPROffset && !SOffset)
1672 report_fatal_error("could not scavenge SGPR to spill in entry function");
1673
1674 if (UseVGPROffset) {
1675 // We are using a VGPR offset
1676 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1677 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1678 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1679 } else {
1680 assert(Offset != 0);
1681 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1682 .addReg(ScratchOffsetReg)
1683 .addImm(Offset);
1684 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1685 }
1686
1687 Offset = 0;
1688 }
1689
1690 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1691 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1692 && "Unexpected vaddr for flat scratch with a FI operand");
1693
1694 if (UseVGPROffset) {
1695 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1696 } else {
1697 assert(ST.hasFlatScratchSTMode());
1698 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1699 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1700 }
1701
1702 Desc = &TII->get(LoadStoreOp);
1703 }
1704
1705 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1706 ++i, RegOffset += EltSize) {
1707 if (i == NumSubRegs) {
1708 EltSize = RemSize;
1709 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1710 }
1711 Desc = &TII->get(LoadStoreOp);
1712
1713 if (!IsFlat && UseVGPROffset) {
1714 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1715 : getOffenMUBUFLoad(LoadStoreOp);
1716 Desc = &TII->get(NewLoadStoreOp);
1717 }
1718
1719 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1720 // If we are spilling an AGPR beyond the range of the memory instruction
1721 // offset and need to use a VGPR offset, we ideally have at least 2
1722 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1723 // recycle the VGPR used for the offset, which requires resetting it after
1724 // each subregister.
1725
1726 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1727 }
1728
1729 unsigned NumRegs = EltSize / 4;
1730 Register SubReg = e == 1
1731 ? ValueReg
1732 : Register(getSubReg(ValueReg,
1733 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1734
1735 unsigned SOffsetRegState = 0;
1736 unsigned SrcDstRegState = getDefRegState(!IsStore);
1737 const bool IsLastSubReg = i + 1 == e;
1738 const bool IsFirstSubReg = i == 0;
1739 if (IsLastSubReg) {
1740 SOffsetRegState |= getKillRegState(Scavenged);
1741 // The last implicit use carries the "Kill" flag.
1742 SrcDstRegState |= getKillRegState(IsKill);
1743 }
1744
1745 // Make sure the whole register is defined if there are undef components by
1746 // adding an implicit def of the super-reg on the first instruction.
1747 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1748 bool NeedSuperRegImpOperand = e > 1;
1749
1750 // Remaining element size to spill into memory after some parts of it
1751 // have been spilled into either AGPRs or VGPRs.
1752 unsigned RemEltSize = EltSize;
1753
1754 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1755 // order, starting from the last lane. In case a register cannot be
1756 // completely spilled into another register, this ensures its alignment
1757 // does not change. For targets with a VGPR alignment requirement this is
1758 // important when flat scratch is used, as we might otherwise get a
1759 // scratch_load or scratch_store of an unaligned register.
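// For instance, if only the upper DWORDs of a register fit into free
// AGPRs/VGPRs, the remaining lower part still starts at its original
// (aligned) offset, so the scratch access below stays aligned.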
1760 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1761 LaneE = RegOffset / 4;
1762 Lane >= LaneE; --Lane) {
1763 bool IsSubReg = e > 1 || EltSize > 4;
1764 Register Sub = IsSubReg
1765 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1766 : ValueReg;
1767 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1768 if (!MIB.getInstr())
1769 break;
1770 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1771 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1772 NeedSuperRegDef = false;
1773 }
1774 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1775 NeedSuperRegImpOperand = true;
1776 unsigned State = SrcDstRegState;
1777 if (!IsLastSubReg || (Lane != LaneE))
1778 State &= ~RegState::Kill;
1779 if (!IsFirstSubReg || (Lane != LaneS))
1780 State &= ~RegState::Define;
1781 MIB.addReg(ValueReg, RegState::Implicit | State);
1782 }
1783 RemEltSize -= 4;
1784 }
1785
1786 if (!RemEltSize) // Fully spilled into AGPRs.
1787 continue;
1788
1789 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1790 assert(IsFlat && EltSize > 4);
1791
1792 unsigned NumRegs = RemEltSize / 4;
1793 SubReg = Register(getSubReg(ValueReg,
1794 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1795 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1796 Desc = &TII->get(Opc);
1797 }
1798
1799 unsigned FinalReg = SubReg;
1800
1801 if (IsAGPR) {
1802 assert(EltSize == 4);
1803
1804 if (!TmpIntermediateVGPR) {
1805 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1806 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1807 }
1808 if (IsStore) {
1809 auto AccRead = BuildMI(MBB, MI, DL,
1810 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1811 TmpIntermediateVGPR)
1812 .addReg(SubReg, getKillRegState(IsKill));
1813 if (NeedSuperRegDef)
1814 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1815 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1816 AccRead.addReg(ValueReg, RegState::Implicit);
1817 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1818 }
1819 SubReg = TmpIntermediateVGPR;
1820 } else if (UseVGPROffset) {
1821 if (!TmpOffsetVGPR) {
1822 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1823 MI, false, 0);
1824 RS->setRegUsed(TmpOffsetVGPR);
1825 }
1826 }
1827
1828 Register FinalValueReg = ValueReg;
1829 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1830 // If we are loading a 16-bit value with SRAMECC enabled, we need a temp
1831 // 32-bit VGPR to load into and then extract the 16 bits into the final register.
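// Illustration: the reload below becomes a scratch_load_ushort into the
// scavenged 32-bit VGPR, followed by a v_mov_b16 of its lo16 half into the
// original 16-bit destination (the FinalValueReg != ValueReg case further down).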
1832 ValueReg =
1833 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1834 SubReg = ValueReg;
1835 IsKill = false;
1836 }
1837
1838 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1839 MachineMemOperand *NewMMO =
1840 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1841 commonAlignment(Alignment, RegOffset));
1842
1843 auto MIB =
1844 BuildMI(MBB, MI, DL, *Desc)
1845 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1846
1847 if (UseVGPROffset) {
1848 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1849 // intermediate accvgpr_write.
1850 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1851 }
1852
1853 if (!IsFlat)
1854 MIB.addReg(FuncInfo->getScratchRSrcReg());
1855
1856 if (SOffset == AMDGPU::NoRegister) {
1857 if (!IsFlat) {
1858 if (UseVGPROffset && ScratchOffsetReg) {
1859 MIB.addReg(ScratchOffsetReg);
1860 } else {
1861 assert(FuncInfo->isBottomOfStack());
1862 MIB.addImm(0);
1863 }
1864 }
1865 } else {
1866 MIB.addReg(SOffset, SOffsetRegState);
1867 }
1868
1869 MIB.addImm(Offset + RegOffset);
1870
1871 bool LastUse = MMO->getFlags() & MOLastUse;
1872 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1873
1874 if (!IsFlat)
1875 MIB.addImm(0); // swz
1876 MIB.addMemOperand(NewMMO);
1877
1878 if (FinalValueReg != ValueReg) {
1879 // Extract the 16-bit value from the loaded 32-bit value.
1880 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1881 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1882 .addReg(FinalValueReg, getDefRegState(true))
1883 .addImm(0)
1884 .addReg(ValueReg, getKillRegState(true))
1885 .addImm(0);
1886 ValueReg = FinalValueReg;
1887 }
1888
1889 if (!IsAGPR && NeedSuperRegDef)
1890 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1891
1892 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1893 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1894 FinalReg)
1895 .addReg(TmpIntermediateVGPR, RegState::Kill);
1896 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1897 }
1898
1899 bool IsSrcDstDef = SrcDstRegState & RegState::Define;
1900 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1901 if (NeedSuperRegImpOperand &&
1902 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1903 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1904 if (PartialReloadCopy)
1905 MIB.addReg(ValueReg, RegState::Implicit);
1906 }
1907
1908 // The epilog restore of a wwm-scratch register can cause an undesired
1909 // optimization during machine-cp after PrologEpilogInserter if the same
1910 // register was assigned for return-value ABI lowering with a COPY
1911 // instruction. As shown below, with the epilog reload, the earlier COPY
1912 // appears to be dead during machine-cp.
1913 // ...
1914 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1915 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1916 // ...
1917 // Epilog block:
1918 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1919 // ...
1920 // WWM spill restore to preserve the inactive lanes of v0.
1921 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1922 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1923 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1924 // ...
1925 // SI_RETURN implicit $vgpr0
1926 // ...
1927 // To fix it, mark the same reg as a tied op for such restore instructions
1928 // so that it marks a usage for the preceding COPY.
1929 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1930 MI->readsRegister(SubReg, this)) {
1931 MIB.addReg(SubReg, RegState::Implicit);
1932 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1933 }
1934
1935 // If we're building a block load, we should add artificial uses for the
1936 // CSR VGPRs that are *not* being transferred. This is because liveness
1937 // analysis is not aware of the mask, so we need to somehow inform it that
1938 // those registers are not available before the load and they should not be
1939 // scavenged.
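// For example, if the transfer mask leaves out a callee-saved VGPR within the
// 32-register block, addImplicitUsesForBlockCSRLoad (below) adds an implicit
// use of it so it stays unavailable across the block load.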
1940 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1941 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1942 }
1943
1944 if (ScratchOffsetRegDelta != 0) {
1945 // Subtract the offset we added to the ScratchOffset register.
1946 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1947 .addReg(SOffset)
1948 .addImm(-ScratchOffsetRegDelta);
1949 }
1950}
1951
1952 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1953                                                     Register BlockReg) const {
1954 const MachineFunction *MF = MIB->getMF();
1955 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1956 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1957 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
1958 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1959 if (!(Mask & (1 << RegOffset)) &&
1960 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1961 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1962}
1963
1964 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1965                                              int Offset, bool IsLoad,
1966 bool IsKill) const {
1967 // Load/store VGPR
1968 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1969 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1970
1971 Register FrameReg =
1972 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1973 ? getBaseRegister()
1974 : getFrameRegister(SB.MF);
1975
1976 Align Alignment = FrameInfo.getObjectAlign(Index);
1977 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1978 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1979 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1980 SB.EltSize, Alignment);
1981
1982 if (IsLoad) {
1983 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1984 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1985 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1986 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1987 } else {
1988 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1989 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1990 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1991 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1992 // This only ever adds one VGPR spill
1993 SB.MFI.addToSpilledVGPRs(1);
1994 }
1995}
1996
1997 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1998                                RegScavenger *RS, SlotIndexes *Indexes,
1999 LiveIntervals *LIS, bool OnlyToVGPR,
2000 bool SpillToPhysVGPRLane) const {
2001 assert(!MI->getOperand(0).isUndef() &&
2002 "undef spill should have been deleted earlier");
2003
2004 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2005
2006 ArrayRef<SpilledReg> VGPRSpills =
2007 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2009 bool SpillToVGPR = !VGPRSpills.empty();
2010 if (OnlyToVGPR && !SpillToVGPR)
2011 return false;
2012
2013 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2014 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2015
2016 if (SpillToVGPR) {
2017
2018 // Since the stack slot coloring pass tries to optimize SGPR spills,
2019 // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
2020 // spills of different sizes. The number of VGPR lanes allotted equals
2021 // the largest SGPR being spilled into them.
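// E.g. a 64-bit and a 32-bit SGPR spill sharing one slot both get two lanes
// allotted; the smaller spill simply uses fewer of them.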
2022 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2023 "Num of SGPRs spilled should be less than or equal to num of "
2024 "the VGPR lanes.");
2025
2026 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2027 Register SubReg =
2028 SB.NumSubRegs == 1
2029 ? SB.SuperReg
2030 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2031 SpilledReg Spill = VGPRSpills[i];
2032
2033 bool IsFirstSubreg = i == 0;
2034 bool IsLastSubreg = i == SB.NumSubRegs - 1;
2035 bool UseKill = SB.IsKill && IsLastSubreg;
2036
2037
2038 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2039 // spill to this specific vgpr in the first basic block.
2040 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2041 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2042 .addReg(SubReg, getKillRegState(UseKill))
2043 .addImm(Spill.Lane)
2044 .addReg(Spill.VGPR);
2045 if (Indexes) {
2046 if (IsFirstSubreg)
2047 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2048 else
2049 Indexes->insertMachineInstrInMaps(*MIB);
2050 }
2051
2052 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2053 // We may be spilling a super-register which is only partially defined,
2054 // and need to ensure later spills think the value is defined.
2055 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2056 }
2057
2058 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2059 MIB.addReg(SB.SuperReg, RegState::Implicit);
2060
2061 // FIXME: Since this spills to another register instead of an actual
2062 // frame index, we should delete the frame index when all references to
2063 // it are fixed.
2064 }
2065 } else {
2066 SB.prepare();
2067
2068 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2069 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2070
2071 // Per VGPR helper data
2072 auto PVD = SB.getPerVGPRData();
2073
2074 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2075 unsigned TmpVGPRFlags = RegState::Undef;
2076
2077 // Write sub registers into the VGPR
2078 for (unsigned i = Offset * PVD.PerVGPR,
2079 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2080 i < e; ++i) {
2081 Register SubReg =
2082 SB.NumSubRegs == 1
2083 ? SB.SuperReg
2084 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2085
2086 MachineInstrBuilder WriteLane =
2087 BuildMI(*SB.MBB, MI, SB.DL,
2088 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2089 .addReg(SubReg, SubKillState)
2090 .addImm(i % PVD.PerVGPR)
2091 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2092 TmpVGPRFlags = 0;
2093
2094 if (Indexes) {
2095 if (i == 0)
2096 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2097 else
2098 Indexes->insertMachineInstrInMaps(*WriteLane);
2099 }
2100
2101 // There could be undef components of a spilled super register.
2102 // TODO: Can we detect this and skip the spill?
2103 if (SB.NumSubRegs > 1) {
2104 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2105 unsigned SuperKillState = 0;
2106 if (i + 1 == SB.NumSubRegs)
2107 SuperKillState |= getKillRegState(SB.IsKill);
2108 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2109 }
2110 }
2111
2112 // Write out VGPR
2113 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2114 }
2115
2116 SB.restore();
2117 }
2118
2119 MI->eraseFromParent();
2120 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2121
2122 if (LIS)
2123 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2124
2125 return true;
2126}
2127
2128 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2129                                  RegScavenger *RS, SlotIndexes *Indexes,
2130 LiveIntervals *LIS, bool OnlyToVGPR,
2131 bool SpillToPhysVGPRLane) const {
2132 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2133
2134 ArrayRef<SpilledReg> VGPRSpills =
2135 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2136 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2137 bool SpillToVGPR = !VGPRSpills.empty();
2138 if (OnlyToVGPR && !SpillToVGPR)
2139 return false;
2140
2141 if (SpillToVGPR) {
2142 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2143 Register SubReg =
2144 SB.NumSubRegs == 1
2145 ? SB.SuperReg
2146 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2147
2148 SpilledReg Spill = VGPRSpills[i];
2149 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2150 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2151 .addReg(Spill.VGPR)
2152 .addImm(Spill.Lane);
2153 if (SB.NumSubRegs > 1 && i == 0)
2154 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2155 if (Indexes) {
2156 if (i == e - 1)
2157 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2158 else
2159 Indexes->insertMachineInstrInMaps(*MIB);
2160 }
2161 }
2162 } else {
2163 SB.prepare();
2164
2165 // Per VGPR helper data
2166 auto PVD = SB.getPerVGPRData();
2167
2168 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2169 // Load in VGPR data
2170 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2171
2172 // Unpack lanes
2173 for (unsigned i = Offset * PVD.PerVGPR,
2174 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2175 i < e; ++i) {
2176 Register SubReg =
2177 SB.NumSubRegs == 1
2178 ? SB.SuperReg
2179 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2180
2181 bool LastSubReg = (i + 1 == e);
2182 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2183 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2184 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2185 .addImm(i);
2186 if (SB.NumSubRegs > 1 && i == 0)
2187 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2188 if (Indexes) {
2189 if (i == e - 1)
2190 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2191 else
2192 Indexes->insertMachineInstrInMaps(*MIB);
2193 }
2194 }
2195 }
2196
2197 SB.restore();
2198 }
2199
2200 MI->eraseFromParent();
2201
2202 if (LIS)
2203 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2204
2205 return true;
2206}
2207
2208 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2209                                         MachineBasicBlock &RestoreMBB,
2210 Register SGPR, RegScavenger *RS) const {
2211 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2212 RS);
2213 SB.prepare();
2214 // Generate the spill of SGPR to SB.TmpVGPR.
2215 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2216 auto PVD = SB.getPerVGPRData();
2217 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2218 unsigned TmpVGPRFlags = RegState::Undef;
2219 // Write sub registers into the VGPR
2220 for (unsigned i = Offset * PVD.PerVGPR,
2221 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2222 i < e; ++i) {
2223 Register SubReg =
2224 SB.NumSubRegs == 1
2225 ? SB.SuperReg
2226 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2227
2228 MachineInstrBuilder WriteLane =
2229 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2230 SB.TmpVGPR)
2231 .addReg(SubReg, SubKillState)
2232 .addImm(i % PVD.PerVGPR)
2233 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2234 TmpVGPRFlags = 0;
2235 // There could be undef components of a spilled super register.
2236 // TODO: Can we detect this and skip the spill?
2237 if (SB.NumSubRegs > 1) {
2238 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2239 unsigned SuperKillState = 0;
2240 if (i + 1 == SB.NumSubRegs)
2241 SuperKillState |= getKillRegState(SB.IsKill);
2242 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2243 }
2244 }
2245 // Don't need to write VGPR out.
2246 }
2247
2248 // Restore clobbered registers in the specified restore block.
2249 MI = RestoreMBB.end();
2250 SB.setMI(&RestoreMBB, MI);
2251 // Generate the restore of SGPR from SB.TmpVGPR.
2252 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2253 // Don't need to load VGPR in.
2254 // Unpack lanes
2255 for (unsigned i = Offset * PVD.PerVGPR,
2256 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2257 i < e; ++i) {
2258 Register SubReg =
2259 SB.NumSubRegs == 1
2260 ? SB.SuperReg
2261 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2262
2263 assert(SubReg.isPhysical());
2264 bool LastSubReg = (i + 1 == e);
2265 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2266 SubReg)
2267 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2268 .addImm(i);
2269 if (SB.NumSubRegs > 1 && i == 0)
2270 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2271 }
2272 }
2273 SB.restore();
2274
2276 return false;
2277}
2278
2279/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2280/// a VGPR and the stack slot can be safely eliminated when all other users are
2281/// handled.
2282 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2283     MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2284     SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2285 switch (MI->getOpcode()) {
2286 case AMDGPU::SI_SPILL_S1024_SAVE:
2287 case AMDGPU::SI_SPILL_S512_SAVE:
2288 case AMDGPU::SI_SPILL_S384_SAVE:
2289 case AMDGPU::SI_SPILL_S352_SAVE:
2290 case AMDGPU::SI_SPILL_S320_SAVE:
2291 case AMDGPU::SI_SPILL_S288_SAVE:
2292 case AMDGPU::SI_SPILL_S256_SAVE:
2293 case AMDGPU::SI_SPILL_S224_SAVE:
2294 case AMDGPU::SI_SPILL_S192_SAVE:
2295 case AMDGPU::SI_SPILL_S160_SAVE:
2296 case AMDGPU::SI_SPILL_S128_SAVE:
2297 case AMDGPU::SI_SPILL_S96_SAVE:
2298 case AMDGPU::SI_SPILL_S64_SAVE:
2299 case AMDGPU::SI_SPILL_S32_SAVE:
2300 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2301 case AMDGPU::SI_SPILL_S1024_RESTORE:
2302 case AMDGPU::SI_SPILL_S512_RESTORE:
2303 case AMDGPU::SI_SPILL_S384_RESTORE:
2304 case AMDGPU::SI_SPILL_S352_RESTORE:
2305 case AMDGPU::SI_SPILL_S320_RESTORE:
2306 case AMDGPU::SI_SPILL_S288_RESTORE:
2307 case AMDGPU::SI_SPILL_S256_RESTORE:
2308 case AMDGPU::SI_SPILL_S224_RESTORE:
2309 case AMDGPU::SI_SPILL_S192_RESTORE:
2310 case AMDGPU::SI_SPILL_S160_RESTORE:
2311 case AMDGPU::SI_SPILL_S128_RESTORE:
2312 case AMDGPU::SI_SPILL_S96_RESTORE:
2313 case AMDGPU::SI_SPILL_S64_RESTORE:
2314 case AMDGPU::SI_SPILL_S32_RESTORE:
2315 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2316 default:
2317 llvm_unreachable("not an SGPR spill instruction");
2318 }
2319}
2320
2321 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2322                                          int SPAdj, unsigned FIOperandNum,
2323 RegScavenger *RS) const {
2324 MachineFunction *MF = MI->getMF();
2325 MachineBasicBlock *MBB = MI->getParent();
2326 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2327 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2328 const SIInstrInfo *TII = ST.getInstrInfo();
2329 const DebugLoc &DL = MI->getDebugLoc();
2330
2331 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2332
2333 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2334 "unreserved scratch RSRC register");
2335
2336 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2337 int Index = MI->getOperand(FIOperandNum).getIndex();
2338
2339 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2340 ? getBaseRegister()
2341 : getFrameRegister(*MF);
2342
2343 switch (MI->getOpcode()) {
2344 // SGPR register spill
2345 case AMDGPU::SI_SPILL_S1024_SAVE:
2346 case AMDGPU::SI_SPILL_S512_SAVE:
2347 case AMDGPU::SI_SPILL_S384_SAVE:
2348 case AMDGPU::SI_SPILL_S352_SAVE:
2349 case AMDGPU::SI_SPILL_S320_SAVE:
2350 case AMDGPU::SI_SPILL_S288_SAVE:
2351 case AMDGPU::SI_SPILL_S256_SAVE:
2352 case AMDGPU::SI_SPILL_S224_SAVE:
2353 case AMDGPU::SI_SPILL_S192_SAVE:
2354 case AMDGPU::SI_SPILL_S160_SAVE:
2355 case AMDGPU::SI_SPILL_S128_SAVE:
2356 case AMDGPU::SI_SPILL_S96_SAVE:
2357 case AMDGPU::SI_SPILL_S64_SAVE:
2358 case AMDGPU::SI_SPILL_S32_SAVE: {
2359 return spillSGPR(MI, Index, RS);
2360 }
2361
2362 // SGPR register restore
2363 case AMDGPU::SI_SPILL_S1024_RESTORE:
2364 case AMDGPU::SI_SPILL_S512_RESTORE:
2365 case AMDGPU::SI_SPILL_S384_RESTORE:
2366 case AMDGPU::SI_SPILL_S352_RESTORE:
2367 case AMDGPU::SI_SPILL_S320_RESTORE:
2368 case AMDGPU::SI_SPILL_S288_RESTORE:
2369 case AMDGPU::SI_SPILL_S256_RESTORE:
2370 case AMDGPU::SI_SPILL_S224_RESTORE:
2371 case AMDGPU::SI_SPILL_S192_RESTORE:
2372 case AMDGPU::SI_SPILL_S160_RESTORE:
2373 case AMDGPU::SI_SPILL_S128_RESTORE:
2374 case AMDGPU::SI_SPILL_S96_RESTORE:
2375 case AMDGPU::SI_SPILL_S64_RESTORE:
2376 case AMDGPU::SI_SPILL_S32_RESTORE: {
2377 return restoreSGPR(MI, Index, RS);
2378 }
2379
2380 // VGPR register spill
2381 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2382 // Put mask into M0.
2383 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2384 AMDGPU::M0)
2385 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2386 [[fallthrough]];
2387 }
2388 case AMDGPU::SI_SPILL_V1024_SAVE:
2389 case AMDGPU::SI_SPILL_V512_SAVE:
2390 case AMDGPU::SI_SPILL_V384_SAVE:
2391 case AMDGPU::SI_SPILL_V352_SAVE:
2392 case AMDGPU::SI_SPILL_V320_SAVE:
2393 case AMDGPU::SI_SPILL_V288_SAVE:
2394 case AMDGPU::SI_SPILL_V256_SAVE:
2395 case AMDGPU::SI_SPILL_V224_SAVE:
2396 case AMDGPU::SI_SPILL_V192_SAVE:
2397 case AMDGPU::SI_SPILL_V160_SAVE:
2398 case AMDGPU::SI_SPILL_V128_SAVE:
2399 case AMDGPU::SI_SPILL_V96_SAVE:
2400 case AMDGPU::SI_SPILL_V64_SAVE:
2401 case AMDGPU::SI_SPILL_V32_SAVE:
2402 case AMDGPU::SI_SPILL_V16_SAVE:
2403 case AMDGPU::SI_SPILL_A1024_SAVE:
2404 case AMDGPU::SI_SPILL_A512_SAVE:
2405 case AMDGPU::SI_SPILL_A384_SAVE:
2406 case AMDGPU::SI_SPILL_A352_SAVE:
2407 case AMDGPU::SI_SPILL_A320_SAVE:
2408 case AMDGPU::SI_SPILL_A288_SAVE:
2409 case AMDGPU::SI_SPILL_A256_SAVE:
2410 case AMDGPU::SI_SPILL_A224_SAVE:
2411 case AMDGPU::SI_SPILL_A192_SAVE:
2412 case AMDGPU::SI_SPILL_A160_SAVE:
2413 case AMDGPU::SI_SPILL_A128_SAVE:
2414 case AMDGPU::SI_SPILL_A96_SAVE:
2415 case AMDGPU::SI_SPILL_A64_SAVE:
2416 case AMDGPU::SI_SPILL_A32_SAVE:
2417 case AMDGPU::SI_SPILL_AV1024_SAVE:
2418 case AMDGPU::SI_SPILL_AV512_SAVE:
2419 case AMDGPU::SI_SPILL_AV384_SAVE:
2420 case AMDGPU::SI_SPILL_AV352_SAVE:
2421 case AMDGPU::SI_SPILL_AV320_SAVE:
2422 case AMDGPU::SI_SPILL_AV288_SAVE:
2423 case AMDGPU::SI_SPILL_AV256_SAVE:
2424 case AMDGPU::SI_SPILL_AV224_SAVE:
2425 case AMDGPU::SI_SPILL_AV192_SAVE:
2426 case AMDGPU::SI_SPILL_AV160_SAVE:
2427 case AMDGPU::SI_SPILL_AV128_SAVE:
2428 case AMDGPU::SI_SPILL_AV96_SAVE:
2429 case AMDGPU::SI_SPILL_AV64_SAVE:
2430 case AMDGPU::SI_SPILL_AV32_SAVE:
2431 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2432 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2433 const MachineOperand *VData = TII->getNamedOperand(*MI,
2434 AMDGPU::OpName::vdata);
2435 if (VData->isUndef()) {
2436 MI->eraseFromParent();
2437 return true;
2438 }
2439
2440 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2441 MFI->getStackPtrOffsetReg());
2442
2443 unsigned Opc;
2444 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2445 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2446 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2447 } else {
2448 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2449 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2450 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2451 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2452 }
2453
2454 auto *MBB = MI->getParent();
2455 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2456 if (IsWWMRegSpill) {
2457 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2458 RS->isRegUsed(AMDGPU::SCC));
2459 }
2460 buildSpillLoadStore(
2461 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2462 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2463 *MI->memoperands_begin(), RS);
2465 if (IsWWMRegSpill)
2466 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2467
2468 MI->eraseFromParent();
2469 return true;
2470 }
2471 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2472 // Put mask into M0.
2473 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2474 AMDGPU::M0)
2475 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2476 [[fallthrough]];
2477 }
2478 case AMDGPU::SI_SPILL_V16_RESTORE:
2479 case AMDGPU::SI_SPILL_V32_RESTORE:
2480 case AMDGPU::SI_SPILL_V64_RESTORE:
2481 case AMDGPU::SI_SPILL_V96_RESTORE:
2482 case AMDGPU::SI_SPILL_V128_RESTORE:
2483 case AMDGPU::SI_SPILL_V160_RESTORE:
2484 case AMDGPU::SI_SPILL_V192_RESTORE:
2485 case AMDGPU::SI_SPILL_V224_RESTORE:
2486 case AMDGPU::SI_SPILL_V256_RESTORE:
2487 case AMDGPU::SI_SPILL_V288_RESTORE:
2488 case AMDGPU::SI_SPILL_V320_RESTORE:
2489 case AMDGPU::SI_SPILL_V352_RESTORE:
2490 case AMDGPU::SI_SPILL_V384_RESTORE:
2491 case AMDGPU::SI_SPILL_V512_RESTORE:
2492 case AMDGPU::SI_SPILL_V1024_RESTORE:
2493 case AMDGPU::SI_SPILL_A32_RESTORE:
2494 case AMDGPU::SI_SPILL_A64_RESTORE:
2495 case AMDGPU::SI_SPILL_A96_RESTORE:
2496 case AMDGPU::SI_SPILL_A128_RESTORE:
2497 case AMDGPU::SI_SPILL_A160_RESTORE:
2498 case AMDGPU::SI_SPILL_A192_RESTORE:
2499 case AMDGPU::SI_SPILL_A224_RESTORE:
2500 case AMDGPU::SI_SPILL_A256_RESTORE:
2501 case AMDGPU::SI_SPILL_A288_RESTORE:
2502 case AMDGPU::SI_SPILL_A320_RESTORE:
2503 case AMDGPU::SI_SPILL_A352_RESTORE:
2504 case AMDGPU::SI_SPILL_A384_RESTORE:
2505 case AMDGPU::SI_SPILL_A512_RESTORE:
2506 case AMDGPU::SI_SPILL_A1024_RESTORE:
2507 case AMDGPU::SI_SPILL_AV32_RESTORE:
2508 case AMDGPU::SI_SPILL_AV64_RESTORE:
2509 case AMDGPU::SI_SPILL_AV96_RESTORE:
2510 case AMDGPU::SI_SPILL_AV128_RESTORE:
2511 case AMDGPU::SI_SPILL_AV160_RESTORE:
2512 case AMDGPU::SI_SPILL_AV192_RESTORE:
2513 case AMDGPU::SI_SPILL_AV224_RESTORE:
2514 case AMDGPU::SI_SPILL_AV256_RESTORE:
2515 case AMDGPU::SI_SPILL_AV288_RESTORE:
2516 case AMDGPU::SI_SPILL_AV320_RESTORE:
2517 case AMDGPU::SI_SPILL_AV352_RESTORE:
2518 case AMDGPU::SI_SPILL_AV384_RESTORE:
2519 case AMDGPU::SI_SPILL_AV512_RESTORE:
2520 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2521 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2522 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2523 const MachineOperand *VData = TII->getNamedOperand(*MI,
2524 AMDGPU::OpName::vdata);
2525 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2526 MFI->getStackPtrOffsetReg());
2527
2528 unsigned Opc;
2529 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2530 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2531 Opc = ST.d16PreservesUnusedBits()
2532 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2533 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2534 } else {
2535 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2536 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2537 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2538 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2539 }
2540
2541 auto *MBB = MI->getParent();
2542 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2543 if (IsWWMRegSpill) {
2544 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2545 RS->isRegUsed(AMDGPU::SCC));
2546 }
2547
2548 buildSpillLoadStore(
2549 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2550 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2551 *MI->memoperands_begin(), RS);
2552
2553 if (IsWWMRegSpill)
2554 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2555
2556 MI->eraseFromParent();
2557 return true;
2558 }
2559 case AMDGPU::V_ADD_U32_e32:
2560 case AMDGPU::V_ADD_U32_e64:
2561 case AMDGPU::V_ADD_CO_U32_e32:
2562 case AMDGPU::V_ADD_CO_U32_e64: {
2563 // TODO: Handle sub, and, or.
2564 unsigned NumDefs = MI->getNumExplicitDefs();
2565 unsigned Src0Idx = NumDefs;
2566
2567 bool HasClamp = false;
2568 MachineOperand *VCCOp = nullptr;
2569
2570 switch (MI->getOpcode()) {
2571 case AMDGPU::V_ADD_U32_e32:
2572 break;
2573 case AMDGPU::V_ADD_U32_e64:
2574 HasClamp = MI->getOperand(3).getImm();
2575 break;
2576 case AMDGPU::V_ADD_CO_U32_e32:
2577 VCCOp = &MI->getOperand(3);
2578 break;
2579 case AMDGPU::V_ADD_CO_U32_e64:
2580 VCCOp = &MI->getOperand(1);
2581 HasClamp = MI->getOperand(4).getImm();
2582 break;
2583 default:
2584 break;
2585 }
2586 bool DeadVCC = !VCCOp || VCCOp->isDead();
2587 MachineOperand &DstOp = MI->getOperand(0);
2588 Register DstReg = DstOp.getReg();
2589
2590 unsigned OtherOpIdx =
2591 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2592 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2593
2594 unsigned Src1Idx = Src0Idx + 1;
2595 Register MaterializedReg = FrameReg;
2596 Register ScavengedVGPR;
2597
2598 int64_t Offset = FrameInfo.getObjectOffset(Index);
2599 // For the non-immediate case, we could fall through to the default
2600 // handling, but we do an in-place update of the result register here to
2601 // avoid scavenging another register.
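// For example, roughly "v_add_u32 %dst, 16, %fi" with an object offset of 32
// becomes an add of the materialized base with the immediate 48.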
2602 if (OtherOp->isImm()) {
2603 int64_t TotalOffset = OtherOp->getImm() + Offset;
2604
2605 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2606 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2607 // If we can't support a VOP3 literal in the VALU instruction, we
2608 // can't specially fold into the add.
2609 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2610 break;
2611 }
2612
2613 OtherOp->setImm(TotalOffset);
2614 Offset = 0;
2615 }
2616
2617 if (FrameReg && !ST.enableFlatScratch()) {
2618 // We should just do an in-place update of the result register. However,
2619 // the value there may also be used by the add, in which case we need a
2620 // temporary register.
2621 //
2622 // FIXME: The scavenger is not finding the result register in the
2623 // common case where the add does not read the register.
2624
2625 ScavengedVGPR = RS->scavengeRegisterBackwards(
2626 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2627
2628 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2629 // shift.
2630 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2631 .addDef(ScavengedVGPR, RegState::Renamable)
2632 .addImm(ST.getWavefrontSizeLog2())
2633 .addReg(FrameReg);
2634 MaterializedReg = ScavengedVGPR;
2635 }
2636
2637 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2638 if (ST.enableFlatScratch() &&
2639 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2640 // We didn't need the shift above, so we have an SGPR for the frame
2641 // register, but may have a VGPR only operand.
2642 //
2643 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2644 // and use the higher constant bus restriction to avoid this copy.
2645
2646 if (!ScavengedVGPR) {
2647 ScavengedVGPR = RS->scavengeRegisterBackwards(
2648 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2649 /*SPAdj=*/0);
2650 }
2651
2652 assert(ScavengedVGPR != DstReg);
2653
2654 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2655 .addReg(MaterializedReg,
2656 MaterializedReg != FrameReg ? RegState::Kill : 0);
2657 MaterializedReg = ScavengedVGPR;
2658 }
2659
2660 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2661 // is not live, we could use a scalar add + vector add instead of 2
2662 // vector adds.
2663 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2664 .addDef(DstReg, RegState::Renamable);
2665 if (NumDefs == 2)
2666 AddI32.add(MI->getOperand(1));
2667
2668 unsigned MaterializedRegFlags =
2669 MaterializedReg != FrameReg ? RegState::Kill : 0;
2670
2671 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2672 // If we know we have a VGPR already, it's more likely the other
2673 // operand is a legal vsrc0.
2674 AddI32
2675 .add(*OtherOp)
2676 .addReg(MaterializedReg, MaterializedRegFlags);
2677 } else {
2678 // Commute operands to avoid violating VOP2 restrictions. This will
2679 // typically happen when using scratch.
2680 AddI32
2681 .addReg(MaterializedReg, MaterializedRegFlags)
2682 .add(*OtherOp);
2683 }
2684
2685 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2686 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2687 AddI32.addImm(0); // clamp
2688
2689 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2690 AddI32.setOperandDead(3); // Dead vcc
2691
2692 MaterializedReg = DstReg;
2693
2694 OtherOp->ChangeToRegister(MaterializedReg, false);
2695 OtherOp->setIsKill(true);
2696 FIOp->ChangeToImmediate(0);
2697 Offset = 0;
2698 } else if (Offset != 0) {
2699 assert(!MaterializedReg);
2700 FIOp->ChangeToImmediate(Offset);
2701 Offset = 0;
2702 } else {
2703 if (DeadVCC && !HasClamp) {
2704 assert(Offset == 0);
2705
2706 // TODO: Losing kills and implicit operands. Just mutate to copy and
2707 // let lowerCopy deal with it?
2708 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2709 // Folded to an identity copy.
2710 MI->eraseFromParent();
2711 return true;
2712 }
2713
2714 // The immediate value should be in OtherOp
2715 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2716 MI->removeOperand(FIOperandNum);
2717
2718 unsigned NumOps = MI->getNumOperands();
2719 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2720 MI->removeOperand(I);
2721
2722 if (NumDefs == 2)
2723 MI->removeOperand(1);
2724
2725 // The code below can't deal with a mov.
2726 return true;
2727 }
2728
2729 // This folded to a constant, but we have to keep the add around for
2730 // pointless implicit defs or clamp modifier.
2731 FIOp->ChangeToImmediate(0);
2732 }
2733
2734 // Try to improve legality by commuting.
2735 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2736 std::swap(FIOp, OtherOp);
2737 std::swap(FIOperandNum, OtherOpIdx);
2738 }
2739
2740 // We need at most one mov to satisfy the operand constraints. Prefer to
2741 // move the FI operand first, as it may be a literal in a VOP3
2742 // instruction.
2743 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2744 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2745 // If commuting didn't make the operands legal, we need to materialize
2746 // in a register.
2747 // TODO: Can use SGPR on gfx10+ in some cases.
2748 if (!ScavengedVGPR) {
2749 ScavengedVGPR = RS->scavengeRegisterBackwards(
2750 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2751 /*SPAdj=*/0);
2752 }
2753
2754 assert(ScavengedVGPR != DstReg);
2755
2756 MachineOperand &Src = MI->getOperand(SrcIdx);
2757 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2758 .add(Src);
2759
2760 Src.ChangeToRegister(ScavengedVGPR, false);
2761 Src.setIsKill(true);
2762 break;
2763 }
2764 }
2765
2766 // Fold out add of 0 case that can appear in kernels.
2767 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2768 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2769 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2770 }
2771
2772 MI->eraseFromParent();
2773 }
2774
2775 return true;
2776 }
2777 case AMDGPU::S_ADD_I32:
2778 case AMDGPU::S_ADD_U32: {
2779 // TODO: Handle s_or_b32, s_and_b32.
2780 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2781 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2782
2783 assert(FrameReg || MFI->isBottomOfStack());
2784
2785 MachineOperand &DstOp = MI->getOperand(0);
2786 const DebugLoc &DL = MI->getDebugLoc();
2787 Register MaterializedReg = FrameReg;
2788
2789 // Defend against live scc, which should never happen in practice.
2790 bool DeadSCC = MI->getOperand(3).isDead();
2791
2792 Register TmpReg;
2793
2794 // FIXME: Scavenger should figure out that the result register is
2795 // available. Also should do this for the v_add case.
2796 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2797 TmpReg = DstOp.getReg();
2798
2799 if (FrameReg && !ST.enableFlatScratch()) {
2800 // FIXME: In the common case where the add does not also read its result
2801 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2802 // available.
2803 if (!TmpReg)
2804 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2805 MI, /*RestoreAfter=*/false, 0,
2806 /*AllowSpill=*/false);
2807 if (TmpReg) {
2808 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2809 .addDef(TmpReg, RegState::Renamable)
2810 .addReg(FrameReg)
2811 .addImm(ST.getWavefrontSizeLog2())
2812 .setOperandDead(3); // Set SCC dead
2813 }
2814 MaterializedReg = TmpReg;
2815 }
2816
2817 int64_t Offset = FrameInfo.getObjectOffset(Index);
2818
2819 // For the non-immediate case, we could fall through to the default
2820 // handling, but we do an in-place update of the result register here to
2821 // avoid scavenging another register.
2822 if (OtherOp.isImm()) {
2823 OtherOp.setImm(OtherOp.getImm() + Offset);
2824 Offset = 0;
2825
2826 if (MaterializedReg)
2827 FIOp->ChangeToRegister(MaterializedReg, false);
2828 else
2829 FIOp->ChangeToImmediate(0);
2830 } else if (MaterializedReg) {
2831 // If we can't fold the other operand, do another increment.
2832 Register DstReg = DstOp.getReg();
2833
2834 if (!TmpReg && MaterializedReg == FrameReg) {
2835 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2836 MI, /*RestoreAfter=*/false, 0,
2837 /*AllowSpill=*/false);
2838 DstReg = TmpReg;
2839 }
2840
2841 if (TmpReg) {
2842 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2843 .addDef(DstReg, RegState::Renamable)
2844 .addReg(MaterializedReg, RegState::Kill)
2845 .add(OtherOp);
2846 if (DeadSCC)
2847 AddI32.setOperandDead(3);
2848
2849 MaterializedReg = DstReg;
2850
2851 OtherOp.ChangeToRegister(MaterializedReg, false);
2852 OtherOp.setIsKill(true);
2853 OtherOp.setIsRenamable(true);
2854 }
2856 } else {
2857 // If we don't have any other offset to apply, we can just directly
2858 // interpret the frame index as the offset.
2859 FIOp->ChangeToImmediate(Offset);
2860 }
2861
2862 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2863 assert(Offset == 0);
2864 MI->removeOperand(3);
2865 MI->removeOperand(OtherOpIdx);
2866 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2867 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2868 assert(Offset == 0);
2869 MI->removeOperand(3);
2870 MI->removeOperand(FIOperandNum);
2871 MI->setDesc(
2872 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2873 }
2874
2875 assert(!FIOp->isFI());
2876 return true;
2877 }
2878 default: {
2879 break;
2880 }
2881 }
2882
2883 int64_t Offset = FrameInfo.getObjectOffset(Index);
2884 if (ST.enableFlatScratch()) {
2885 if (TII->isFLATScratch(*MI)) {
2886 assert(
2887 (int16_t)FIOperandNum ==
2888 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2889
2890 // The offset is always swizzled, just replace it
2891 if (FrameReg)
2892 FIOp->ChangeToRegister(FrameReg, false);
2893
2894 MachineOperand *OffsetOp =
2895 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2896 int64_t NewOffset = Offset + OffsetOp->getImm();
2897 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2898 SIInstrFlags::FlatScratch)) {
2899 OffsetOp->setImm(NewOffset);
2900 if (FrameReg)
2901 return false;
2902 Offset = 0;
2903 }
2904
2905 if (!Offset) {
2906 unsigned Opc = MI->getOpcode();
2907 int NewOpc = -1;
2908 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2909 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2910 } else if (ST.hasFlatScratchSTMode()) {
2911 // On GFX10 we have ST mode to use no registers for an address.
2912 // Otherwise we need to materialize 0 into an SGPR.
2913 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2914 }
2915
2916 if (NewOpc != -1) {
2917 // removeOperand doesn't fix up tied operand indexes as it goes, so
2918 // it asserts. Untie vdst_in for now and retie it afterwards.
2919 int VDstIn =
2920 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2921 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2922 MI->getOperand(VDstIn).isTied();
2923 if (TiedVDst)
2924 MI->untieRegOperand(VDstIn);
2925
2926 MI->removeOperand(
2927 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2928
2929 if (TiedVDst) {
2930 int NewVDst =
2931 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2932 int NewVDstIn =
2933 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2934 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2935 MI->tieOperands(NewVDst, NewVDstIn);
2936 }
2937 MI->setDesc(TII->get(NewOpc));
2938 return false;
2939 }
2940 }
2941 }
2942
2943 if (!FrameReg) {
2944 FIOp->ChangeToImmediate(Offset);
2945 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2946 return false;
2947 }
2948
2949 // We need to use a register here. Check whether we can use an SGPR or
2950 // whether we need a VGPR.
2951 FIOp->ChangeToRegister(AMDGPU::M0, false);
2952 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2953
2954 if (!Offset && FrameReg && UseSGPR) {
2955 FIOp->setReg(FrameReg);
2956 return false;
2957 }
2958
2959 const TargetRegisterClass *RC =
2960 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2961
2962 Register TmpReg =
2963 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2964 FIOp->setReg(TmpReg);
2965 FIOp->setIsKill();
2966
2967 if ((!FrameReg || !Offset) && TmpReg) {
2968 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2969 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2970 if (FrameReg)
2971 MIB.addReg(FrameReg);
2972 else
2973 MIB.addImm(Offset);
2974
2975 return false;
2976 }
2977
2978 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2979 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2980
2981 Register TmpSReg =
2982 UseSGPR ? TmpReg
2983 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2984 MI, false, 0, !UseSGPR);
2985
2986 // TODO: for flat scratch another attempt can be made with a VGPR index
2987 // if no SGPRs can be scavenged.
2988 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2989 report_fatal_error("Cannot scavenge register in FI elimination!");
2990
2991 if (!TmpSReg) {
2992 // Use frame register and restore it after.
2993 TmpSReg = FrameReg;
2994 FIOp->setReg(FrameReg);
2995 FIOp->setIsKill(false);
2996 }
2997
2998 if (NeedSaveSCC) {
2999 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
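// Because the offset is even (and assuming the frame register is likewise
// aligned), the SCC bit carried into S_ADDC_U32 lands in bit 0 of the result;
// S_BITCMP1_B32 then re-derives SCC from that bit and S_BITSET0_B32 clears it,
// recovering the intended address while preserving SCC.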
3000 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3001 .addReg(FrameReg)
3002 .addImm(Offset);
3003 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3004 .addReg(TmpSReg)
3005 .addImm(0);
3006 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3007 .addImm(0)
3008 .addReg(TmpSReg);
3009 } else {
3010 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3011 .addReg(FrameReg)
3012 .addImm(Offset);
3013 }
3014
3015 if (!UseSGPR)
3016 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3017 .addReg(TmpSReg, RegState::Kill);
3018
3019 if (TmpSReg == FrameReg) {
3020 // Undo frame register modification.
3021 if (NeedSaveSCC &&
3022 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3023 MachineBasicBlock::iterator I =
3024 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3025 TmpSReg)
3026 .addReg(FrameReg)
3027 .addImm(-Offset);
3028 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3029 .addReg(TmpSReg)
3030 .addImm(0);
3031 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3032 TmpSReg)
3033 .addImm(0)
3034 .addReg(TmpSReg);
3035 } else {
3036 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3037 FrameReg)
3038 .addReg(FrameReg)
3039 .addImm(-Offset);
3040 }
3041 }
3042
3043 return false;
3044 }
3045
3046 bool IsMUBUF = TII->isMUBUF(*MI);
3047
3048 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3049 // Convert to a swizzled stack address by scaling by the wave size.
3050 // In an entry function/kernel the offset is already swizzled.
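// That is, the per-lane address is (frame register >> log2(wave size)) plus
// the object offset; the code below materializes it with a shift and an add.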
3051 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3052 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3053 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3054 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3055 ? &AMDGPU::SReg_32RegClass
3056 : &AMDGPU::VGPR_32RegClass;
3057 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3058 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3059 MI->getOpcode() == AMDGPU::S_MOV_B32;
3060 Register ResultReg =
3061 IsCopy ? MI->getOperand(0).getReg()
3062 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3063
3064 int64_t Offset = FrameInfo.getObjectOffset(Index);
3065 if (Offset == 0) {
3066 unsigned OpCode =
3067 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3068 Register TmpResultReg = ResultReg;
3069 if (IsSALU && LiveSCC) {
3070 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3071 MI, false, 0);
3072 }
3073
3074 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3075 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3076 // For V_LSHRREV, the operands are reversed (the shift count goes
3077 // first).
3078 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3079 else
3080 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3081 if (IsSALU && !LiveSCC)
3082 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3083 if (IsSALU && LiveSCC) {
3084 Register NewDest;
3085 if (IsCopy) {
3086 assert(ResultReg.isPhysical());
3087 NewDest = ResultReg;
3088 } else {
3089 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3090 Shift, false, 0);
3091 }
3092 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3093 .addReg(TmpResultReg);
3094 ResultReg = NewDest;
3095 }
3096 } else {
3097 MachineInstrBuilder MIB;
3098 if (!IsSALU) {
3099 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3100 nullptr) {
3101 // Reuse ResultReg in intermediate step.
3102 Register ScaledReg = ResultReg;
3103
3104 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3105 ScaledReg)
3106 .addImm(ST.getWavefrontSizeLog2())
3107 .addReg(FrameReg);
3108
3109 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3110
3111 // TODO: Fold if use instruction is another add of a constant.
3112 if (IsVOP2 ||
3113 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3114 // FIXME: This can fail
3115 MIB.addImm(Offset);
3116 MIB.addReg(ScaledReg, RegState::Kill);
3117 if (!IsVOP2)
3118 MIB.addImm(0); // clamp bit
3119 } else {
3120 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3121 "Need to reuse carry out register");
3122
3123 // Use scavenged unused carry out as offset register.
3124 Register ConstOffsetReg;
3125 if (!isWave32)
3126 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3127 else
3128 ConstOffsetReg = MIB.getReg(1);
3129
3130 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3131 ConstOffsetReg)
3132 .addImm(Offset);
3133 MIB.addReg(ConstOffsetReg, RegState::Kill);
3134 MIB.addReg(ScaledReg, RegState::Kill);
3135 MIB.addImm(0); // clamp bit
3136 }
3137 }
3138 }
3139 if (!MIB || IsSALU) {
3140 // We have to produce a carry out, and there isn't a free SGPR pair
3141 // for it. We can keep the whole computation on the SALU to avoid
3142 // clobbering an additional register at the cost of an extra mov.
3143
3144 // We may have 1 free scratch SGPR even though a carry out is
3145 // unavailable. Only one additional mov is needed.
3146 Register TmpScaledReg = IsCopy && IsSALU
3147 ? ResultReg
3148 : RS->scavengeRegisterBackwards(
3149 AMDGPU::SReg_32_XM0RegClass, MI,
3150 false, 0, /*AllowSpill=*/false);
3151 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3152 Register TmpResultReg = ScaledReg;
3153
3154 if (!LiveSCC) {
3155 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3156 .addReg(FrameReg)
3157 .addImm(ST.getWavefrontSizeLog2());
3158 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3159 .addReg(TmpResultReg, RegState::Kill)
3160 .addImm(Offset);
3161 } else {
3162 TmpResultReg = RS->scavengeRegisterBackwards(
3163 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3164 MachineInstrBuilder Add;
3164
3166 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3167 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3168 TmpResultReg)
3169 .addImm(ST.getWavefrontSizeLog2())
3170 .addReg(FrameReg);
3171 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3172 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3173 .addImm(Offset);
3174 Add.addReg(ResultReg, RegState::Kill)
3175 .addReg(TmpResultReg, RegState::Kill)
3176 .addImm(0);
3177 } else
3178 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3179 } else {
3180 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3181 "offset is unsafe for v_mad_u32_u24");
3182
3183 // We start with a frame pointer holding a wave-space value, and
3184 // an offset in lane space. We are materializing a lane-space
3185 // value. We can either right-shift the frame pointer to get to
3186 // lane space, or left-shift the offset to get to wave space. We
3187 // can right-shift after the computation to get back to the
3188 // desired per-lane value. We are using the mad_u32_u24 primarily
3189 // as an add with no carry-out clobber.
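// Sketch of the emitted sequence when the offset is not an inline literal:
//   v_mov_b32     vTmp, <offset>
//   v_mad_u32_u24 vTmp, vTmp, wavefrontsize, <frame reg>
//   v_lshrrev_b32 vTmp, log2(wavefrontsize), vTmp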
3190 bool IsInlinableLiteral =
3191 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3192 if (!IsInlinableLiteral) {
3193 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3194 TmpResultReg)
3195 .addImm(Offset);
3196 }
3197
3198 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3199 TmpResultReg);
3200
3201 if (!IsInlinableLiteral) {
3202 Add.addReg(TmpResultReg, RegState::Kill);
3203 } else {
3204 // We fold the offset into the mad itself if it's inlinable.
3205 Add.addImm(Offset);
3206 }
3207 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3208 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3209 TmpResultReg)
3210 .addImm(ST.getWavefrontSizeLog2())
3211 .addReg(TmpResultReg);
3212 }
3213
3214 Register NewDest;
3215 if (IsCopy) {
3216 NewDest = ResultReg;
3217 } else {
3218 NewDest = RS->scavengeRegisterBackwards(
3219 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3220 /*AllowSpill=*/true);
3221 }
3222
3223 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3224 NewDest)
3225 .addReg(TmpResultReg);
3226 ResultReg = NewDest;
3227 }
3228 if (!IsSALU)
3229 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3230 .addReg(TmpResultReg, RegState::Kill);
3231 else
3232 ResultReg = TmpResultReg;
3233 // If there were truly no free SGPRs, we need to undo everything.
3234 if (!TmpScaledReg.isValid()) {
3235 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3236 .addReg(ScaledReg, RegState::Kill)
3237 .addImm(-Offset);
3238 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3239 .addReg(FrameReg)
3240 .addImm(ST.getWavefrontSizeLog2());
3241 }
3242 }
3243 }
3244
3245 // Don't introduce an extra copy if we're just materializing in a mov.
3246 if (IsCopy) {
3247 MI->eraseFromParent();
3248 return true;
3249 }
3250 FIOp->ChangeToRegister(ResultReg, false, false, true);
3251 return false;
3252 }
3253
3254 if (IsMUBUF) {
3255 // Disable offen so we don't need a 0 vgpr base.
3256 assert(
3257 static_cast<int>(FIOperandNum) ==
3258 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3259
3260 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3261 assert((SOffset.isImm() && SOffset.getImm() == 0));
3262
3263 if (FrameReg != AMDGPU::NoRegister)
3264 SOffset.ChangeToRegister(FrameReg, false);
3265
3266 int64_t Offset = FrameInfo.getObjectOffset(Index);
3267 int64_t OldImm =
3268 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3269 int64_t NewOffset = OldImm + Offset;
3270
3271 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3272 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3273 MI->eraseFromParent();
3274 return true;
3275 }
3276 }
3277
3278 // If the offset is simply too big, don't convert to a scratch wave offset
3279 // relative index.
3280
3281 FIOp->ChangeToImmediate(Offset);
3282 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3283 Register TmpReg =
3284 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3285 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3286 .addImm(Offset);
3287 FIOp->ChangeToRegister(TmpReg, false, false, true);
3288 }
3289
3290 return false;
3291}
3292
3296
3298 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3299}
3300
3302 return getRegBitWidth(RC.getID());
3303}
3304
3305static const TargetRegisterClass *
3306 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3307 if (BitWidth == 64)
3308 return &AMDGPU::VReg_64RegClass;
3309 if (BitWidth == 96)
3310 return &AMDGPU::VReg_96RegClass;
3311 if (BitWidth == 128)
3312 return &AMDGPU::VReg_128RegClass;
3313 if (BitWidth == 160)
3314 return &AMDGPU::VReg_160RegClass;
3315 if (BitWidth == 192)
3316 return &AMDGPU::VReg_192RegClass;
3317 if (BitWidth == 224)
3318 return &AMDGPU::VReg_224RegClass;
3319 if (BitWidth == 256)
3320 return &AMDGPU::VReg_256RegClass;
3321 if (BitWidth == 288)
3322 return &AMDGPU::VReg_288RegClass;
3323 if (BitWidth == 320)
3324 return &AMDGPU::VReg_320RegClass;
3325 if (BitWidth == 352)
3326 return &AMDGPU::VReg_352RegClass;
3327 if (BitWidth == 384)
3328 return &AMDGPU::VReg_384RegClass;
3329 if (BitWidth == 512)
3330 return &AMDGPU::VReg_512RegClass;
3331 if (BitWidth == 1024)
3332 return &AMDGPU::VReg_1024RegClass;
3333
3334 return nullptr;
3335}
3336
3337static const TargetRegisterClass *
3338 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3339 if (BitWidth == 64)
3340 return &AMDGPU::VReg_64_Align2RegClass;
3341 if (BitWidth == 96)
3342 return &AMDGPU::VReg_96_Align2RegClass;
3343 if (BitWidth == 128)
3344 return &AMDGPU::VReg_128_Align2RegClass;
3345 if (BitWidth == 160)
3346 return &AMDGPU::VReg_160_Align2RegClass;
3347 if (BitWidth == 192)
3348 return &AMDGPU::VReg_192_Align2RegClass;
3349 if (BitWidth == 224)
3350 return &AMDGPU::VReg_224_Align2RegClass;
3351 if (BitWidth == 256)
3352 return &AMDGPU::VReg_256_Align2RegClass;
3353 if (BitWidth == 288)
3354 return &AMDGPU::VReg_288_Align2RegClass;
3355 if (BitWidth == 320)
3356 return &AMDGPU::VReg_320_Align2RegClass;
3357 if (BitWidth == 352)
3358 return &AMDGPU::VReg_352_Align2RegClass;
3359 if (BitWidth == 384)
3360 return &AMDGPU::VReg_384_Align2RegClass;
3361 if (BitWidth == 512)
3362 return &AMDGPU::VReg_512_Align2RegClass;
3363 if (BitWidth == 1024)
3364 return &AMDGPU::VReg_1024_Align2RegClass;
3365
3366 return nullptr;
3367}
3368
3369const TargetRegisterClass *
3370 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3371 if (BitWidth == 1)
3372 return &AMDGPU::VReg_1RegClass;
3373 if (BitWidth == 16)
3374 return &AMDGPU::VGPR_16RegClass;
3375 if (BitWidth == 32)
3376 return &AMDGPU::VGPR_32RegClass;
3377 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3378 : getAnyVGPRClassForBitWidth(BitWidth);
3379}
3380
3381const TargetRegisterClass *
3382 SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
3383 if (BitWidth <= 32)
3384 return &AMDGPU::VGPR_32_Lo256RegClass;
3385 if (BitWidth <= 64)
3386 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3387 if (BitWidth <= 96)
3388 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3389 if (BitWidth <= 128)
3390 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3391 if (BitWidth <= 160)
3392 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3393 if (BitWidth <= 192)
3394 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3395 if (BitWidth <= 224)
3396 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3397 if (BitWidth <= 256)
3398 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3399 if (BitWidth <= 288)
3400 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3401 if (BitWidth <= 320)
3402 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3403 if (BitWidth <= 352)
3404 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3405 if (BitWidth <= 384)
3406 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3407 if (BitWidth <= 512)
3408 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3409 if (BitWidth <= 1024)
3410 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3411
3412 return nullptr;
3413}
3414
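// AGPR counterparts of the VGPR helpers above: unaligned and even-aligned
// accumulator register tuple classes selected by bit width.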
3415static const TargetRegisterClass *
3416 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3417 if (BitWidth == 64)
3418 return &AMDGPU::AReg_64RegClass;
3419 if (BitWidth == 96)
3420 return &AMDGPU::AReg_96RegClass;
3421 if (BitWidth == 128)
3422 return &AMDGPU::AReg_128RegClass;
3423 if (BitWidth == 160)
3424 return &AMDGPU::AReg_160RegClass;
3425 if (BitWidth == 192)
3426 return &AMDGPU::AReg_192RegClass;
3427 if (BitWidth == 224)
3428 return &AMDGPU::AReg_224RegClass;
3429 if (BitWidth == 256)
3430 return &AMDGPU::AReg_256RegClass;
3431 if (BitWidth == 288)
3432 return &AMDGPU::AReg_288RegClass;
3433 if (BitWidth == 320)
3434 return &AMDGPU::AReg_320RegClass;
3435 if (BitWidth == 352)
3436 return &AMDGPU::AReg_352RegClass;
3437 if (BitWidth == 384)
3438 return &AMDGPU::AReg_384RegClass;
3439 if (BitWidth == 512)
3440 return &AMDGPU::AReg_512RegClass;
3441 if (BitWidth == 1024)
3442 return &AMDGPU::AReg_1024RegClass;
3443
3444 return nullptr;
3445}
3446
3447static const TargetRegisterClass *
3448 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3449 if (BitWidth == 64)
3450 return &AMDGPU::AReg_64_Align2RegClass;
3451 if (BitWidth == 96)
3452 return &AMDGPU::AReg_96_Align2RegClass;
3453 if (BitWidth == 128)
3454 return &AMDGPU::AReg_128_Align2RegClass;
3455 if (BitWidth == 160)
3456 return &AMDGPU::AReg_160_Align2RegClass;
3457 if (BitWidth == 192)
3458 return &AMDGPU::AReg_192_Align2RegClass;
3459 if (BitWidth == 224)
3460 return &AMDGPU::AReg_224_Align2RegClass;
3461 if (BitWidth == 256)
3462 return &AMDGPU::AReg_256_Align2RegClass;
3463 if (BitWidth == 288)
3464 return &AMDGPU::AReg_288_Align2RegClass;
3465 if (BitWidth == 320)
3466 return &AMDGPU::AReg_320_Align2RegClass;
3467 if (BitWidth == 352)
3468 return &AMDGPU::AReg_352_Align2RegClass;
3469 if (BitWidth == 384)
3470 return &AMDGPU::AReg_384_Align2RegClass;
3471 if (BitWidth == 512)
3472 return &AMDGPU::AReg_512_Align2RegClass;
3473 if (BitWidth == 1024)
3474 return &AMDGPU::AReg_1024_Align2RegClass;
3475
3476 return nullptr;
3477}
3478
3479const TargetRegisterClass *
3480 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3481 if (BitWidth == 16)
3482 return &AMDGPU::AGPR_LO16RegClass;
3483 if (BitWidth == 32)
3484 return &AMDGPU::AGPR_32RegClass;
3485 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3486 : getAnyAGPRClassForBitWidth(BitWidth);
3487}
3488
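// AV_* "vector super" classes, whose registers may be allocated to either
// VGPRs or AGPRs; again provided in unaligned and even-aligned flavors.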
3489static const TargetRegisterClass *
3490 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3491 if (BitWidth == 64)
3492 return &AMDGPU::AV_64RegClass;
3493 if (BitWidth == 96)
3494 return &AMDGPU::AV_96RegClass;
3495 if (BitWidth == 128)
3496 return &AMDGPU::AV_128RegClass;
3497 if (BitWidth == 160)
3498 return &AMDGPU::AV_160RegClass;
3499 if (BitWidth == 192)
3500 return &AMDGPU::AV_192RegClass;
3501 if (BitWidth == 224)
3502 return &AMDGPU::AV_224RegClass;
3503 if (BitWidth == 256)
3504 return &AMDGPU::AV_256RegClass;
3505 if (BitWidth == 288)
3506 return &AMDGPU::AV_288RegClass;
3507 if (BitWidth == 320)
3508 return &AMDGPU::AV_320RegClass;
3509 if (BitWidth == 352)
3510 return &AMDGPU::AV_352RegClass;
3511 if (BitWidth == 384)
3512 return &AMDGPU::AV_384RegClass;
3513 if (BitWidth == 512)
3514 return &AMDGPU::AV_512RegClass;
3515 if (BitWidth == 1024)
3516 return &AMDGPU::AV_1024RegClass;
3517
3518 return nullptr;
3519}
3520
3521static const TargetRegisterClass *
3522 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3523 if (BitWidth == 64)
3524 return &AMDGPU::AV_64_Align2RegClass;
3525 if (BitWidth == 96)
3526 return &AMDGPU::AV_96_Align2RegClass;
3527 if (BitWidth == 128)
3528 return &AMDGPU::AV_128_Align2RegClass;
3529 if (BitWidth == 160)
3530 return &AMDGPU::AV_160_Align2RegClass;
3531 if (BitWidth == 192)
3532 return &AMDGPU::AV_192_Align2RegClass;
3533 if (BitWidth == 224)
3534 return &AMDGPU::AV_224_Align2RegClass;
3535 if (BitWidth == 256)
3536 return &AMDGPU::AV_256_Align2RegClass;
3537 if (BitWidth == 288)
3538 return &AMDGPU::AV_288_Align2RegClass;
3539 if (BitWidth == 320)
3540 return &AMDGPU::AV_320_Align2RegClass;
3541 if (BitWidth == 352)
3542 return &AMDGPU::AV_352_Align2RegClass;
3543 if (BitWidth == 384)
3544 return &AMDGPU::AV_384_Align2RegClass;
3545 if (BitWidth == 512)
3546 return &AMDGPU::AV_512_Align2RegClass;
3547 if (BitWidth == 1024)
3548 return &AMDGPU::AV_1024_Align2RegClass;
3549
3550 return nullptr;
3551}
3552
3553 const TargetRegisterClass *
3554 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3555 if (BitWidth == 32)
3556 return &AMDGPU::AV_32RegClass;
3557 return ST.needsAlignedVGPRs()
3558 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3559 : getAnyVectorSuperClassForBitWidth(BitWidth);
3560}
3561
3562const TargetRegisterClass *
3563 SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
3564 // TODO: In principle this should use AV classes for gfx908 too. This is
3565 // limited to 90a+ to avoid regressing special case copy optimizations which
3566 // need new handling. The core issue is that it's not possible to directly
3567 // copy between AGPRs on gfx908, and the current optimizations around that
3568 // expect to see copies to VGPR.
3569 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3570 : getVGPRClassForBitWidth(BitWidth);
3571}
3572
3573const TargetRegisterClass *
3574 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3575 if (BitWidth == 16 || BitWidth == 32)
3576 return &AMDGPU::SReg_32RegClass;
3577 if (BitWidth == 64)
3578 return &AMDGPU::SReg_64RegClass;
3579 if (BitWidth == 96)
3580 return &AMDGPU::SGPR_96RegClass;
3581 if (BitWidth == 128)
3582 return &AMDGPU::SGPR_128RegClass;
3583 if (BitWidth == 160)
3584 return &AMDGPU::SGPR_160RegClass;
3585 if (BitWidth == 192)
3586 return &AMDGPU::SGPR_192RegClass;
3587 if (BitWidth == 224)
3588 return &AMDGPU::SGPR_224RegClass;
3589 if (BitWidth == 256)
3590 return &AMDGPU::SGPR_256RegClass;
3591 if (BitWidth == 288)
3592 return &AMDGPU::SGPR_288RegClass;
3593 if (BitWidth == 320)
3594 return &AMDGPU::SGPR_320RegClass;
3595 if (BitWidth == 352)
3596 return &AMDGPU::SGPR_352RegClass;
3597 if (BitWidth == 384)
3598 return &AMDGPU::SGPR_384RegClass;
3599 if (BitWidth == 512)
3600 return &AMDGPU::SGPR_512RegClass;
3601 if (BitWidth == 1024)
3602 return &AMDGPU::SGPR_1024RegClass;
3603
3604 return nullptr;
3605}
3606
3607 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3608 Register Reg) const {
3609 const TargetRegisterClass *RC;
3610 if (Reg.isVirtual())
3611 RC = MRI.getRegClass(Reg);
3612 else
3613 RC = getPhysRegBaseClass(Reg);
3614 return RC && isSGPRClass(RC);
3615}
3616
3617const TargetRegisterClass *
3618 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3619 unsigned Size = getRegSizeInBits(*SRC);
3620
3621 switch (SRC->getID()) {
3622 default:
3623 break;
3624 case AMDGPU::VS_32_Lo256RegClassID:
3625 case AMDGPU::VS_64_Lo256RegClassID:
3626 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3627 }
3628
3629 const TargetRegisterClass *VRC =
3630 getAllocatableClass(getVGPRClassForBitWidth(Size));
3631 assert(VRC && "Invalid register class size");
3632 return VRC;
3633}
3634
3635const TargetRegisterClass *
3636 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3637 unsigned Size = getRegSizeInBits(*SRC);
3638 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3639 assert(ARC && "Invalid register class size");
3640 return ARC;
3641}
3642
3643const TargetRegisterClass *
3644 SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
3645 unsigned Size = getRegSizeInBits(*SRC);
3646 const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
3647 assert(ARC && "Invalid register class size");
3648 return ARC;
3649}
3650
3651const TargetRegisterClass *
3652 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3653 unsigned Size = getRegSizeInBits(*VRC);
3654 if (Size == 32)
3655 return &AMDGPU::SGPR_32RegClass;
3656 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3657 assert(SRC && "Invalid register class size");
3658 return SRC;
3659}
3660
3661const TargetRegisterClass *
3662 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3663 const TargetRegisterClass *SubRC,
3664 unsigned SubIdx) const {
3665 // Ensure this subregister index is aligned in the super register.
3666 const TargetRegisterClass *MatchRC =
3667 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3668 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3669}
3670
3671bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3672 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3673 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3674 return !ST.hasMFMAInlineLiteralBug();
3675
3676 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3677 OpType <= AMDGPU::OPERAND_SRC_LAST;
3678}
3679
3680bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3681 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3682 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3683 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3684}
3685
3686/// Returns the lowest register that is not used at any point in the function.
3687/// If all registers are used, then this function will return
3688/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3689/// highest unused register.
3690 MCRegister SIRegisterInfo::findUnusedRegister(
3691 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3692 const MachineFunction &MF, bool ReserveHighestRegister) const {
3693 if (ReserveHighestRegister) {
3694 for (MCRegister Reg : reverse(*RC))
3695 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3696 return Reg;
3697 } else {
3698 for (MCRegister Reg : *RC)
3699 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3700 return Reg;
3701 }
3702 return MCRegister();
3703}
3704
3705 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3706 const RegisterBankInfo &RBI,
3707 Register Reg) const {
3708 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3709 if (!RB)
3710 return false;
3711
3712 return !RBI.isDivergentRegBank(RB);
3713}
3714
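// Split a register class of RegBitWidth bits into RegBitWidth / (8 * EltSize)
// parts of EltSize bytes each and return the corresponding sub-register
// indices. Sizes are tracked in 16-bit halves, so EltSize must be >= 2.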
3715 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3716 unsigned EltSize) const {
3717 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3718 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3719
3720 const unsigned RegHalves = RegBitWidth / 16;
3721 const unsigned EltHalves = EltSize / 2;
3722 assert(RegSplitParts.size() + 1 >= EltHalves);
3723
3724 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3725 const unsigned NumParts = RegHalves / EltHalves;
3726
3727 return ArrayRef(Parts.data(), NumParts);
3728}
3729
3730 const TargetRegisterClass *
3731 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3732 Register Reg) const {
3733 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3734}
3735
3736const TargetRegisterClass *
3737 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3738 const MachineOperand &MO) const {
3739 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3740 return getSubRegisterClass(SrcRC, MO.getSubReg());
3741}
3742
3743 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3744 Register Reg) const {
3745 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3746 // Registers without classes are unaddressable, SGPR-like registers.
3747 return RC && isVGPRClass(RC);
3748}
3749
3750 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3751 Register Reg) const {
3752 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3753
3754 // Registers without classes are unaddressable, SGPR-like registers.
3755 return RC && isAGPRClass(RC);
3756}
3757
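// Pressure limits for the 32-bit VGPR/SGPR sets: the per-occupancy maximum at
// the minimum occupancy implied by the workgroup sizes, clamped to the
// function's overall register budget.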
3758 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3759 MachineFunction &MF) const {
3760 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3761 switch (RC->getID()) {
3762 default:
3763 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3764 case AMDGPU::VGPR_32RegClassID:
3765 return std::min(
3766 ST.getMaxNumVGPRs(
3767 MinOcc,
3769 ST.getMaxNumVGPRs(MF));
3770 case AMDGPU::SGPR_32RegClassID:
3771 case AMDGPU::SGPR_LO16RegClassID:
3772 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3773 }
3774}
3775
3776 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3777 unsigned Idx) const {
3778 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3779 case AMDGPU::RegisterPressureSets::VGPR_32:
3780 case AMDGPU::RegisterPressureSets::AGPR_32:
3781 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3782 const_cast<MachineFunction &>(MF));
3783 case AMDGPU::RegisterPressureSets::SReg_32:
3784 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3785 const_cast<MachineFunction &>(MF));
3786 }
3787
3788 llvm_unreachable("Unexpected register pressure set!");
3789}
3790
3791const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
3792 static const int Empty[] = { -1 };
3793
3794 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
3795 return Empty;
3796
3797 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3798}
3799
3800 bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3801 ArrayRef<MCPhysReg> Order,
3802 SmallVectorImpl<MCPhysReg> &Hints,
3803 const MachineFunction &MF,
3804 const VirtRegMap *VRM,
3805 const LiveRegMatrix *Matrix) const {
3806
3807 const MachineRegisterInfo &MRI = MF.getRegInfo();
3808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3809
3810 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3811
3812 switch (Hint.first) {
3813 case AMDGPURI::Size32: {
3814 Register Paired = Hint.second;
3815 assert(Paired);
3816 Register PairedPhys;
3817 if (Paired.isPhysical()) {
3818 PairedPhys =
3819 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3820 } else if (VRM && VRM->hasPhys(Paired)) {
3821 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3822 &AMDGPU::VGPR_32RegClass);
3823 }
3824
3825 // Prefer the paired physreg.
3826 if (PairedPhys)
3827 // isLo(Paired) is implicitly true here from the API of
3828 // getMatchingSuperReg.
3829 Hints.push_back(PairedPhys);
3830 return false;
3831 }
3832 case AMDGPURI::Size16: {
3833 Register Paired = Hint.second;
3834 assert(Paired);
3835 Register PairedPhys;
3836 if (Paired.isPhysical()) {
3837 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3838 } else if (VRM && VRM->hasPhys(Paired)) {
3839 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3840 }
3841
3842 // First prefer the paired physreg.
3843 if (PairedPhys)
3844 Hints.push_back(PairedPhys);
3845 else {
3846 // Add all the lo16 physregs.
3847 // When the Paired operand has not yet been assigned a physreg it is
3848 // better to try putting VirtReg in a lo16 register, because possibly
3849 // later Paired can be assigned to the overlapping register and the COPY
3850 // can be eliminated.
3851 for (MCPhysReg PhysReg : Order) {
3852 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3853 continue;
3854 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3855 !MRI.isReserved(PhysReg))
3856 Hints.push_back(PhysReg);
3857 }
3858 }
3859 return false;
3860 }
3861 default:
3862 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3863 VRM);
3864 }
3865}
3866
3867 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3868 // Not a callee saved register.
3869 return AMDGPU::SGPR30_SGPR31;
3870}
3871
3872const TargetRegisterClass *
3873 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3874 const RegisterBank &RB) const {
3875 switch (RB.getID()) {
3876 case AMDGPU::VGPRRegBankID:
3877 return getVGPRClassForBitWidth(
3878 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3879 case AMDGPU::VCCRegBankID:
3880 assert(Size == 1);
3881 return getWaveMaskRegClass();
3882 case AMDGPU::SGPRRegBankID:
3883 return getSGPRClassForBitWidth(std::max(32u, Size));
3884 case AMDGPU::AGPRRegBankID:
3885 return getAGPRClassForBitWidth(std::max(32u, Size));
3886 default:
3887 llvm_unreachable("unknown register bank");
3888 }
3889}
3890
3891const TargetRegisterClass *
3892 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3893 const MachineRegisterInfo &MRI) const {
3894 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3895 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3896 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3897
3898 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3899 return getAllocatableClass(RC);
3900
3901 return nullptr;
3902}
3903
3904 MCRegister SIRegisterInfo::getVCC() const {
3905 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3906}
3907
3908 MCRegister SIRegisterInfo::getExec() const {
3909 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3910}
3911
3912 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3913 // VGPR tuples have an alignment requirement on gfx90a variants.
3914 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3915 : &AMDGPU::VReg_64RegClass;
3916}
3917
3918// Find reaching register definition
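// for Reg (or the lanes selected by SubReg) that reaches Use, using
// LiveIntervals for virtual registers and register units for physical ones.
// Returns nullptr if no single dominating definition is found.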
3919 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3920 MachineInstr &Use,
3921 MachineRegisterInfo &MRI,
3922 LiveIntervals *LIS) const {
3923 auto &MDT = LIS->getDomTree();
3924 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3925 SlotIndex DefIdx;
3926
3927 if (Reg.isVirtual()) {
3928 if (!LIS->hasInterval(Reg))
3929 return nullptr;
3930 LiveInterval &LI = LIS->getInterval(Reg);
3931 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3932 : MRI.getMaxLaneMaskForVReg(Reg);
3933 VNInfo *V = nullptr;
3934 if (LI.hasSubRanges()) {
3935 for (auto &S : LI.subranges()) {
3936 if ((S.LaneMask & SubLanes) == SubLanes) {
3937 V = S.getVNInfoAt(UseIdx);
3938 break;
3939 }
3940 }
3941 } else {
3942 V = LI.getVNInfoAt(UseIdx);
3943 }
3944 if (!V)
3945 return nullptr;
3946 DefIdx = V->def;
3947 } else {
3948 // Find last def.
3949 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3950 LiveRange &LR = LIS->getRegUnit(Unit);
3951 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3952 if (!DefIdx.isValid() ||
3953 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3954 LIS->getInstructionFromIndex(V->def)))
3955 DefIdx = V->def;
3956 } else {
3957 return nullptr;
3958 }
3959 }
3960 }
3961
3962 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3963
3964 if (!Def || !MDT.dominates(Def, &Use))
3965 return nullptr;
3966
3967 assert(Def->modifiesRegister(Reg, this));
3968
3969 return Def;
3970}
3971
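// Map a 16-bit register onto the 32-bit register that contains it: the lo16
// half of a VGPR, SGPR or AGPR, or the hi16 half of a VGPR.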
3972 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3973 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3974
3975 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3976 AMDGPU::SReg_32RegClass,
3977 AMDGPU::AGPR_32RegClass } ) {
3978 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3979 return Super;
3980 }
3981 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3982 &AMDGPU::VGPR_32RegClass)) {
3983 return Super;
3984 }
3985
3986 return AMDGPU::NoRegister;
3987}
3988
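// When the subtarget requires even-aligned VGPR tuples, check that RC is one
// of the aligned (_Align2) classes; always true otherwise.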
3989 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3990 if (!ST.needsAlignedVGPRs())
3991 return true;
3992
3993 if (isVGPRClass(&RC))
3994 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3995 if (isAGPRClass(&RC))
3996 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3997 if (isVectorSuperClass(&RC))
3998 return RC.hasSuperClassEq(
3999 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4000
4001 assert(&RC != &AMDGPU::VS_64RegClass);
4002
4003 return true;
4004}
4005
4006 ArrayRef<MCPhysReg>
4007 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
4008 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4009}
4010
4011 ArrayRef<MCPhysReg>
4012 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
4013 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4014}
4015
4016 ArrayRef<MCPhysReg>
4017 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
4018 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4019}
4020
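// Alignment, in bits, that can be assumed for SubReg within RC: the
// sub-register size capped at 128 bits for SGPR classes and at 32 bits for
// VGPR/AGPR classes, or 0 for an unrecognized register kind.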
4021unsigned
4022 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
4023 unsigned SubReg) const {
4024 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4025 case SIRCFlags::HasSGPR:
4026 return std::min(128u, getSubRegIdxSize(SubReg));
4027 case SIRCFlags::HasAGPR:
4028 case SIRCFlags::HasVGPR:
4029 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
4030 return std::min(32u, getSubRegIdxSize(SubReg));
4031 default:
4032 break;
4033 }
4034 return 0;
4035}
4036
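// One past the highest hardware register index of RC that is used in the
// function; for VGPR_32 only the addressable architectural VGPRs are scanned.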
4037 unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4038 const TargetRegisterClass &RC,
4039 bool IncludeCalls) const {
4040 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
4041 ArrayRef<MCPhysReg> Registers =
4042 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4043 ? RC.getRegisters().take_front(NumArchVGPRs)
4044 : RC.getRegisters();
4045 for (MCPhysReg Reg : reverse(Registers))
4046 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4047 return getHWRegIndex(Reg) + 1;
4048 return 0;
4049}
4050
4051 SmallVector<StringLiteral>
4052 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4053 const MachineFunction &MF) const {
4054 SmallVector<StringLiteral> RegFlags;
4055 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4056 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4057 RegFlags.push_back("WWM_REG");
4058 return RegFlags;
4059}