// (scrape artifact) LLVM 23.0.0git — SIRegisterInfo.cpp, doxygen "documentation
// of this file" page header; not part of the original source.
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
// Out-of-line definitions of SIRegisterInfo's static lookup tables; both are
// filled in lazily by the constructor (see the call_once initializers below).
std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
// Report an "unsupported" diagnostic for function Fn, attached to MI's debug
// location, with message ErrMsg.
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
                                 const Twine &ErrMsg) {
  // NOTE(review): the enclosing diagnose(...) call line is elided in this view
  // of the file; only the diagnostic-construction argument remains visible.
      DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
}
53
namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
// just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save needed, all or inactive lanes of a TmpVGPR
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
// s_not exec, exec
// buffer_store_dword TmpVGPR ; save inactive lanes
// s_not exec, exec
// NOTE(review): the `struct SGPRSpillBuilder {` declaration line is elided in
// this view of the file.
  // Bookkeeping for splitting the spill across VGPRs: lanes per VGPR, number
  // of VGPRs needed, and the lane mask used within one VGPR.
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  // NOTE(review): the SuperReg / MI member declarations are elided in this
  // view of the file.
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  // NOTE(review): the RS / MBB / MF / MFI / TII / TRI / IsWave32 / ExecReg
  // member declarations are elided in this view of the file.
  unsigned MovOpc;
  unsigned NotOpc;

  // Convenience constructor: pulls the spilled register and its kill flag out
  // of MI's operand 0. NOTE(review): the signature line is elided in this view.
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  // Main constructor. NOTE(review): the first signature line is elided in this
  // view of the file.
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    // Pick the wave-size-specific exec register and scalar opcodes.
    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }
  // Compute the lanes-per-VGPR, number of VGPRs needed for NumSubRegs, and the
  // lane mask covering the lanes used in one VGPR.
  // NOTE(review): the function signature and the local `Data` declaration are
  // elided in this view of the file.
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }
157
  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  // s_mov_b64 s[6:7], exec   ; Save exec
  // s_mov_b64 exec, 3        ; Wanted lanemask
  // buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_store_dword v0    ; Only if no free VGPR was found
  // s_not_b64 exec, exec
  // buffer_store_dword v0    ; Save inactive lanes
  // ; exec stays inverted, it is flipped back in
  // ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any number
    // of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register as actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use until
      // we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to re-use
    // the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      // NOTE(review): the instruction that copies exec into SavedExecReg is
      // elided in this view of the file.
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
      // NOTE(review): the statement guarded by the condition above is elided
      // in this view of the file.
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to save
      // and restore. FIXME: We probably would need to reserve a register for
      // this.
      if (RS->isRegUsed(AMDGPU::SCC))
        emitUnsupportedError(MF.getFunction(), *MI,
                             "unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
      // NOTE(review): the statement guarded by the condition above is elided
      // in this view of the file.
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }
243
  // Writes these instructions if an SGPR can be scavenged:
  // buffer_load_dword v1   ; Write scavenged VGPR to emergency slot
  // s_waitcnt vmcnt(0)     ; If a free VGPR was found
  // s_mov_b64 exec, s[6:7] ; Save exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  // buffer_load_dword v0   ; Restore inactive lanes
  // s_waitcnt vmcnt(0)     ; If a free VGPR was found
  // s_not_b64 exec, exec
  // buffer_load_dword v0   ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
      // NOTE(review): the .addReg(...) continuation of the builder chain above
      // is elided in this view of the file.
      // Add an implicit use of the load so it is not dead.
      // FIXME This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
      // NOTE(review): the body of this if is elided in this view of the file.
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
      // NOTE(review): the statement guarded by the condition above is elided
      // in this view of the file.
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
  // Re-point the builder at a new insertion position (block + instruction).
  // NOTE(review): the setMI(...) signature line is elided in this view of the
  // file; presumably it takes the new block and iterator — confirm upstream.
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};
325
326} // namespace llvm
327
// SIRegisterInfo constructor.
// NOTE(review): the constructor's signature line is elided in this view of the
// file; the initializer list below follows it.
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
                            ST.getAMDGPUDwarfFlavour(),
                            /*PC=*/0,
                            ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
      ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  // Sanity-check the tablegen'd lane masks: each 32-bit subreg covers two
  // lane-mask bits, which getNumCoveredRegs() relies on.
  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
           getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  // M0 and the hi halves of 16-bit VGPRs do not contribute to register
  // pressure tracking.
  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(
      static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
  for (auto Reg : AMDGPU::VGPR_16RegClass) {
    if (AMDGPU::isHi16Reg(Reg, *this))
      RegPressureIgnoredUnits.set(
          static_cast<unsigned>(*regunits(Reg).begin()));
  }

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  // Build RegSplitParts: for each subreg index of a 16-bit-multiple size,
  // record it at its position within the (up to 1024-bit) super-register.
  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 15)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  // Build SubRegFromChannelTable: (width row, channel offset) -> subreg index,
  // for DWORD-granular subregs.
  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = getSubRegIdxSize(Idx) / 32;
      unsigned Offset = getSubRegIdxOffset(Idx) / 32;
      // NOTE(review): one line is elided here in this view of the file.
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}
395
396void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
397 MCRegister Reg) const {
398 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
399 Reserved.set(*R);
400}
401
// Forced to be here by one .inc
// Return the callee-saved register list for calling convention CC.
// NOTE(review): the function signature's first line and several `case` labels
// are elided in this view of the file — each `return` below belongs to a
// calling-convention case whose label is missing.
                                             const MachineFunction *MF) const {
  switch (CC) {
  case CallingConv::C:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}
425
// No registers are saved via copy on AMDGPU.
// NOTE(review): the rest of the signature line is elided in this view.
const MCPhysReg *
  return nullptr;
}
430
// Return the call-preserved register mask for calling convention CC, or
// nullptr when nothing is preserved.
// NOTE(review): the signature's first line and several `case` labels are
// elided in this view of the file.
                                                 CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
    // Calls to these functions never return, so we can pretend everything is
    // preserved.
    return AMDGPU_AllVGPRs_RegMask;
  default:
    return nullptr;
  }
}
452
// Mask indicating no registers are preserved across a call.
// NOTE(review): the signature line is elided in this view of the file.
  return CSR_AMDGPU_NoRegs_RegMask;
}
456
// Chain functions may use v0-v7 as scratch; test for that range.
// NOTE(review): the signature line is elided in this view of the file.
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
}
460
// Map a VGPR or AGPR class to the combined AV class of the same size (and
// alignment variant) when MAI instructions are available; otherwise fall
// through. NOTE(review): the signature's first line and the final
// `return RC;`-style statement are elided in this view of the file.
                                              const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
  // equivalent AV class. If used one, the verifier will crash after
  // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
  // until Instruction selection.
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

// NOTE(review): the fallthrough return statement is elided in this view.
}
515
// Return the register used to address the frame: FP when one exists, SP
// otherwise, and no register at all at the bottom of the stack.
// NOTE(review): the signature line and the FuncInfo local's declaration are
// elided in this view of the file.
  const SIFrameLowering *TFI = ST.getFrameLowering();

  // During ISel lowering we always reserve the stack pointer in entry and chain
  // functions, but never actually want to reference it when accessing our own
  // frame. If we need a frame pointer we use it, but otherwise we can just use
  // an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isBottomOfStack()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}
530
// NOTE(review): the signature line is elided in this view of the file.
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  return shouldRealignStack(MF);
}
536
537Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
538
// Accessors for precomputed register masks.
// NOTE(review): the four signature lines are elided in this view of the file;
// each return below is the whole body of one getter.
  return AMDGPU_AllVGPRs_RegMask;
}

  return AMDGPU_AllAGPRs_RegMask;
}

  return AMDGPU_AllVectorRegs_RegMask;
}

  return AMDGPU_AllAllocatableSRegs_RegMask;
}
554
555unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
556 unsigned NumRegs) {
557 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
558 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
559 assert(NumRegIndex && "Not implemented");
560 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
561 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
562}
563
// Return the highest Align-aligned register of class RC that fits under the
// function's max-SGPR budget.
// NOTE(review): the signature's first line is elided in this view of the file.
                                                const unsigned Align,
                                                const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}
572
// Convenience wrapper: highest 4-aligned 128-bit SGPR tuple.
// NOTE(review): the signature's first line is elided in this view of the file.
                                           const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
577
// Compute the set of registers that must never be allocated for MF: special
// purpose registers, registers beyond the per-function SGPR/VGPR/AGPR budgets,
// and registers reserved by frame lowering / WWM handling.
// NOTE(review): the signature line and the MFI local's declaration are elided
// in this view of the file.
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);

  // Reserve async counters pseudo registers
  reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
  reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isSGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
          Reserved.set(Reg);
      }
    }
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the function,
  // which is detected after the function is lowered. If we aren't really going
  // to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use same reserved register introduced in D149775
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());

  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isVGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumVGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // Reserve all the AGPRs if there are no instructions to use it.
  if (!ST.hasMAIInsts())
    MaxNumAGPRs = 0;
  for (const TargetRegisterClass *RC : regclasses()) {
    if (RC->isBaseClass() && isAGPRClass(RC)) {
      unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
      for (MCPhysReg Reg : *RC) {
        unsigned Index = getHWRegIndex(Reg);
        if (Index + NumRegs > MaxNumAGPRs)
          Reserved.set(Reg);
      }
    }
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  // During wwm-regalloc, reserve the registers for perlane VGPR allocation. The
  // MFI->getNonWWMRegMask() field will have a valid bitmask only during
  // wwm-regalloc and it would be empty otherwise.
  BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
  if (!NonWWMRegMask.empty()) {
    for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
         RegI < RegE; ++RegI) {
      if (NonWWMRegMask.test(RegI))
        reserveRegisterTuples(Reserved, RegI);
    }
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}
748
// A physical register may be clobbered (e.g. by inline asm) iff it is not in
// the reserved set. NOTE(review): the signature's first line is elided in this
// view of the file.
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}
753
// NOTE(review): the signature line and the Info local's declaration, plus the
// final delegating return, are elided in this view of the file.
  // On entry or in chain functions, the base address is 0, so it can't possibly
  // need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

}
766
// Scavenging is needed in entry functions only when there are stack objects or
// calls; everywhere else assume it may be needed for CSRs.
// NOTE(review): the signature line and the Info local's declaration are elided
// in this view of the file.
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}
777
// NOTE(review): the signature's first line is elided in this view of the file.
                                                  const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
  // spill.
  return false;
}
786
// Scavenging during frame-index replacement is needed only when the function
// actually has stack objects.
// NOTE(review): the signature's first line is elided in this view of the file.
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}
792
// NOTE(review): the signature's first line is elided in this view of the file.
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}
798
// Read the immediate offset operand of a scratch access instruction.
// NOTE(review): the signature line(s) are elided in this view of the file.

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}
806
// Return the constant offset paired with the frame-index operand at Idx.
// For the V_ADD forms the offset is the *other* source operand if it is an
// immediate. NOTE(review): the signature's first line, a guard condition
// before `return 0;`, and the final return are elided in this view of the
// file.
                                                   int Idx) const {
  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e32: {
    int OtherIdx = Idx == 1 ? 2 : 1;
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
    return OtherOp.isImm() ? OtherOp.getImm() : 0;
  }
  case AMDGPU::V_ADD_CO_U32_e64: {
    int OtherIdx = Idx == 2 ? 3 : 2;
    const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
    return OtherOp.isImm() ? OtherOp.getImm() : 0;
  }
  default:
    break;
  }

  // NOTE(review): guarded by a condition elided in this view of the file.
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
         (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  // NOTE(review): the final return statement is elided in this view.
}
837
// Return true if MI is an add of a frame index plus either an immediate or a
// VGPR — the operand shapes the FI-folding below can legalize.
// NOTE(review): the signature's first line is elided in this view of the file.
                              const MachineInstr &MI) {
  assert(MI.getDesc().isAdd());
  const MachineOperand &Src0 = MI.getOperand(1);
  const MachineOperand &Src1 = MI.getOperand(2);

  if (Src0.isFI()) {
    return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src1.getReg()));
  }

  if (Src1.isFI()) {
    return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
                                                       Src0.getReg()));
  }

  return false;
}
856
// Decide whether frame-index elimination for MI would benefit from a
// materialized frame base register.
// NOTE(review): the signature line and a few interior lines (a guard before
// `return false;`, the MUBUF condition, and the second argument of
// isLegalFLATOffset) are elided in this view of the file.
  // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32: {
    // TODO: We could handle this but it requires work to avoid violating
    // operand restrictions.
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
      return false;
    [[fallthrough]];
  }
  case AMDGPU::V_ADD_U32_e64:
    // FIXME: This optimization is barely profitable hasFlatScratchEnabled
    // as-is.
    //
    // Much of the benefit with the MUBUF handling is we avoid duplicating the
    // shift of the frame register, which isn't needed with scratch.
    //
    // materializeFrameBaseRegister doesn't know the register classes of the
    // uses, and unconditionally uses an s_add_i32, which will end up using a
    // copy for the vector uses.
    return !ST.hasFlatScratchEnabled();
  case AMDGPU::V_ADD_CO_U32_e32:
    if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
        !isFIPlusImmOrVGPR(*this, *MI))
      return false;
    // We can't deal with the case where the carry out has a use (though this
    // should never happen)
    return MI->getOperand(3).isDead();
  case AMDGPU::V_ADD_CO_U32_e64:
    // TODO: Should we check use_empty instead?
    return MI->getOperand(1).isDead();
  default:
    break;
  }

  // NOTE(review): guarded by a condition elided in this view of the file.
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  // NOTE(review): the MUBUF-vs-FLAT condition line is elided in this view.
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
}
905
// Materialize a virtual register holding FrameIdx (+ Offset) at the start of
// MBB, using scalar registers/adds with flat scratch and VGPRs otherwise.
// NOTE(review): the signature's first line is elided in this view of the file.
                                                       int FrameIdx,
                                                       int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc =
      ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                                 : &AMDGPU::VGPR_32RegClass);

  // With no offset a single move of the frame index suffices.
  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
                                                 ? &AMDGPU::SReg_32_XM0RegClass
                                                 : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.hasFlatScratchEnabled()) {
    // FIXME: Make sure scc isn't live in.
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg)
        .setOperandDead(3); // scc
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}
958
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Fast path: when MI is one of the add pseudos that materialized the frame
  // address, fold BaseReg (and the accumulated immediate) directly into it.
  switch (MI.getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32: {
    // VOP2 form: the frame index may sit in either source slot, so
    // canonicalize FIOp to the operand that actually holds it.
    MachineOperand *FIOp = &MI.getOperand(2);
    MachineOperand *ImmOp = &MI.getOperand(1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      // No immediate to merge Offset into; only a plain base replacement is
      // possible, which is why a non-zero Offset is not expected here.
      assert(Offset == 0);
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
      return;
    }

    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      // base + 0 degenerates to a plain COPY of the base register; strip the
      // extra operands down to (dst, src).
      MI.setDesc(TII->get(AMDGPU::COPY));
      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
      return;
    }

    ImmOp->setImm(TotalOffset);

    MachineBasicBlock *MBB = MI.getParent();
    MachineFunction *MF = MBB->getParent();
    MachineRegisterInfo &MRI = MF->getRegInfo();

    // FIXME: materializeFrameBaseRegister does not know the register class of
    // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
    // Emit a copy so we have a legal operand and hope the register coalescer
    // can clean it up.
    if (isSGPRReg(MRI, BaseReg)) {
      Register BaseRegVGPR =
          MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
          .addReg(BaseReg);
      MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
    } else {
      MI.getOperand(2).ChangeToRegister(BaseReg, false);
    }
    return;
  }
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64: {
    // VOP3 form: source operands follow the explicit defs (the CO variant has
    // an extra carry-out def), so locate src0 dynamically.
    int Src0Idx = MI.getNumExplicitDefs();
    MachineOperand *FIOp = &MI.getOperand(Src0Idx);
    MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
    if (!FIOp->isFI())
      std::swap(FIOp, ImmOp);

    if (!ImmOp->isImm()) {
      FIOp->ChangeToRegister(BaseReg, false);
      TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
      return;
    }

    int64_t TotalOffset = ImmOp->getImm() + Offset;
    if (TotalOffset == 0) {
      // base + 0 degenerates to a plain COPY, as in the VOP2 case above.
      MI.setDesc(TII->get(AMDGPU::COPY));

      for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
        MI.removeOperand(I);

      MI.getOperand(1).ChangeToRegister(BaseReg, false);
    } else {
      FIOp->ChangeToRegister(BaseReg, false);
      ImmOp->setImm(TotalOffset);
    }

    return;
  }
  default:
    break;
  }

  // Otherwise MI must be a scratch memory access (asserted below): rewrite its
  // address register operand and immediate offset in place.
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  // Flat-scratch carries the base in saddr; MUBUF carries it in vaddr.
  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  // MUBUF scratch accesses rewritten here are expected to have a zero soffset.
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}
1086
                                        Register BaseReg,
                                        int64_t Offset) const {

  // Fast answers for the add pseudos that materialize frame addresses;
  // everything else falls through to the scratch-access offset checks below.
  switch (MI->getOpcode()) {
  case AMDGPU::V_ADD_U32_e32:
  case AMDGPU::V_ADD_CO_U32_e32:
    return true;
  case AMDGPU::V_ADD_U32_e64:
  case AMDGPU::V_ADD_CO_U32_e64:
    // The VOP3 form needs either VOP3-literal support or an offset small
    // enough to be an inline constant.
    return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
  default:
    break;
  }

    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
}
1114
/// \returns the register class used for pointer values: always VGPR_32 here;
/// the note below explains why that blanket answer is acceptable.
const TargetRegisterClass *
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
1122
/// Substitute a 32-bit SGPR class when queried with the SCC class; any other
/// class is passed through unchanged.
const TargetRegisterClass *
  return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
}
1127
                                        const SIInstrInfo *TII) {

  // Number of 32-bit slots (DWORDs) moved by a spill pseudo: the regular
  // SI_SPILL_* pseudos return the register width encoded in the opcode name
  // divided by 32, while block spills count the set bits of their lane mask.
  unsigned Op = MI.getOpcode();
  switch (Op) {
  case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
  case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
    // FIXME: This assumes the mask is statically known and not computed at
    // runtime. However, some ABIs may want to compute the mask dynamically and
    // this will need to be updated.
    return llvm::popcount(
        (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
  // 16-bit spills still occupy one full 32-bit slot.
  case AMDGPU::SI_SPILL_V16_SAVE:
  case AMDGPU::SI_SPILL_V16_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
1275
1276static int getOffsetMUBUFStore(unsigned Opc) {
1277 switch (Opc) {
1278 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1279 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1280 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1281 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1282 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1283 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1284 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1285 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1286 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1287 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1288 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1289 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1290 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1291 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1292 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1293 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1294 default:
1295 return -1;
1296 }
1297}
1298
1299static int getOffsetMUBUFLoad(unsigned Opc) {
1300 switch (Opc) {
1301 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1302 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1303 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1304 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1305 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1306 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1307 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1308 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1309 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1310 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1311 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1312 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1313 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1314 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1315 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1316 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1317 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1318 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1319 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1320 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1321 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1322 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1323 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1324 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1325 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1326 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1327 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1328 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1329 default:
1330 return -1;
1331 }
1332}
1333
1334static int getOffenMUBUFStore(unsigned Opc) {
1335 switch (Opc) {
1336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1337 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1338 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1339 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1340 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1341 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1342 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1343 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1344 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1345 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1346 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1347 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1348 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1349 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1350 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1351 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1352 default:
1353 return -1;
1354 }
1355}
1356
1357static int getOffenMUBUFLoad(unsigned Opc) {
1358 switch (Opc) {
1359 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1360 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1361 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1362 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1363 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1364 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1365 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1366 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1367 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1368 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1369 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1370 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1371 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1372 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1373 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1374 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1375 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1376 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1377 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1378 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1379 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1380 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1381 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1382 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1383 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1384 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1385 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1386 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1387 default:
1388 return -1;
1389 }
1390}
1391
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Look up the register lane assigned to this spill slot/lane pair; no
  // register means the slot has no A/VGPR assignment and the caller falls
  // back to a memory spill.
  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  // Direction of the transfer: a store moves ValueReg into the spill lane,
  // a restore moves the spill lane back into ValueReg.
  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  const DebugLoc &DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // Spiller during regalloc may restore a spilled register to its superclass.
    // It could result in AGPR spills restored to VGPRs or the other way around,
    // making the src and dst with identical regclasses at this point. It just
    // needs a copy in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    return CopyMIB;
  }
  // Cross AGPR<->VGPR transfer: select write-into-AGPR vs read-out-of-AGPR.
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  return MIB;
}
1432
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
// \returns true if the access was rewritten (either into an AGPR transfer or
// an immediate-offset MUBUF access), false if no OFFSET-form opcode exists.
                                          MachineFrameInfo &MFI,
                                          int Index,
                                          int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  // Switch the VGPR-addressed OFFEN form to its immediate OFFSET counterpart.
  int LoadStoreOp = IsStore ?
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  // Prefer a direct A/VGPR transfer if a lane was assigned to this slot.
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  // Tied input data operand (e.g. for D16-hi stores) must be carried over.
  const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
                                                       AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}
1471
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  // ST form: neither a VGPR nor an SGPR address operand is present.
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  // Handle block load/store first.
  if (TII->isBlockLoadStore(LoadStoreOp))
    return LoadStoreOp;

  // Pick the SGPR-addressed (SADDR) scratch opcode for the number of bytes
  // transferred per instruction.
  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  // Rewrite to the SV (VGPR-addressed) or ST variant when the original
  // opcode's operand shape calls for it.
  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}
1512
1515 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1516 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1517 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1518 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1519
1520 MachineFunction *MF = MBB.getParent();
1521 const SIInstrInfo *TII = ST.getInstrInfo();
1522 const MachineFrameInfo &MFI = MF->getFrameInfo();
1523 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1524
1525 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1526 bool IsStore = Desc->mayStore();
1527 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1528 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1529
1530 bool CanClobberSCC = false;
1531 bool Scavenged = false;
1532 MCRegister SOffset = ScratchOffsetReg;
1533
1534 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1535 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1536 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1537 unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1538
1539 // On targets with register tuple alignment requirements,
1540 // for unaligned tuples, spill the first sub-reg as a 32-bit spill,
1541 // and spill the rest as a regular aligned tuple.
1542 // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7
1543 // will be spilt as:
1544 // SPILL_SCRATCH_DWORD $vgpr1
1545 // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5
1546 // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7
1547 bool IsRegMisaligned = false;
1548 if (!IsBlock && !IsAGPR && RegWidth > 4) {
1549 unsigned SpillOpcode =
1550 getFlatScratchSpillOpcode(TII, LoadStoreOp, std::min(RegWidth, 16u));
1551 int VDataIdx =
1552 IsStore ? AMDGPU::getNamedOperandIdx(SpillOpcode, AMDGPU::OpName::vdata)
1553 : 0; // Restore Ops have data reg as the first (output) operand.
1554 const TargetRegisterClass *ExpectedRC =
1555 TII->getRegClass(TII->get(SpillOpcode), VDataIdx);
1556 if (!ExpectedRC->contains(ValueReg)) {
1557 unsigned NumRegs = std::min(AMDGPU::getRegBitWidth(*ExpectedRC) / 4, 4u);
1558 unsigned SubIdx = getSubRegFromChannel(0, NumRegs);
1559 const TargetRegisterClass *MatchRC =
1560 getMatchingSuperRegClass(RC, ExpectedRC, SubIdx);
1561 if (!MatchRC || !MatchRC->contains(ValueReg))
1562 IsRegMisaligned = true;
1563 }
1564 }
1565 // The first sub-register will be spilled as a 32-bit value
1566 if (IsRegMisaligned)
1567 RegWidth -= 4u;
1568 // Always use 4 byte operations for AGPRs because we need to scavenge
1569 // a temporary VGPR.
1570 // If we're using a block operation, the element should be the whole block.
1571 unsigned EltSize = IsBlock ? RegWidth
1572 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1573 : 4u;
1574 unsigned NumSubRegs = RegWidth / EltSize;
1575 unsigned Size = NumSubRegs * EltSize;
1576 unsigned RemSize = RegWidth - Size;
1577 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1578 // An additional sub-register is needed to spill the misaligned component.
1579 if (IsRegMisaligned)
1580 NumSubRegs += 1;
1581 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1582 int64_t MaterializedOffset = Offset;
1583
1584 // Maxoffset is the starting offset for the last chunk to be spilled.
1585 // In case of non-zero remainder element, max offset will be the
1586 // last address(offset + Size) after spilling all the EltSize chunks.
1587 int64_t MaxOffset = Offset + Size - (RemSize ? 0 : EltSize);
1588 int64_t ScratchOffsetRegDelta = 0;
1589
1590 if (IsFlat && EltSize > 4) {
1591 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1592 Desc = &TII->get(LoadStoreOp);
1593 }
1594
1595 Align Alignment = MFI.getObjectAlign(Index);
1596 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1597
1598 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1599 "unexpected VGPR spill offset");
1600
1601 // Track a VGPR to use for a constant offset we need to materialize.
1602 Register TmpOffsetVGPR;
1603
1604 // Track a VGPR to use as an intermediate value.
1605 Register TmpIntermediateVGPR;
1606 bool UseVGPROffset = false;
1607
1608 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1609 // combination.
1610 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1611 int64_t VOffset) {
1612 // We are using a VGPR offset
1613 if (IsFlat && SGPRBase) {
1614 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1615 // SGPR, so perform the add as vector.
1616 // We don't need a base SGPR in the kernel.
1617
1618 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1619 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1620 .addReg(SGPRBase)
1621 .addImm(VOffset)
1622 .addImm(0); // clamp
1623 } else {
1624 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1625 .addReg(SGPRBase);
1626 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1627 .addImm(VOffset)
1628 .addReg(TmpOffsetVGPR);
1629 }
1630 } else {
1631 assert(TmpOffsetVGPR);
1632 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1633 .addImm(VOffset);
1634 }
1635 };
1636
1637 bool IsOffsetLegal =
1638 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1640 : TII->isLegalMUBUFImmOffset(MaxOffset);
1641 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1642 SOffset = MCRegister();
1643
1644 // We don't have access to the register scavenger if this function is called
1645 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1646 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1647 // entry.
1648 if (RS) {
1649 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1650
1651 // Piggy back on the liveness scan we just did see if SCC is dead.
1652 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1653 } else if (LiveUnits) {
1654 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1655 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1656 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1657 SOffset = Reg;
1658 break;
1659 }
1660 }
1661 }
1662
1663 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1664 SOffset = Register();
1665
1666 if (!SOffset) {
1667 UseVGPROffset = true;
1668
1669 if (RS) {
1670 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1671 } else {
1672 assert(LiveUnits);
1673 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1674 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1675 TmpOffsetVGPR = Reg;
1676 break;
1677 }
1678 }
1679 }
1680
1681 assert(TmpOffsetVGPR);
1682 } else if (!SOffset && CanClobberSCC) {
1683 // There are no free SGPRs, and since we are in the process of spilling
1684 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1685 // on SI/CI and on VI it is true until we implement spilling using scalar
1686 // stores), we have no way to free up an SGPR. Our solution here is to
1687 // add the offset directly to the ScratchOffset or StackPtrOffset
1688 // register, and then subtract the offset after the spill to return the
1689 // register to it's original value.
1690
1691 // TODO: If we don't have to do an emergency stack slot spill, converting
1692 // to use the VGPR offset is fewer instructions.
1693 if (!ScratchOffsetReg)
1694 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1695 SOffset = ScratchOffsetReg;
1696 ScratchOffsetRegDelta = Offset;
1697 } else {
1698 Scavenged = true;
1699 }
1700
1701 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1702 // we can simplify the adjustment of Offset here to just scale with
1703 // WavefrontSize.
1704 if (!IsFlat && !UseVGPROffset)
1705 Offset *= ST.getWavefrontSize();
1706
1707 if (!UseVGPROffset && !SOffset)
1708 report_fatal_error("could not scavenge SGPR to spill in entry function");
1709
1710 if (UseVGPROffset) {
1711 // We are using a VGPR offset
1712 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1713 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1714 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1715 } else {
1716 assert(Offset != 0);
1717 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1718 .addReg(ScratchOffsetReg)
1719 .addImm(Offset);
1720 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1721 }
1722
1723 Offset = 0;
1724 }
1725
1726 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1727 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1728 && "Unexpected vaddr for flat scratch with a FI operand");
1729
1730 if (UseVGPROffset) {
1731 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1732 } else {
1733 assert(ST.hasFlatScratchSTMode());
1734 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1735 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1736 }
1737
1738 Desc = &TII->get(LoadStoreOp);
1739 }
1740
1741 // Save a copy of the original element size before its potentially changed for
1742 // misaligned tuples.
1743 unsigned OrigEltSize = EltSize;
1744 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1745 ++i, RegOffset += EltSize) {
1746 if (IsRegMisaligned) {
1747 if (i == 0) {
1748 // For misaligned register tuples, spill only the first sub-reg in the
1749 // first iteration.
1750 EltSize = 4u;
1751 } else {
1752 // The misaligned register was spilt. Now the rest of the tuple is
1753 // properly aligned.
1754 IsRegMisaligned = false;
1755 EltSize = OrigEltSize;
1756 }
1757 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1758 }
1759 if (i == NumSubRegs) {
1760 EltSize = RemSize;
1761 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1762 }
1763 Desc = &TII->get(LoadStoreOp);
1764
1765 if (!IsFlat && UseVGPROffset) {
1766 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1767 : getOffenMUBUFLoad(LoadStoreOp);
1768 Desc = &TII->get(NewLoadStoreOp);
1769 }
1770
1771 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1772 // If we are spilling an AGPR beyond the range of the memory instruction
1773 // offset and need to use a VGPR offset, we ideally have at least 2
1774 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1775 // recycle the VGPR used for the offset which requires resetting after
1776 // each subregister.
1777
1778 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1779 }
1780
1781 unsigned NumRegs = EltSize / 4;
1782 Register SubReg = e == 1
1783 ? ValueReg
1784 : Register(getSubReg(ValueReg,
1785 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1786
1787 RegState SOffsetRegState = {};
1788 RegState SrcDstRegState = getDefRegState(!IsStore);
1789 const bool IsLastSubReg = i + 1 == e;
1790 const bool IsFirstSubReg = i == 0;
1791 if (IsLastSubReg) {
1792 SOffsetRegState |= getKillRegState(Scavenged);
1793 // The last implicit use carries the "Kill" flag.
1794 SrcDstRegState |= getKillRegState(IsKill);
1795 }
1796
1797 // Make sure the whole register is defined if there are undef components by
1798 // adding an implicit def of the super-reg on the first instruction.
1799 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1800 bool NeedSuperRegImpOperand = e > 1;
1801
1802 // Remaining element size to spill into memory after some parts of it
1803 // spilled into either AGPRs or VGPRs.
1804 unsigned RemEltSize = EltSize;
1805
1806 // AGPRs to spill VGPRs and vice versa are allocated in a reverse order,
1807 // starting from the last lane. In case if a register cannot be completely
1808 // spilled into another register that will ensure its alignment does not
1809 // change. For targets with VGPR alignment requirement this is important
1810 // in case of flat scratch usage as we might get a scratch_load or
1811 // scratch_store of an unaligned register otherwise.
1812 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1813 LaneE = RegOffset / 4;
1814 Lane >= LaneE; --Lane) {
1815 bool IsSubReg = e > 1 || EltSize > 4;
1816 Register Sub = IsSubReg
1817 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1818 : ValueReg;
1819 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1820 if (!MIB.getInstr())
1821 break;
1822 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1823 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1824 NeedSuperRegDef = false;
1825 }
1826 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1827 NeedSuperRegImpOperand = true;
1828 RegState State = SrcDstRegState;
1829 if (!IsLastSubReg || (Lane != LaneE))
1830 State &= ~RegState::Kill;
1831 if (!IsFirstSubReg || (Lane != LaneS))
1832 State &= ~RegState::Define;
1833 MIB.addReg(ValueReg, RegState::Implicit | State);
1834 }
1835 RemEltSize -= 4;
1836 }
1837
1838 if (!RemEltSize) // Fully spilled into AGPRs.
1839 continue;
1840
1841 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1842 assert(IsFlat && EltSize > 4);
1843
1844 unsigned NumRegs = RemEltSize / 4;
1845 SubReg = Register(getSubReg(ValueReg,
1846 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1847 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1848 Desc = &TII->get(Opc);
1849 }
1850
1851 unsigned FinalReg = SubReg;
1852
1853 if (IsAGPR) {
1854 assert(EltSize == 4);
1855
1856 if (!TmpIntermediateVGPR) {
1857 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1858 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1859 }
1860 if (IsStore) {
1861 auto AccRead = BuildMI(MBB, MI, DL,
1862 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1863 TmpIntermediateVGPR)
1864 .addReg(SubReg, getKillRegState(IsKill));
1865 if (NeedSuperRegDef)
1866 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1867 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1868 AccRead.addReg(ValueReg, RegState::Implicit);
1870 }
1871 SubReg = TmpIntermediateVGPR;
1872 } else if (UseVGPROffset) {
1873 if (!TmpOffsetVGPR) {
1874 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1875 MI, false, 0);
1876 RS->setRegUsed(TmpOffsetVGPR);
1877 }
1878 }
1879
1880 Register FinalValueReg = ValueReg;
1881 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1882 // If we are loading 16-bit value with SRAMECC endabled we need a temp
1883 // 32-bit VGPR to load and extract 16-bits into the final register.
1884 ValueReg =
1885 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1886 SubReg = ValueReg;
1887 IsKill = false;
1888 }
1889
1890 // Create the MMO, additional set the NonVolatile flag as scratch memory
1891 // used for spills will not be used outside the thread.
1892 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1894 PInfo, MMO->getFlags() | MOThreadPrivate, RemEltSize,
1895 commonAlignment(Alignment, RegOffset));
1896
1897 auto MIB =
1898 BuildMI(MBB, MI, DL, *Desc)
1899 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1900
1901 if (UseVGPROffset) {
1902 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1903 // intermediate accvgpr_write.
1904 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1905 }
1906
1907 if (!IsFlat)
1908 MIB.addReg(FuncInfo->getScratchRSrcReg());
1909
1910 if (SOffset == AMDGPU::NoRegister) {
1911 if (!IsFlat) {
1912 if (UseVGPROffset && ScratchOffsetReg) {
1913 MIB.addReg(ScratchOffsetReg);
1914 } else {
1915 assert(FuncInfo->isBottomOfStack());
1916 MIB.addImm(0);
1917 }
1918 }
1919 } else {
1920 MIB.addReg(SOffset, SOffsetRegState);
1921 }
1922
1923 MIB.addImm(Offset + RegOffset);
1924
1925 bool LastUse = MMO->getFlags() & MOLastUse;
1926 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1927
1928 if (!IsFlat)
1929 MIB.addImm(0); // swz
1930 MIB.addMemOperand(NewMMO);
1931
1932 if (FinalValueReg != ValueReg) {
1933 // Extract 16-bit from the loaded 32-bit value.
1934 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1935 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1936 .addReg(FinalValueReg, getDefRegState(true))
1937 .addImm(0)
1938 .addReg(ValueReg, getKillRegState(true))
1939 .addImm(0);
1940 ValueReg = FinalValueReg;
1941 }
1942
1943 if (!IsAGPR && NeedSuperRegDef)
1944 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1945
1946 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1947 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1948 FinalReg)
1949 .addReg(TmpIntermediateVGPR, RegState::Kill);
1951 }
1952
1953 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
1954 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1955 if (NeedSuperRegImpOperand &&
1956 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1957 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1958 if (PartialReloadCopy)
1959 MIB.addReg(ValueReg, RegState::Implicit);
1960 }
1961
1962 // The epilog restore of a wwm-scratch register can cause undesired
1963 // optimization during machine-cp post PrologEpilogInserter if the same
1964 // register was assigned for return value ABI lowering with a COPY
1965 // instruction. As given below, with the epilog reload, the earlier COPY
1966 // appeared to be dead during machine-cp.
1967 // ...
1968 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1969 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1970 // ...
1971 // Epilog block:
1972 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1973 // ...
1974 // WWM spill restore to preserve the inactive lanes of v0.
1975 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1976 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1977 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1978 // ...
1979 // SI_RETURN implicit $vgpr0
1980 // ...
1981 // To fix it, mark the same reg as a tied op for such restore instructions
1982 // so that it marks a usage for the preceding COPY.
1983 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1984 MI->readsRegister(SubReg, this)) {
1985 MIB.addReg(SubReg, RegState::Implicit);
1986 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1987 }
1988
1989 // If we're building a block load, we should add artificial uses for the
1990 // CSR VGPRs that are *not* being transferred. This is because liveness
1991 // analysis is not aware of the mask, so we need to somehow inform it that
1992 // those registers are not available before the load and they should not be
1993 // scavenged.
1994 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1995 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1996 }
1997
1998 if (ScratchOffsetRegDelta != 0) {
1999 // Subtract the offset we added to the ScratchOffset register.
2000 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
2001 .addReg(SOffset)
2002 .addImm(-ScratchOffsetRegDelta);
2003 }
2004}
2005
// Add artificial implicit uses for the callee-saved VGPRs of a 32-register
// block that are NOT transferred by a masked block load (their bit in the
// FuncInfo mask is clear). Liveness analysis cannot see the transfer mask,
// so these operands tell it those registers remain live across the load and
// must not be scavenged.
// NOTE(review): the first signature line (the function name and the
// MachineInstrBuilder parameter "MIB", doc line 2006) is missing from this
// excerpt of the file.
2007 Register BlockReg) const {
2008 const MachineFunction *MF = MIB->getMF();
2009 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
// A set bit at position i means BaseVGPR+i is actually transferred by the
// block operation, so it needs no artificial use.
2010 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
2011 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
// Start at 1: offset 0 is the block's base register (sub0) itself.
2012 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
2013 if (!(Mask & (1 << RegOffset)) &&
2014 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
2015 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
2016}
2017
// Emit a single-dword load or store of SB.TmpVGPR to/from the stack slot
// `Index` at lane offset `Offset` (scaled by SB.EltSize). Used by the SGPR
// spill path to move the temporary VGPR holding packed SGPR lanes to or
// from scratch memory. On the store path this also records one spilled VGPR
// in the function-info statistics.
// NOTE(review): the first signature line (doc line 2018) and the creation
// of the MachineMemOperand `MMO` (doc lines 2031-2033) are missing from
// this excerpt of the file.
2019 int Offset, bool IsLoad,
2020 bool IsKill) const {
2021 // Load/store VGPR
2022 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
// SGPR-spill stack IDs are handled by the VGPR-lane path, never here.
2023 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
2024
// Fixed objects are addressed off the base pointer when one exists;
// everything else is addressed off the frame register.
2025 Register FrameReg =
2026 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
2027 ? getBaseRegister()
2028 : getFrameRegister(SB.MF);
2029
2030 Align Alignment = FrameInfo.getObjectAlign(Index);
2034 SB.EltSize, Alignment);
2035
// Pick the flat-scratch or MUBUF opcode depending on subtarget support.
2036 if (IsLoad) {
2037 unsigned Opc = ST.hasFlatScratchEnabled()
2038 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2039 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2040 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
2041 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2042 } else {
2043 unsigned Opc = ST.hasFlatScratchEnabled()
2044 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2045 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2046 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
2047 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
2048 // This only ever adds one VGPR spill
2049 SB.MFI.addToSpilledVGPRs(1);
2050 }
2051}
2052
// Spill the SGPR or SGPR tuple saved by the SI_SPILL_S*_SAVE pseudo `MI`
// for frame index `Index`.
//
// Preferred path: write each 32-bit sub-register into a pre-allocated VGPR
// lane via SI_SPILL_S32_TO_VGPR. Fallback path: pack sub-registers into
// SB.TmpVGPR and store that VGPR to scratch memory, one VGPR-full of lanes
// at a time. Returns false only when OnlyToVGPR is set but no VGPR lanes
// were allocated for this index; otherwise erases `MI` and returns true.
//
// NOTE(review): the first signature line (doc line 2053) and interior doc
// lines 2064, 2115, 2176 and 2179 are missing from this excerpt of the
// file; each gap is marked where it occurs.
2054 RegScavenger *RS, SlotIndexes *Indexes,
2055 LiveIntervals *LIS, bool OnlyToVGPR,
2056 bool SpillToPhysVGPRLane) const {
2057 assert(!MI->getOperand(0).isUndef() &&
2058 "undef spill should have been deleted earlier");
2059
2060 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2061
2062 ArrayRef<SpilledReg> VGPRSpills =
2063 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
// NOTE(review): the else-arm of this ternary (doc line 2064) is missing
// from this excerpt.
2065 bool SpillToVGPR = !VGPRSpills.empty();
2066 if (OnlyToVGPR && !SpillToVGPR)
2067 return false;
2068
// The memory fallback must never be taken for the stack/frame pointer
// registers themselves.
2069 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2070 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2071
2072 if (SpillToVGPR) {
2073
2074 // Since stack slot coloring pass is trying to optimize SGPR spills,
2075 // VGPR lanes (mapped from spill stack slot) may be shared for SGPR
2076 // spills of different sizes. This accounts for number of VGPR lanes allotted
2077 // equal to the largest SGPR being spilled in them.
2078 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2079 "Num of SGPRs spilled should be less than or equal to num of "
2080 "the VGPR lanes.");
2081
2082 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2083 Register SubReg =
2084 SB.NumSubRegs == 1
2085 ? SB.SuperReg
2086 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2087 SpilledReg Spill = VGPRSpills[i];
2088
2089 bool IsFirstSubreg = i == 0;
2090 bool IsLastSubreg = i == SB.NumSubRegs - 1;
// Only the last sub-register write may kill the source super-register.
2091 bool UseKill = SB.IsKill && IsLastSubreg;
2092
2093
2094 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2095 // spill to this specific vgpr in the first basic block.
2096 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2097 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2098 .addReg(SubReg, getKillRegState(UseKill))
2099 .addImm(Spill.Lane)
2100 .addReg(Spill.VGPR);
// Keep SlotIndexes coherent: the first emitted instruction takes over
// MI's slot; subsequent ones get fresh slots.
2101 if (Indexes) {
2102 if (IsFirstSubreg)
2103 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2104 else
2105 Indexes->insertMachineInstrInMaps(*MIB);
2106 }
2107
2108 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2109 // We may be spilling a super-register which is only partially defined,
2110 // and need to ensure later spills think the value is defined.
2111 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2112 }
2113
2114 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
// NOTE(review): the statement guarded by this `if` (doc line 2115,
// presumably an implicit operand added to MIB) is missing from this
// excerpt.
2116
2117 // FIXME: Since this spills to another register instead of an actual
2118 // frame index, we should delete the frame index when all references to
2119 // it are fixed.
2120 }
2121 } else {
// Memory fallback: set up the temporary VGPR and scratch access.
2122 SB.prepare();
2123
2124 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2125 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2126
2127 // Per VGPR helper data
2128 auto PVD = SB.getPerVGPRData();
2129
2130 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first writelane into each temp VGPR may treat its previous
// value as undef.
2131 RegState TmpVGPRFlags = RegState::Undef;
2132
2133 // Write sub registers into the VGPR
2134 for (unsigned i = Offset * PVD.PerVGPR,
2135 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2136 i < e; ++i) {
2137 Register SubReg =
2138 SB.NumSubRegs == 1
2139 ? SB.SuperReg
2140 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2141
2142 MachineInstrBuilder WriteLane =
2143 BuildMI(*SB.MBB, MI, SB.DL,
2144 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2145 .addReg(SubReg, SubKillState)
2146 .addImm(i % PVD.PerVGPR)
2147 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2148 TmpVGPRFlags = {};
2149
2150 if (Indexes) {
2151 if (i == 0)
2152 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2153 else
2154 Indexes->insertMachineInstrInMaps(*WriteLane);
2155 }
2156
2157 // There could be undef components of a spilled super register.
2158 // TODO: Can we detect this and skip the spill?
2159 if (SB.NumSubRegs > 1) {
2160 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2161 RegState SuperKillState = {};
2162 if (i + 1 == SB.NumSubRegs)
2163 SuperKillState |= getKillRegState(SB.IsKill);
2164 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2165 }
2166 }
2167
2168 // Write out VGPR
2169 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2170 }
2171
2172 SB.restore();
2173 }
2174
2175 MI->eraseFromParent();
// NOTE(review): doc line 2176 is missing from this excerpt.
2177
2178 if (LIS)
// NOTE(review): the statement guarded by this `if` (doc line 2179,
// presumably a LiveIntervals update for the spilled register) is missing
// from this excerpt.
2180
2181 return true;
2182}
2183
// Restore the SGPR or SGPR tuple for the SI_SPILL_S*_RESTORE pseudo `MI`
// from frame index `Index`: read each 32-bit sub-register back from its
// VGPR lane (SI_RESTORE_S32_FROM_VGPR), or, when no lanes were allocated,
// load SB.TmpVGPR from scratch memory and unpack its lanes. Returns false
// only when OnlyToVGPR is set but no VGPR lanes exist; otherwise erases
// `MI` and returns true.
// NOTE(review): the first signature line (doc line 2184) and interior doc
// lines 2192, 2210, 2243 and 2259 are missing from this excerpt of the
// file; each gap is marked where it occurs.
2185 RegScavenger *RS, SlotIndexes *Indexes,
2186 LiveIntervals *LIS, bool OnlyToVGPR,
2187 bool SpillToPhysVGPRLane) const {
2188 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2189
2190 ArrayRef<SpilledReg> VGPRSpills =
2191 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
// NOTE(review): the else-arm of this ternary (doc line 2192) is missing
// from this excerpt.
2193 bool SpillToVGPR = !VGPRSpills.empty();
2194 if (OnlyToVGPR && !SpillToVGPR)
2195 return false;
2196
2197 if (SpillToVGPR) {
2198 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2199 Register SubReg =
2200 SB.NumSubRegs == 1
2201 ? SB.SuperReg
2202 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2203
2204 SpilledReg Spill = VGPRSpills[i];
2205 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2206 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2207 .addReg(Spill.VGPR)
2208 .addImm(Spill.Lane);
2209 if (SB.NumSubRegs > 1 && i == 0)
// NOTE(review): the statement guarded by this `if` (doc line 2210,
// presumably an implicit-define of SB.SuperReg) is missing from this
// excerpt.
2211 if (Indexes) {
// The last emitted restore takes over MI's slot index; earlier ones get
// fresh slots.
2212 if (i == e - 1)
2213 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2214 else
2215 Indexes->insertMachineInstrInMaps(*MIB);
2216 }
2217 }
2218 } else {
// Memory fallback: set up the temporary VGPR and scratch access.
2219 SB.prepare();
2220
2221 // Per VGPR helper data
2222 auto PVD = SB.getPerVGPRData();
2223
2224 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2225 // Load in VGPR data
2226 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2227
2228 // Unpack lanes
2229 for (unsigned i = Offset * PVD.PerVGPR,
2230 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2231 i < e; ++i) {
2232 Register SubReg =
2233 SB.NumSubRegs == 1
2234 ? SB.SuperReg
2235 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2236
// The final lane read of this temp VGPR kills it.
2237 bool LastSubReg = (i + 1 == e);
2238 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2239 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2240 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2241 .addImm(i);
2242 if (SB.NumSubRegs > 1 && i == 0)
// NOTE(review): the statement guarded by this `if` (doc line 2243) is
// missing from this excerpt.
2244 if (Indexes) {
2245 if (i == e - 1)
2246 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2247 else
2248 Indexes->insertMachineInstrInMaps(*MIB);
2249 }
2250 }
2251 }
2252
2253 SB.restore();
2254 }
2255
2256 MI->eraseFromParent();
2257
2258 if (LIS)
// NOTE(review): the statement guarded by this `if` (doc line 2259) is
// missing from this excerpt.
2260
2261 return true;
2262}
2263
// Emergency SGPR spill: write `SGPR` into lanes of SB.TmpVGPR at `MI` and
// read it back at the end of `RestoreMBB`, using only writelane/readlane —
// no scratch memory is touched (the temp VGPR is never written out between
// the two points). Always returns false.
// NOTE(review): the first signature line (doc line 2264) and doc lines
// 2326 and 2331 are missing from this excerpt of the file; the gaps are
// marked below.
2265 MachineBasicBlock &RestoreMBB,
2266 Register SGPR, RegScavenger *RS) const {
2267 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2268 RS);
2269 SB.prepare();
2270 // Generate the spill of SGPR to SB.TmpVGPR.
2271 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2272 auto PVD = SB.getPerVGPRData();
2273 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
// Only the first writelane may treat the temp VGPR's old value as undef.
2274 RegState TmpVGPRFlags = RegState::Undef;
2275 // Write sub registers into the VGPR
2276 for (unsigned i = Offset * PVD.PerVGPR,
2277 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2278 i < e; ++i) {
2279 Register SubReg =
2280 SB.NumSubRegs == 1
2281 ? SB.SuperReg
2282 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2283
2284 MachineInstrBuilder WriteLane =
2285 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2286 SB.TmpVGPR)
2287 .addReg(SubReg, SubKillState)
2288 .addImm(i % PVD.PerVGPR)
2289 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2290 TmpVGPRFlags = {};
2291 // There could be undef components of a spilled super register.
2292 // TODO: Can we detect this and skip the spill?
2293 if (SB.NumSubRegs > 1) {
2294 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2295 RegState SuperKillState = {};
2296 if (i + 1 == SB.NumSubRegs)
2297 SuperKillState |= getKillRegState(SB.IsKill);
2298 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2299 }
2300 }
2301 // Don't need to write VGPR out.
2302 }
2303
2304 // Restore clobbered registers in the specified restore block.
2305 MI = RestoreMBB.end();
2306 SB.setMI(&RestoreMBB, MI);
2307 // Generate the restore of SGPR from SB.TmpVGPR.
2308 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2309 // Don't need to load VGPR in.
2310 // Unpack lanes
2311 for (unsigned i = Offset * PVD.PerVGPR,
2312 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2313 i < e; ++i) {
2314 Register SubReg =
2315 SB.NumSubRegs == 1
2316 ? SB.SuperReg
2317 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2318
2319 assert(SubReg.isPhysical());
// The final lane read of the temp VGPR kills it.
2320 bool LastSubReg = (i + 1 == e);
2321 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2322 SubReg)
2323 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2324 .addImm(i);
2325 if (SB.NumSubRegs > 1 && i == 0)
// NOTE(review): the statement guarded by this `if` (doc line 2326) is
// missing from this excerpt.
2327 }
2328 }
2329 SB.restore();
2330
// NOTE(review): doc line 2331 is missing from this excerpt.
2332 return false;
2333}
2334
2335/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2336/// a VGPR and the stack slot can be safely eliminated when all other users are
2337/// handled.
// Dispatches on the SGPR spill pseudo's opcode: all SI_SPILL_S*_SAVE
// opcodes go to spillSGPR and all SI_SPILL_S*_RESTORE opcodes go to
// restoreSGPR, both with OnlyToVGPR=true so nothing touches memory.
// NOTE(review): the signature lines of this function (doc lines 2338-2339)
// are missing from this excerpt of the file.
2340 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2341 switch (MI->getOpcode()) {
2342 case AMDGPU::SI_SPILL_S1024_SAVE:
2343 case AMDGPU::SI_SPILL_S512_SAVE:
2344 case AMDGPU::SI_SPILL_S384_SAVE:
2345 case AMDGPU::SI_SPILL_S352_SAVE:
2346 case AMDGPU::SI_SPILL_S320_SAVE:
2347 case AMDGPU::SI_SPILL_S288_SAVE:
2348 case AMDGPU::SI_SPILL_S256_SAVE:
2349 case AMDGPU::SI_SPILL_S224_SAVE:
2350 case AMDGPU::SI_SPILL_S192_SAVE:
2351 case AMDGPU::SI_SPILL_S160_SAVE:
2352 case AMDGPU::SI_SPILL_S128_SAVE:
2353 case AMDGPU::SI_SPILL_S96_SAVE:
2354 case AMDGPU::SI_SPILL_S64_SAVE:
2355 case AMDGPU::SI_SPILL_S32_SAVE:
2356 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2357 case AMDGPU::SI_SPILL_S1024_RESTORE:
2358 case AMDGPU::SI_SPILL_S512_RESTORE:
2359 case AMDGPU::SI_SPILL_S384_RESTORE:
2360 case AMDGPU::SI_SPILL_S352_RESTORE:
2361 case AMDGPU::SI_SPILL_S320_RESTORE:
2362 case AMDGPU::SI_SPILL_S288_RESTORE:
2363 case AMDGPU::SI_SPILL_S256_RESTORE:
2364 case AMDGPU::SI_SPILL_S224_RESTORE:
2365 case AMDGPU::SI_SPILL_S192_RESTORE:
2366 case AMDGPU::SI_SPILL_S160_RESTORE:
2367 case AMDGPU::SI_SPILL_S128_RESTORE:
2368 case AMDGPU::SI_SPILL_S96_RESTORE:
2369 case AMDGPU::SI_SPILL_S64_RESTORE:
2370 case AMDGPU::SI_SPILL_S32_RESTORE:
2371 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2372 default:
2373 llvm_unreachable("not an SGPR spill instruction");
2374 }
2375}
2376
2378 int SPAdj, unsigned FIOperandNum,
2379 RegScavenger *RS) const {
2380 MachineFunction *MF = MI->getMF();
2381 MachineBasicBlock *MBB = MI->getParent();
2383 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2384 const SIInstrInfo *TII = ST.getInstrInfo();
2385 const DebugLoc &DL = MI->getDebugLoc();
2386
2387 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2388
2390 "unreserved scratch RSRC register");
2391
2392 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2393 int Index = MI->getOperand(FIOperandNum).getIndex();
2394
2395 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2396 ? getBaseRegister()
2397 : getFrameRegister(*MF);
2398
2399 switch (MI->getOpcode()) {
2400 // SGPR register spill
2401 case AMDGPU::SI_SPILL_S1024_SAVE:
2402 case AMDGPU::SI_SPILL_S512_SAVE:
2403 case AMDGPU::SI_SPILL_S384_SAVE:
2404 case AMDGPU::SI_SPILL_S352_SAVE:
2405 case AMDGPU::SI_SPILL_S320_SAVE:
2406 case AMDGPU::SI_SPILL_S288_SAVE:
2407 case AMDGPU::SI_SPILL_S256_SAVE:
2408 case AMDGPU::SI_SPILL_S224_SAVE:
2409 case AMDGPU::SI_SPILL_S192_SAVE:
2410 case AMDGPU::SI_SPILL_S160_SAVE:
2411 case AMDGPU::SI_SPILL_S128_SAVE:
2412 case AMDGPU::SI_SPILL_S96_SAVE:
2413 case AMDGPU::SI_SPILL_S64_SAVE:
2414 case AMDGPU::SI_SPILL_S32_SAVE: {
2415 return spillSGPR(MI, Index, RS);
2416 }
2417
2418 // SGPR register restore
2419 case AMDGPU::SI_SPILL_S1024_RESTORE:
2420 case AMDGPU::SI_SPILL_S512_RESTORE:
2421 case AMDGPU::SI_SPILL_S384_RESTORE:
2422 case AMDGPU::SI_SPILL_S352_RESTORE:
2423 case AMDGPU::SI_SPILL_S320_RESTORE:
2424 case AMDGPU::SI_SPILL_S288_RESTORE:
2425 case AMDGPU::SI_SPILL_S256_RESTORE:
2426 case AMDGPU::SI_SPILL_S224_RESTORE:
2427 case AMDGPU::SI_SPILL_S192_RESTORE:
2428 case AMDGPU::SI_SPILL_S160_RESTORE:
2429 case AMDGPU::SI_SPILL_S128_RESTORE:
2430 case AMDGPU::SI_SPILL_S96_RESTORE:
2431 case AMDGPU::SI_SPILL_S64_RESTORE:
2432 case AMDGPU::SI_SPILL_S32_RESTORE: {
2433 return restoreSGPR(MI, Index, RS);
2434 }
2435
2436 // VGPR register spill
2437 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2438 // Put mask into M0.
2439 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2440 AMDGPU::M0)
2441 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2442 [[fallthrough]];
2443 }
2444 case AMDGPU::SI_SPILL_V1024_SAVE:
2445 case AMDGPU::SI_SPILL_V512_SAVE:
2446 case AMDGPU::SI_SPILL_V384_SAVE:
2447 case AMDGPU::SI_SPILL_V352_SAVE:
2448 case AMDGPU::SI_SPILL_V320_SAVE:
2449 case AMDGPU::SI_SPILL_V288_SAVE:
2450 case AMDGPU::SI_SPILL_V256_SAVE:
2451 case AMDGPU::SI_SPILL_V224_SAVE:
2452 case AMDGPU::SI_SPILL_V192_SAVE:
2453 case AMDGPU::SI_SPILL_V160_SAVE:
2454 case AMDGPU::SI_SPILL_V128_SAVE:
2455 case AMDGPU::SI_SPILL_V96_SAVE:
2456 case AMDGPU::SI_SPILL_V64_SAVE:
2457 case AMDGPU::SI_SPILL_V32_SAVE:
2458 case AMDGPU::SI_SPILL_V16_SAVE:
2459 case AMDGPU::SI_SPILL_A1024_SAVE:
2460 case AMDGPU::SI_SPILL_A512_SAVE:
2461 case AMDGPU::SI_SPILL_A384_SAVE:
2462 case AMDGPU::SI_SPILL_A352_SAVE:
2463 case AMDGPU::SI_SPILL_A320_SAVE:
2464 case AMDGPU::SI_SPILL_A288_SAVE:
2465 case AMDGPU::SI_SPILL_A256_SAVE:
2466 case AMDGPU::SI_SPILL_A224_SAVE:
2467 case AMDGPU::SI_SPILL_A192_SAVE:
2468 case AMDGPU::SI_SPILL_A160_SAVE:
2469 case AMDGPU::SI_SPILL_A128_SAVE:
2470 case AMDGPU::SI_SPILL_A96_SAVE:
2471 case AMDGPU::SI_SPILL_A64_SAVE:
2472 case AMDGPU::SI_SPILL_A32_SAVE:
2473 case AMDGPU::SI_SPILL_AV1024_SAVE:
2474 case AMDGPU::SI_SPILL_AV512_SAVE:
2475 case AMDGPU::SI_SPILL_AV384_SAVE:
2476 case AMDGPU::SI_SPILL_AV352_SAVE:
2477 case AMDGPU::SI_SPILL_AV320_SAVE:
2478 case AMDGPU::SI_SPILL_AV288_SAVE:
2479 case AMDGPU::SI_SPILL_AV256_SAVE:
2480 case AMDGPU::SI_SPILL_AV224_SAVE:
2481 case AMDGPU::SI_SPILL_AV192_SAVE:
2482 case AMDGPU::SI_SPILL_AV160_SAVE:
2483 case AMDGPU::SI_SPILL_AV128_SAVE:
2484 case AMDGPU::SI_SPILL_AV96_SAVE:
2485 case AMDGPU::SI_SPILL_AV64_SAVE:
2486 case AMDGPU::SI_SPILL_AV32_SAVE:
2487 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2488 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2489 const MachineOperand *VData = TII->getNamedOperand(*MI,
2490 AMDGPU::OpName::vdata);
2491 if (VData->isUndef()) {
2492 MI->eraseFromParent();
2493 return true;
2494 }
2495
2496 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2497 MFI->getStackPtrOffsetReg());
2498
2499 unsigned Opc;
2500 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2501 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2502 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2503 } else {
2504 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2505 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2506 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2507 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2508 }
2509
2510 auto *MBB = MI->getParent();
2511 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2512 if (IsWWMRegSpill) {
2513 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2514 RS->isRegUsed(AMDGPU::SCC));
2515 }
2517 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2518 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2519 *MI->memoperands_begin(), RS);
2521 if (IsWWMRegSpill)
2522 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2523
2524 MI->eraseFromParent();
2525 return true;
2526 }
2527 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2528 // Put mask into M0.
2529 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2530 AMDGPU::M0)
2531 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2532 [[fallthrough]];
2533 }
2534 case AMDGPU::SI_SPILL_V16_RESTORE:
2535 case AMDGPU::SI_SPILL_V32_RESTORE:
2536 case AMDGPU::SI_SPILL_V64_RESTORE:
2537 case AMDGPU::SI_SPILL_V96_RESTORE:
2538 case AMDGPU::SI_SPILL_V128_RESTORE:
2539 case AMDGPU::SI_SPILL_V160_RESTORE:
2540 case AMDGPU::SI_SPILL_V192_RESTORE:
2541 case AMDGPU::SI_SPILL_V224_RESTORE:
2542 case AMDGPU::SI_SPILL_V256_RESTORE:
2543 case AMDGPU::SI_SPILL_V288_RESTORE:
2544 case AMDGPU::SI_SPILL_V320_RESTORE:
2545 case AMDGPU::SI_SPILL_V352_RESTORE:
2546 case AMDGPU::SI_SPILL_V384_RESTORE:
2547 case AMDGPU::SI_SPILL_V512_RESTORE:
2548 case AMDGPU::SI_SPILL_V1024_RESTORE:
2549 case AMDGPU::SI_SPILL_A32_RESTORE:
2550 case AMDGPU::SI_SPILL_A64_RESTORE:
2551 case AMDGPU::SI_SPILL_A96_RESTORE:
2552 case AMDGPU::SI_SPILL_A128_RESTORE:
2553 case AMDGPU::SI_SPILL_A160_RESTORE:
2554 case AMDGPU::SI_SPILL_A192_RESTORE:
2555 case AMDGPU::SI_SPILL_A224_RESTORE:
2556 case AMDGPU::SI_SPILL_A256_RESTORE:
2557 case AMDGPU::SI_SPILL_A288_RESTORE:
2558 case AMDGPU::SI_SPILL_A320_RESTORE:
2559 case AMDGPU::SI_SPILL_A352_RESTORE:
2560 case AMDGPU::SI_SPILL_A384_RESTORE:
2561 case AMDGPU::SI_SPILL_A512_RESTORE:
2562 case AMDGPU::SI_SPILL_A1024_RESTORE:
2563 case AMDGPU::SI_SPILL_AV32_RESTORE:
2564 case AMDGPU::SI_SPILL_AV64_RESTORE:
2565 case AMDGPU::SI_SPILL_AV96_RESTORE:
2566 case AMDGPU::SI_SPILL_AV128_RESTORE:
2567 case AMDGPU::SI_SPILL_AV160_RESTORE:
2568 case AMDGPU::SI_SPILL_AV192_RESTORE:
2569 case AMDGPU::SI_SPILL_AV224_RESTORE:
2570 case AMDGPU::SI_SPILL_AV256_RESTORE:
2571 case AMDGPU::SI_SPILL_AV288_RESTORE:
2572 case AMDGPU::SI_SPILL_AV320_RESTORE:
2573 case AMDGPU::SI_SPILL_AV352_RESTORE:
2574 case AMDGPU::SI_SPILL_AV384_RESTORE:
2575 case AMDGPU::SI_SPILL_AV512_RESTORE:
2576 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2577 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2578 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2579 const MachineOperand *VData = TII->getNamedOperand(*MI,
2580 AMDGPU::OpName::vdata);
2581 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2582 MFI->getStackPtrOffsetReg());
2583
2584 unsigned Opc;
2585 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2586 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2587 Opc = ST.d16PreservesUnusedBits()
2588 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2589 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2590 } else {
2591 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2592 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2593 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2594 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2595 }
2596
2597 auto *MBB = MI->getParent();
2598 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2599 if (IsWWMRegSpill) {
2600 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2601 RS->isRegUsed(AMDGPU::SCC));
2602 }
2603
2605 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2606 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2607 *MI->memoperands_begin(), RS);
2608
2609 if (IsWWMRegSpill)
2610 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2611
2612 MI->eraseFromParent();
2613 return true;
2614 }
2615 case AMDGPU::V_ADD_U32_e32:
2616 case AMDGPU::V_ADD_U32_e64:
2617 case AMDGPU::V_ADD_CO_U32_e32:
2618 case AMDGPU::V_ADD_CO_U32_e64: {
2619 // TODO: Handle sub, and, or.
2620 unsigned NumDefs = MI->getNumExplicitDefs();
2621 unsigned Src0Idx = NumDefs;
2622
2623 bool HasClamp = false;
2624 MachineOperand *VCCOp = nullptr;
2625
2626 switch (MI->getOpcode()) {
2627 case AMDGPU::V_ADD_U32_e32:
2628 break;
2629 case AMDGPU::V_ADD_U32_e64:
2630 HasClamp = MI->getOperand(3).getImm();
2631 break;
2632 case AMDGPU::V_ADD_CO_U32_e32:
2633 VCCOp = &MI->getOperand(3);
2634 break;
2635 case AMDGPU::V_ADD_CO_U32_e64:
2636 VCCOp = &MI->getOperand(1);
2637 HasClamp = MI->getOperand(4).getImm();
2638 break;
2639 default:
2640 break;
2641 }
2642 bool DeadVCC = !VCCOp || VCCOp->isDead();
2643 MachineOperand &DstOp = MI->getOperand(0);
2644 Register DstReg = DstOp.getReg();
2645
2646 unsigned OtherOpIdx =
2647 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2648 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2649
2650 unsigned Src1Idx = Src0Idx + 1;
2651 Register MaterializedReg = FrameReg;
2652 Register ScavengedVGPR;
2653
2654 int64_t Offset = FrameInfo.getObjectOffset(Index);
2655 // For the non-immediate case, we could fall through to the default
2656 // handling, but we do an in-place update of the result register here to
2657 // avoid scavenging another register.
2658 if (OtherOp->isImm()) {
2659 int64_t TotalOffset = OtherOp->getImm() + Offset;
2660
2661 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2662 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2663 // If we can't support a VOP3 literal in the VALU instruction, we
2664 // can't specially fold into the add.
2665 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2666 break;
2667 }
2668
2669 OtherOp->setImm(TotalOffset);
2670 Offset = 0;
2671 }
2672
2673 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2674 // We should just do an in-place update of the result register. However,
2675 // the value there may also be used by the add, in which case we need a
2676 // temporary register.
2677 //
2678 // FIXME: The scavenger is not finding the result register in the
2679 // common case where the add does not read the register.
2680
2681 ScavengedVGPR = RS->scavengeRegisterBackwards(
2682 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2683
2684 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2685 // shift.
2686 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2687 .addDef(ScavengedVGPR, RegState::Renamable)
2688 .addImm(ST.getWavefrontSizeLog2())
2689 .addReg(FrameReg);
2690 MaterializedReg = ScavengedVGPR;
2691 }
2692
2693 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2694 if (ST.hasFlatScratchEnabled() &&
2695 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2696 // We didn't need the shift above, so we have an SGPR for the frame
2697 // register, but may have a VGPR only operand.
2698 //
2699 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2700 // and use the higher constant bus restriction to avoid this copy.
2701
2702 if (!ScavengedVGPR) {
2703 ScavengedVGPR = RS->scavengeRegisterBackwards(
2704 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2705 /*SPAdj=*/0);
2706 }
2707
2708 assert(ScavengedVGPR != DstReg);
2709
2710 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2711 .addReg(MaterializedReg,
2712 getKillRegState(MaterializedReg != FrameReg));
2713 MaterializedReg = ScavengedVGPR;
2714 }
2715
2716 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2717 // is not live, we could use a scalar add + vector add instead of 2
2718 // vector adds.
2719 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2720 .addDef(DstReg, RegState::Renamable);
2721 if (NumDefs == 2)
2722 AddI32.add(MI->getOperand(1));
2723
2724 RegState MaterializedRegFlags =
2725 getKillRegState(MaterializedReg != FrameReg);
2726
2727 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2728 // If we know we have a VGPR already, it's more likely the other
2729 // operand is a legal vsrc0.
2730 AddI32
2731 .add(*OtherOp)
2732 .addReg(MaterializedReg, MaterializedRegFlags);
2733 } else {
2734 // Commute operands to avoid violating VOP2 restrictions. This will
2735 // typically happen when using scratch.
2736 AddI32
2737 .addReg(MaterializedReg, MaterializedRegFlags)
2738 .add(*OtherOp);
2739 }
2740
2741 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2742 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2743 AddI32.addImm(0); // clamp
2744
2745 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2746 AddI32.setOperandDead(3); // Dead vcc
2747
2748 MaterializedReg = DstReg;
2749
2750 OtherOp->ChangeToRegister(MaterializedReg, false);
2751 OtherOp->setIsKill(true);
2753 Offset = 0;
2754 } else if (Offset != 0) {
2755 assert(!MaterializedReg);
2757 Offset = 0;
2758 } else {
2759 if (DeadVCC && !HasClamp) {
2760 assert(Offset == 0);
2761
2762 // TODO: Losing kills and implicit operands. Just mutate to copy and
2763 // let lowerCopy deal with it?
2764 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2765 // Folded to an identity copy.
2766 MI->eraseFromParent();
2767 return true;
2768 }
2769
2770 // The immediate value should be in OtherOp
2771 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2772 MI->removeOperand(FIOperandNum);
2773
2774 unsigned NumOps = MI->getNumOperands();
2775 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2776 MI->removeOperand(I);
2777
2778 if (NumDefs == 2)
2779 MI->removeOperand(1);
2780
2781 // The code below can't deal with a mov.
2782 return true;
2783 }
2784
2785 // This folded to a constant, but we have to keep the add around for
2786 // pointless implicit defs or clamp modifier.
2787 FIOp->ChangeToImmediate(0);
2788 }
2789
2790 // Try to improve legality by commuting.
2791 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2792 std::swap(FIOp, OtherOp);
2793 std::swap(FIOperandNum, OtherOpIdx);
2794 }
2795
2796 // We need at most one mov to satisfy the operand constraints. Prefer to
2797 // move the FI operand first, as it may be a literal in a VOP3
2798 // instruction.
2799 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2800 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2801 // If commuting didn't make the operands legal, we need to materialize
2802 // in a register.
2803 // TODO: Can use SGPR on gfx10+ in some cases.
2804 if (!ScavengedVGPR) {
2805 ScavengedVGPR = RS->scavengeRegisterBackwards(
2806 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2807 /*SPAdj=*/0);
2808 }
2809
2810 assert(ScavengedVGPR != DstReg);
2811
2812 MachineOperand &Src = MI->getOperand(SrcIdx);
2813 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2814 .add(Src);
2815
2816 Src.ChangeToRegister(ScavengedVGPR, false);
2817 Src.setIsKill(true);
2818 break;
2819 }
2820 }
2821
2822 // Fold out add of 0 case that can appear in kernels.
2823 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2824 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2825 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2826 }
2827
2828 MI->eraseFromParent();
2829 }
2830
2831 return true;
2832 }
2833 case AMDGPU::S_ADD_I32:
2834 case AMDGPU::S_ADD_U32: {
2835 // TODO: Handle s_or_b32, s_and_b32.
2836 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2837 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2838
2839 assert(FrameReg || MFI->isBottomOfStack());
2840
2841 MachineOperand &DstOp = MI->getOperand(0);
2842 const DebugLoc &DL = MI->getDebugLoc();
2843 Register MaterializedReg = FrameReg;
2844
2845 // Defend against live scc, which should never happen in practice.
2846 bool DeadSCC = MI->getOperand(3).isDead();
2847
2848 Register TmpReg;
2849
2850 // FIXME: Scavenger should figure out that the result register is
2851 // available. Also should do this for the v_add case.
2852 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2853 TmpReg = DstOp.getReg();
2854
2855 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2856 // FIXME: In the common case where the add does not also read its result
2857 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2858 // available.
2859 if (!TmpReg)
2860 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2861 MI, /*RestoreAfter=*/false, 0,
2862 /*AllowSpill=*/false);
2863 if (TmpReg) {
2864 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2865 .addDef(TmpReg, RegState::Renamable)
2866 .addReg(FrameReg)
2867 .addImm(ST.getWavefrontSizeLog2())
2868 .setOperandDead(3); // Set SCC dead
2869 }
2870 MaterializedReg = TmpReg;
2871 }
2872
2873 int64_t Offset = FrameInfo.getObjectOffset(Index);
2874
2875 // For the non-immediate case, we could fall through to the default
2876 // handling, but we do an in-place update of the result register here to
2877 // avoid scavenging another register.
2878 if (OtherOp.isImm()) {
2879 OtherOp.setImm(OtherOp.getImm() + Offset);
2880 Offset = 0;
2881
2882 if (MaterializedReg)
2883 FIOp->ChangeToRegister(MaterializedReg, false);
2884 else
2885 FIOp->ChangeToImmediate(0);
2886 } else if (MaterializedReg) {
2887 // If we can't fold the other operand, do another increment.
2888 Register DstReg = DstOp.getReg();
2889
2890 if (!TmpReg && MaterializedReg == FrameReg) {
2891 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2892 MI, /*RestoreAfter=*/false, 0,
2893 /*AllowSpill=*/false);
2894 DstReg = TmpReg;
2895 }
2896
2897 if (TmpReg) {
2898 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2899 .addDef(DstReg, RegState::Renamable)
2900 .addReg(MaterializedReg, RegState::Kill)
2901 .add(OtherOp);
2902 if (DeadSCC)
2903 AddI32.setOperandDead(3);
2904
2905 MaterializedReg = DstReg;
2906
2907 OtherOp.ChangeToRegister(MaterializedReg, false);
2908 OtherOp.setIsKill(true);
2909 OtherOp.setIsRenamable(true);
2910 }
2912 } else {
2913 // If we don't have any other offset to apply, we can just directly
2914 // interpret the frame index as the offset.
2916 }
2917
2918 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2919 assert(Offset == 0);
2920 MI->removeOperand(3);
2921 MI->removeOperand(OtherOpIdx);
2922 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2923 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2924 assert(Offset == 0);
2925 MI->removeOperand(3);
2926 MI->removeOperand(FIOperandNum);
2927 MI->setDesc(
2928 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2929 }
2930
2931 assert(!FIOp->isFI());
2932 return true;
2933 }
2934 default: {
2935 break;
2936 }
2937 }
2938
2939 int64_t Offset = FrameInfo.getObjectOffset(Index);
2940 if (ST.hasFlatScratchEnabled()) {
2941 if (TII->isFLATScratch(*MI)) {
2942 assert(
2943 (int16_t)FIOperandNum ==
2944 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2945
2946 // The offset is always swizzled, just replace it
2947 if (FrameReg)
2948 FIOp->ChangeToRegister(FrameReg, false);
2949
2951 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2952 int64_t NewOffset = Offset + OffsetOp->getImm();
2953 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2955 OffsetOp->setImm(NewOffset);
2956 if (FrameReg)
2957 return false;
2958 Offset = 0;
2959 }
2960
2961 if (!Offset) {
2962 unsigned Opc = MI->getOpcode();
2963 int NewOpc = -1;
2964 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2966 } else if (ST.hasFlatScratchSTMode()) {
2967 // On GFX10 we have ST mode to use no registers for an address.
2968 // Otherwise we need to materialize 0 into an SGPR.
2970 }
2971
2972 if (NewOpc != -1) {
2973 // removeOperand doesn't fixup tied operand indexes as it goes, so
2974 // it asserts. Untie vdst_in for now and retie them afterwards.
2975 int VDstIn =
2976 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2977 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2978 MI->getOperand(VDstIn).isTied();
2979 if (TiedVDst)
2980 MI->untieRegOperand(VDstIn);
2981
2982 MI->removeOperand(
2983 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2984
2985 if (TiedVDst) {
2986 int NewVDst =
2987 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2988 int NewVDstIn =
2989 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2990 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2991 MI->tieOperands(NewVDst, NewVDstIn);
2992 }
2993 MI->setDesc(TII->get(NewOpc));
2994 return false;
2995 }
2996 }
2997 }
2998
2999 if (!FrameReg) {
3001 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
3002 return false;
3003 }
3004
3005 // We need to use register here. Check if we can use an SGPR or need
3006 // a VGPR.
3007 FIOp->ChangeToRegister(AMDGPU::M0, false);
3008 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
3009
3010 if (!Offset && FrameReg && UseSGPR) {
3011 FIOp->setReg(FrameReg);
3012 return false;
3013 }
3014
3015 const TargetRegisterClass *RC =
3016 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
3017
3018 Register TmpReg =
3019 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
3020 FIOp->setReg(TmpReg);
3021 FIOp->setIsKill();
3022
3023 if ((!FrameReg || !Offset) && TmpReg) {
3024 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
3025 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
3026 if (FrameReg)
3027 MIB.addReg(FrameReg);
3028 else
3029 MIB.addImm(Offset);
3030
3031 return false;
3032 }
3033
3034 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3035 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3036
3037 Register TmpSReg =
3038 UseSGPR ? TmpReg
3039 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3040 MI, false, 0, !UseSGPR);
3041
3042 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
3043 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
3044 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
3045 Register TmpVGPR = RS->scavengeRegisterBackwards(
3046 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3047
3048 // Materialize the frame register.
3049 auto MIB =
3050 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
3051 if (FrameReg)
3052 MIB.addReg(FrameReg);
3053 else
3054 MIB.addImm(Offset);
3055
3056 // Add the offset to the frame register.
3057 if (FrameReg && Offset)
3058 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3059 .addReg(FrameReg, RegState::Kill)
3060 .addImm(Offset);
3061
3062 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3063 .add(MI->getOperand(0)) // $vdata
3064 .addReg(TmpVGPR) // $vaddr
3065 .addImm(0) // Offset
3066 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3067 MI->eraseFromParent();
3068 return true;
3069 }
3070 report_fatal_error("Cannot scavenge register in FI elimination!");
3071 }
3072
3073 if (!TmpSReg) {
3074 // Use frame register and restore it after.
3075 TmpSReg = FrameReg;
3076 FIOp->setReg(FrameReg);
3077 FIOp->setIsKill(false);
3078 }
3079
3080 if (NeedSaveSCC) {
3081 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3082 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3083 .addReg(FrameReg)
3084 .addImm(Offset);
3085 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3086 .addReg(TmpSReg)
3087 .addImm(0);
3088 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3089 .addImm(0)
3090 .addReg(TmpSReg);
3091 } else {
3092 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3093 .addReg(FrameReg)
3094 .addImm(Offset);
3095 }
3096
3097 if (!UseSGPR)
3098 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3099 .addReg(TmpSReg, RegState::Kill);
3100
3101 if (TmpSReg == FrameReg) {
3102 // Undo frame register modification.
3103 if (NeedSaveSCC &&
3104 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3106 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3107 TmpSReg)
3108 .addReg(FrameReg)
3109 .addImm(-Offset);
3110 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3111 .addReg(TmpSReg)
3112 .addImm(0);
3113 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3114 TmpSReg)
3115 .addImm(0)
3116 .addReg(TmpSReg);
3117 } else {
3118 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3119 FrameReg)
3120 .addReg(FrameReg)
3121 .addImm(-Offset);
3122 }
3123 }
3124
3125 return false;
3126 }
3127
3128 bool IsMUBUF = TII->isMUBUF(*MI);
3129
3130 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3131 // Convert to a swizzled stack address by scaling by the wave size.
3132 // In an entry function/kernel the offset is already swizzled.
3133 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3134 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3135 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3136 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3137 ? &AMDGPU::SReg_32RegClass
3138 : &AMDGPU::VGPR_32RegClass;
3139 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3140 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3141 MI->getOpcode() == AMDGPU::S_MOV_B32;
3142 Register ResultReg =
3143 IsCopy ? MI->getOperand(0).getReg()
3144 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3145
3146 int64_t Offset = FrameInfo.getObjectOffset(Index);
3147 if (Offset == 0) {
3148 unsigned OpCode =
3149 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3150 Register TmpResultReg = ResultReg;
3151 if (IsSALU && LiveSCC) {
3152 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3153 MI, false, 0);
3154 }
3155
3156 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3157 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3158 // For V_LSHRREV, the operands are reversed (the shift count goes
3159 // first).
3160 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3161 else
3162 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3163 if (IsSALU && !LiveSCC)
3164 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3165 if (IsSALU && LiveSCC) {
3166 Register NewDest;
3167 if (IsCopy) {
3168 assert(ResultReg.isPhysical());
3169 NewDest = ResultReg;
3170 } else {
3171 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3172 Shift, false, 0);
3173 }
3174 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3175 .addReg(TmpResultReg);
3176 ResultReg = NewDest;
3177 }
3178 } else {
3180 if (!IsSALU) {
3181 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3182 nullptr) {
3183 // Reuse ResultReg in intermediate step.
3184 Register ScaledReg = ResultReg;
3185
3186 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3187 ScaledReg)
3188 .addImm(ST.getWavefrontSizeLog2())
3189 .addReg(FrameReg);
3190
3191 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3192
3193 // TODO: Fold if use instruction is another add of a constant.
3194 if (IsVOP2 ||
3195 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3196 // FIXME: This can fail
3197 MIB.addImm(Offset);
3198 MIB.addReg(ScaledReg, RegState::Kill);
3199 if (!IsVOP2)
3200 MIB.addImm(0); // clamp bit
3201 } else {
3202 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3203 "Need to reuse carry out register");
3204
3205 // Use scavenged unused carry out as offset register.
3206 Register ConstOffsetReg;
3207 if (!isWave32)
3208 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3209 else
3210 ConstOffsetReg = MIB.getReg(1);
3211
3212 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3213 ConstOffsetReg)
3214 .addImm(Offset);
3215 MIB.addReg(ConstOffsetReg, RegState::Kill);
3216 MIB.addReg(ScaledReg, RegState::Kill);
3217 MIB.addImm(0); // clamp bit
3218 }
3219 }
3220 }
3221 if (!MIB || IsSALU) {
3222 // We have to produce a carry out, and there isn't a free SGPR pair
3223 // for it. We can keep the whole computation on the SALU to avoid
3224 // clobbering an additional register at the cost of an extra mov.
3225
3226 // We may have 1 free scratch SGPR even though a carry out is
3227 // unavailable. Only one additional mov is needed.
3228 Register TmpScaledReg = IsCopy && IsSALU
3229 ? ResultReg
3230 : RS->scavengeRegisterBackwards(
3231 AMDGPU::SReg_32_XM0RegClass, MI,
3232 false, 0, /*AllowSpill=*/false);
3233 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3234 Register TmpResultReg = ScaledReg;
3235
3236 if (!LiveSCC) {
3237 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3238 .addReg(FrameReg)
3239 .addImm(ST.getWavefrontSizeLog2());
3240 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3241 .addReg(TmpResultReg, RegState::Kill)
3242 .addImm(Offset);
3243 } else {
3244 TmpResultReg = RS->scavengeRegisterBackwards(
3245 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3246
3248 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3249 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3250 TmpResultReg)
3251 .addImm(ST.getWavefrontSizeLog2())
3252 .addReg(FrameReg);
3253 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3254 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3255 .addImm(Offset);
3256 Add.addReg(ResultReg, RegState::Kill)
3257 .addReg(TmpResultReg, RegState::Kill)
3258 .addImm(0);
3259 } else
3260 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3261 } else {
3262 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3263 "offset is unsafe for v_mad_u32_u24");
3264
3265 // We start with a frame pointer with a wave space value, and
3266 // an offset in lane-space. We are materializing a lane space
3267 // value. We can either do a right shift of the frame pointer
3268 // to get to lane space, or a left shift of the offset to get
3269 // to wavespace. We can right shift after the computation to
3270 // get back to the desired per-lane value. We are using the
3271 // mad_u32_u24 primarily as an add with no carry out clobber.
3272 bool IsInlinableLiteral =
3273 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3274 if (!IsInlinableLiteral) {
3275 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3276 TmpResultReg)
3277 .addImm(Offset);
3278 }
3279
3280 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3281 TmpResultReg);
3282
3283 if (!IsInlinableLiteral) {
3284 Add.addReg(TmpResultReg, RegState::Kill);
3285 } else {
3286 // We fold the offset into mad itself if its inlinable.
3287 Add.addImm(Offset);
3288 }
3289 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3290 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3291 TmpResultReg)
3292 .addImm(ST.getWavefrontSizeLog2())
3293 .addReg(TmpResultReg);
3294 }
3295
3296 Register NewDest;
3297 if (IsCopy) {
3298 NewDest = ResultReg;
3299 } else {
3300 NewDest = RS->scavengeRegisterBackwards(
3301 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3302 /*AllowSpill=*/true);
3303 }
3304
3305 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3306 NewDest)
3307 .addReg(TmpResultReg);
3308 ResultReg = NewDest;
3309 }
3310 if (!IsSALU)
3311 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3312 .addReg(TmpResultReg, RegState::Kill);
3313 // If there were truly no free SGPRs, we need to undo everything.
3314 if (!TmpScaledReg.isValid()) {
3315 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3316 .addReg(ScaledReg, RegState::Kill)
3317 .addImm(-Offset);
3318 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3319 .addReg(FrameReg)
3320 .addImm(ST.getWavefrontSizeLog2());
3321 }
3322 }
3323 }
3324
3325 // Don't introduce an extra copy if we're just materializing in a mov.
3326 if (IsCopy) {
3327 MI->eraseFromParent();
3328 return true;
3329 }
3330 FIOp->ChangeToRegister(ResultReg, false, false, true);
3331 return false;
3332 }
3333
3334 if (IsMUBUF) {
3335 // Disable offen so we don't need a 0 vgpr base.
3336 assert(
3337 static_cast<int>(FIOperandNum) ==
3338 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3339
3340 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3341 assert((SOffset.isImm() && SOffset.getImm() == 0));
3342
3343 if (FrameReg != AMDGPU::NoRegister)
3344 SOffset.ChangeToRegister(FrameReg, false);
3345
3346 int64_t Offset = FrameInfo.getObjectOffset(Index);
3347 int64_t OldImm =
3348 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3349 int64_t NewOffset = OldImm + Offset;
3350
3351 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3352 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3353 MI->eraseFromParent();
3354 return true;
3355 }
3356 }
3357
3358 // If the offset is simply too big, don't convert to a scratch wave offset
3359 // relative index.
3360
3362 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3363 Register TmpReg =
3364 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3365 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3366 .addImm(Offset);
3367 FIOp->ChangeToRegister(TmpReg, false, false, true);
3368 }
3369
3370 return false;
3371}
3372
3376
3378 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3379}
3380
3382 return getRegBitWidth(RC.getID());
3383}
3384
3385static const TargetRegisterClass *
3387 if (BitWidth == 64)
3388 return &AMDGPU::VReg_64RegClass;
3389 if (BitWidth == 96)
3390 return &AMDGPU::VReg_96RegClass;
3391 if (BitWidth == 128)
3392 return &AMDGPU::VReg_128RegClass;
3393 if (BitWidth == 160)
3394 return &AMDGPU::VReg_160RegClass;
3395 if (BitWidth == 192)
3396 return &AMDGPU::VReg_192RegClass;
3397 if (BitWidth == 224)
3398 return &AMDGPU::VReg_224RegClass;
3399 if (BitWidth == 256)
3400 return &AMDGPU::VReg_256RegClass;
3401 if (BitWidth == 288)
3402 return &AMDGPU::VReg_288RegClass;
3403 if (BitWidth == 320)
3404 return &AMDGPU::VReg_320RegClass;
3405 if (BitWidth == 352)
3406 return &AMDGPU::VReg_352RegClass;
3407 if (BitWidth == 384)
3408 return &AMDGPU::VReg_384RegClass;
3409 if (BitWidth == 512)
3410 return &AMDGPU::VReg_512RegClass;
3411 if (BitWidth == 1024)
3412 return &AMDGPU::VReg_1024RegClass;
3413
3414 return nullptr;
3415}
3416
3417static const TargetRegisterClass *
3419 if (BitWidth == 64)
3420 return &AMDGPU::VReg_64_Align2RegClass;
3421 if (BitWidth == 96)
3422 return &AMDGPU::VReg_96_Align2RegClass;
3423 if (BitWidth == 128)
3424 return &AMDGPU::VReg_128_Align2RegClass;
3425 if (BitWidth == 160)
3426 return &AMDGPU::VReg_160_Align2RegClass;
3427 if (BitWidth == 192)
3428 return &AMDGPU::VReg_192_Align2RegClass;
3429 if (BitWidth == 224)
3430 return &AMDGPU::VReg_224_Align2RegClass;
3431 if (BitWidth == 256)
3432 return &AMDGPU::VReg_256_Align2RegClass;
3433 if (BitWidth == 288)
3434 return &AMDGPU::VReg_288_Align2RegClass;
3435 if (BitWidth == 320)
3436 return &AMDGPU::VReg_320_Align2RegClass;
3437 if (BitWidth == 352)
3438 return &AMDGPU::VReg_352_Align2RegClass;
3439 if (BitWidth == 384)
3440 return &AMDGPU::VReg_384_Align2RegClass;
3441 if (BitWidth == 512)
3442 return &AMDGPU::VReg_512_Align2RegClass;
3443 if (BitWidth == 1024)
3444 return &AMDGPU::VReg_1024_Align2RegClass;
3445
3446 return nullptr;
3447}
3448
3449const TargetRegisterClass *
3451 if (BitWidth == 1)
3452 return &AMDGPU::VReg_1RegClass;
3453 if (BitWidth == 16)
3454 return &AMDGPU::VGPR_16RegClass;
3455 if (BitWidth == 32)
3456 return &AMDGPU::VGPR_32RegClass;
3457 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3459}
3460
3461const TargetRegisterClass *
3463 if (BitWidth <= 32)
3464 return &AMDGPU::VGPR_32_Lo256RegClass;
3465 if (BitWidth <= 64)
3466 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3467 if (BitWidth <= 96)
3468 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3469 if (BitWidth <= 128)
3470 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3471 if (BitWidth <= 160)
3472 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3473 if (BitWidth <= 192)
3474 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3475 if (BitWidth <= 224)
3476 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3477 if (BitWidth <= 256)
3478 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3479 if (BitWidth <= 288)
3480 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3481 if (BitWidth <= 320)
3482 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3483 if (BitWidth <= 352)
3484 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3485 if (BitWidth <= 384)
3486 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3487 if (BitWidth <= 512)
3488 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3489 if (BitWidth <= 1024)
3490 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3491
3492 return nullptr;
3493}
3494
3495static const TargetRegisterClass *
3497 if (BitWidth == 64)
3498 return &AMDGPU::AReg_64RegClass;
3499 if (BitWidth == 96)
3500 return &AMDGPU::AReg_96RegClass;
3501 if (BitWidth == 128)
3502 return &AMDGPU::AReg_128RegClass;
3503 if (BitWidth == 160)
3504 return &AMDGPU::AReg_160RegClass;
3505 if (BitWidth == 192)
3506 return &AMDGPU::AReg_192RegClass;
3507 if (BitWidth == 224)
3508 return &AMDGPU::AReg_224RegClass;
3509 if (BitWidth == 256)
3510 return &AMDGPU::AReg_256RegClass;
3511 if (BitWidth == 288)
3512 return &AMDGPU::AReg_288RegClass;
3513 if (BitWidth == 320)
3514 return &AMDGPU::AReg_320RegClass;
3515 if (BitWidth == 352)
3516 return &AMDGPU::AReg_352RegClass;
3517 if (BitWidth == 384)
3518 return &AMDGPU::AReg_384RegClass;
3519 if (BitWidth == 512)
3520 return &AMDGPU::AReg_512RegClass;
3521 if (BitWidth == 1024)
3522 return &AMDGPU::AReg_1024RegClass;
3523
3524 return nullptr;
3525}
3526
3527static const TargetRegisterClass *
3529 if (BitWidth == 64)
3530 return &AMDGPU::AReg_64_Align2RegClass;
3531 if (BitWidth == 96)
3532 return &AMDGPU::AReg_96_Align2RegClass;
3533 if (BitWidth == 128)
3534 return &AMDGPU::AReg_128_Align2RegClass;
3535 if (BitWidth == 160)
3536 return &AMDGPU::AReg_160_Align2RegClass;
3537 if (BitWidth == 192)
3538 return &AMDGPU::AReg_192_Align2RegClass;
3539 if (BitWidth == 224)
3540 return &AMDGPU::AReg_224_Align2RegClass;
3541 if (BitWidth == 256)
3542 return &AMDGPU::AReg_256_Align2RegClass;
3543 if (BitWidth == 288)
3544 return &AMDGPU::AReg_288_Align2RegClass;
3545 if (BitWidth == 320)
3546 return &AMDGPU::AReg_320_Align2RegClass;
3547 if (BitWidth == 352)
3548 return &AMDGPU::AReg_352_Align2RegClass;
3549 if (BitWidth == 384)
3550 return &AMDGPU::AReg_384_Align2RegClass;
3551 if (BitWidth == 512)
3552 return &AMDGPU::AReg_512_Align2RegClass;
3553 if (BitWidth == 1024)
3554 return &AMDGPU::AReg_1024_Align2RegClass;
3555
3556 return nullptr;
3557}
3558
3559const TargetRegisterClass *
3561 if (BitWidth == 16)
3562 return &AMDGPU::AGPR_LO16RegClass;
3563 if (BitWidth == 32)
3564 return &AMDGPU::AGPR_32RegClass;
3565 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3567}
3568
3569static const TargetRegisterClass *
3571 if (BitWidth == 64)
3572 return &AMDGPU::AV_64RegClass;
3573 if (BitWidth == 96)
3574 return &AMDGPU::AV_96RegClass;
3575 if (BitWidth == 128)
3576 return &AMDGPU::AV_128RegClass;
3577 if (BitWidth == 160)
3578 return &AMDGPU::AV_160RegClass;
3579 if (BitWidth == 192)
3580 return &AMDGPU::AV_192RegClass;
3581 if (BitWidth == 224)
3582 return &AMDGPU::AV_224RegClass;
3583 if (BitWidth == 256)
3584 return &AMDGPU::AV_256RegClass;
3585 if (BitWidth == 288)
3586 return &AMDGPU::AV_288RegClass;
3587 if (BitWidth == 320)
3588 return &AMDGPU::AV_320RegClass;
3589 if (BitWidth == 352)
3590 return &AMDGPU::AV_352RegClass;
3591 if (BitWidth == 384)
3592 return &AMDGPU::AV_384RegClass;
3593 if (BitWidth == 512)
3594 return &AMDGPU::AV_512RegClass;
3595 if (BitWidth == 1024)
3596 return &AMDGPU::AV_1024RegClass;
3597
3598 return nullptr;
3599}
3600
3601static const TargetRegisterClass *
3603 if (BitWidth == 64)
3604 return &AMDGPU::AV_64_Align2RegClass;
3605 if (BitWidth == 96)
3606 return &AMDGPU::AV_96_Align2RegClass;
3607 if (BitWidth == 128)
3608 return &AMDGPU::AV_128_Align2RegClass;
3609 if (BitWidth == 160)
3610 return &AMDGPU::AV_160_Align2RegClass;
3611 if (BitWidth == 192)
3612 return &AMDGPU::AV_192_Align2RegClass;
3613 if (BitWidth == 224)
3614 return &AMDGPU::AV_224_Align2RegClass;
3615 if (BitWidth == 256)
3616 return &AMDGPU::AV_256_Align2RegClass;
3617 if (BitWidth == 288)
3618 return &AMDGPU::AV_288_Align2RegClass;
3619 if (BitWidth == 320)
3620 return &AMDGPU::AV_320_Align2RegClass;
3621 if (BitWidth == 352)
3622 return &AMDGPU::AV_352_Align2RegClass;
3623 if (BitWidth == 384)
3624 return &AMDGPU::AV_384_Align2RegClass;
3625 if (BitWidth == 512)
3626 return &AMDGPU::AV_512_Align2RegClass;
3627 if (BitWidth == 1024)
3628 return &AMDGPU::AV_1024_Align2RegClass;
3629
3630 return nullptr;
3631}
3632
3633const TargetRegisterClass *
3635 if (BitWidth == 32)
3636 return &AMDGPU::AV_32RegClass;
3637 return ST.needsAlignedVGPRs()
3640}
3641
const TargetRegisterClass *
// NOTE(review): the line carrying this member's name was lost in extraction;
// restore it from upstream before building.
  // TODO: In principle this should use AV classes for gfx908 too. This is
  // limited to 90a+ to avoid regressing special case copy optimizations which
  // need new handling. The core issue is that it's not possible to directly
  // copy between AGPRs on gfx908, and the current optimizations around that
  // expect to see copies to VGPR.
  return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
                             // NOTE(review): the non-gfx90a arm was elided in
                             // extraction — presumably the plain VGPR class
                             // for this width; confirm against upstream.
}
3652
3653const TargetRegisterClass *
3655 if (BitWidth == 16 || BitWidth == 32)
3656 return &AMDGPU::SReg_32RegClass;
3657 if (BitWidth == 64)
3658 return &AMDGPU::SReg_64RegClass;
3659 if (BitWidth == 96)
3660 return &AMDGPU::SGPR_96RegClass;
3661 if (BitWidth == 128)
3662 return &AMDGPU::SGPR_128RegClass;
3663 if (BitWidth == 160)
3664 return &AMDGPU::SGPR_160RegClass;
3665 if (BitWidth == 192)
3666 return &AMDGPU::SGPR_192RegClass;
3667 if (BitWidth == 224)
3668 return &AMDGPU::SGPR_224RegClass;
3669 if (BitWidth == 256)
3670 return &AMDGPU::SGPR_256RegClass;
3671 if (BitWidth == 288)
3672 return &AMDGPU::SGPR_288RegClass;
3673 if (BitWidth == 320)
3674 return &AMDGPU::SGPR_320RegClass;
3675 if (BitWidth == 352)
3676 return &AMDGPU::SGPR_352RegClass;
3677 if (BitWidth == 384)
3678 return &AMDGPU::SGPR_384RegClass;
3679 if (BitWidth == 512)
3680 return &AMDGPU::SGPR_512RegClass;
3681 if (BitWidth == 1024)
3682 return &AMDGPU::SGPR_1024RegClass;
3683
3684 return nullptr;
3685}
3686
3688 Register Reg) const {
3689 const TargetRegisterClass *RC;
3690 if (Reg.isVirtual())
3691 RC = MRI.getRegClass(Reg);
3692 else
3693 RC = getPhysRegBaseClass(Reg);
3694 return RC && isSGPRClass(RC);
3695}
3696
3697const TargetRegisterClass *
3699 unsigned Size = getRegSizeInBits(*SRC);
3700
3701 switch (SRC->getID()) {
3702 default:
3703 break;
3704 case AMDGPU::VS_32_Lo256RegClassID:
3705 case AMDGPU::VS_64_Lo256RegClassID:
3706 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3707 }
3708
3709 const TargetRegisterClass *VRC =
3710 getAllocatableClass(getVGPRClassForBitWidth(Size));
3711 assert(VRC && "Invalid register class size");
3712 return VRC;
3713}
3714
3715const TargetRegisterClass *
3717 unsigned Size = getRegSizeInBits(*SRC);
3719 assert(ARC && "Invalid register class size");
3720 return ARC;
3721}
3722
3723const TargetRegisterClass *
3725 unsigned Size = getRegSizeInBits(*SRC);
3727 assert(ARC && "Invalid register class size");
3728 return ARC;
3729}
3730
3731const TargetRegisterClass *
3733 unsigned Size = getRegSizeInBits(*VRC);
3734 if (Size == 32)
3735 return &AMDGPU::SGPR_32RegClass;
3737 assert(SRC && "Invalid register class size");
3738 return SRC;
3739}
3740
3741const TargetRegisterClass *
3743 const TargetRegisterClass *SubRC,
3744 unsigned SubIdx) const {
3745 // Ensure this subregister index is aligned in the super register.
3746 const TargetRegisterClass *MatchRC =
3747 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3748 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3749}
3750
3751bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3754 return !ST.hasMFMAInlineLiteralBug();
3755
3756 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3757 OpType <= AMDGPU::OPERAND_SRC_LAST;
3758}
3759
3760bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3761 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3762 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3764}
3765
3766/// Returns a lowest register that is not used at any point in the function.
3767/// If all registers are used, then this function will return
3768/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3769/// highest unused register.
3771 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3772 const MachineFunction &MF, bool ReserveHighestRegister) const {
3773 if (ReserveHighestRegister) {
3774 for (MCRegister Reg : reverse(*RC))
3775 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3776 return Reg;
3777 } else {
3778 for (MCRegister Reg : *RC)
3779 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3780 return Reg;
3781 }
3782 return MCRegister();
3783}
3784
3786 const RegisterBankInfo &RBI,
3787 Register Reg) const {
3788 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3789 if (!RB)
3790 return false;
3791
3792 return !RBI.isDivergentRegBank(RB);
3793}
3794
3796 unsigned EltSize) const {
3797 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3798 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3799
3800 const unsigned RegHalves = RegBitWidth / 16;
3801 const unsigned EltHalves = EltSize / 2;
3802 assert(RegSplitParts.size() + 1 >= EltHalves);
3803
3804 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3805 const unsigned NumParts = RegHalves / EltHalves;
3806
3807 return ArrayRef(Parts.data(), NumParts);
3808}
3809
3812 Register Reg) const {
3813 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3814}
3815
3816const TargetRegisterClass *
3818 const MachineOperand &MO) const {
3819 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3820 return getSubRegisterClass(SrcRC, MO.getSubReg());
3821}
3822
3824 Register Reg) const {
3825 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3826 // Registers without classes are unaddressable, SGPR-like registers.
3827 return RC && isVGPRClass(RC);
3828}
3829
3831 Register Reg) const {
3832 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3833
3834 // Registers without classes are unaddressable, SGPR-like registers.
3835 return RC && isAGPRClass(RC);
3836}
3837
3839 MachineFunction &MF) const {
3840 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3841 switch (RC->getID()) {
3842 default:
3843 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3844 case AMDGPU::VGPR_32RegClassID:
3845 return std::min(
3846 ST.getMaxNumVGPRs(
3847 MinOcc,
3849 ST.getMaxNumVGPRs(MF));
3850 case AMDGPU::SGPR_32RegClassID:
3851 case AMDGPU::SGPR_LO16RegClassID:
3852 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3853 }
3854}
3855
3857 unsigned Idx) const {
3858 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3859 case AMDGPU::RegisterPressureSets::VGPR_32:
3860 case AMDGPU::RegisterPressureSets::AGPR_32:
3861 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3862 const_cast<MachineFunction &>(MF));
3863 case AMDGPU::RegisterPressureSets::SReg_32:
3864 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3865 const_cast<MachineFunction &>(MF));
3866 }
3867
3868 llvm_unreachable("Unexpected register pressure set!");
3869}
3870
3871const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
3872 static const int Empty[] = { -1 };
3873
3874 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
3875 return Empty;
3876
3877 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3878}
3879
3881 ArrayRef<MCPhysReg> Order,
3883 const MachineFunction &MF,
3884 const VirtRegMap *VRM,
3885 const LiveRegMatrix *Matrix) const {
3886
3887 const MachineRegisterInfo &MRI = MF.getRegInfo();
3888 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3889
3890 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3891
3892 switch (Hint.first) {
3893 case AMDGPURI::Size32: {
3894 Register Paired = Hint.second;
3895 assert(Paired);
3896 Register PairedPhys;
3897 if (Paired.isPhysical()) {
3898 PairedPhys =
3899 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3900 } else if (VRM && VRM->hasPhys(Paired)) {
3901 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3902 &AMDGPU::VGPR_32RegClass);
3903 }
3904
3905 // Prefer the paired physreg.
3906 if (PairedPhys)
3907 // isLo(Paired) is implicitly true here from the API of
3908 // getMatchingSuperReg.
3909 Hints.push_back(PairedPhys);
3910 return false;
3911 }
3912 case AMDGPURI::Size16: {
3913 Register Paired = Hint.second;
3914 assert(Paired);
3915 Register PairedPhys;
3916 if (Paired.isPhysical()) {
3917 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3918 } else if (VRM && VRM->hasPhys(Paired)) {
3919 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3920 }
3921
3922 // First prefer the paired physreg.
3923 if (PairedPhys)
3924 Hints.push_back(PairedPhys);
3925 else {
3926 // Add all the lo16 physregs.
3927 // When the Paired operand has not yet been assigned a physreg it is
3928 // better to try putting VirtReg in a lo16 register, because possibly
3929 // later Paired can be assigned to the overlapping register and the COPY
3930 // can be eliminated.
3931 for (MCPhysReg PhysReg : Order) {
3932 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3933 continue;
3934 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3935 !MRI.isReserved(PhysReg))
3936 Hints.push_back(PhysReg);
3937 }
3938 }
3939 return false;
3940 }
3941 default:
3942 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3943 VRM);
3944 }
3945}
3946
3948 // Not a callee saved register.
3949 return AMDGPU::SGPR30_SGPR31;
3950}
3951
3952const TargetRegisterClass *
3954 const RegisterBank &RB) const {
3955 switch (RB.getID()) {
3956 case AMDGPU::VGPRRegBankID:
3958 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3959 case AMDGPU::VCCRegBankID:
3960 assert(Size == 1);
3961 return getWaveMaskRegClass();
3962 case AMDGPU::SGPRRegBankID:
3963 return getSGPRClassForBitWidth(std::max(32u, Size));
3964 case AMDGPU::AGPRRegBankID:
3965 return getAGPRClassForBitWidth(std::max(32u, Size));
3966 default:
3967 llvm_unreachable("unknown register bank");
3968 }
3969}
3970
3971const TargetRegisterClass *
3973 const MachineRegisterInfo &MRI) const {
3974 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3975 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3976 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3977
3978 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3979 return getAllocatableClass(RC);
3980
3981 return nullptr;
3982}
3983
3985 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3986}
3987
3989 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3990}
3991
3993 // VGPR tuples have an alignment requirement on gfx90a variants.
3994 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3995 : &AMDGPU::VReg_64RegClass;
3996}
3997
3998// Find reaching register definition
4002 LiveIntervals *LIS) const {
4003 auto &MDT = LIS->getDomTree();
4004 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
4005 SlotIndex DefIdx;
4006
4007 if (Reg.isVirtual()) {
4008 if (!LIS->hasInterval(Reg))
4009 return nullptr;
4010 LiveInterval &LI = LIS->getInterval(Reg);
4011 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
4012 : MRI.getMaxLaneMaskForVReg(Reg);
4013 VNInfo *V = nullptr;
4014 if (LI.hasSubRanges()) {
4015 for (auto &S : LI.subranges()) {
4016 if ((S.LaneMask & SubLanes) == SubLanes) {
4017 V = S.getVNInfoAt(UseIdx);
4018 break;
4019 }
4020 }
4021 } else {
4022 V = LI.getVNInfoAt(UseIdx);
4023 }
4024 if (!V)
4025 return nullptr;
4026 DefIdx = V->def;
4027 } else {
4028 // Find last def.
4029 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
4030 LiveRange &LR = LIS->getRegUnit(Unit);
4031 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
4032 if (!DefIdx.isValid() ||
4033 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
4034 LIS->getInstructionFromIndex(V->def)))
4035 DefIdx = V->def;
4036 } else {
4037 return nullptr;
4038 }
4039 }
4040 }
4041
4042 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
4043
4044 if (!Def || !MDT.dominates(Def, &Use))
4045 return nullptr;
4046
4047 assert(Def->modifiesRegister(Reg, this));
4048
4049 return Def;
4050}
4051
4053 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4054
4055 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4056 AMDGPU::SReg_32RegClass,
4057 AMDGPU::AGPR_32RegClass } ) {
4058 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4059 return Super;
4060 }
4061 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4062 &AMDGPU::VGPR_32RegClass)) {
4063 return Super;
4064 }
4065
4066 return AMDGPU::NoRegister;
4067}
4068
4070 if (!ST.needsAlignedVGPRs())
4071 return true;
4072
4073 if (isVGPRClass(&RC))
4074 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4075 if (isAGPRClass(&RC))
4076 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4077 if (isVectorSuperClass(&RC))
4078 return RC.hasSuperClassEq(
4079 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4080
4081 assert(&RC != &AMDGPU::VS_64RegClass);
4082
4083 return true;
4084}
4085
4088 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4089}
4090
4093 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4094}
4095
4098 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4099}
4100
4101unsigned
4103 unsigned SubReg) const {
4104 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4105 case SIRCFlags::HasSGPR:
4106 return std::min(128u, getSubRegIdxSize(SubReg));
4107 case SIRCFlags::HasAGPR:
4108 case SIRCFlags::HasVGPR:
4110 return std::min(32u, getSubRegIdxSize(SubReg));
4111 default:
4112 break;
4113 }
4114 return 0;
4115}
4116
4118 const TargetRegisterClass &RC,
4119 bool IncludeCalls) const {
4120 unsigned NumArchVGPRs = ST.getAddressableNumArchVGPRs();
4122 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4123 ? RC.getRegisters().take_front(NumArchVGPRs)
4124 : RC.getRegisters();
4125 for (MCPhysReg Reg : reverse(Registers))
4126 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4127 return getHWRegIndex(Reg) + 1;
4128 return 0;
4129}
4130
4133 const MachineFunction &MF) const {
4135 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4136 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4137 RegFlags.push_back("WWM_REG");
4138 return RegFlags;
4139}
assert(UImm && (UImm != ~static_cast<T>(0)) && "Invalid immediate!")
Provides AMDGPU specific target descriptions.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static const Function * getParent(const Value *V)
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
std::pair< Instruction::BinaryOps, Value * > OffsetOp
Find all possible pairs (BinOp, RHS) that BinOp V, RHS can be simplified.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
Live Register Matrix
A set of register units.
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(PassOpts->AAPipeline)
This file declares the machine register scavenger class.
SI Pre allocate WWM Registers
static int getOffenMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyAGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFLoad(unsigned Opc)
static const std::array< unsigned, 17 > SubRegFromChannelTableWidthMap
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI, const SIInstrInfo *TII)
static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI, const Twine &ErrMsg)
static const TargetRegisterClass * getAlignedAGPRClassForBitWidth(unsigned BitWidth)
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST, MachineFrameInfo &MFI, MachineBasicBlock::iterator MI, int Index, int64_t Offset)
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII, unsigned LoadStoreOp, unsigned EltSize)
static const TargetRegisterClass * getAlignedVGPRClassForBitWidth(unsigned BitWidth)
static int getOffsetMUBUFStore(unsigned Opc)
static const TargetRegisterClass * getAnyVGPRClassForBitWidth(unsigned BitWidth)
static cl::opt< bool > EnableSpillSGPRToVGPR("amdgpu-spill-sgpr-to-vgpr", cl::desc("Enable spilling SGPRs to VGPRs"), cl::ReallyHidden, cl::init(true))
static const TargetRegisterClass * getAlignedVectorSuperClassForBitWidth(unsigned BitWidth)
static const TargetRegisterClass * getAnyVectorSuperClassForBitWidth(unsigned BitWidth)
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, int Index, unsigned Lane, unsigned ValueReg, bool IsKill)
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI, const MachineInstr &MI)
static int getOffenMUBUFLoad(unsigned Opc)
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
LocallyHashedType DenseMapInfo< LocallyHashedType >::Empty
static const char * getRegisterName(MCRegister Reg)
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
bool test(unsigned Idx) const
Definition BitVector.h:480
bool empty() const
empty - Tests whether there are no bits in this bitvector.
Definition BitVector.h:175
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
Register getReg() const
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LiveInterval - This class represents the liveness of a register, or stack slot.
bool hasSubRanges() const
Returns true if subregister liveness information is available.
iterator_range< subrange_iterator > subranges()
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
bool hasInterval(Register Reg) const
MachineInstr * getInstructionFromIndex(SlotIndex index) const
Returns the instruction associated with the given index.
MachineDominatorTree & getDomTree()
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
LiveRange & getRegUnit(MCRegUnit Unit)
Return the live range for register unit Unit.
This class represents the liveness of a register, stack slot, etc.
VNInfo * getVNInfoAt(SlotIndex Idx) const
getVNInfoAt - Return the VNInfo that is live at Idx, or NULL.
A set of register units used to track register liveness.
bool available(MCRegister Reg) const
Returns true if no part of physical register Reg is live.
Describe properties that are true of each instruction in the target description file.
MCRegAliasIterator enumerates all registers aliasing Reg.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
Definition MCRegister.h:77
Generic base class for all target subtargets.
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
bool hasCalls() const
Return true if the current function has any function calls.
Align getObjectAlign(int ObjectIdx) const
Return the alignment of the specified stack object.
bool hasStackObjects() const
Return true if there are any stack objects in this function.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
void setAsmPrinterFlag(AsmPrinterFlagTy Flag)
Set a flag for the AsmPrinter.
LLVM_ABI const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_ABI void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void setIsKill(bool Val=true)
LLVM_ABI void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
Register getReg() const
getReg - Returns the register number.
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
bool isAllocatable(MCRegister PhysReg) const
isAllocatable - Returns true when PhysReg belongs to an allocatable register class and it hasn't been...
std::pair< unsigned, Register > getRegAllocationHint(Register VReg) const
getRegAllocationHint - Return the register allocation hint for the specified virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI LaneBitmask getMaxLaneMaskForVReg(Register Reg) const
Returns a mask covering all bits that can appear in lane masks of subregisters of the virtual registe...
LLVM_ABI bool isPhysRegUsed(MCRegister PhysReg, bool SkipRegMaskTest=false) const
Return true if the specified register is modified or read in this function.
Holds all the information related to register banks.
virtual bool isDivergentRegBank(const RegisterBank *RB) const
Returns true if the register bank is considered divergent.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
static bool isFLATScratch(const MachineInstr &MI)
static bool isMUBUF(const MachineInstr &MI)
static bool isVOP3(const MCInstrDesc &Desc)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
ArrayRef< MCPhysReg > getAGPRSpillVGPRs() const
MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const
Register getScratchRSrcReg() const
Returns the physical register reserved for use as the resource descriptor for scratch accesses.
ArrayRef< MCPhysReg > getVGPRSpillAGPRs() const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const
uint32_t getMaskForVGPRBlockOps(Register RegisterBlock) const
ArrayRef< SIRegisterInfo::SpilledReg > getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const
bool checkFlag(Register Reg, uint8_t Flag) const
const ReservedRegSet & getWWMReservedRegs() const
Register materializeFrameBaseRegister(MachineBasicBlock *MBB, int FrameIdx, int64_t Offset) const override
int64_t getScratchInstrOffset(const MachineInstr *MI) const
bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg, int64_t Offset) const override
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
ArrayRef< MCPhysReg > getAllSGPR64(const MachineFunction &MF) const
Return all SGPR64 which satisfy the waves per execution unit requirement of the subtarget.
MCRegister findUnusedRegister(const MachineRegisterInfo &MRI, const TargetRegisterClass *RC, const MachineFunction &MF, bool ReserveHighestVGPR=false) const
Returns a lowest register that is not used at any point in the function.
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
bool requiresFrameIndexReplacementScavenging(const MachineFunction &MF) const override
bool shouldRealignStack(const MachineFunction &MF) const override
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
Register getFrameRegister(const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getVectorSuperClassForBitWidth(unsigned BitWidth) const
bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, RegScavenger *RS) const
SIRegisterInfo(const GCNSubtarget &ST)
const uint32_t * getAllVGPRRegMask() const
MCRegister getReturnAddressReg(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
bool hasBasePointer(const MachineFunction &MF) const
const TargetRegisterClass * getCrossCopyRegClass(const TargetRegisterClass *RC) const override
Returns a legal register class to copy a register in the specified class to or from.
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
ArrayRef< MCPhysReg > getAllSGPR32(const MachineFunction &MF) const
Return all SGPR32 which satisfy the waves per execution unit requirement of the subtarget.
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const
Return the end register initially reserved for the scratch buffer in case spilling is needed.
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool SpillToPhysVGPRLane=false) const
Special case of eliminateFrameIndex.
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
void buildSpillLoadStore(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned LoadStoreOp, int Index, Register ValueReg, bool ValueIsKill, MCRegister ScratchOffsetReg, int64_t InstrOffset, MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits=nullptr) const
bool isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const override
LLVM_READONLY const TargetRegisterClass * getAGPRClassForBitWidth(unsigned BitWidth) const
static bool isChainScratchRegister(Register VGPR)
bool requiresRegisterScavenging(const MachineFunction &Fn) const override
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI, Register Reg) const override
const uint32_t * getNoPreservedMask() const override
StringRef getRegAsmName(MCRegister Reg) const override
const uint32_t * getAllAllocatableSRegMask() const
MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF, const unsigned Align, const TargetRegisterClass *RC) const
Return the largest available SGPR aligned to Align for the register class RC.
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getHWRegIndex(MCRegister Reg) const
const MCPhysReg * getCalleeSavedRegsViaCopy(const MachineFunction *MF) const
const uint32_t * getAllVectorRegMask() const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
const TargetRegisterClass * getPointerRegClass(unsigned Kind=0) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
bool opCanUseLiteralConstant(unsigned OpType) const
Register getBaseRegister() const
bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const override
LLVM_READONLY const TargetRegisterClass * getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const
LLVM_READONLY const TargetRegisterClass * getVGPRClassForBitWidth(unsigned BitWidth) const
const TargetRegisterClass * getEquivalentAVClass(const TargetRegisterClass *SRC) const
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override
static bool isVGPRClass(const TargetRegisterClass *RC)
MachineInstr * findReachingDef(Register Reg, unsigned SubReg, MachineInstr &Use, MachineRegisterInfo &MRI, LiveIntervals *LIS) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
SmallVector< StringLiteral > getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override
LLVM_READONLY const TargetRegisterClass * getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
ArrayRef< MCPhysReg > getAllSGPR128(const MachineFunction &MF) const
Return all SGPR128 which satisfy the waves per execution unit requirement of the subtarget.
unsigned getRegPressureSetLimit(const MachineFunction &MF, unsigned Idx) const override
BitVector getReservedRegs(const MachineFunction &MF) const override
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override
const TargetRegisterClass * getRegClassForOperandReg(const MachineRegisterInfo &MRI, const MachineOperand &MO) const
void addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB, Register BlockReg) const
unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI, const TargetRegisterClass &RC, bool IncludeCalls=true) const
const uint32_t * getAllAGPRRegMask() const
const int * getRegUnitPressureSets(MCRegUnit RegUnit) const override
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const override
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes=nullptr, LiveIntervals *LIS=nullptr, bool OnlyToVGPR=false, bool SpillToPhysVGPRLane=false) const
If OnlyToVGPR is true, this will only succeed if this manages to find a free VGPR lane to spill.
MCRegister getExec() const
MCRegister getVCC() const
int64_t getFrameIndexInstrOffset(const MachineInstr *MI, int Idx) const override
bool isVectorSuperClass(const TargetRegisterClass *RC) const
const TargetRegisterClass * getWaveMaskRegClass() const
unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const
void resolveFrameIndex(MachineInstr &MI, Register BaseReg, int64_t Offset) const override
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override
const TargetRegisterClass * getVGPR64Class() const
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill=true) const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
SlotIndex - An opaque wrapper around machine indexes.
Definition SlotIndexes.h:66
bool isValid() const
Returns true if this is a valid index.
SlotIndexes pass.
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
SlotIndex replaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in maps used by register allocat...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
bool hasFP(const MachineFunction &MF) const
hasFP - Return true if the specified function should have a dedicated frame pointer register.
const uint8_t TSFlags
Configurable target specific flags.
ArrayRef< MCPhysReg > getRegisters() const
unsigned getID() const
Return the register class ID number.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
virtual const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &) const
Returns the largest super class of RC that is legal to use in the current sub-target and has the same spill size.
virtual bool shouldRealignStack(const MachineFunction &MF) const
True if storage within the function requires the stack pointer to be aligned more than the normal calling convention calls for.
virtual bool getRegAllocationHints(Register VirtReg, ArrayRef< MCPhysReg > Order, SmallVectorImpl< MCPhysReg > &Hints, const MachineFunction &MF, const VirtRegMap *VRM=nullptr, const LiveRegMatrix *Matrix=nullptr) const
Get a list of 'hint' registers that the register allocator should try first when allocating a physical register for the virtual register VirtReg.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
VNInfo - Value Number Information.
MCRegister getPhys(Register virtReg) const
returns the physical register mapped to the specified virtual register
Definition VirtRegMap.h:91
bool hasPhys(Register virtReg) const
returns true if the specified virtual register is mapped to a physical register
Definition VirtRegMap.h:87
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ PRIVATE_ADDRESS
Address space for private memory.
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
@ OPERAND_REG_IMM_FIRST
Definition SIDefines.h:253
@ OPERAND_REG_INLINE_AC_FIRST
Definition SIDefines.h:259
@ OPERAND_REG_INLINE_AC_LAST
Definition SIDefines.h:260
@ OPERAND_REG_IMM_LAST
Definition SIDefines.h:254
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY int32_t getFlatScratchInstSVfromSVS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSVfromSS(uint32_t Opcode)
LLVM_READONLY int32_t getFlatScratchInstSTfromSS(uint32_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is not commonly executed.
Definition CallingConv.h:47
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
PointerUnion< const TargetRegisterClass *, const RegisterBank * > RegClassOrRegBank
Convenient type to represent either a register class or a register bank.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1669
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
@ Renamable
Register that may be renamed.
constexpr RegState getKillRegState(bool B)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
Op::Description Desc
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
@ HasSGPR
Definition SIDefines.h:26
@ HasVGPR
Definition SIDefines.h:24
@ RegKindMask
Definition SIDefines.h:29
@ HasAGPR
Definition SIDefines.h:25
constexpr RegState getDefRegState(bool B)
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr bool hasRegState(RegState Value, RegState Test)
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtual registers.
Definition MCRegister.h:21
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition Threading.h:86
constexpr unsigned BitWidth
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static const MachineMemOperand::Flags MOThreadPrivate
Mark the MMO of accesses to memory locations that are never written to by other threads.
Definition SIInstrInfo.h:63
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI)
ArrayRef< int16_t > SplitParts
SIMachineFunctionInfo & MFI
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, int Index, RegScavenger *RS)
SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII, bool IsWave32, MachineBasicBlock::iterator MI, Register Reg, bool IsKill, int Index, RegScavenger *RS)
MachineBasicBlock::iterator MI
void readWriteTmpVGPR(unsigned Offset, bool IsLoad)
const SIRegisterInfo & TRI
MachineBasicBlock * MBB
const SIInstrInfo & TII
The llvm::once_flag structure.
Definition Threading.h:67