1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
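// Illustrative sketch (not part of the upstream file): how this map is read
// together with getSubRegFromChannel() further below. Only well-known AMDGPU
// sub-register names are used; everything else is an example.
//
//   // Request the sub-register covering 2 DWORDs starting at channel 2:
//   //   SubRegFromChannelTableWidthMap[2] == 2  -> table row 2 - 1 == 1
//   //   SubRegFromChannelTable[1][2]            -> AMDGPU::sub2_sub3
//   unsigned Idx = SIRegisterInfo::getSubRegFromChannel(/*Channel=*/2,
//                                                       /*NumRegs=*/2);
//
// Widths of 9..15 DWORDs map to 0 above, so requesting them asserts with
// "Not implemented".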
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
50 Fn.getContext().diagnose(
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
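//
// Illustrative end-to-end sketch (not emitted verbatim by this file): spilling
// s[4:5] to memory on wave64, when a temporary SGPR pair s[6:7] *was*
// scavenged, roughly expands to
//   s_mov_b64 s[6:7], exec        ; prepare(): save exec
//   s_mov_b64 exec, 3             ; enable only the two lanes we need
//   buffer_store_dword v0         ; preserve the clobbered lanes of TmpVGPR
//   v_writelane_b32 v0, s4, 0     ; the SGPR spill code writes the lanes
//   v_writelane_b32 v0, s5, 1
//   buffer_store_dword v0         ; readWriteTmpVGPR(): store to the spill slot
//   buffer_load_dword v0          ; restore(): reload TmpVGPR
//   s_waitcnt vmcnt(0)
//   s_mov_b64 exec, s[6:7]        ; restore exec
// Register numbers are only for illustration.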
78struct SGPRSpillBuilder {
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
86 Register SuperReg;
87 MachineBasicBlock::iterator MI;
88 ArrayRef<int16_t> SplitParts;
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
107 RegScavenger *RS;
108 MachineBasicBlock *MBB;
109 MachineFunction &MF;
110 SIMachineFunctionInfo &MFI;
111 const SIInstrInfo &TII;
112 const SIRegisterInfo &TRI;
113 bool IsWave32;
114 Register ExecReg;
115 unsigned MovOpc;
116 unsigned NotOpc;
117
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
120 RegScavenger *RS)
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
124 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
125 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
130 IsWave32(IsWave32) {
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
150 PerVGPRData getPerVGPRData() {
151 PerVGPRData Data;
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
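// A quick worked example of the computation above (illustrative only): for a
// 128-bit SGPR tuple (NumSubRegs == 4) on wave64, PerVGPR is 64, NumVGPRs is
// (4 + 63) / 64 == 1, and VGPRLanes is (1 << 4) - 1 == 0xf, i.e. only the
// first four lanes of the temporary VGPR need to be preserved. A 1024-bit
// tuple on wave32 (32 subregs) fills one VGPR completely: VGPRLanes is
// 0xffffffff.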
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
216 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
220 I.addReg(TmpVGPR, RegState::ImplicitDefine);
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
238 I.addReg(TmpVGPR, RegState::ImplicitDefine);
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
261 .addReg(SavedExecReg, RegState::Kill);
262 // Add an implicit use of the load so it is not dead.
263 // FIXME: This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
265 I.addReg(TmpVGPR, RegState::ImplicitKill);
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
273 I.addReg(TmpVGPR, RegState::ImplicitKill);
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
319 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
328SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(
344 static_cast<unsigned>(*regunits(MCRegister::from(AMDGPU::M0)).begin()));
345 for (auto Reg : AMDGPU::VGPR_16RegClass) {
346 if (AMDGPU::isHi16Reg(Reg, *this))
347 RegPressureIgnoredUnits.set(
348 static_cast<unsigned>(*regunits(Reg).begin()));
349 }
350
351 // HACK: Until this is fully tablegen'd.
352 static llvm::once_flag InitializeRegSplitPartsFlag;
353
354 static auto InitializeRegSplitPartsOnce = [this]() {
355 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
356 unsigned Size = getSubRegIdxSize(Idx);
357 if (Size & 15)
358 continue;
359 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
360 unsigned Pos = getSubRegIdxOffset(Idx);
361 if (Pos % Size)
362 continue;
363 Pos /= Size;
364 if (Vec.empty()) {
365 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
366 Vec.resize(MaxNumParts);
367 }
368 Vec[Pos] = Idx;
369 }
370 };
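// For illustration (not normative): after this initialization,
// RegSplitParts[N - 1] holds, for each aligned position, the sub-register
// index that covers N 16-bit units at that offset. So RegSplitParts[1]
// (32-bit pieces) is conceptually { sub0, sub1, ..., sub31 } and
// RegSplitParts[3] (64-bit pieces) is { sub0_sub1, sub2_sub3, ... }, with
// 1024 / (16 * N) entries populated per row.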
371
372 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
373
374 static auto InitializeSubRegFromChannelTableOnce = [this]() {
375 for (auto &Row : SubRegFromChannelTable)
376 Row.fill(AMDGPU::NoSubRegister);
377 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
378 unsigned Width = getSubRegIdxSize(Idx) / 32;
379 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
380 assert(Width < SubRegFromChannelTableWidthMap.size());
381 Width = SubRegFromChannelTableWidthMap[Width];
382 if (Width == 0)
383 continue;
384 unsigned TableIdx = Width - 1;
385 assert(TableIdx < SubRegFromChannelTable.size());
386 assert(Offset < SubRegFromChannelTable[TableIdx].size());
387 SubRegFromChannelTable[TableIdx][Offset] = Idx;
388 }
389 };
390
391 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
392 llvm::call_once(InitializeSubRegFromChannelTableFlag,
393 InitializeSubRegFromChannelTableOnce);
394}
395
396void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
397 MCRegister Reg) const {
398 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
399 Reserved.set(*R);
400}
401
402// Forced to be here by one .inc
403const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
404 const MachineFunction *MF) const {
405 CallingConv::ID CC = MF->getFunction().getCallingConv();
406 switch (CC) {
407 case CallingConv::C:
408 case CallingConv::Fast:
409 case CallingConv::Cold:
410 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
411 : CSR_AMDGPU_SaveList;
412 case CallingConv::AMDGPU_Gfx:
413 case CallingConv::AMDGPU_Gfx_WholeWave:
414 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
415 : CSR_AMDGPU_SI_Gfx_SaveList;
416 case CallingConv::AMDGPU_CS_ChainPreserve:
417 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
418 default: {
419 // Dummy to not crash RegisterClassInfo.
420 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
421 return &NoCalleeSavedReg;
422 }
423 }
424}
425
426const MCPhysReg *
427SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
428 return nullptr;
429}
430
431const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
432 CallingConv::ID CC) const {
433 switch (CC) {
434 case CallingConv::C:
435 case CallingConv::Fast:
436 case CallingConv::Cold:
437 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
438 : CSR_AMDGPU_RegMask;
439 case CallingConv::AMDGPU_Gfx:
440 case CallingConv::AMDGPU_Gfx_WholeWave:
441 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
442 : CSR_AMDGPU_SI_Gfx_RegMask;
443 case CallingConv::AMDGPU_CS_Chain:
444 case CallingConv::AMDGPU_CS_ChainPreserve:
445 // Calls to these functions never return, so we can pretend everything is
446 // preserved.
447 return AMDGPU_AllVGPRs_RegMask;
448 default:
449 return nullptr;
450 }
451}
452
453const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
454 return CSR_AMDGPU_NoRegs_RegMask;
455}
456
457bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
458 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
459}
460
461const TargetRegisterClass *
462SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
463 const MachineFunction &MF) const {
464 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
465 // equivalent AV class. If one were used here, the verifier would crash after
466 // RegBankSelect in the GISel flow, because the aligned regclasses are not
467 // fully populated until instruction selection.
468 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
469 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
470 return &AMDGPU::AV_32RegClass;
471 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
472 return &AMDGPU::AV_64RegClass;
473 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
474 RC == &AMDGPU::AReg_64_Align2RegClass)
475 return &AMDGPU::AV_64_Align2RegClass;
476 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
477 return &AMDGPU::AV_96RegClass;
478 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
479 RC == &AMDGPU::AReg_96_Align2RegClass)
480 return &AMDGPU::AV_96_Align2RegClass;
481 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
482 return &AMDGPU::AV_128RegClass;
483 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
484 RC == &AMDGPU::AReg_128_Align2RegClass)
485 return &AMDGPU::AV_128_Align2RegClass;
486 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
487 return &AMDGPU::AV_160RegClass;
488 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
489 RC == &AMDGPU::AReg_160_Align2RegClass)
490 return &AMDGPU::AV_160_Align2RegClass;
491 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
492 return &AMDGPU::AV_192RegClass;
493 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
494 RC == &AMDGPU::AReg_192_Align2RegClass)
495 return &AMDGPU::AV_192_Align2RegClass;
496 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
497 return &AMDGPU::AV_256RegClass;
498 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
499 RC == &AMDGPU::AReg_256_Align2RegClass)
500 return &AMDGPU::AV_256_Align2RegClass;
501 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
502 return &AMDGPU::AV_512RegClass;
503 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
504 RC == &AMDGPU::AReg_512_Align2RegClass)
505 return &AMDGPU::AV_512_Align2RegClass;
506 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
507 return &AMDGPU::AV_1024RegClass;
508 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
509 RC == &AMDGPU::AReg_1024_Align2RegClass)
510 return &AMDGPU::AV_1024_Align2RegClass;
511 }
512
513 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
514}
515
516Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
517 const SIFrameLowering *TFI = ST.getFrameLowering();
518 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
519
520 // During ISel lowering we always reserve the stack pointer in entry and chain
521 // functions, but never actually want to reference it when accessing our own
522 // frame. If we need a frame pointer we use it, but otherwise we can just use
523 // an immediate "0" which we represent by returning NoRegister.
524 if (FuncInfo->isBottomOfStack()) {
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
526 }
527 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
528 : FuncInfo->getStackPtrOffsetReg();
529}
530
531bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
532 // When we need stack realignment, we can't reference off of the
533 // stack pointer, so we reserve a base pointer.
534 return shouldRealignStack(MF);
535}
536
537Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
538
539const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
540 return AMDGPU_AllVGPRs_RegMask;
541}
542
543const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
544 return AMDGPU_AllAGPRs_RegMask;
545}
546
547const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
548 return AMDGPU_AllVectorRegs_RegMask;
549}
550
551const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
552 return AMDGPU_AllAllocatableSRegs_RegMask;
553}
554
555unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
556 unsigned NumRegs) {
557 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
558 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
559 assert(NumRegIndex && "Not implemented");
560 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
561 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
562}
563
565MCRegister SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
566 const unsigned Align,
567 const TargetRegisterClass *RC) const {
568 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
569 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
570 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
571}
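// Worked example (illustrative): with ST.getMaxNumSGPRs(MF) == 102 and
// Align == 4, alignDown(102, 4) == 100, so BaseIdx == 96 and the returned
// tuple for an SGPR_128 class is s[96:99] -- the highest naturally aligned
// 128-bit SGPR tuple that still fits below the SGPR limit.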
572
573MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
574 const MachineFunction &MF) const {
575 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
576}
577
578BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
579 BitVector Reserved(getNumRegs());
580 Reserved.set(AMDGPU::MODE);
581
582 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
583
584 // Reserve special purpose registers.
585 //
586 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
587 // this seems likely to result in bugs, so I'm marking them as reserved.
588 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
589 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
590
591 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
592 reserveRegisterTuples(Reserved, AMDGPU::M0);
593
594 // Reserve src_vccz, src_execz, src_scc.
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
596 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
597 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
598
599 // Reserve the memory aperture registers
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
604 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
605 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
606
607 // Reserve async counters pseudo registers
608 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
609 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
610
611 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
612 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
613
614 // Reserve xnack_mask registers - support is not implemented in Codegen.
615 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
616
617 // Reserve lds_direct register - support is not implemented in Codegen.
618 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
619
620 // Reserve Trap Handler registers - support is not implemented in Codegen.
621 reserveRegisterTuples(Reserved, AMDGPU::TBA);
622 reserveRegisterTuples(Reserved, AMDGPU::TMA);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
629 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
630 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
631
632 // Reserve null register - it shall never be allocated
633 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
634
635 // Reserve SGPRs.
636 //
637 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
638 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
639 for (const TargetRegisterClass *RC : regclasses()) {
640 if (RC->isBaseClass() && isSGPRClass(RC)) {
641 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
642 for (MCPhysReg Reg : *RC) {
643 unsigned Index = getHWRegIndex(Reg);
644 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
645 Reserved.set(Reg);
646 }
647 }
648 }
649
650 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
651 if (ScratchRSrcReg != AMDGPU::NoRegister) {
652 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
653 // need to spill.
654 // TODO: May need to reserve a VGPR if doing LDS spilling.
655 reserveRegisterTuples(Reserved, ScratchRSrcReg);
656 }
657
658 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
659 if (LongBranchReservedReg)
660 reserveRegisterTuples(Reserved, LongBranchReservedReg);
661
662 // We have to assume the SP is needed in case there are calls in the function,
663 // which is detected after the function is lowered. If we aren't really going
664 // to need SP, don't bother reserving it.
665 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
666 if (StackPtrReg) {
667 reserveRegisterTuples(Reserved, StackPtrReg);
668 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
669 }
670
671 MCRegister FrameReg = MFI->getFrameOffsetReg();
672 if (FrameReg) {
673 reserveRegisterTuples(Reserved, FrameReg);
674 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
675 }
676
677 if (hasBasePointer(MF)) {
678 MCRegister BasePtrReg = getBaseRegister();
679 reserveRegisterTuples(Reserved, BasePtrReg);
680 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
681 }
682
683 // FIXME: Use same reserved register introduced in D149775
684 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
685 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
686 if (ExecCopyReg)
687 reserveRegisterTuples(Reserved, ExecCopyReg);
688
689 // Reserve VGPRs/AGPRs.
690 //
691 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
692
693 for (const TargetRegisterClass *RC : regclasses()) {
694 if (RC->isBaseClass() && isVGPRClass(RC)) {
695 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
696 for (MCPhysReg Reg : *RC) {
697 unsigned Index = getHWRegIndex(Reg);
698 if (Index + NumRegs > MaxNumVGPRs)
699 Reserved.set(Reg);
700 }
701 }
702 }
703
704 // Reserve all the AGPRs if there are no instructions to use them.
705 if (!ST.hasMAIInsts())
706 MaxNumAGPRs = 0;
707 for (const TargetRegisterClass *RC : regclasses()) {
708 if (RC->isBaseClass() && isAGPRClass(RC)) {
709 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
710 for (MCPhysReg Reg : *RC) {
711 unsigned Index = getHWRegIndex(Reg);
712 if (Index + NumRegs > MaxNumAGPRs)
713 Reserved.set(Reg);
714 }
715 }
716 }
717
718 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
719 // VGPR available at all times.
720 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
721 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
722 }
723
724 // During wwm-regalloc, reserve the registers for per-lane VGPR allocation.
725 // The MFI->getNonWWMRegMask() field will have a valid bitmask only during
726 // wwm-regalloc and will be empty otherwise.
727 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
728 if (!NonWWMRegMask.empty()) {
729 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
730 RegI < RegE; ++RegI) {
731 if (NonWWMRegMask.test(RegI))
732 reserveRegisterTuples(Reserved, RegI);
733 }
734 }
735
736 for (Register Reg : MFI->getWWMReservedRegs())
737 reserveRegisterTuples(Reserved, Reg);
738
739 // FIXME: Stop using reserved registers for this.
740 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
741 reserveRegisterTuples(Reserved, Reg);
742
743 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
744 reserveRegisterTuples(Reserved, Reg);
745
746 return Reserved;
747}
748
749bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
750 MCRegister PhysReg) const {
751 return !MF.getRegInfo().isReserved(PhysReg);
752}
753
754bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
755 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
756 // On entry or in chain functions, the base address is 0, so it can't possibly
757 // need any more alignment.
758
759 // FIXME: Should be able to specify the entry frame alignment per calling
760 // convention instead.
761 if (Info->isBottomOfStack())
762 return false;
763
764 return TargetRegisterInfo::shouldRealignStack(MF);
765}
766
767bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
768 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
769 if (Info->isEntryFunction()) {
770 const MachineFrameInfo &MFI = Fn.getFrameInfo();
771 return MFI.hasStackObjects() || MFI.hasCalls();
772 }
773
774 // May need scavenger for dealing with callee saved registers.
775 return true;
776}
777
778bool SIRegisterInfo::requiresFrameIndexScavenging(
779 const MachineFunction &MF) const {
780 // Do not use frame virtual registers. They used to be used for SGPRs, but
781 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
782 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
783 // spill.
784 return false;
785}
786
787bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
788 const MachineFunction &MF) const {
789 const MachineFrameInfo &MFI = MF.getFrameInfo();
790 return MFI.hasStackObjects();
791}
792
793bool SIRegisterInfo::requiresVirtualBaseRegisters(
794 const MachineFunction &) const {
795 // There are no special dedicated stack or frame pointers.
796 return true;
797}
798
799int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
800 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
801
802 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
803 AMDGPU::OpName::offset);
804 return MI->getOperand(OffIdx).getImm();
805}
806
807int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
808 int Idx) const {
809 switch (MI->getOpcode()) {
810 case AMDGPU::V_ADD_U32_e32:
811 case AMDGPU::V_ADD_U32_e64:
812 case AMDGPU::V_ADD_CO_U32_e32: {
813 int OtherIdx = Idx == 1 ? 2 : 1;
814 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
815 return OtherOp.isImm() ? OtherOp.getImm() : 0;
816 }
817 case AMDGPU::V_ADD_CO_U32_e64: {
818 int OtherIdx = Idx == 2 ? 3 : 2;
819 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
820 return OtherOp.isImm() ? OtherOp.getImm() : 0;
821 }
822 default:
823 break;
824 }
825
826 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
827 return 0;
828
829 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::vaddr) ||
831 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
832 AMDGPU::OpName::saddr))) &&
833 "Should never see frame index on non-address operand");
834
835 return getScratchInstrOffset(MI);
836}
837
838static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
839 const MachineInstr &MI) {
840 assert(MI.getDesc().isAdd());
841 const MachineOperand &Src0 = MI.getOperand(1);
842 const MachineOperand &Src1 = MI.getOperand(2);
843
844 if (Src0.isFI()) {
845 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
846 Src1.getReg()));
847 }
848
849 if (Src1.isFI()) {
850 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
851 Src0.getReg()));
852 }
853
854 return false;
855}
856
857bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
858 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
859 switch (MI->getOpcode()) {
860 case AMDGPU::V_ADD_U32_e32: {
861 // TODO: We could handle this but it requires work to avoid violating
862 // operand restrictions.
863 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
864 !isFIPlusImmOrVGPR(*this, *MI))
865 return false;
866 [[fallthrough]];
867 }
868 case AMDGPU::V_ADD_U32_e64:
869 // FIXME: This optimization is barely profitable with flat scratch enabled
870 // as-is.
871 //
872 // Much of the benefit with the MUBUF handling is we avoid duplicating the
873 // shift of the frame register, which isn't needed with scratch.
874 //
875 // materializeFrameBaseRegister doesn't know the register classes of the
876 // uses, and unconditionally uses an s_add_i32, which will end up using a
877 // copy for the vector uses.
878 return !ST.hasFlatScratchEnabled();
879 case AMDGPU::V_ADD_CO_U32_e32:
880 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
881 !isFIPlusImmOrVGPR(*this, *MI))
882 return false;
883 // We can't deal with the case where the carry out has a use (though this
884 // should never happen)
885 return MI->getOperand(3).isDead();
886 case AMDGPU::V_ADD_CO_U32_e64:
887 // TODO: Should we check use_empty instead?
888 return MI->getOperand(1).isDead();
889 default:
890 break;
891 }
892
893 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
894 return false;
895
896 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
897
898 const SIInstrInfo *TII = ST.getInstrInfo();
899 if (SIInstrInfo::isMUBUF(*MI))
900 return !TII->isLegalMUBUFImmOffset(FullOffset);
901
902 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
903 SIInstrFlags::FlatScratch);
904}
905
906Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
907 int FrameIdx,
908 int64_t Offset) const {
909 MachineBasicBlock::iterator Ins = MBB->begin();
910 DebugLoc DL; // Defaults to "unknown"
911
912 if (Ins != MBB->end())
913 DL = Ins->getDebugLoc();
914
915 MachineFunction *MF = MBB->getParent();
916 const SIInstrInfo *TII = ST.getInstrInfo();
917 MachineRegisterInfo &MRI = MF->getRegInfo();
918 unsigned MovOpc =
919 ST.hasFlatScratchEnabled() ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
920
921 Register BaseReg = MRI.createVirtualRegister(
922 ST.hasFlatScratchEnabled() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
923 : &AMDGPU::VGPR_32RegClass);
924
925 if (Offset == 0) {
926 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
927 .addFrameIndex(FrameIdx);
928 return BaseReg;
929 }
930
931 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
932
933 Register FIReg = MRI.createVirtualRegister(ST.hasFlatScratchEnabled()
934 ? &AMDGPU::SReg_32_XM0RegClass
935 : &AMDGPU::VGPR_32RegClass);
936
937 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
938 .addImm(Offset);
939 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
940 .addFrameIndex(FrameIdx);
941
942 if (ST.hasFlatScratchEnabled()) {
943 // FIXME: Make sure scc isn't live in.
944 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
945 .addReg(OffsetReg, RegState::Kill)
946 .addReg(FIReg)
947 .setOperandDead(3); // scc
948 return BaseReg;
949 }
950
951 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
952 .addReg(OffsetReg, RegState::Kill)
953 .addReg(FIReg)
954 .addImm(0); // clamp bit
955
956 return BaseReg;
957}
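// Illustrative expansion (not normative): materializing frame index #2 plus an
// offset of 16 without flat scratch roughly becomes
//   s_mov_b32 sX, 16
//   v_mov_b32_e32 vY, %stack.2
//   v_add_u32_e32 vBase, sX, vY    ; or the subtarget's no-carry add form
// whereas with flat scratch enabled the whole sequence stays scalar and ends
// in an s_add_i32. Register names here are placeholders.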
958
959void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
960 int64_t Offset) const {
961 const SIInstrInfo *TII = ST.getInstrInfo();
962
963 switch (MI.getOpcode()) {
964 case AMDGPU::V_ADD_U32_e32:
965 case AMDGPU::V_ADD_CO_U32_e32: {
966 MachineOperand *FIOp = &MI.getOperand(2);
967 MachineOperand *ImmOp = &MI.getOperand(1);
968 if (!FIOp->isFI())
969 std::swap(FIOp, ImmOp);
970
971 if (!ImmOp->isImm()) {
972 assert(Offset == 0);
973 FIOp->ChangeToRegister(BaseReg, false);
974 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
975 return;
976 }
977
978 int64_t TotalOffset = ImmOp->getImm() + Offset;
979 if (TotalOffset == 0) {
980 MI.setDesc(TII->get(AMDGPU::COPY));
981 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
982 MI.removeOperand(I);
983
984 MI.getOperand(1).ChangeToRegister(BaseReg, false);
985 return;
986 }
987
988 ImmOp->setImm(TotalOffset);
989
990 MachineBasicBlock *MBB = MI.getParent();
991 MachineFunction *MF = MBB->getParent();
992 MachineRegisterInfo &MRI = MF->getRegInfo();
993
994 // FIXME: materializeFrameBaseRegister does not know the register class of
995 // the uses of the frame index, and assumes SGPR for hasFlatScratchEnabled.
996 // Emit a copy so we have a legal operand and hope the register coalescer
997 // can clean it up.
998 if (isSGPRReg(MRI, BaseReg)) {
999 Register BaseRegVGPR =
1000 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1001 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
1002 .addReg(BaseReg);
1003 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1004 } else {
1005 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1006 }
1007 return;
1008 }
1009 case AMDGPU::V_ADD_U32_e64:
1010 case AMDGPU::V_ADD_CO_U32_e64: {
1011 int Src0Idx = MI.getNumExplicitDefs();
1012 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1013 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1014 if (!FIOp->isFI())
1015 std::swap(FIOp, ImmOp);
1016
1017 if (!ImmOp->isImm()) {
1018 FIOp->ChangeToRegister(BaseReg, false);
1019 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1020 return;
1021 }
1022
1023 int64_t TotalOffset = ImmOp->getImm() + Offset;
1024 if (TotalOffset == 0) {
1025 MI.setDesc(TII->get(AMDGPU::COPY));
1026
1027 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1028 MI.removeOperand(I);
1029
1030 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1031 } else {
1032 FIOp->ChangeToRegister(BaseReg, false);
1033 ImmOp->setImm(TotalOffset);
1034 }
1035
1036 return;
1037 }
1038 default:
1039 break;
1040 }
1041
1042 bool IsFlat = TII->isFLATScratch(MI);
1043
1044#ifndef NDEBUG
1045 // FIXME: Is it possible to be storing a frame index to itself?
1046 bool SeenFI = false;
1047 for (const MachineOperand &MO: MI.operands()) {
1048 if (MO.isFI()) {
1049 if (SeenFI)
1050 llvm_unreachable("should not see multiple frame indices");
1051
1052 SeenFI = true;
1053 }
1054 }
1055#endif
1056
1057 MachineOperand *FIOp =
1058 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1059 : AMDGPU::OpName::vaddr);
1060
1061 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1062 int64_t NewOffset = OffsetOp->getImm() + Offset;
1063
1064 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1065 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1066
1067 if (IsFlat) {
1068 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1070 "offset should be legal");
1071 FIOp->ChangeToRegister(BaseReg, false);
1072 OffsetOp->setImm(NewOffset);
1073 return;
1074 }
1075
1076#ifndef NDEBUG
1077 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1078 assert(SOffset->isImm() && SOffset->getImm() == 0);
1079#endif
1080
1081 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1082
1083 FIOp->ChangeToRegister(BaseReg, false);
1084 OffsetOp->setImm(NewOffset);
1085}
1086
1087bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1088 Register BaseReg,
1089 int64_t Offset) const {
1090
1091 switch (MI->getOpcode()) {
1092 case AMDGPU::V_ADD_U32_e32:
1093 case AMDGPU::V_ADD_CO_U32_e32:
1094 return true;
1095 case AMDGPU::V_ADD_U32_e64:
1096 case AMDGPU::V_ADD_CO_U32_e64:
1097 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1098 default:
1099 break;
1100 }
1101
1102 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1103 return false;
1104
1105 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1106
1107 const SIInstrInfo *TII = ST.getInstrInfo();
1108 if (SIInstrInfo::isMUBUF(*MI))
1109 return TII->isLegalMUBUFImmOffset(NewOffset);
1110
1111 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1112 SIInstrFlags::FlatScratch);
1113}
1114
1115const TargetRegisterClass *
1117 // This is inaccurate. It depends on the instruction and address space. The
1118 // only place where we should hit this is for dealing with frame indexes /
1119 // private accesses, so this is correct in that case.
1120 return &AMDGPU::VGPR_32RegClass;
1121}
1122
1123const TargetRegisterClass *
1124SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1125 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1126}
1127
1128static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
1129 const SIInstrInfo *TII) {
1130
1131 unsigned Op = MI.getOpcode();
1132 switch (Op) {
1133 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1134 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1135 // FIXME: This assumes the mask is statically known and not computed at
1136 // runtime. However, some ABIs may want to compute the mask dynamically and
1137 // this will need to be updated.
1138 return llvm::popcount(
1139 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1140 case AMDGPU::SI_SPILL_S1024_SAVE:
1141 case AMDGPU::SI_SPILL_S1024_RESTORE:
1142 case AMDGPU::SI_SPILL_V1024_SAVE:
1143 case AMDGPU::SI_SPILL_V1024_RESTORE:
1144 case AMDGPU::SI_SPILL_A1024_SAVE:
1145 case AMDGPU::SI_SPILL_A1024_RESTORE:
1146 case AMDGPU::SI_SPILL_AV1024_SAVE:
1147 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1148 return 32;
1149 case AMDGPU::SI_SPILL_S512_SAVE:
1150 case AMDGPU::SI_SPILL_S512_RESTORE:
1151 case AMDGPU::SI_SPILL_V512_SAVE:
1152 case AMDGPU::SI_SPILL_V512_RESTORE:
1153 case AMDGPU::SI_SPILL_A512_SAVE:
1154 case AMDGPU::SI_SPILL_A512_RESTORE:
1155 case AMDGPU::SI_SPILL_AV512_SAVE:
1156 case AMDGPU::SI_SPILL_AV512_RESTORE:
1157 return 16;
1158 case AMDGPU::SI_SPILL_S384_SAVE:
1159 case AMDGPU::SI_SPILL_S384_RESTORE:
1160 case AMDGPU::SI_SPILL_V384_SAVE:
1161 case AMDGPU::SI_SPILL_V384_RESTORE:
1162 case AMDGPU::SI_SPILL_A384_SAVE:
1163 case AMDGPU::SI_SPILL_A384_RESTORE:
1164 case AMDGPU::SI_SPILL_AV384_SAVE:
1165 case AMDGPU::SI_SPILL_AV384_RESTORE:
1166 return 12;
1167 case AMDGPU::SI_SPILL_S352_SAVE:
1168 case AMDGPU::SI_SPILL_S352_RESTORE:
1169 case AMDGPU::SI_SPILL_V352_SAVE:
1170 case AMDGPU::SI_SPILL_V352_RESTORE:
1171 case AMDGPU::SI_SPILL_A352_SAVE:
1172 case AMDGPU::SI_SPILL_A352_RESTORE:
1173 case AMDGPU::SI_SPILL_AV352_SAVE:
1174 case AMDGPU::SI_SPILL_AV352_RESTORE:
1175 return 11;
1176 case AMDGPU::SI_SPILL_S320_SAVE:
1177 case AMDGPU::SI_SPILL_S320_RESTORE:
1178 case AMDGPU::SI_SPILL_V320_SAVE:
1179 case AMDGPU::SI_SPILL_V320_RESTORE:
1180 case AMDGPU::SI_SPILL_A320_SAVE:
1181 case AMDGPU::SI_SPILL_A320_RESTORE:
1182 case AMDGPU::SI_SPILL_AV320_SAVE:
1183 case AMDGPU::SI_SPILL_AV320_RESTORE:
1184 return 10;
1185 case AMDGPU::SI_SPILL_S288_SAVE:
1186 case AMDGPU::SI_SPILL_S288_RESTORE:
1187 case AMDGPU::SI_SPILL_V288_SAVE:
1188 case AMDGPU::SI_SPILL_V288_RESTORE:
1189 case AMDGPU::SI_SPILL_A288_SAVE:
1190 case AMDGPU::SI_SPILL_A288_RESTORE:
1191 case AMDGPU::SI_SPILL_AV288_SAVE:
1192 case AMDGPU::SI_SPILL_AV288_RESTORE:
1193 return 9;
1194 case AMDGPU::SI_SPILL_S256_SAVE:
1195 case AMDGPU::SI_SPILL_S256_RESTORE:
1196 case AMDGPU::SI_SPILL_V256_SAVE:
1197 case AMDGPU::SI_SPILL_V256_RESTORE:
1198 case AMDGPU::SI_SPILL_A256_SAVE:
1199 case AMDGPU::SI_SPILL_A256_RESTORE:
1200 case AMDGPU::SI_SPILL_AV256_SAVE:
1201 case AMDGPU::SI_SPILL_AV256_RESTORE:
1202 return 8;
1203 case AMDGPU::SI_SPILL_S224_SAVE:
1204 case AMDGPU::SI_SPILL_S224_RESTORE:
1205 case AMDGPU::SI_SPILL_V224_SAVE:
1206 case AMDGPU::SI_SPILL_V224_RESTORE:
1207 case AMDGPU::SI_SPILL_A224_SAVE:
1208 case AMDGPU::SI_SPILL_A224_RESTORE:
1209 case AMDGPU::SI_SPILL_AV224_SAVE:
1210 case AMDGPU::SI_SPILL_AV224_RESTORE:
1211 return 7;
1212 case AMDGPU::SI_SPILL_S192_SAVE:
1213 case AMDGPU::SI_SPILL_S192_RESTORE:
1214 case AMDGPU::SI_SPILL_V192_SAVE:
1215 case AMDGPU::SI_SPILL_V192_RESTORE:
1216 case AMDGPU::SI_SPILL_A192_SAVE:
1217 case AMDGPU::SI_SPILL_A192_RESTORE:
1218 case AMDGPU::SI_SPILL_AV192_SAVE:
1219 case AMDGPU::SI_SPILL_AV192_RESTORE:
1220 return 6;
1221 case AMDGPU::SI_SPILL_S160_SAVE:
1222 case AMDGPU::SI_SPILL_S160_RESTORE:
1223 case AMDGPU::SI_SPILL_V160_SAVE:
1224 case AMDGPU::SI_SPILL_V160_RESTORE:
1225 case AMDGPU::SI_SPILL_A160_SAVE:
1226 case AMDGPU::SI_SPILL_A160_RESTORE:
1227 case AMDGPU::SI_SPILL_AV160_SAVE:
1228 case AMDGPU::SI_SPILL_AV160_RESTORE:
1229 return 5;
1230 case AMDGPU::SI_SPILL_S128_SAVE:
1231 case AMDGPU::SI_SPILL_S128_RESTORE:
1232 case AMDGPU::SI_SPILL_V128_SAVE:
1233 case AMDGPU::SI_SPILL_V128_RESTORE:
1234 case AMDGPU::SI_SPILL_A128_SAVE:
1235 case AMDGPU::SI_SPILL_A128_RESTORE:
1236 case AMDGPU::SI_SPILL_AV128_SAVE:
1237 case AMDGPU::SI_SPILL_AV128_RESTORE:
1238 return 4;
1239 case AMDGPU::SI_SPILL_S96_SAVE:
1240 case AMDGPU::SI_SPILL_S96_RESTORE:
1241 case AMDGPU::SI_SPILL_V96_SAVE:
1242 case AMDGPU::SI_SPILL_V96_RESTORE:
1243 case AMDGPU::SI_SPILL_A96_SAVE:
1244 case AMDGPU::SI_SPILL_A96_RESTORE:
1245 case AMDGPU::SI_SPILL_AV96_SAVE:
1246 case AMDGPU::SI_SPILL_AV96_RESTORE:
1247 return 3;
1248 case AMDGPU::SI_SPILL_S64_SAVE:
1249 case AMDGPU::SI_SPILL_S64_RESTORE:
1250 case AMDGPU::SI_SPILL_V64_SAVE:
1251 case AMDGPU::SI_SPILL_V64_RESTORE:
1252 case AMDGPU::SI_SPILL_A64_SAVE:
1253 case AMDGPU::SI_SPILL_A64_RESTORE:
1254 case AMDGPU::SI_SPILL_AV64_SAVE:
1255 case AMDGPU::SI_SPILL_AV64_RESTORE:
1256 return 2;
1257 case AMDGPU::SI_SPILL_S32_SAVE:
1258 case AMDGPU::SI_SPILL_S32_RESTORE:
1259 case AMDGPU::SI_SPILL_V32_SAVE:
1260 case AMDGPU::SI_SPILL_V32_RESTORE:
1261 case AMDGPU::SI_SPILL_A32_SAVE:
1262 case AMDGPU::SI_SPILL_A32_RESTORE:
1263 case AMDGPU::SI_SPILL_AV32_SAVE:
1264 case AMDGPU::SI_SPILL_AV32_RESTORE:
1265 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1266 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1267 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1268 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1269 case AMDGPU::SI_SPILL_V16_SAVE:
1270 case AMDGPU::SI_SPILL_V16_RESTORE:
1271 return 1;
1272 default: llvm_unreachable("Invalid spill opcode");
1273 }
1274}
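// Example reading of the switch above (illustrative): SI_SPILL_S256_SAVE
// covers a 256-bit SGPR tuple and therefore returns 8, i.e. eight 32-bit
// pieces have to be handled; the block-spill pseudos instead return the
// population count of their statically known mask operand.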
1275
1276static int getOffsetMUBUFStore(unsigned Opc) {
1277 switch (Opc) {
1278 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1279 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1280 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1281 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1282 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1283 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1284 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1285 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1286 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1287 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1288 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1289 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1290 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1291 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1292 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1293 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1294 default:
1295 return -1;
1296 }
1297}
1298
1299static int getOffsetMUBUFLoad(unsigned Opc) {
1300 switch (Opc) {
1301 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1302 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1303 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1304 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1305 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1306 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1307 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1308 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1309 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1310 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1311 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1312 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1313 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1314 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1315 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1316 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1317 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1318 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1319 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1320 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1321 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1322 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1323 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1324 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1325 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1326 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1327 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1328 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1329 default:
1330 return -1;
1331 }
1332}
1333
1334static int getOffenMUBUFStore(unsigned Opc) {
1335 switch (Opc) {
1336 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1337 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1338 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1339 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1340 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1341 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1342 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1343 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1344 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1345 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1346 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1347 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1348 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1349 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1350 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1351 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1352 default:
1353 return -1;
1354 }
1355}
1356
1357static int getOffenMUBUFLoad(unsigned Opc) {
1358 switch (Opc) {
1359 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1360 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1361 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1362 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1363 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1364 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1365 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1366 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1367 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1368 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1369 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1370 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1371 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1372 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1373 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1374 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1375 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1376 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1377 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1378 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1379 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1380 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1381 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1382 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1383 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1384 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1385 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1386 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1387 default:
1388 return -1;
1389 }
1390}
1391
1392static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1393 MachineBasicBlock &MBB,
1394 MachineBasicBlock::iterator MI,
1395 int Index, unsigned Lane,
1396 unsigned ValueReg, bool IsKill) {
1397 MachineFunction *MF = MBB.getParent();
1398 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1399 const SIInstrInfo *TII = ST.getInstrInfo();
1400
1401 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1402
1403 if (Reg == AMDGPU::NoRegister)
1404 return MachineInstrBuilder();
1405
1406 bool IsStore = MI->mayStore();
1407 MachineRegisterInfo &MRI = MF->getRegInfo();
1408 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1409
1410 unsigned Dst = IsStore ? Reg : ValueReg;
1411 unsigned Src = IsStore ? ValueReg : Reg;
1412 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1413 const DebugLoc &DL = MI->getDebugLoc();
1414 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1415 // The spiller during regalloc may restore a spilled register to its
1416 // superclass. This can result in AGPR spills restored to VGPRs or the other
1417 // way around, leaving the source and destination with identical register
1418 // classes at this point. A plain copy is all that is needed in such cases.
1419 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1420 .addReg(Src, getKillRegState(IsKill));
1422 return CopyMIB;
1423 }
1424 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1425 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1426
1427 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1428 .addReg(Src, getKillRegState(IsKill));
1430 return MIB;
1431}
1432
1433// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1434// need to handle the case where an SGPR may need to be spilled while spilling.
1435static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1436 MachineFrameInfo &MFI,
1437 MachineBasicBlock::iterator MI,
1438 int Index,
1439 int64_t Offset) {
1440 const SIInstrInfo *TII = ST.getInstrInfo();
1441 MachineBasicBlock *MBB = MI->getParent();
1442 const DebugLoc &DL = MI->getDebugLoc();
1443 bool IsStore = MI->mayStore();
1444
1445 unsigned Opc = MI->getOpcode();
1446 int LoadStoreOp = IsStore ?
1447 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1448 if (LoadStoreOp == -1)
1449 return false;
1450
1451 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1452 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1453 return true;
1454
1455 MachineInstrBuilder NewMI =
1456 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1457 .add(*Reg)
1458 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1459 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1460 .addImm(Offset)
1461 .addImm(0) // cpol
1462 .addImm(0) // swz
1463 .cloneMemRefs(*MI);
1464
1465 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1466 AMDGPU::OpName::vdata_in);
1467 if (VDataIn)
1468 NewMI.add(*VDataIn);
1469 return true;
1470}
1471
1472static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1473 unsigned LoadStoreOp,
1474 unsigned EltSize) {
1475 bool IsStore = TII->get(LoadStoreOp).mayStore();
1476 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1477 bool UseST =
1478 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1479
1480 // Handle block load/store first.
1481 if (TII->isBlockLoadStore(LoadStoreOp))
1482 return LoadStoreOp;
1483
1484 switch (EltSize) {
1485 case 4:
1486 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1487 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1488 break;
1489 case 8:
1490 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1491 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1492 break;
1493 case 12:
1494 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1495 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1496 break;
1497 case 16:
1498 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1499 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1500 break;
1501 default:
1502 llvm_unreachable("Unexpected spill load/store size!");
1503 }
1504
1505 if (HasVAddr)
1506 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1507 else if (UseST)
1508 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1509
1510 return LoadStoreOp;
1511}
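// Example of the selection above (illustrative only): for EltSize == 16 the
// base opcode becomes SCRATCH_STORE_DWORDX4_SADDR (store) or
// SCRATCH_LOAD_DWORDX4_SADDR (load); if the original opcode carried a vaddr
// operand it is then rewritten to the SV form, and if it carried neither
// vaddr nor saddr it is rewritten to the ST form via
// AMDGPU::getFlatScratchInstSTfromSS().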
1512
1513void SIRegisterInfo::buildSpillLoadStore(
1514 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1515 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1516 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1517 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1518 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1519
1520 MachineFunction *MF = MBB.getParent();
1521 const SIInstrInfo *TII = ST.getInstrInfo();
1522 const MachineFrameInfo &MFI = MF->getFrameInfo();
1523 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1524
1525 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1526 bool IsStore = Desc->mayStore();
1527 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1528 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1529
1530 bool CanClobberSCC = false;
1531 bool Scavenged = false;
1532 MCRegister SOffset = ScratchOffsetReg;
1533
1534 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1535 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1536 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1537 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1538
1539 // Always use 4 byte operations for AGPRs because we need to scavenge
1540 // a temporary VGPR.
1541 // If we're using a block operation, the element should be the whole block.
1542 unsigned EltSize = IsBlock ? RegWidth
1543 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1544 : 4u;
1545 unsigned NumSubRegs = RegWidth / EltSize;
1546 unsigned Size = NumSubRegs * EltSize;
1547 unsigned RemSize = RegWidth - Size;
1548 unsigned NumRemSubRegs = RemSize ? 1 : 0;
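// Worked example for the sizing above (illustrative): spilling a 96-bit AGPR
// on a subtarget without gfx90a instructions forces EltSize == 4, so
// NumSubRegs == 3 and RemSize == 0; the same 96-bit value spilled as a VGPR
// through flat scratch uses EltSize == min(12, 16) == 12, a single DWORDX3
// access. A 20-byte (160-bit) flat-scratch spill uses EltSize == 16 with
// Size == 16, RemSize == 4 and one extra remainder sub-access.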
1549 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1550 int64_t MaterializedOffset = Offset;
1551
1552 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1553 int64_t ScratchOffsetRegDelta = 0;
1554
1555 if (IsFlat && EltSize > 4) {
1556 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1557 Desc = &TII->get(LoadStoreOp);
1558 }
1559
1560 Align Alignment = MFI.getObjectAlign(Index);
1561 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1562
1563 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1564 "unexpected VGPR spill offset");
1565
1566 // Track a VGPR to use for a constant offset we need to materialize.
1567 Register TmpOffsetVGPR;
1568
1569 // Track a VGPR to use as an intermediate value.
1570 Register TmpIntermediateVGPR;
1571 bool UseVGPROffset = false;
1572
1573 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1574 // combination.
1575 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1576 int64_t VOffset) {
1577 // We are using a VGPR offset
1578 if (IsFlat && SGPRBase) {
1579 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1580 // SGPR, so perform the add as vector.
1581 // We don't need a base SGPR in the kernel.
1582
1583 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1584 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1585 .addReg(SGPRBase)
1586 .addImm(VOffset)
1587 .addImm(0); // clamp
1588 } else {
1589 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1590 .addReg(SGPRBase);
1591 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1592 .addImm(VOffset)
1593 .addReg(TmpOffsetVGPR);
1594 }
1595 } else {
1596 assert(TmpOffsetVGPR);
1597 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1598 .addImm(VOffset);
1599 }
1600 };
1601
1602 bool IsOffsetLegal =
1603 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1604 SIInstrFlags::FlatScratch)
1605 : TII->isLegalMUBUFImmOffset(MaxOffset);
1606 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1607 SOffset = MCRegister();
1608
1609 // We don't have access to the register scavenger if this function is called
1610 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1611 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1612 // entry.
1613 if (RS) {
1614 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1615
1616 // Piggy back on the liveness scan we just did to see if SCC is dead.
1617 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1618 } else if (LiveUnits) {
1619 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1620 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1621 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1622 SOffset = Reg;
1623 break;
1624 }
1625 }
1626 }
1627
1628 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1629 SOffset = Register();
1630
1631 if (!SOffset) {
1632 UseVGPROffset = true;
1633
1634 if (RS) {
1635 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1636 } else {
1637 assert(LiveUnits);
1638 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1639 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1640 TmpOffsetVGPR = Reg;
1641 break;
1642 }
1643 }
1644 }
1645
1646 assert(TmpOffsetVGPR);
1647 } else if (!SOffset && CanClobberSCC) {
1648 // There are no free SGPRs, and we are in the process of spilling VGPRs
1649 // too. Since we need a VGPR in order to spill SGPRs (this is true on
1650 // SI/CI, and on VI it is true until we implement spilling using scalar
1651 // stores), we have no way to free up an SGPR. Our solution here is to
1652 // add the offset directly to the ScratchOffset or StackPtrOffset
1653 // register, and then subtract the offset after the spill to return the
1654 // register to its original value.
1655
1656 // TODO: If we don't have to do an emergency stack slot spill, converting
1657 // to use the VGPR offset is fewer instructions.
1658 if (!ScratchOffsetReg)
1659 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1660 SOffset = ScratchOffsetReg;
1661 ScratchOffsetRegDelta = Offset;
1662 } else {
1663 Scavenged = true;
1664 }
1665
1666 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1667 // we can simplify the adjustment of Offset here to just scale with
1668 // WavefrontSize.
1669 if (!IsFlat && !UseVGPROffset)
1670 Offset *= ST.getWavefrontSize();
1671
1672 if (!UseVGPROffset && !SOffset)
1673 report_fatal_error("could not scavenge SGPR to spill in entry function");
1674
1675 if (UseVGPROffset) {
1676 // We are using a VGPR offset
1677 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1678 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1679 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1680 } else {
1681 assert(Offset != 0);
1682 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1683 .addReg(ScratchOffsetReg)
1684 .addImm(Offset);
1685 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1686 }
1687
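// At this point the frame offset has been folded into SOffset or into the
// materialized VGPR offset, so clear it; only the per-subregister RegOffset
// is applied below.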
1688 Offset = 0;
1689 }
1690
1691 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1692 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1693 && "Unexpected vaddr for flat scratch with a FI operand");
1694
1695 if (UseVGPROffset) {
1696 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1697 } else {
1698 assert(ST.hasFlatScratchSTMode());
1699 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1700 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1701 }
1702
1703 Desc = &TII->get(LoadStoreOp);
1704 }
1705
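// Emit one load/store per EltSize chunk of ValueReg. Each chunk is first
// spilled lane-by-lane into AGPRs/VGPRs where possible; only the remaining
// part goes to memory.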
1706 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1707 ++i, RegOffset += EltSize) {
1708 if (i == NumSubRegs) {
1709 EltSize = RemSize;
1710 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1711 }
1712 Desc = &TII->get(LoadStoreOp);
1713
1714 if (!IsFlat && UseVGPROffset) {
1715 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1716 : getOffenMUBUFLoad(LoadStoreOp);
1717 Desc = &TII->get(NewLoadStoreOp);
1718 }
1719
1720 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1721 // If we are spilling an AGPR beyond the range of the memory instruction
1722 // offset and need to use a VGPR offset, we ideally have at least 2
1723 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1724 // recycle the VGPR used for the offset, which requires resetting it
1725 // after each subregister.
1726
1727 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1728 }
1729
1730 unsigned NumRegs = EltSize / 4;
1731 Register SubReg = e == 1
1732 ? ValueReg
1733 : Register(getSubReg(ValueReg,
1734 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1735
1736 RegState SOffsetRegState = {};
1737 RegState SrcDstRegState = getDefRegState(!IsStore);
1738 const bool IsLastSubReg = i + 1 == e;
1739 const bool IsFirstSubReg = i == 0;
1740 if (IsLastSubReg) {
1741 SOffsetRegState |= getKillRegState(Scavenged);
1742 // The last implicit use carries the "Kill" flag.
1743 SrcDstRegState |= getKillRegState(IsKill);
1744 }
1745
1746 // Make sure the whole register is defined if there are undef components by
1747 // adding an implicit def of the super-reg on the first instruction.
1748 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1749 bool NeedSuperRegImpOperand = e > 1;
1750
1751 // Remaining element size to spill into memory after some parts of it
1752 // have been spilled into either AGPRs or VGPRs.
1753 unsigned RemEltSize = EltSize;
1754
1755 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1756 // order, starting from the last lane. If a register cannot be completely
1757 // spilled into another register, this ensures that its alignment does
1758 // not change. For targets with a VGPR alignment requirement this is
1759 // important when flat scratch is used, as we might otherwise get a
1760 // scratch_load or scratch_store of an unaligned register.
1761 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1762 LaneE = RegOffset / 4;
1763 Lane >= LaneE; --Lane) {
1764 bool IsSubReg = e > 1 || EltSize > 4;
1765 Register Sub = IsSubReg
1766 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1767 : ValueReg;
1768 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1769 if (!MIB.getInstr())
1770 break;
1771 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1772 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1773 NeedSuperRegDef = false;
1774 }
1775 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1776 NeedSuperRegImpOperand = true;
1777 RegState State = SrcDstRegState;
1778 if (!IsLastSubReg || (Lane != LaneE))
1779 State &= ~RegState::Kill;
1780 if (!IsFirstSubReg || (Lane != LaneS))
1781 State &= ~RegState::Define;
1782 MIB.addReg(ValueReg, RegState::Implicit | State);
1783 }
1784 RemEltSize -= 4;
1785 }
1786
1787 if (!RemEltSize) // Fully spilled into AGPRs.
1788 continue;
1789
1790 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1791 assert(IsFlat && EltSize > 4);
1792
1793 unsigned NumRegs = RemEltSize / 4;
1794 SubReg = Register(getSubReg(ValueReg,
1795 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1796 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1797 Desc = &TII->get(Opc);
1798 }
1799
1800 unsigned FinalReg = SubReg;
1801
1802 if (IsAGPR) {
1803 assert(EltSize == 4);
1804
1805 if (!TmpIntermediateVGPR) {
1806 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1807 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1808 }
1809 if (IsStore) {
1810 auto AccRead = BuildMI(MBB, MI, DL,
1811 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1812 TmpIntermediateVGPR)
1813 .addReg(SubReg, getKillRegState(IsKill));
1814 if (NeedSuperRegDef)
1815 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1816 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1817 AccRead.addReg(ValueReg, RegState::Implicit);
1819 }
1820 SubReg = TmpIntermediateVGPR;
1821 } else if (UseVGPROffset) {
1822 if (!TmpOffsetVGPR) {
1823 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1824 MI, false, 0);
1825 RS->setRegUsed(TmpOffsetVGPR);
1826 }
1827 }
1828
1829 Register FinalValueReg = ValueReg;
1830 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1831 // If we are loading a 16-bit value with SRAMECC enabled, we need a temp
1832 // 32-bit VGPR to load into and extract the 16 bits into the final register.
1833 ValueReg =
1834 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1835 SubReg = ValueReg;
1836 IsKill = false;
1837 }
1838
1839 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1840 MachineMemOperand *NewMMO =
1841 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1842 commonAlignment(Alignment, RegOffset));
1843
1844 auto MIB =
1845 BuildMI(MBB, MI, DL, *Desc)
1846 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1847
1848 if (UseVGPROffset) {
1849 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1850 // intermediate accvgpr_write.
1851 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1852 }
1853
1854 if (!IsFlat)
1855 MIB.addReg(FuncInfo->getScratchRSrcReg());
1856
1857 if (SOffset == AMDGPU::NoRegister) {
1858 if (!IsFlat) {
1859 if (UseVGPROffset && ScratchOffsetReg) {
1860 MIB.addReg(ScratchOffsetReg);
1861 } else {
1862 assert(FuncInfo->isBottomOfStack());
1863 MIB.addImm(0);
1864 }
1865 }
1866 } else {
1867 MIB.addReg(SOffset, SOffsetRegState);
1868 }
1869
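// Remaining operands of the memory instruction: the immediate offset, the
// cache policy, and (for MUBUF only) the swz bit.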
1870 MIB.addImm(Offset + RegOffset);
1871
1872 bool LastUse = MMO->getFlags() & MOLastUse;
1873 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1874
1875 if (!IsFlat)
1876 MIB.addImm(0); // swz
1877 MIB.addMemOperand(NewMMO);
1878
1879 if (FinalValueReg != ValueReg) {
1880 // Extract the 16-bit value from the loaded 32-bit temporary.
1881 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1882 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1883 .addReg(FinalValueReg, getDefRegState(true))
1884 .addImm(0)
1885 .addReg(ValueReg, getKillRegState(true))
1886 .addImm(0);
1887 ValueReg = FinalValueReg;
1888 }
1889
1890 if (!IsAGPR && NeedSuperRegDef)
1891 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1892
1893 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1894 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1895 FinalReg)
1896 .addReg(TmpIntermediateVGPR, RegState::Kill);
1898 }
1899
1900 bool IsSrcDstDef = hasRegState(SrcDstRegState, RegState::Define);
1901 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1902 if (NeedSuperRegImpOperand &&
1903 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1904 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1905 if (PartialReloadCopy)
1906 MIB.addReg(ValueReg, RegState::Implicit);
1907 }
1908
1909 // The epilog restore of a wwm-scratch register can cause undesired
1910 // optimization during machine-cp post PrologEpilogInserter if the same
1911 // register was assigned for return value ABI lowering with a COPY
1912 // instruction. As shown below, with the epilog reload, the earlier COPY
1913 // appears to be dead during machine-cp.
1914 // ...
1915 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1916 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1917 // ...
1918 // Epilog block:
1919 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1920 // ...
1921 // WWM spill restore to preserve the inactive lanes of v0.
1922 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1923 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1924 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1925 // ...
1926 // SI_RETURN implicit $vgpr0
1927 // ...
1928 // To fix it, mark the same reg as a tied op for such restore instructions
1929 // so that it marks a usage for the preceding COPY.
1930 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1931 MI->readsRegister(SubReg, this)) {
1932 MIB.addReg(SubReg, RegState::Implicit);
1933 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1934 }
1935
1936 // If we're building a block load, we should add artificial uses for the
1937 // CSR VGPRs that are *not* being transferred. This is because liveness
1938 // analysis is not aware of the mask, so we need to somehow inform it that
1939 // those registers are not available before the load and they should not be
1940 // scavenged.
1941 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1942 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1943 }
1944
1945 if (ScratchOffsetRegDelta != 0) {
1946 // Subtract the offset we added to the ScratchOffset register.
1947 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1948 .addReg(SOffset)
1949 .addImm(-ScratchOffsetRegDelta);
1950 }
1951}
1952
1953 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1954 Register BlockReg) const {
1955 const MachineFunction *MF = MIB->getMF();
1956 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1957 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1958 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
1959 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1960 if (!(Mask & (1 << RegOffset)) &&
1961 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1962 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1963}
1964
1965 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1966 int Offset, bool IsLoad,
1967 bool IsKill) const {
1968 // Load/store VGPR
1969 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1970 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1971
1972 Register FrameReg =
1973 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1974 ? getBaseRegister()
1975 : getFrameRegister(SB.MF);
1976
1977 Align Alignment = FrameInfo.getObjectAlign(Index);
1978 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1979 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1980 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1981 SB.EltSize, Alignment);
1982
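// The incoming Offset is in units of EltSize, so scale it to a byte offset
// for the underlying scratch access.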
1983 if (IsLoad) {
1984 unsigned Opc = ST.hasFlatScratchEnabled()
1985 ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1986 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1987 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1988 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1989 } else {
1990 unsigned Opc = ST.hasFlatScratchEnabled()
1991 ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1992 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1993 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1994 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1995 // This only ever adds one VGPR spill
1996 SB.MFI.addToSpilledVGPRs(1);
1997 }
1998}
1999
2000 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
2001 RegScavenger *RS, SlotIndexes *Indexes,
2002 LiveIntervals *LIS, bool OnlyToVGPR,
2003 bool SpillToPhysVGPRLane) const {
2004 assert(!MI->getOperand(0).isUndef() &&
2005 "undef spill should have been deleted earlier");
2006
2007 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2008
2009 ArrayRef<SpilledReg> VGPRSpills =
2010 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2011 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2012 bool SpillToVGPR = !VGPRSpills.empty();
2013 if (OnlyToVGPR && !SpillToVGPR)
2014 return false;
2015
2016 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2017 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2018
2019 if (SpillToVGPR) {
2020
2021 // Since the stack slot coloring pass tries to optimize SGPR spills,
2022 // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
2023 // spills of different sizes. The number of VGPR lanes allotted equals
2024 // the largest SGPR being spilled into them.
2025 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2026 "Num of SGPRs spilled should be less than or equal to num of "
2027 "the VGPR lanes.");
2028
2029 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2030 Register SubReg =
2031 SB.NumSubRegs == 1
2032 ? SB.SuperReg
2033 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2034 SpilledReg Spill = VGPRSpills[i];
2035
2036 bool IsFirstSubreg = i == 0;
2037 bool IsLastSubreg = i == SB.NumSubRegs - 1;
2038 bool UseKill = SB.IsKill && IsLastSubreg;
2039
2040
2041 // Mark the "old value of vgpr" input undef only if this is the first SGPR
2042 // spill to this specific VGPR in the first basic block.
2043 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2044 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2045 .addReg(SubReg, getKillRegState(UseKill))
2046 .addImm(Spill.Lane)
2047 .addReg(Spill.VGPR);
2048 if (Indexes) {
2049 if (IsFirstSubreg)
2050 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2051 else
2052 Indexes->insertMachineInstrInMaps(*MIB);
2053 }
2054
2055 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2056 // We may be spilling a super-register which is only partially defined,
2057 // and need to ensure later spills think the value is defined.
2058 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2059 }
2060
2061 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2062 MIB.addReg(SB.SuperReg, RegState::Implicit);
2063
2064 // FIXME: Since this spills to another register instead of an actual
2065 // frame index, we should delete the frame index when all references to
2066 // it are fixed.
2067 }
2068 } else {
2069 SB.prepare();
2070
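// No VGPR lanes are reserved for this spill, so go through memory: pack the
// SGPR subregisters into lanes of the temporary VGPR, then store that VGPR
// to the stack slot, one VGPR's worth of lanes at a time.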
2071 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2072 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2073
2074 // Per VGPR helper data
2075 auto PVD = SB.getPerVGPRData();
2076
2077 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2078 RegState TmpVGPRFlags = RegState::Undef;
2079
2080 // Write sub registers into the VGPR
2081 for (unsigned i = Offset * PVD.PerVGPR,
2082 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2083 i < e; ++i) {
2084 Register SubReg =
2085 SB.NumSubRegs == 1
2086 ? SB.SuperReg
2087 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2088
2089 MachineInstrBuilder WriteLane =
2090 BuildMI(*SB.MBB, MI, SB.DL,
2091 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2092 .addReg(SubReg, SubKillState)
2093 .addImm(i % PVD.PerVGPR)
2094 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2095 TmpVGPRFlags = {};
2096
2097 if (Indexes) {
2098 if (i == 0)
2099 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2100 else
2101 Indexes->insertMachineInstrInMaps(*WriteLane);
2102 }
2103
2104 // There could be undef components of a spilled super register.
2105 // TODO: Can we detect this and skip the spill?
2106 if (SB.NumSubRegs > 1) {
2107 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2108 RegState SuperKillState = {};
2109 if (i + 1 == SB.NumSubRegs)
2110 SuperKillState |= getKillRegState(SB.IsKill);
2111 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2112 }
2113 }
2114
2115 // Write out VGPR
2116 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2117 }
2118
2119 SB.restore();
2120 }
2121
2122 MI->eraseFromParent();
2124
2125 if (LIS)
2126 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2127
2128 return true;
2129}
2130
2131 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2132 RegScavenger *RS, SlotIndexes *Indexes,
2133 LiveIntervals *LIS, bool OnlyToVGPR,
2134 bool SpillToPhysVGPRLane) const {
2135 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2136
2137 ArrayRef<SpilledReg> VGPRSpills =
2138 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2139 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2140 bool SpillToVGPR = !VGPRSpills.empty();
2141 if (OnlyToVGPR && !SpillToVGPR)
2142 return false;
2143
2144 if (SpillToVGPR) {
2145 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2146 Register SubReg =
2147 SB.NumSubRegs == 1
2148 ? SB.SuperReg
2149 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2150
2151 SpilledReg Spill = VGPRSpills[i];
2152 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2153 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2154 .addReg(Spill.VGPR)
2155 .addImm(Spill.Lane);
2156 if (SB.NumSubRegs > 1 && i == 0)
2157 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2158 if (Indexes) {
2159 if (i == e - 1)
2160 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2161 else
2162 Indexes->insertMachineInstrInMaps(*MIB);
2163 }
2164 }
2165 } else {
2166 SB.prepare();
2167
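// Memory path of the restore: reload the temporary VGPR from the stack
// slot, then unpack its lanes back into the SGPR subregisters.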
2168 // Per VGPR helper data
2169 auto PVD = SB.getPerVGPRData();
2170
2171 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2172 // Load in VGPR data
2173 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2174
2175 // Unpack lanes
2176 for (unsigned i = Offset * PVD.PerVGPR,
2177 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2178 i < e; ++i) {
2179 Register SubReg =
2180 SB.NumSubRegs == 1
2181 ? SB.SuperReg
2182 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2183
2184 bool LastSubReg = (i + 1 == e);
2185 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2186 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2187 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2188 .addImm(i);
2189 if (SB.NumSubRegs > 1 && i == 0)
2190 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2191 if (Indexes) {
2192 if (i == e - 1)
2193 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2194 else
2195 Indexes->insertMachineInstrInMaps(*MIB);
2196 }
2197 }
2198 }
2199
2200 SB.restore();
2201 }
2202
2203 MI->eraseFromParent();
2204
2205 if (LIS)
2206 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2207
2208 return true;
2209}
2210
2211 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2212 MachineBasicBlock &RestoreMBB,
2213 Register SGPR, RegScavenger *RS) const {
2214 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2215 RS);
2216 SB.prepare();
2217 // Generate the spill of SGPR to SB.TmpVGPR.
2218 RegState SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2219 auto PVD = SB.getPerVGPRData();
2220 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2221 RegState TmpVGPRFlags = RegState::Undef;
2222 // Write sub registers into the VGPR
2223 for (unsigned i = Offset * PVD.PerVGPR,
2224 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2225 i < e; ++i) {
2226 Register SubReg =
2227 SB.NumSubRegs == 1
2228 ? SB.SuperReg
2229 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2230
2231 MachineInstrBuilder WriteLane =
2232 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2233 SB.TmpVGPR)
2234 .addReg(SubReg, SubKillState)
2235 .addImm(i % PVD.PerVGPR)
2236 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2237 TmpVGPRFlags = {};
2238 // There could be undef components of a spilled super register.
2239 // TODO: Can we detect this and skip the spill?
2240 if (SB.NumSubRegs > 1) {
2241 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2242 RegState SuperKillState = {};
2243 if (i + 1 == SB.NumSubRegs)
2244 SuperKillState |= getKillRegState(SB.IsKill);
2245 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2246 }
2247 }
2248 // Don't need to write VGPR out.
2249 }
2250
2251 // Restore clobbered registers in the specified restore block.
2252 MI = RestoreMBB.end();
2253 SB.setMI(&RestoreMBB, MI);
2254 // Generate the restore of SGPR from SB.TmpVGPR.
2255 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2256 // Don't need to load VGPR in.
2257 // Unpack lanes
2258 for (unsigned i = Offset * PVD.PerVGPR,
2259 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2260 i < e; ++i) {
2261 Register SubReg =
2262 SB.NumSubRegs == 1
2263 ? SB.SuperReg
2264 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2265
2266 assert(SubReg.isPhysical());
2267 bool LastSubReg = (i + 1 == e);
2268 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2269 SubReg)
2270 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2271 .addImm(i);
2272 if (SB.NumSubRegs > 1 && i == 0)
2273 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2274 }
2275 }
2276 SB.restore();
2277
2279 return false;
2280}
2281
2282/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2283/// a VGPR and the stack slot can be safely eliminated when all other users are
2284/// handled.
2285 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2286 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2287 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2288 switch (MI->getOpcode()) {
2289 case AMDGPU::SI_SPILL_S1024_SAVE:
2290 case AMDGPU::SI_SPILL_S512_SAVE:
2291 case AMDGPU::SI_SPILL_S384_SAVE:
2292 case AMDGPU::SI_SPILL_S352_SAVE:
2293 case AMDGPU::SI_SPILL_S320_SAVE:
2294 case AMDGPU::SI_SPILL_S288_SAVE:
2295 case AMDGPU::SI_SPILL_S256_SAVE:
2296 case AMDGPU::SI_SPILL_S224_SAVE:
2297 case AMDGPU::SI_SPILL_S192_SAVE:
2298 case AMDGPU::SI_SPILL_S160_SAVE:
2299 case AMDGPU::SI_SPILL_S128_SAVE:
2300 case AMDGPU::SI_SPILL_S96_SAVE:
2301 case AMDGPU::SI_SPILL_S64_SAVE:
2302 case AMDGPU::SI_SPILL_S32_SAVE:
2303 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2304 case AMDGPU::SI_SPILL_S1024_RESTORE:
2305 case AMDGPU::SI_SPILL_S512_RESTORE:
2306 case AMDGPU::SI_SPILL_S384_RESTORE:
2307 case AMDGPU::SI_SPILL_S352_RESTORE:
2308 case AMDGPU::SI_SPILL_S320_RESTORE:
2309 case AMDGPU::SI_SPILL_S288_RESTORE:
2310 case AMDGPU::SI_SPILL_S256_RESTORE:
2311 case AMDGPU::SI_SPILL_S224_RESTORE:
2312 case AMDGPU::SI_SPILL_S192_RESTORE:
2313 case AMDGPU::SI_SPILL_S160_RESTORE:
2314 case AMDGPU::SI_SPILL_S128_RESTORE:
2315 case AMDGPU::SI_SPILL_S96_RESTORE:
2316 case AMDGPU::SI_SPILL_S64_RESTORE:
2317 case AMDGPU::SI_SPILL_S32_RESTORE:
2318 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2319 default:
2320 llvm_unreachable("not an SGPR spill instruction");
2321 }
2322}
2323
2324 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2325 int SPAdj, unsigned FIOperandNum,
2326 RegScavenger *RS) const {
2327 MachineFunction *MF = MI->getMF();
2328 MachineBasicBlock *MBB = MI->getParent();
2329 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2330 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2331 const SIInstrInfo *TII = ST.getInstrInfo();
2332 const DebugLoc &DL = MI->getDebugLoc();
2333
2334 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2335
2336 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2337 "unreserved scratch RSRC register");
2338
2339 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2340 int Index = MI->getOperand(FIOperandNum).getIndex();
2341
2342 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2343 ? getBaseRegister()
2344 : getFrameRegister(*MF);
2345
2346 switch (MI->getOpcode()) {
2347 // SGPR register spill
2348 case AMDGPU::SI_SPILL_S1024_SAVE:
2349 case AMDGPU::SI_SPILL_S512_SAVE:
2350 case AMDGPU::SI_SPILL_S384_SAVE:
2351 case AMDGPU::SI_SPILL_S352_SAVE:
2352 case AMDGPU::SI_SPILL_S320_SAVE:
2353 case AMDGPU::SI_SPILL_S288_SAVE:
2354 case AMDGPU::SI_SPILL_S256_SAVE:
2355 case AMDGPU::SI_SPILL_S224_SAVE:
2356 case AMDGPU::SI_SPILL_S192_SAVE:
2357 case AMDGPU::SI_SPILL_S160_SAVE:
2358 case AMDGPU::SI_SPILL_S128_SAVE:
2359 case AMDGPU::SI_SPILL_S96_SAVE:
2360 case AMDGPU::SI_SPILL_S64_SAVE:
2361 case AMDGPU::SI_SPILL_S32_SAVE: {
2362 return spillSGPR(MI, Index, RS);
2363 }
2364
2365 // SGPR register restore
2366 case AMDGPU::SI_SPILL_S1024_RESTORE:
2367 case AMDGPU::SI_SPILL_S512_RESTORE:
2368 case AMDGPU::SI_SPILL_S384_RESTORE:
2369 case AMDGPU::SI_SPILL_S352_RESTORE:
2370 case AMDGPU::SI_SPILL_S320_RESTORE:
2371 case AMDGPU::SI_SPILL_S288_RESTORE:
2372 case AMDGPU::SI_SPILL_S256_RESTORE:
2373 case AMDGPU::SI_SPILL_S224_RESTORE:
2374 case AMDGPU::SI_SPILL_S192_RESTORE:
2375 case AMDGPU::SI_SPILL_S160_RESTORE:
2376 case AMDGPU::SI_SPILL_S128_RESTORE:
2377 case AMDGPU::SI_SPILL_S96_RESTORE:
2378 case AMDGPU::SI_SPILL_S64_RESTORE:
2379 case AMDGPU::SI_SPILL_S32_RESTORE: {
2380 return restoreSGPR(MI, Index, RS);
2381 }
2382
2383 // VGPR register spill
2384 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2385 // Put mask into M0.
2386 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2387 AMDGPU::M0)
2388 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2389 [[fallthrough]];
2390 }
2391 case AMDGPU::SI_SPILL_V1024_SAVE:
2392 case AMDGPU::SI_SPILL_V512_SAVE:
2393 case AMDGPU::SI_SPILL_V384_SAVE:
2394 case AMDGPU::SI_SPILL_V352_SAVE:
2395 case AMDGPU::SI_SPILL_V320_SAVE:
2396 case AMDGPU::SI_SPILL_V288_SAVE:
2397 case AMDGPU::SI_SPILL_V256_SAVE:
2398 case AMDGPU::SI_SPILL_V224_SAVE:
2399 case AMDGPU::SI_SPILL_V192_SAVE:
2400 case AMDGPU::SI_SPILL_V160_SAVE:
2401 case AMDGPU::SI_SPILL_V128_SAVE:
2402 case AMDGPU::SI_SPILL_V96_SAVE:
2403 case AMDGPU::SI_SPILL_V64_SAVE:
2404 case AMDGPU::SI_SPILL_V32_SAVE:
2405 case AMDGPU::SI_SPILL_V16_SAVE:
2406 case AMDGPU::SI_SPILL_A1024_SAVE:
2407 case AMDGPU::SI_SPILL_A512_SAVE:
2408 case AMDGPU::SI_SPILL_A384_SAVE:
2409 case AMDGPU::SI_SPILL_A352_SAVE:
2410 case AMDGPU::SI_SPILL_A320_SAVE:
2411 case AMDGPU::SI_SPILL_A288_SAVE:
2412 case AMDGPU::SI_SPILL_A256_SAVE:
2413 case AMDGPU::SI_SPILL_A224_SAVE:
2414 case AMDGPU::SI_SPILL_A192_SAVE:
2415 case AMDGPU::SI_SPILL_A160_SAVE:
2416 case AMDGPU::SI_SPILL_A128_SAVE:
2417 case AMDGPU::SI_SPILL_A96_SAVE:
2418 case AMDGPU::SI_SPILL_A64_SAVE:
2419 case AMDGPU::SI_SPILL_A32_SAVE:
2420 case AMDGPU::SI_SPILL_AV1024_SAVE:
2421 case AMDGPU::SI_SPILL_AV512_SAVE:
2422 case AMDGPU::SI_SPILL_AV384_SAVE:
2423 case AMDGPU::SI_SPILL_AV352_SAVE:
2424 case AMDGPU::SI_SPILL_AV320_SAVE:
2425 case AMDGPU::SI_SPILL_AV288_SAVE:
2426 case AMDGPU::SI_SPILL_AV256_SAVE:
2427 case AMDGPU::SI_SPILL_AV224_SAVE:
2428 case AMDGPU::SI_SPILL_AV192_SAVE:
2429 case AMDGPU::SI_SPILL_AV160_SAVE:
2430 case AMDGPU::SI_SPILL_AV128_SAVE:
2431 case AMDGPU::SI_SPILL_AV96_SAVE:
2432 case AMDGPU::SI_SPILL_AV64_SAVE:
2433 case AMDGPU::SI_SPILL_AV32_SAVE:
2434 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2435 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2436 const MachineOperand *VData = TII->getNamedOperand(*MI,
2437 AMDGPU::OpName::vdata);
2438 if (VData->isUndef()) {
2439 MI->eraseFromParent();
2440 return true;
2441 }
2442
2443 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2444 MFI->getStackPtrOffsetReg());
2445
2446 unsigned Opc;
2447 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2448 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2449 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2450 } else {
2451 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2452 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2453 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2454 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2455 }
2456
2457 auto *MBB = MI->getParent();
2458 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2459 if (IsWWMRegSpill) {
2460 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2461 RS->isRegUsed(AMDGPU::SCC));
2462 }
2463 buildSpillLoadStore(
2464 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2465 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2466 *MI->memoperands_begin(), RS);
2468 if (IsWWMRegSpill)
2469 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2470
2471 MI->eraseFromParent();
2472 return true;
2473 }
2474 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2475 // Put mask into M0.
2476 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2477 AMDGPU::M0)
2478 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2479 [[fallthrough]];
2480 }
2481 case AMDGPU::SI_SPILL_V16_RESTORE:
2482 case AMDGPU::SI_SPILL_V32_RESTORE:
2483 case AMDGPU::SI_SPILL_V64_RESTORE:
2484 case AMDGPU::SI_SPILL_V96_RESTORE:
2485 case AMDGPU::SI_SPILL_V128_RESTORE:
2486 case AMDGPU::SI_SPILL_V160_RESTORE:
2487 case AMDGPU::SI_SPILL_V192_RESTORE:
2488 case AMDGPU::SI_SPILL_V224_RESTORE:
2489 case AMDGPU::SI_SPILL_V256_RESTORE:
2490 case AMDGPU::SI_SPILL_V288_RESTORE:
2491 case AMDGPU::SI_SPILL_V320_RESTORE:
2492 case AMDGPU::SI_SPILL_V352_RESTORE:
2493 case AMDGPU::SI_SPILL_V384_RESTORE:
2494 case AMDGPU::SI_SPILL_V512_RESTORE:
2495 case AMDGPU::SI_SPILL_V1024_RESTORE:
2496 case AMDGPU::SI_SPILL_A32_RESTORE:
2497 case AMDGPU::SI_SPILL_A64_RESTORE:
2498 case AMDGPU::SI_SPILL_A96_RESTORE:
2499 case AMDGPU::SI_SPILL_A128_RESTORE:
2500 case AMDGPU::SI_SPILL_A160_RESTORE:
2501 case AMDGPU::SI_SPILL_A192_RESTORE:
2502 case AMDGPU::SI_SPILL_A224_RESTORE:
2503 case AMDGPU::SI_SPILL_A256_RESTORE:
2504 case AMDGPU::SI_SPILL_A288_RESTORE:
2505 case AMDGPU::SI_SPILL_A320_RESTORE:
2506 case AMDGPU::SI_SPILL_A352_RESTORE:
2507 case AMDGPU::SI_SPILL_A384_RESTORE:
2508 case AMDGPU::SI_SPILL_A512_RESTORE:
2509 case AMDGPU::SI_SPILL_A1024_RESTORE:
2510 case AMDGPU::SI_SPILL_AV32_RESTORE:
2511 case AMDGPU::SI_SPILL_AV64_RESTORE:
2512 case AMDGPU::SI_SPILL_AV96_RESTORE:
2513 case AMDGPU::SI_SPILL_AV128_RESTORE:
2514 case AMDGPU::SI_SPILL_AV160_RESTORE:
2515 case AMDGPU::SI_SPILL_AV192_RESTORE:
2516 case AMDGPU::SI_SPILL_AV224_RESTORE:
2517 case AMDGPU::SI_SPILL_AV256_RESTORE:
2518 case AMDGPU::SI_SPILL_AV288_RESTORE:
2519 case AMDGPU::SI_SPILL_AV320_RESTORE:
2520 case AMDGPU::SI_SPILL_AV352_RESTORE:
2521 case AMDGPU::SI_SPILL_AV384_RESTORE:
2522 case AMDGPU::SI_SPILL_AV512_RESTORE:
2523 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2524 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2525 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2526 const MachineOperand *VData = TII->getNamedOperand(*MI,
2527 AMDGPU::OpName::vdata);
2528 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2529 MFI->getStackPtrOffsetReg());
2530
2531 unsigned Opc;
2532 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2533 assert(ST.hasFlatScratchEnabled() && "Flat Scratch is not enabled!");
2534 Opc = ST.d16PreservesUnusedBits()
2535 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2536 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2537 } else {
2538 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2539 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2540 : ST.hasFlatScratchEnabled() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2541 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2542 }
2543
2544 auto *MBB = MI->getParent();
2545 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2546 if (IsWWMRegSpill) {
2547 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2548 RS->isRegUsed(AMDGPU::SCC));
2549 }
2550
2551 buildSpillLoadStore(
2552 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2553 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2554 *MI->memoperands_begin(), RS);
2555
2556 if (IsWWMRegSpill)
2557 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2558
2559 MI->eraseFromParent();
2560 return true;
2561 }
2562 case AMDGPU::V_ADD_U32_e32:
2563 case AMDGPU::V_ADD_U32_e64:
2564 case AMDGPU::V_ADD_CO_U32_e32:
2565 case AMDGPU::V_ADD_CO_U32_e64: {
2566 // TODO: Handle sub, and, or.
2567 unsigned NumDefs = MI->getNumExplicitDefs();
2568 unsigned Src0Idx = NumDefs;
2569
2570 bool HasClamp = false;
2571 MachineOperand *VCCOp = nullptr;
2572
2573 switch (MI->getOpcode()) {
2574 case AMDGPU::V_ADD_U32_e32:
2575 break;
2576 case AMDGPU::V_ADD_U32_e64:
2577 HasClamp = MI->getOperand(3).getImm();
2578 break;
2579 case AMDGPU::V_ADD_CO_U32_e32:
2580 VCCOp = &MI->getOperand(3);
2581 break;
2582 case AMDGPU::V_ADD_CO_U32_e64:
2583 VCCOp = &MI->getOperand(1);
2584 HasClamp = MI->getOperand(4).getImm();
2585 break;
2586 default:
2587 break;
2588 }
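// A clamp modifier or a live VCC def means the add cannot simply be folded
// away; the rewrites below check DeadVCC and HasClamp before simplifying.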
2589 bool DeadVCC = !VCCOp || VCCOp->isDead();
2590 MachineOperand &DstOp = MI->getOperand(0);
2591 Register DstReg = DstOp.getReg();
2592
2593 unsigned OtherOpIdx =
2594 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2595 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2596
2597 unsigned Src1Idx = Src0Idx + 1;
2598 Register MaterializedReg = FrameReg;
2599 Register ScavengedVGPR;
2600
2601 int64_t Offset = FrameInfo.getObjectOffset(Index);
2602 // For the non-immediate case, we could fall through to the default
2603 // handling, but we do an in-place update of the result register here to
2604 // avoid scavenging another register.
2605 if (OtherOp->isImm()) {
2606 int64_t TotalOffset = OtherOp->getImm() + Offset;
2607
2608 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2609 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2610 // If we can't support a VOP3 literal in the VALU instruction, we
2611 // can't specially fold into the add.
2612 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2613 break;
2614 }
2615
2616 OtherOp->setImm(TotalOffset);
2617 Offset = 0;
2618 }
2619
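// Without flat scratch the SGPR frame register holds a wave-swizzled
// address, so shift it right by log2(wavefront size) into a VGPR to get
// the value the per-lane add needs.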
2620 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2621 // We should just do an in-place update of the result register. However,
2622 // the value there may also be used by the add, in which case we need a
2623 // temporary register.
2624 //
2625 // FIXME: The scavenger is not finding the result register in the
2626 // common case where the add does not read the register.
2627
2628 ScavengedVGPR = RS->scavengeRegisterBackwards(
2629 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2630
2631 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2632 // shift.
2633 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2634 .addDef(ScavengedVGPR, RegState::Renamable)
2635 .addImm(ST.getWavefrontSizeLog2())
2636 .addReg(FrameReg);
2637 MaterializedReg = ScavengedVGPR;
2638 }
2639
2640 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2641 if (ST.hasFlatScratchEnabled() &&
2642 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2643 // We didn't need the shift above, so we have an SGPR for the frame
2644 // register, but may have a VGPR-only operand.
2645 //
2646 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2647 // and use the higher constant bus restriction to avoid this copy.
2648
2649 if (!ScavengedVGPR) {
2650 ScavengedVGPR = RS->scavengeRegisterBackwards(
2651 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2652 /*SPAdj=*/0);
2653 }
2654
2655 assert(ScavengedVGPR != DstReg);
2656
2657 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2658 .addReg(MaterializedReg,
2659 getKillRegState(MaterializedReg != FrameReg));
2660 MaterializedReg = ScavengedVGPR;
2661 }
2662
2663 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2664 // is not live, we could use a scalar add + vector add instead of 2
2665 // vector adds.
2666 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2667 .addDef(DstReg, RegState::Renamable);
2668 if (NumDefs == 2)
2669 AddI32.add(MI->getOperand(1));
2670
2671 RegState MaterializedRegFlags =
2672 getKillRegState(MaterializedReg != FrameReg);
2673
2674 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2675 // If we know we have a VGPR already, it's more likely the other
2676 // operand is a legal vsrc0.
2677 AddI32
2678 .add(*OtherOp)
2679 .addReg(MaterializedReg, MaterializedRegFlags);
2680 } else {
2681 // Commute operands to avoid violating VOP2 restrictions. This will
2682 // typically happen when using scratch.
2683 AddI32
2684 .addReg(MaterializedReg, MaterializedRegFlags)
2685 .add(*OtherOp);
2686 }
2687
2688 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2689 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2690 AddI32.addImm(0); // clamp
2691
2692 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2693 AddI32.setOperandDead(3); // Dead vcc
2694
2695 MaterializedReg = DstReg;
2696
2697 OtherOp->ChangeToRegister(MaterializedReg, false);
2698 OtherOp->setIsKill(true);
2700 Offset = 0;
2701 } else if (Offset != 0) {
2702 assert(!MaterializedReg);
2703 FIOp->ChangeToImmediate(Offset);
2704 Offset = 0;
2705 } else {
2706 if (DeadVCC && !HasClamp) {
2707 assert(Offset == 0);
2708
2709 // TODO: Losing kills and implicit operands. Just mutate to copy and
2710 // let lowerCopy deal with it?
2711 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2712 // Folded to an identity copy.
2713 MI->eraseFromParent();
2714 return true;
2715 }
2716
2717 // The immediate value should be in OtherOp
2718 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2719 MI->removeOperand(FIOperandNum);
2720
2721 unsigned NumOps = MI->getNumOperands();
2722 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2723 MI->removeOperand(I);
2724
2725 if (NumDefs == 2)
2726 MI->removeOperand(1);
2727
2728 // The code below can't deal with a mov.
2729 return true;
2730 }
2731
2732 // This folded to a constant, but we have to keep the add around for
2733 // pointless implicit defs or clamp modifier.
2734 FIOp->ChangeToImmediate(0);
2735 }
2736
2737 // Try to improve legality by commuting.
2738 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2739 std::swap(FIOp, OtherOp);
2740 std::swap(FIOperandNum, OtherOpIdx);
2741 }
2742
2743 // We need at most one mov to satisfy the operand constraints. Prefer to
2744 // move the FI operand first, as it may be a literal in a VOP3
2745 // instruction.
2746 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2747 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2748 // If commuting didn't make the operands legal, we need to materialize
2749 // in a register.
2750 // TODO: Can use SGPR on gfx10+ in some cases.
2751 if (!ScavengedVGPR) {
2752 ScavengedVGPR = RS->scavengeRegisterBackwards(
2753 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2754 /*SPAdj=*/0);
2755 }
2756
2757 assert(ScavengedVGPR != DstReg);
2758
2759 MachineOperand &Src = MI->getOperand(SrcIdx);
2760 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2761 .add(Src);
2762
2763 Src.ChangeToRegister(ScavengedVGPR, false);
2764 Src.setIsKill(true);
2765 break;
2766 }
2767 }
2768
2769 // Fold out add of 0 case that can appear in kernels.
2770 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2771 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2772 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2773 }
2774
2775 MI->eraseFromParent();
2776 }
2777
2778 return true;
2779 }
2780 case AMDGPU::S_ADD_I32:
2781 case AMDGPU::S_ADD_U32: {
2782 // TODO: Handle s_or_b32, s_and_b32.
2783 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2784 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2785
2786 assert(FrameReg || MFI->isBottomOfStack());
2787
2788 MachineOperand &DstOp = MI->getOperand(0);
2789 const DebugLoc &DL = MI->getDebugLoc();
2790 Register MaterializedReg = FrameReg;
2791
2792 // Defend against live scc, which should never happen in practice.
2793 bool DeadSCC = MI->getOperand(3).isDead();
2794
2795 Register TmpReg;
2796
2797 // FIXME: Scavenger should figure out that the result register is
2798 // available. Also should do this for the v_add case.
2799 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2800 TmpReg = DstOp.getReg();
2801
2802 if (FrameReg && !ST.hasFlatScratchEnabled()) {
2803 // FIXME: In the common case where the add does not also read its result
2804 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2805 // available.
2806 if (!TmpReg)
2807 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2808 MI, /*RestoreAfter=*/false, 0,
2809 /*AllowSpill=*/false);
2810 if (TmpReg) {
2811 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2812 .addDef(TmpReg, RegState::Renamable)
2813 .addReg(FrameReg)
2814 .addImm(ST.getWavefrontSizeLog2())
2815 .setOperandDead(3); // Set SCC dead
2816 }
2817 MaterializedReg = TmpReg;
2818 }
2819
2820 int64_t Offset = FrameInfo.getObjectOffset(Index);
2821
2822 // For the non-immediate case, we could fall through to the default
2823 // handling, but we do an in-place update of the result register here to
2824 // avoid scavenging another register.
2825 if (OtherOp.isImm()) {
2826 OtherOp.setImm(OtherOp.getImm() + Offset);
2827 Offset = 0;
2828
2829 if (MaterializedReg)
2830 FIOp->ChangeToRegister(MaterializedReg, false);
2831 else
2832 FIOp->ChangeToImmediate(0);
2833 } else if (MaterializedReg) {
2834 // If we can't fold the other operand, do another increment.
2835 Register DstReg = DstOp.getReg();
2836
2837 if (!TmpReg && MaterializedReg == FrameReg) {
2838 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2839 MI, /*RestoreAfter=*/false, 0,
2840 /*AllowSpill=*/false);
2841 DstReg = TmpReg;
2842 }
2843
2844 if (TmpReg) {
2845 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2846 .addDef(DstReg, RegState::Renamable)
2847 .addReg(MaterializedReg, RegState::Kill)
2848 .add(OtherOp);
2849 if (DeadSCC)
2850 AddI32.setOperandDead(3);
2851
2852 MaterializedReg = DstReg;
2853
2854 OtherOp.ChangeToRegister(MaterializedReg, false);
2855 OtherOp.setIsKill(true);
2856 OtherOp.setIsRenamable(true);
2857 }
2859 } else {
2860 // If we don't have any other offset to apply, we can just directly
2861 // interpret the frame index as the offset.
2862 FIOp->ChangeToImmediate(Offset);
2863 }
2864
2865 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2866 assert(Offset == 0);
2867 MI->removeOperand(3);
2868 MI->removeOperand(OtherOpIdx);
2869 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2870 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2871 assert(Offset == 0);
2872 MI->removeOperand(3);
2873 MI->removeOperand(FIOperandNum);
2874 MI->setDesc(
2875 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2876 }
2877
2878 assert(!FIOp->isFI());
2879 return true;
2880 }
2881 default: {
2882 break;
2883 }
2884 }
2885
2886 int64_t Offset = FrameInfo.getObjectOffset(Index);
2887 if (ST.hasFlatScratchEnabled()) {
2888 if (TII->isFLATScratch(*MI)) {
2889 assert(
2890 (int16_t)FIOperandNum ==
2891 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2892
2893 // The offset is always swizzled, just replace it
2894 if (FrameReg)
2895 FIOp->ChangeToRegister(FrameReg, false);
2896
2897 MachineOperand *OffsetOp =
2898 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2899 int64_t NewOffset = Offset + OffsetOp->getImm();
2900 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2901 SIInstrFlags::FlatScratch)) {
2902 OffsetOp->setImm(NewOffset);
2903 if (FrameReg)
2904 return false;
2905 Offset = 0;
2906 }
2907
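// With the offset fully folded, try to drop the saddr operand entirely by
// switching to the SV form (VGPR address) or, if the target has ST mode,
// to the ST form that takes no address register at all.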
2908 if (!Offset) {
2909 unsigned Opc = MI->getOpcode();
2910 int NewOpc = -1;
2911 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2912 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2913 } else if (ST.hasFlatScratchSTMode()) {
2914 // On GFX10 we have ST mode to use no registers for an address.
2915 // Otherwise we need to materialize 0 into an SGPR.
2917 }
2918
2919 if (NewOpc != -1) {
2920 // removeOperand doesn't fix up tied operand indexes as it goes, so
2921 // it asserts. Untie vdst_in for now and retie it afterwards.
2922 int VDstIn =
2923 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2924 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2925 MI->getOperand(VDstIn).isTied();
2926 if (TiedVDst)
2927 MI->untieRegOperand(VDstIn);
2928
2929 MI->removeOperand(
2930 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2931
2932 if (TiedVDst) {
2933 int NewVDst =
2934 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2935 int NewVDstIn =
2936 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2937 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2938 MI->tieOperands(NewVDst, NewVDstIn);
2939 }
2940 MI->setDesc(TII->get(NewOpc));
2941 return false;
2942 }
2943 }
2944 }
2945
2946 if (!FrameReg) {
2947 FIOp->ChangeToImmediate(Offset);
2948 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2949 return false;
2950 }
2951
2952 // We need to use a register here. Check if we can use an SGPR or need
2953 // a VGPR.
2954 FIOp->ChangeToRegister(AMDGPU::M0, false);
2955 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2956
2957 if (!Offset && FrameReg && UseSGPR) {
2958 FIOp->setReg(FrameReg);
2959 return false;
2960 }
2961
2962 const TargetRegisterClass *RC =
2963 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2964
2965 Register TmpReg =
2966 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2967 FIOp->setReg(TmpReg);
2968 FIOp->setIsKill();
2969
2970 if ((!FrameReg || !Offset) && TmpReg) {
2971 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2972 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2973 if (FrameReg)
2974 MIB.addReg(FrameReg);
2975 else
2976 MIB.addImm(Offset);
2977
2978 return false;
2979 }
2980
2981 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2982 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2983
2984 Register TmpSReg =
2985 UseSGPR ? TmpReg
2986 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2987 MI, false, 0, !UseSGPR);
2988
2989 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) {
2990 int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode());
2991 if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) {
2992 Register TmpVGPR = RS->scavengeRegisterBackwards(
2993 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
2994
2995 // Materialize the frame register.
2996 auto MIB =
2997 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR);
2998 if (FrameReg)
2999 MIB.addReg(FrameReg);
3000 else
3001 MIB.addImm(Offset);
3002
3003 // Add the offset to the frame register.
3004 if (FrameReg && Offset)
3005 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg)
3006 .addReg(FrameReg, RegState::Kill)
3007 .addImm(Offset);
3008
3009 BuildMI(*MBB, MI, DL, TII->get(SVOpcode))
3010 .add(MI->getOperand(0)) // $vdata
3011 .addReg(TmpVGPR) // $vaddr
3012 .addImm(0) // Offset
3013 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol));
3014 MI->eraseFromParent();
3015 return true;
3016 }
3017 report_fatal_error("Cannot scavenge register in FI elimination!");
3018 }
3019
3020 if (!TmpSReg) {
3021 // Use frame register and restore it after.
3022 TmpSReg = FrameReg;
3023 FIOp->setReg(FrameReg);
3024 FIOp->setIsKill(false);
3025 }
3026
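// s_add_i32 would clobber a live SCC. Instead, s_addc_u32 makes the unknown
// carry-in land in bit 0 of the result (the untouched sum is aligned),
// s_bitcmp1 restores SCC from that bit, and s_bitset0 clears it to correct
// the sum.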
3027 if (NeedSaveSCC) {
3028 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
3029 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
3030 .addReg(FrameReg)
3031 .addImm(Offset);
3032 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3033 .addReg(TmpSReg)
3034 .addImm(0);
3035 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3036 .addImm(0)
3037 .addReg(TmpSReg);
3038 } else {
3039 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3040 .addReg(FrameReg)
3041 .addImm(Offset);
3042 }
3043
3044 if (!UseSGPR)
3045 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3046 .addReg(TmpSReg, RegState::Kill);
3047
3048 if (TmpSReg == FrameReg) {
3049 // Undo frame register modification.
3050 if (NeedSaveSCC &&
3051 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3052 MachineBasicBlock::iterator I =
3053 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3054 TmpSReg)
3055 .addReg(FrameReg)
3056 .addImm(-Offset);
3057 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3058 .addReg(TmpSReg)
3059 .addImm(0);
3060 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3061 TmpSReg)
3062 .addImm(0)
3063 .addReg(TmpSReg);
3064 } else {
3065 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3066 FrameReg)
3067 .addReg(FrameReg)
3068 .addImm(-Offset);
3069 }
3070 }
3071
3072 return false;
3073 }
3074
3075 bool IsMUBUF = TII->isMUBUF(*MI);
3076
3077 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3078 // Convert to a swizzled stack address by scaling by the wave size.
3079 // In an entry function/kernel the offset is already swizzled.
3080 bool IsSALU = isSGPRClass(TII->getRegClass(MI->getDesc(), FIOperandNum));
3081 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3082 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3083 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3084 ? &AMDGPU::SReg_32RegClass
3085 : &AMDGPU::VGPR_32RegClass;
3086 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3087 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3088 MI->getOpcode() == AMDGPU::S_MOV_B32;
3089 Register ResultReg =
3090 IsCopy ? MI->getOperand(0).getReg()
3091 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3092
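// Materialize ResultReg = (FrameReg >> log2(wavefront size)) + Offset,
// using the SALU when SCC is free and falling back to VALU sequences
// (add-no-carry or v_mad_u32_u24 plus a readfirstlane) otherwise.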
3093 int64_t Offset = FrameInfo.getObjectOffset(Index);
3094 if (Offset == 0) {
3095 unsigned OpCode =
3096 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3097 Register TmpResultReg = ResultReg;
3098 if (IsSALU && LiveSCC) {
3099 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3100 MI, false, 0);
3101 }
3102
3103 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3104 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3105 // For V_LSHRREV, the operands are reversed (the shift count goes
3106 // first).
3107 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3108 else
3109 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3110 if (IsSALU && !LiveSCC)
3111 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3112 if (IsSALU && LiveSCC) {
3113 Register NewDest;
3114 if (IsCopy) {
3115 assert(ResultReg.isPhysical());
3116 NewDest = ResultReg;
3117 } else {
3118 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3119 Shift, false, 0);
3120 }
3121 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3122 .addReg(TmpResultReg);
3123 ResultReg = NewDest;
3124 }
3125 } else {
3126 MachineInstrBuilder MIB;
3127 if (!IsSALU) {
3128 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3129 nullptr) {
3130 // Reuse ResultReg in intermediate step.
3131 Register ScaledReg = ResultReg;
3132
3133 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3134 ScaledReg)
3135 .addImm(ST.getWavefrontSizeLog2())
3136 .addReg(FrameReg);
3137
3138 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3139
3140 // TODO: Fold if use instruction is another add of a constant.
3141 if (IsVOP2 ||
3142 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3143 // FIXME: This can fail
3144 MIB.addImm(Offset);
3145 MIB.addReg(ScaledReg, RegState::Kill);
3146 if (!IsVOP2)
3147 MIB.addImm(0); // clamp bit
3148 } else {
3149 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3150 "Need to reuse carry out register");
3151
3152 // Use scavenged unused carry out as offset register.
3153 Register ConstOffsetReg;
3154 if (!isWave32)
3155 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3156 else
3157 ConstOffsetReg = MIB.getReg(1);
3158
3159 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3160 ConstOffsetReg)
3161 .addImm(Offset);
3162 MIB.addReg(ConstOffsetReg, RegState::Kill);
3163 MIB.addReg(ScaledReg, RegState::Kill);
3164 MIB.addImm(0); // clamp bit
3165 }
3166 }
3167 }
3168 if (!MIB || IsSALU) {
3169 // We have to produce a carry out, and there isn't a free SGPR pair
3170 // for it. We can keep the whole computation on the SALU to avoid
3171 // clobbering an additional register at the cost of an extra mov.
3172
3173 // We may have 1 free scratch SGPR even though a carry out is
3174 // unavailable. Only one additional mov is needed.
3175 Register TmpScaledReg = IsCopy && IsSALU
3176 ? ResultReg
3177 : RS->scavengeRegisterBackwards(
3178 AMDGPU::SReg_32_XM0RegClass, MI,
3179 false, 0, /*AllowSpill=*/false);
3180 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3181 Register TmpResultReg = ScaledReg;
3182
3183 if (!LiveSCC) {
3184 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3185 .addReg(FrameReg)
3186 .addImm(ST.getWavefrontSizeLog2());
3187 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3188 .addReg(TmpResultReg, RegState::Kill)
3189 .addImm(Offset);
3190 } else {
3191 TmpResultReg = RS->scavengeRegisterBackwards(
3192 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3193
3194 MachineInstrBuilder Add;
3195 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3196 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3197 TmpResultReg)
3198 .addImm(ST.getWavefrontSizeLog2())
3199 .addReg(FrameReg);
3200 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3201 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3202 .addImm(Offset);
3203 Add.addReg(ResultReg, RegState::Kill)
3204 .addReg(TmpResultReg, RegState::Kill)
3205 .addImm(0);
3206 } else
3207 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3208 } else {
3209 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3210 "offset is unsafe for v_mad_u32_u24");
3211
3212 // We start with a frame pointer holding a wave-space value, and an
3213 // offset in lane space. We are materializing a lane-space value.
3214 // We can either right shift the frame pointer to get to lane space,
3215 // or left shift the offset to get to wave space. We can right shift
3216 // after the computation to get back to the desired per-lane value.
3217 // We are using the mad_u32_u24 primarily as an add with no carry-out
3218 // clobber.
3219 bool IsInlinableLiteral =
3220 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3221 if (!IsInlinableLiteral) {
3222 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3223 TmpResultReg)
3224 .addImm(Offset);
3225 }
3226
3227 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3228 TmpResultReg);
3229
3230 if (!IsInlinableLiteral) {
3231 Add.addReg(TmpResultReg, RegState::Kill);
3232 } else {
3233 // We fold the offset into the mad itself if it's inlinable.
3234 Add.addImm(Offset);
3235 }
3236 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3237 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3238 TmpResultReg)
3239 .addImm(ST.getWavefrontSizeLog2())
3240 .addReg(TmpResultReg);
3241 }
3242
3243 Register NewDest;
3244 if (IsCopy) {
3245 NewDest = ResultReg;
3246 } else {
3247 NewDest = RS->scavengeRegisterBackwards(
3248 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3249 /*AllowSpill=*/true);
3250 }
3251
3252 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3253 NewDest)
3254 .addReg(TmpResultReg);
3255 ResultReg = NewDest;
3256 }
3257 if (!IsSALU)
3258 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3259 .addReg(TmpResultReg, RegState::Kill);
3260 else
3261 ResultReg = TmpResultReg;
3262 // If there were truly no free SGPRs, we need to undo everything.
3263 if (!TmpScaledReg.isValid()) {
3264 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3265 .addReg(ScaledReg, RegState::Kill)
3266 .addImm(-Offset);
3267 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3268 .addReg(FrameReg)
3269 .addImm(ST.getWavefrontSizeLog2());
3270 }
3271 }
3272 }
3273
3274 // Don't introduce an extra copy if we're just materializing in a mov.
3275 if (IsCopy) {
3276 MI->eraseFromParent();
3277 return true;
3278 }
3279 FIOp->ChangeToRegister(ResultReg, false, false, true);
3280 return false;
3281 }
3282
3283 if (IsMUBUF) {
3284 // Disable offen so we don't need a 0 vgpr base.
3285 assert(
3286 static_cast<int>(FIOperandNum) ==
3287 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3288
3289 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3290 assert((SOffset.isImm() && SOffset.getImm() == 0));
3291
3292 if (FrameReg != AMDGPU::NoRegister)
3293 SOffset.ChangeToRegister(FrameReg, false);
3294
3295 int64_t Offset = FrameInfo.getObjectOffset(Index);
3296 int64_t OldImm =
3297 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3298 int64_t NewOffset = OldImm + Offset;
3299
3300 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3301 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3302 MI->eraseFromParent();
3303 return true;
3304 }
3305 }
3306
3307 // If the offset is simply too big, don't convert to a scratch wave offset
3308 // relative index.
3309
3310 FIOp->ChangeToImmediate(Offset);
3311 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3312 Register TmpReg =
3313 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3314 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3315 .addImm(Offset);
3316 FIOp->ChangeToRegister(TmpReg, false, false, true);
3317 }
3318
3319 return false;
3320}
3321
3325
3327 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3328}
3329
3331 return getRegBitWidth(RC.getID());
3332}
3333
3334static const TargetRegisterClass *
3335 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3336 if (BitWidth == 64)
3337 return &AMDGPU::VReg_64RegClass;
3338 if (BitWidth == 96)
3339 return &AMDGPU::VReg_96RegClass;
3340 if (BitWidth == 128)
3341 return &AMDGPU::VReg_128RegClass;
3342 if (BitWidth == 160)
3343 return &AMDGPU::VReg_160RegClass;
3344 if (BitWidth == 192)
3345 return &AMDGPU::VReg_192RegClass;
3346 if (BitWidth == 224)
3347 return &AMDGPU::VReg_224RegClass;
3348 if (BitWidth == 256)
3349 return &AMDGPU::VReg_256RegClass;
3350 if (BitWidth == 288)
3351 return &AMDGPU::VReg_288RegClass;
3352 if (BitWidth == 320)
3353 return &AMDGPU::VReg_320RegClass;
3354 if (BitWidth == 352)
3355 return &AMDGPU::VReg_352RegClass;
3356 if (BitWidth == 384)
3357 return &AMDGPU::VReg_384RegClass;
3358 if (BitWidth == 512)
3359 return &AMDGPU::VReg_512RegClass;
3360 if (BitWidth == 1024)
3361 return &AMDGPU::VReg_1024RegClass;
3362
3363 return nullptr;
3364}
3365
3366static const TargetRegisterClass *
3368 if (BitWidth == 64)
3369 return &AMDGPU::VReg_64_Align2RegClass;
3370 if (BitWidth == 96)
3371 return &AMDGPU::VReg_96_Align2RegClass;
3372 if (BitWidth == 128)
3373 return &AMDGPU::VReg_128_Align2RegClass;
3374 if (BitWidth == 160)
3375 return &AMDGPU::VReg_160_Align2RegClass;
3376 if (BitWidth == 192)
3377 return &AMDGPU::VReg_192_Align2RegClass;
3378 if (BitWidth == 224)
3379 return &AMDGPU::VReg_224_Align2RegClass;
3380 if (BitWidth == 256)
3381 return &AMDGPU::VReg_256_Align2RegClass;
3382 if (BitWidth == 288)
3383 return &AMDGPU::VReg_288_Align2RegClass;
3384 if (BitWidth == 320)
3385 return &AMDGPU::VReg_320_Align2RegClass;
3386 if (BitWidth == 352)
3387 return &AMDGPU::VReg_352_Align2RegClass;
3388 if (BitWidth == 384)
3389 return &AMDGPU::VReg_384_Align2RegClass;
3390 if (BitWidth == 512)
3391 return &AMDGPU::VReg_512_Align2RegClass;
3392 if (BitWidth == 1024)
3393 return &AMDGPU::VReg_1024_Align2RegClass;
3394
3395 return nullptr;
3396}
3397
3398const TargetRegisterClass *
3399SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3400 if (BitWidth == 1)
3401 return &AMDGPU::VReg_1RegClass;
3402 if (BitWidth == 16)
3403 return &AMDGPU::VGPR_16RegClass;
3404 if (BitWidth == 32)
3405 return &AMDGPU::VGPR_32RegClass;
3406 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3407 : getAnyVGPRClassForBitWidth(BitWidth);
3408}
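// Usage sketch (illustrative, not part of this file): callers ask for the
// narrowest VGPR class that covers a value's width; on subtargets that need
// even-aligned VGPR tuples the *_Align2 variant is returned instead. 'TRI' is
// assumed to be a SIRegisterInfo reference available at the call site.
static const TargetRegisterClass *
pickVRegClassFor256Bits(const SIRegisterInfo &TRI) {
  // Yields VReg_256 or VReg_256_Align2, depending on needsAlignedVGPRs().
  return TRI.getVGPRClassForBitWidth(256);
}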
3409
3410const TargetRegisterClass *
3411SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
3412 if (BitWidth <= 32)
3413 return &AMDGPU::VGPR_32_Lo256RegClass;
3414 if (BitWidth <= 64)
3415 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3416 if (BitWidth <= 96)
3417 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3418 if (BitWidth <= 128)
3419 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3420 if (BitWidth <= 160)
3421 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3422 if (BitWidth <= 192)
3423 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3424 if (BitWidth <= 224)
3425 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3426 if (BitWidth <= 256)
3427 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3428 if (BitWidth <= 288)
3429 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3430 if (BitWidth <= 320)
3431 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3432 if (BitWidth <= 352)
3433 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3434 if (BitWidth <= 384)
3435 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3436 if (BitWidth <= 512)
3437 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3438 if (BitWidth <= 1024)
3439 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3440
3441 return nullptr;
3442}
3443
3444static const TargetRegisterClass *
3445getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3446 if (BitWidth == 64)
3447 return &AMDGPU::AReg_64RegClass;
3448 if (BitWidth == 96)
3449 return &AMDGPU::AReg_96RegClass;
3450 if (BitWidth == 128)
3451 return &AMDGPU::AReg_128RegClass;
3452 if (BitWidth == 160)
3453 return &AMDGPU::AReg_160RegClass;
3454 if (BitWidth == 192)
3455 return &AMDGPU::AReg_192RegClass;
3456 if (BitWidth == 224)
3457 return &AMDGPU::AReg_224RegClass;
3458 if (BitWidth == 256)
3459 return &AMDGPU::AReg_256RegClass;
3460 if (BitWidth == 288)
3461 return &AMDGPU::AReg_288RegClass;
3462 if (BitWidth == 320)
3463 return &AMDGPU::AReg_320RegClass;
3464 if (BitWidth == 352)
3465 return &AMDGPU::AReg_352RegClass;
3466 if (BitWidth == 384)
3467 return &AMDGPU::AReg_384RegClass;
3468 if (BitWidth == 512)
3469 return &AMDGPU::AReg_512RegClass;
3470 if (BitWidth == 1024)
3471 return &AMDGPU::AReg_1024RegClass;
3472
3473 return nullptr;
3474}
3475
3476static const TargetRegisterClass *
3477getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3478 if (BitWidth == 64)
3479 return &AMDGPU::AReg_64_Align2RegClass;
3480 if (BitWidth == 96)
3481 return &AMDGPU::AReg_96_Align2RegClass;
3482 if (BitWidth == 128)
3483 return &AMDGPU::AReg_128_Align2RegClass;
3484 if (BitWidth == 160)
3485 return &AMDGPU::AReg_160_Align2RegClass;
3486 if (BitWidth == 192)
3487 return &AMDGPU::AReg_192_Align2RegClass;
3488 if (BitWidth == 224)
3489 return &AMDGPU::AReg_224_Align2RegClass;
3490 if (BitWidth == 256)
3491 return &AMDGPU::AReg_256_Align2RegClass;
3492 if (BitWidth == 288)
3493 return &AMDGPU::AReg_288_Align2RegClass;
3494 if (BitWidth == 320)
3495 return &AMDGPU::AReg_320_Align2RegClass;
3496 if (BitWidth == 352)
3497 return &AMDGPU::AReg_352_Align2RegClass;
3498 if (BitWidth == 384)
3499 return &AMDGPU::AReg_384_Align2RegClass;
3500 if (BitWidth == 512)
3501 return &AMDGPU::AReg_512_Align2RegClass;
3502 if (BitWidth == 1024)
3503 return &AMDGPU::AReg_1024_Align2RegClass;
3504
3505 return nullptr;
3506}
3507
3508const TargetRegisterClass *
3509SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3510 if (BitWidth == 16)
3511 return &AMDGPU::AGPR_LO16RegClass;
3512 if (BitWidth == 32)
3513 return &AMDGPU::AGPR_32RegClass;
3514 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3515 : getAnyAGPRClassForBitWidth(BitWidth);
3516}
3517
3518static const TargetRegisterClass *
3519getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3520 if (BitWidth == 64)
3521 return &AMDGPU::AV_64RegClass;
3522 if (BitWidth == 96)
3523 return &AMDGPU::AV_96RegClass;
3524 if (BitWidth == 128)
3525 return &AMDGPU::AV_128RegClass;
3526 if (BitWidth == 160)
3527 return &AMDGPU::AV_160RegClass;
3528 if (BitWidth == 192)
3529 return &AMDGPU::AV_192RegClass;
3530 if (BitWidth == 224)
3531 return &AMDGPU::AV_224RegClass;
3532 if (BitWidth == 256)
3533 return &AMDGPU::AV_256RegClass;
3534 if (BitWidth == 288)
3535 return &AMDGPU::AV_288RegClass;
3536 if (BitWidth == 320)
3537 return &AMDGPU::AV_320RegClass;
3538 if (BitWidth == 352)
3539 return &AMDGPU::AV_352RegClass;
3540 if (BitWidth == 384)
3541 return &AMDGPU::AV_384RegClass;
3542 if (BitWidth == 512)
3543 return &AMDGPU::AV_512RegClass;
3544 if (BitWidth == 1024)
3545 return &AMDGPU::AV_1024RegClass;
3546
3547 return nullptr;
3548}
3549
3550static const TargetRegisterClass *
3551getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3552 if (BitWidth == 64)
3553 return &AMDGPU::AV_64_Align2RegClass;
3554 if (BitWidth == 96)
3555 return &AMDGPU::AV_96_Align2RegClass;
3556 if (BitWidth == 128)
3557 return &AMDGPU::AV_128_Align2RegClass;
3558 if (BitWidth == 160)
3559 return &AMDGPU::AV_160_Align2RegClass;
3560 if (BitWidth == 192)
3561 return &AMDGPU::AV_192_Align2RegClass;
3562 if (BitWidth == 224)
3563 return &AMDGPU::AV_224_Align2RegClass;
3564 if (BitWidth == 256)
3565 return &AMDGPU::AV_256_Align2RegClass;
3566 if (BitWidth == 288)
3567 return &AMDGPU::AV_288_Align2RegClass;
3568 if (BitWidth == 320)
3569 return &AMDGPU::AV_320_Align2RegClass;
3570 if (BitWidth == 352)
3571 return &AMDGPU::AV_352_Align2RegClass;
3572 if (BitWidth == 384)
3573 return &AMDGPU::AV_384_Align2RegClass;
3574 if (BitWidth == 512)
3575 return &AMDGPU::AV_512_Align2RegClass;
3576 if (BitWidth == 1024)
3577 return &AMDGPU::AV_1024_Align2RegClass;
3578
3579 return nullptr;
3580}
3581
3582const TargetRegisterClass *
3583SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3584 if (BitWidth == 32)
3585 return &AMDGPU::AV_32RegClass;
3586 return ST.needsAlignedVGPRs()
3587 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3588 : getAnyVectorSuperClassForBitWidth(BitWidth);
3589}
3590
3591const TargetRegisterClass *
3592SIRegisterInfo::getDefaultVectorSuperClassForBitWidth(unsigned BitWidth) const {
3593 // TODO: In principle this should use AV classes for gfx908 too. This is
3594 // limited to 90a+ to avoid regressing special case copy optimizations which
3595 // need new handling. The core issue is that it's not possible to directly
3596 // copy between AGPRs on gfx908, and the current optimizations around that
3597 // expect to see copies to VGPR.
3598 return ST.hasGFX90AInsts() ? getVectorSuperClassForBitWidth(BitWidth)
3599 : getVGPRClassForBitWidth(BitWidth);
3600}
3601
3602const TargetRegisterClass *
3603SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3604 if (BitWidth == 16 || BitWidth == 32)
3605 return &AMDGPU::SReg_32RegClass;
3606 if (BitWidth == 64)
3607 return &AMDGPU::SReg_64RegClass;
3608 if (BitWidth == 96)
3609 return &AMDGPU::SGPR_96RegClass;
3610 if (BitWidth == 128)
3611 return &AMDGPU::SGPR_128RegClass;
3612 if (BitWidth == 160)
3613 return &AMDGPU::SGPR_160RegClass;
3614 if (BitWidth == 192)
3615 return &AMDGPU::SGPR_192RegClass;
3616 if (BitWidth == 224)
3617 return &AMDGPU::SGPR_224RegClass;
3618 if (BitWidth == 256)
3619 return &AMDGPU::SGPR_256RegClass;
3620 if (BitWidth == 288)
3621 return &AMDGPU::SGPR_288RegClass;
3622 if (BitWidth == 320)
3623 return &AMDGPU::SGPR_320RegClass;
3624 if (BitWidth == 352)
3625 return &AMDGPU::SGPR_352RegClass;
3626 if (BitWidth == 384)
3627 return &AMDGPU::SGPR_384RegClass;
3628 if (BitWidth == 512)
3629 return &AMDGPU::SGPR_512RegClass;
3630 if (BitWidth == 1024)
3631 return &AMDGPU::SGPR_1024RegClass;
3632
3633 return nullptr;
3634}
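// Illustrative note (not part of this file): scalar registers have no 16-bit
// subregisters, so 16-bit and 32-bit values share SReg_32. A minimal sketch:
static const TargetRegisterClass *sgprClassForHalfWord() {
  return SIRegisterInfo::getSGPRClassForBitWidth(16); // &AMDGPU::SReg_32RegClass
}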
3635
3636bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3637 Register Reg) const {
3638 const TargetRegisterClass *RC;
3639 if (Reg.isVirtual())
3640 RC = MRI.getRegClass(Reg);
3641 else
3642 RC = getPhysRegBaseClass(Reg);
3643 return RC && isSGPRClass(RC);
3644}
3645
3646const TargetRegisterClass *
3647SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3648 unsigned Size = getRegSizeInBits(*SRC);
3649
3650 switch (SRC->getID()) {
3651 default:
3652 break;
3653 case AMDGPU::VS_32_Lo256RegClassID:
3654 case AMDGPU::VS_64_Lo256RegClassID:
3655 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3656 }
3657
3658 const TargetRegisterClass *VRC =
3659 getAllocatableClass(getVGPRClassForBitWidth(Size));
3660 assert(VRC && "Invalid register class size");
3661 return VRC;
3662}
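// Usage sketch (illustrative, not part of this file): rewrite an SGPR class
// constraint to the VGPR class of the same width, e.g. when a value has to be
// moved to the VALU. For SGPR_64 this yields VReg_64 (VReg_64_Align2 on
// subtargets that require aligned tuples).
static const TargetRegisterClass *vgprTwinOf(const SIRegisterInfo &TRI,
                                             const TargetRegisterClass *SGPRRC) {
  return TRI.getEquivalentVGPRClass(SGPRRC);
}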
3663
3664const TargetRegisterClass *
3665SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3666 unsigned Size = getRegSizeInBits(*SRC);
3667 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3668 assert(ARC && "Invalid register class size");
3669 return ARC;
3670}
3671
3672const TargetRegisterClass *
3673SIRegisterInfo::getEquivalentAVClass(const TargetRegisterClass *SRC) const {
3674 unsigned Size = getRegSizeInBits(*SRC);
3675 const TargetRegisterClass *ARC = getVectorSuperClassForBitWidth(Size);
3676 assert(ARC && "Invalid register class size");
3677 return ARC;
3678}
3679
3680const TargetRegisterClass *
3681SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3682 unsigned Size = getRegSizeInBits(*VRC);
3683 if (Size == 32)
3684 return &AMDGPU::SGPR_32RegClass;
3685 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3686 assert(SRC && "Invalid register class size");
3687 return SRC;
3688}
3689
3690const TargetRegisterClass *
3691SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3692 const TargetRegisterClass *SubRC,
3693 unsigned SubIdx) const {
3694 // Ensure this subregister index is aligned in the super register.
3695 const TargetRegisterClass *MatchRC =
3696 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3697 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3698}
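// Illustrative check (not part of this file): ask whether the 64-bit value in
// sub0_sub1 of a 128-bit VGPR tuple is an aligned, compatible access. A null
// result would mean the subregister index is not aligned for that class.
static bool sub0Sub1FitsVReg128(const SIRegisterInfo &TRI) {
  return TRI.getCompatibleSubRegClass(&AMDGPU::VReg_128RegClass,
                                      &AMDGPU::VReg_64RegClass,
                                      AMDGPU::sub0_sub1) != nullptr;
}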
3699
3700bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3701 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3702 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3703 return !ST.hasMFMAInlineLiteralBug();
3704
3705 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3706 OpType <= AMDGPU::OPERAND_SRC_LAST;
3707}
3708
3709bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3710 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3711 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3712 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3713}
3714
3715/// Returns a lowest register that is not used at any point in the function.
3716/// If all registers are used, then this function will return
3717/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3718/// highest unused register.
3719MCRegister SIRegisterInfo::findUnusedRegister(
3720 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3721 const MachineFunction &MF, bool ReserveHighestRegister) const {
3722 if (ReserveHighestRegister) {
3723 for (MCRegister Reg : reverse(*RC))
3724 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3725 return Reg;
3726 } else {
3727 for (MCRegister Reg : *RC)
3728 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3729 return Reg;
3730 }
3731 return MCRegister();
3732}
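// Usage sketch (illustrative, not part of this file): scan SGPR_32 for a
// register that is never used anywhere in the function, e.g. to reserve one
// late without having to spill.
static MCRegister pickUnusedSGPR(const SIRegisterInfo &TRI,
                                 const MachineRegisterInfo &MRI,
                                 const MachineFunction &MF) {
  // Returns an invalid MCRegister() if every allocatable SGPR is in use.
  return TRI.findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass, MF);
}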
3733
3734bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3735 const RegisterBankInfo &RBI,
3736 Register Reg) const {
3737 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3738 if (!RB)
3739 return false;
3740
3741 return !RBI.isDivergentRegBank(RB);
3742}
3743
3744ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3745 unsigned EltSize) const {
3746 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3747 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3748
3749 const unsigned RegHalves = RegBitWidth / 16;
3750 const unsigned EltHalves = EltSize / 2;
3751 assert(RegSplitParts.size() + 1 >= EltHalves);
3752
3753 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3754 const unsigned NumParts = RegHalves / EltHalves;
3755
3756 return ArrayRef(Parts.data(), NumParts);
3757}
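// Worked example (illustrative): splitting a 128-bit tuple into 64-bit
// elements gives RegHalves = 128 / 16 = 8 and EltHalves = 8 / 2 = 4, so
// NumParts = 2, i.e. one subregister index per 64-bit half of the tuple.
static size_t countSplitParts128Into64(const SIRegisterInfo &TRI) {
  return TRI.getRegSplitParts(&AMDGPU::VReg_128RegClass, /*EltSize=*/8).size();
}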
3758
3759const TargetRegisterClass *
3760SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3761 Register Reg) const {
3762 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3763}
3764
3765const TargetRegisterClass *
3766SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3767 const MachineOperand &MO) const {
3768 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3769 return getSubRegisterClass(SrcRC, MO.getSubReg());
3770}
3771
3772bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3773 Register Reg) const {
3774 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3775 // Registers without classes are unaddressable, SGPR-like registers.
3776 return RC && isVGPRClass(RC);
3777}
3778
3779bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3780 Register Reg) const {
3781 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3782
3783 // Registers without classes are unaddressable, SGPR-like registers.
3784 return RC && isAGPRClass(RC);
3785}
3786
3787unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3788 MachineFunction &MF) const {
3789 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3790 switch (RC->getID()) {
3791 default:
3792 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3793 case AMDGPU::VGPR_32RegClassID:
3794 return std::min(
3795 ST.getMaxNumVGPRs(
3796 MinOcc,
3798 ST.getMaxNumVGPRs(MF));
3799 case AMDGPU::SGPR_32RegClassID:
3800 case AMDGPU::SGPR_LO16RegClassID:
3801 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3802 }
3803}
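// Illustrative model (not the actual implementation): the pressure limit for
// a register set is the smaller of the per-wave budget at the function's
// minimum achievable occupancy and the cap already imposed on the function.
static unsigned pressureLimit(unsigned BudgetAtMinOccupancy,
                              unsigned FunctionCap) {
  return std::min(BudgetAtMinOccupancy, FunctionCap);
}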
3804
3805unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3806 unsigned Idx) const {
3807 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3808 case AMDGPU::RegisterPressureSets::VGPR_32:
3809 case AMDGPU::RegisterPressureSets::AGPR_32:
3810 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3811 const_cast<MachineFunction &>(MF));
3812 case AMDGPU::RegisterPressureSets::SReg_32:
3813 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3814 const_cast<MachineFunction &>(MF));
3815 }
3816
3817 llvm_unreachable("Unexpected register pressure set!");
3818}
3819
3820const int *SIRegisterInfo::getRegUnitPressureSets(MCRegUnit RegUnit) const {
3821 static const int Empty[] = { -1 };
3822
3823 if (RegPressureIgnoredUnits[static_cast<unsigned>(RegUnit)])
3824 return Empty;
3825
3826 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3827}
3828
3829bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3830 ArrayRef<MCPhysReg> Order,
3831 SmallVectorImpl<MCPhysReg> &Hints,
3832 const MachineFunction &MF,
3833 const VirtRegMap *VRM,
3834 const LiveRegMatrix *Matrix) const {
3835
3836 const MachineRegisterInfo &MRI = MF.getRegInfo();
3837 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3838
3839 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3840
3841 switch (Hint.first) {
3842 case AMDGPURI::Size32: {
3843 Register Paired = Hint.second;
3844 assert(Paired);
3845 Register PairedPhys;
3846 if (Paired.isPhysical()) {
3847 PairedPhys =
3848 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3849 } else if (VRM && VRM->hasPhys(Paired)) {
3850 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3851 &AMDGPU::VGPR_32RegClass);
3852 }
3853
3854 // Prefer the paired physreg.
3855 if (PairedPhys)
3856 // isLo(Paired) is implicitly true here from the API of
3857 // getMatchingSuperReg.
3858 Hints.push_back(PairedPhys);
3859 return false;
3860 }
3861 case AMDGPURI::Size16: {
3862 Register Paired = Hint.second;
3863 assert(Paired);
3864 Register PairedPhys;
3865 if (Paired.isPhysical()) {
3866 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3867 } else if (VRM && VRM->hasPhys(Paired)) {
3868 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3869 }
3870
3871 // First prefer the paired physreg.
3872 if (PairedPhys)
3873 Hints.push_back(PairedPhys);
3874 else {
3875 // Add all the lo16 physregs.
3876 // When the Paired operand has not yet been assigned a physreg it is
3877 // better to try putting VirtReg in a lo16 register, because possibly
3878 // later Paired can be assigned to the overlapping register and the COPY
3879 // can be eliminated.
3880 for (MCPhysReg PhysReg : Order) {
3881 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3882 continue;
3883 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3884 !MRI.isReserved(PhysReg))
3885 Hints.push_back(PhysReg);
3886 }
3887 }
3888 return false;
3889 }
3890 default:
3891 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3892 VRM);
3893 }
3894}
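// Illustrative scenario (not code from this file): a 16-bit virtual register
// carrying a Size16 hint toward a 32-bit pair already assigned to v1 is hinted
// to v1's lo16 half first, so the connecting COPY can later be folded away.
// If the pair has no assignment yet, every allocatable lo16 half in the
// allocation order is offered instead.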
3895
3896MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3897 // Not a callee saved register.
3898 return AMDGPU::SGPR30_SGPR31;
3899}
3900
3901const TargetRegisterClass *
3902SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3903 const RegisterBank &RB) const {
3904 switch (RB.getID()) {
3905 case AMDGPU::VGPRRegBankID:
3906 return getVGPRClassForBitWidth(
3907 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3908 case AMDGPU::VCCRegBankID:
3909 assert(Size == 1);
3910 return getWaveMaskRegClass();
3911 case AMDGPU::SGPRRegBankID:
3912 return getSGPRClassForBitWidth(std::max(32u, Size));
3913 case AMDGPU::AGPRRegBankID:
3914 return getAGPRClassForBitWidth(std::max(32u, Size));
3915 default:
3916 llvm_unreachable("unknown register bank");
3917 }
3918}
3919
3920const TargetRegisterClass *
3921SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3922 const MachineRegisterInfo &MRI) const {
3923 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3924 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3925 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3926
3927 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3928 return getAllocatableClass(RC);
3929
3930 return nullptr;
3931}
3932
3933MCRegister SIRegisterInfo::getVCC() const {
3934 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3935}
3936
3937MCRegister SIRegisterInfo::getExec() const {
3938 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3939}
3940
3941const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3942 // VGPR tuples have an alignment requirement on gfx90a variants.
3943 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3944 : &AMDGPU::VReg_64RegClass;
3945}
3946
3947// Find reaching register definition
3948MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3949 MachineInstr &Use,
3950 MachineRegisterInfo &MRI,
3951 LiveIntervals *LIS) const {
3952 auto &MDT = LIS->getDomTree();
3953 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3954 SlotIndex DefIdx;
3955
3956 if (Reg.isVirtual()) {
3957 if (!LIS->hasInterval(Reg))
3958 return nullptr;
3959 LiveInterval &LI = LIS->getInterval(Reg);
3960 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3961 : MRI.getMaxLaneMaskForVReg(Reg);
3962 VNInfo *V = nullptr;
3963 if (LI.hasSubRanges()) {
3964 for (auto &S : LI.subranges()) {
3965 if ((S.LaneMask & SubLanes) == SubLanes) {
3966 V = S.getVNInfoAt(UseIdx);
3967 break;
3968 }
3969 }
3970 } else {
3971 V = LI.getVNInfoAt(UseIdx);
3972 }
3973 if (!V)
3974 return nullptr;
3975 DefIdx = V->def;
3976 } else {
3977 // Find last def.
3978 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3979 LiveRange &LR = LIS->getRegUnit(Unit);
3980 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3981 if (!DefIdx.isValid() ||
3982 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3983 LIS->getInstructionFromIndex(V->def)))
3984 DefIdx = V->def;
3985 } else {
3986 return nullptr;
3987 }
3988 }
3989 }
3990
3991 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3992
3993 if (!Def || !MDT.dominates(Def, &Use))
3994 return nullptr;
3995
3996 assert(Def->modifiesRegister(Reg, this));
3997
3998 return Def;
3999}
4000
4001MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
4002 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
4003
4004 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
4005 AMDGPU::SReg_32RegClass,
4006 AMDGPU::AGPR_32RegClass } ) {
4007 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
4008 return Super;
4009 }
4010 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
4011 &AMDGPU::VGPR_32RegClass)) {
4012 return Super;
4013 }
4014
4015 return AMDGPU::NoRegister;
4016}
4017
4018bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
4019 if (!ST.needsAlignedVGPRs())
4020 return true;
4021
4022 if (isVGPRClass(&RC))
4023 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4024 if (isAGPRClass(&RC))
4025 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4026 if (isVectorSuperClass(&RC))
4027 return RC.hasSuperClassEq(
4028 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4029
4030 assert(&RC != &AMDGPU::VS_64RegClass);
4031
4032 return true;
4033}
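// Illustrative check (not part of this file): on a subtarget that requires
// even-aligned VGPR tuples (e.g. gfx90a), plain VReg_64 is not properly
// aligned while VReg_64_Align2 is.
static bool alignedVReg64IsProper(const SIRegisterInfo &TRI) {
  return TRI.isProperlyAlignedRC(AMDGPU::VReg_64_Align2RegClass);
}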
4034
4035ArrayRef<MCPhysReg>
4036SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
4037 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4038}
4039
4040ArrayRef<MCPhysReg>
4041SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
4042 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4043}
4044
4045ArrayRef<MCPhysReg>
4046SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
4047 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4048}
4049
4050unsigned
4051SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
4052 unsigned SubReg) const {
4053 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4054 case SIRCFlags::HasSGPR:
4055 return std::min(128u, getSubRegIdxSize(SubReg));
4056 case SIRCFlags::HasAGPR:
4057 case SIRCFlags::HasVGPR:
4058 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
4059 return std::min(32u, getSubRegIdxSize(SubReg));
4060 default:
4061 break;
4062 }
4063 return 0;
4064}
4065
4066unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4067 const TargetRegisterClass &RC,
4068 bool IncludeCalls) const {
4069 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
4070 ArrayRef<MCPhysReg> Registers =
4071 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4072 ? RC.getRegisters().take_front(NumArchVGPRs)
4073 : RC.getRegisters();
4074 for (MCPhysReg Reg : reverse(Registers))
4075 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4076 return getHWRegIndex(Reg) + 1;
4077 return 0;
4078}
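// Worked example (illustrative): if the highest VGPR touched anywhere in the
// function is v7, the reverse scan stops there and the function returns
// getHWRegIndex(v7) + 1 == 8, the number of VGPRs the function occupies.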
4079
4080SmallVector<StringLiteral>
4081SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4082 const MachineFunction &MF) const {
4083 SmallVector<StringLiteral> RegFlags;
4084 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4085 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4086 RegFlags.push_back("WWM_REG");
4087 return RegFlags;
4088}