SIRegisterInfo.cpp
1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
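// Worked example (mirrors getSubRegFromChannel() further below): an 8-DWORD
// (256-bit) access reads SubRegFromChannelTableWidthMap[8] == 8 and therefore
// uses row 7 of SubRegFromChannelTable, while a width whose entry is 0 (e.g.
// 9 DWORDs) is unsupported and would hit the "Not implemented" assert there.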
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
50 Fn.getContext().diagnose(
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
78struct SGPRSpillBuilder {
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
86 Register SuperReg;
87 MachineBasicBlock::iterator MI;
88 ArrayRef<int16_t> SplitParts;
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
107 RegScavenger *RS;
108 MachineBasicBlock *MBB;
109 MachineFunction &MF;
110 SIMachineFunctionInfo &MFI;
111 const SIInstrInfo &TII;
112 const SIRegisterInfo &TRI;
113 bool IsWave32;
114 Register ExecReg;
115 unsigned MovOpc;
116 unsigned NotOpc;
117
118 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
119 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
120 RegScavenger *RS)
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
124 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
125 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
130 IsWave32(IsWave32) {
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
150 PerVGPRData getPerVGPRData() {
151 PerVGPRData Data;
152 Data.PerVGPR = IsWave32 ? 32 : 64;
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
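// Illustrative values (assuming wave64): spilling a 16-dword SGPR tuple gives
// PerVGPR = 64, NumVGPRs = 1 and VGPRLanes = 0xffff, i.e. all 16 SGPRs fit in
// the low 16 lanes of a single temporary VGPR.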
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
216 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
220 I.addReg(TmpVGPR, RegState::ImplicitDefine);
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // Modifying and restoring exec clobbers SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
238 I.addReg(TmpVGPR, RegState::ImplicitDefine);
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Reload scavenged VGPR from emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
261 .addReg(SavedExecReg, RegState::Kill);
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
265 I.addReg(TmpVGPR, RegState::ImplicitKill);
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
273 I.addReg(TmpVGPR, RegState::ImplicitDefine);
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // Modifying and restoring exec clobbers SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
319 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
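// Typical usage sketch (call sites elided; the spill helpers later in this
// file drive the struct roughly like this):
//   SGPRSpillBuilder SB(TRI, TII, IsWave32, MI, Index, RS);
//   SB.prepare();                          // scavenge TmpVGPR and save exec
//   SB.readWriteTmpVGPR(Offset, IsLoad);   // move lane data through scratch
//   SB.restore();                          // restore TmpVGPR and exec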
325
326} // namespace llvm
327
328SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
344 for (auto Reg : AMDGPU::VGPR_16RegClass) {
345 if (AMDGPU::isHi16Reg(Reg, *this))
346 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
347 }
348
349 // HACK: Until this is fully tablegen'd.
350 static llvm::once_flag InitializeRegSplitPartsFlag;
351
352 static auto InitializeRegSplitPartsOnce = [this]() {
353 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
354 unsigned Size = getSubRegIdxSize(Idx);
355 if (Size & 15)
356 continue;
357 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
358 unsigned Pos = getSubRegIdxOffset(Idx);
359 if (Pos % Size)
360 continue;
361 Pos /= Size;
362 if (Vec.empty()) {
363 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
364 Vec.resize(MaxNumParts);
365 }
366 Vec[Pos] = Idx;
367 }
368 };
369
370 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
371
372 static auto InitializeSubRegFromChannelTableOnce = [this]() {
373 for (auto &Row : SubRegFromChannelTable)
374 Row.fill(AMDGPU::NoSubRegister);
375 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
376 unsigned Width = getSubRegIdxSize(Idx) / 32;
377 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
378 assert(Width < SubRegFromChannelTableWidthMap.size());
379 Width = SubRegFromChannelTableWidthMap[Width];
380 if (Width == 0)
381 continue;
382 unsigned TableIdx = Width - 1;
383 assert(TableIdx < SubRegFromChannelTable.size());
384 assert(Offset < SubRegFromChannelTable[TableIdx].size());
385 SubRegFromChannelTable[TableIdx][Offset] = Idx;
386 }
387 };
388
389 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
390 llvm::call_once(InitializeSubRegFromChannelTableFlag,
391 InitializeSubRegFromChannelTableOnce);
392}
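// Example of what RegSplitParts encodes (sketch): getRegSplitParts(RC, 4) for
// a 128-bit class yields {sub0, sub1, sub2, sub3}, while EltSize == 8 yields
// {sub0_sub1, sub2_sub3}; SGPRSpillBuilder and buildSpillLoadStore rely on
// exactly this decomposition.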
393
394void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
395 MCRegister Reg) const {
396 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
397 Reserved.set(*R);
398}
399
400// Forced to be here by one .inc
401const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
402 const MachineFunction *MF) const {
403 CallingConv::ID CC = MF->getFunction().getCallingConv();
404 switch (CC) {
405 case CallingConv::C:
406 case CallingConv::Fast:
407 case CallingConv::Cold:
408 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
409 : CSR_AMDGPU_SaveList;
410 case CallingConv::AMDGPU_Gfx:
411 case CallingConv::AMDGPU_Gfx_WholeWave:
412 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
413 : CSR_AMDGPU_SI_Gfx_SaveList;
414 case CallingConv::AMDGPU_CS_ChainPreserve:
415 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
416 default: {
417 // Dummy to not crash RegisterClassInfo.
418 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
419 return &NoCalleeSavedReg;
420 }
421 }
422}
423
424const MCPhysReg *
425SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
426 return nullptr;
427}
428
429const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
430 CallingConv::ID CC) const {
431 switch (CC) {
432 case CallingConv::C:
433 case CallingConv::Fast:
434 case CallingConv::Cold:
435 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
436 : CSR_AMDGPU_RegMask;
437 case CallingConv::AMDGPU_Gfx:
438 case CallingConv::AMDGPU_Gfx_WholeWave:
439 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
440 : CSR_AMDGPU_SI_Gfx_RegMask;
441 case CallingConv::AMDGPU_CS_Chain:
442 case CallingConv::AMDGPU_CS_ChainPreserve:
443 // Calls to these functions never return, so we can pretend everything is
444 // preserved.
445 return AMDGPU_AllVGPRs_RegMask;
446 default:
447 return nullptr;
448 }
449}
450
451const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
452 return CSR_AMDGPU_NoRegs_RegMask;
453}
454
455bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
456 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
457}
458
459const TargetRegisterClass *
460SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
461 const MachineFunction &MF) const {
462 // FIXME: Should have a helper function like getEquivalentVGPRClass to get
463 // the equivalent AV class. If one were used here, the verifier would crash
464 // after RegBankSelect in the GISel flow, because the aligned regclasses are
465 // not fully assigned until instruction selection.
466 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
467 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
468 return &AMDGPU::AV_32RegClass;
469 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
470 return &AMDGPU::AV_64RegClass;
471 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
472 RC == &AMDGPU::AReg_64_Align2RegClass)
473 return &AMDGPU::AV_64_Align2RegClass;
474 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
475 return &AMDGPU::AV_96RegClass;
476 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
477 RC == &AMDGPU::AReg_96_Align2RegClass)
478 return &AMDGPU::AV_96_Align2RegClass;
479 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
480 return &AMDGPU::AV_128RegClass;
481 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
482 RC == &AMDGPU::AReg_128_Align2RegClass)
483 return &AMDGPU::AV_128_Align2RegClass;
484 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
485 return &AMDGPU::AV_160RegClass;
486 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
487 RC == &AMDGPU::AReg_160_Align2RegClass)
488 return &AMDGPU::AV_160_Align2RegClass;
489 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
490 return &AMDGPU::AV_192RegClass;
491 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
492 RC == &AMDGPU::AReg_192_Align2RegClass)
493 return &AMDGPU::AV_192_Align2RegClass;
494 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
495 return &AMDGPU::AV_256RegClass;
496 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
497 RC == &AMDGPU::AReg_256_Align2RegClass)
498 return &AMDGPU::AV_256_Align2RegClass;
499 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
500 return &AMDGPU::AV_512RegClass;
501 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
502 RC == &AMDGPU::AReg_512_Align2RegClass)
503 return &AMDGPU::AV_512_Align2RegClass;
504 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
505 return &AMDGPU::AV_1024RegClass;
506 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
507 RC == &AMDGPU::AReg_1024_Align2RegClass)
508 return &AMDGPU::AV_1024_Align2RegClass;
509 }
510
511 return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
512}
513
514Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
515 const SIFrameLowering *TFI = ST.getFrameLowering();
516 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
517
518 // During ISel lowering we always reserve the stack pointer in entry and chain
519 // functions, but never actually want to reference it when accessing our own
520 // frame. If we need a frame pointer we use it, but otherwise we can just use
521 // an immediate "0" which we represent by returning NoRegister.
522 if (FuncInfo->isBottomOfStack()) {
523 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
524 }
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
526 : FuncInfo->getStackPtrOffsetReg();
527}
528
529bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
530 // When we need stack realignment, we can't reference off of the
531 // stack pointer, so we reserve a base pointer.
532 return shouldRealignStack(MF);
533}
534
535Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
536
537const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
538 return AMDGPU_AllVGPRs_RegMask;
539}
540
541const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
542 return AMDGPU_AllAGPRs_RegMask;
543}
544
545const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
546 return AMDGPU_AllVectorRegs_RegMask;
547}
548
549const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
550 return AMDGPU_AllAllocatableSRegs_RegMask;
551}
552
553unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
554 unsigned NumRegs) {
555 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
556 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
557 assert(NumRegIndex && "Not implemented");
558 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
559 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
560}
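// Example (derived from the table construction above): getSubRegFromChannel(2, 2)
// selects row SubRegFromChannelTableWidthMap[2] - 1 == 1 at offset 2, i.e. the
// 64-bit sub-register covering dwords 2-3 (AMDGPU::sub2_sub3).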
561
562MCRegister
563SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
564 const unsigned Align,
565 const TargetRegisterClass *RC) const {
566 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
567 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
568 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
569}
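// Worked example (assuming ST.getMaxNumSGPRs(MF) == 102, Align == 4 and a
// 128-bit RC): BaseIdx = alignDown(102, 4) - 4 == 96, so the returned register
// is the aligned quad starting at s96, i.e. s[96:99].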
570
571MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
572 const MachineFunction &MF) const {
573 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
574}
575
576BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
577 BitVector Reserved(getNumRegs());
578 Reserved.set(AMDGPU::MODE);
579
580 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
581
582 // Reserve special purpose registers.
583 //
584 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
585 // this seems likely to result in bugs, so I'm marking them as reserved.
586 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
587 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
588
589 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
590 reserveRegisterTuples(Reserved, AMDGPU::M0);
591
592 // Reserve src_vccz, src_execz, src_scc.
593 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
594 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
596
597 // Reserve the memory aperture registers
598 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
599 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
604
605 // Reserve async counters pseudo registers
606 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
607 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
608
609 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
610 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
611
612 // Reserve xnack_mask registers - support is not implemented in Codegen.
613 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
614
615 // Reserve lds_direct register - support is not implemented in Codegen.
616 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
617
618 // Reserve Trap Handler registers - support is not implemented in Codegen.
619 reserveRegisterTuples(Reserved, AMDGPU::TBA);
620 reserveRegisterTuples(Reserved, AMDGPU::TMA);
621 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
622 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
629
630 // Reserve null register - it shall never be allocated
631 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
632
633 // Reserve SGPRs.
634 //
635 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
636 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
637 for (const TargetRegisterClass *RC : regclasses()) {
638 if (RC->isBaseClass() && isSGPRClass(RC)) {
639 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
640 for (MCPhysReg Reg : *RC) {
641 unsigned Index = getHWRegIndex(Reg);
642 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
643 Reserved.set(Reg);
644 }
645 }
646 }
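// For example, if MaxNumSGPRs were 56, the loop above reserves s56 and above
// as well as any SGPR tuple whose dword range extends past s55; the
// Index < TotalNumSGPRs guard keeps registers outside the SGPR file (VCC,
// TTMPs, ...) from being swept up here.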
647
648 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
649 if (ScratchRSrcReg != AMDGPU::NoRegister) {
650 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
651 // need to spill.
652 // TODO: May need to reserve a VGPR if doing LDS spilling.
653 reserveRegisterTuples(Reserved, ScratchRSrcReg);
654 }
655
656 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
657 if (LongBranchReservedReg)
658 reserveRegisterTuples(Reserved, LongBranchReservedReg);
659
660 // We have to assume the SP is needed in case there are calls in the function,
661 // which is detected after the function is lowered. If we aren't really going
662 // to need SP, don't bother reserving it.
663 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
664 if (StackPtrReg) {
665 reserveRegisterTuples(Reserved, StackPtrReg);
666 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
667 }
668
669 MCRegister FrameReg = MFI->getFrameOffsetReg();
670 if (FrameReg) {
671 reserveRegisterTuples(Reserved, FrameReg);
672 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
673 }
674
675 if (hasBasePointer(MF)) {
676 MCRegister BasePtrReg = getBaseRegister();
677 reserveRegisterTuples(Reserved, BasePtrReg);
678 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
679 }
680
681 // FIXME: Use same reserved register introduced in D149775
682 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
683 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
684 if (ExecCopyReg)
685 reserveRegisterTuples(Reserved, ExecCopyReg);
686
687 // Reserve VGPRs/AGPRs.
688 //
689 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
690
691 for (const TargetRegisterClass *RC : regclasses()) {
692 if (RC->isBaseClass() && isVGPRClass(RC)) {
693 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
694 for (MCPhysReg Reg : *RC) {
695 unsigned Index = getHWRegIndex(Reg);
696 if (Index + NumRegs > MaxNumVGPRs)
697 Reserved.set(Reg);
698 }
699 }
700 }
701
702 // Reserve all the AGPRs if there are no instructions to use them.
703 if (!ST.hasMAIInsts())
704 MaxNumAGPRs = 0;
705 for (const TargetRegisterClass *RC : regclasses()) {
706 if (RC->isBaseClass() && isAGPRClass(RC)) {
707 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
708 for (MCPhysReg Reg : *RC) {
709 unsigned Index = getHWRegIndex(Reg);
710 if (Index + NumRegs > MaxNumAGPRs)
711 Reserved.set(Reg);
712 }
713 }
714 }
715
716 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
717 // VGPR available at all times.
718 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
719 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
720 }
721
722 // During wwm-regalloc, reserve the registers for per-lane VGPR allocation.
723 // The MFI->getNonWWMRegMask() field holds a valid bitmask only during
724 // wwm-regalloc and is empty otherwise.
725 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
726 if (!NonWWMRegMask.empty()) {
727 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
728 RegI < RegE; ++RegI) {
729 if (NonWWMRegMask.test(RegI))
730 reserveRegisterTuples(Reserved, RegI);
731 }
732 }
733
734 for (Register Reg : MFI->getWWMReservedRegs())
735 reserveRegisterTuples(Reserved, Reg);
736
737 // FIXME: Stop using reserved registers for this.
738 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
739 reserveRegisterTuples(Reserved, Reg);
740
741 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
742 reserveRegisterTuples(Reserved, Reg);
743
744 return Reserved;
745}
746
747bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
748 MCRegister PhysReg) const {
749 return !MF.getRegInfo().isReserved(PhysReg);
750}
751
752bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
753 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
754 // On entry or in chain functions, the base address is 0, so it can't possibly
755 // need any more alignment.
756
757 // FIXME: Should be able to specify the entry frame alignment per calling
758 // convention instead.
759 if (Info->isBottomOfStack())
760 return false;
761
762 return TargetRegisterInfo::shouldRealignStack(MF);
763}
764
765bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
766 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
767 if (Info->isEntryFunction()) {
768 const MachineFrameInfo &MFI = Fn.getFrameInfo();
769 return MFI.hasStackObjects() || MFI.hasCalls();
770 }
771
772 // May need scavenger for dealing with callee saved registers.
773 return true;
774}
775
776bool SIRegisterInfo::requiresFrameIndexScavenging(
777 const MachineFunction &MF) const {
778 // Do not use frame virtual registers. They used to be used for SGPRs, but
779 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
780 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
781 // spill.
782 return false;
783}
784
785bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
786 const MachineFunction &MF) const {
787 const MachineFrameInfo &MFI = MF.getFrameInfo();
788 return MFI.hasStackObjects();
789}
790
791bool SIRegisterInfo::requiresVirtualBaseRegisters(
792 const MachineFunction &) const {
793 // There are no special dedicated stack or frame pointers.
794 return true;
795}
796
797int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
798 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
799
800 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
801 AMDGPU::OpName::offset);
802 return MI->getOperand(OffIdx).getImm();
803}
804
805int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
806 int Idx) const {
807 switch (MI->getOpcode()) {
808 case AMDGPU::V_ADD_U32_e32:
809 case AMDGPU::V_ADD_U32_e64:
810 case AMDGPU::V_ADD_CO_U32_e32: {
811 int OtherIdx = Idx == 1 ? 2 : 1;
812 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
813 return OtherOp.isImm() ? OtherOp.getImm() : 0;
814 }
815 case AMDGPU::V_ADD_CO_U32_e64: {
816 int OtherIdx = Idx == 2 ? 3 : 2;
817 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
818 return OtherOp.isImm() ? OtherOp.getImm() : 0;
819 }
820 default:
821 break;
822 }
823
824 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
825 return 0;
826
827 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
828 AMDGPU::OpName::vaddr) ||
829 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::saddr))) &&
831 "Should never see frame index on non-address operand");
832
833 return getScratchInstrOffset(MI);
834}
835
836static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
837 const MachineInstr &MI) {
838 assert(MI.getDesc().isAdd());
839 const MachineOperand &Src0 = MI.getOperand(1);
840 const MachineOperand &Src1 = MI.getOperand(2);
841
842 if (Src0.isFI()) {
843 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
844 Src1.getReg()));
845 }
846
847 if (Src1.isFI()) {
848 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
849 Src0.getReg()));
850 }
851
852 return false;
853}
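// Sketch of the MIR shapes this recognizes (operands may appear in either
// order):
//   %v = V_ADD_U32_e32 %stack.0, 16, implicit $exec      ; FI + immediate
//   %v = V_ADD_U32_e32 %stack.0, %vgpr, implicit $exec   ; FI + VGPR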
854
855bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
856 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
857 switch (MI->getOpcode()) {
858 case AMDGPU::V_ADD_U32_e32: {
859 // TODO: We could handle this but it requires work to avoid violating
860 // operand restrictions.
861 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
862 !isFIPlusImmOrVGPR(*this, *MI))
863 return false;
864 [[fallthrough]];
865 }
866 case AMDGPU::V_ADD_U32_e64:
867 // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
868 //
869 // Much of the benefit with the MUBUF handling is we avoid duplicating the
870 // shift of the frame register, which isn't needed with scratch.
871 //
872 // materializeFrameBaseRegister doesn't know the register classes of the
873 // uses, and unconditionally uses an s_add_i32, which will end up using a
874 // copy for the vector uses.
875 return !ST.enableFlatScratch();
876 case AMDGPU::V_ADD_CO_U32_e32:
877 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
878 !isFIPlusImmOrVGPR(*this, *MI))
879 return false;
880 // We can't deal with the case where the carry out has a use (though this
881 // should never happen)
882 return MI->getOperand(3).isDead();
883 case AMDGPU::V_ADD_CO_U32_e64:
884 // TODO: Should we check use_empty instead?
885 return MI->getOperand(1).isDead();
886 default:
887 break;
888 }
889
890 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
891 return false;
892
893 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
894
895 const SIInstrInfo *TII = ST.getInstrInfo();
896 if (SIInstrInfo::isMUBUF(*MI))
897 return !TII->isLegalMUBUFImmOffset(FullOffset);
898
899 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
900 SIInstrFlags::FlatScratch);
901}
902
903Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
904 int FrameIdx,
905 int64_t Offset) const {
906 MachineBasicBlock::iterator Ins = MBB->begin();
907 DebugLoc DL; // Defaults to "unknown"
908
909 if (Ins != MBB->end())
910 DL = Ins->getDebugLoc();
911
912 MachineFunction *MF = MBB->getParent();
913 const SIInstrInfo *TII = ST.getInstrInfo();
914 MachineRegisterInfo &MRI = MF->getRegInfo();
915 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
916 : AMDGPU::V_MOV_B32_e32;
917
918 Register BaseReg = MRI.createVirtualRegister(
919 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
920 : &AMDGPU::VGPR_32RegClass);
921
922 if (Offset == 0) {
923 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
924 .addFrameIndex(FrameIdx);
925 return BaseReg;
926 }
927
928 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
929
930 Register FIReg = MRI.createVirtualRegister(
931 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
932 : &AMDGPU::VGPR_32RegClass);
933
934 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
935 .addImm(Offset);
936 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
937 .addFrameIndex(FrameIdx);
938
939 if (ST.enableFlatScratch() ) {
940 // FIXME: Make sure scc isn't live in.
941 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
942 .addReg(OffsetReg, RegState::Kill)
943 .addReg(FIReg)
944 .setOperandDead(3); // scc
945 return BaseReg;
946 }
947
948 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
949 .addReg(OffsetReg, RegState::Kill)
950 .addReg(FIReg)
951 .addImm(0); // clamp bit
952
953 return BaseReg;
954}
955
955
956void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
957 int64_t Offset) const {
958 const SIInstrInfo *TII = ST.getInstrInfo();
959
960 switch (MI.getOpcode()) {
961 case AMDGPU::V_ADD_U32_e32:
962 case AMDGPU::V_ADD_CO_U32_e32: {
963 MachineOperand *FIOp = &MI.getOperand(2);
964 MachineOperand *ImmOp = &MI.getOperand(1);
965 if (!FIOp->isFI())
966 std::swap(FIOp, ImmOp);
967
968 if (!ImmOp->isImm()) {
969 assert(Offset == 0);
970 FIOp->ChangeToRegister(BaseReg, false);
971 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
972 return;
973 }
974
975 int64_t TotalOffset = ImmOp->getImm() + Offset;
976 if (TotalOffset == 0) {
977 MI.setDesc(TII->get(AMDGPU::COPY));
978 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
979 MI.removeOperand(I);
980
981 MI.getOperand(1).ChangeToRegister(BaseReg, false);
982 return;
983 }
984
985 ImmOp->setImm(TotalOffset);
986
987 MachineBasicBlock *MBB = MI.getParent();
988 MachineFunction *MF = MBB->getParent();
990
991 // FIXME: materializeFrameBaseRegister does not know the register class of
992 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
993 // a copy so we have a legal operand and hope the register coalescer can
994 // clean it up.
995 if (isSGPRReg(MRI, BaseReg)) {
996 Register BaseRegVGPR =
997 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
998 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
999 .addReg(BaseReg);
1000 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1001 } else {
1002 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1003 }
1004 return;
1005 }
1006 case AMDGPU::V_ADD_U32_e64:
1007 case AMDGPU::V_ADD_CO_U32_e64: {
1008 int Src0Idx = MI.getNumExplicitDefs();
1009 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1010 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1011 if (!FIOp->isFI())
1012 std::swap(FIOp, ImmOp);
1013
1014 if (!ImmOp->isImm()) {
1015 FIOp->ChangeToRegister(BaseReg, false);
1016 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1017 return;
1018 }
1019
1020 int64_t TotalOffset = ImmOp->getImm() + Offset;
1021 if (TotalOffset == 0) {
1022 MI.setDesc(TII->get(AMDGPU::COPY));
1023
1024 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1025 MI.removeOperand(I);
1026
1027 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1028 } else {
1029 FIOp->ChangeToRegister(BaseReg, false);
1030 ImmOp->setImm(TotalOffset);
1031 }
1032
1033 return;
1034 }
1035 default:
1036 break;
1037 }
1038
1039 bool IsFlat = TII->isFLATScratch(MI);
1040
1041#ifndef NDEBUG
1042 // FIXME: Is it possible to be storing a frame index to itself?
1043 bool SeenFI = false;
1044 for (const MachineOperand &MO: MI.operands()) {
1045 if (MO.isFI()) {
1046 if (SeenFI)
1047 llvm_unreachable("should not see multiple frame indices");
1048
1049 SeenFI = true;
1050 }
1051 }
1052#endif
1053
1054 MachineOperand *FIOp =
1055 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1056 : AMDGPU::OpName::vaddr);
1057
1058 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1059 int64_t NewOffset = OffsetOp->getImm() + Offset;
1060
1061 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1062 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1063
1064 if (IsFlat) {
1065 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1067 "offset should be legal");
1068 FIOp->ChangeToRegister(BaseReg, false);
1069 OffsetOp->setImm(NewOffset);
1070 return;
1071 }
1072
1073#ifndef NDEBUG
1074 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1075 assert(SOffset->isImm() && SOffset->getImm() == 0);
1076#endif
1077
1078 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1079
1080 FIOp->ChangeToRegister(BaseReg, false);
1081 OffsetOp->setImm(NewOffset);
1082}
1083
1084bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1085 Register BaseReg,
1086 int64_t Offset) const {
1087
1088 switch (MI->getOpcode()) {
1089 case AMDGPU::V_ADD_U32_e32:
1090 case AMDGPU::V_ADD_CO_U32_e32:
1091 return true;
1092 case AMDGPU::V_ADD_U32_e64:
1093 case AMDGPU::V_ADD_CO_U32_e64:
1094 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1095 default:
1096 break;
1097 }
1098
1099 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1100 return false;
1101
1102 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1103
1104 const SIInstrInfo *TII = ST.getInstrInfo();
1105 if (SIInstrInfo::isMUBUF(*MI))
1106 return TII->isLegalMUBUFImmOffset(NewOffset);
1107
1108 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1109 SIInstrFlags::FlatScratch);
1110}
1111
1112const TargetRegisterClass *
1114 // This is inaccurate. It depends on the instruction and address space. The
1115 // only place where we should hit this is for dealing with frame indexes /
1116 // private accesses, so this is correct in that case.
1117 return &AMDGPU::VGPR_32RegClass;
1118}
1119
1120const TargetRegisterClass *
1122 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1123}
1124
1126 const SIInstrInfo *TII) {
1127
1128 unsigned Op = MI.getOpcode();
1129 switch (Op) {
1130 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1131 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1132 // FIXME: This assumes the mask is statically known and not computed at
1133 // runtime. However, some ABIs may want to compute the mask dynamically and
1134 // this will need to be updated.
1135 return llvm::popcount(
1136 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1137 case AMDGPU::SI_SPILL_S1024_SAVE:
1138 case AMDGPU::SI_SPILL_S1024_RESTORE:
1139 case AMDGPU::SI_SPILL_V1024_SAVE:
1140 case AMDGPU::SI_SPILL_V1024_RESTORE:
1141 case AMDGPU::SI_SPILL_A1024_SAVE:
1142 case AMDGPU::SI_SPILL_A1024_RESTORE:
1143 case AMDGPU::SI_SPILL_AV1024_SAVE:
1144 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1145 return 32;
1146 case AMDGPU::SI_SPILL_S512_SAVE:
1147 case AMDGPU::SI_SPILL_S512_RESTORE:
1148 case AMDGPU::SI_SPILL_V512_SAVE:
1149 case AMDGPU::SI_SPILL_V512_RESTORE:
1150 case AMDGPU::SI_SPILL_A512_SAVE:
1151 case AMDGPU::SI_SPILL_A512_RESTORE:
1152 case AMDGPU::SI_SPILL_AV512_SAVE:
1153 case AMDGPU::SI_SPILL_AV512_RESTORE:
1154 return 16;
1155 case AMDGPU::SI_SPILL_S384_SAVE:
1156 case AMDGPU::SI_SPILL_S384_RESTORE:
1157 case AMDGPU::SI_SPILL_V384_SAVE:
1158 case AMDGPU::SI_SPILL_V384_RESTORE:
1159 case AMDGPU::SI_SPILL_A384_SAVE:
1160 case AMDGPU::SI_SPILL_A384_RESTORE:
1161 case AMDGPU::SI_SPILL_AV384_SAVE:
1162 case AMDGPU::SI_SPILL_AV384_RESTORE:
1163 return 12;
1164 case AMDGPU::SI_SPILL_S352_SAVE:
1165 case AMDGPU::SI_SPILL_S352_RESTORE:
1166 case AMDGPU::SI_SPILL_V352_SAVE:
1167 case AMDGPU::SI_SPILL_V352_RESTORE:
1168 case AMDGPU::SI_SPILL_A352_SAVE:
1169 case AMDGPU::SI_SPILL_A352_RESTORE:
1170 case AMDGPU::SI_SPILL_AV352_SAVE:
1171 case AMDGPU::SI_SPILL_AV352_RESTORE:
1172 return 11;
1173 case AMDGPU::SI_SPILL_S320_SAVE:
1174 case AMDGPU::SI_SPILL_S320_RESTORE:
1175 case AMDGPU::SI_SPILL_V320_SAVE:
1176 case AMDGPU::SI_SPILL_V320_RESTORE:
1177 case AMDGPU::SI_SPILL_A320_SAVE:
1178 case AMDGPU::SI_SPILL_A320_RESTORE:
1179 case AMDGPU::SI_SPILL_AV320_SAVE:
1180 case AMDGPU::SI_SPILL_AV320_RESTORE:
1181 return 10;
1182 case AMDGPU::SI_SPILL_S288_SAVE:
1183 case AMDGPU::SI_SPILL_S288_RESTORE:
1184 case AMDGPU::SI_SPILL_V288_SAVE:
1185 case AMDGPU::SI_SPILL_V288_RESTORE:
1186 case AMDGPU::SI_SPILL_A288_SAVE:
1187 case AMDGPU::SI_SPILL_A288_RESTORE:
1188 case AMDGPU::SI_SPILL_AV288_SAVE:
1189 case AMDGPU::SI_SPILL_AV288_RESTORE:
1190 return 9;
1191 case AMDGPU::SI_SPILL_S256_SAVE:
1192 case AMDGPU::SI_SPILL_S256_RESTORE:
1193 case AMDGPU::SI_SPILL_V256_SAVE:
1194 case AMDGPU::SI_SPILL_V256_RESTORE:
1195 case AMDGPU::SI_SPILL_A256_SAVE:
1196 case AMDGPU::SI_SPILL_A256_RESTORE:
1197 case AMDGPU::SI_SPILL_AV256_SAVE:
1198 case AMDGPU::SI_SPILL_AV256_RESTORE:
1199 return 8;
1200 case AMDGPU::SI_SPILL_S224_SAVE:
1201 case AMDGPU::SI_SPILL_S224_RESTORE:
1202 case AMDGPU::SI_SPILL_V224_SAVE:
1203 case AMDGPU::SI_SPILL_V224_RESTORE:
1204 case AMDGPU::SI_SPILL_A224_SAVE:
1205 case AMDGPU::SI_SPILL_A224_RESTORE:
1206 case AMDGPU::SI_SPILL_AV224_SAVE:
1207 case AMDGPU::SI_SPILL_AV224_RESTORE:
1208 return 7;
1209 case AMDGPU::SI_SPILL_S192_SAVE:
1210 case AMDGPU::SI_SPILL_S192_RESTORE:
1211 case AMDGPU::SI_SPILL_V192_SAVE:
1212 case AMDGPU::SI_SPILL_V192_RESTORE:
1213 case AMDGPU::SI_SPILL_A192_SAVE:
1214 case AMDGPU::SI_SPILL_A192_RESTORE:
1215 case AMDGPU::SI_SPILL_AV192_SAVE:
1216 case AMDGPU::SI_SPILL_AV192_RESTORE:
1217 return 6;
1218 case AMDGPU::SI_SPILL_S160_SAVE:
1219 case AMDGPU::SI_SPILL_S160_RESTORE:
1220 case AMDGPU::SI_SPILL_V160_SAVE:
1221 case AMDGPU::SI_SPILL_V160_RESTORE:
1222 case AMDGPU::SI_SPILL_A160_SAVE:
1223 case AMDGPU::SI_SPILL_A160_RESTORE:
1224 case AMDGPU::SI_SPILL_AV160_SAVE:
1225 case AMDGPU::SI_SPILL_AV160_RESTORE:
1226 return 5;
1227 case AMDGPU::SI_SPILL_S128_SAVE:
1228 case AMDGPU::SI_SPILL_S128_RESTORE:
1229 case AMDGPU::SI_SPILL_V128_SAVE:
1230 case AMDGPU::SI_SPILL_V128_RESTORE:
1231 case AMDGPU::SI_SPILL_A128_SAVE:
1232 case AMDGPU::SI_SPILL_A128_RESTORE:
1233 case AMDGPU::SI_SPILL_AV128_SAVE:
1234 case AMDGPU::SI_SPILL_AV128_RESTORE:
1235 return 4;
1236 case AMDGPU::SI_SPILL_S96_SAVE:
1237 case AMDGPU::SI_SPILL_S96_RESTORE:
1238 case AMDGPU::SI_SPILL_V96_SAVE:
1239 case AMDGPU::SI_SPILL_V96_RESTORE:
1240 case AMDGPU::SI_SPILL_A96_SAVE:
1241 case AMDGPU::SI_SPILL_A96_RESTORE:
1242 case AMDGPU::SI_SPILL_AV96_SAVE:
1243 case AMDGPU::SI_SPILL_AV96_RESTORE:
1244 return 3;
1245 case AMDGPU::SI_SPILL_S64_SAVE:
1246 case AMDGPU::SI_SPILL_S64_RESTORE:
1247 case AMDGPU::SI_SPILL_V64_SAVE:
1248 case AMDGPU::SI_SPILL_V64_RESTORE:
1249 case AMDGPU::SI_SPILL_A64_SAVE:
1250 case AMDGPU::SI_SPILL_A64_RESTORE:
1251 case AMDGPU::SI_SPILL_AV64_SAVE:
1252 case AMDGPU::SI_SPILL_AV64_RESTORE:
1253 return 2;
1254 case AMDGPU::SI_SPILL_S32_SAVE:
1255 case AMDGPU::SI_SPILL_S32_RESTORE:
1256 case AMDGPU::SI_SPILL_V32_SAVE:
1257 case AMDGPU::SI_SPILL_V32_RESTORE:
1258 case AMDGPU::SI_SPILL_A32_SAVE:
1259 case AMDGPU::SI_SPILL_A32_RESTORE:
1260 case AMDGPU::SI_SPILL_AV32_SAVE:
1261 case AMDGPU::SI_SPILL_AV32_RESTORE:
1262 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1263 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1264 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1265 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1266 case AMDGPU::SI_SPILL_V16_SAVE:
1267 case AMDGPU::SI_SPILL_V16_RESTORE:
1268 return 1;
1269 default: llvm_unreachable("Invalid spill opcode");
1270 }
1271}
1272
1273static int getOffsetMUBUFStore(unsigned Opc) {
1274 switch (Opc) {
1275 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1276 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1277 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1278 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1279 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1280 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1281 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1282 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1283 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1284 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1285 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1286 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1287 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1288 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1289 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1290 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1291 default:
1292 return -1;
1293 }
1294}
1295
1296static int getOffsetMUBUFLoad(unsigned Opc) {
1297 switch (Opc) {
1298 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1299 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1300 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1301 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1302 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1303 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1304 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1305 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1306 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1307 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1308 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1309 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1310 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1311 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1312 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1313 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1314 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1315 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1316 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1317 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1318 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1319 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1320 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1321 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1322 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1323 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1324 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1325 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1326 default:
1327 return -1;
1328 }
1329}
1330
1331static int getOffenMUBUFStore(unsigned Opc) {
1332 switch (Opc) {
1333 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1334 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1335 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1336 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1337 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1338 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1339 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1340 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1341 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1342 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1343 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1344 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1345 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1346 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1347 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1348 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1349 default:
1350 return -1;
1351 }
1352}
1353
1354static int getOffenMUBUFLoad(unsigned Opc) {
1355 switch (Opc) {
1356 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1357 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1358 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1359 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1360 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1361 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1362 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1363 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1364 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1365 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1366 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1367 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1368 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1369 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1370 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1371 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1372 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1373 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1374 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1375 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1376 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1377 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1378 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1379 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1380 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1381 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1382 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1383 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1384 default:
1385 return -1;
1386 }
1387}
1388
1389static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
1390 MachineBasicBlock &MBB,
1391 MachineBasicBlock::iterator MI,
1392 int Index, unsigned Lane,
1393 unsigned ValueReg, bool IsKill) {
1394 MachineFunction *MF = MBB.getParent();
1395 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1396 const SIInstrInfo *TII = ST.getInstrInfo();
1397
1398 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1399
1400 if (Reg == AMDGPU::NoRegister)
1401 return MachineInstrBuilder();
1402
1403 bool IsStore = MI->mayStore();
1404 MachineRegisterInfo &MRI = MF->getRegInfo();
1405 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1406
1407 unsigned Dst = IsStore ? Reg : ValueReg;
1408 unsigned Src = IsStore ? ValueReg : Reg;
1409 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1410 DebugLoc DL = MI->getDebugLoc();
1411 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1412 // The spiller during regalloc may restore a spilled register to its
1413 // superclass. That can leave AGPR spills restored to VGPRs or the other way
1414 // around, so that src and dst have identical regclasses at this point; a
1415 // plain copy is all that is needed in such cases.
1416 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1417 .addReg(Src, getKillRegState(IsKill));
1419 return CopyMIB;
1420 }
1421 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1422 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1423
1424 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1425 .addReg(Src, getKillRegState(IsKill));
1427 return MIB;
1428}
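// Example of the result (sketch, assuming MFI->getVGPRToAGPRSpill(Index, Lane)
// returned $agpr0): storing $vgpr1 emits
//   $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec
// and the matching reload uses V_ACCVGPR_READ_B32_e64 in the other direction.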
1429
1430// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1431// need to handle the case where an SGPR may need to be spilled while spilling.
1432static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1433 MachineFrameInfo &MFI,
1434 MachineBasicBlock::iterator MI,
1435 int Index,
1436 int64_t Offset) {
1437 const SIInstrInfo *TII = ST.getInstrInfo();
1438 MachineBasicBlock *MBB = MI->getParent();
1439 const DebugLoc &DL = MI->getDebugLoc();
1440 bool IsStore = MI->mayStore();
1441
1442 unsigned Opc = MI->getOpcode();
1443 int LoadStoreOp = IsStore ?
1444 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1445 if (LoadStoreOp == -1)
1446 return false;
1447
1448 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1449 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1450 return true;
1451
1452 MachineInstrBuilder NewMI =
1453 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1454 .add(*Reg)
1455 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1456 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1457 .addImm(Offset)
1458 .addImm(0) // cpol
1459 .addImm(0) // swz
1460 .cloneMemRefs(*MI);
1461
1462 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1463 AMDGPU::OpName::vdata_in);
1464 if (VDataIn)
1465 NewMI.add(*VDataIn);
1466 return true;
1467}
1468
1469static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1470 unsigned LoadStoreOp,
1471 unsigned EltSize) {
1472 bool IsStore = TII->get(LoadStoreOp).mayStore();
1473 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1474 bool UseST =
1475 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1476
1477 // Handle block load/store first.
1478 if (TII->isBlockLoadStore(LoadStoreOp))
1479 return LoadStoreOp;
1480
1481 switch (EltSize) {
1482 case 4:
1483 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1484 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1485 break;
1486 case 8:
1487 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1488 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1489 break;
1490 case 12:
1491 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1492 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1493 break;
1494 case 16:
1495 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1496 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1497 break;
1498 default:
1499 llvm_unreachable("Unexpected spill load/store size!");
1500 }
1501
1502 if (HasVAddr)
1503 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1504 else if (UseST)
1505 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1506
1507 return LoadStoreOp;
1508}
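// Example mapping (follows directly from the code above): a 16-byte store with
// a vaddr operand first becomes SCRATCH_STORE_DWORDX4_SADDR and is then
// rewritten to its SV form via AMDGPU::getFlatScratchInstSVfromSS(); with
// neither vaddr nor saddr it is rewritten to the ST form instead.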
1509
1510void SIRegisterInfo::buildSpillLoadStore(
1511 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1512 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1513 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1514 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1515 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1516
1517 MachineFunction *MF = MBB.getParent();
1518 const SIInstrInfo *TII = ST.getInstrInfo();
1519 const MachineFrameInfo &MFI = MF->getFrameInfo();
1520 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1521
1522 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1523 bool IsStore = Desc->mayStore();
1524 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1525 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1526
1527 bool CanClobberSCC = false;
1528 bool Scavenged = false;
1529 MCRegister SOffset = ScratchOffsetReg;
1530
1531 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1532 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1533 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1534 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1535
1536 // Always use 4 byte operations for AGPRs because we need to scavenge
1537 // a temporary VGPR.
1538 // If we're using a block operation, the element should be the whole block.
1539 unsigned EltSize = IsBlock ? RegWidth
1540 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1541 : 4u;
1542 unsigned NumSubRegs = RegWidth / EltSize;
1543 unsigned Size = NumSubRegs * EltSize;
1544 unsigned RemSize = RegWidth - Size;
1545 unsigned NumRemSubRegs = RemSize ? 1 : 0;
1546 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1547 int64_t MaterializedOffset = Offset;
1548
1549 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1550 int64_t ScratchOffsetRegDelta = 0;
1551
1552 if (IsFlat && EltSize > 4) {
1553 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1554 Desc = &TII->get(LoadStoreOp);
1555 }
1556
1557 Align Alignment = MFI.getObjectAlign(Index);
1558 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1559
1560 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1561 "unexpected VGPR spill offset");
1562
1563 // Track a VGPR to use for a constant offset we need to materialize.
1564 Register TmpOffsetVGPR;
1565
1566 // Track a VGPR to use as an intermediate value.
1567 Register TmpIntermediateVGPR;
1568 bool UseVGPROffset = false;
1569
1570 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1571 // combination.
1572 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1573 int64_t VOffset) {
1574 // We are using a VGPR offset
1575 if (IsFlat && SGPRBase) {
1576 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1577 // SGPR, so perform the add as vector.
1578 // We don't need a base SGPR in the kernel.
1579
1580 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1581 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1582 .addReg(SGPRBase)
1583 .addImm(VOffset)
1584 .addImm(0); // clamp
1585 } else {
1586 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1587 .addReg(SGPRBase);
1588 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1589 .addImm(VOffset)
1590 .addReg(TmpOffsetVGPR);
1591 }
1592 } else {
1593 assert(TmpOffsetVGPR);
1594 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1595 .addImm(VOffset);
1596 }
1597 };
1598
1599 bool IsOffsetLegal =
1600 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1601 SIInstrFlags::FlatScratch)
1602 : TII->isLegalMUBUFImmOffset(MaxOffset);
1603 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1604 SOffset = MCRegister();
1605
1606 // We don't have access to the register scavenger if this function is called
1607 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1608 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1609 // entry.
1610 if (RS) {
1611 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1612
1613 // Piggy back on the liveness scan we just did see if SCC is dead.
1614 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1615 } else if (LiveUnits) {
1616 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1617 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1618 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1619 SOffset = Reg;
1620 break;
1621 }
1622 }
1623 }
1624
1625 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1626 SOffset = Register();
1627
1628 if (!SOffset) {
1629 UseVGPROffset = true;
1630
1631 if (RS) {
1632 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1633 } else {
1634 assert(LiveUnits);
1635 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1636 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1637 TmpOffsetVGPR = Reg;
1638 break;
1639 }
1640 }
1641 }
1642
1643 assert(TmpOffsetVGPR);
1644 } else if (!SOffset && CanClobberSCC) {
1645 // There are no free SGPRs, and we are in the process of spilling VGPRs too.
1646 // Since we need a VGPR in order to spill SGPRs (this is true on SI/CI, and
1647 // on VI it remains true until we implement spilling using scalar stores),
1648 // we have no way to free up an SGPR. Our solution here is to add the offset
1649 // directly to the ScratchOffset or StackPtrOffset register, and then
1650 // subtract the offset after the spill to return the register to its
1651 // original value.
1652
1653 // TODO: If we don't have to do an emergency stack slot spill, converting
1654 // to use the VGPR offset is fewer instructions.
1655 if (!ScratchOffsetReg)
1656 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1657 SOffset = ScratchOffsetReg;
1658 ScratchOffsetRegDelta = Offset;
1659 } else {
1660 Scavenged = true;
1661 }
1662
1663 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1664 // we can simplify the adjustment of Offset here to just scale with
1665 // WavefrontSize.
1666 if (!IsFlat && !UseVGPROffset)
1667 Offset *= ST.getWavefrontSize();
1668
1669 if (!UseVGPROffset && !SOffset)
1670 report_fatal_error("could not scavenge SGPR to spill in entry function");
1671
1672 if (UseVGPROffset) {
1673 // We are using a VGPR offset
1674 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1675 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1676 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1677 } else {
1678 assert(Offset != 0);
1679 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1680 .addReg(ScratchOffsetReg)
1681 .addImm(Offset);
1682 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1683 }
1684
1685 Offset = 0;
1686 }
1687
1688 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1689 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1690 && "Unexpected vaddr for flat scratch with a FI operand");
1691
1692 if (UseVGPROffset) {
1693 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1694 } else {
1695 assert(ST.hasFlatScratchSTMode());
1696 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1697 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1698 }
1699
1700 Desc = &TII->get(LoadStoreOp);
1701 }
1702
1703 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1704 ++i, RegOffset += EltSize) {
1705 if (i == NumSubRegs) {
1706 EltSize = RemSize;
1707 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1708 }
1709 Desc = &TII->get(LoadStoreOp);
1710
1711 if (!IsFlat && UseVGPROffset) {
1712 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1713 : getOffenMUBUFLoad(LoadStoreOp);
1714 Desc = &TII->get(NewLoadStoreOp);
1715 }
1716
1717 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1718 // If we are spilling an AGPR beyond the range of the memory instruction
1719 // offset and need to use a VGPR offset, we ideally have at least 2
1720 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1721 // recycle the VGPR used for the offset, which requires resetting it after
1722 // each subregister.
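// For example, when an AGPR tuple is spilled with a VGPR offset and only one
// scratch VGPR is available, that VGPR alternately holds the materialized
// offset and the accvgpr_read/accvgpr_write intermediate, so the offset is
// rebuilt here before each subregister's memory access.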
1723
1724 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1725 }
1726
1727 unsigned NumRegs = EltSize / 4;
1728 Register SubReg = e == 1
1729 ? ValueReg
1730 : Register(getSubReg(ValueReg,
1731 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1732
1733 unsigned SOffsetRegState = 0;
1734 unsigned SrcDstRegState = getDefRegState(!IsStore);
1735 const bool IsLastSubReg = i + 1 == e;
1736 const bool IsFirstSubReg = i == 0;
1737 if (IsLastSubReg) {
1738 SOffsetRegState |= getKillRegState(Scavenged);
1739 // The last implicit use carries the "Kill" flag.
1740 SrcDstRegState |= getKillRegState(IsKill);
1741 }
1742
1743 // Make sure the whole register is defined if there are undef components by
1744 // adding an implicit def of the super-reg on the first instruction.
1745 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1746 bool NeedSuperRegImpOperand = e > 1;
1747
1748 // Remaining element size to spill into memory after some parts of it
1749 // were spilled into either AGPRs or VGPRs.
1750 unsigned RemEltSize = EltSize;
1751
1752 // AGPRs used to spill VGPRs (and vice versa) are allocated in reverse
1753 // order, starting from the last lane. If a register cannot be completely
1754 // spilled into another register, this ensures that the alignment of the
1755 // remainder does not change. For targets with a VGPR alignment requirement
1756 // this is important when flat scratch is used, as we might otherwise get a
1757 // scratch_load or scratch_store of an unaligned register.
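// For example, when spilling a 128-bit element with only two AGPRs free,
// lanes 3 and 2 are copied to AGPRs first; the remaining lanes 0-1 form an
// aligned 64-bit remainder that can still be stored with an aligned
// scratch_store.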
1758 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1759 LaneE = RegOffset / 4;
1760 Lane >= LaneE; --Lane) {
1761 bool IsSubReg = e > 1 || EltSize > 4;
1762 Register Sub = IsSubReg
1763 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1764 : ValueReg;
1765 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1766 if (!MIB.getInstr())
1767 break;
1768 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1769 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1770 NeedSuperRegDef = false;
1771 }
1772 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1773 NeedSuperRegImpOperand = true;
1774 unsigned State = SrcDstRegState;
1775 if (!IsLastSubReg || (Lane != LaneE))
1776 State &= ~RegState::Kill;
1777 if (!IsFirstSubReg || (Lane != LaneS))
1778 State &= ~RegState::Define;
1779 MIB.addReg(ValueReg, RegState::Implicit | State);
1780 }
1781 RemEltSize -= 4;
1782 }
1783
1784 if (!RemEltSize) // Fully spilled into AGPRs.
1785 continue;
1786
1787 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1788 assert(IsFlat && EltSize > 4);
1789
1790 unsigned NumRegs = RemEltSize / 4;
1791 SubReg = Register(getSubReg(ValueReg,
1792 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1793 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1794 Desc = &TII->get(Opc);
1795 }
1796
1797 unsigned FinalReg = SubReg;
1798
1799 if (IsAGPR) {
1800 assert(EltSize == 4);
1801
1802 if (!TmpIntermediateVGPR) {
1803 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1804 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1805 }
1806 if (IsStore) {
1807 auto AccRead = BuildMI(MBB, MI, DL,
1808 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1809 TmpIntermediateVGPR)
1810 .addReg(SubReg, getKillRegState(IsKill));
1811 if (NeedSuperRegDef)
1812 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1813 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1814 AccRead.addReg(ValueReg, RegState::Implicit);
1816 }
1817 SubReg = TmpIntermediateVGPR;
1818 } else if (UseVGPROffset) {
1819 if (!TmpOffsetVGPR) {
1820 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1821 MI, false, 0);
1822 RS->setRegUsed(TmpOffsetVGPR);
1823 }
1824 }
1825
1826 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1827 MachineMemOperand *NewMMO =
1828 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1829 commonAlignment(Alignment, RegOffset));
1830
1831 auto MIB =
1832 BuildMI(MBB, MI, DL, *Desc)
1833 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1834
1835 if (UseVGPROffset) {
1836 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1837 // intermediate accvgpr_write.
1838 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1839 }
1840
1841 if (!IsFlat)
1842 MIB.addReg(FuncInfo->getScratchRSrcReg());
1843
1844 if (SOffset == AMDGPU::NoRegister) {
1845 if (!IsFlat) {
1846 if (UseVGPROffset && ScratchOffsetReg) {
1847 MIB.addReg(ScratchOffsetReg);
1848 } else {
1849 assert(FuncInfo->isBottomOfStack());
1850 MIB.addImm(0);
1851 }
1852 }
1853 } else {
1854 MIB.addReg(SOffset, SOffsetRegState);
1855 }
1856
1857 MIB.addImm(Offset + RegOffset);
1858
1859 bool LastUse = MMO->getFlags() & MOLastUse;
1860 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1861
1862 if (!IsFlat)
1863 MIB.addImm(0); // swz
1864 MIB.addMemOperand(NewMMO);
1865
1866 if (!IsAGPR && NeedSuperRegDef)
1867 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1868
1869 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1870 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1871 FinalReg)
1872 .addReg(TmpIntermediateVGPR, RegState::Kill);
1874 }
1875
1876 bool IsSrcDstDef = SrcDstRegState & RegState::Define;
1877 if (NeedSuperRegImpOperand &&
1878 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef)))
1879 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1880
1881 // The epilog restore of a wwm-scratch register can cause an undesired
1882 // optimization during machine-cp after PrologEpilogInserter if the same
1883 // register was assigned for return value ABI lowering with a COPY
1884 // instruction. As shown below, with the epilog reload the earlier COPY
1885 // appears to be dead during machine-cp.
1886 // ...
1887 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1888 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1889 // ...
1890 // Epilog block:
1891 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1892 // ...
1893 // WWM spill restore to preserve the inactive lanes of v0.
1894 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1895 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1896 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1897 // ...
1898 // SI_RETURN implicit $vgpr0
1899 // ...
1900 // To fix it, mark the same reg as a tied op for such restore instructions
1901 // so that it marks a usage for the preceding COPY.
1902 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1903 MI->readsRegister(SubReg, this)) {
1904 MIB.addReg(SubReg, RegState::Implicit);
1905 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1906 }
1907
1908 // If we're building a block load, we should add artificial uses for the
1909 // CSR VGPRs that are *not* being transferred. This is because liveness
1910 // analysis is not aware of the mask, so we need to somehow inform it that
1911 // those registers are not available before the load and they should not be
1912 // scavenged.
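// For example, a callee-saved VGPR that lies within the block's register
// range but whose mask bit is clear gets an implicit use on the load, so it
// is not considered free (and scavenged) across the block transfer.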
1913 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1914 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1915 }
1916
1917 if (ScratchOffsetRegDelta != 0) {
1918 // Subtract the offset we added to the ScratchOffset register.
1919 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1920 .addReg(SOffset)
1921 .addImm(-ScratchOffsetRegDelta);
1922 }
1923}
1924
1925 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1926 Register BlockReg) const {
1927 const MachineFunction *MF = MIB->getParent()->getParent();
1928 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1929 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1930 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
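// A set bit i in Mask means BaseVGPR + i is covered by the block transfer
// itself; callee-saved registers in the range whose bits are clear are kept
// alive via the implicit uses added below.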
1931 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1932 if (!(Mask & (1 << RegOffset)) &&
1933 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1934 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1935}
1936
1937 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1938 int Offset, bool IsLoad,
1939 bool IsKill) const {
1940 // Load/store VGPR
1941 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1942 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1943
1944 Register FrameReg =
1945 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1946 ? getBaseRegister()
1947 : getFrameRegister(SB.MF);
1948
1949 Align Alignment = FrameInfo.getObjectAlign(Index);
1950 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1951 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1952 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1953 SB.EltSize, Alignment);
1954
1955 if (IsLoad) {
1956 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1957 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1958 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1959 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1960 } else {
1961 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1962 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1963 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1964 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1965 // This only ever adds one VGPR spill
1966 SB.MFI.addToSpilledVGPRs(1);
1967 }
1968}
1969
1970 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1971 RegScavenger *RS, SlotIndexes *Indexes,
1972 LiveIntervals *LIS, bool OnlyToVGPR,
1973 bool SpillToPhysVGPRLane) const {
1974 assert(!MI->getOperand(0).isUndef() &&
1975 "undef spill should have been deleted earlier");
1976
1977 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1978
1979 ArrayRef<SpilledReg> VGPRSpills =
1980 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1981 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1982 bool SpillToVGPR = !VGPRSpills.empty();
1983 if (OnlyToVGPR && !SpillToVGPR)
1984 return false;
1985
1986 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1987 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1988
1989 if (SpillToVGPR) {
1990
1991 // Since the stack slot coloring pass tries to optimize SGPR spills,
1992 // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
1993 // spills of different sizes. The number of VGPR lanes allotted equals
1994 // the size of the largest SGPR being spilled into them.
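// For example, if an s64 spill and an s128 spill are assigned the same stack
// slot by coloring, the slot is backed by four VGPR lanes and the s64 spill
// uses only the first two of them.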
1995 assert(SB.NumSubRegs <= VGPRSpills.size() &&
1996 "Num of SGPRs spilled should be less than or equal to num of "
1997 "the VGPR lanes.");
1998
1999 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2000 Register SubReg =
2001 SB.NumSubRegs == 1
2002 ? SB.SuperReg
2003 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2004 SpilledReg Spill = VGPRSpills[i];
2005
2006 bool IsFirstSubreg = i == 0;
2007 bool IsLastSubreg = i == SB.NumSubRegs - 1;
2008 bool UseKill = SB.IsKill && IsLastSubreg;
2009
2010
2011 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2012 // spill to this specific vgpr in the first basic block.
2013 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2014 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2015 .addReg(SubReg, getKillRegState(UseKill))
2016 .addImm(Spill.Lane)
2017 .addReg(Spill.VGPR);
2018 if (Indexes) {
2019 if (IsFirstSubreg)
2020 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2021 else
2022 Indexes->insertMachineInstrInMaps(*MIB);
2023 }
2024
2025 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2026 // We may be spilling a super-register which is only partially defined,
2027 // and need to ensure later spills think the value is defined.
2028 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2029 }
2030
2031 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2032 MIB.addReg(SB.SuperReg, RegState::Implicit);
2033
2034 // FIXME: Since this spills to another register instead of an actual
2035 // frame index, we should delete the frame index when all references to
2036 // it are fixed.
2037 }
2038 } else {
2039 SB.prepare();
2040
2041 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2042 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2043
2044 // Per VGPR helper data
2045 auto PVD = SB.getPerVGPRData();
2046
2047 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2048 unsigned TmpVGPRFlags = RegState::Undef;
2049
2050 // Write sub registers into the VGPR
2051 for (unsigned i = Offset * PVD.PerVGPR,
2052 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2053 i < e; ++i) {
2054 Register SubReg =
2055 SB.NumSubRegs == 1
2056 ? SB.SuperReg
2057 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2058
2059 MachineInstrBuilder WriteLane =
2060 BuildMI(*SB.MBB, MI, SB.DL,
2061 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2062 .addReg(SubReg, SubKillState)
2063 .addImm(i % PVD.PerVGPR)
2064 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2065 TmpVGPRFlags = 0;
2066
2067 if (Indexes) {
2068 if (i == 0)
2069 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2070 else
2071 Indexes->insertMachineInstrInMaps(*WriteLane);
2072 }
2073
2074 // There could be undef components of a spilled super register.
2075 // TODO: Can we detect this and skip the spill?
2076 if (SB.NumSubRegs > 1) {
2077 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2078 unsigned SuperKillState = 0;
2079 if (i + 1 == SB.NumSubRegs)
2080 SuperKillState |= getKillRegState(SB.IsKill);
2081 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2082 }
2083 }
2084
2085 // Write out VGPR
2086 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2087 }
2088
2089 SB.restore();
2090 }
2091
2092 MI->eraseFromParent();
2093 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2094
2095 if (LIS)
2096 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2097
2098 return true;
2099}
2100
2101 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2102 RegScavenger *RS, SlotIndexes *Indexes,
2103 LiveIntervals *LIS, bool OnlyToVGPR,
2104 bool SpillToPhysVGPRLane) const {
2105 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2106
2107 ArrayRef<SpilledReg> VGPRSpills =
2108 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2109 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2110 bool SpillToVGPR = !VGPRSpills.empty();
2111 if (OnlyToVGPR && !SpillToVGPR)
2112 return false;
2113
2114 if (SpillToVGPR) {
2115 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2116 Register SubReg =
2117 SB.NumSubRegs == 1
2118 ? SB.SuperReg
2119 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2120
2121 SpilledReg Spill = VGPRSpills[i];
2122 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2123 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2124 .addReg(Spill.VGPR)
2125 .addImm(Spill.Lane);
2126 if (SB.NumSubRegs > 1 && i == 0)
2127 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2128 if (Indexes) {
2129 if (i == e - 1)
2130 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2131 else
2132 Indexes->insertMachineInstrInMaps(*MIB);
2133 }
2134 }
2135 } else {
2136 SB.prepare();
2137
2138 // Per VGPR helper data
2139 auto PVD = SB.getPerVGPRData();
2140
2141 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2142 // Load in VGPR data
2143 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2144
2145 // Unpack lanes
2146 for (unsigned i = Offset * PVD.PerVGPR,
2147 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2148 i < e; ++i) {
2149 Register SubReg =
2150 SB.NumSubRegs == 1
2151 ? SB.SuperReg
2152 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2153
2154 bool LastSubReg = (i + 1 == e);
2155 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2156 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2157 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2158 .addImm(i);
2159 if (SB.NumSubRegs > 1 && i == 0)
2160 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2161 if (Indexes) {
2162 if (i == e - 1)
2163 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2164 else
2165 Indexes->insertMachineInstrInMaps(*MIB);
2166 }
2167 }
2168 }
2169
2170 SB.restore();
2171 }
2172
2173 MI->eraseFromParent();
2174
2175 if (LIS)
2176 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2177
2178 return true;
2179}
2180
2181 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2182 MachineBasicBlock &RestoreMBB,
2183 Register SGPR, RegScavenger *RS) const {
2184 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2185 RS);
2186 SB.prepare();
2187 // Generate the spill of SGPR to SB.TmpVGPR.
2188 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2189 auto PVD = SB.getPerVGPRData();
2190 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2191 unsigned TmpVGPRFlags = RegState::Undef;
2192 // Write sub registers into the VGPR
2193 for (unsigned i = Offset * PVD.PerVGPR,
2194 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2195 i < e; ++i) {
2196 Register SubReg =
2197 SB.NumSubRegs == 1
2198 ? SB.SuperReg
2199 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2200
2201 MachineInstrBuilder WriteLane =
2202 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2203 SB.TmpVGPR)
2204 .addReg(SubReg, SubKillState)
2205 .addImm(i % PVD.PerVGPR)
2206 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2207 TmpVGPRFlags = 0;
2208 // There could be undef components of a spilled super register.
2209 // TODO: Can we detect this and skip the spill?
2210 if (SB.NumSubRegs > 1) {
2211 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2212 unsigned SuperKillState = 0;
2213 if (i + 1 == SB.NumSubRegs)
2214 SuperKillState |= getKillRegState(SB.IsKill);
2215 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2216 }
2217 }
2218 // Don't need to write VGPR out.
2219 }
2220
2221 // Restore clobbered registers in the specified restore block.
2222 MI = RestoreMBB.end();
2223 SB.setMI(&RestoreMBB, MI);
2224 // Generate the restore of SGPR from SB.TmpVGPR.
2225 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2226 // Don't need to load VGPR in.
2227 // Unpack lanes
2228 for (unsigned i = Offset * PVD.PerVGPR,
2229 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2230 i < e; ++i) {
2231 Register SubReg =
2232 SB.NumSubRegs == 1
2233 ? SB.SuperReg
2234 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2235
2236 assert(SubReg.isPhysical());
2237 bool LastSubReg = (i + 1 == e);
2238 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2239 SubReg)
2240 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2241 .addImm(i);
2242 if (SB.NumSubRegs > 1 && i == 0)
2243 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2244 }
2245 }
2246 SB.restore();
2247
2249 return false;
2250}
2251
2252/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2253/// a VGPR and the stack slot can be safely eliminated when all other users are
2254/// handled.
2255 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2256 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2257 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2258 switch (MI->getOpcode()) {
2259 case AMDGPU::SI_SPILL_S1024_SAVE:
2260 case AMDGPU::SI_SPILL_S512_SAVE:
2261 case AMDGPU::SI_SPILL_S384_SAVE:
2262 case AMDGPU::SI_SPILL_S352_SAVE:
2263 case AMDGPU::SI_SPILL_S320_SAVE:
2264 case AMDGPU::SI_SPILL_S288_SAVE:
2265 case AMDGPU::SI_SPILL_S256_SAVE:
2266 case AMDGPU::SI_SPILL_S224_SAVE:
2267 case AMDGPU::SI_SPILL_S192_SAVE:
2268 case AMDGPU::SI_SPILL_S160_SAVE:
2269 case AMDGPU::SI_SPILL_S128_SAVE:
2270 case AMDGPU::SI_SPILL_S96_SAVE:
2271 case AMDGPU::SI_SPILL_S64_SAVE:
2272 case AMDGPU::SI_SPILL_S32_SAVE:
2273 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2274 case AMDGPU::SI_SPILL_S1024_RESTORE:
2275 case AMDGPU::SI_SPILL_S512_RESTORE:
2276 case AMDGPU::SI_SPILL_S384_RESTORE:
2277 case AMDGPU::SI_SPILL_S352_RESTORE:
2278 case AMDGPU::SI_SPILL_S320_RESTORE:
2279 case AMDGPU::SI_SPILL_S288_RESTORE:
2280 case AMDGPU::SI_SPILL_S256_RESTORE:
2281 case AMDGPU::SI_SPILL_S224_RESTORE:
2282 case AMDGPU::SI_SPILL_S192_RESTORE:
2283 case AMDGPU::SI_SPILL_S160_RESTORE:
2284 case AMDGPU::SI_SPILL_S128_RESTORE:
2285 case AMDGPU::SI_SPILL_S96_RESTORE:
2286 case AMDGPU::SI_SPILL_S64_RESTORE:
2287 case AMDGPU::SI_SPILL_S32_RESTORE:
2288 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2289 default:
2290 llvm_unreachable("not an SGPR spill instruction");
2291 }
2292}
2293
2294 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2295 int SPAdj, unsigned FIOperandNum,
2296 RegScavenger *RS) const {
2297 MachineFunction *MF = MI->getParent()->getParent();
2298 MachineBasicBlock *MBB = MI->getParent();
2299 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2300 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2301 const SIInstrInfo *TII = ST.getInstrInfo();
2302 const DebugLoc &DL = MI->getDebugLoc();
2303
2304 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2305
2306 assert(MF->getRegInfo().isReserved(MFI->getScratchRSrcReg()) &&
2307 "unreserved scratch RSRC register");
2308
2309 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2310 int Index = MI->getOperand(FIOperandNum).getIndex();
2311
2312 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2313 ? getBaseRegister()
2314 : getFrameRegister(*MF);
2315
2316 switch (MI->getOpcode()) {
2317 // SGPR register spill
2318 case AMDGPU::SI_SPILL_S1024_SAVE:
2319 case AMDGPU::SI_SPILL_S512_SAVE:
2320 case AMDGPU::SI_SPILL_S384_SAVE:
2321 case AMDGPU::SI_SPILL_S352_SAVE:
2322 case AMDGPU::SI_SPILL_S320_SAVE:
2323 case AMDGPU::SI_SPILL_S288_SAVE:
2324 case AMDGPU::SI_SPILL_S256_SAVE:
2325 case AMDGPU::SI_SPILL_S224_SAVE:
2326 case AMDGPU::SI_SPILL_S192_SAVE:
2327 case AMDGPU::SI_SPILL_S160_SAVE:
2328 case AMDGPU::SI_SPILL_S128_SAVE:
2329 case AMDGPU::SI_SPILL_S96_SAVE:
2330 case AMDGPU::SI_SPILL_S64_SAVE:
2331 case AMDGPU::SI_SPILL_S32_SAVE: {
2332 return spillSGPR(MI, Index, RS);
2333 }
2334
2335 // SGPR register restore
2336 case AMDGPU::SI_SPILL_S1024_RESTORE:
2337 case AMDGPU::SI_SPILL_S512_RESTORE:
2338 case AMDGPU::SI_SPILL_S384_RESTORE:
2339 case AMDGPU::SI_SPILL_S352_RESTORE:
2340 case AMDGPU::SI_SPILL_S320_RESTORE:
2341 case AMDGPU::SI_SPILL_S288_RESTORE:
2342 case AMDGPU::SI_SPILL_S256_RESTORE:
2343 case AMDGPU::SI_SPILL_S224_RESTORE:
2344 case AMDGPU::SI_SPILL_S192_RESTORE:
2345 case AMDGPU::SI_SPILL_S160_RESTORE:
2346 case AMDGPU::SI_SPILL_S128_RESTORE:
2347 case AMDGPU::SI_SPILL_S96_RESTORE:
2348 case AMDGPU::SI_SPILL_S64_RESTORE:
2349 case AMDGPU::SI_SPILL_S32_RESTORE: {
2350 return restoreSGPR(MI, Index, RS);
2351 }
2352
2353 // VGPR register spill
2354 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2355 // Put mask into M0.
2356 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2357 AMDGPU::M0)
2358 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2359 [[fallthrough]];
2360 }
2361 case AMDGPU::SI_SPILL_V1024_SAVE:
2362 case AMDGPU::SI_SPILL_V512_SAVE:
2363 case AMDGPU::SI_SPILL_V384_SAVE:
2364 case AMDGPU::SI_SPILL_V352_SAVE:
2365 case AMDGPU::SI_SPILL_V320_SAVE:
2366 case AMDGPU::SI_SPILL_V288_SAVE:
2367 case AMDGPU::SI_SPILL_V256_SAVE:
2368 case AMDGPU::SI_SPILL_V224_SAVE:
2369 case AMDGPU::SI_SPILL_V192_SAVE:
2370 case AMDGPU::SI_SPILL_V160_SAVE:
2371 case AMDGPU::SI_SPILL_V128_SAVE:
2372 case AMDGPU::SI_SPILL_V96_SAVE:
2373 case AMDGPU::SI_SPILL_V64_SAVE:
2374 case AMDGPU::SI_SPILL_V32_SAVE:
2375 case AMDGPU::SI_SPILL_V16_SAVE:
2376 case AMDGPU::SI_SPILL_A1024_SAVE:
2377 case AMDGPU::SI_SPILL_A512_SAVE:
2378 case AMDGPU::SI_SPILL_A384_SAVE:
2379 case AMDGPU::SI_SPILL_A352_SAVE:
2380 case AMDGPU::SI_SPILL_A320_SAVE:
2381 case AMDGPU::SI_SPILL_A288_SAVE:
2382 case AMDGPU::SI_SPILL_A256_SAVE:
2383 case AMDGPU::SI_SPILL_A224_SAVE:
2384 case AMDGPU::SI_SPILL_A192_SAVE:
2385 case AMDGPU::SI_SPILL_A160_SAVE:
2386 case AMDGPU::SI_SPILL_A128_SAVE:
2387 case AMDGPU::SI_SPILL_A96_SAVE:
2388 case AMDGPU::SI_SPILL_A64_SAVE:
2389 case AMDGPU::SI_SPILL_A32_SAVE:
2390 case AMDGPU::SI_SPILL_AV1024_SAVE:
2391 case AMDGPU::SI_SPILL_AV512_SAVE:
2392 case AMDGPU::SI_SPILL_AV384_SAVE:
2393 case AMDGPU::SI_SPILL_AV352_SAVE:
2394 case AMDGPU::SI_SPILL_AV320_SAVE:
2395 case AMDGPU::SI_SPILL_AV288_SAVE:
2396 case AMDGPU::SI_SPILL_AV256_SAVE:
2397 case AMDGPU::SI_SPILL_AV224_SAVE:
2398 case AMDGPU::SI_SPILL_AV192_SAVE:
2399 case AMDGPU::SI_SPILL_AV160_SAVE:
2400 case AMDGPU::SI_SPILL_AV128_SAVE:
2401 case AMDGPU::SI_SPILL_AV96_SAVE:
2402 case AMDGPU::SI_SPILL_AV64_SAVE:
2403 case AMDGPU::SI_SPILL_AV32_SAVE:
2404 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2405 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2406 const MachineOperand *VData = TII->getNamedOperand(*MI,
2407 AMDGPU::OpName::vdata);
2408 if (VData->isUndef()) {
2409 MI->eraseFromParent();
2410 return true;
2411 }
2412
2413 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2414 MFI->getStackPtrOffsetReg());
2415
2416 unsigned Opc;
2417 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2418 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2419 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2420 } else {
2421 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2422 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2423 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2424 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2425 }
2426
2427 auto *MBB = MI->getParent();
2428 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2429 if (IsWWMRegSpill) {
2430 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2431 RS->isRegUsed(AMDGPU::SCC));
2432 }
2433 buildSpillLoadStore(
2434 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2435 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2436 *MI->memoperands_begin(), RS);
2438 if (IsWWMRegSpill)
2439 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2440
2441 MI->eraseFromParent();
2442 return true;
2443 }
2444 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2445 // Put mask into M0.
2446 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2447 AMDGPU::M0)
2448 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2449 [[fallthrough]];
2450 }
2451 case AMDGPU::SI_SPILL_V16_RESTORE:
2452 case AMDGPU::SI_SPILL_V32_RESTORE:
2453 case AMDGPU::SI_SPILL_V64_RESTORE:
2454 case AMDGPU::SI_SPILL_V96_RESTORE:
2455 case AMDGPU::SI_SPILL_V128_RESTORE:
2456 case AMDGPU::SI_SPILL_V160_RESTORE:
2457 case AMDGPU::SI_SPILL_V192_RESTORE:
2458 case AMDGPU::SI_SPILL_V224_RESTORE:
2459 case AMDGPU::SI_SPILL_V256_RESTORE:
2460 case AMDGPU::SI_SPILL_V288_RESTORE:
2461 case AMDGPU::SI_SPILL_V320_RESTORE:
2462 case AMDGPU::SI_SPILL_V352_RESTORE:
2463 case AMDGPU::SI_SPILL_V384_RESTORE:
2464 case AMDGPU::SI_SPILL_V512_RESTORE:
2465 case AMDGPU::SI_SPILL_V1024_RESTORE:
2466 case AMDGPU::SI_SPILL_A32_RESTORE:
2467 case AMDGPU::SI_SPILL_A64_RESTORE:
2468 case AMDGPU::SI_SPILL_A96_RESTORE:
2469 case AMDGPU::SI_SPILL_A128_RESTORE:
2470 case AMDGPU::SI_SPILL_A160_RESTORE:
2471 case AMDGPU::SI_SPILL_A192_RESTORE:
2472 case AMDGPU::SI_SPILL_A224_RESTORE:
2473 case AMDGPU::SI_SPILL_A256_RESTORE:
2474 case AMDGPU::SI_SPILL_A288_RESTORE:
2475 case AMDGPU::SI_SPILL_A320_RESTORE:
2476 case AMDGPU::SI_SPILL_A352_RESTORE:
2477 case AMDGPU::SI_SPILL_A384_RESTORE:
2478 case AMDGPU::SI_SPILL_A512_RESTORE:
2479 case AMDGPU::SI_SPILL_A1024_RESTORE:
2480 case AMDGPU::SI_SPILL_AV32_RESTORE:
2481 case AMDGPU::SI_SPILL_AV64_RESTORE:
2482 case AMDGPU::SI_SPILL_AV96_RESTORE:
2483 case AMDGPU::SI_SPILL_AV128_RESTORE:
2484 case AMDGPU::SI_SPILL_AV160_RESTORE:
2485 case AMDGPU::SI_SPILL_AV192_RESTORE:
2486 case AMDGPU::SI_SPILL_AV224_RESTORE:
2487 case AMDGPU::SI_SPILL_AV256_RESTORE:
2488 case AMDGPU::SI_SPILL_AV288_RESTORE:
2489 case AMDGPU::SI_SPILL_AV320_RESTORE:
2490 case AMDGPU::SI_SPILL_AV352_RESTORE:
2491 case AMDGPU::SI_SPILL_AV384_RESTORE:
2492 case AMDGPU::SI_SPILL_AV512_RESTORE:
2493 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2494 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2495 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2496 const MachineOperand *VData = TII->getNamedOperand(*MI,
2497 AMDGPU::OpName::vdata);
2498 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2499 MFI->getStackPtrOffsetReg());
2500
2501 unsigned Opc;
2502 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2503 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2504 Opc = AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16;
2505 } else {
2506 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2507 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2508 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2509 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2510 }
2511
2512 auto *MBB = MI->getParent();
2513 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2514 if (IsWWMRegSpill) {
2515 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2516 RS->isRegUsed(AMDGPU::SCC));
2517 }
2518
2519 buildSpillLoadStore(
2520 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2521 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2522 *MI->memoperands_begin(), RS);
2523
2524 if (IsWWMRegSpill)
2525 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2526
2527 MI->eraseFromParent();
2528 return true;
2529 }
2530 case AMDGPU::V_ADD_U32_e32:
2531 case AMDGPU::V_ADD_U32_e64:
2532 case AMDGPU::V_ADD_CO_U32_e32:
2533 case AMDGPU::V_ADD_CO_U32_e64: {
2534 // TODO: Handle sub, and, or.
2535 unsigned NumDefs = MI->getNumExplicitDefs();
2536 unsigned Src0Idx = NumDefs;
2537
2538 bool HasClamp = false;
2539 MachineOperand *VCCOp = nullptr;
2540
2541 switch (MI->getOpcode()) {
2542 case AMDGPU::V_ADD_U32_e32:
2543 break;
2544 case AMDGPU::V_ADD_U32_e64:
2545 HasClamp = MI->getOperand(3).getImm();
2546 break;
2547 case AMDGPU::V_ADD_CO_U32_e32:
2548 VCCOp = &MI->getOperand(3);
2549 break;
2550 case AMDGPU::V_ADD_CO_U32_e64:
2551 VCCOp = &MI->getOperand(1);
2552 HasClamp = MI->getOperand(4).getImm();
2553 break;
2554 default:
2555 break;
2556 }
2557 bool DeadVCC = !VCCOp || VCCOp->isDead();
2558 MachineOperand &DstOp = MI->getOperand(0);
2559 Register DstReg = DstOp.getReg();
2560
2561 unsigned OtherOpIdx =
2562 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2563 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2564
2565 unsigned Src1Idx = Src0Idx + 1;
2566 Register MaterializedReg = FrameReg;
2567 Register ScavengedVGPR;
2568
2569 int64_t Offset = FrameInfo.getObjectOffset(Index);
2570 // For the non-immediate case, we could fall through to the default
2571 // handling, but we do an in-place update of the result register here to
2572 // avoid scavenging another register.
2573 if (OtherOp->isImm()) {
2574 int64_t TotalOffset = OtherOp->getImm() + Offset;
2575
2576 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2577 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2578 // If we can't support a VOP3 literal in the VALU instruction, we
2579 // can't specially fold into the add.
2580 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2581 break;
2582 }
2583
2584 OtherOp->setImm(TotalOffset);
2585 Offset = 0;
2586 }
2587
2588 if (FrameReg && !ST.enableFlatScratch()) {
2589 // We should just do an in-place update of the result register. However,
2590 // the value there may also be used by the add, in which case we need a
2591 // temporary register.
2592 //
2593 // FIXME: The scavenger is not finding the result register in the
2594 // common case where the add does not read the register.
2595
2596 ScavengedVGPR = RS->scavengeRegisterBackwards(
2597 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2598
2599 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2600 // shift.
2601 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2602 .addDef(ScavengedVGPR, RegState::Renamable)
2603 .addImm(ST.getWavefrontSizeLog2())
2604 .addReg(FrameReg);
2605 MaterializedReg = ScavengedVGPR;
2606 }
2607
2608 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2609 if (ST.enableFlatScratch() &&
2610 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2611 // We didn't need the shift above, so we have an SGPR for the frame
2612 // register, but may have a VGPR only operand.
2613 //
2614 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2615 // and use the higher constant bus restriction to avoid this copy.
2616
2617 if (!ScavengedVGPR) {
2618 ScavengedVGPR = RS->scavengeRegisterBackwards(
2619 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2620 /*SPAdj=*/0);
2621 }
2622
2623 assert(ScavengedVGPR != DstReg);
2624
2625 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2626 .addReg(MaterializedReg,
2627 MaterializedReg != FrameReg ? RegState::Kill : 0);
2628 MaterializedReg = ScavengedVGPR;
2629 }
2630
2631 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2632 // is not live, we could use a scalar add + vector add instead of 2
2633 // vector adds.
2634 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2635 .addDef(DstReg, RegState::Renamable);
2636 if (NumDefs == 2)
2637 AddI32.add(MI->getOperand(1));
2638
2639 unsigned MaterializedRegFlags =
2640 MaterializedReg != FrameReg ? RegState::Kill : 0;
2641
2642 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2643 // If we know we have a VGPR already, it's more likely the other
2644 // operand is a legal vsrc0.
2645 AddI32
2646 .add(*OtherOp)
2647 .addReg(MaterializedReg, MaterializedRegFlags);
2648 } else {
2649 // Commute operands to avoid violating VOP2 restrictions. This will
2650 // typically happen when using scratch.
2651 AddI32
2652 .addReg(MaterializedReg, MaterializedRegFlags)
2653 .add(*OtherOp);
2654 }
2655
2656 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2657 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2658 AddI32.addImm(0); // clamp
2659
2660 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2661 AddI32.setOperandDead(3); // Dead vcc
2662
2663 MaterializedReg = DstReg;
2664
2665 OtherOp->ChangeToRegister(MaterializedReg, false);
2666 OtherOp->setIsKill(true);
2667 FIOp->ChangeToImmediate(Offset);
2668 Offset = 0;
2669 } else if (Offset != 0) {
2670 assert(!MaterializedReg);
2671 FIOp->ChangeToImmediate(Offset);
2672 Offset = 0;
2673 } else {
2674 if (DeadVCC && !HasClamp) {
2675 assert(Offset == 0);
2676
2677 // TODO: Losing kills and implicit operands. Just mutate to copy and
2678 // let lowerCopy deal with it?
2679 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2680 // Folded to an identity copy.
2681 MI->eraseFromParent();
2682 return true;
2683 }
2684
2685 // The immediate value should be in OtherOp
2686 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2687 MI->removeOperand(FIOperandNum);
2688
2689 unsigned NumOps = MI->getNumOperands();
2690 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2691 MI->removeOperand(I);
2692
2693 if (NumDefs == 2)
2694 MI->removeOperand(1);
2695
2696 // The code below can't deal with a mov.
2697 return true;
2698 }
2699
2700 // This folded to a constant, but we have to keep the add around for
2701 // pointless implicit defs or clamp modifier.
2702 FIOp->ChangeToImmediate(0);
2703 }
2704
2705 // Try to improve legality by commuting.
2706 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2707 std::swap(FIOp, OtherOp);
2708 std::swap(FIOperandNum, OtherOpIdx);
2709 }
2710
2711 // We need at most one mov to satisfy the operand constraints. Prefer to
2712 // move the FI operand first, as it may be a literal in a VOP3
2713 // instruction.
2714 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2715 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2716 // If commuting didn't make the operands legal, we need to materialize
2717 // in a register.
2718 // TODO: Can use SGPR on gfx10+ in some cases.
2719 if (!ScavengedVGPR) {
2720 ScavengedVGPR = RS->scavengeRegisterBackwards(
2721 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2722 /*SPAdj=*/0);
2723 }
2724
2725 assert(ScavengedVGPR != DstReg);
2726
2727 MachineOperand &Src = MI->getOperand(SrcIdx);
2728 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2729 .add(Src);
2730
2731 Src.ChangeToRegister(ScavengedVGPR, false);
2732 Src.setIsKill(true);
2733 break;
2734 }
2735 }
2736
2737 // Fold out add of 0 case that can appear in kernels.
2738 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2739 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2740 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2741 }
2742
2743 MI->eraseFromParent();
2744 }
2745
2746 return true;
2747 }
2748 case AMDGPU::S_ADD_I32:
2749 case AMDGPU::S_ADD_U32: {
2750 // TODO: Handle s_or_b32, s_and_b32.
2751 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2752 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2753
2754 assert(FrameReg || MFI->isBottomOfStack());
2755
2756 MachineOperand &DstOp = MI->getOperand(0);
2757 const DebugLoc &DL = MI->getDebugLoc();
2758 Register MaterializedReg = FrameReg;
2759
2760 // Defend against live scc, which should never happen in practice.
2761 bool DeadSCC = MI->getOperand(3).isDead();
2762
2763 Register TmpReg;
2764
2765 // FIXME: Scavenger should figure out that the result register is
2766 // available. Also should do this for the v_add case.
2767 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2768 TmpReg = DstOp.getReg();
2769
2770 if (FrameReg && !ST.enableFlatScratch()) {
2771 // FIXME: In the common case where the add does not also read its result
2772 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2773 // available.
2774 if (!TmpReg)
2775 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2776 MI, /*RestoreAfter=*/false, 0,
2777 /*AllowSpill=*/false);
2778 if (TmpReg) {
2779 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2780 .addDef(TmpReg, RegState::Renamable)
2781 .addReg(FrameReg)
2782 .addImm(ST.getWavefrontSizeLog2())
2783 .setOperandDead(3); // Set SCC dead
2784 }
2785 MaterializedReg = TmpReg;
2786 }
2787
2788 int64_t Offset = FrameInfo.getObjectOffset(Index);
2789
2790 // For the non-immediate case, we could fall through to the default
2791 // handling, but we do an in-place update of the result register here to
2792 // avoid scavenging another register.
2793 if (OtherOp.isImm()) {
2794 OtherOp.setImm(OtherOp.getImm() + Offset);
2795 Offset = 0;
2796
2797 if (MaterializedReg)
2798 FIOp->ChangeToRegister(MaterializedReg, false);
2799 else
2800 FIOp->ChangeToImmediate(0);
2801 } else if (MaterializedReg) {
2802 // If we can't fold the other operand, do another increment.
2803 Register DstReg = DstOp.getReg();
2804
2805 if (!TmpReg && MaterializedReg == FrameReg) {
2806 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2807 MI, /*RestoreAfter=*/false, 0,
2808 /*AllowSpill=*/false);
2809 DstReg = TmpReg;
2810 }
2811
2812 if (TmpReg) {
2813 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2814 .addDef(DstReg, RegState::Renamable)
2815 .addReg(MaterializedReg, RegState::Kill)
2816 .add(OtherOp);
2817 if (DeadSCC)
2818 AddI32.setOperandDead(3);
2819
2820 MaterializedReg = DstReg;
2821
2822 OtherOp.ChangeToRegister(MaterializedReg, false);
2823 OtherOp.setIsKill(true);
2824 OtherOp.setIsRenamable(true);
2825 }
2826 FIOp->ChangeToImmediate(Offset);
2827 } else {
2828 // If we don't have any other offset to apply, we can just directly
2829 // interpret the frame index as the offset.
2831 }
2832
2833 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2834 assert(Offset == 0);
2835 MI->removeOperand(3);
2836 MI->removeOperand(OtherOpIdx);
2837 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2838 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2839 assert(Offset == 0);
2840 MI->removeOperand(3);
2841 MI->removeOperand(FIOperandNum);
2842 MI->setDesc(
2843 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2844 }
2845
2846 assert(!FIOp->isFI());
2847 return true;
2848 }
2849 default: {
2850 break;
2851 }
2852 }
2853
2854 int64_t Offset = FrameInfo.getObjectOffset(Index);
2855 if (ST.enableFlatScratch()) {
2856 if (TII->isFLATScratch(*MI)) {
2857 assert(
2858 (int16_t)FIOperandNum ==
2859 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2860
2861 // The offset is always swizzled, just replace it
2862 if (FrameReg)
2863 FIOp->ChangeToRegister(FrameReg, false);
2864
2865 MachineOperand *OffsetOp =
2866 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2867 int64_t NewOffset = Offset + OffsetOp->getImm();
2868 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2869 SIInstrFlags::FlatScratch)) {
2870 OffsetOp->setImm(NewOffset);
2871 if (FrameReg)
2872 return false;
2873 Offset = 0;
2874 }
2875
2876 if (!Offset) {
2877 unsigned Opc = MI->getOpcode();
2878 int NewOpc = -1;
2879 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2880 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2881 } else if (ST.hasFlatScratchSTMode()) {
2882 // On GFX10 we have ST mode to use no registers for an address.
2883 // Otherwise we need to materialize 0 into an SGPR.
2884 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2885 }
2886
2887 if (NewOpc != -1) {
2888 // removeOperand doesn't fix up tied operand indexes as it goes, so
2889 // it asserts. Untie vdst_in for now and retie it afterwards.
2890 int VDstIn =
2891 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2892 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2893 MI->getOperand(VDstIn).isTied();
2894 if (TiedVDst)
2895 MI->untieRegOperand(VDstIn);
2896
2897 MI->removeOperand(
2898 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2899
2900 if (TiedVDst) {
2901 int NewVDst =
2902 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2903 int NewVDstIn =
2904 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2905 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2906 MI->tieOperands(NewVDst, NewVDstIn);
2907 }
2908 MI->setDesc(TII->get(NewOpc));
2909 return false;
2910 }
2911 }
2912 }
2913
2914 if (!FrameReg) {
2915 FIOp->ChangeToImmediate(Offset);
2916 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2917 return false;
2918 }
2919
2920 // We need to use a register here. Check if we can use an SGPR or need
2921 // a VGPR.
2922 FIOp->ChangeToRegister(AMDGPU::M0, false);
2923 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2924
2925 if (!Offset && FrameReg && UseSGPR) {
2926 FIOp->setReg(FrameReg);
2927 return false;
2928 }
2929
2930 const TargetRegisterClass *RC =
2931 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2932
2933 Register TmpReg =
2934 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2935 FIOp->setReg(TmpReg);
2936 FIOp->setIsKill();
2937
2938 if ((!FrameReg || !Offset) && TmpReg) {
2939 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2940 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2941 if (FrameReg)
2942 MIB.addReg(FrameReg);
2943 else
2944 MIB.addImm(Offset);
2945
2946 return false;
2947 }
2948
2949 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2950 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2951
2952 Register TmpSReg =
2953 UseSGPR ? TmpReg
2954 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2955 MI, false, 0, !UseSGPR);
2956
2957 // TODO: for flat scratch another attempt can be made with a VGPR index
2958 // if no SGPRs can be scavenged.
2959 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2960 report_fatal_error("Cannot scavenge register in FI elimination!");
2961
2962 if (!TmpSReg) {
2963 // Use frame register and restore it after.
2964 TmpSReg = FrameReg;
2965 FIOp->setReg(FrameReg);
2966 FIOp->setIsKill(false);
2967 }
2968
2969 if (NeedSaveSCC) {
2970 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
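// This sequence computes TmpSReg = FrameReg + Offset without losing a live
// SCC: s_addc_u32 folds the incoming SCC value into bit 0 of the sum (the low
// bit of FrameReg + Offset is known to be zero), s_bitcmp1_b32 copies that
// bit back into SCC, and s_bitset0_b32 then clears it from the result.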
2971 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2972 .addReg(FrameReg)
2973 .addImm(Offset);
2974 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2975 .addReg(TmpSReg)
2976 .addImm(0);
2977 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2978 .addImm(0)
2979 .addReg(TmpSReg);
2980 } else {
2981 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2982 .addReg(FrameReg)
2983 .addImm(Offset);
2984 }
2985
2986 if (!UseSGPR)
2987 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2988 .addReg(TmpSReg, RegState::Kill);
2989
2990 if (TmpSReg == FrameReg) {
2991 // Undo frame register modification.
2992 if (NeedSaveSCC &&
2993 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
2994 MachineBasicBlock::iterator I =
2995 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2996 TmpSReg)
2997 .addReg(FrameReg)
2998 .addImm(-Offset);
2999 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3000 .addReg(TmpSReg)
3001 .addImm(0);
3002 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3003 TmpSReg)
3004 .addImm(0)
3005 .addReg(TmpSReg);
3006 } else {
3007 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3008 FrameReg)
3009 .addReg(FrameReg)
3010 .addImm(-Offset);
3011 }
3012 }
3013
3014 return false;
3015 }
3016
3017 bool IsMUBUF = TII->isMUBUF(*MI);
3018
3019 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3020 // Convert to a swizzled stack address by scaling by the wave size.
3021 // In an entry function/kernel the offset is already swizzled.
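// For example, on wave64 a swizzled FrameReg value of 0x1000 corresponds to
// the per-lane address 0x1000 >> 6 = 0x40, to which the unswizzled object
// offset is then added.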
3022 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
3023 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3024 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3025 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3026 ? &AMDGPU::SReg_32RegClass
3027 : &AMDGPU::VGPR_32RegClass;
3028 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3029 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3030 MI->getOpcode() == AMDGPU::S_MOV_B32;
3031 Register ResultReg =
3032 IsCopy ? MI->getOperand(0).getReg()
3033 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3034
3035 int64_t Offset = FrameInfo.getObjectOffset(Index);
3036 if (Offset == 0) {
3037 unsigned OpCode =
3038 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3039 Register TmpResultReg = ResultReg;
3040 if (IsSALU && LiveSCC) {
3041 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3042 MI, false, 0);
3043 }
3044
3045 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3046 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3047 // For V_LSHRREV, the operands are reversed (the shift count goes
3048 // first).
3049 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3050 else
3051 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3052 if (IsSALU && !LiveSCC)
3053 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3054 if (IsSALU && LiveSCC) {
3055 Register NewDest;
3056 if (IsCopy) {
3057 assert(ResultReg.isPhysical());
3058 NewDest = ResultReg;
3059 } else {
3060 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3061 Shift, false, 0);
3062 }
3063 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3064 .addReg(TmpResultReg);
3065 ResultReg = NewDest;
3066 }
3067 } else {
3068 MachineInstrBuilder MIB;
3069 if (!IsSALU) {
3070 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3071 nullptr) {
3072 // Reuse ResultReg in intermediate step.
3073 Register ScaledReg = ResultReg;
3074
3075 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3076 ScaledReg)
3077 .addImm(ST.getWavefrontSizeLog2())
3078 .addReg(FrameReg);
3079
3080 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3081
3082 // TODO: Fold if use instruction is another add of a constant.
3083 if (IsVOP2 ||
3084 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3085 // FIXME: This can fail
3086 MIB.addImm(Offset);
3087 MIB.addReg(ScaledReg, RegState::Kill);
3088 if (!IsVOP2)
3089 MIB.addImm(0); // clamp bit
3090 } else {
3091 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3092 "Need to reuse carry out register");
3093
3094 // Use scavenged unused carry out as offset register.
3095 Register ConstOffsetReg;
3096 if (!isWave32)
3097 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3098 else
3099 ConstOffsetReg = MIB.getReg(1);
3100
3101 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3102 ConstOffsetReg)
3103 .addImm(Offset);
3104 MIB.addReg(ConstOffsetReg, RegState::Kill);
3105 MIB.addReg(ScaledReg, RegState::Kill);
3106 MIB.addImm(0); // clamp bit
3107 }
3108 }
3109 }
3110 if (!MIB || IsSALU) {
3111 // We have to produce a carry out, and there isn't a free SGPR pair
3112 // for it. We can keep the whole computation on the SALU to avoid
3113 // clobbering an additional register at the cost of an extra mov.
3114
3115 // We may have 1 free scratch SGPR even though a carry out is
3116 // unavailable. Only one additional mov is needed.
3117 Register TmpScaledReg = IsCopy && IsSALU
3118 ? ResultReg
3119 : RS->scavengeRegisterBackwards(
3120 AMDGPU::SReg_32_XM0RegClass, MI,
3121 false, 0, /*AllowSpill=*/false);
3122 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3123 Register TmpResultReg = ScaledReg;
3124
3125 if (!LiveSCC) {
3126 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3127 .addReg(FrameReg)
3128 .addImm(ST.getWavefrontSizeLog2());
3129 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3130 .addReg(TmpResultReg, RegState::Kill)
3131 .addImm(Offset);
3132 } else {
3133 TmpResultReg = RS->scavengeRegisterBackwards(
3134 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3135
3136 MachineInstrBuilder Add;
3137 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3138 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3139 TmpResultReg)
3140 .addImm(ST.getWavefrontSizeLog2())
3141 .addReg(FrameReg);
3142 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3143 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3144 .addImm(Offset);
3145 Add.addReg(ResultReg, RegState::Kill)
3146 .addReg(TmpResultReg, RegState::Kill)
3147 .addImm(0);
3148 } else
3149 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3150 } else {
3151 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3152 "offset is unsafe for v_mad_u32_u24");
3153
3154 // We start with a frame pointer with a wave space value, and
3155 // an offset in lane-space. We are materializing a lane space
3156 // value. We can either do a right shift of the frame pointer
3157 // to get to lane space, or a left shift of the offset to get
3158 // to wavespace. We can right shift after the computation to
3159 // get back to the desired per-lane value. We are using the
3160 // mad_u32_u24 primarily as an add with no carry out clobber.
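// For example, on wave64 the v_mad_u32_u24 below computes
// Offset * 64 + FrameReg in wave space, and the following v_lshrrev by
// log2(64) = 6 yields FrameReg / 64 + Offset, the desired lane-space value,
// without clobbering SCC since the mad has no carry out.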
3161 bool IsInlinableLiteral =
3162 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3163 if (!IsInlinableLiteral) {
3164 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3165 TmpResultReg)
3166 .addImm(Offset);
3167 }
3168
3169 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3170 TmpResultReg);
3171
3172 if (!IsInlinableLiteral) {
3173 Add.addReg(TmpResultReg, RegState::Kill);
3174 } else {
3175 // We fold the offset into the mad itself if it's inlinable.
3176 Add.addImm(Offset);
3177 }
3178 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3179 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3180 TmpResultReg)
3181 .addImm(ST.getWavefrontSizeLog2())
3182 .addReg(TmpResultReg);
3183 }
3184
3185 Register NewDest;
3186 if (IsCopy) {
3187 NewDest = ResultReg;
3188 } else {
3189 NewDest = RS->scavengeRegisterBackwards(
3190 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3191 /*AllowSpill=*/true);
3192 }
3193
3194 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3195 NewDest)
3196 .addReg(TmpResultReg);
3197 ResultReg = NewDest;
3198 }
3199 if (!IsSALU)
3200 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3201 .addReg(TmpResultReg, RegState::Kill);
3202 else
3203 ResultReg = TmpResultReg;
3204 // If there were truly no free SGPRs, we need to undo everything.
3205 if (!TmpScaledReg.isValid()) {
3206 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3207 .addReg(ScaledReg, RegState::Kill)
3208 .addImm(-Offset);
3209 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3210 .addReg(FrameReg)
3211 .addImm(ST.getWavefrontSizeLog2());
3212 }
3213 }
3214 }
3215
3216 // Don't introduce an extra copy if we're just materializing in a mov.
3217 if (IsCopy) {
3218 MI->eraseFromParent();
3219 return true;
3220 }
3221 FIOp->ChangeToRegister(ResultReg, false, false, true);
3222 return false;
3223 }
3224
3225 if (IsMUBUF) {
3226 // Disable offen so we don't need a 0 vgpr base.
3227 assert(
3228 static_cast<int>(FIOperandNum) ==
3229 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3230
3231 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3232 assert((SOffset.isImm() && SOffset.getImm() == 0));
3233
3234 if (FrameReg != AMDGPU::NoRegister)
3235 SOffset.ChangeToRegister(FrameReg, false);
3236
3237 int64_t Offset = FrameInfo.getObjectOffset(Index);
3238 int64_t OldImm =
3239 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3240 int64_t NewOffset = OldImm + Offset;
3241
3242 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3243 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3244 MI->eraseFromParent();
3245 return true;
3246 }
3247 }
3248
3249 // If the offset is simply too big, don't convert to a scratch wave offset
3250 // relative index.
3251
3252 FIOp->ChangeToImmediate(Offset);
3253 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3254 Register TmpReg =
3255 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3256 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3257 .addImm(Offset);
3258 FIOp->ChangeToRegister(TmpReg, false, false, true);
3259 }
3260
3261 return false;
3262}
3263
3267
3269 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3270}
3271
3273 return getRegBitWidth(RC.getID());
3274}
3275
3276static const TargetRegisterClass *
3277 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3278 if (BitWidth == 64)
3279 return &AMDGPU::VReg_64RegClass;
3280 if (BitWidth == 96)
3281 return &AMDGPU::VReg_96RegClass;
3282 if (BitWidth == 128)
3283 return &AMDGPU::VReg_128RegClass;
3284 if (BitWidth == 160)
3285 return &AMDGPU::VReg_160RegClass;
3286 if (BitWidth == 192)
3287 return &AMDGPU::VReg_192RegClass;
3288 if (BitWidth == 224)
3289 return &AMDGPU::VReg_224RegClass;
3290 if (BitWidth == 256)
3291 return &AMDGPU::VReg_256RegClass;
3292 if (BitWidth == 288)
3293 return &AMDGPU::VReg_288RegClass;
3294 if (BitWidth == 320)
3295 return &AMDGPU::VReg_320RegClass;
3296 if (BitWidth == 352)
3297 return &AMDGPU::VReg_352RegClass;
3298 if (BitWidth == 384)
3299 return &AMDGPU::VReg_384RegClass;
3300 if (BitWidth == 512)
3301 return &AMDGPU::VReg_512RegClass;
3302 if (BitWidth == 1024)
3303 return &AMDGPU::VReg_1024RegClass;
3304
3305 return nullptr;
3306}
3307
3308static const TargetRegisterClass *
3309 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3310 if (BitWidth == 64)
3311 return &AMDGPU::VReg_64_Align2RegClass;
3312 if (BitWidth == 96)
3313 return &AMDGPU::VReg_96_Align2RegClass;
3314 if (BitWidth == 128)
3315 return &AMDGPU::VReg_128_Align2RegClass;
3316 if (BitWidth == 160)
3317 return &AMDGPU::VReg_160_Align2RegClass;
3318 if (BitWidth == 192)
3319 return &AMDGPU::VReg_192_Align2RegClass;
3320 if (BitWidth == 224)
3321 return &AMDGPU::VReg_224_Align2RegClass;
3322 if (BitWidth == 256)
3323 return &AMDGPU::VReg_256_Align2RegClass;
3324 if (BitWidth == 288)
3325 return &AMDGPU::VReg_288_Align2RegClass;
3326 if (BitWidth == 320)
3327 return &AMDGPU::VReg_320_Align2RegClass;
3328 if (BitWidth == 352)
3329 return &AMDGPU::VReg_352_Align2RegClass;
3330 if (BitWidth == 384)
3331 return &AMDGPU::VReg_384_Align2RegClass;
3332 if (BitWidth == 512)
3333 return &AMDGPU::VReg_512_Align2RegClass;
3334 if (BitWidth == 1024)
3335 return &AMDGPU::VReg_1024_Align2RegClass;
3336
3337 return nullptr;
3338}
3339
3340const TargetRegisterClass *
3341 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3342 if (BitWidth == 1)
3343 return &AMDGPU::VReg_1RegClass;
3344 if (BitWidth == 16)
3345 return &AMDGPU::VGPR_16RegClass;
3346 if (BitWidth == 32)
3347 return &AMDGPU::VGPR_32RegClass;
3348 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3349 : getAnyVGPRClassForBitWidth(BitWidth);
3350}
3351
3352const TargetRegisterClass *
3354 if (BitWidth <= 32)
3355 return &AMDGPU::VGPR_32_Lo256RegClass;
3356 if (BitWidth <= 64)
3357 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3358 if (BitWidth <= 96)
3359 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3360 if (BitWidth <= 128)
3361 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3362 if (BitWidth <= 160)
3363 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3364 if (BitWidth <= 192)
3365 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3366 if (BitWidth <= 224)
3367 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3368 if (BitWidth <= 256)
3369 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3370 if (BitWidth <= 288)
3371 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3372 if (BitWidth <= 320)
3373 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3374 if (BitWidth <= 352)
3375 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3376 if (BitWidth <= 384)
3377 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3378 if (BitWidth <= 512)
3379 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3380 if (BitWidth <= 1024)
3381 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3382
3383 return nullptr;
3384}
3385
3386static const TargetRegisterClass *
3387 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3388 if (BitWidth == 64)
3389 return &AMDGPU::AReg_64RegClass;
3390 if (BitWidth == 96)
3391 return &AMDGPU::AReg_96RegClass;
3392 if (BitWidth == 128)
3393 return &AMDGPU::AReg_128RegClass;
3394 if (BitWidth == 160)
3395 return &AMDGPU::AReg_160RegClass;
3396 if (BitWidth == 192)
3397 return &AMDGPU::AReg_192RegClass;
3398 if (BitWidth == 224)
3399 return &AMDGPU::AReg_224RegClass;
3400 if (BitWidth == 256)
3401 return &AMDGPU::AReg_256RegClass;
3402 if (BitWidth == 288)
3403 return &AMDGPU::AReg_288RegClass;
3404 if (BitWidth == 320)
3405 return &AMDGPU::AReg_320RegClass;
3406 if (BitWidth == 352)
3407 return &AMDGPU::AReg_352RegClass;
3408 if (BitWidth == 384)
3409 return &AMDGPU::AReg_384RegClass;
3410 if (BitWidth == 512)
3411 return &AMDGPU::AReg_512RegClass;
3412 if (BitWidth == 1024)
3413 return &AMDGPU::AReg_1024RegClass;
3414
3415 return nullptr;
3416}
3417
3418static const TargetRegisterClass *
3419 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3420 if (BitWidth == 64)
3421 return &AMDGPU::AReg_64_Align2RegClass;
3422 if (BitWidth == 96)
3423 return &AMDGPU::AReg_96_Align2RegClass;
3424 if (BitWidth == 128)
3425 return &AMDGPU::AReg_128_Align2RegClass;
3426 if (BitWidth == 160)
3427 return &AMDGPU::AReg_160_Align2RegClass;
3428 if (BitWidth == 192)
3429 return &AMDGPU::AReg_192_Align2RegClass;
3430 if (BitWidth == 224)
3431 return &AMDGPU::AReg_224_Align2RegClass;
3432 if (BitWidth == 256)
3433 return &AMDGPU::AReg_256_Align2RegClass;
3434 if (BitWidth == 288)
3435 return &AMDGPU::AReg_288_Align2RegClass;
3436 if (BitWidth == 320)
3437 return &AMDGPU::AReg_320_Align2RegClass;
3438 if (BitWidth == 352)
3439 return &AMDGPU::AReg_352_Align2RegClass;
3440 if (BitWidth == 384)
3441 return &AMDGPU::AReg_384_Align2RegClass;
3442 if (BitWidth == 512)
3443 return &AMDGPU::AReg_512_Align2RegClass;
3444 if (BitWidth == 1024)
3445 return &AMDGPU::AReg_1024_Align2RegClass;
3446
3447 return nullptr;
3448}
3449
3450const TargetRegisterClass *
3451 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3452 if (BitWidth == 16)
3453 return &AMDGPU::AGPR_LO16RegClass;
3454 if (BitWidth == 32)
3455 return &AMDGPU::AGPR_32RegClass;
3456 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3457 : getAnyAGPRClassForBitWidth(BitWidth);
3458}
3459
3460static const TargetRegisterClass *
3461 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3462 if (BitWidth == 64)
3463 return &AMDGPU::AV_64RegClass;
3464 if (BitWidth == 96)
3465 return &AMDGPU::AV_96RegClass;
3466 if (BitWidth == 128)
3467 return &AMDGPU::AV_128RegClass;
3468 if (BitWidth == 160)
3469 return &AMDGPU::AV_160RegClass;
3470 if (BitWidth == 192)
3471 return &AMDGPU::AV_192RegClass;
3472 if (BitWidth == 224)
3473 return &AMDGPU::AV_224RegClass;
3474 if (BitWidth == 256)
3475 return &AMDGPU::AV_256RegClass;
3476 if (BitWidth == 288)
3477 return &AMDGPU::AV_288RegClass;
3478 if (BitWidth == 320)
3479 return &AMDGPU::AV_320RegClass;
3480 if (BitWidth == 352)
3481 return &AMDGPU::AV_352RegClass;
3482 if (BitWidth == 384)
3483 return &AMDGPU::AV_384RegClass;
3484 if (BitWidth == 512)
3485 return &AMDGPU::AV_512RegClass;
3486 if (BitWidth == 1024)
3487 return &AMDGPU::AV_1024RegClass;
3488
3489 return nullptr;
3490}
3491
3492static const TargetRegisterClass *
3493 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3494 if (BitWidth == 64)
3495 return &AMDGPU::AV_64_Align2RegClass;
3496 if (BitWidth == 96)
3497 return &AMDGPU::AV_96_Align2RegClass;
3498 if (BitWidth == 128)
3499 return &AMDGPU::AV_128_Align2RegClass;
3500 if (BitWidth == 160)
3501 return &AMDGPU::AV_160_Align2RegClass;
3502 if (BitWidth == 192)
3503 return &AMDGPU::AV_192_Align2RegClass;
3504 if (BitWidth == 224)
3505 return &AMDGPU::AV_224_Align2RegClass;
3506 if (BitWidth == 256)
3507 return &AMDGPU::AV_256_Align2RegClass;
3508 if (BitWidth == 288)
3509 return &AMDGPU::AV_288_Align2RegClass;
3510 if (BitWidth == 320)
3511 return &AMDGPU::AV_320_Align2RegClass;
3512 if (BitWidth == 352)
3513 return &AMDGPU::AV_352_Align2RegClass;
3514 if (BitWidth == 384)
3515 return &AMDGPU::AV_384_Align2RegClass;
3516 if (BitWidth == 512)
3517 return &AMDGPU::AV_512_Align2RegClass;
3518 if (BitWidth == 1024)
3519 return &AMDGPU::AV_1024_Align2RegClass;
3520
3521 return nullptr;
3522}
3523
3524const TargetRegisterClass *
3525 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3526 if (BitWidth == 32)
3527 return &AMDGPU::AV_32RegClass;
3528 return ST.needsAlignedVGPRs()
3529 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3530 : getAnyVectorSuperClassForBitWidth(BitWidth);
3531}
3532
3533const TargetRegisterClass *
3534 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3535 if (BitWidth == 16 || BitWidth == 32)
3536 return &AMDGPU::SReg_32RegClass;
3537 if (BitWidth == 64)
3538 return &AMDGPU::SReg_64RegClass;
3539 if (BitWidth == 96)
3540 return &AMDGPU::SGPR_96RegClass;
3541 if (BitWidth == 128)
3542 return &AMDGPU::SGPR_128RegClass;
3543 if (BitWidth == 160)
3544 return &AMDGPU::SGPR_160RegClass;
3545 if (BitWidth == 192)
3546 return &AMDGPU::SGPR_192RegClass;
3547 if (BitWidth == 224)
3548 return &AMDGPU::SGPR_224RegClass;
3549 if (BitWidth == 256)
3550 return &AMDGPU::SGPR_256RegClass;
3551 if (BitWidth == 288)
3552 return &AMDGPU::SGPR_288RegClass;
3553 if (BitWidth == 320)
3554 return &AMDGPU::SGPR_320RegClass;
3555 if (BitWidth == 352)
3556 return &AMDGPU::SGPR_352RegClass;
3557 if (BitWidth == 384)
3558 return &AMDGPU::SGPR_384RegClass;
3559 if (BitWidth == 512)
3560 return &AMDGPU::SGPR_512RegClass;
3561 if (BitWidth == 1024)
3562 return &AMDGPU::SGPR_1024RegClass;
3563
3564 return nullptr;
3565}
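// Editor's note (illustrative sketch, not part of the upstream source): these
// bit-width lookups are how callers pick a register class for a value of a
// known size, e.g.:
//   const TargetRegisterClass *RC = TRI.getVGPRClassForBitWidth(128);
//   // yields &AMDGPU::VReg_128RegClass, or &AMDGPU::VReg_128_Align2RegClass
//   // when ST.needsAlignedVGPRs() holds (gfx90a-style subtargets).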
3566
3567 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3568 Register Reg) const {
3569 const TargetRegisterClass *RC;
3570 if (Reg.isVirtual())
3571 RC = MRI.getRegClass(Reg);
3572 else
3573 RC = getPhysRegBaseClass(Reg);
3574 return RC && isSGPRClass(RC);
3575}
3576
3577const TargetRegisterClass *
3578 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3579 unsigned Size = getRegSizeInBits(*SRC);
3580
3581 switch (SRC->getID()) {
3582 default:
3583 break;
3584 case AMDGPU::VS_32_Lo256RegClassID:
3585 case AMDGPU::VS_64_Lo256RegClassID:
3586 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3587 }
3588
3589 const TargetRegisterClass *VRC =
3590 getAllocatableClass(getVGPRClassForBitWidth(Size));
3591 assert(VRC && "Invalid register class size");
3592 return VRC;
3593}
3594
3595const TargetRegisterClass *
3596 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3597 unsigned Size = getRegSizeInBits(*SRC);
3598 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3599 assert(ARC && "Invalid register class size");
3600 return ARC;
3601}
3602
3603const TargetRegisterClass *
3604 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3605 unsigned Size = getRegSizeInBits(*VRC);
3606 if (Size == 32)
3607 return &AMDGPU::SGPR_32RegClass;
3608 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3609 assert(SRC && "Invalid register class size");
3610 return SRC;
3611}
3612
3613const TargetRegisterClass *
3614 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3615 const TargetRegisterClass *SubRC,
3616 unsigned SubIdx) const {
3617 // Ensure this subregister index is aligned in the super register.
3618 const TargetRegisterClass *MatchRC =
3619 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3620 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3621}
3622
3623 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3624 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3625 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3626 return !ST.hasMFMAInlineLiteralBug();
3627
3628 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3629 OpType <= AMDGPU::OPERAND_SRC_LAST;
3630}
3631
3632bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3633 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3634 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3635 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3636}
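// Editor's note (illustrative, not part of the upstream source): an operand
// typed e.g. OPERAND_REG_IMM_INT32 accepts a 32-bit literal, whereas inline
// constants (small integers and selected FP values such as 0.5, 1.0, -1.0)
// are the cheaper subset encoded directly in the instruction word.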
3637
3638 /// Returns the lowest register that is not used at any point in the function.
3639 /// If all registers are used, then this function will return
3640 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
3641 /// highest unused register.
3642 MCRegister SIRegisterInfo::findUnusedRegister(
3643 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3644 const MachineFunction &MF, bool ReserveHighestRegister) const {
3645 if (ReserveHighestRegister) {
3646 for (MCRegister Reg : reverse(*RC))
3647 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3648 return Reg;
3649 } else {
3650 for (MCRegister Reg : *RC)
3651 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3652 return Reg;
3653 }
3654 return MCRegister();
3655}
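// Editor's note (illustrative sketch, not part of the upstream source): a
// typical use is grabbing a scratch SGPR without spilling, e.g.:
//   MCRegister Tmp =
//       TRI.findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass, MF);
//   if (!Tmp) { /* every SGPR is used; fall back to spilling */ }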
3656
3657 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3658 const RegisterBankInfo &RBI,
3659 Register Reg) const {
3660 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3661 if (!RB)
3662 return false;
3663
3664 return !RBI.isDivergentRegBank(RB);
3665}
3666
3667 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3668 unsigned EltSize) const {
3669 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3670 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3671
3672 const unsigned RegHalves = RegBitWidth / 16;
3673 const unsigned EltHalves = EltSize / 2;
3674 assert(RegSplitParts.size() + 1 >= EltHalves);
3675
3676 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3677 const unsigned NumParts = RegHalves / EltHalves;
3678
3679 return ArrayRef(Parts.data(), NumParts);
3680}
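// Editor's note (worked example, not part of the upstream source): for a
// 128-bit register class with EltSize == 4, RegHalves = 128/16 = 8 and
// EltHalves = 4/2 = 2, so NumParts = 4 and the result is the four dword
// subregister indices (sub0, sub1, sub2, sub3).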
3681
3682 const TargetRegisterClass *
3683 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3684 Register Reg) const {
3685 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3686}
3687
3688const TargetRegisterClass *
3689 SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3690 const MachineOperand &MO) const {
3691 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3692 return getSubRegisterClass(SrcRC, MO.getSubReg());
3693}
3694
3695 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3696 Register Reg) const {
3697 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3698 // Registers without classes are unaddressable, SGPR-like registers.
3699 return RC && isVGPRClass(RC);
3700}
3701
3702 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3703 Register Reg) const {
3704 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3705
3706 // Registers without classes are unaddressable, SGPR-like registers.
3707 return RC && isAGPRClass(RC);
3708}
3709
3710 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3711 const TargetRegisterClass *SrcRC,
3712 unsigned SubReg,
3713 const TargetRegisterClass *DstRC,
3714 unsigned DstSubReg,
3715 const TargetRegisterClass *NewRC,
3716 LiveIntervals &LIS) const {
3717 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3718 unsigned DstSize = getRegSizeInBits(*DstRC);
3719 unsigned NewSize = getRegSizeInBits(*NewRC);
3720
3721 // Do not increase the size of registers beyond a dword: we would need to
3722 // allocate adjacent registers and constrain regalloc more than needed.
3723
3724 // Always allow dword coalescing.
3725 if (SrcSize <= 32 || DstSize <= 32)
3726 return true;
3727
3728 return NewSize <= DstSize || NewSize <= SrcSize;
3729}
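// Editor's note (worked example, not part of the upstream source): coalescing
// two 64-bit registers into a fresh 128-bit class is rejected here (NewSize
// 128 exceeds both inputs), whereas any copy involving a 32-bit register is
// always allowed to coalesce.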
3730
3731 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3732 MachineFunction &MF) const {
3733 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3734 switch (RC->getID()) {
3735 default:
3736 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3737 case AMDGPU::VGPR_32RegClassID:
3738 return std::min(
3739 ST.getMaxNumVGPRs(
3740 MinOcc,
3742 ST.getMaxNumVGPRs(MF));
3743 case AMDGPU::SGPR_32RegClassID:
3744 case AMDGPU::SGPR_LO16RegClassID:
3745 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3746 }
3747}
3748
3749 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3750 unsigned Idx) const {
3751 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3752 case AMDGPU::RegisterPressureSets::VGPR_32:
3753 case AMDGPU::RegisterPressureSets::AGPR_32:
3754 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3755 const_cast<MachineFunction &>(MF));
3756 case AMDGPU::RegisterPressureSets::SReg_32:
3757 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3758 const_cast<MachineFunction &>(MF));
3759 }
3760
3761 llvm_unreachable("Unexpected register pressure set!");
3762}
3763
3764const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3765 static const int Empty[] = { -1 };
3766
3767 if (RegPressureIgnoredUnits[RegUnit])
3768 return Empty;
3769
3770 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3771}
3772
3773 bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3774 ArrayRef<MCPhysReg> Order,
3775 SmallVectorImpl<MCPhysReg> &Hints,
3776 const MachineFunction &MF,
3777 const VirtRegMap *VRM,
3778 const LiveRegMatrix *Matrix) const {
3779
3780 const MachineRegisterInfo &MRI = MF.getRegInfo();
3781 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3782
3783 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3784
3785 switch (Hint.first) {
3786 case AMDGPURI::Size32: {
3787 Register Paired = Hint.second;
3788 assert(Paired);
3789 Register PairedPhys;
3790 if (Paired.isPhysical()) {
3791 PairedPhys =
3792 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3793 } else if (VRM && VRM->hasPhys(Paired)) {
3794 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3795 &AMDGPU::VGPR_32RegClass);
3796 }
3797
3798 // Prefer the paired physreg.
3799 if (PairedPhys)
3800 // isLo(Paired) is implicitly true here from the API of
3801 // getMatchingSuperReg.
3802 Hints.push_back(PairedPhys);
3803 return false;
3804 }
3805 case AMDGPURI::Size16: {
3806 Register Paired = Hint.second;
3807 assert(Paired);
3808 Register PairedPhys;
3809 if (Paired.isPhysical()) {
3810 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3811 } else if (VRM && VRM->hasPhys(Paired)) {
3812 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3813 }
3814
3815 // First prefer the paired physreg.
3816 if (PairedPhys)
3817 Hints.push_back(PairedPhys);
3818 else {
3819 // Add all the lo16 physregs.
3820 // When the Paired operand has not yet been assigned a physreg, it is
3821 // better to try putting VirtReg in a lo16 register, because Paired may
3822 // later be assigned to the overlapping register, allowing the COPY to
3823 // be eliminated.
3824 for (MCPhysReg PhysReg : Order) {
3825 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3826 continue;
3827 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3828 !MRI.isReserved(PhysReg))
3829 Hints.push_back(PhysReg);
3830 }
3831 }
3832 return false;
3833 }
3834 default:
3835 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3836 VRM);
3837 }
3838}
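// Editor's note (illustrative, not part of the upstream source): for a Size16
// hint whose paired 32-bit vreg was already assigned VGPR3, the hint pushed
// here is its lo16 subregister (VGPR3_LO16), so both halves share one 32-bit
// register and the connecting COPY can later be deleted.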
3839
3841 // Not a callee saved register.
3842 return AMDGPU::SGPR30_SGPR31;
3843}
3844
3845const TargetRegisterClass *
3846 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3847 const RegisterBank &RB) const {
3848 switch (RB.getID()) {
3849 case AMDGPU::VGPRRegBankID:
3850 return getVGPRClassForBitWidth(
3851 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3852 case AMDGPU::VCCRegBankID:
3853 assert(Size == 1);
3854 return getWaveMaskRegClass();
3855 case AMDGPU::SGPRRegBankID:
3856 return getSGPRClassForBitWidth(std::max(32u, Size));
3857 case AMDGPU::AGPRRegBankID:
3858 return getAGPRClassForBitWidth(std::max(32u, Size));
3859 default:
3860 llvm_unreachable("unknown register bank");
3861 }
3862}
3863
3864const TargetRegisterClass *
3865 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3866 const MachineRegisterInfo &MRI) const {
3867 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3868 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3869 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3870
3871 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3872 return getAllocatableClass(RC);
3873
3874 return nullptr;
3875}
3876
3877 MCRegister SIRegisterInfo::getVCC() const {
3878 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3879}
3880
3881 MCRegister SIRegisterInfo::getExec() const {
3882 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3883}
3884
3885 const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3886 // VGPR tuples have an alignment requirement on gfx90a variants.
3887 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3888 : &AMDGPU::VReg_64RegClass;
3889}
3890
3891const TargetRegisterClass *
3892SIRegisterInfo::getRegClass(unsigned RCID) const {
3893 switch ((int)RCID) {
3894 case AMDGPU::SReg_1RegClassID:
3895 return getBoolRC();
3896 case AMDGPU::SReg_1_XEXECRegClassID:
3897 return getWaveMaskRegClass();
3898 case -1:
3899 return nullptr;
3900 default:
3901 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3902 }
3903}
3904
3905// Find reaching register definition
3906 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3907 MachineInstr &Use,
3908 MachineRegisterInfo &MRI,
3909 LiveIntervals *LIS) const {
3910 auto &MDT = LIS->getDomTree();
3911 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3912 SlotIndex DefIdx;
3913
3914 if (Reg.isVirtual()) {
3915 if (!LIS->hasInterval(Reg))
3916 return nullptr;
3917 LiveInterval &LI = LIS->getInterval(Reg);
3918 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3919 : MRI.getMaxLaneMaskForVReg(Reg);
3920 VNInfo *V = nullptr;
3921 if (LI.hasSubRanges()) {
3922 for (auto &S : LI.subranges()) {
3923 if ((S.LaneMask & SubLanes) == SubLanes) {
3924 V = S.getVNInfoAt(UseIdx);
3925 break;
3926 }
3927 }
3928 } else {
3929 V = LI.getVNInfoAt(UseIdx);
3930 }
3931 if (!V)
3932 return nullptr;
3933 DefIdx = V->def;
3934 } else {
3935 // Find last def.
3936 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3937 LiveRange &LR = LIS->getRegUnit(Unit);
3938 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3939 if (!DefIdx.isValid() ||
3940 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3941 LIS->getInstructionFromIndex(V->def)))
3942 DefIdx = V->def;
3943 } else {
3944 return nullptr;
3945 }
3946 }
3947 }
3948
3949 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3950
3951 if (!Def || !MDT.dominates(Def, &Use))
3952 return nullptr;
3953
3954 assert(Def->modifiesRegister(Reg, this));
3955
3956 return Def;
3957}
3958
3959 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3960 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3961
3962 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3963 AMDGPU::SReg_32RegClass,
3964 AMDGPU::AGPR_32RegClass } ) {
3965 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3966 return Super;
3967 }
3968 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3969 &AMDGPU::VGPR_32RegClass)) {
3970 return Super;
3971 }
3972
3973 return AMDGPU::NoRegister;
3974}
3975
3976 bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
3977 if (!ST.needsAlignedVGPRs())
3978 return true;
3979
3980 if (isVGPRClass(&RC))
3981 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
3982 if (isAGPRClass(&RC))
3983 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
3984 if (isVectorSuperClass(&RC))
3985 return RC.hasSuperClassEq(
3986 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
3987
3988 assert(&RC != &AMDGPU::VS_64RegClass);
3989
3990 return true;
3991}
3992
3993const TargetRegisterClass *
3994 SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
3995 if (!RC || !ST.needsAlignedVGPRs())
3996 return RC;
3997
3998 unsigned Size = getRegSizeInBits(*RC);
3999 if (Size <= 32)
4000 return RC;
4001
4002 if (RC == &AMDGPU::VS_64RegClass)
4003 return &AMDGPU::VS_64_Align2RegClass;
4004
4005 if (isVGPRClass(RC))
4006 return getVGPRClassForBitWidth(Size);
4007 if (isAGPRClass(RC))
4008 return getAGPRClassForBitWidth(Size);
4009 if (isVectorSuperClass(RC))
4010 return getVectorSuperClassForBitWidth(Size);
4011
4012 return RC;
4013}
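// Editor's note (illustrative, not part of the upstream source): on a
// subtarget where ST.needsAlignedVGPRs() is true,
//   TRI.getProperlyAlignedRC(&AMDGPU::VReg_64RegClass)
// returns &AMDGPU::VReg_64_Align2RegClass, while classes of 32 bits or less
// are returned unchanged.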
4014
4015 ArrayRef<MCPhysReg>
4016 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
4017 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4018}
4019
4020 ArrayRef<MCPhysReg>
4021 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
4022 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4023}
4024
4025 ArrayRef<MCPhysReg>
4026 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
4027 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4028}
4029
4030unsigned
4031 SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
4032 unsigned SubReg) const {
4033 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4034 case SIRCFlags::HasSGPR:
4035 return std::min(128u, getSubRegIdxSize(SubReg));
4036 case SIRCFlags::HasAGPR:
4037 case SIRCFlags::HasVGPR:
4038 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR: // AVGPRs
4039 return std::min(32u, getSubRegIdxSize(SubReg));
4040 default:
4041 break;
4042 }
4043 return 0;
4044}
4045
4046 unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4047 const TargetRegisterClass &RC,
4048 bool IncludeCalls) const {
4049 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
4050 ArrayRef<MCPhysReg> Registers =
4051 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4052 ? RC.getRegisters().take_front(NumArchVGPRs)
4053 : RC.getRegisters();
4054 for (MCPhysReg Reg : reverse(Registers))
4055 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4056 return getHWRegIndex(Reg) + 1;
4057 return 0;
4058}
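// Editor's note (worked example, not part of the upstream source): if the
// highest VGPR a function touches is v7, then for AMDGPU::VGPR_32RegClass
// this returns getHWRegIndex(v7) + 1 == 8, i.e. the number of used VGPRs.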
4059
4060 SmallVector<StringLiteral>
4061 SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4062 const MachineFunction &MF) const {
4063 SmallVector<StringLiteral> RegFlags;
4064 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4065 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4066 RegFlags.push_back("WWM_REG");
4067 return RegFlags;
4068}