1//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI implementation of the TargetRegisterInfo class.
11//
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPU.h"
16#include "GCNSubtarget.h"
20#include "SIRegisterInfo.h"
26
27using namespace llvm;
28
29#define GET_REGINFO_TARGET_DESC
30#include "AMDGPUGenRegisterInfo.inc"
31
static cl::opt<bool> EnableSpillSGPRToVGPR(
33 "amdgpu-spill-sgpr-to-vgpr",
34 cl::desc("Enable spilling SGPRs to VGPRs"),
 cl::ReallyHidden,
36 cl::init(true));
37
38std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
39std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
40
41// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
42// Valid indexes are shifted by 1, such that a 0 mapping means unsupported.
43// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
44// meaning index 7 in SubRegFromChannelTable.
45static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
46 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
47
48static void emitUnsupportedError(const Function &Fn, const MachineInstr &MI,
49 const Twine &ErrMsg) {
 Fn.getContext().diagnose(
51 DiagnosticInfoUnsupported(Fn, ErrMsg, MI.getDebugLoc()));
52}
53
54namespace llvm {
55
56// A temporary struct to spill SGPRs.
57// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes emits
58// just v_writelane and v_readlane.
59//
60// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
61// is saved to scratch (or the other way around for loads).
62// For this, a VGPR is required where the needed lanes can be clobbered. The
63// RegScavenger can provide a VGPR where currently active lanes can be
64// clobbered, but we still need to save inactive lanes.
65// The high-level steps are:
66// - Try to scavenge SGPR(s) to save exec
67// - Try to scavenge VGPR
68// - Save needed, all or inactive lanes of a TmpVGPR
69// - Spill/Restore SGPRs using TmpVGPR
70// - Restore TmpVGPR
71//
72// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
73// cannot scavenge temporary SGPRs to save exec, we use the following code:
74// buffer_store_dword TmpVGPR ; only if active lanes need to be saved
75// s_not exec, exec
76// buffer_store_dword TmpVGPR ; save inactive lanes
77// s_not exec, exec
struct SGPRSpillBuilder {
79 struct PerVGPRData {
80 unsigned PerVGPR;
81 unsigned NumVGPRs;
82 int64_t VGPRLanes;
83 };
84
85 // The SGPR to save
 Register SuperReg;
 MachineBasicBlock::iterator MI;
 ArrayRef<int16_t> SplitParts;
89 unsigned NumSubRegs;
90 bool IsKill;
91 const DebugLoc &DL;
92
93 /* When spilling to stack */
94 // The SGPRs are written into this VGPR, which is then written to scratch
95 // (or vice versa for loads).
96 Register TmpVGPR = AMDGPU::NoRegister;
97 // Temporary spill slot to save TmpVGPR to.
98 int TmpVGPRIndex = 0;
99 // If TmpVGPR is live before the spill or if it is scavenged.
100 bool TmpVGPRLive = false;
101 // Scavenged SGPR to save EXEC.
102 Register SavedExecReg = AMDGPU::NoRegister;
103 // Stack index to write the SGPRs to.
104 int Index;
105 unsigned EltSize = 4;
106
 RegScavenger *RS;
 MachineBasicBlock *MBB;
 MachineFunction &MF;
 SIMachineFunctionInfo &MFI;
 const SIInstrInfo &TII;
 const SIRegisterInfo &TRI;
 bool IsWave32;
 Register ExecReg;
115 unsigned MovOpc;
116 unsigned NotOpc;
117
 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
 bool IsWave32, MachineBasicBlock::iterator MI, int Index,
 RegScavenger *RS)
121 : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
122 MI->getOperand(0).isKill(), Index, RS) {}
123
 SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
 bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
126 bool IsKill, int Index, RegScavenger *RS)
127 : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
128 Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
129 MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
 IsWave32(IsWave32) {
131 const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
132 SplitParts = TRI.getRegSplitParts(RC, EltSize);
133 NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
134
135 if (IsWave32) {
136 ExecReg = AMDGPU::EXEC_LO;
137 MovOpc = AMDGPU::S_MOV_B32;
138 NotOpc = AMDGPU::S_NOT_B32;
139 } else {
140 ExecReg = AMDGPU::EXEC;
141 MovOpc = AMDGPU::S_MOV_B64;
142 NotOpc = AMDGPU::S_NOT_B64;
143 }
144
145 assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
146 assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
147 SuperReg != AMDGPU::EXEC && "exec should never spill");
148 }
149
 PerVGPRData getPerVGPRData() {
 PerVGPRData Data;
152 Data.PerVGPR = IsWave32 ? 32 : 64;
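 // NumVGPRs is a ceiling division: each temporary VGPR holds PerVGPR lanes,
 // so that many VGPRs are needed to cover NumSubRegs SGPRs. VGPRLanes is a
 // mask of the low min(PerVGPR, NumSubRegs) lanes actually written by the
 // spill.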
153 Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
154 Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
155 return Data;
156 }
157
158 // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
159 // free.
160 // Writes these instructions if an SGPR can be scavenged:
161 // s_mov_b64 s[6:7], exec ; Save exec
162 // s_mov_b64 exec, 3 ; Wanted lanemask
163 // buffer_store_dword v1 ; Write scavenged VGPR to emergency slot
164 //
165 // Writes these instructions if no SGPR can be scavenged:
166 // buffer_store_dword v0 ; Only if no free VGPR was found
167 // s_not_b64 exec, exec
168 // buffer_store_dword v0 ; Save inactive lanes
169 // ; exec stays inverted, it is flipped back in
170 // ; restore.
171 void prepare() {
172 // Scavenged temporary VGPR to use. It must be scavenged once for any number
173 // of spilled subregs.
174 // FIXME: The liveness analysis is limited and does not tell if a register
175 // is in use in lanes that are currently inactive. We can never be sure if
176 // a register is actually in use in another lane, so we need to save all
177 // used lanes of the chosen VGPR.
178 assert(RS && "Cannot spill SGPR to memory without RegScavenger");
179 TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
180 0, false);
181
182 // Reserve temporary stack slot
183 TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
184 if (TmpVGPR) {
185 // Found a register that is dead in the currently active lanes, we only
186 // need to spill inactive lanes.
187 TmpVGPRLive = false;
188 } else {
189 // Pick v0 because it doesn't make a difference.
190 TmpVGPR = AMDGPU::VGPR0;
191 TmpVGPRLive = true;
192 }
193
194 if (TmpVGPRLive) {
195 // We need to inform the scavenger that this index is already in use until
196 // we're done with the custom emergency spill.
197 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
198 }
199
200 // We may end up recursively calling the scavenger, and don't want to re-use
201 // the same register.
202 RS->setRegUsed(TmpVGPR);
203
204 // Try to scavenge SGPRs to save exec
205 assert(!SavedExecReg && "Exec is already saved, refuse to save again");
206 const TargetRegisterClass &RC =
207 IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
208 RS->setRegUsed(SuperReg);
209 SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
210
211 int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
212
213 if (SavedExecReg) {
214 RS->setRegUsed(SavedExecReg);
215 // Set exec to needed lanes
 BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
217 auto I =
218 BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
219 if (!TmpVGPRLive)
 I.addReg(TmpVGPR, RegState::ImplicitDefine);
221 // Spill needed lanes
222 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
223 } else {
224 // The modify and restore of exec clobber SCC, which we would have to save
225 // and restore. FIXME: We probably would need to reserve a register for
226 // this.
227 if (RS->isRegUsed(AMDGPU::SCC))
228 emitUnsupportedError(MF.getFunction(), *MI,
229 "unhandled SGPR spill to memory");
230
231 // Spill active lanes
232 if (TmpVGPRLive)
233 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
234 /*IsKill*/ false);
235 // Spill inactive lanes
236 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
237 if (!TmpVGPRLive)
 I.addReg(TmpVGPR, RegState::ImplicitDefine);
239 I->getOperand(2).setIsDead(); // Mark SCC as dead.
240 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
241 }
242 }
243
244 // Writes these instructions if an SGPR can be scavenged:
245 // buffer_load_dword v1 ; Restore scavenged VGPR from emergency slot
246 // s_waitcnt vmcnt(0) ; If a free VGPR was found
247 // s_mov_b64 exec, s[6:7] ; Restore exec
248 //
249 // Writes these instructions if no SGPR can be scavenged:
250 // buffer_load_dword v0 ; Restore inactive lanes
251 // s_waitcnt vmcnt(0) ; If a free VGPR was found
252 // s_not_b64 exec, exec
253 // buffer_load_dword v0 ; Only if no free VGPR was found
254 void restore() {
255 if (SavedExecReg) {
256 // Restore used lanes
257 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
258 /*IsKill*/ false);
259 // Restore exec
260 auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
 .addReg(SavedExecReg, RegState::Kill);
262 // Add an implicit use of the load so it is not dead.
263 // FIXME This inserts an unnecessary waitcnt
264 if (!TmpVGPRLive) {
 I.addReg(TmpVGPR, RegState::ImplicitKill);
266 }
267 } else {
268 // Restore inactive lanes
269 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
270 /*IsKill*/ false);
271 auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
272 if (!TmpVGPRLive)
 I.addReg(TmpVGPR, RegState::ImplicitKill);
274 I->getOperand(2).setIsDead(); // Mark SCC as dead.
275
276 // Restore active lanes
277 if (TmpVGPRLive)
278 TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
279 }
280
281 // Inform the scavenger where we're releasing our custom scavenged register.
282 if (TmpVGPRLive) {
283 MachineBasicBlock::iterator RestorePt = std::prev(MI);
284 RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
285 }
286 }
287
288 // Write TmpVGPR to memory or read TmpVGPR from memory.
289 // Either using a single buffer_load/store if exec is set to the needed mask
290 // or using
291 // buffer_load
292 // s_not exec, exec
293 // buffer_load
294 // s_not exec, exec
295 void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
296 if (SavedExecReg) {
297 // Spill needed lanes
298 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
299 } else {
300 // The modify and restore of exec clobber SCC, which we would have to save
301 // and restore. FIXME: We probably would need to reserve a register for
302 // this.
303 if (RS->isRegUsed(AMDGPU::SCC))
304 emitUnsupportedError(MF.getFunction(), *MI,
305 "unhandled SGPR spill to memory");
306
307 // Spill active lanes
308 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
309 /*IsKill*/ false);
310 // Spill inactive lanes
311 auto Not0 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
312 Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
313 TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
314 auto Not1 = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
315 Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
316 }
317 }
318
 void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
320 assert(MBB->getParent() == &MF);
321 MI = NewMI;
322 MBB = NewMBB;
323 }
324};
325
326} // namespace llvm
327
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
329 : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour(),
330 ST.getAMDGPUDwarfFlavour(),
331 /*PC=*/0,
332 ST.getHwMode(MCSubtargetInfo::HwMode_RegInfo)),
333 ST(ST), SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
334
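 // Sanity-check the generated lane masks: each 32-bit sub-register must cover
 // exactly two 16-bit lane-mask bits (sub0 == lo16 | hi16), which is what
 // getNumCoveredRegs() relies on.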
335 assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
336 getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
337 (getSubRegIndexLaneMask(AMDGPU::lo16) |
338 getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
339 getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
340 "getNumCoveredRegs() will not work with generated subreg masks!");
341
342 RegPressureIgnoredUnits.resize(getNumRegUnits());
343 RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
344 for (auto Reg : AMDGPU::VGPR_16RegClass) {
345 if (AMDGPU::isHi16Reg(Reg, *this))
346 RegPressureIgnoredUnits.set(*regunits(Reg).begin());
347 }
348
349 // HACK: Until this is fully tablegen'd.
350 static llvm::once_flag InitializeRegSplitPartsFlag;
351
352 static auto InitializeRegSplitPartsOnce = [this]() {
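 // For every sub-register index whose size is a multiple of 16 bits and whose
 // offset is size-aligned, record it at RegSplitParts[Size/16 - 1][Offset/Size].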
353 for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
354 unsigned Size = getSubRegIdxSize(Idx);
355 if (Size & 15)
356 continue;
357 std::vector<int16_t> &Vec = RegSplitParts[Size / 16 - 1];
358 unsigned Pos = getSubRegIdxOffset(Idx);
359 if (Pos % Size)
360 continue;
361 Pos /= Size;
362 if (Vec.empty()) {
363 unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
364 Vec.resize(MaxNumParts);
365 }
366 Vec[Pos] = Idx;
367 }
368 };
369
370 static llvm::once_flag InitializeSubRegFromChannelTableFlag;
371
372 static auto InitializeSubRegFromChannelTableOnce = [this]() {
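 // Build SubRegFromChannelTable so that row (Width - 1), column Offset holds
 // the sub-register index spanning Width DWORDs starting at DWORD Offset,
 // where Width is first remapped through SubRegFromChannelTableWidthMap.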
373 for (auto &Row : SubRegFromChannelTable)
374 Row.fill(AMDGPU::NoSubRegister);
375 for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
376 unsigned Width = getSubRegIdxSize(Idx) / 32;
377 unsigned Offset = getSubRegIdxOffset(Idx) / 32;
 assert(Width < SubRegFromChannelTableWidthMap.size());
379 Width = SubRegFromChannelTableWidthMap[Width];
380 if (Width == 0)
381 continue;
382 unsigned TableIdx = Width - 1;
383 assert(TableIdx < SubRegFromChannelTable.size());
384 assert(Offset < SubRegFromChannelTable[TableIdx].size());
385 SubRegFromChannelTable[TableIdx][Offset] = Idx;
386 }
387 };
388
389 llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
390 llvm::call_once(InitializeSubRegFromChannelTableFlag,
391 InitializeSubRegFromChannelTableOnce);
392}
393
394void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
395 MCRegister Reg) const {
396 for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
397 Reserved.set(*R);
398}
399
400// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
402 const MachineFunction *MF) const {
 CallingConv::ID CC = MF->getFunction().getCallingConv();
404 switch (CC) {
405 case CallingConv::C:
 case CallingConv::Fast:
 case CallingConv::Cold:
408 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
409 : CSR_AMDGPU_SaveList;
 case CallingConv::AMDGPU_Gfx:
 case CallingConv::AMDGPU_Gfx_WholeWave:
412 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
413 : CSR_AMDGPU_SI_Gfx_SaveList;
 case CallingConv::AMDGPU_CS_ChainPreserve:
415 return CSR_AMDGPU_CS_ChainPreserve_SaveList;
416 default: {
417 // Dummy to not crash RegisterClassInfo.
418 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
419 return &NoCalleeSavedReg;
420 }
421 }
422}
423
424const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
426 return nullptr;
427}
428
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
430 CallingConv::ID CC) const {
431 switch (CC) {
432 case CallingConv::C:
 case CallingConv::Fast:
 case CallingConv::Cold:
435 return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
436 : CSR_AMDGPU_RegMask;
 case CallingConv::AMDGPU_Gfx:
 case CallingConv::AMDGPU_Gfx_WholeWave:
439 return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
440 : CSR_AMDGPU_SI_Gfx_RegMask;
 case CallingConv::AMDGPU_CS_Chain:
 case CallingConv::AMDGPU_CS_ChainPreserve:
443 // Calls to these functions never return, so we can pretend everything is
444 // preserved.
445 return AMDGPU_AllVGPRs_RegMask;
446 default:
447 return nullptr;
448 }
449}
450
const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
452 return CSR_AMDGPU_NoRegs_RegMask;
453}
454
bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
456 return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
457}
458
const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
461 const MachineFunction &MF) const {
462 // FIXME: Should have a helper function like getEquivalentVGPRClass to get the
463 // equivalent AV class. If we used one, the verifier would crash after
464 // RegBankSelect in the GISel flow. The aligned regclasses are not fully given
465 // until Instruction selection.
466 if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
467 if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
468 return &AMDGPU::AV_32RegClass;
469 if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
470 return &AMDGPU::AV_64RegClass;
471 if (RC == &AMDGPU::VReg_64_Align2RegClass ||
472 RC == &AMDGPU::AReg_64_Align2RegClass)
473 return &AMDGPU::AV_64_Align2RegClass;
474 if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
475 return &AMDGPU::AV_96RegClass;
476 if (RC == &AMDGPU::VReg_96_Align2RegClass ||
477 RC == &AMDGPU::AReg_96_Align2RegClass)
478 return &AMDGPU::AV_96_Align2RegClass;
479 if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
480 return &AMDGPU::AV_128RegClass;
481 if (RC == &AMDGPU::VReg_128_Align2RegClass ||
482 RC == &AMDGPU::AReg_128_Align2RegClass)
483 return &AMDGPU::AV_128_Align2RegClass;
484 if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
485 return &AMDGPU::AV_160RegClass;
486 if (RC == &AMDGPU::VReg_160_Align2RegClass ||
487 RC == &AMDGPU::AReg_160_Align2RegClass)
488 return &AMDGPU::AV_160_Align2RegClass;
489 if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
490 return &AMDGPU::AV_192RegClass;
491 if (RC == &AMDGPU::VReg_192_Align2RegClass ||
492 RC == &AMDGPU::AReg_192_Align2RegClass)
493 return &AMDGPU::AV_192_Align2RegClass;
494 if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
495 return &AMDGPU::AV_256RegClass;
496 if (RC == &AMDGPU::VReg_256_Align2RegClass ||
497 RC == &AMDGPU::AReg_256_Align2RegClass)
498 return &AMDGPU::AV_256_Align2RegClass;
499 if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
500 return &AMDGPU::AV_512RegClass;
501 if (RC == &AMDGPU::VReg_512_Align2RegClass ||
502 RC == &AMDGPU::AReg_512_Align2RegClass)
503 return &AMDGPU::AV_512_Align2RegClass;
504 if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
505 return &AMDGPU::AV_1024RegClass;
506 if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
507 RC == &AMDGPU::AReg_1024_Align2RegClass)
508 return &AMDGPU::AV_1024_Align2RegClass;
509 }
510
 return RC;
512}
513
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
515 const SIFrameLowering *TFI = ST.getFrameLowering();
 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
517
518 // During ISel lowering we always reserve the stack pointer in entry and chain
519 // functions, but never actually want to reference it when accessing our own
520 // frame. If we need a frame pointer we use it, but otherwise we can just use
521 // an immediate "0" which we represent by returning NoRegister.
522 if (FuncInfo->isBottomOfStack()) {
523 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
524 }
525 return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
526 : FuncInfo->getStackPtrOffsetReg();
527}
528
bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
530 // When we need stack realignment, we can't reference off of the
531 // stack pointer, so we reserve a base pointer.
532 return shouldRealignStack(MF);
533}
534
535Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
536
const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
538 return AMDGPU_AllVGPRs_RegMask;
539}
540
const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
542 return AMDGPU_AllAGPRs_RegMask;
543}
544
const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
546 return AMDGPU_AllVectorRegs_RegMask;
547}
548
const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
550 return AMDGPU_AllAllocatableSRegs_RegMask;
551}
552
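// Look up the sub-register index that covers NumRegs 32-bit channels starting
// at channel Channel, using the lazily initialized SubRegFromChannelTable.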
553unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
554 unsigned NumRegs) {
555 assert(NumRegs < SubRegFromChannelTableWidthMap.size());
556 unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
557 assert(NumRegIndex && "Not implemented");
558 assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
559 return SubRegFromChannelTable[NumRegIndex - 1][Channel];
560}
561
MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
564 const unsigned Align,
565 const TargetRegisterClass *RC) const {
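 // Pick the highest Align-aligned group of SGPRs that still fits under this
 // function's SGPR budget, then return the covering register of class RC.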
566 unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
567 MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
568 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
569}
570
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
572 const MachineFunction &MF) const {
573 return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
574}
575
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
577 BitVector Reserved(getNumRegs());
578 Reserved.set(AMDGPU::MODE);
579
 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
581
582 // Reserve special purpose registers.
583 //
584 // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
585 // this seems likely to result in bugs, so I'm marking them as reserved.
586 reserveRegisterTuples(Reserved, AMDGPU::EXEC);
587 reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
588
589 // M0 has to be reserved so that llvm accepts it as a live-in into a block.
590 reserveRegisterTuples(Reserved, AMDGPU::M0);
591
592 // Reserve src_vccz, src_execz, src_scc.
593 reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
594 reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
595 reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
596
597 // Reserve the memory aperture registers
598 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
599 reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
600 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
601 reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
602 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_LO);
603 reserveRegisterTuples(Reserved, AMDGPU::SRC_FLAT_SCRATCH_BASE_HI);
604
605 // Reserve async counters pseudo registers
606 reserveRegisterTuples(Reserved, AMDGPU::ASYNCcnt);
607 reserveRegisterTuples(Reserved, AMDGPU::TENSORcnt);
608
609 // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
610 reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
611
612 // Reserve xnack_mask registers - support is not implemented in Codegen.
613 reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
614
615 // Reserve lds_direct register - support is not implemented in Codegen.
616 reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
617
618 // Reserve Trap Handler registers - support is not implemented in Codegen.
619 reserveRegisterTuples(Reserved, AMDGPU::TBA);
620 reserveRegisterTuples(Reserved, AMDGPU::TMA);
621 reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
622 reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
623 reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
624 reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
625 reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
626 reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
627 reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
628 reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
629
630 // Reserve null register - it shall never be allocated
631 reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);
632
633 // Reserve SGPRs.
634 //
635 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
636 unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
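 // Reserve any SGPR whose tuple would extend past the wave's SGPR budget;
 // registers whose HW index is at or above TotalNumSGPRs (e.g. VCC) are not
 // part of the budget and are left alone.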
637 for (const TargetRegisterClass *RC : regclasses()) {
638 if (RC->isBaseClass() && isSGPRClass(RC)) {
639 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
640 for (MCPhysReg Reg : *RC) {
641 unsigned Index = getHWRegIndex(Reg);
642 if (Index + NumRegs > MaxNumSGPRs && Index < TotalNumSGPRs)
643 Reserved.set(Reg);
644 }
645 }
646 }
647
648 Register ScratchRSrcReg = MFI->getScratchRSrcReg();
649 if (ScratchRSrcReg != AMDGPU::NoRegister) {
650 // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
651 // need to spill.
652 // TODO: May need to reserve a VGPR if doing LDS spilling.
653 reserveRegisterTuples(Reserved, ScratchRSrcReg);
654 }
655
656 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
657 if (LongBranchReservedReg)
658 reserveRegisterTuples(Reserved, LongBranchReservedReg);
659
660 // We have to assume the SP is needed in case there are calls in the function,
661 // which is detected after the function is lowered. If we aren't really going
662 // to need SP, don't bother reserving it.
663 MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
664 if (StackPtrReg) {
665 reserveRegisterTuples(Reserved, StackPtrReg);
666 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
667 }
668
669 MCRegister FrameReg = MFI->getFrameOffsetReg();
670 if (FrameReg) {
671 reserveRegisterTuples(Reserved, FrameReg);
672 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
673 }
674
675 if (hasBasePointer(MF)) {
676 MCRegister BasePtrReg = getBaseRegister();
677 reserveRegisterTuples(Reserved, BasePtrReg);
678 assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
679 }
680
681 // FIXME: Use same reserved register introduced in D149775
682 // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
683 Register ExecCopyReg = MFI->getSGPRForEXECCopy();
684 if (ExecCopyReg)
685 reserveRegisterTuples(Reserved, ExecCopyReg);
686
687 // Reserve VGPRs/AGPRs.
688 //
689 auto [MaxNumVGPRs, MaxNumAGPRs] = ST.getMaxNumVectorRegs(MF.getFunction());
690
691 for (const TargetRegisterClass *RC : regclasses()) {
692 if (RC->isBaseClass() && isVGPRClass(RC)) {
693 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
694 for (MCPhysReg Reg : *RC) {
695 unsigned Index = getHWRegIndex(Reg);
696 if (Index + NumRegs > MaxNumVGPRs)
697 Reserved.set(Reg);
698 }
699 }
700 }
701
702 // Reserve all the AGPRs if there are no instructions to use it.
703 if (!ST.hasMAIInsts())
704 MaxNumAGPRs = 0;
705 for (const TargetRegisterClass *RC : regclasses()) {
706 if (RC->isBaseClass() && isAGPRClass(RC)) {
707 unsigned NumRegs = divideCeil(getRegSizeInBits(*RC), 32);
708 for (MCPhysReg Reg : *RC) {
709 unsigned Index = getHWRegIndex(Reg);
710 if (Index + NumRegs > MaxNumAGPRs)
711 Reserved.set(Reg);
712 }
713 }
714 }
715
716 // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
717 // VGPR available at all times.
718 if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
719 reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
720 }
721
722 // During wwm-regalloc, reserve the registers for per-lane VGPR allocation. The
723 // MFI->getNonWWMRegMask() field will have a valid bitmask only during
724 // wwm-regalloc and it would be empty otherwise.
725 BitVector NonWWMRegMask = MFI->getNonWWMRegMask();
726 if (!NonWWMRegMask.empty()) {
727 for (unsigned RegI = AMDGPU::VGPR0, RegE = AMDGPU::VGPR0 + MaxNumVGPRs;
728 RegI < RegE; ++RegI) {
729 if (NonWWMRegMask.test(RegI))
730 reserveRegisterTuples(Reserved, RegI);
731 }
732 }
733
734 for (Register Reg : MFI->getWWMReservedRegs())
735 reserveRegisterTuples(Reserved, Reg);
736
737 // FIXME: Stop using reserved registers for this.
738 for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
739 reserveRegisterTuples(Reserved, Reg);
740
741 for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
742 reserveRegisterTuples(Reserved, Reg);
743
744 return Reserved;
745}
746
bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
748 MCRegister PhysReg) const {
749 return !MF.getRegInfo().isReserved(PhysReg);
750}
751
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
754 // On entry or in chain functions, the base address is 0, so it can't possibly
755 // need any more alignment.
756
757 // FIXME: Should be able to specify the entry frame alignment per calling
758 // convention instead.
759 if (Info->isBottomOfStack())
760 return false;
761
 return TargetRegisterInfo::shouldRealignStack(MF);
763}
764
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
767 if (Info->isEntryFunction()) {
768 const MachineFrameInfo &MFI = Fn.getFrameInfo();
769 return MFI.hasStackObjects() || MFI.hasCalls();
770 }
771
772 // May need scavenger for dealing with callee saved registers.
773 return true;
774}
775
bool SIRegisterInfo::requiresFrameIndexScavenging(
777 const MachineFunction &MF) const {
778 // Do not use frame virtual registers. They used to be used for SGPRs, but
779 // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
780 // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
781 // spill.
782 return false;
783}
784
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
786 const MachineFunction &MF) const {
787 const MachineFrameInfo &MFI = MF.getFrameInfo();
788 return MFI.hasStackObjects();
789}
790
bool SIRegisterInfo::requiresVirtualBaseRegisters(
792 const MachineFunction &) const {
793 // There are no special dedicated stack or frame pointers.
794 return true;
795}
796
int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
 assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
799
800 int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
801 AMDGPU::OpName::offset);
802 return MI->getOperand(OffIdx).getImm();
803}
804
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
806 int Idx) const {
807 switch (MI->getOpcode()) {
808 case AMDGPU::V_ADD_U32_e32:
809 case AMDGPU::V_ADD_U32_e64:
810 case AMDGPU::V_ADD_CO_U32_e32: {
811 int OtherIdx = Idx == 1 ? 2 : 1;
812 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
813 return OtherOp.isImm() ? OtherOp.getImm() : 0;
814 }
815 case AMDGPU::V_ADD_CO_U32_e64: {
816 int OtherIdx = Idx == 2 ? 3 : 2;
817 const MachineOperand &OtherOp = MI->getOperand(OtherIdx);
818 return OtherOp.isImm() ? OtherOp.getImm() : 0;
819 }
820 default:
821 break;
822 }
823
 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
825 return 0;
826
827 assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
828 AMDGPU::OpName::vaddr) ||
829 (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
830 AMDGPU::OpName::saddr))) &&
831 "Should never see frame index on non-address operand");
832
 return getScratchInstrOffset(MI);
834}
835
static bool isFIPlusImmOrVGPR(const SIRegisterInfo &TRI,
837 const MachineInstr &MI) {
838 assert(MI.getDesc().isAdd());
839 const MachineOperand &Src0 = MI.getOperand(1);
840 const MachineOperand &Src1 = MI.getOperand(2);
841
842 if (Src0.isFI()) {
843 return Src1.isImm() || (Src1.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
844 Src1.getReg()));
845 }
846
847 if (Src1.isFI()) {
848 return Src0.isImm() || (Src0.isReg() && TRI.isVGPR(MI.getMF()->getRegInfo(),
849 Src0.getReg()));
850 }
851
852 return false;
853}
854
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
856 // TODO: Handle v_add_co_u32, v_or_b32, v_and_b32 and scalar opcodes.
857 switch (MI->getOpcode()) {
858 case AMDGPU::V_ADD_U32_e32: {
859 // TODO: We could handle this but it requires work to avoid violating
860 // operand restrictions.
861 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e32) < 2 &&
862 !isFIPlusImmOrVGPR(*this, *MI))
863 return false;
864 [[fallthrough]];
865 }
866 case AMDGPU::V_ADD_U32_e64:
867 // FIXME: This optimization is barely profitable with enableFlatScratch as-is.
868 //
869 // Much of the benefit with the MUBUF handling is we avoid duplicating the
870 // shift of the frame register, which isn't needed with scratch.
871 //
872 // materializeFrameBaseRegister doesn't know the register classes of the
873 // uses, and unconditionally uses an s_add_i32, which will end up using a
874 // copy for the vector uses.
875 return !ST.enableFlatScratch();
876 case AMDGPU::V_ADD_CO_U32_e32:
877 if (ST.getConstantBusLimit(AMDGPU::V_ADD_CO_U32_e32) < 2 &&
878 !isFIPlusImmOrVGPR(*this, *MI))
879 return false;
880 // We can't deal with the case where the carry out has a use (though this
881 // should never happen)
882 return MI->getOperand(3).isDead();
883 case AMDGPU::V_ADD_CO_U32_e64:
884 // TODO: Should we check use_empty instead?
885 return MI->getOperand(1).isDead();
886 default:
887 break;
888 }
889
 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
891 return false;
892
893 int64_t FullOffset = Offset + getScratchInstrOffset(MI);
894
895 const SIInstrInfo *TII = ST.getInstrInfo();
 if (SIInstrInfo::isMUBUF(*MI))
897 return !TII->isLegalMUBUFImmOffset(FullOffset);
898
899 return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
 SIInstrFlags::FlatScratch);
901}
902
Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
904 int FrameIdx,
905 int64_t Offset) const {
906 MachineBasicBlock::iterator Ins = MBB->begin();
907 DebugLoc DL; // Defaults to "unknown"
908
909 if (Ins != MBB->end())
910 DL = Ins->getDebugLoc();
911
912 MachineFunction *MF = MBB->getParent();
913 const SIInstrInfo *TII = ST.getInstrInfo();
 MachineRegisterInfo &MRI = MF->getRegInfo();
915 unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
916 : AMDGPU::V_MOV_B32_e32;
917
918 Register BaseReg = MRI.createVirtualRegister(
919 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
920 : &AMDGPU::VGPR_32RegClass);
921
922 if (Offset == 0) {
923 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
924 .addFrameIndex(FrameIdx);
925 return BaseReg;
926 }
927
928 Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
929
930 Register FIReg = MRI.createVirtualRegister(
931 ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
932 : &AMDGPU::VGPR_32RegClass);
933
934 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
935 .addImm(Offset);
936 BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
937 .addFrameIndex(FrameIdx);
938
939 if (ST.enableFlatScratch()) {
940 // FIXME: Make sure scc isn't live in.
941 BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
942 .addReg(OffsetReg, RegState::Kill)
943 .addReg(FIReg)
944 .setOperandDead(3); // scc
945 return BaseReg;
946 }
947
948 TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
949 .addReg(OffsetReg, RegState::Kill)
950 .addReg(FIReg)
951 .addImm(0); // clamp bit
952
953 return BaseReg;
954}
955
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
957 int64_t Offset) const {
958 const SIInstrInfo *TII = ST.getInstrInfo();
959
960 switch (MI.getOpcode()) {
961 case AMDGPU::V_ADD_U32_e32:
962 case AMDGPU::V_ADD_CO_U32_e32: {
963 MachineOperand *FIOp = &MI.getOperand(2);
964 MachineOperand *ImmOp = &MI.getOperand(1);
965 if (!FIOp->isFI())
966 std::swap(FIOp, ImmOp);
967
968 if (!ImmOp->isImm()) {
969 assert(Offset == 0);
970 FIOp->ChangeToRegister(BaseReg, false);
971 TII->legalizeOperandsVOP2(MI.getMF()->getRegInfo(), MI);
972 return;
973 }
974
975 int64_t TotalOffset = ImmOp->getImm() + Offset;
976 if (TotalOffset == 0) {
977 MI.setDesc(TII->get(AMDGPU::COPY));
978 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
979 MI.removeOperand(I);
980
981 MI.getOperand(1).ChangeToRegister(BaseReg, false);
982 return;
983 }
984
985 ImmOp->setImm(TotalOffset);
986
987 MachineBasicBlock *MBB = MI.getParent();
988 MachineFunction *MF = MBB->getParent();
 MachineRegisterInfo &MRI = MF->getRegInfo();
990
991 // FIXME: materializeFrameBaseRegister does not know the register class of
992 // the uses of the frame index, and assumes SGPR for enableFlatScratch. Emit
993 // a copy so we have a legal operand and hope the register coalescer can
994 // clean it up.
995 if (isSGPRReg(MRI, BaseReg)) {
996 Register BaseRegVGPR =
997 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
998 BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), BaseRegVGPR)
999 .addReg(BaseReg);
1000 MI.getOperand(2).ChangeToRegister(BaseRegVGPR, false);
1001 } else {
1002 MI.getOperand(2).ChangeToRegister(BaseReg, false);
1003 }
1004 return;
1005 }
1006 case AMDGPU::V_ADD_U32_e64:
1007 case AMDGPU::V_ADD_CO_U32_e64: {
1008 int Src0Idx = MI.getNumExplicitDefs();
1009 MachineOperand *FIOp = &MI.getOperand(Src0Idx);
1010 MachineOperand *ImmOp = &MI.getOperand(Src0Idx + 1);
1011 if (!FIOp->isFI())
1012 std::swap(FIOp, ImmOp);
1013
1014 if (!ImmOp->isImm()) {
1015 FIOp->ChangeToRegister(BaseReg, false);
1016 TII->legalizeOperandsVOP3(MI.getMF()->getRegInfo(), MI);
1017 return;
1018 }
1019
1020 int64_t TotalOffset = ImmOp->getImm() + Offset;
1021 if (TotalOffset == 0) {
1022 MI.setDesc(TII->get(AMDGPU::COPY));
1023
1024 for (unsigned I = MI.getNumOperands() - 1; I != 1; --I)
1025 MI.removeOperand(I);
1026
1027 MI.getOperand(1).ChangeToRegister(BaseReg, false);
1028 } else {
1029 FIOp->ChangeToRegister(BaseReg, false);
1030 ImmOp->setImm(TotalOffset);
1031 }
1032
1033 return;
1034 }
1035 default:
1036 break;
1037 }
1038
1039 bool IsFlat = TII->isFLATScratch(MI);
1040
1041#ifndef NDEBUG
1042 // FIXME: Is it possible to be storing a frame index to itself?
1043 bool SeenFI = false;
1044 for (const MachineOperand &MO: MI.operands()) {
1045 if (MO.isFI()) {
1046 if (SeenFI)
1047 llvm_unreachable("should not see multiple frame indices");
1048
1049 SeenFI = true;
1050 }
1051 }
1052#endif
1053
1054 MachineOperand *FIOp =
1055 TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
1056 : AMDGPU::OpName::vaddr);
1057
1058 MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
1059 int64_t NewOffset = OffsetOp->getImm() + Offset;
1060
1061 assert(FIOp && FIOp->isFI() && "frame index must be address operand");
1062 assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
1063
1064 if (IsFlat) {
1065 assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
 SIInstrFlags::FlatScratch) &&
1067 "offset should be legal");
1068 FIOp->ChangeToRegister(BaseReg, false);
1069 OffsetOp->setImm(NewOffset);
1070 return;
1071 }
1072
1073#ifndef NDEBUG
1074 MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
1075 assert(SOffset->isImm() && SOffset->getImm() == 0);
1076#endif
1077
1078 assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
1079
1080 FIOp->ChangeToRegister(BaseReg, false);
1081 OffsetOp->setImm(NewOffset);
1082}
1083
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
1085 Register BaseReg,
1086 int64_t Offset) const {
1087
1088 switch (MI->getOpcode()) {
1089 case AMDGPU::V_ADD_U32_e32:
1090 case AMDGPU::V_ADD_CO_U32_e32:
1091 return true;
1092 case AMDGPU::V_ADD_U32_e64:
1093 case AMDGPU::V_ADD_CO_U32_e64:
1094 return ST.hasVOP3Literal() || AMDGPU::isInlinableIntLiteral(Offset);
1095 default:
1096 break;
1097 }
1098
 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
1100 return false;
1101
1102 int64_t NewOffset = Offset + getScratchInstrOffset(MI);
1103
1104 const SIInstrInfo *TII = ST.getInstrInfo();
 if (SIInstrInfo::isMUBUF(*MI))
1106 return TII->isLegalMUBUFImmOffset(NewOffset);
1107
1108 return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
 SIInstrFlags::FlatScratch);
1110}
1111
1112const TargetRegisterClass *
1114 // This is inaccurate. It depends on the instruction and address space. The
1115 // only place where we should hit this is for dealing with frame indexes /
1116 // private accesses, so this is correct in that case.
1117 return &AMDGPU::VGPR_32RegClass;
1118}
1119
1120const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
1122 return RC == &AMDGPU::SCC_CLASSRegClass ? &AMDGPU::SReg_32RegClass : RC;
1123}
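// Number of 32-bit components (DWORDs) moved by the given SI_SPILL_* or
// SI_BLOCK_SPILL_* pseudo; for block spills this is the population count of
// the mask operand.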
1124
static unsigned getNumSubRegsForSpillOp(const MachineInstr &MI,
1126 const SIInstrInfo *TII) {
1127
1128 unsigned Op = MI.getOpcode();
1129 switch (Op) {
1130 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE:
1131 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE:
1132 // FIXME: This assumes the mask is statically known and not computed at
1133 // runtime. However, some ABIs may want to compute the mask dynamically and
1134 // this will need to be updated.
1135 return llvm::popcount(
1136 (uint64_t)TII->getNamedOperand(MI, AMDGPU::OpName::mask)->getImm());
1137 case AMDGPU::SI_SPILL_S1024_SAVE:
1138 case AMDGPU::SI_SPILL_S1024_RESTORE:
1139 case AMDGPU::SI_SPILL_V1024_SAVE:
1140 case AMDGPU::SI_SPILL_V1024_RESTORE:
1141 case AMDGPU::SI_SPILL_A1024_SAVE:
1142 case AMDGPU::SI_SPILL_A1024_RESTORE:
1143 case AMDGPU::SI_SPILL_AV1024_SAVE:
1144 case AMDGPU::SI_SPILL_AV1024_RESTORE:
1145 return 32;
1146 case AMDGPU::SI_SPILL_S512_SAVE:
1147 case AMDGPU::SI_SPILL_S512_RESTORE:
1148 case AMDGPU::SI_SPILL_V512_SAVE:
1149 case AMDGPU::SI_SPILL_V512_RESTORE:
1150 case AMDGPU::SI_SPILL_A512_SAVE:
1151 case AMDGPU::SI_SPILL_A512_RESTORE:
1152 case AMDGPU::SI_SPILL_AV512_SAVE:
1153 case AMDGPU::SI_SPILL_AV512_RESTORE:
1154 return 16;
1155 case AMDGPU::SI_SPILL_S384_SAVE:
1156 case AMDGPU::SI_SPILL_S384_RESTORE:
1157 case AMDGPU::SI_SPILL_V384_SAVE:
1158 case AMDGPU::SI_SPILL_V384_RESTORE:
1159 case AMDGPU::SI_SPILL_A384_SAVE:
1160 case AMDGPU::SI_SPILL_A384_RESTORE:
1161 case AMDGPU::SI_SPILL_AV384_SAVE:
1162 case AMDGPU::SI_SPILL_AV384_RESTORE:
1163 return 12;
1164 case AMDGPU::SI_SPILL_S352_SAVE:
1165 case AMDGPU::SI_SPILL_S352_RESTORE:
1166 case AMDGPU::SI_SPILL_V352_SAVE:
1167 case AMDGPU::SI_SPILL_V352_RESTORE:
1168 case AMDGPU::SI_SPILL_A352_SAVE:
1169 case AMDGPU::SI_SPILL_A352_RESTORE:
1170 case AMDGPU::SI_SPILL_AV352_SAVE:
1171 case AMDGPU::SI_SPILL_AV352_RESTORE:
1172 return 11;
1173 case AMDGPU::SI_SPILL_S320_SAVE:
1174 case AMDGPU::SI_SPILL_S320_RESTORE:
1175 case AMDGPU::SI_SPILL_V320_SAVE:
1176 case AMDGPU::SI_SPILL_V320_RESTORE:
1177 case AMDGPU::SI_SPILL_A320_SAVE:
1178 case AMDGPU::SI_SPILL_A320_RESTORE:
1179 case AMDGPU::SI_SPILL_AV320_SAVE:
1180 case AMDGPU::SI_SPILL_AV320_RESTORE:
1181 return 10;
1182 case AMDGPU::SI_SPILL_S288_SAVE:
1183 case AMDGPU::SI_SPILL_S288_RESTORE:
1184 case AMDGPU::SI_SPILL_V288_SAVE:
1185 case AMDGPU::SI_SPILL_V288_RESTORE:
1186 case AMDGPU::SI_SPILL_A288_SAVE:
1187 case AMDGPU::SI_SPILL_A288_RESTORE:
1188 case AMDGPU::SI_SPILL_AV288_SAVE:
1189 case AMDGPU::SI_SPILL_AV288_RESTORE:
1190 return 9;
1191 case AMDGPU::SI_SPILL_S256_SAVE:
1192 case AMDGPU::SI_SPILL_S256_RESTORE:
1193 case AMDGPU::SI_SPILL_V256_SAVE:
1194 case AMDGPU::SI_SPILL_V256_RESTORE:
1195 case AMDGPU::SI_SPILL_A256_SAVE:
1196 case AMDGPU::SI_SPILL_A256_RESTORE:
1197 case AMDGPU::SI_SPILL_AV256_SAVE:
1198 case AMDGPU::SI_SPILL_AV256_RESTORE:
1199 return 8;
1200 case AMDGPU::SI_SPILL_S224_SAVE:
1201 case AMDGPU::SI_SPILL_S224_RESTORE:
1202 case AMDGPU::SI_SPILL_V224_SAVE:
1203 case AMDGPU::SI_SPILL_V224_RESTORE:
1204 case AMDGPU::SI_SPILL_A224_SAVE:
1205 case AMDGPU::SI_SPILL_A224_RESTORE:
1206 case AMDGPU::SI_SPILL_AV224_SAVE:
1207 case AMDGPU::SI_SPILL_AV224_RESTORE:
1208 return 7;
1209 case AMDGPU::SI_SPILL_S192_SAVE:
1210 case AMDGPU::SI_SPILL_S192_RESTORE:
1211 case AMDGPU::SI_SPILL_V192_SAVE:
1212 case AMDGPU::SI_SPILL_V192_RESTORE:
1213 case AMDGPU::SI_SPILL_A192_SAVE:
1214 case AMDGPU::SI_SPILL_A192_RESTORE:
1215 case AMDGPU::SI_SPILL_AV192_SAVE:
1216 case AMDGPU::SI_SPILL_AV192_RESTORE:
1217 return 6;
1218 case AMDGPU::SI_SPILL_S160_SAVE:
1219 case AMDGPU::SI_SPILL_S160_RESTORE:
1220 case AMDGPU::SI_SPILL_V160_SAVE:
1221 case AMDGPU::SI_SPILL_V160_RESTORE:
1222 case AMDGPU::SI_SPILL_A160_SAVE:
1223 case AMDGPU::SI_SPILL_A160_RESTORE:
1224 case AMDGPU::SI_SPILL_AV160_SAVE:
1225 case AMDGPU::SI_SPILL_AV160_RESTORE:
1226 return 5;
1227 case AMDGPU::SI_SPILL_S128_SAVE:
1228 case AMDGPU::SI_SPILL_S128_RESTORE:
1229 case AMDGPU::SI_SPILL_V128_SAVE:
1230 case AMDGPU::SI_SPILL_V128_RESTORE:
1231 case AMDGPU::SI_SPILL_A128_SAVE:
1232 case AMDGPU::SI_SPILL_A128_RESTORE:
1233 case AMDGPU::SI_SPILL_AV128_SAVE:
1234 case AMDGPU::SI_SPILL_AV128_RESTORE:
1235 return 4;
1236 case AMDGPU::SI_SPILL_S96_SAVE:
1237 case AMDGPU::SI_SPILL_S96_RESTORE:
1238 case AMDGPU::SI_SPILL_V96_SAVE:
1239 case AMDGPU::SI_SPILL_V96_RESTORE:
1240 case AMDGPU::SI_SPILL_A96_SAVE:
1241 case AMDGPU::SI_SPILL_A96_RESTORE:
1242 case AMDGPU::SI_SPILL_AV96_SAVE:
1243 case AMDGPU::SI_SPILL_AV96_RESTORE:
1244 return 3;
1245 case AMDGPU::SI_SPILL_S64_SAVE:
1246 case AMDGPU::SI_SPILL_S64_RESTORE:
1247 case AMDGPU::SI_SPILL_V64_SAVE:
1248 case AMDGPU::SI_SPILL_V64_RESTORE:
1249 case AMDGPU::SI_SPILL_A64_SAVE:
1250 case AMDGPU::SI_SPILL_A64_RESTORE:
1251 case AMDGPU::SI_SPILL_AV64_SAVE:
1252 case AMDGPU::SI_SPILL_AV64_RESTORE:
1253 return 2;
1254 case AMDGPU::SI_SPILL_S32_SAVE:
1255 case AMDGPU::SI_SPILL_S32_RESTORE:
1256 case AMDGPU::SI_SPILL_V32_SAVE:
1257 case AMDGPU::SI_SPILL_V32_RESTORE:
1258 case AMDGPU::SI_SPILL_A32_SAVE:
1259 case AMDGPU::SI_SPILL_A32_RESTORE:
1260 case AMDGPU::SI_SPILL_AV32_SAVE:
1261 case AMDGPU::SI_SPILL_AV32_RESTORE:
1262 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
1263 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
1264 case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
1265 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
1266 case AMDGPU::SI_SPILL_V16_SAVE:
1267 case AMDGPU::SI_SPILL_V16_RESTORE:
1268 return 1;
1269 default: llvm_unreachable("Invalid spill opcode");
1270 }
1271}
1272
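// The next four helpers translate between the OFFEN (VGPR offset) and OFFSET
// (immediate only) forms of MUBUF scratch loads and stores; they return -1
// for opcodes that have no counterpart.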
1273static int getOffsetMUBUFStore(unsigned Opc) {
1274 switch (Opc) {
1275 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
1276 return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1277 case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
1278 return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
1279 case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
1280 return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
1281 case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
1282 return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
1283 case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
1284 return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
1285 case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
1286 return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
1287 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
1288 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
1289 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
1290 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
1291 default:
1292 return -1;
1293 }
1294}
1295
1296static int getOffsetMUBUFLoad(unsigned Opc) {
1297 switch (Opc) {
1298 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
1299 return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1300 case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
1301 return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
1302 case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
1303 return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
1304 case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
1305 return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
1306 case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
1307 return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
1308 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
1309 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
1310 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
1311 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
1312 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
1313 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
1314 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
1315 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
1316 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
1317 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
1318 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
1319 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
1320 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
1321 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
1322 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
1323 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
1324 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
1325 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
1326 default:
1327 return -1;
1328 }
1329}
1330
1331static int getOffenMUBUFStore(unsigned Opc) {
1332 switch (Opc) {
1333 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
1334 return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
1335 case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
1336 return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
1337 case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
1338 return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
1339 case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
1340 return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
1341 case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
1342 return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
1343 case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
1344 return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
1345 case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
1346 return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
1347 case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
1348 return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
1349 default:
1350 return -1;
1351 }
1352}
1353
1354static int getOffenMUBUFLoad(unsigned Opc) {
1355 switch (Opc) {
1356 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
1357 return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
1358 case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
1359 return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
1360 case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
1361 return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
1362 case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
1363 return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
1364 case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
1365 return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
1366 case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
1367 return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
1368 case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
1369 return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
1370 case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
1371 return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
1372 case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
1373 return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
1374 case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
1375 return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
1376 case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
1377 return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
1378 case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
1379 return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
1380 case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
1381 return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
1382 case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
1383 return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
1384 default:
1385 return -1;
1386 }
1387}
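// Spill or restore one 32-bit lane of a frame index between a VGPR and the
// AGPR (or VGPR) assigned to it via getVGPRToAGPRSpill, using
// v_accvgpr_write/v_accvgpr_read or a plain COPY when both registers are in
// the same bank. Returns an empty MachineInstrBuilder if nothing is assigned.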
1388
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
 MachineBasicBlock &MBB,
 MachineBasicBlock::iterator MI,
1392 int Index, unsigned Lane,
1393 unsigned ValueReg, bool IsKill) {
1394 MachineFunction *MF = MBB.getParent();
 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1396 const SIInstrInfo *TII = ST.getInstrInfo();
1397
1398 MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
1399
1400 if (Reg == AMDGPU::NoRegister)
1401 return MachineInstrBuilder();
1402
1403 bool IsStore = MI->mayStore();
 MachineRegisterInfo &MRI = MF->getRegInfo();
1405 auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1406
1407 unsigned Dst = IsStore ? Reg : ValueReg;
1408 unsigned Src = IsStore ? ValueReg : Reg;
1409 bool IsVGPR = TRI->isVGPR(MRI, Reg);
1410 DebugLoc DL = MI->getDebugLoc();
1411 if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
1412 // Spiller during regalloc may restore a spilled register to its superclass.
1413 // It could result in AGPR spills restored to VGPRs or the other way around,
1414 // leaving the src and dst with identical regclasses at this point. It just
1415 // needs a copy in such cases.
1416 auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
1417 .addReg(Src, getKillRegState(IsKill));
 CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1419 return CopyMIB;
1420 }
1421 unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
1422 : AMDGPU::V_ACCVGPR_READ_B32_e64;
1423
1424 auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
1425 .addReg(Src, getKillRegState(IsKill));
 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1427 return MIB;
1428}
1429
1430// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
1431// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
1433 MachineFrameInfo &MFI,
 MachineBasicBlock::iterator MI,
1435 int Index,
1436 int64_t Offset) {
1437 const SIInstrInfo *TII = ST.getInstrInfo();
1438 MachineBasicBlock *MBB = MI->getParent();
1439 const DebugLoc &DL = MI->getDebugLoc();
1440 bool IsStore = MI->mayStore();
1441
1442 unsigned Opc = MI->getOpcode();
1443 int LoadStoreOp = IsStore ?
 getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
1445 if (LoadStoreOp == -1)
1446 return false;
1447
1448 const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
1449 if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
1450 return true;
1451
1452 MachineInstrBuilder NewMI =
1453 BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
1454 .add(*Reg)
1455 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
1456 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
1457 .addImm(Offset)
1458 .addImm(0) // cpol
1459 .addImm(0) // swz
1460 .cloneMemRefs(*MI);
1461
1462 const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
1463 AMDGPU::OpName::vdata_in);
1464 if (VDataIn)
1465 NewMI.add(*VDataIn);
1466 return true;
1467}
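// Pick the SCRATCH_LOAD/STORE_DWORD[X2..X4] opcode matching EltSize, then
// convert it to the SV or ST addressing form depending on which address
// operands LoadStoreOp carries. Block loads/stores are returned unchanged.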
1468
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
1470 unsigned LoadStoreOp,
1471 unsigned EltSize) {
1472 bool IsStore = TII->get(LoadStoreOp).mayStore();
1473 bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
1474 bool UseST =
1475 !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);
1476
1477 // Handle block load/store first.
1478 if (TII->isBlockLoadStore(LoadStoreOp))
1479 return LoadStoreOp;
1480
1481 switch (EltSize) {
1482 case 4:
1483 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1484 : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
1485 break;
1486 case 8:
1487 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
1488 : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
1489 break;
1490 case 12:
1491 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
1492 : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
1493 break;
1494 case 16:
1495 LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
1496 : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
1497 break;
1498 default:
1499 llvm_unreachable("Unexpected spill load/store size!");
1500 }
1501
1502 if (HasVAddr)
1503 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1504 else if (UseST)
1505 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1506
1507 return LoadStoreOp;
1508}
1509
void SIRegisterInfo::buildSpillLoadStore(
 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
1512 unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1513 MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1514 RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1515 assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
1516
1517 MachineFunction *MF = MBB.getParent();
1518 const SIInstrInfo *TII = ST.getInstrInfo();
1519 const MachineFrameInfo &MFI = MF->getFrameInfo();
1520 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1521
1522 const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
1523 bool IsStore = Desc->mayStore();
1524 bool IsFlat = TII->isFLATScratch(LoadStoreOp);
1525 bool IsBlock = TII->isBlockLoadStore(LoadStoreOp);
1526
1527 bool CanClobberSCC = false;
1528 bool Scavenged = false;
1529 MCRegister SOffset = ScratchOffsetReg;
1530
1531 const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
1532 // On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
1533 const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
1534 const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
1535
1536 // Always use 4 byte operations for AGPRs because we need to scavenge
1537 // a temporary VGPR.
1538 // If we're using a block operation, the element should be the whole block.
1539 unsigned EltSize = IsBlock ? RegWidth
1540 : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u)
1541 : 4u;
1542 unsigned NumSubRegs = RegWidth / EltSize;
1543 unsigned Size = NumSubRegs * EltSize;
1544 unsigned RemSize = RegWidth - Size;
1545 unsigned NumRemSubRegs = RemSize ? 1 : 0;
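 // The value is moved as NumSubRegs pieces of EltSize bytes each, plus at
 // most one remainder piece of RemSize bytes when RegWidth is not a multiple
 // of EltSize.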
1546 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1547 int64_t MaterializedOffset = Offset;
1548
1549 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1550 int64_t ScratchOffsetRegDelta = 0;
1551
1552 if (IsFlat && EltSize > 4) {
1553 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1554 Desc = &TII->get(LoadStoreOp);
1555 }
1556
1557 Align Alignment = MFI.getObjectAlign(Index);
1558 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1559
1560 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1561 "unexpected VGPR spill offset");
1562
1563 // Track a VGPR to use for a constant offset we need to materialize.
1564 Register TmpOffsetVGPR;
1565
1566 // Track a VGPR to use as an intermediate value.
1567 Register TmpIntermediateVGPR;
1568 bool UseVGPROffset = false;
1569
1570 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1571 // combination.
1572 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1573 int64_t VOffset) {
1574 // We are using a VGPR offset
1575 if (IsFlat && SGPRBase) {
1576 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1577 // SGPR, so perform the add as vector.
1578 // We don't need a base SGPR in the kernel.
1579
1580 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1581 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1582 .addReg(SGPRBase)
1583 .addImm(VOffset)
1584 .addImm(0); // clamp
1585 } else {
1586 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1587 .addReg(SGPRBase);
1588 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1589 .addImm(VOffset)
1590 .addReg(TmpOffsetVGPR);
1591 }
1592 } else {
1593 assert(TmpOffsetVGPR);
1594 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1595 .addImm(VOffset);
1596 }
1597 };
1598
1599 bool IsOffsetLegal =
1600 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
 SIInstrFlags::FlatScratch)
1602 : TII->isLegalMUBUFImmOffset(MaxOffset);
1603 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1604 SOffset = MCRegister();
1605
1606 // We don't have access to the register scavenger if this function is called
1607 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1608 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1609 // entry.
1610 if (RS) {
1611 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1612
1613 // Piggy back on the liveness scan we just did to see if SCC is dead.
1614 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1615 } else if (LiveUnits) {
1616 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1617 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1618 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1619 SOffset = Reg;
1620 break;
1621 }
1622 }
1623 }
1624
1625 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1626 SOffset = Register();
1627
1628 if (!SOffset) {
1629 UseVGPROffset = true;
1630
1631 if (RS) {
1632 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1633 } else {
1634 assert(LiveUnits);
1635 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1636 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1637 TmpOffsetVGPR = Reg;
1638 break;
1639 }
1640 }
1641 }
1642
1643 assert(TmpOffsetVGPR);
1644 } else if (!SOffset && CanClobberSCC) {
1645 // There are no free SGPRs, and we are in the process of spilling
1646 // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
1647 // on SI/CI and on VI it is true until we implement spilling using scalar
1648 // stores), we have no way to free up an SGPR. Our solution here is to
1649 // add the offset directly to the ScratchOffset or StackPtrOffset
1650 // register, and then subtract the offset after the spill to return the
1651 // register to its original value.
1652
1653 // TODO: If we don't have to do an emergency stack slot spill, converting
1654 // to use the VGPR offset is fewer instructions.
1655 if (!ScratchOffsetReg)
1656 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1657 SOffset = ScratchOffsetReg;
1658 ScratchOffsetRegDelta = Offset;
1659 } else {
1660 Scavenged = true;
1661 }
1662
1663 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1664 // we can simplify the adjustment of Offset here to just scale with
1665 // WavefrontSize.
1666 if (!IsFlat && !UseVGPROffset)
1667 Offset *= ST.getWavefrontSize();
1668
1669 if (!UseVGPROffset && !SOffset)
1670 report_fatal_error("could not scavenge SGPR to spill in entry function");
1671
1672 if (UseVGPROffset) {
1673 // We are using a VGPR offset
1674 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1675 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1676 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1677 } else {
1678 assert(Offset != 0);
1679 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1680 .addReg(ScratchOffsetReg)
1681 .addImm(Offset);
1682 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1683 }
1684
1685 Offset = 0;
1686 }
1687
1688 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1689 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1690 && "Unexpected vaddr for flat scratch with a FI operand");
1691
1692 if (UseVGPROffset) {
1693 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1694 } else {
1695 assert(ST.hasFlatScratchSTMode());
1696 assert(!TII->isBlockLoadStore(LoadStoreOp) && "Block ops don't have ST");
1697 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1698 }
1699
1700 Desc = &TII->get(LoadStoreOp);
1701 }
1702
1703 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1704 ++i, RegOffset += EltSize) {
1705 if (i == NumSubRegs) {
1706 EltSize = RemSize;
1707 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1708 }
1709 Desc = &TII->get(LoadStoreOp);
1710
1711 if (!IsFlat && UseVGPROffset) {
1712 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1713 : getOffenMUBUFLoad(LoadStoreOp);
1714 Desc = &TII->get(NewLoadStoreOp);
1715 }
1716
1717 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1718 // If we are spilling an AGPR beyond the range of the memory instruction
1719 // offset and need to use a VGPR offset, we ideally have at least 2
1720 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1721 // recycle the VGPR used for the offset, which requires resetting it
1722 // after each subregister.
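// (When TmpOffsetVGPR doubles as TmpIntermediateVGPR, the accvgpr_read /
// accvgpr_write below clobbers it, hence the re-materialization here.)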
1723
1724 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1725 }
1726
1727 unsigned NumRegs = EltSize / 4;
1728 Register SubReg = e == 1
1729 ? ValueReg
1730 : Register(getSubReg(ValueReg,
1731 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1732
1733 unsigned SOffsetRegState = 0;
1734 unsigned SrcDstRegState = getDefRegState(!IsStore);
1735 const bool IsLastSubReg = i + 1 == e;
1736 const bool IsFirstSubReg = i == 0;
1737 if (IsLastSubReg) {
1738 SOffsetRegState |= getKillRegState(Scavenged);
1739 // The last implicit use carries the "Kill" flag.
1740 SrcDstRegState |= getKillRegState(IsKill);
1741 }
1742
1743 // Make sure the whole register is defined if there are undef components by
1744 // adding an implicit def of the super-reg on the first instruction.
1745 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1746 bool NeedSuperRegImpOperand = e > 1;
1747
1748 // Remaining element size to spill into memory after some parts of it
1749 // have been spilled into either AGPRs or VGPRs.
1750 unsigned RemEltSize = EltSize;
1751
1752 // AGPRs to spill VGPRs and vice versa are allocated in reverse order,
1753 // starting from the last lane. If a register cannot be completely
1754 // spilled into another register, this ensures its alignment does not
1755 // change. For targets with a VGPR alignment requirement this is important
1756 // when flat scratch is used, as we might otherwise get a scratch_load or
1757 // scratch_store of an unaligned register.
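// For example (sketch): if v[0:3] must be spilled but only two AGPRs are
// free, v3 and v2 are copied into AGPR lanes first, leaving v[0:1] -- still
// an even-aligned pair -- for the remaining scratch access.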
1758 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1759 LaneE = RegOffset / 4;
1760 Lane >= LaneE; --Lane) {
1761 bool IsSubReg = e > 1 || EltSize > 4;
1762 Register Sub = IsSubReg
1763 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1764 : ValueReg;
1765 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1766 if (!MIB.getInstr())
1767 break;
1768 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1769 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1770 NeedSuperRegDef = false;
1771 }
1772 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1773 NeedSuperRegImpOperand = true;
1774 unsigned State = SrcDstRegState;
1775 if (!IsLastSubReg || (Lane != LaneE))
1776 State &= ~RegState::Kill;
1777 if (!IsFirstSubReg || (Lane != LaneS))
1778 State &= ~RegState::Define;
1779 MIB.addReg(ValueReg, RegState::Implicit | State);
1780 }
1781 RemEltSize -= 4;
1782 }
1783
1784 if (!RemEltSize) // Fully spilled into AGPRs.
1785 continue;
1786
1787 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1788 assert(IsFlat && EltSize > 4);
1789
1790 unsigned NumRegs = RemEltSize / 4;
1791 SubReg = Register(getSubReg(ValueReg,
1792 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1793 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1794 Desc = &TII->get(Opc);
1795 }
1796
1797 unsigned FinalReg = SubReg;
1798
1799 if (IsAGPR) {
1800 assert(EltSize == 4);
1801
1802 if (!TmpIntermediateVGPR) {
1803 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1804 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1805 }
1806 if (IsStore) {
1807 auto AccRead = BuildMI(MBB, MI, DL,
1808 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1809 TmpIntermediateVGPR)
1810 .addReg(SubReg, getKillRegState(IsKill));
1811 if (NeedSuperRegDef)
1812 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1813 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1814 AccRead.addReg(ValueReg, RegState::Implicit);
1815 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1816 }
1817 SubReg = TmpIntermediateVGPR;
1818 } else if (UseVGPROffset) {
1819 if (!TmpOffsetVGPR) {
1820 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1821 MI, false, 0);
1822 RS->setRegUsed(TmpOffsetVGPR);
1823 }
1824 }
1825
1826 Register FinalValueReg = ValueReg;
1827 if (LoadStoreOp == AMDGPU::SCRATCH_LOAD_USHORT_SADDR) {
1828 // If we are loading a 16-bit value with SRAMECC enabled, we need a temp
1829 // 32-bit VGPR to load into and extract the 16 bits into the final register.
1830 ValueReg =
1831 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1832 SubReg = ValueReg;
1833 IsKill = false;
1834 }
1835
1836 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1837 MachineMemOperand *NewMMO =
1838 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1839 commonAlignment(Alignment, RegOffset));
1840
1841 auto MIB =
1842 BuildMI(MBB, MI, DL, *Desc)
1843 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1844
1845 if (UseVGPROffset) {
1846 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1847 // intermediate accvgpr_write.
1848 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1849 }
1850
1851 if (!IsFlat)
1852 MIB.addReg(FuncInfo->getScratchRSrcReg());
1853
1854 if (SOffset == AMDGPU::NoRegister) {
1855 if (!IsFlat) {
1856 if (UseVGPROffset && ScratchOffsetReg) {
1857 MIB.addReg(ScratchOffsetReg);
1858 } else {
1859 assert(FuncInfo->isBottomOfStack());
1860 MIB.addImm(0);
1861 }
1862 }
1863 } else {
1864 MIB.addReg(SOffset, SOffsetRegState);
1865 }
1866
1867 MIB.addImm(Offset + RegOffset);
1868
1869 bool LastUse = MMO->getFlags() & MOLastUse;
1870 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1871
1872 if (!IsFlat)
1873 MIB.addImm(0); // swz
1874 MIB.addMemOperand(NewMMO);
1875
1876 if (FinalValueReg != ValueReg) {
1877 // Extract 16-bit from the loaded 32-bit value.
1878 ValueReg = getSubReg(ValueReg, AMDGPU::lo16);
1879 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B16_t16_e64))
1880 .addReg(FinalValueReg, getDefRegState(true))
1881 .addImm(0)
1882 .addReg(ValueReg, getKillRegState(true))
1883 .addImm(0);
1884 ValueReg = FinalValueReg;
1885 }
1886
1887 if (!IsAGPR && NeedSuperRegDef)
1888 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1889
1890 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1891 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1892 FinalReg)
1893 .addReg(TmpIntermediateVGPR, RegState::Kill);
1894 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1895 }
1896
1897 bool IsSrcDstDef = SrcDstRegState & RegState::Define;
1898 bool PartialReloadCopy = (RemEltSize != EltSize) && !IsStore;
1899 if (NeedSuperRegImpOperand &&
1900 (IsFirstSubReg || (IsLastSubReg && !IsSrcDstDef))) {
1901 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1902 if (PartialReloadCopy)
1903 MIB.addReg(ValueReg, RegState::Implicit);
1904 }
1905
1906 // The epilog restore of a wwm-scratch register can cause undesired
1907 // optimization during machine-cp after PrologEpilogInserter if the same
1908 // register was assigned for return value ABI lowering with a COPY
1909 // instruction. As shown below, with the epilog reload, the earlier COPY
1910 // appears to be dead during machine-cp.
1911 // ...
1912 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1913 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1914 // ...
1915 // Epilog block:
1916 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1917 // ...
1918 // WWM spill restore to preserve the inactive lanes of v0.
1919 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1920 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1921 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1922 // ...
1923 // SI_RETURN implicit $vgpr0
1924 // ...
1925 // To fix it, mark the same reg as a tied op for such restore instructions
1926 // so that it marks a usage for the preceding COPY.
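// With the tied operand the reload above reads, e.g.,
//   $vgpr0 = BUFFER_LOAD ..., implicit $vgpr0(tied-def 0)
// so machine-cp can no longer treat the preceding COPY as dead.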
1927 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1928 MI->readsRegister(SubReg, this)) {
1929 MIB.addReg(SubReg, RegState::Implicit);
1930 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1931 }
1932
1933 // If we're building a block load, we should add artificial uses for the
1934 // CSR VGPRs that are *not* being transferred. This is because liveness
1935 // analysis is not aware of the mask, so we need to somehow inform it that
1936 // those registers are not available before the load and they should not be
1937 // scavenged.
1938 if (!IsStore && TII->isBlockLoadStore(LoadStoreOp))
1939 addImplicitUsesForBlockCSRLoad(MIB, ValueReg);
1940 }
1941
1942 if (ScratchOffsetRegDelta != 0) {
1943 // Subtract the offset we added to the ScratchOffset register.
1944 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1945 .addReg(SOffset)
1946 .addImm(-ScratchOffsetRegDelta);
1947 }
1948}
1949
1950 void SIRegisterInfo::addImplicitUsesForBlockCSRLoad(MachineInstrBuilder &MIB,
1951 Register BlockReg) const {
1952 const MachineFunction *MF = MIB->getParent()->getParent();
1953 const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
1954 uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
1955 Register BaseVGPR = getSubReg(BlockReg, AMDGPU::sub0);
1956 for (unsigned RegOffset = 1; RegOffset < 32; ++RegOffset)
1957 if (!(Mask & (1 << RegOffset)) &&
1958 isCalleeSavedPhysReg(BaseVGPR + RegOffset, *MF))
1959 MIB.addUse(BaseVGPR + RegOffset, RegState::Implicit);
1960}
1961
1962 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1963 int Offset, bool IsLoad,
1964 bool IsKill) const {
1965 // Load/store VGPR
1966 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1967 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1968
1969 Register FrameReg =
1970 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1971 ? getBaseRegister()
1972 : getFrameRegister(SB.MF);
1973
1974 Align Alignment = FrameInfo.getObjectAlign(Index);
1975 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1976 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1977 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1978 SB.EltSize, Alignment);
1979
1980 if (IsLoad) {
1981 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1982 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1983 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1984 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1985 } else {
1986 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1987 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1988 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1989 FrameReg, (int64_t)Offset * SB.EltSize, MMO, SB.RS);
1990 // This only ever adds one VGPR spill
1991 SB.MFI.addToSpilledVGPRs(1);
1992 }
1993}
1994
1995 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1996 RegScavenger *RS, SlotIndexes *Indexes,
1997 LiveIntervals *LIS, bool OnlyToVGPR,
1998 bool SpillToPhysVGPRLane) const {
1999 assert(!MI->getOperand(0).isUndef() &&
2000 "undef spill should have been deleted earlier");
2001
2002 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2003
2004 ArrayRef<SpilledReg> VGPRSpills =
2005 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2006 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2007 bool SpillToVGPR = !VGPRSpills.empty();
2008 if (OnlyToVGPR && !SpillToVGPR)
2009 return false;
2010
2011 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
2012 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
2013
2014 if (SpillToVGPR) {
2015
2016 // Since the stack slot coloring pass tries to optimize SGPR spills,
2017 // VGPR lanes (mapped from the spill stack slot) may be shared by SGPR
2018 // spills of different sizes. The number of VGPR lanes allotted equals
2019 // that needed by the largest SGPR being spilled into them.
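// E.g. if an s64 spill and an s96 spill share a colored slot, the slot is
// given three VGPR lanes and the s64 spill only uses the first two.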
2020 assert(SB.NumSubRegs <= VGPRSpills.size() &&
2021 "Num of SGPRs spilled should be less than or equal to num of "
2022 "the VGPR lanes.");
2023
2024 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2025 Register SubReg =
2026 SB.NumSubRegs == 1
2027 ? SB.SuperReg
2028 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2029 SpilledReg Spill = VGPRSpills[i];
2030
2031 bool IsFirstSubreg = i == 0;
2032 bool IsLastSubreg = i == SB.NumSubRegs - 1;
2033 bool UseKill = SB.IsKill && IsLastSubreg;
2034
2035
2036 // Mark the "old value of vgpr" input undef only if this is the first sgpr
2037 // spill to this specific vgpr in the first basic block.
2038 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2039 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
2040 .addReg(SubReg, getKillRegState(UseKill))
2041 .addImm(Spill.Lane)
2042 .addReg(Spill.VGPR);
2043 if (Indexes) {
2044 if (IsFirstSubreg)
2045 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2046 else
2047 Indexes->insertMachineInstrInMaps(*MIB);
2048 }
2049
2050 if (IsFirstSubreg && SB.NumSubRegs > 1) {
2051 // We may be spilling a super-register which is only partially defined,
2052 // and need to ensure later spills think the value is defined.
2053 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2054 }
2055
2056 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
2057 MIB.addReg(SB.SuperReg, RegState::Implicit);
2058
2059 // FIXME: Since this spills to another register instead of an actual
2060 // frame index, we should delete the frame index when all references to
2061 // it are fixed.
2062 }
2063 } else {
2064 SB.prepare();
2065
2066 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
2067 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2068
2069 // Per VGPR helper data
2070 auto PVD = SB.getPerVGPRData();
2071
2072 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2073 unsigned TmpVGPRFlags = RegState::Undef;
2074
2075 // Write sub registers into the VGPR
2076 for (unsigned i = Offset * PVD.PerVGPR,
2077 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2078 i < e; ++i) {
2079 Register SubReg =
2080 SB.NumSubRegs == 1
2081 ? SB.SuperReg
2082 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2083
2084 MachineInstrBuilder WriteLane =
2085 BuildMI(*SB.MBB, MI, SB.DL,
2086 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
2087 .addReg(SubReg, SubKillState)
2088 .addImm(i % PVD.PerVGPR)
2089 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2090 TmpVGPRFlags = 0;
2091
2092 if (Indexes) {
2093 if (i == 0)
2094 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
2095 else
2096 Indexes->insertMachineInstrInMaps(*WriteLane);
2097 }
2098
2099 // There could be undef components of a spilled super register.
2100 // TODO: Can we detect this and skip the spill?
2101 if (SB.NumSubRegs > 1) {
2102 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2103 unsigned SuperKillState = 0;
2104 if (i + 1 == SB.NumSubRegs)
2105 SuperKillState |= getKillRegState(SB.IsKill);
2106 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2107 }
2108 }
2109
2110 // Write out VGPR
2111 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
2112 }
2113
2114 SB.restore();
2115 }
2116
2117 MI->eraseFromParent();
2118 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2119
2120 if (LIS)
2121 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2122
2123 return true;
2124}
2125
2126 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
2127 RegScavenger *RS, SlotIndexes *Indexes,
2128 LiveIntervals *LIS, bool OnlyToVGPR,
2129 bool SpillToPhysVGPRLane) const {
2130 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
2131
2132 ArrayRef<SpilledReg> VGPRSpills =
2133 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
2134 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
2135 bool SpillToVGPR = !VGPRSpills.empty();
2136 if (OnlyToVGPR && !SpillToVGPR)
2137 return false;
2138
2139 if (SpillToVGPR) {
2140 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
2141 Register SubReg =
2142 SB.NumSubRegs == 1
2143 ? SB.SuperReg
2144 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2145
2146 SpilledReg Spill = VGPRSpills[i];
2147 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2148 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2149 .addReg(Spill.VGPR)
2150 .addImm(Spill.Lane);
2151 if (SB.NumSubRegs > 1 && i == 0)
2152 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2153 if (Indexes) {
2154 if (i == e - 1)
2155 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2156 else
2157 Indexes->insertMachineInstrInMaps(*MIB);
2158 }
2159 }
2160 } else {
2161 SB.prepare();
2162
2163 // Per VGPR helper data
2164 auto PVD = SB.getPerVGPRData();
2165
2166 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2167 // Load in VGPR data
2168 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
2169
2170 // Unpack lanes
2171 for (unsigned i = Offset * PVD.PerVGPR,
2172 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2173 i < e; ++i) {
2174 Register SubReg =
2175 SB.NumSubRegs == 1
2176 ? SB.SuperReg
2177 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2178
2179 bool LastSubReg = (i + 1 == e);
2180 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
2181 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
2182 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2183 .addImm(i);
2184 if (SB.NumSubRegs > 1 && i == 0)
2185 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2186 if (Indexes) {
2187 if (i == e - 1)
2188 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
2189 else
2190 Indexes->insertMachineInstrInMaps(*MIB);
2191 }
2192 }
2193 }
2194
2195 SB.restore();
2196 }
2197
2198 MI->eraseFromParent();
2199
2200 if (LIS)
2201 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
2202
2203 return true;
2204}
2205
2206 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
2207 MachineBasicBlock &RestoreMBB,
2208 Register SGPR, RegScavenger *RS) const {
2209 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
2210 RS);
2211 SB.prepare();
2212 // Generate the spill of SGPR to SB.TmpVGPR.
2213 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
2214 auto PVD = SB.getPerVGPRData();
2215 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2216 unsigned TmpVGPRFlags = RegState::Undef;
2217 // Write sub registers into the VGPR
2218 for (unsigned i = Offset * PVD.PerVGPR,
2219 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2220 i < e; ++i) {
2221 Register SubReg =
2222 SB.NumSubRegs == 1
2223 ? SB.SuperReg
2224 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2225
2226 MachineInstrBuilder WriteLane =
2227 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
2228 SB.TmpVGPR)
2229 .addReg(SubReg, SubKillState)
2230 .addImm(i % PVD.PerVGPR)
2231 .addReg(SB.TmpVGPR, TmpVGPRFlags);
2232 TmpVGPRFlags = 0;
2233 // There could be undef components of a spilled super register.
2234 // TODO: Can we detect this and skip the spill?
2235 if (SB.NumSubRegs > 1) {
2236 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
2237 unsigned SuperKillState = 0;
2238 if (i + 1 == SB.NumSubRegs)
2239 SuperKillState |= getKillRegState(SB.IsKill);
2240 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
2241 }
2242 }
2243 // Don't need to write VGPR out.
2244 }
2245
2246 // Restore clobbered registers in the specified restore block.
2247 MI = RestoreMBB.end();
2248 SB.setMI(&RestoreMBB, MI);
2249 // Generate the restore of SGPR from SB.TmpVGPR.
2250 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2251 // Don't need to load VGPR in.
2252 // Unpack lanes
2253 for (unsigned i = Offset * PVD.PerVGPR,
2254 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2255 i < e; ++i) {
2256 Register SubReg =
2257 SB.NumSubRegs == 1
2258 ? SB.SuperReg
2259 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2260
2261 assert(SubReg.isPhysical());
2262 bool LastSubReg = (i + 1 == e);
2263 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2264 SubReg)
2265 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2266 .addImm(i);
2267 if (SB.NumSubRegs > 1 && i == 0)
2268 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2269 }
2270 }
2271 SB.restore();
2272
2274 return false;
2275}
2276
2277/// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2278/// a VGPR and the stack slot can be safely eliminated when all other users are
2279/// handled.
2280 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2281 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2282 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2283 switch (MI->getOpcode()) {
2284 case AMDGPU::SI_SPILL_S1024_SAVE:
2285 case AMDGPU::SI_SPILL_S512_SAVE:
2286 case AMDGPU::SI_SPILL_S384_SAVE:
2287 case AMDGPU::SI_SPILL_S352_SAVE:
2288 case AMDGPU::SI_SPILL_S320_SAVE:
2289 case AMDGPU::SI_SPILL_S288_SAVE:
2290 case AMDGPU::SI_SPILL_S256_SAVE:
2291 case AMDGPU::SI_SPILL_S224_SAVE:
2292 case AMDGPU::SI_SPILL_S192_SAVE:
2293 case AMDGPU::SI_SPILL_S160_SAVE:
2294 case AMDGPU::SI_SPILL_S128_SAVE:
2295 case AMDGPU::SI_SPILL_S96_SAVE:
2296 case AMDGPU::SI_SPILL_S64_SAVE:
2297 case AMDGPU::SI_SPILL_S32_SAVE:
2298 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2299 case AMDGPU::SI_SPILL_S1024_RESTORE:
2300 case AMDGPU::SI_SPILL_S512_RESTORE:
2301 case AMDGPU::SI_SPILL_S384_RESTORE:
2302 case AMDGPU::SI_SPILL_S352_RESTORE:
2303 case AMDGPU::SI_SPILL_S320_RESTORE:
2304 case AMDGPU::SI_SPILL_S288_RESTORE:
2305 case AMDGPU::SI_SPILL_S256_RESTORE:
2306 case AMDGPU::SI_SPILL_S224_RESTORE:
2307 case AMDGPU::SI_SPILL_S192_RESTORE:
2308 case AMDGPU::SI_SPILL_S160_RESTORE:
2309 case AMDGPU::SI_SPILL_S128_RESTORE:
2310 case AMDGPU::SI_SPILL_S96_RESTORE:
2311 case AMDGPU::SI_SPILL_S64_RESTORE:
2312 case AMDGPU::SI_SPILL_S32_RESTORE:
2313 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2314 default:
2315 llvm_unreachable("not an SGPR spill instruction");
2316 }
2317}
2318
2319 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2320 int SPAdj, unsigned FIOperandNum,
2321 RegScavenger *RS) const {
2322 MachineFunction *MF = MI->getParent()->getParent();
2323 MachineBasicBlock *MBB = MI->getParent();
2324 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2325 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2326 const SIInstrInfo *TII = ST.getInstrInfo();
2327 const DebugLoc &DL = MI->getDebugLoc();
2328
2329 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2330
2332 "unreserved scratch RSRC register");
2333
2334 MachineOperand *FIOp = &MI->getOperand(FIOperandNum);
2335 int Index = MI->getOperand(FIOperandNum).getIndex();
2336
2337 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2338 ? getBaseRegister()
2339 : getFrameRegister(*MF);
2340
2341 switch (MI->getOpcode()) {
2342 // SGPR register spill
2343 case AMDGPU::SI_SPILL_S1024_SAVE:
2344 case AMDGPU::SI_SPILL_S512_SAVE:
2345 case AMDGPU::SI_SPILL_S384_SAVE:
2346 case AMDGPU::SI_SPILL_S352_SAVE:
2347 case AMDGPU::SI_SPILL_S320_SAVE:
2348 case AMDGPU::SI_SPILL_S288_SAVE:
2349 case AMDGPU::SI_SPILL_S256_SAVE:
2350 case AMDGPU::SI_SPILL_S224_SAVE:
2351 case AMDGPU::SI_SPILL_S192_SAVE:
2352 case AMDGPU::SI_SPILL_S160_SAVE:
2353 case AMDGPU::SI_SPILL_S128_SAVE:
2354 case AMDGPU::SI_SPILL_S96_SAVE:
2355 case AMDGPU::SI_SPILL_S64_SAVE:
2356 case AMDGPU::SI_SPILL_S32_SAVE: {
2357 return spillSGPR(MI, Index, RS);
2358 }
2359
2360 // SGPR register restore
2361 case AMDGPU::SI_SPILL_S1024_RESTORE:
2362 case AMDGPU::SI_SPILL_S512_RESTORE:
2363 case AMDGPU::SI_SPILL_S384_RESTORE:
2364 case AMDGPU::SI_SPILL_S352_RESTORE:
2365 case AMDGPU::SI_SPILL_S320_RESTORE:
2366 case AMDGPU::SI_SPILL_S288_RESTORE:
2367 case AMDGPU::SI_SPILL_S256_RESTORE:
2368 case AMDGPU::SI_SPILL_S224_RESTORE:
2369 case AMDGPU::SI_SPILL_S192_RESTORE:
2370 case AMDGPU::SI_SPILL_S160_RESTORE:
2371 case AMDGPU::SI_SPILL_S128_RESTORE:
2372 case AMDGPU::SI_SPILL_S96_RESTORE:
2373 case AMDGPU::SI_SPILL_S64_RESTORE:
2374 case AMDGPU::SI_SPILL_S32_RESTORE: {
2375 return restoreSGPR(MI, Index, RS);
2376 }
2377
2378 // VGPR register spill
2379 case AMDGPU::SI_BLOCK_SPILL_V1024_SAVE: {
2380 // Put mask into M0.
2381 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2382 AMDGPU::M0)
2383 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2384 [[fallthrough]];
2385 }
2386 case AMDGPU::SI_SPILL_V1024_SAVE:
2387 case AMDGPU::SI_SPILL_V512_SAVE:
2388 case AMDGPU::SI_SPILL_V384_SAVE:
2389 case AMDGPU::SI_SPILL_V352_SAVE:
2390 case AMDGPU::SI_SPILL_V320_SAVE:
2391 case AMDGPU::SI_SPILL_V288_SAVE:
2392 case AMDGPU::SI_SPILL_V256_SAVE:
2393 case AMDGPU::SI_SPILL_V224_SAVE:
2394 case AMDGPU::SI_SPILL_V192_SAVE:
2395 case AMDGPU::SI_SPILL_V160_SAVE:
2396 case AMDGPU::SI_SPILL_V128_SAVE:
2397 case AMDGPU::SI_SPILL_V96_SAVE:
2398 case AMDGPU::SI_SPILL_V64_SAVE:
2399 case AMDGPU::SI_SPILL_V32_SAVE:
2400 case AMDGPU::SI_SPILL_V16_SAVE:
2401 case AMDGPU::SI_SPILL_A1024_SAVE:
2402 case AMDGPU::SI_SPILL_A512_SAVE:
2403 case AMDGPU::SI_SPILL_A384_SAVE:
2404 case AMDGPU::SI_SPILL_A352_SAVE:
2405 case AMDGPU::SI_SPILL_A320_SAVE:
2406 case AMDGPU::SI_SPILL_A288_SAVE:
2407 case AMDGPU::SI_SPILL_A256_SAVE:
2408 case AMDGPU::SI_SPILL_A224_SAVE:
2409 case AMDGPU::SI_SPILL_A192_SAVE:
2410 case AMDGPU::SI_SPILL_A160_SAVE:
2411 case AMDGPU::SI_SPILL_A128_SAVE:
2412 case AMDGPU::SI_SPILL_A96_SAVE:
2413 case AMDGPU::SI_SPILL_A64_SAVE:
2414 case AMDGPU::SI_SPILL_A32_SAVE:
2415 case AMDGPU::SI_SPILL_AV1024_SAVE:
2416 case AMDGPU::SI_SPILL_AV512_SAVE:
2417 case AMDGPU::SI_SPILL_AV384_SAVE:
2418 case AMDGPU::SI_SPILL_AV352_SAVE:
2419 case AMDGPU::SI_SPILL_AV320_SAVE:
2420 case AMDGPU::SI_SPILL_AV288_SAVE:
2421 case AMDGPU::SI_SPILL_AV256_SAVE:
2422 case AMDGPU::SI_SPILL_AV224_SAVE:
2423 case AMDGPU::SI_SPILL_AV192_SAVE:
2424 case AMDGPU::SI_SPILL_AV160_SAVE:
2425 case AMDGPU::SI_SPILL_AV128_SAVE:
2426 case AMDGPU::SI_SPILL_AV96_SAVE:
2427 case AMDGPU::SI_SPILL_AV64_SAVE:
2428 case AMDGPU::SI_SPILL_AV32_SAVE:
2429 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2430 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2431 const MachineOperand *VData = TII->getNamedOperand(*MI,
2432 AMDGPU::OpName::vdata);
2433 if (VData->isUndef()) {
2434 MI->eraseFromParent();
2435 return true;
2436 }
2437
2438 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2439 MFI->getStackPtrOffsetReg());
2440
2441 unsigned Opc;
2442 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_SAVE) {
2443 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2444 Opc = AMDGPU::SCRATCH_STORE_SHORT_SADDR_t16;
2445 } else {
2446 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_SAVE
2447 ? AMDGPU::SCRATCH_STORE_BLOCK_SADDR
2448 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2449 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2450 }
2451
2452 auto *MBB = MI->getParent();
2453 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2454 if (IsWWMRegSpill) {
2455 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2456 RS->isRegUsed(AMDGPU::SCC));
2457 }
2458 buildSpillLoadStore(
2459 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2460 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2461 *MI->memoperands_begin(), RS);
2463 if (IsWWMRegSpill)
2464 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2465
2466 MI->eraseFromParent();
2467 return true;
2468 }
2469 case AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE: {
2470 // Put mask into M0.
2471 BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
2472 AMDGPU::M0)
2473 .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::mask));
2474 [[fallthrough]];
2475 }
2476 case AMDGPU::SI_SPILL_V16_RESTORE:
2477 case AMDGPU::SI_SPILL_V32_RESTORE:
2478 case AMDGPU::SI_SPILL_V64_RESTORE:
2479 case AMDGPU::SI_SPILL_V96_RESTORE:
2480 case AMDGPU::SI_SPILL_V128_RESTORE:
2481 case AMDGPU::SI_SPILL_V160_RESTORE:
2482 case AMDGPU::SI_SPILL_V192_RESTORE:
2483 case AMDGPU::SI_SPILL_V224_RESTORE:
2484 case AMDGPU::SI_SPILL_V256_RESTORE:
2485 case AMDGPU::SI_SPILL_V288_RESTORE:
2486 case AMDGPU::SI_SPILL_V320_RESTORE:
2487 case AMDGPU::SI_SPILL_V352_RESTORE:
2488 case AMDGPU::SI_SPILL_V384_RESTORE:
2489 case AMDGPU::SI_SPILL_V512_RESTORE:
2490 case AMDGPU::SI_SPILL_V1024_RESTORE:
2491 case AMDGPU::SI_SPILL_A32_RESTORE:
2492 case AMDGPU::SI_SPILL_A64_RESTORE:
2493 case AMDGPU::SI_SPILL_A96_RESTORE:
2494 case AMDGPU::SI_SPILL_A128_RESTORE:
2495 case AMDGPU::SI_SPILL_A160_RESTORE:
2496 case AMDGPU::SI_SPILL_A192_RESTORE:
2497 case AMDGPU::SI_SPILL_A224_RESTORE:
2498 case AMDGPU::SI_SPILL_A256_RESTORE:
2499 case AMDGPU::SI_SPILL_A288_RESTORE:
2500 case AMDGPU::SI_SPILL_A320_RESTORE:
2501 case AMDGPU::SI_SPILL_A352_RESTORE:
2502 case AMDGPU::SI_SPILL_A384_RESTORE:
2503 case AMDGPU::SI_SPILL_A512_RESTORE:
2504 case AMDGPU::SI_SPILL_A1024_RESTORE:
2505 case AMDGPU::SI_SPILL_AV32_RESTORE:
2506 case AMDGPU::SI_SPILL_AV64_RESTORE:
2507 case AMDGPU::SI_SPILL_AV96_RESTORE:
2508 case AMDGPU::SI_SPILL_AV128_RESTORE:
2509 case AMDGPU::SI_SPILL_AV160_RESTORE:
2510 case AMDGPU::SI_SPILL_AV192_RESTORE:
2511 case AMDGPU::SI_SPILL_AV224_RESTORE:
2512 case AMDGPU::SI_SPILL_AV256_RESTORE:
2513 case AMDGPU::SI_SPILL_AV288_RESTORE:
2514 case AMDGPU::SI_SPILL_AV320_RESTORE:
2515 case AMDGPU::SI_SPILL_AV352_RESTORE:
2516 case AMDGPU::SI_SPILL_AV384_RESTORE:
2517 case AMDGPU::SI_SPILL_AV512_RESTORE:
2518 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2519 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2520 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2521 const MachineOperand *VData = TII->getNamedOperand(*MI,
2522 AMDGPU::OpName::vdata);
2523 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2524 MFI->getStackPtrOffsetReg());
2525
2526 unsigned Opc;
2527 if (MI->getOpcode() == AMDGPU::SI_SPILL_V16_RESTORE) {
2528 assert(ST.enableFlatScratch() && "Flat Scratch is not enabled!");
2529 Opc = ST.d16PreservesUnusedBits()
2530 ? AMDGPU::SCRATCH_LOAD_SHORT_D16_SADDR_t16
2531 : AMDGPU::SCRATCH_LOAD_USHORT_SADDR;
2532 } else {
2533 Opc = MI->getOpcode() == AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE
2534 ? AMDGPU::SCRATCH_LOAD_BLOCK_SADDR
2535 : ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2536 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2537 }
2538
2539 auto *MBB = MI->getParent();
2540 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2541 if (IsWWMRegSpill) {
2542 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2543 RS->isRegUsed(AMDGPU::SCC));
2544 }
2545
2546 buildSpillLoadStore(
2547 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2548 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2549 *MI->memoperands_begin(), RS);
2550
2551 if (IsWWMRegSpill)
2552 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2553
2554 MI->eraseFromParent();
2555 return true;
2556 }
2557 case AMDGPU::V_ADD_U32_e32:
2558 case AMDGPU::V_ADD_U32_e64:
2559 case AMDGPU::V_ADD_CO_U32_e32:
2560 case AMDGPU::V_ADD_CO_U32_e64: {
2561 // TODO: Handle sub, and, or.
2562 unsigned NumDefs = MI->getNumExplicitDefs();
2563 unsigned Src0Idx = NumDefs;
2564
2565 bool HasClamp = false;
2566 MachineOperand *VCCOp = nullptr;
2567
2568 switch (MI->getOpcode()) {
2569 case AMDGPU::V_ADD_U32_e32:
2570 break;
2571 case AMDGPU::V_ADD_U32_e64:
2572 HasClamp = MI->getOperand(3).getImm();
2573 break;
2574 case AMDGPU::V_ADD_CO_U32_e32:
2575 VCCOp = &MI->getOperand(3);
2576 break;
2577 case AMDGPU::V_ADD_CO_U32_e64:
2578 VCCOp = &MI->getOperand(1);
2579 HasClamp = MI->getOperand(4).getImm();
2580 break;
2581 default:
2582 break;
2583 }
2584 bool DeadVCC = !VCCOp || VCCOp->isDead();
2585 MachineOperand &DstOp = MI->getOperand(0);
2586 Register DstReg = DstOp.getReg();
2587
2588 unsigned OtherOpIdx =
2589 FIOperandNum == Src0Idx ? FIOperandNum + 1 : Src0Idx;
2590 MachineOperand *OtherOp = &MI->getOperand(OtherOpIdx);
2591
2592 unsigned Src1Idx = Src0Idx + 1;
2593 Register MaterializedReg = FrameReg;
2594 Register ScavengedVGPR;
2595
2596 int64_t Offset = FrameInfo.getObjectOffset(Index);
2597 // For the non-immediate case, we could fall through to the default
2598 // handling, but we do an in-place update of the result register here to
2599 // avoid scavenging another register.
2600 if (OtherOp->isImm()) {
2601 int64_t TotalOffset = OtherOp->getImm() + Offset;
2602
2603 if (!ST.hasVOP3Literal() && SIInstrInfo::isVOP3(*MI) &&
2604 !AMDGPU::isInlinableIntLiteral(TotalOffset)) {
2605 // If we can't support a VOP3 literal in the VALU instruction, we
2606 // can't specially fold into the add.
2607 // TODO: Handle VOP3->VOP2 shrink to support the fold.
2608 break;
2609 }
2610
2611 OtherOp->setImm(TotalOffset);
2612 Offset = 0;
2613 }
2614
2615 if (FrameReg && !ST.enableFlatScratch()) {
2616 // We should just do an in-place update of the result register. However,
2617 // the value there may also be used by the add, in which case we need a
2618 // temporary register.
2619 //
2620 // FIXME: The scavenger is not finding the result register in the
2621 // common case where the add does not read the register.
2622
2623 ScavengedVGPR = RS->scavengeRegisterBackwards(
2624 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false, /*SPAdj=*/0);
2625
2626 // TODO: If we have a free SGPR, it's sometimes better to use a scalar
2627 // shift.
2628 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64))
2629 .addDef(ScavengedVGPR, RegState::Renamable)
2630 .addImm(ST.getWavefrontSizeLog2())
2631 .addReg(FrameReg);
2632 MaterializedReg = ScavengedVGPR;
2633 }
2634
2635 if ((!OtherOp->isImm() || OtherOp->getImm() != 0) && MaterializedReg) {
2636 if (ST.enableFlatScratch() &&
2637 !TII->isOperandLegal(*MI, Src1Idx, OtherOp)) {
2638 // We didn't need the shift above, so we have an SGPR for the frame
2639 // register, but may have a VGPR only operand.
2640 //
2641 // TODO: On gfx10+, we can easily change the opcode to the e64 version
2642 // and use the higher constant bus restriction to avoid this copy.
2643
2644 if (!ScavengedVGPR) {
2645 ScavengedVGPR = RS->scavengeRegisterBackwards(
2646 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2647 /*SPAdj=*/0);
2648 }
2649
2650 assert(ScavengedVGPR != DstReg);
2651
2652 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2653 .addReg(MaterializedReg,
2654 MaterializedReg != FrameReg ? RegState::Kill : 0);
2655 MaterializedReg = ScavengedVGPR;
2656 }
2657
2658 // TODO: In the flat scratch case, if this is an add of an SGPR, and SCC
2659 // is not live, we could use a scalar add + vector add instead of 2
2660 // vector adds.
2661 auto AddI32 = BuildMI(*MBB, *MI, DL, TII->get(MI->getOpcode()))
2662 .addDef(DstReg, RegState::Renamable);
2663 if (NumDefs == 2)
2664 AddI32.add(MI->getOperand(1));
2665
2666 unsigned MaterializedRegFlags =
2667 MaterializedReg != FrameReg ? RegState::Kill : 0;
2668
2669 if (isVGPRClass(getPhysRegBaseClass(MaterializedReg))) {
2670 // If we know we have a VGPR already, it's more likely the other
2671 // operand is a legal vsrc0.
2672 AddI32
2673 .add(*OtherOp)
2674 .addReg(MaterializedReg, MaterializedRegFlags);
2675 } else {
2676 // Commute operands to avoid violating VOP2 restrictions. This will
2677 // typically happen when using scratch.
2678 AddI32
2679 .addReg(MaterializedReg, MaterializedRegFlags)
2680 .add(*OtherOp);
2681 }
2682
2683 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
2684 MI->getOpcode() == AMDGPU::V_ADD_U32_e64)
2685 AddI32.addImm(0); // clamp
2686
2687 if (MI->getOpcode() == AMDGPU::V_ADD_CO_U32_e32)
2688 AddI32.setOperandDead(3); // Dead vcc
2689
2690 MaterializedReg = DstReg;
2691
2692 OtherOp->ChangeToRegister(MaterializedReg, false);
2693 OtherOp->setIsKill(true);
2694 FIOp->ChangeToImmediate(Offset);
2695 Offset = 0;
2696 } else if (Offset != 0) {
2697 assert(!MaterializedReg);
2698 FIOp->ChangeToImmediate(Offset);
2699 Offset = 0;
2700 } else {
2701 if (DeadVCC && !HasClamp) {
2702 assert(Offset == 0);
2703
2704 // TODO: Losing kills and implicit operands. Just mutate to copy and
2705 // let lowerCopy deal with it?
2706 if (OtherOp->isReg() && OtherOp->getReg() == DstReg) {
2707 // Folded to an identity copy.
2708 MI->eraseFromParent();
2709 return true;
2710 }
2711
2712 // The immediate value should be in OtherOp
2713 MI->setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
2714 MI->removeOperand(FIOperandNum);
2715
2716 unsigned NumOps = MI->getNumOperands();
2717 for (unsigned I = NumOps - 2; I >= NumDefs + 1; --I)
2718 MI->removeOperand(I);
2719
2720 if (NumDefs == 2)
2721 MI->removeOperand(1);
2722
2723 // The code below can't deal with a mov.
2724 return true;
2725 }
2726
2727 // This folded to a constant, but we have to keep the add around for
2728 // pointless implicit defs or clamp modifier.
2729 FIOp->ChangeToImmediate(0);
2730 }
2731
2732 // Try to improve legality by commuting.
2733 if (!TII->isOperandLegal(*MI, Src1Idx) && TII->commuteInstruction(*MI)) {
2734 std::swap(FIOp, OtherOp);
2735 std::swap(FIOperandNum, OtherOpIdx);
2736 }
2737
2738 // We need at most one mov to satisfy the operand constraints. Prefer to
2739 // move the FI operand first, as it may be a literal in a VOP3
2740 // instruction.
2741 for (unsigned SrcIdx : {FIOperandNum, OtherOpIdx}) {
2742 if (!TII->isOperandLegal(*MI, SrcIdx)) {
2743 // If commuting didn't make the operands legal, we need to materialize
2744 // in a register.
2745 // TODO: Can use SGPR on gfx10+ in some cases.
2746 if (!ScavengedVGPR) {
2747 ScavengedVGPR = RS->scavengeRegisterBackwards(
2748 AMDGPU::VGPR_32RegClass, MI, /*RestoreAfter=*/false,
2749 /*SPAdj=*/0);
2750 }
2751
2752 assert(ScavengedVGPR != DstReg);
2753
2754 MachineOperand &Src = MI->getOperand(SrcIdx);
2755 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), ScavengedVGPR)
2756 .add(Src);
2757
2758 Src.ChangeToRegister(ScavengedVGPR, false);
2759 Src.setIsKill(true);
2760 break;
2761 }
2762 }
2763
2764 // Fold out add of 0 case that can appear in kernels.
2765 if (FIOp->isImm() && FIOp->getImm() == 0 && DeadVCC && !HasClamp) {
2766 if (OtherOp->isReg() && OtherOp->getReg() != DstReg) {
2767 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::COPY), DstReg).add(*OtherOp);
2768 }
2769
2770 MI->eraseFromParent();
2771 }
2772
2773 return true;
2774 }
2775 case AMDGPU::S_ADD_I32:
2776 case AMDGPU::S_ADD_U32: {
2777 // TODO: Handle s_or_b32, s_and_b32.
2778 unsigned OtherOpIdx = FIOperandNum == 1 ? 2 : 1;
2779 MachineOperand &OtherOp = MI->getOperand(OtherOpIdx);
2780
2781 assert(FrameReg || MFI->isBottomOfStack());
2782
2783 MachineOperand &DstOp = MI->getOperand(0);
2784 const DebugLoc &DL = MI->getDebugLoc();
2785 Register MaterializedReg = FrameReg;
2786
2787 // Defend against live scc, which should never happen in practice.
2788 bool DeadSCC = MI->getOperand(3).isDead();
2789
2790 Register TmpReg;
2791
2792 // FIXME: Scavenger should figure out that the result register is
2793 // available. Also should do this for the v_add case.
2794 if (OtherOp.isReg() && OtherOp.getReg() != DstOp.getReg())
2795 TmpReg = DstOp.getReg();
2796
2797 if (FrameReg && !ST.enableFlatScratch()) {
2798 // FIXME: In the common case where the add does not also read its result
2799 // (i.e. this isn't a reg += fi), it's not finding the dest reg as
2800 // available.
2801 if (!TmpReg)
2802 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2803 MI, /*RestoreAfter=*/false, 0,
2804 /*AllowSpill=*/false);
2805 if (TmpReg) {
2806 BuildMI(*MBB, *MI, DL, TII->get(AMDGPU::S_LSHR_B32))
2807 .addDef(TmpReg, RegState::Renamable)
2808 .addReg(FrameReg)
2809 .addImm(ST.getWavefrontSizeLog2())
2810 .setOperandDead(3); // Set SCC dead
2811 }
2812 MaterializedReg = TmpReg;
2813 }
2814
2815 int64_t Offset = FrameInfo.getObjectOffset(Index);
2816
2817 // For the non-immediate case, we could fall through to the default
2818 // handling, but we do an in-place update of the result register here to
2819 // avoid scavenging another register.
2820 if (OtherOp.isImm()) {
2821 OtherOp.setImm(OtherOp.getImm() + Offset);
2822 Offset = 0;
2823
2824 if (MaterializedReg)
2825 FIOp->ChangeToRegister(MaterializedReg, false);
2826 else
2827 FIOp->ChangeToImmediate(0);
2828 } else if (MaterializedReg) {
2829 // If we can't fold the other operand, do another increment.
2830 Register DstReg = DstOp.getReg();
2831
2832 if (!TmpReg && MaterializedReg == FrameReg) {
2833 TmpReg = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2834 MI, /*RestoreAfter=*/false, 0,
2835 /*AllowSpill=*/false);
2836 DstReg = TmpReg;
2837 }
2838
2839 if (TmpReg) {
2840 auto AddI32 = BuildMI(*MBB, *MI, DL, MI->getDesc())
2841 .addDef(DstReg, RegState::Renamable)
2842 .addReg(MaterializedReg, RegState::Kill)
2843 .add(OtherOp);
2844 if (DeadSCC)
2845 AddI32.setOperandDead(3);
2846
2847 MaterializedReg = DstReg;
2848
2849 OtherOp.ChangeToRegister(MaterializedReg, false);
2850 OtherOp.setIsKill(true);
2851 OtherOp.setIsRenamable(true);
2852 }
2853 FIOp->ChangeToImmediate(Offset);
2854 } else {
2855 // If we don't have any other offset to apply, we can just directly
2856 // interpret the frame index as the offset.
2857 FIOp->ChangeToImmediate(Offset);
2858 }
2859
2860 if (DeadSCC && OtherOp.isImm() && OtherOp.getImm() == 0) {
2861 assert(Offset == 0);
2862 MI->removeOperand(3);
2863 MI->removeOperand(OtherOpIdx);
2864 MI->setDesc(TII->get(FIOp->isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2865 } else if (DeadSCC && FIOp->isImm() && FIOp->getImm() == 0) {
2866 assert(Offset == 0);
2867 MI->removeOperand(3);
2868 MI->removeOperand(FIOperandNum);
2869 MI->setDesc(
2870 TII->get(OtherOp.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
2871 }
2872
2873 assert(!FIOp->isFI());
2874 return true;
2875 }
2876 default: {
2877 break;
2878 }
2879 }
2880
2881 int64_t Offset = FrameInfo.getObjectOffset(Index);
2882 if (ST.enableFlatScratch()) {
2883 if (TII->isFLATScratch(*MI)) {
2884 assert(
2885 (int16_t)FIOperandNum ==
2886 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::saddr));
2887
2888 // The offset is always swizzled, just replace it
2889 if (FrameReg)
2890 FIOp->ChangeToRegister(FrameReg, false);
2891
2892 MachineOperand *OffsetOp =
2893 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2894 int64_t NewOffset = Offset + OffsetOp->getImm();
2895 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2896 SIInstrFlags::FlatScratch)) {
2897 OffsetOp->setImm(NewOffset);
2898 if (FrameReg)
2899 return false;
2900 Offset = 0;
2901 }
2902
2903 if (!Offset) {
2904 unsigned Opc = MI->getOpcode();
2905 int NewOpc = -1;
2906 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2907 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2908 } else if (ST.hasFlatScratchSTMode()) {
2909 // On GFX10 we have ST mode to use no registers for an address.
2910 // Otherwise we need to materialize 0 into an SGPR.
2911 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2912 }
2913
2914 if (NewOpc != -1) {
2915 // removeOperand doesn't fix up tied operand indexes as it goes, so
2916 // it asserts. Untie vdst_in for now and retie it afterwards.
2917 int VDstIn =
2918 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
2919 bool TiedVDst = VDstIn != -1 && MI->getOperand(VDstIn).isReg() &&
2920 MI->getOperand(VDstIn).isTied();
2921 if (TiedVDst)
2922 MI->untieRegOperand(VDstIn);
2923
2924 MI->removeOperand(
2925 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2926
2927 if (TiedVDst) {
2928 int NewVDst =
2929 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2930 int NewVDstIn =
2931 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2932 assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2933 MI->tieOperands(NewVDst, NewVDstIn);
2934 }
2935 MI->setDesc(TII->get(NewOpc));
2936 return false;
2937 }
2938 }
2939 }
2940
2941 if (!FrameReg) {
2942 FIOp->ChangeToImmediate(Offset);
2943 if (TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp))
2944 return false;
2945 }
2946
2947 // We need to use a register here. Check if we can use an SGPR or need
2948 // a VGPR.
2949 FIOp->ChangeToRegister(AMDGPU::M0, false);
2950 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, FIOp);
2951
2952 if (!Offset && FrameReg && UseSGPR) {
2953 FIOp->setReg(FrameReg);
2954 return false;
2955 }
2956
2957 const TargetRegisterClass *RC =
2958 UseSGPR ? &AMDGPU::SReg_32_XM0RegClass : &AMDGPU::VGPR_32RegClass;
2959
2960 Register TmpReg =
2961 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2962 FIOp->setReg(TmpReg);
2963 FIOp->setIsKill();
2964
2965 if ((!FrameReg || !Offset) && TmpReg) {
2966 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2967 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2968 if (FrameReg)
2969 MIB.addReg(FrameReg);
2970 else
2971 MIB.addImm(Offset);
2972
2973 return false;
2974 }
2975
2976 bool NeedSaveSCC = RS->isRegUsed(AMDGPU::SCC) &&
2977 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
2978
2979 Register TmpSReg =
2980 UseSGPR ? TmpReg
2981 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2982 MI, false, 0, !UseSGPR);
2983
2984 // TODO: for flat scratch another attempt can be made with a VGPR index
2985 // if no SGPRs can be scavenged.
2986 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2987 report_fatal_error("Cannot scavenge register in FI elimination!");
2988
2989 if (!TmpSReg) {
2990 // Use frame register and restore it after.
2991 TmpSReg = FrameReg;
2992 FIOp->setReg(FrameReg);
2993 FIOp->setIsKill(false);
2994 }
2995
2996 if (NeedSaveSCC) {
2997 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2998 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2999 .addReg(FrameReg)
3000 .addImm(Offset);
3001 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
3002 .addReg(TmpSReg)
3003 .addImm(0);
3004 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
3005 .addImm(0)
3006 .addReg(TmpSReg);
3007 } else {
3008 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
3009 .addReg(FrameReg)
3010 .addImm(Offset);
3011 }
3012
3013 if (!UseSGPR)
3014 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3015 .addReg(TmpSReg, RegState::Kill);
3016
3017 if (TmpSReg == FrameReg) {
3018 // Undo frame register modification.
3019 if (NeedSaveSCC &&
3020 !MI->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
3021 MachineBasicBlock::iterator I =
3022 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
3023 TmpSReg)
3024 .addReg(FrameReg)
3025 .addImm(-Offset);
3026 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
3027 .addReg(TmpSReg)
3028 .addImm(0);
3029 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
3030 TmpSReg)
3031 .addImm(0)
3032 .addReg(TmpSReg);
3033 } else {
3034 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
3035 FrameReg)
3036 .addReg(FrameReg)
3037 .addImm(-Offset);
3038 }
3039 }
3040
3041 return false;
3042 }
3043
3044 bool IsMUBUF = TII->isMUBUF(*MI);
3045
3046 if (!IsMUBUF && !MFI->isBottomOfStack()) {
3047 // Convert to a swizzled stack address by scaling by the wave size.
3048 // In an entry function/kernel the offset is already swizzled.
3049 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
3050 bool LiveSCC = RS->isRegUsed(AMDGPU::SCC) &&
3051 !MI->definesRegister(AMDGPU::SCC, /*TRI=*/nullptr);
3052 const TargetRegisterClass *RC = IsSALU && !LiveSCC
3053 ? &AMDGPU::SReg_32RegClass
3054 : &AMDGPU::VGPR_32RegClass;
3055 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
3056 MI->getOpcode() == AMDGPU::V_MOV_B32_e64 ||
3057 MI->getOpcode() == AMDGPU::S_MOV_B32;
3058 Register ResultReg =
3059 IsCopy ? MI->getOperand(0).getReg()
3060 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
3061
3062 int64_t Offset = FrameInfo.getObjectOffset(Index);
3063 if (Offset == 0) {
3064 unsigned OpCode =
3065 IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32 : AMDGPU::V_LSHRREV_B32_e64;
3066 Register TmpResultReg = ResultReg;
3067 if (IsSALU && LiveSCC) {
3068 TmpResultReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
3069 MI, false, 0);
3070 }
3071
3072 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), TmpResultReg);
3073 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
3074 // For V_LSHRREV, the operands are reversed (the shift count goes
3075 // first).
3076 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
3077 else
3078 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
3079 if (IsSALU && !LiveSCC)
3080 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
3081 if (IsSALU && LiveSCC) {
3082 Register NewDest;
3083 if (IsCopy) {
3084 assert(ResultReg.isPhysical());
3085 NewDest = ResultReg;
3086 } else {
3087 NewDest = RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
3088 Shift, false, 0);
3089 }
3090 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), NewDest)
3091 .addReg(TmpResultReg);
3092 ResultReg = NewDest;
3093 }
3094 } else {
3095 MachineInstrBuilder MIB;
3096 if (!IsSALU) {
3097 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
3098 nullptr) {
3099 // Reuse ResultReg in intermediate step.
3100 Register ScaledReg = ResultReg;
3101
3102 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3103 ScaledReg)
3104 .addImm(ST.getWavefrontSizeLog2())
3105 .addReg(FrameReg);
3106
3107 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
3108
3109 // TODO: Fold if use instruction is another add of a constant.
3110 if (IsVOP2 ||
3111 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
3112 // FIXME: This can fail
3113 MIB.addImm(Offset);
3114 MIB.addReg(ScaledReg, RegState::Kill);
3115 if (!IsVOP2)
3116 MIB.addImm(0); // clamp bit
3117 } else {
3118 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
3119 "Need to reuse carry out register");
3120
3121 // Use scavenged unused carry out as offset register.
3122 Register ConstOffsetReg;
3123 if (!isWave32)
3124 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
3125 else
3126 ConstOffsetReg = MIB.getReg(1);
3127
3128 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
3129 ConstOffsetReg)
3130 .addImm(Offset);
3131 MIB.addReg(ConstOffsetReg, RegState::Kill);
3132 MIB.addReg(ScaledReg, RegState::Kill);
3133 MIB.addImm(0); // clamp bit
3134 }
3135 }
3136 }
3137 if (!MIB || IsSALU) {
3138 // We have to produce a carry out, and there isn't a free SGPR pair
3139 // for it. We can keep the whole computation on the SALU to avoid
3140 // clobbering an additional register at the cost of an extra mov.
3141
3142 // We may have 1 free scratch SGPR even though a carry out is
3143 // unavailable. Only one additional mov is needed.
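// Sketch of the SALU-only sequence emitted below when a scratch SGPR
// (sN, hypothetical name) is available and SCC is dead:
//   s_lshr_b32 sN, <FrameReg>, wave_size_log2
//   s_add_i32  sN, sN, <Offset>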
3144 Register TmpScaledReg = IsCopy && IsSALU
3145 ? ResultReg
3146 : RS->scavengeRegisterBackwards(
3147 AMDGPU::SReg_32_XM0RegClass, MI,
3148 false, 0, /*AllowSpill=*/false);
3149 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
3150 Register TmpResultReg = ScaledReg;
3151
3152 if (!LiveSCC) {
3153 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), TmpResultReg)
3154 .addReg(FrameReg)
3155 .addImm(ST.getWavefrontSizeLog2());
3156 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpResultReg)
3157 .addReg(TmpResultReg, RegState::Kill)
3158 .addImm(Offset);
3159 } else {
3160 TmpResultReg = RS->scavengeRegisterBackwards(
3161 AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true);
3162
3163 MachineInstrBuilder Add;
3164 if ((Add = TII->getAddNoCarry(*MBB, MI, DL, TmpResultReg, *RS))) {
3165 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3166 TmpResultReg)
3167 .addImm(ST.getWavefrontSizeLog2())
3168 .addReg(FrameReg);
3169 if (Add->getOpcode() == AMDGPU::V_ADD_CO_U32_e64) {
3170 BuildMI(*MBB, *Add, DL, TII->get(AMDGPU::S_MOV_B32), ResultReg)
3171 .addImm(Offset);
3172 Add.addReg(ResultReg, RegState::Kill)
3173 .addReg(TmpResultReg, RegState::Kill)
3174 .addImm(0);
3175 } else
3176 Add.addImm(Offset).addReg(TmpResultReg, RegState::Kill);
3177 } else {
3178 assert(Offset > 0 && isUInt<24>(2 * ST.getMaxWaveScratchSize()) &&
3179 "offset is unsafe for v_mad_u32_u24");
3180
3181 // We start with a frame pointer with a wave space value, and
3182 // an offset in lane-space. We are materializing a lane space
3183 // value. We can either do a right shift of the frame pointer
3184 // to get to lane space, or a left shift of the offset to get
3185 // to wave space. We can right shift after the computation to
3186 // get back to the desired per-lane value. We are using the
3187 // mad_u32_u24 primarily as an add with no carry out clobber.
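// Worked out (with S = wavefront size):
//   (FrameReg + Offset * S) >> log2(S) == (FrameReg >> log2(S)) + Offset
// exactly, because Offset * S contributes nothing below bit log2(S).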
3188 bool IsInlinableLiteral =
3189 AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm());
3190 if (!IsInlinableLiteral) {
3191 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32),
3192 TmpResultReg)
3193 .addImm(Offset);
3194 }
3195
3196 Add = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MAD_U32_U24_e64),
3197 TmpResultReg);
3198
3199 if (!IsInlinableLiteral) {
3200 Add.addReg(TmpResultReg, RegState::Kill);
3201 } else {
3202 // We fold the offset into the mad itself if it's inlinable.
3203 Add.addImm(Offset);
3204 }
3205 Add.addImm(ST.getWavefrontSize()).addReg(FrameReg).addImm(0);
3206 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
3207 TmpResultReg)
3208 .addImm(ST.getWavefrontSizeLog2())
3209 .addReg(TmpResultReg);
3210 }
3211
3212 Register NewDest;
3213 if (IsCopy) {
3214 NewDest = ResultReg;
3215 } else {
3216 NewDest = RS->scavengeRegisterBackwards(
3217 AMDGPU::SReg_32_XM0RegClass, *Add, false, 0,
3218 /*AllowSpill=*/true);
3219 }
3220
3221 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
3222 NewDest)
3223 .addReg(TmpResultReg);
3224 ResultReg = NewDest;
3225 }
3226 if (!IsSALU)
3227 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
3228 .addReg(TmpResultReg, RegState::Kill);
3229 else
3230 ResultReg = TmpResultReg;
3231 // If there were truly no free SGPRs, we need to undo everything.
3232 if (!TmpScaledReg.isValid()) {
3233 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
3234 .addReg(ScaledReg, RegState::Kill)
3235 .addImm(-Offset);
3236 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
3237 .addReg(FrameReg)
3238 .addImm(ST.getWavefrontSizeLog2());
3239 }
3240 }
3241 }
3242
3243 // Don't introduce an extra copy if we're just materializing in a mov.
3244 if (IsCopy) {
3245 MI->eraseFromParent();
3246 return true;
3247 }
3248 FIOp->ChangeToRegister(ResultReg, false, false, true);
3249 return false;
3250 }
3251
3252 if (IsMUBUF) {
3253 // Disable offen so we don't need a 0 vgpr base.
3254 assert(
3255 static_cast<int>(FIOperandNum) ==
3256 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr));
3257
3258 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
3259 assert((SOffset.isImm() && SOffset.getImm() == 0));
3260
3261 if (FrameReg != AMDGPU::NoRegister)
3262 SOffset.ChangeToRegister(FrameReg, false);
3263
3264 int64_t Offset = FrameInfo.getObjectOffset(Index);
3265 int64_t OldImm =
3266 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
3267 int64_t NewOffset = OldImm + Offset;
3268
3269 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
3270 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
3271 MI->eraseFromParent();
3272 return true;
3273 }
3274 }
3275
3276 // If the offset is simply too big, don't convert to a scratch wave offset
3277 // relative index.
3278
3279 FIOp->ChangeToImmediate(Offset);
3280 if (!TII->isImmOperandLegal(*MI, FIOperandNum, *FIOp)) {
3281 Register TmpReg =
3282 RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
3283 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
3284 .addImm(Offset);
3285 FIOp->ChangeToRegister(TmpReg, false, false, true);
3286 }
3287
3288 return false;
3289}
3290
3294
3295 unsigned SIRegisterInfo::getHWRegIndex(MCRegister Reg) const {
3296 return getEncodingValue(Reg) & AMDGPU::HWEncoding::REG_IDX_MASK;
3297}
3298
3300 return getRegBitWidth(RC.getID());
3301}
3302
3303static const TargetRegisterClass *
3304 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
3305 if (BitWidth == 64)
3306 return &AMDGPU::VReg_64RegClass;
3307 if (BitWidth == 96)
3308 return &AMDGPU::VReg_96RegClass;
3309 if (BitWidth == 128)
3310 return &AMDGPU::VReg_128RegClass;
3311 if (BitWidth == 160)
3312 return &AMDGPU::VReg_160RegClass;
3313 if (BitWidth == 192)
3314 return &AMDGPU::VReg_192RegClass;
3315 if (BitWidth == 224)
3316 return &AMDGPU::VReg_224RegClass;
3317 if (BitWidth == 256)
3318 return &AMDGPU::VReg_256RegClass;
3319 if (BitWidth == 288)
3320 return &AMDGPU::VReg_288RegClass;
3321 if (BitWidth == 320)
3322 return &AMDGPU::VReg_320RegClass;
3323 if (BitWidth == 352)
3324 return &AMDGPU::VReg_352RegClass;
3325 if (BitWidth == 384)
3326 return &AMDGPU::VReg_384RegClass;
3327 if (BitWidth == 512)
3328 return &AMDGPU::VReg_512RegClass;
3329 if (BitWidth == 1024)
3330 return &AMDGPU::VReg_1024RegClass;
3331
3332 return nullptr;
3333}
3334
3335static const TargetRegisterClass *
3336 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
3337 if (BitWidth == 64)
3338 return &AMDGPU::VReg_64_Align2RegClass;
3339 if (BitWidth == 96)
3340 return &AMDGPU::VReg_96_Align2RegClass;
3341 if (BitWidth == 128)
3342 return &AMDGPU::VReg_128_Align2RegClass;
3343 if (BitWidth == 160)
3344 return &AMDGPU::VReg_160_Align2RegClass;
3345 if (BitWidth == 192)
3346 return &AMDGPU::VReg_192_Align2RegClass;
3347 if (BitWidth == 224)
3348 return &AMDGPU::VReg_224_Align2RegClass;
3349 if (BitWidth == 256)
3350 return &AMDGPU::VReg_256_Align2RegClass;
3351 if (BitWidth == 288)
3352 return &AMDGPU::VReg_288_Align2RegClass;
3353 if (BitWidth == 320)
3354 return &AMDGPU::VReg_320_Align2RegClass;
3355 if (BitWidth == 352)
3356 return &AMDGPU::VReg_352_Align2RegClass;
3357 if (BitWidth == 384)
3358 return &AMDGPU::VReg_384_Align2RegClass;
3359 if (BitWidth == 512)
3360 return &AMDGPU::VReg_512_Align2RegClass;
3361 if (BitWidth == 1024)
3362 return &AMDGPU::VReg_1024_Align2RegClass;
3363
3364 return nullptr;
3365}
3366
3367const TargetRegisterClass *
3368SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
3369 if (BitWidth == 1)
3370 return &AMDGPU::VReg_1RegClass;
3371 if (BitWidth == 16)
3372 return &AMDGPU::VGPR_16RegClass;
3373 if (BitWidth == 32)
3374 return &AMDGPU::VGPR_32RegClass;
3375 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
3376 : getAnyVGPRClassForBitWidth(BitWidth);
3377}
3378
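// [Editorial note, not part of the source] Illustrative behavior of the
// selectors above: getVGPRClassForBitWidth(96) yields VReg_96 on subtargets
// without the VGPR tuple alignment requirement and VReg_96_Align2 when
// ST.needsAlignedVGPRs(); widths with no entry in either table (e.g. 48)
// come back as nullptr.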
3379const TargetRegisterClass *
3380SIRegisterInfo::getAlignedLo256VGPRClassForBitWidth(unsigned BitWidth) const {
3381 if (BitWidth <= 32)
3382 return &AMDGPU::VGPR_32_Lo256RegClass;
3383 if (BitWidth <= 64)
3384 return &AMDGPU::VReg_64_Lo256_Align2RegClass;
3385 if (BitWidth <= 96)
3386 return &AMDGPU::VReg_96_Lo256_Align2RegClass;
3387 if (BitWidth <= 128)
3388 return &AMDGPU::VReg_128_Lo256_Align2RegClass;
3389 if (BitWidth <= 160)
3390 return &AMDGPU::VReg_160_Lo256_Align2RegClass;
3391 if (BitWidth <= 192)
3392 return &AMDGPU::VReg_192_Lo256_Align2RegClass;
3393 if (BitWidth <= 224)
3394 return &AMDGPU::VReg_224_Lo256_Align2RegClass;
3395 if (BitWidth <= 256)
3396 return &AMDGPU::VReg_256_Lo256_Align2RegClass;
3397 if (BitWidth <= 288)
3398 return &AMDGPU::VReg_288_Lo256_Align2RegClass;
3399 if (BitWidth <= 320)
3400 return &AMDGPU::VReg_320_Lo256_Align2RegClass;
3401 if (BitWidth <= 352)
3402 return &AMDGPU::VReg_352_Lo256_Align2RegClass;
3403 if (BitWidth <= 384)
3404 return &AMDGPU::VReg_384_Lo256_Align2RegClass;
3405 if (BitWidth <= 512)
3406 return &AMDGPU::VReg_512_Lo256_Align2RegClass;
3407 if (BitWidth <= 1024)
3408 return &AMDGPU::VReg_1024_Lo256_Align2RegClass;
3409
3410 return nullptr;
3411}
3412
3413static const TargetRegisterClass *
3414getAnyAGPRClassForBitWidth(unsigned BitWidth) {
3415 if (BitWidth == 64)
3416 return &AMDGPU::AReg_64RegClass;
3417 if (BitWidth == 96)
3418 return &AMDGPU::AReg_96RegClass;
3419 if (BitWidth == 128)
3420 return &AMDGPU::AReg_128RegClass;
3421 if (BitWidth == 160)
3422 return &AMDGPU::AReg_160RegClass;
3423 if (BitWidth == 192)
3424 return &AMDGPU::AReg_192RegClass;
3425 if (BitWidth == 224)
3426 return &AMDGPU::AReg_224RegClass;
3427 if (BitWidth == 256)
3428 return &AMDGPU::AReg_256RegClass;
3429 if (BitWidth == 288)
3430 return &AMDGPU::AReg_288RegClass;
3431 if (BitWidth == 320)
3432 return &AMDGPU::AReg_320RegClass;
3433 if (BitWidth == 352)
3434 return &AMDGPU::AReg_352RegClass;
3435 if (BitWidth == 384)
3436 return &AMDGPU::AReg_384RegClass;
3437 if (BitWidth == 512)
3438 return &AMDGPU::AReg_512RegClass;
3439 if (BitWidth == 1024)
3440 return &AMDGPU::AReg_1024RegClass;
3441
3442 return nullptr;
3443}
3444
3445static const TargetRegisterClass *
3446getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
3447 if (BitWidth == 64)
3448 return &AMDGPU::AReg_64_Align2RegClass;
3449 if (BitWidth == 96)
3450 return &AMDGPU::AReg_96_Align2RegClass;
3451 if (BitWidth == 128)
3452 return &AMDGPU::AReg_128_Align2RegClass;
3453 if (BitWidth == 160)
3454 return &AMDGPU::AReg_160_Align2RegClass;
3455 if (BitWidth == 192)
3456 return &AMDGPU::AReg_192_Align2RegClass;
3457 if (BitWidth == 224)
3458 return &AMDGPU::AReg_224_Align2RegClass;
3459 if (BitWidth == 256)
3460 return &AMDGPU::AReg_256_Align2RegClass;
3461 if (BitWidth == 288)
3462 return &AMDGPU::AReg_288_Align2RegClass;
3463 if (BitWidth == 320)
3464 return &AMDGPU::AReg_320_Align2RegClass;
3465 if (BitWidth == 352)
3466 return &AMDGPU::AReg_352_Align2RegClass;
3467 if (BitWidth == 384)
3468 return &AMDGPU::AReg_384_Align2RegClass;
3469 if (BitWidth == 512)
3470 return &AMDGPU::AReg_512_Align2RegClass;
3471 if (BitWidth == 1024)
3472 return &AMDGPU::AReg_1024_Align2RegClass;
3473
3474 return nullptr;
3475}
3476
3477const TargetRegisterClass *
3478SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
3479 if (BitWidth == 16)
3480 return &AMDGPU::AGPR_LO16RegClass;
3481 if (BitWidth == 32)
3482 return &AMDGPU::AGPR_32RegClass;
3483 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
3484 : getAnyAGPRClassForBitWidth(BitWidth);
3485}
3486
3487static const TargetRegisterClass *
3488getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
3489 if (BitWidth == 64)
3490 return &AMDGPU::AV_64RegClass;
3491 if (BitWidth == 96)
3492 return &AMDGPU::AV_96RegClass;
3493 if (BitWidth == 128)
3494 return &AMDGPU::AV_128RegClass;
3495 if (BitWidth == 160)
3496 return &AMDGPU::AV_160RegClass;
3497 if (BitWidth == 192)
3498 return &AMDGPU::AV_192RegClass;
3499 if (BitWidth == 224)
3500 return &AMDGPU::AV_224RegClass;
3501 if (BitWidth == 256)
3502 return &AMDGPU::AV_256RegClass;
3503 if (BitWidth == 288)
3504 return &AMDGPU::AV_288RegClass;
3505 if (BitWidth == 320)
3506 return &AMDGPU::AV_320RegClass;
3507 if (BitWidth == 352)
3508 return &AMDGPU::AV_352RegClass;
3509 if (BitWidth == 384)
3510 return &AMDGPU::AV_384RegClass;
3511 if (BitWidth == 512)
3512 return &AMDGPU::AV_512RegClass;
3513 if (BitWidth == 1024)
3514 return &AMDGPU::AV_1024RegClass;
3515
3516 return nullptr;
3517}
3518
3519static const TargetRegisterClass *
3520getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
3521 if (BitWidth == 64)
3522 return &AMDGPU::AV_64_Align2RegClass;
3523 if (BitWidth == 96)
3524 return &AMDGPU::AV_96_Align2RegClass;
3525 if (BitWidth == 128)
3526 return &AMDGPU::AV_128_Align2RegClass;
3527 if (BitWidth == 160)
3528 return &AMDGPU::AV_160_Align2RegClass;
3529 if (BitWidth == 192)
3530 return &AMDGPU::AV_192_Align2RegClass;
3531 if (BitWidth == 224)
3532 return &AMDGPU::AV_224_Align2RegClass;
3533 if (BitWidth == 256)
3534 return &AMDGPU::AV_256_Align2RegClass;
3535 if (BitWidth == 288)
3536 return &AMDGPU::AV_288_Align2RegClass;
3537 if (BitWidth == 320)
3538 return &AMDGPU::AV_320_Align2RegClass;
3539 if (BitWidth == 352)
3540 return &AMDGPU::AV_352_Align2RegClass;
3541 if (BitWidth == 384)
3542 return &AMDGPU::AV_384_Align2RegClass;
3543 if (BitWidth == 512)
3544 return &AMDGPU::AV_512_Align2RegClass;
3545 if (BitWidth == 1024)
3546 return &AMDGPU::AV_1024_Align2RegClass;
3547
3548 return nullptr;
3549}
3550
3551const TargetRegisterClass *
3552SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
3553 if (BitWidth == 32)
3554 return &AMDGPU::AV_32RegClass;
3555 return ST.needsAlignedVGPRs()
3556 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
3557 : getAnyVectorSuperClassForBitWidth(BitWidth);
3558}
3559
3560const TargetRegisterClass *
3561SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
3562 if (BitWidth == 16 || BitWidth == 32)
3563 return &AMDGPU::SReg_32RegClass;
3564 if (BitWidth == 64)
3565 return &AMDGPU::SReg_64RegClass;
3566 if (BitWidth == 96)
3567 return &AMDGPU::SGPR_96RegClass;
3568 if (BitWidth == 128)
3569 return &AMDGPU::SGPR_128RegClass;
3570 if (BitWidth == 160)
3571 return &AMDGPU::SGPR_160RegClass;
3572 if (BitWidth == 192)
3573 return &AMDGPU::SGPR_192RegClass;
3574 if (BitWidth == 224)
3575 return &AMDGPU::SGPR_224RegClass;
3576 if (BitWidth == 256)
3577 return &AMDGPU::SGPR_256RegClass;
3578 if (BitWidth == 288)
3579 return &AMDGPU::SGPR_288RegClass;
3580 if (BitWidth == 320)
3581 return &AMDGPU::SGPR_320RegClass;
3582 if (BitWidth == 352)
3583 return &AMDGPU::SGPR_352RegClass;
3584 if (BitWidth == 384)
3585 return &AMDGPU::SGPR_384RegClass;
3586 if (BitWidth == 512)
3587 return &AMDGPU::SGPR_512RegClass;
3588 if (BitWidth == 1024)
3589 return &AMDGPU::SGPR_1024RegClass;
3590
3591 return nullptr;
3592}
3593
3594bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
3595 Register Reg) const {
3596 const TargetRegisterClass *RC;
3597 if (Reg.isVirtual())
3598 RC = MRI.getRegClass(Reg);
3599 else
3600 RC = getPhysRegBaseClass(Reg);
3601 return RC && isSGPRClass(RC);
3602}
3603
3604const TargetRegisterClass *
3605SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
3606 unsigned Size = getRegSizeInBits(*SRC);
3607
3608 switch (SRC->getID()) {
3609 default:
3610 break;
3611 case AMDGPU::VS_32_Lo256RegClassID:
3612 case AMDGPU::VS_64_Lo256RegClassID:
3613 return getAllocatableClass(getAlignedLo256VGPRClassForBitWidth(Size));
3614 }
3615
3616 const TargetRegisterClass *VRC =
3617 getAllocatableClass(getVGPRClassForBitWidth(Size));
3618 assert(VRC && "Invalid register class size");
3619 return VRC;
3620}
3621
3622const TargetRegisterClass *
3623SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
3624 unsigned Size = getRegSizeInBits(*SRC);
3625 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
3626 assert(ARC && "Invalid register class size");
3627 return ARC;
3628}
3629
3630const TargetRegisterClass *
3631SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
3632 unsigned Size = getRegSizeInBits(*VRC);
3633 if (Size == 32)
3634 return &AMDGPU::SGPR_32RegClass;
3635 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
3636 assert(SRC && "Invalid register class size");
3637 return SRC;
3638}
3639
3640const TargetRegisterClass *
3641SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
3642 const TargetRegisterClass *SubRC,
3643 unsigned SubIdx) const {
3644 // Ensure this subregister index is aligned in the super register.
3645 const TargetRegisterClass *MatchRC =
3646 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
3647 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
3648}
3649
3650bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
3651 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
3652 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
3653 return !ST.hasMFMAInlineLiteralBug();
3654
3655 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
3656 OpType <= AMDGPU::OPERAND_SRC_LAST;
3657}
3658
3659bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
3660 // TODO: 64-bit operands have extending behavior from 32-bit literal.
3661 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
3662 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
3663}
3664
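// [Editorial note, not part of the source] Both predicates above are plain
// range checks on the operand-type enum: opCanUseInlineConstant accepts the
// OPERAND_SRC range (with a carve-out for the MFMA inline-literal bug), and
// opCanUseLiteralConstant accepts the OPERAND_REG_IMM range.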
3665/// Returns a lowest register that is not used at any point in the function.
3666/// If all registers are used, then this function will return
3667/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
3668/// highest unused register.
3669MCRegister SIRegisterInfo::findUnusedRegister(
3670 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
3671 const MachineFunction &MF, bool ReserveHighestRegister) const {
3672 if (ReserveHighestRegister) {
3673 for (MCRegister Reg : reverse(*RC))
3674 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3675 return Reg;
3676 } else {
3677 for (MCRegister Reg : *RC)
3678 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
3679 return Reg;
3680 }
3681 return MCRegister();
3682}
3683
3684bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
3685 const RegisterBankInfo &RBI,
3686 Register Reg) const {
3687 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
3688 if (!RB)
3689 return false;
3690
3691 return !RBI.isDivergentRegBank(RB);
3692}
3693
3694ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
3695 unsigned EltSize) const {
3696 const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
3697 assert(RegBitWidth >= 32 && RegBitWidth <= 1024 && EltSize >= 2);
3698
3699 const unsigned RegHalves = RegBitWidth / 16;
3700 const unsigned EltHalves = EltSize / 2;
3701 assert(RegSplitParts.size() + 1 >= EltHalves);
3702
3703 const std::vector<int16_t> &Parts = RegSplitParts[EltHalves - 1];
3704 const unsigned NumParts = RegHalves / EltHalves;
3705
3706 return ArrayRef(Parts.data(), NumParts);
3707}
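// [Editorial note, not part of the source] Worked example for
// getRegSplitParts: splitting a 128-bit class into 32-bit elements gives
// RegHalves = 128 / 16 = 8 and EltHalves = 4 / 2 = 2, so the first
// 8 / 2 = 4 entries of the precomputed table are returned, one subregister
// index per 32-bit lane of the tuple.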
3708
3709const TargetRegisterClass *
3710SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
3711 Register Reg) const {
3712 return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
3713}
3714
3715const TargetRegisterClass *
3716SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
3717 const MachineOperand &MO) const {
3718 const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
3719 return getSubRegisterClass(SrcRC, MO.getSubReg());
3720}
3721
3722bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
3723 Register Reg) const {
3724 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3725 // Registers without classes are unaddressable, SGPR-like registers.
3726 return RC && isVGPRClass(RC);
3727}
3728
3729bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
3730 Register Reg) const {
3731 const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
3732
3733 // Registers without classes are unaddressable, SGPR-like registers.
3734 return RC && isAGPRClass(RC);
3735}
3736
3737bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
3738 const TargetRegisterClass *SrcRC,
3739 unsigned SubReg,
3740 const TargetRegisterClass *DstRC,
3741 unsigned DstSubReg,
3742 const TargetRegisterClass *NewRC,
3743 LiveIntervals &LIS) const {
3744 unsigned SrcSize = getRegSizeInBits(*SrcRC);
3745 unsigned DstSize = getRegSizeInBits(*DstRC);
3746 unsigned NewSize = getRegSizeInBits(*NewRC);
3747
3748 // Do not increase size of registers beyond dword; we would need to allocate
3749 // adjacent registers and constrain regalloc more than needed.
3750
3751 // Always allow dword coalescing.
3752 if (SrcSize <= 32 || DstSize <= 32)
3753 return true;
3754
3755 return NewSize <= DstSize || NewSize <= SrcSize;
3756}
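// [Editorial note, not part of the source] Consequence of the rule above:
// coalescing is always permitted when either side is at most 32 bits, but
// joining two 64-bit registers into a 128-bit tuple is rejected because the
// merged class (128 bits) would be wider than both inputs.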
3757
3758unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
3759 MachineFunction &MF) const {
3760 unsigned MinOcc = ST.getOccupancyWithWorkGroupSizes(MF).first;
3761 switch (RC->getID()) {
3762 default:
3763 return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
3764 case AMDGPU::VGPR_32RegClassID:
3765 return std::min(
3766 ST.getMaxNumVGPRs(
3767 MinOcc,
3768 MF.getInfo<SIMachineFunctionInfo>()->getDynamicVGPRBlockSize()),
3769 ST.getMaxNumVGPRs(MF));
3770 case AMDGPU::SGPR_32RegClassID:
3771 case AMDGPU::SGPR_LO16RegClassID:
3772 return std::min(ST.getMaxNumSGPRs(MinOcc, true), ST.getMaxNumSGPRs(MF));
3773 }
3774}
3775
3776unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
3777 unsigned Idx) const {
3778 switch (static_cast<AMDGPU::RegisterPressureSets>(Idx)) {
3779 case AMDGPU::RegisterPressureSets::VGPR_32:
3780 case AMDGPU::RegisterPressureSets::AGPR_32:
3781 return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
3782 const_cast<MachineFunction &>(MF));
3783 case AMDGPU::RegisterPressureSets::SReg_32:
3784 return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
3785 const_cast<MachineFunction &>(MF));
3786 }
3787
3788 llvm_unreachable("Unexpected register pressure set!");
3789}
3790
3791const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
3792 static const int Empty[] = { -1 };
3793
3794 if (RegPressureIgnoredUnits[RegUnit])
3795 return Empty;
3796
3797 return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
3798}
3799
3800bool SIRegisterInfo::getRegAllocationHints(Register VirtReg,
3801 ArrayRef<MCPhysReg> Order,
3802 SmallVectorImpl<MCPhysReg> &Hints,
3803 const MachineFunction &MF,
3804 const VirtRegMap *VRM,
3805 const LiveRegMatrix *Matrix) const {
3806
3807 const MachineRegisterInfo &MRI = MF.getRegInfo();
3808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3809
3810 std::pair<unsigned, Register> Hint = MRI.getRegAllocationHint(VirtReg);
3811
3812 switch (Hint.first) {
3813 case AMDGPURI::Size32: {
3814 Register Paired = Hint.second;
3815 assert(Paired);
3816 Register PairedPhys;
3817 if (Paired.isPhysical()) {
3818 PairedPhys =
3819 getMatchingSuperReg(Paired, AMDGPU::lo16, &AMDGPU::VGPR_32RegClass);
3820 } else if (VRM && VRM->hasPhys(Paired)) {
3821 PairedPhys = getMatchingSuperReg(VRM->getPhys(Paired), AMDGPU::lo16,
3822 &AMDGPU::VGPR_32RegClass);
3823 }
3824
3825 // Prefer the paired physreg.
3826 if (PairedPhys)
3827 // isLo(Paired) is implicitly true here from the API of
3828 // getMatchingSuperReg.
3829 Hints.push_back(PairedPhys);
3830 return false;
3831 }
3832 case AMDGPURI::Size16: {
3833 Register Paired = Hint.second;
3834 assert(Paired);
3835 Register PairedPhys;
3836 if (Paired.isPhysical()) {
3837 PairedPhys = TRI->getSubReg(Paired, AMDGPU::lo16);
3838 } else if (VRM && VRM->hasPhys(Paired)) {
3839 PairedPhys = TRI->getSubReg(VRM->getPhys(Paired), AMDGPU::lo16);
3840 }
3841
3842 // First prefer the paired physreg.
3843 if (PairedPhys)
3844 Hints.push_back(PairedPhys);
3845 else {
3846 // Add all the lo16 physregs.
3847 // When the Paired operand has not yet been assigned a physreg it is
3848 // better to try putting VirtReg in a lo16 register, because possibly
3849 // later Paired can be assigned to the overlapping register and the COPY
3850 // can be eliminated.
3851 for (MCPhysReg PhysReg : Order) {
3852 if (PhysReg == PairedPhys || AMDGPU::isHi16Reg(PhysReg, *this))
3853 continue;
3854 if (AMDGPU::VGPR_16RegClass.contains(PhysReg) &&
3855 !MRI.isReserved(PhysReg))
3856 Hints.push_back(PhysReg);
3857 }
3858 }
3859 return false;
3860 }
3861 default:
3862 return TargetRegisterInfo::getRegAllocationHints(VirtReg, Order, Hints, MF,
3863 VRM);
3864 }
3865}
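// [Editorial note, not part of the source] The Size32/Size16 hints pair a
// 16-bit virtual register with the 32-bit register that contains (or will
// contain) it so the COPY between them can later be coalesced away; if the
// partner has no physreg yet, every allocatable lo16 register is offered
// instead.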
3866
3867MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
3868 // Not a callee saved register.
3869 return AMDGPU::SGPR30_SGPR31;
3870}
3871
3872const TargetRegisterClass *
3873SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
3874 const RegisterBank &RB) const {
3875 switch (RB.getID()) {
3876 case AMDGPU::VGPRRegBankID:
3877 return getVGPRClassForBitWidth(
3878 std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
3879 case AMDGPU::VCCRegBankID:
3880 assert(Size == 1);
3881 return getWaveMaskRegClass();
3882 case AMDGPU::SGPRRegBankID:
3883 return getSGPRClassForBitWidth(std::max(32u, Size));
3884 case AMDGPU::AGPRRegBankID:
3885 return getAGPRClassForBitWidth(std::max(32u, Size));
3886 default:
3887 llvm_unreachable("unknown register bank");
3888 }
3889}
3890
3891const TargetRegisterClass *
3892SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
3893 const MachineRegisterInfo &MRI) const {
3894 const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3895 if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
3896 return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
3897
3898 if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
3899 return getAllocatableClass(RC);
3900
3901 return nullptr;
3902}
3903
3904MCRegister SIRegisterInfo::getVCC() const {
3905 return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
3906}
3907
3908MCRegister SIRegisterInfo::getExec() const {
3909 return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3910}
3911
3912const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
3913 // VGPR tuples have an alignment requirement on gfx90a variants.
3914 return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
3915 : &AMDGPU::VReg_64RegClass;
3916}
3917
3918const TargetRegisterClass *
3919SIRegisterInfo::getRegClass(unsigned RCID) const {
3920 switch ((int)RCID) {
3921 case AMDGPU::SReg_1RegClassID:
3922 return getBoolRC();
3923 case AMDGPU::SReg_1_XEXECRegClassID:
3924 return getWaveMaskRegClass();
3925 case -1:
3926 return nullptr;
3927 default:
3928 return AMDGPUGenRegisterInfo::getRegClass(RCID);
3929 }
3930}
3931
3932// Find reaching register definition
3933MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
3934 MachineInstr &Use,
3935 MachineRegisterInfo &MRI,
3936 LiveIntervals *LIS) const {
3937 auto &MDT = LIS->getDomTree();
3938 SlotIndex UseIdx = LIS->getInstructionIndex(Use);
3939 SlotIndex DefIdx;
3940
3941 if (Reg.isVirtual()) {
3942 if (!LIS->hasInterval(Reg))
3943 return nullptr;
3944 LiveInterval &LI = LIS->getInterval(Reg);
3945 LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
3946 : MRI.getMaxLaneMaskForVReg(Reg);
3947 VNInfo *V = nullptr;
3948 if (LI.hasSubRanges()) {
3949 for (auto &S : LI.subranges()) {
3950 if ((S.LaneMask & SubLanes) == SubLanes) {
3951 V = S.getVNInfoAt(UseIdx);
3952 break;
3953 }
3954 }
3955 } else {
3956 V = LI.getVNInfoAt(UseIdx);
3957 }
3958 if (!V)
3959 return nullptr;
3960 DefIdx = V->def;
3961 } else {
3962 // Find last def.
3963 for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
3964 LiveRange &LR = LIS->getRegUnit(Unit);
3965 if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
3966 if (!DefIdx.isValid() ||
3967 MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
3968 LIS->getInstructionFromIndex(V->def)))
3969 DefIdx = V->def;
3970 } else {
3971 return nullptr;
3972 }
3973 }
3974 }
3975
3976 MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
3977
3978 if (!Def || !MDT.dominates(Def, &Use))
3979 return nullptr;
3980
3981 assert(Def->modifiesRegister(Reg, this));
3982
3983 return Def;
3984}
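// [Editorial note, not part of the source] findReachingDef resolves "which
// definition of Reg is live at Use": virtual registers go through
// LiveIntervals value numbers (per subrange when subregister liveness is
// available), physical registers intersect their register units and keep the
// dominating def; nullptr means no single dominating definition was found.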
3985
3986MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
3987 assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);
3988
3989 for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
3990 AMDGPU::SReg_32RegClass,
3991 AMDGPU::AGPR_32RegClass } ) {
3992 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
3993 return Super;
3994 }
3995 if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
3996 &AMDGPU::VGPR_32RegClass)) {
3997 return Super;
3998 }
3999
4000 return AMDGPU::NoRegister;
4001}
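// [Editorial note, not part of the source] get32BitRegister maps a 16-bit
// physical register back to the 32-bit VGPR, SGPR or AGPR containing it,
// trying the lo16 subregister first and then hi16 (VGPRs only); it returns
// NoRegister when no 32-bit super-register matches.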
4002
4003bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
4004 if (!ST.needsAlignedVGPRs())
4005 return true;
4006
4007 if (isVGPRClass(&RC))
4008 return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
4009 if (isAGPRClass(&RC))
4010 return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
4011 if (isVectorSuperClass(&RC))
4012 return RC.hasSuperClassEq(
4013 getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));
4014
4015 assert(&RC != &AMDGPU::VS_64RegClass);
4016
4017 return true;
4018}
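// [Editorial note, not part of the source] On subtargets where
// ST.needsAlignedVGPRs() is set (the gfx90a-style targets mentioned earlier
// in this file), multi-register VGPR/AGPR/AV tuples must use the *_Align2
// classes, i.e. start at an even register; isProperlyAlignedRC checks that
// constraint and getProperlyAlignedRC below returns a class that satisfies it.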
4019
4020const TargetRegisterClass *
4021SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
4022 if (!RC || !ST.needsAlignedVGPRs())
4023 return RC;
4024
4025 unsigned Size = getRegSizeInBits(*RC);
4026 if (Size <= 32)
4027 return RC;
4028
4029 if (RC == &AMDGPU::VS_64RegClass)
4030 return &AMDGPU::VS_64_Align2RegClass;
4031
4032 if (isVGPRClass(RC))
4033 return getAlignedVGPRClassForBitWidth(Size);
4034 if (isAGPRClass(RC))
4035 return getAlignedAGPRClassForBitWidth(Size);
4036 if (isVectorSuperClass(RC))
4037 return getAlignedVectorSuperClassForBitWidth(Size);
4038
4039 return RC;
4040}
4041
4042ArrayRef<MCPhysReg>
4043SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
4044 return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
4045}
4046
4047ArrayRef<MCPhysReg>
4048SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
4049 return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
4050}
4051
4052ArrayRef<MCPhysReg>
4053SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
4054 return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
4055}
4056
4057unsigned
4058SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
4059 unsigned SubReg) const {
4060 switch (RC->TSFlags & SIRCFlags::RegKindMask) {
4061 case SIRCFlags::HasSGPR:
4062 return std::min(128u, getSubRegIdxSize(SubReg));
4063 case SIRCFlags::HasAGPR:
4064 case SIRCFlags::HasVGPR:
4065 case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
4066 return std::min(32u, getSubRegIdxSize(SubReg));
4067 default:
4068 break;
4069 }
4070 return 0;
4071}
4072
4073unsigned SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
4074 const TargetRegisterClass &RC,
4075 bool IncludeCalls) const {
4076 unsigned NumArchVGPRs = ST.has1024AddressableVGPRs() ? 1024 : 256;
4077 ArrayRef<MCPhysReg> Registers =
4078 (RC.getID() == AMDGPU::VGPR_32RegClassID)
4079 ? RC.getRegisters().take_front(NumArchVGPRs)
4080 : RC.getRegisters();
4081 for (MCPhysReg Reg : reverse(Registers))
4082 if (MRI.isPhysRegUsed(Reg, /*SkipRegMaskTest=*/!IncludeCalls))
4083 return getHWRegIndex(Reg) + 1;
4084 return 0;
4085}
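// [Editorial note, not part of the source] getNumUsedPhysRegs walks the class
// from the highest register downwards and returns getHWRegIndex of the
// highest used register plus one, i.e. how many registers (counting from
// register 0) the function actually touches; 0 means none are used.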
4086
4087SmallVector<StringLiteral>
4088SIRegisterInfo::getVRegFlagsOfReg(Register Reg,
4089 const MachineFunction &MF) const {
4090 SmallVector<StringLiteral> RegFlags;
4091 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4092 if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
4093 RegFlags.push_back("WWM_REG");
4094 return RegFlags;
4095}