LLVM 18.0.0git
AMDGPURegisterBankInfo.cpp
1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks:
16/// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
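///
/// For example (schematically), an operation that needs a uniform operand but
/// is handed a VGPR value is rewritten as:
///
///   save EXEC
/// loop:
///   %uniform = V_READFIRSTLANE_B32 %vgpr_operand
///   set EXEC to the lanes where %vgpr_operand == %uniform
///   ... execute the instruction using %uniform ...
///   clear the handled lanes from EXEC; repeat while any lanes remain
///   restore EXEC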
27///
28/// \par Booleans
29///
30/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR with an s1 type always
36/// means a VCC bank value; any other type uses the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
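///
/// For example (schematic MIR), a uniform compare and select end up as
///   %c:sgpr(s32) = G_ICMP intpred(eq), %a:sgpr(s32), %b:sgpr(s32)
///   %r:sgpr(s32) = G_SELECT %c:sgpr(s32), %x:sgpr(s32), %y:sgpr(s32)
/// while the divergent form keeps an s1 condition on the VCC bank:
///   %c:vcc(s1) = G_ICMP intpred(eq), %a:vgpr(s32), %b:vgpr(s32)
///   %r:vgpr(s32) = G_SELECT %c:vcc(s1), %x:vgpr(s32), %y:vgpr(s32)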
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
52/// most 1 SGPR or constant literal value (this is raised to 2 on gfx10 for most
53/// instructions). This is one unique SGPR, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
56/// the SGPR operands all being the same register. It is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
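///
/// For example, a VALU add reading two different SGPRs would exceed the
/// pre-gfx10 limit and one input must first be copied to a VGPR, while the
/// same SGPR used for both inputs counts only once and is fine.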
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
61/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
71#include "AMDGPURegisterBankInfo.h"
72
73#include "AMDGPU.h"
74#include "AMDGPUGlobalISelUtils.h"
75#include "AMDGPUInstrInfo.h"
76#include "GCNSubtarget.h"
77#include "SIMachineFunctionInfo.h"
78#include "SIRegisterInfo.h"
79#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
83#include "llvm/CodeGen/RegisterBank.h"
84#include "llvm/IR/IntrinsicsAMDGPU.h"
85
86#define GET_TARGET_REGBANK_IMPL
87#include "AMDGPUGenRegisterBank.inc"
88
89// This file will be TableGen'ed at some point.
90#include "AMDGPUGenRegisterBankInfo.def"
91
92using namespace llvm;
93using namespace MIPatternMatch;
94
95namespace {
96
97// Observer to apply a register bank to new registers created by LegalizerHelper.
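// Typical use within this file (a minimal sketch): construct the observer
// around a MachineIRBuilder before running a LegalizerHelper, e.g.
//   ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
//   LegalizerHelper Helper(B.getMF(), ApplyBank, B);
// Registers created by the helper are then assigned the requested bank when
// the observer is destroyed.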
98class ApplyRegBankMapping final : public GISelChangeObserver {
99private:
100 MachineIRBuilder &B;
101 const AMDGPURegisterBankInfo &RBI;
102 MachineRegisterInfo &MRI;
103 const RegisterBank *NewBank;
104 SmallVector<MachineInstr *, 4> NewInsts;
105
106public:
107 ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108 MachineRegisterInfo &MRI_, const RegisterBank *RB)
109 : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110 assert(!B.isObservingChanges());
111 B.setChangeObserver(*this);
112 }
113
114 ~ApplyRegBankMapping() {
115 for (MachineInstr *MI : NewInsts)
116 applyBank(*MI);
117
118 B.stopObservingChanges();
119 }
120
121 /// Set any registers that don't have a set register class or bank to SALU.
122 void applyBank(MachineInstr &MI) {
123 const unsigned Opc = MI.getOpcode();
124 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125 Opc == AMDGPU::G_SEXT) {
126 // LegalizerHelper wants to use the basic legalization artifacts when
127 // widening etc. We don't handle selection with vcc in artifact sources,
128 // so we need to use a select instead to handle these properly.
129 Register DstReg = MI.getOperand(0).getReg();
130 Register SrcReg = MI.getOperand(1).getReg();
131 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132 if (SrcBank == &AMDGPU::VCCRegBank) {
133 const LLT S32 = LLT::scalar(32);
134 assert(MRI.getType(SrcReg) == LLT::scalar(1));
135 assert(MRI.getType(DstReg) == S32);
136 assert(NewBank == &AMDGPU::VGPRRegBank);
137
138 // Replace the extension with a select, which really uses the boolean
139 // source.
140 B.setInsertPt(*MI.getParent(), MI);
141
142 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143 auto False = B.buildConstant(S32, 0);
144 B.buildSelect(DstReg, SrcReg, True, False);
145 MRI.setRegBank(True.getReg(0), *NewBank);
146 MRI.setRegBank(False.getReg(0), *NewBank);
147 MI.eraseFromParent();
148 }
149
150 assert(!MRI.getRegClassOrRegBank(DstReg));
151 MRI.setRegBank(DstReg, *NewBank);
152 return;
153 }
154
155#ifndef NDEBUG
156 if (Opc == AMDGPU::G_TRUNC) {
157 Register DstReg = MI.getOperand(0).getReg();
158 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159 assert(DstBank != &AMDGPU::VCCRegBank);
160 }
161#endif
162
163 for (MachineOperand &Op : MI.operands()) {
164 if (!Op.isReg())
165 continue;
166
167 // We may see physical registers if building a real MI
168 Register Reg = Op.getReg();
169 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170 continue;
171
172 const RegisterBank *RB = NewBank;
173 if (MRI.getType(Reg) == LLT::scalar(1)) {
174 assert(NewBank == &AMDGPU::VGPRRegBank &&
175 "s1 operands should only be used for vector bools");
176 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178 "not expecting legalization artifacts here");
179 RB = &AMDGPU::VCCRegBank;
180 }
181
182 MRI.setRegBank(Reg, *RB);
183 }
184 }
185
186 void erasingInstr(MachineInstr &MI) override {}
187
188 void createdInstr(MachineInstr &MI) override {
189 // At this point, the instruction was just inserted and has no operands.
190 NewInsts.push_back(&MI);
191 }
192
193 void changingInstr(MachineInstr &MI) override {}
194 void changedInstr(MachineInstr &MI) override {
195 // FIXME: In principle we should probably add the instruction to NewInsts,
196 // but the way the LegalizerHelper uses the observer, we will always see the
197 // registers we need to set the regbank on also referenced in a new
198 // instruction.
199 }
200};
201
202}
203
204AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
205 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206 TII(Subtarget.getInstrInfo()) {
207
208 // HACK: Until this is fully tablegen'd.
209 static llvm::once_flag InitializeRegisterBankFlag;
210
211 static auto InitializeRegisterBankOnce = [this]() {
212 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215 (void)this;
216 };
217
218 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219}
220
221static bool isVectorRegisterBank(const RegisterBank &Bank) {
222 unsigned BankID = Bank.getID();
223 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224}
225
226bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
227 return RB != &AMDGPU::SGPRRegBank;
228}
229
230unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
231 const RegisterBank &Src,
232 unsigned Size) const {
233 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236 return std::numeric_limits<unsigned>::max();
237 }
238
239 // Bool values are tricky, because the meaning is based on context. The SCC
240 // and VCC banks are for the natural scalar and vector conditions produced by
241 // a compare.
242 //
243 // Legalization doesn't know about the necessary context, so an s1 use may
244 // have been a truncate from an arbitrary value, in which case a copy (lowered
245 // as a compare with 0) needs to be inserted.
246 if (Size == 1 &&
247 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248 (isVectorRegisterBank(Src) ||
249 Src.getID() == AMDGPU::SGPRRegBankID ||
250 Src.getID() == AMDGPU::VCCRegBankID))
251 return std::numeric_limits<unsigned>::max();
252
253 // There is no direct copy between AGPRs.
254 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255 Src.getID() == AMDGPU::AGPRRegBankID)
256 return 4;
257
258 return RegisterBankInfo::copyCost(Dst, Src, Size);
259}
260
261unsigned AMDGPURegisterBankInfo::getBreakDownCost(
262 const ValueMapping &ValMapping,
263 const RegisterBank *CurBank) const {
264 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265 // VGPR.
266 // FIXME: Is there a better way to do this?
267 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268 return 10; // This is expensive.
269
270 assert(ValMapping.NumBreakDowns == 2 &&
271 ValMapping.BreakDown[0].Length == 32 &&
272 ValMapping.BreakDown[0].StartIdx == 0 &&
273 ValMapping.BreakDown[1].Length == 32 &&
274 ValMapping.BreakDown[1].StartIdx == 32 &&
275 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276
277 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279 // want.
280
281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282 // alignment restrictions, but this probably isn't important.
283 return 1;
284}
285
286const RegisterBank &
287AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
288 LLT Ty) const {
289 if (&RC == &AMDGPU::SReg_1RegClass)
290 return AMDGPU::VCCRegBank;
291
292 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293 // VCC-like use.
294 if (TRI->isSGPRClass(&RC)) {
295 // FIXME: This probably came from a copy from a physical register, which
296 // should be inferable from the copied to-type. We don't have many boolean
297 // physical register constraints so just assume a normal SGPR for now.
298 if (!Ty.isValid())
299 return AMDGPU::SGPRRegBank;
300
301 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302 }
303
304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305}
306
307template <unsigned NumOps>
308RegisterBankInfo::InstructionMappings
309AMDGPURegisterBankInfo::addMappingFromTable(
310 const MachineInstr &MI, const MachineRegisterInfo &MRI,
311 const std::array<unsigned, NumOps> RegSrcOpIdx,
312 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313
314 InstructionMappings AltMappings;
315
317
316 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
318 unsigned Sizes[NumOps];
319 for (unsigned I = 0; I < NumOps; ++I) {
320 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322 }
323
324 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327 }
328
329 // getInstrMapping's default mapping uses ID 1, so start at 2.
330 unsigned MappingID = 2;
331 for (const auto &Entry : Table) {
332 for (unsigned I = 0; I < NumOps; ++I) {
333 int OpIdx = RegSrcOpIdx[I];
334 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335 }
336
337 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
338 getOperandsMapping(Operands),
339 Operands.size()));
340 }
341
342 return AltMappings;
343}
344
345RegisterBankInfo::InstructionMappings
346AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
347 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349 case Intrinsic::amdgcn_readlane: {
350 static const OpRegBankEntry<3> Table[2] = {
351 // Perfectly legal.
352 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353
354 // Need a readfirstlane for the index.
355 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356 };
357
358 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360 }
361 case Intrinsic::amdgcn_writelane: {
362 static const OpRegBankEntry<4> Table[4] = {
363 // Perfectly legal.
364 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365
366 // Need readfirstlane of first op
367 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368
369 // Need readfirstlane of second op
370 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371
372 // Need readfirstlane of both ops
373 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374 };
375
376 // dst, value, lane select, old value
377 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379 }
380 default:
381 return RegisterBankInfo::getInstrAlternativeMappings(MI);
382 }
383}
384
385RegisterBankInfo::InstructionMappings
386AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
387 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388
389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390 case Intrinsic::amdgcn_s_buffer_load: {
391 static const OpRegBankEntry<2> Table[4] = {
392 // Perfectly legal.
393 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394
395 // Only need 1 register in loop
396 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397
398 // Have to waterfall the resource.
399 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400
401 // Have to waterfall the resource, and the offset.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403 };
404
405 // rsrc, offset
406 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408 }
409 case Intrinsic::amdgcn_ds_ordered_add:
410 case Intrinsic::amdgcn_ds_ordered_swap: {
411 // VGPR = M0, VGPR
412 static const OpRegBankEntry<3> Table[2] = {
413 // Perfectly legal.
414 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
415
416 // Need a readfirstlane for m0
417 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418 };
419
420 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422 }
423 case Intrinsic::amdgcn_s_sendmsg:
424 case Intrinsic::amdgcn_s_sendmsghalt: {
425 // FIXME: Should have no register for immediate
426 static const OpRegBankEntry<1> Table[2] = {
427 // Perfectly legal.
428 { { AMDGPU::SGPRRegBankID }, 1 },
429
430 // Need readlane
431 { { AMDGPU::VGPRRegBankID }, 3 }
432 };
433
434 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436 }
437 default:
438 return RegisterBankInfo::getInstrAlternativeMappings(MI);
439 }
440}
441
442// FIXME: Returns uniform if there's no source value information. This is
443// probably wrong.
444static bool isScalarLoadLegal(const MachineInstr &MI) {
445 if (!MI.hasOneMemOperand())
446 return false;
447
448 const MachineMemOperand *MMO = *MI.memoperands_begin();
449 const unsigned AS = MMO->getAddrSpace();
450 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
451 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
452 // Require 4-byte alignment.
453 return MMO->getAlign() >= Align(4) &&
454 // Can't do a scalar atomic load.
455 !MMO->isAtomic() &&
456 // Don't use scalar loads for volatile accesses to non-constant address
457 // spaces.
458 (IsConst || !MMO->isVolatile()) &&
459 // Memory must be known constant, or not written before this load.
460 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
461 AMDGPUInstrInfo::isUniformMMO(MMO);
462}
463
464RegisterBankInfo::InstructionMappings
465AMDGPURegisterBankInfo::getInstrAlternativeMappings(
466 const MachineInstr &MI) const {
467
468 const MachineFunction &MF = *MI.getParent()->getParent();
469 const MachineRegisterInfo &MRI = MF.getRegInfo();
470
471
472 InstructionMappings AltMappings;
473 switch (MI.getOpcode()) {
474 case TargetOpcode::G_CONSTANT:
475 case TargetOpcode::G_IMPLICIT_DEF: {
476 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
477 if (Size == 1) {
478 static const OpRegBankEntry<1> Table[3] = {
479 { { AMDGPU::VGPRRegBankID }, 1 },
480 { { AMDGPU::SGPRRegBankID }, 1 },
481 { { AMDGPU::VCCRegBankID }, 1 }
482 };
483
484 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485 }
486
487 [[fallthrough]];
488 }
489 case TargetOpcode::G_FCONSTANT:
490 case TargetOpcode::G_FRAME_INDEX:
491 case TargetOpcode::G_GLOBAL_VALUE: {
492 static const OpRegBankEntry<1> Table[2] = {
493 { { AMDGPU::VGPRRegBankID }, 1 },
494 { { AMDGPU::SGPRRegBankID }, 1 }
495 };
496
497 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
498 }
499 case TargetOpcode::G_AND:
500 case TargetOpcode::G_OR:
501 case TargetOpcode::G_XOR: {
502 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
503
504 if (Size == 1) {
505 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
506 const InstructionMapping &SCCMapping = getInstructionMapping(
507 1, 1, getOperandsMapping(
508 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
509 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
510 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
511 3); // Num Operands
512 AltMappings.push_back(&SCCMapping);
513
514 const InstructionMapping &VCCMapping0 = getInstructionMapping(
515 2, 1, getOperandsMapping(
516 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
517 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
519 3); // Num Operands
520 AltMappings.push_back(&VCCMapping0);
521 return AltMappings;
522 }
523
524 if (Size != 64)
525 break;
526
527 const InstructionMapping &SSMapping = getInstructionMapping(
528 1, 1, getOperandsMapping(
529 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
530 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
531 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
532 3); // Num Operands
533 AltMappings.push_back(&SSMapping);
534
535 const InstructionMapping &VVMapping = getInstructionMapping(
536 2, 2, getOperandsMapping(
537 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
538 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
539 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
540 3); // Num Operands
541 AltMappings.push_back(&VVMapping);
542 break;
543 }
544 case TargetOpcode::G_LOAD:
545 case TargetOpcode::G_ZEXTLOAD:
546 case TargetOpcode::G_SEXTLOAD: {
547 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
548 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
549 unsigned PtrSize = PtrTy.getSizeInBits();
550 unsigned AS = PtrTy.getAddressSpace();
551
552 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
553 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
554 isScalarLoadLegal(MI)) {
555 const InstructionMapping &SSMapping = getInstructionMapping(
556 1, 1, getOperandsMapping(
557 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
558 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
559 2); // Num Operands
560 AltMappings.push_back(&SSMapping);
561 }
562
563 const InstructionMapping &VVMapping = getInstructionMapping(
564 2, 1,
565 getOperandsMapping(
566 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
567 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
568 2); // Num Operands
569 AltMappings.push_back(&VVMapping);
570
571 // It may be possible to have a vgpr = load sgpr mapping here, because
572 // the mubuf instructions support this kind of load, but probably for only
573 // gfx7 and older. However, the addressing mode matching in the instruction
574 // selector should be able to do a better job of detecting and selecting
575 // these kinds of loads from the vgpr = load vgpr mapping.
576
577 return AltMappings;
578
579 }
580 case TargetOpcode::G_SELECT: {
581 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
582 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
583 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
584 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
585 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
586 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
587 4); // Num Operands
588 AltMappings.push_back(&SSMapping);
589
590 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
591 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
592 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
593 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
594 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
595 4); // Num Operands
596 AltMappings.push_back(&VVMapping);
597
598 return AltMappings;
599 }
600 case TargetOpcode::G_UADDE:
601 case TargetOpcode::G_USUBE:
602 case TargetOpcode::G_SADDE:
603 case TargetOpcode::G_SSUBE: {
604 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
605 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
606 getOperandsMapping(
607 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
608 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
609 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
610 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
611 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
612 5); // Num Operands
613 AltMappings.push_back(&SSMapping);
614
615 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
616 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
617 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
618 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
619 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
620 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
621 5); // Num Operands
622 AltMappings.push_back(&VVMapping);
623 return AltMappings;
624 }
625 case AMDGPU::G_BRCOND: {
626 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
627
628 // TODO: Change type to 32 for scalar
629 const InstructionMapping &SMapping = getInstructionMapping(
630 1, 1, getOperandsMapping(
631 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
632 2); // Num Operands
633 AltMappings.push_back(&SMapping);
634
635 const InstructionMapping &VMapping = getInstructionMapping(
636 1, 1, getOperandsMapping(
637 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
638 2); // Num Operands
639 AltMappings.push_back(&VMapping);
640 return AltMappings;
641 }
642 case AMDGPU::G_INTRINSIC:
643 case AMDGPU::G_INTRINSIC_CONVERGENT:
644 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
645 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
646 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
647 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
648 default:
649 break;
650 }
651 return RegisterBankInfo::getInstrAlternativeMappings(MI);
652}
653
654void AMDGPURegisterBankInfo::split64BitValueForMapping(
655 MachineIRBuilder &B,
656 SmallVector<Register, 2> &Regs,
657 LLT HalfTy,
658 Register Reg) const {
659 assert(HalfTy.getSizeInBits() == 32);
660 MachineRegisterInfo *MRI = B.getMRI();
661 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
662 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
663 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
664 MRI->setRegBank(LoLHS, *Bank);
665 MRI->setRegBank(HiLHS, *Bank);
666
667 Regs.push_back(LoLHS);
668 Regs.push_back(HiLHS);
669
670 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
671 .addDef(LoLHS)
672 .addDef(HiLHS)
673 .addUse(Reg);
674}
675
676/// Replace the current type each register in \p Regs has with \p NewTy
677static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
678 LLT NewTy) {
679 for (Register Reg : Regs) {
680 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
681 MRI.setType(Reg, NewTy);
682 }
683}
684
685static LLT getHalfSizedType(LLT Ty) {
686 if (Ty.isVector()) {
687 assert(Ty.getElementCount().isKnownMultipleOf(2));
688 return LLT::scalarOrVector(Ty.getElementCount().divideCoefficientBy(2),
689 Ty.getElementType());
690 }
691
692 assert(Ty.getScalarSizeInBits() % 2 == 0);
693 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
694}
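// For example, getHalfSizedType(s64) is s32 and getHalfSizedType(<4 x s16>)
// is <2 x s16>.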
695
696// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
697// source value into a scalar register.
698Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
699 MachineRegisterInfo &MRI,
700 Register Src) const {
701 LLT Ty = MRI.getType(Src);
702 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
703
704 if (Bank == &AMDGPU::SGPRRegBank)
705 return Src;
706
707 unsigned Bits = Ty.getSizeInBits();
708 assert(Bits % 32 == 0);
709
710 if (Bank != &AMDGPU::VGPRRegBank) {
711 // We need to copy from AGPR to VGPR
712 Src = B.buildCopy(Ty, Src).getReg(0);
713 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
714 }
715
716 LLT S32 = LLT::scalar(32);
717 unsigned NumParts = Bits / 32;
718 SmallVector<Register, 8> SrcParts;
719 SmallVector<Register, 8> DstParts;
720
721 if (Bits == 32) {
722 SrcParts.push_back(Src);
723 } else {
724 auto Unmerge = B.buildUnmerge(S32, Src);
725 for (unsigned i = 0; i < NumParts; ++i)
726 SrcParts.push_back(Unmerge.getReg(i));
727 }
728
729 for (unsigned i = 0; i < NumParts; ++i) {
730 Register SrcPart = SrcParts[i];
731 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
732 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
733
734 const TargetRegisterClass *Constrained =
735 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
736 (void)Constrained;
737 assert(Constrained && "Failed to constrain readfirstlane src reg");
738
739 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
740
741 DstParts.push_back(DstPart);
742 }
743
744 if (Bits == 32)
745 return DstParts[0];
746
747 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
748 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
749 return Dst;
750}
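// For example, a 64-bit VGPR source is lowered roughly as:
//   %lo:vgpr(s32), %hi:vgpr(s32) = G_UNMERGE_VALUES %src:vgpr(s64)
//   %slo:sreg_32(s32) = V_READFIRSTLANE_B32 %lo
//   %shi:sreg_32(s32) = V_READFIRSTLANE_B32 %hi
//   %dst:sgpr(s64) = G_MERGE_VALUES %slo, %shi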
751
752/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
753/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
754/// execute the instruction for each unique combination of values in all lanes
755/// in the wave. The block will be split such that rest of the instructions are
756/// moved to a new block.
757///
758/// Essentially performs this loop:
759//
760/// Save Execution Mask
761/// For (Lane : Wavefront) {
762/// Enable Lane, Disable all other lanes
763/// SGPR = read SGPR value for current lane from VGPR
764/// VGPRResult[Lane] = use_op SGPR
765/// }
766/// Restore Execution Mask
767///
768/// There is additional complexity in comparing the values to identify the
769/// unique values used.
770bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
771 MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
772 SmallSet<Register, 4> &SGPROperandRegs) const {
773 // Track use registers which have already been expanded with a readfirstlane
774 // sequence. This may have multiple uses if moving a sequence.
775 DenseMap<Register, Register> WaterfalledRegMap;
776
777 MachineBasicBlock &MBB = B.getMBB();
778 MachineFunction *MF = &B.getMF();
779
780 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
781 const unsigned MovExecOpc =
782 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
783 const unsigned MovExecTermOpc =
784 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
785
786 const unsigned XorTermOpc = Subtarget.isWave32() ?
787 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
788 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
789 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
790 const unsigned ExecReg = Subtarget.isWave32() ?
791 AMDGPU::EXEC_LO : AMDGPU::EXEC;
792
793#ifndef NDEBUG
794 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
795#endif
796
797 MachineRegisterInfo &MRI = *B.getMRI();
798 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
799 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
800
801 // Don't bother using generic instructions/registers for the exec mask.
802 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
803 .addDef(InitSaveExecReg);
804
805 Register PhiExec = MRI.createVirtualRegister(WaveRC);
806 Register NewExec = MRI.createVirtualRegister(WaveRC);
807
808 // To insert the loop we need to split the block. Move everything before this
809 // point to a new block, and insert a new empty block before this instruction.
810 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
811 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
812 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
813 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
814 MachineFunction::iterator MBBI(MBB);
815 ++MBBI;
816 MF->insert(MBBI, LoopBB);
817 MF->insert(MBBI, BodyBB);
818 MF->insert(MBBI, RestoreExecBB);
819 MF->insert(MBBI, RemainderBB);
820
821 LoopBB->addSuccessor(BodyBB);
822 BodyBB->addSuccessor(RestoreExecBB);
823 BodyBB->addSuccessor(LoopBB);
824
825 // Move the rest of the block into a new block.
826 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
827 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
828
829 MBB.addSuccessor(LoopBB);
830 RestoreExecBB->addSuccessor(RemainderBB);
831
832 B.setInsertPt(*LoopBB, LoopBB->end());
833
834 B.buildInstr(TargetOpcode::PHI)
835 .addDef(PhiExec)
836 .addReg(InitSaveExecReg)
837 .addMBB(&MBB)
838 .addReg(NewExec)
839 .addMBB(BodyBB);
840
841 const DebugLoc &DL = B.getDL();
842
843 MachineInstr &FirstInst = *Range.begin();
844
845 // Move the instruction into the loop body. Note we moved everything after
846 // Range.end() already into a new block, so Range.end() is no longer valid.
847 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
848
849 // Figure out the iterator range after splicing the instructions.
850 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
851 auto NewEnd = BodyBB->end();
852
853 B.setMBB(*LoopBB);
854
855 LLT S1 = LLT::scalar(1);
856 Register CondReg;
857
858 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
859
860 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
861 for (MachineOperand &Op : MI.all_uses()) {
862 Register OldReg = Op.getReg();
863 if (!SGPROperandRegs.count(OldReg))
864 continue;
865
866 // See if we already processed this register in another instruction in the
867 // sequence.
868 auto OldVal = WaterfalledRegMap.find(OldReg);
869 if (OldVal != WaterfalledRegMap.end()) {
870 Op.setReg(OldVal->second);
871 continue;
872 }
873
874 Register OpReg = Op.getReg();
875 LLT OpTy = MRI.getType(OpReg);
876
877 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
878 if (OpBank != &AMDGPU::VGPRRegBank) {
879 // Insert copy from AGPR to VGPR before the loop.
880 B.setMBB(MBB);
881 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
882 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
883 B.setMBB(*LoopBB);
884 }
885
886 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
887
888 // Build the comparison(s).
889 unsigned OpSize = OpTy.getSizeInBits();
890 bool Is64 = OpSize % 64 == 0;
891 unsigned PartSize = Is64 ? 64 : 32;
892 LLT PartTy = LLT::scalar(PartSize);
893 unsigned NumParts = OpSize / PartSize;
894 SmallVector<Register, 8> OpParts;
895 SmallVector<Register, 8> CurrentLaneParts;
896
897 if (NumParts == 1) {
898 OpParts.push_back(OpReg);
899 CurrentLaneParts.push_back(CurrentLaneReg);
900 } else {
901 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
902 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
903 for (unsigned i = 0; i < NumParts; ++i) {
904 OpParts.push_back(UnmergeOp.getReg(i));
905 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
906 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
907 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
908 }
909 }
910
911 for (unsigned i = 0; i < NumParts; ++i) {
912 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
913 OpParts[i]).getReg(0);
914 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
915
916 if (!CondReg) {
917 CondReg = CmpReg;
918 } else {
919 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
920 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
921 }
922 }
923
924 Op.setReg(CurrentLaneReg);
925
926 // Make sure we don't re-process this register again.
927 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
928 }
929 }
930
931 // The ballot becomes a no-op during instruction selection.
932 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
933 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
934 .addReg(CondReg)
935 .getReg(0);
936 MRI.setRegClass(CondReg, WaveRC);
937
938 // Update EXEC, save the original EXEC value to VCC.
939 B.buildInstr(AndSaveExecOpc)
940 .addDef(NewExec)
941 .addReg(CondReg, RegState::Kill);
942
943 MRI.setSimpleHint(NewExec, CondReg);
944
945 B.setInsertPt(*BodyBB, BodyBB->end());
946
947 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
948 B.buildInstr(XorTermOpc)
949 .addDef(ExecReg)
950 .addReg(ExecReg)
951 .addReg(NewExec);
952
953 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
954 // s_cbranch_scc0?
955
956 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
957 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
958
959 // Save the EXEC mask before the loop.
960 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
961 .addReg(ExecReg);
962
963 // Restore the EXEC mask after the loop.
964 B.setMBB(*RestoreExecBB);
965 B.buildInstr(MovExecTermOpc)
966 .addDef(ExecReg)
967 .addReg(SaveExecReg);
968
969 // Set the insert point after the original instruction, so any new
970 // instructions will be in the remainder.
971 B.setInsertPt(*RemainderBB, RemainderBB->begin());
972
973 return true;
974}
975
976// Return any unique registers used by \p MI at \p OpIndices that need to be
977// handled in a waterfall loop. Returns these registers in \p
978// SGPROperandRegs. Returns true if there are any operands to handle and a
979// waterfall loop is necessary.
980bool AMDGPURegisterBankInfo::collectWaterfallOperands(
981 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
982 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
983 for (unsigned Op : OpIndices) {
984 assert(MI.getOperand(Op).isUse());
985 Register Reg = MI.getOperand(Op).getReg();
986 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
987 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
988 SGPROperandRegs.insert(Reg);
989 }
990
991 // No operands need to be replaced, so no need to loop.
992 return !SGPROperandRegs.empty();
993}
994
995bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
996 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
997 // Use a set to avoid extra readfirstlanes in the case where multiple operands
998 // are the same register.
999 SmallSet<Register, 4> SGPROperandRegs;
1000
1001 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1002 return false;
1003
1004 MachineBasicBlock::iterator I = MI.getIterator();
1005 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1006 SGPROperandRegs);
1007}
1008
1009// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1012 Register Reg = MI.getOperand(OpIdx).getReg();
1013 MachineRegisterInfo &MRI = *B.getMRI();
1014 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1015 if (Bank == &AMDGPU::SGPRRegBank)
1016 return;
1017
1018 Reg = buildReadFirstLane(B, MRI, Reg);
1019 MI.getOperand(OpIdx).setReg(Reg);
1020}
1021
1022/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1023/// rest will be in the remainder.
1024static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1025 unsigned TotalSize = Ty.getSizeInBits();
1026 if (!Ty.isVector())
1027 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1028
1029 LLT EltTy = Ty.getElementType();
1030 unsigned EltSize = EltTy.getSizeInBits();
1031 assert(FirstSize % EltSize == 0);
1032
1033 unsigned FirstPartNumElts = FirstSize / EltSize;
1034 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1035
1036 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1037 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1038}
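// For example, splitUnequalType(<3 x s32>, 64) yields {<2 x s32>, s32} and
// splitUnequalType(s96, 64) yields {s64, s32}.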
1039
1040static LLT widen96To128(LLT Ty) {
1041 if (!Ty.isVector())
1042 return LLT::scalar(128);
1043
1044 LLT EltTy = Ty.getElementType();
1045 assert(128 % EltTy.getSizeInBits() == 0);
1046 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1047}
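// For example, widen96To128(s96) is s128 and widen96To128(<3 x s32>) is
// <4 x s32>.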
1048
1049bool AMDGPURegisterBankInfo::applyMappingLoad(
1050 MachineIRBuilder &B,
1051 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052 MachineInstr &MI) const {
1053 MachineRegisterInfo &MRI = *B.getMRI();
1054 Register DstReg = MI.getOperand(0).getReg();
1055 const LLT LoadTy = MRI.getType(DstReg);
1056 unsigned LoadSize = LoadTy.getSizeInBits();
1057 const unsigned MaxNonSmrdLoadSize = 128;
1058
1059 const RegisterBank *DstBank =
1060 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1061 if (DstBank == &AMDGPU::SGPRRegBank) {
1062 // There are some special cases that we need to look at for 32 bit and 96
1063 // bit SGPR loads otherwise we have nothing to do.
1064 if (LoadSize != 32 && LoadSize != 96)
1065 return false;
1066
1067 MachineMemOperand *MMO = *MI.memoperands_begin();
1068 const unsigned MemSize = 8 * MMO->getSize();
1069 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1070 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1071 // scalar loads should have a load size of 32 but memory access size of less
1072 // than 32.
1073 if (LoadSize == 32 &&
1074 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1075 return false;
1076
1077 Register PtrReg = MI.getOperand(1).getReg();
1078
1079 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1080
1081 if (LoadSize == 32) {
1082 // This is an extending load from a sub-dword size. Widen the memory
1083 // access size to 4 bytes and clear the extra high bits appropriately
1084 const LLT S32 = LLT::scalar(32);
1085 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093 } else
1094 // We do not need to touch the higher bits for regular loads.
1095 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096 } else {
1097 // 96-bit loads are only available for vector loads. We need to split this
1098 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1099 if (MMO->getAlign() < Align(16)) {
1100 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1101 LLT Part64, Part32;
1102 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1103 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1104 LegalizerHelper::Legalized)
1105 return false;
1106 return true;
1107 } else {
1108 LLT WiderTy = widen96To128(LoadTy);
1109 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1110 if (WiderTy.isScalar())
1111 B.buildTrunc(MI.getOperand(0), WideLoad);
1112 else {
1113 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1114 WideLoad);
1115 }
1116 }
1117 }
1118
1119 MI.eraseFromParent();
1120 return true;
1121 }
1122
1123 // 128-bit loads are supported for all instruction types.
1124 if (LoadSize <= MaxNonSmrdLoadSize)
1125 return false;
1126
1127 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1128 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1129
1130 if (SrcRegs.empty())
1131 SrcRegs.push_back(MI.getOperand(1).getReg());
1132
1133 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1134
1135 // RegBankSelect only emits scalar types, so we need to reset the pointer
1136 // operand to a pointer type.
1137 Register BasePtrReg = SrcRegs[0];
1138 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1139 MRI.setType(BasePtrReg, PtrTy);
1140
1141 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1142 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1143 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1144 LegalizerHelper Helper(B.getMF(), O, B);
1145
1146 if (LoadTy.isVector()) {
1147 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1148 return false;
1149 } else {
1150 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1151 return false;
1152 }
1153
1154 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1155 return true;
1156}
1157
1158bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1159 MachineIRBuilder &B,
1160 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1161 MachineInstr &MI) const {
1162 MachineRegisterInfo &MRI = *B.getMRI();
1163 const MachineFunction &MF = B.getMF();
1164 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1165 const auto &TFI = *ST.getFrameLowering();
1166
1167 // Guard in case the stack growth direction ever changes with scratch
1168 // instructions.
1169 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1170 return false;
1171
1172 Register Dst = MI.getOperand(0).getReg();
1173 Register AllocSize = MI.getOperand(1).getReg();
1174 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1175
1176 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1177
1178 // TODO: Need to emit a wave reduction to get the maximum size.
1179 if (SizeBank != &AMDGPU::SGPRRegBank)
1180 return false;
1181
1182 LLT PtrTy = MRI.getType(Dst);
1183 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1184
1185 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1186 Register SPReg = Info->getStackPtrOffsetReg();
1187 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1188
1189 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1190 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1191
1192 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1193 if (Alignment > TFI.getStackAlign()) {
1194 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1195 B.buildMaskLowPtrBits(Dst, PtrAdd,
1196 Log2(Alignment) + ST.getWavefrontSizeLog2());
1197 } else {
1198 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1199 }
1200
1201 MI.eraseFromParent();
1202 return true;
1203}
1204
1205bool AMDGPURegisterBankInfo::applyMappingImage(
1206 MachineIRBuilder &B, MachineInstr &MI,
1207 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1208 int RsrcIdx) const {
1209 const int NumDefs = MI.getNumExplicitDefs();
1210
1211 // The reported argument index is relative to the IR intrinsic call arguments,
1212 // so we need to shift by the number of defs and the intrinsic ID.
1213 RsrcIdx += NumDefs + 1;
1214
1215 // Insert copies to VGPR arguments.
1216 applyDefaultMapping(OpdMapper);
1217
1218 // Fixup any SGPR arguments.
1219 SmallVector<unsigned, 4> SGPRIndexes;
1220 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1221 if (!MI.getOperand(I).isReg())
1222 continue;
1223
1224 // If this intrinsic has a sampler, it immediately follows rsrc.
1225 if (I == RsrcIdx || I == RsrcIdx + 1)
1226 SGPRIndexes.push_back(I);
1227 }
1228
1229 executeInWaterfallLoop(B, MI, SGPRIndexes);
1230 return true;
1231}
1232
1233// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1234// the three offsets (voffset, soffset and instoffset)
1235unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1236 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1237 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1238 const LLT S32 = LLT::scalar(32);
1239 MachineRegisterInfo *MRI = B.getMRI();
1240
1241 if (std::optional<int64_t> Imm =
1242 getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1243 uint32_t SOffset, ImmOffset;
1244 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1245 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1246 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1247 InstOffsetVal = ImmOffset;
1248
1249 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1250 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1251 return SOffset + ImmOffset;
1252 }
1253 }
1254
1255 Register Base;
1256 unsigned Offset;
1257
1258 std::tie(Base, Offset) =
1259 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1260
1261 uint32_t SOffset, ImmOffset;
1262 if ((int)Offset > 0 &&
1263 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1264 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1265 VOffsetReg = Base;
1266 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1267 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1268 InstOffsetVal = ImmOffset;
1269 return 0; // XXX - Why is this 0?
1270 }
1271
1272 // If we have SGPR base, we can use it for soffset.
1273 if (SOffset == 0) {
1274 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1275 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1276 SOffsetReg = Base;
1277 InstOffsetVal = ImmOffset;
1278 return 0; // XXX - Why is this 0?
1279 }
1280 }
1281
1282 // Handle the variable sgpr + vgpr case.
1283 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1284 if (Add && (int)Offset >= 0) {
1285 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1286 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1287
1288 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1289 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1290
1291 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1292 VOffsetReg = Src0;
1293 SOffsetReg = Src1;
1294 return 0;
1295 }
1296
1297 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1298 VOffsetReg = Src1;
1299 SOffsetReg = Src0;
1300 return 0;
1301 }
1302 }
1303
1304 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1305 // have an SGPR offset and a VGPR resource.
1306 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1307 VOffsetReg = CombinedOffset;
1308 } else {
1309 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1310 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1311 }
1312
1313 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1314 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1315 return 0;
1316}
1317
1318bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1319 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1320 MachineInstr &MI = OpdMapper.getMI();
1321 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1322
1323 const LLT S32 = LLT::scalar(32);
1324 Register Dst = MI.getOperand(0).getReg();
1325 LLT Ty = MRI.getType(Dst);
1326
1327 const RegisterBank *RSrcBank =
1328 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1329 const RegisterBank *OffsetBank =
1330 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1331 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1332 OffsetBank == &AMDGPU::SGPRRegBank)
1333 return true; // Legal mapping
1334
1335 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1336 // here but don't have an MMO.
1337
1338 unsigned LoadSize = Ty.getSizeInBits();
1339 int NumLoads = 1;
1340 if (LoadSize == 256 || LoadSize == 512) {
1341 NumLoads = LoadSize / 128;
1342 Ty = Ty.divide(NumLoads);
1343 }
1344
1345 // Use the alignment to ensure that the required offsets will fit into the
1346 // immediate offsets.
1347 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1348
1349 MachineFunction &MF = B.getMF();
1350
1351 Register SOffset;
1352 Register VOffset;
1353 int64_t ImmOffset = 0;
1354
1355 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1356 SOffset, ImmOffset, Alignment);
1357
1358 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1359 // can, but we need to track an MMO for that.
1360 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1361 const Align MemAlign(4); // FIXME: ABI type alignment?
1362 MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
1363 MachinePointerInfo(),
1364 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
1365 MachineMemOperand::MOInvariant,
1366 MemSize, MemAlign);
1367 if (MMOOffset != 0)
1368 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1369
1370 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1371 // assume that the buffer is unswizzled.
1372
1373 Register RSrc = MI.getOperand(1).getReg();
1374 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1375 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1376
1377 SmallVector<Register, 4> LoadParts(NumLoads);
1378
1379 MachineBasicBlock::iterator MII = MI.getIterator();
1380 MachineInstrSpan Span(MII, &B.getMBB());
1381
1382 for (int i = 0; i < NumLoads; ++i) {
1383 if (NumLoads == 1) {
1384 LoadParts[i] = Dst;
1385 } else {
1386 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1387 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1388 }
1389
1390 MachineMemOperand *MMO = BaseMMO;
1391 if (i != 0)
1392 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1393
1394 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1395 .addDef(LoadParts[i]) // vdata
1396 .addUse(RSrc) // rsrc
1397 .addUse(VIndex) // vindex
1398 .addUse(VOffset) // voffset
1399 .addUse(SOffset) // soffset
1400 .addImm(ImmOffset + 16 * i) // offset(imm)
1401 .addImm(0) // cachepolicy, swizzled buffer(imm)
1402 .addImm(0) // idxen(imm)
1403 .addMemOperand(MMO);
1404 }
1405
1406 // TODO: If only the resource is a VGPR, it may be better to execute the
1407 // scalar load in the waterfall loop if the resource is expected to frequently
1408 // be dynamically uniform.
1409 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1410 // Remove the original instruction to avoid potentially confusing the
1411 // waterfall loop logic.
1412 B.setInstr(*Span.begin());
1413 MI.eraseFromParent();
1414
1415 SmallSet<Register, 4> OpsToWaterfall;
1416
1417 OpsToWaterfall.insert(RSrc);
1418 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1419 OpsToWaterfall);
1420 }
1421
1422 if (NumLoads != 1) {
1423 if (Ty.isVector())
1424 B.buildConcatVectors(Dst, LoadParts);
1425 else
1426 B.buildMergeLikeInstr(Dst, LoadParts);
1427 }
1428
1429 // We removed the instruction earlier with a waterfall loop.
1430 if (RSrcBank == &AMDGPU::SGPRRegBank)
1431 MI.eraseFromParent();
1432
1433 return true;
1434}
1435
1436bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
1437 const OperandsMapper &OpdMapper,
1438 bool Signed) const {
1439 MachineInstr &MI = OpdMapper.getMI();
1440 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1441
1442 // Insert basic copies
1443 applyDefaultMapping(OpdMapper);
1444
1445 Register DstReg = MI.getOperand(0).getReg();
1446 LLT Ty = MRI.getType(DstReg);
1447
1448 const LLT S32 = LLT::scalar(32);
1449
1450 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1451 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1452 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1453 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1454
1455 const RegisterBank *DstBank =
1456 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1457 if (DstBank == &AMDGPU::VGPRRegBank) {
1458 if (Ty == S32)
1459 return true;
1460
1461 // There are no 64-bit VGPR bitfield extract instructions, so the operation
1462 // is expanded to a sequence of instructions that implement it.
1463 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1464
1465 const LLT S64 = LLT::scalar(64);
1466 // Shift the source operand so that extracted bits start at bit 0.
1467 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1468 : B.buildLShr(S64, SrcReg, OffsetReg);
1469 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1470
1471 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1472 // if the width is a constant.
1473 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1474 // Use the 32-bit bitfield extract instruction if the width is a constant.
1475 // Depending on the width size, use either the low or high 32-bits.
1476 auto Zero = B.buildConstant(S32, 0);
1477 auto WidthImm = ConstWidth->Value.getZExtValue();
1478 if (WidthImm <= 32) {
1479 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1480 // or clear the upper 32-bits.
1481 auto Extract =
1482 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1483 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1484 auto Extend =
1485 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1486 B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1487 } else {
1488 // Use bitfield extract on upper 32-bit source, and combine with lower
1489 // 32-bit source.
1490 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1491 auto Extract =
1492 Signed
1493 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1494 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1495 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1496 }
1497 MI.eraseFromParent();
1498 return true;
1499 }
1500
1501 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1502 // operations.
1503 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1504 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1505 if (Signed)
1506 B.buildAShr(S64, SignBit, ExtShift);
1507 else
1508 B.buildLShr(S64, SignBit, ExtShift);
1509 MI.eraseFromParent();
1510 return true;
1511 }
1512
1513 // The scalar form packs the offset and width in a single operand.
1514
1515 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1516
1517 // Ensure the high bits are clear to insert the offset.
1518 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1519 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1520
1521 // Zeros out the low bits, so don't bother clamping the input value.
1522 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1523
1524 // Pack the offset and width of a BFE into
1525 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1526 // source, bits [5:0] contain the offset and bits [22:16] the width.
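 // For example, offset 8 and width 16 pack to (16 << 16) | 8 = 0x100008.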
1527 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1528
1529 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1530 // register class constraints.
1531 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1532 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1533
1534 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1535 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1536 llvm_unreachable("failed to constrain BFE");
1537
1538 MI.eraseFromParent();
1539 return true;
1540}
1541
1542bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1543 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1544 MachineInstr &MI = OpdMapper.getMI();
1545 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1546
1547 // Insert basic copies.
1548 applyDefaultMapping(OpdMapper);
1549
1550 Register Dst0 = MI.getOperand(0).getReg();
1551 Register Dst1 = MI.getOperand(1).getReg();
1552 Register Src0 = MI.getOperand(2).getReg();
1553 Register Src1 = MI.getOperand(3).getReg();
1554 Register Src2 = MI.getOperand(4).getReg();
1555
1556 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1557 return true;
1558
1559 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1560 LLT S1 = LLT::scalar(1);
1561 LLT S32 = LLT::scalar(32);
1562
1563 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1564 bool Accumulate = true;
1565
1566 if (!DstOnValu) {
1567 if (mi_match(Src2, MRI, m_ZeroInt()))
1568 Accumulate = false;
1569 }
1570
1571 // Keep the multiplication on the SALU.
1572 Register DstHi;
1573 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1574 bool MulHiInVgpr = false;
1575
1576 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1577
1578 if (Subtarget.hasSMulHi()) {
1579 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1580 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1581 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1582 } else {
1583 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1584 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1585
1586 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1587 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1588
1589 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1590 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1591 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1592
1593 if (!DstOnValu) {
1594 DstHi = buildReadFirstLane(B, MRI, DstHi);
1595 } else {
1596 MulHiInVgpr = true;
1597 }
1598 }
1599
1600 // Accumulate and produce the "carry-out" bit.
1601 //
1602 // The "carry-out" is defined as bit 64 of the result when computed as a
1603 // big integer. For unsigned multiply-add, this matches the usual definition
1604 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1605 // result, which is determined as:
1606 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1607 LLT CarryType = DstOnValu ? S1 : S32;
1608 const RegisterBank &CarryBank =
1609 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1610 const RegisterBank &DstBank =
1611 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1612 Register Carry;
1613 Register Zero;
1614
1615 if (!IsUnsigned) {
1616 Zero = B.buildConstant(S32, 0).getReg(0);
1617 MRI.setRegBank(Zero,
1618 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1619
1620 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1621 .getReg(0);
1622 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1623 : AMDGPU::SGPRRegBank);
1624
1625 if (DstOnValu && !MulHiInVgpr) {
1626 Carry = B.buildTrunc(S1, Carry).getReg(0);
1627 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1628 }
1629 }
1630
1631 if (Accumulate) {
1632 if (DstOnValu) {
1633 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1634 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1635 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1636 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1637 }
1638
1639 auto Unmerge = B.buildUnmerge(S32, Src2);
1640 Register Src2Lo = Unmerge.getReg(0);
1641 Register Src2Hi = Unmerge.getReg(1);
1642 MRI.setRegBank(Src2Lo, DstBank);
1643 MRI.setRegBank(Src2Hi, DstBank);
1644
1645 if (!IsUnsigned) {
1646 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1647 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1648
1649 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1650 MRI.setRegBank(Carry, CarryBank);
1651 }
1652
1653 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1654 DstLo = AddLo.getReg(0);
1655 Register CarryLo = AddLo.getReg(1);
1656 MRI.setRegBank(DstLo, DstBank);
1657 MRI.setRegBank(CarryLo, CarryBank);
1658
1659 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1660 DstHi = AddHi.getReg(0);
1661 MRI.setRegBank(DstHi, DstBank);
1662
1663 Register CarryHi = AddHi.getReg(1);
1664 MRI.setRegBank(CarryHi, CarryBank);
1665
1666 if (IsUnsigned) {
1667 Carry = CarryHi;
1668 } else {
1669 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1670 MRI.setRegBank(Carry, CarryBank);
1671 }
1672 } else {
1673 if (IsUnsigned) {
1674 Carry = B.buildConstant(CarryType, 0).getReg(0);
1675 MRI.setRegBank(Carry, CarryBank);
1676 }
1677 }
1678
1679 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1680
1681 if (DstOnValu) {
1682 B.buildCopy(Dst1, Carry);
1683 } else {
1684 B.buildTrunc(Dst1, Carry);
1685 }
1686
1687 MI.eraseFromParent();
1688 return true;
1689}
1690
1691// Return a suitable opcode for extending the operands of Opc when widening.
1692static unsigned getExtendOp(unsigned Opc) {
1693 switch (Opc) {
1694 case TargetOpcode::G_ASHR:
1695 case TargetOpcode::G_SMIN:
1696 case TargetOpcode::G_SMAX:
1697 return TargetOpcode::G_SEXT;
1698 case TargetOpcode::G_LSHR:
1699 case TargetOpcode::G_UMIN:
1700 case TargetOpcode::G_UMAX:
1701 return TargetOpcode::G_ZEXT;
1702 default:
1703 return TargetOpcode::G_ANYEXT;
1704 }
1705}
1706
1707// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1708// any illegal vector extend or unmerge operations.
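// For illustration: the <2 x s16> source is bitcast to s32 with element 0 in
// bits [15:0] and element 1 in bits [31:16]; the low element is produced in
// place (sext_inreg, mask, or nothing for anyext) and the high element is
// recovered with a 16-bit shift (arithmetic for sign extension, logical
// otherwise).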
1709static std::pair<Register, Register>
1710unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1711 const LLT S32 = LLT::scalar(32);
1712 auto Bitcast = B.buildBitcast(S32, Src);
1713
1714 if (ExtOpcode == TargetOpcode::G_SEXT) {
1715 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1716 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1717 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1718 }
1719
1720 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1721 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1722 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1723 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1724 }
1725
1726 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1727 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1728}
1729
1730 // For cases where only a single copy is inserted for matching register
1731 // banks, replace the register in the instruction operand with the copy.
1732 static bool substituteSimpleCopyRegs(
1733     const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1734 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1735 if (!SrcReg.empty()) {
1736 assert(SrcReg.size() == 1);
1737 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1738 return true;
1739 }
1740
1741 return false;
1742}
1743
1744/// Handle register layout difference for f16 images for some subtargets.
1745 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1746                                                 MachineRegisterInfo &MRI,
1747                                                 Register Reg) const {
1748   if (!Subtarget.hasUnpackedD16VMem())
1749     return Reg;
1750
1751 const LLT S16 = LLT::scalar(16);
1752 LLT StoreVT = MRI.getType(Reg);
1753 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1754 return Reg;
1755
1756 auto Unmerge = B.buildUnmerge(S16, Reg);
1757
1758
1759 SmallVector<Register, 4> WideRegs;
1760 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1761 WideRegs.push_back(Unmerge.getReg(I));
1762
1763 const LLT S32 = LLT::scalar(32);
1764 int NumElts = StoreVT.getNumElements();
1765
1766 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1767 .getReg(0);
1768}
1769
1770static std::pair<Register, unsigned>
1771 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1772   int64_t Const;
1773 if (mi_match(Reg, MRI, m_ICst(Const)))
1774 return std::pair(Register(), Const);
1775
1776 Register Base;
1777 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1778 return std::pair(Base, Const);
1779
1780 // TODO: Handle G_OR used for add case
1781 return std::pair(Reg, 0);
1782}
1783
1784std::pair<Register, unsigned>
1785 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1786                                            Register OrigOffset) const {
1787 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
1788 Register BaseReg;
1789 unsigned ImmOffset;
1790 const LLT S32 = LLT::scalar(32);
1791
1792 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1793 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1794 OrigOffset);
1795
1796 unsigned C1 = 0;
1797 if (ImmOffset != 0) {
1798     // If the immediate value is too big for the immoffset field, keep only
1799     // the bits that fit in that field. The remaining value, which is
1800     // copied/added into the voffset register, is then a large power of 2 and
1801     // stands a better chance of being CSEd with the copy/add of another
1802     // similar load/store.
1803     // However, do not round down if the remaining value would be negative,
1804     // as it appears to be illegal to have a negative offset in the vgpr,
1805     // even if adding the immediate offset makes it positive.
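    // For illustration, assuming a 12-bit field (MaxImm = 4095): an incoming
    // offset of 5000 splits into Overflow = 5000 & ~4095 = 4096 for the
    // voffset register and ImmOffset = 904 for the immediate field.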
1806 unsigned Overflow = ImmOffset & ~MaxImm;
1807 ImmOffset -= Overflow;
1808 if ((int32_t)Overflow < 0) {
1809 Overflow += ImmOffset;
1810 ImmOffset = 0;
1811 }
1812
1813 C1 = ImmOffset;
1814 if (Overflow != 0) {
1815 if (!BaseReg)
1816 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1817 else {
1818 auto OverflowVal = B.buildConstant(S32, Overflow);
1819 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1820 }
1821 }
1822 }
1823
1824 if (!BaseReg)
1825 BaseReg = B.buildConstant(S32, 0).getReg(0);
1826
1827 return {BaseReg, C1};
1828}
1829
1830 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1831                                         Register SrcReg) const {
1832 MachineRegisterInfo &MRI = *B.getMRI();
1833 LLT SrcTy = MRI.getType(SrcReg);
1834 if (SrcTy.getSizeInBits() == 32) {
1835 // Use a v_mov_b32 here to make the exec dependency explicit.
1836 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1837 .addDef(DstReg)
1838 .addUse(SrcReg);
1839 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1840 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1841 }
1842
1843 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1844 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1845
1846 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1847 .addDef(TmpReg0)
1848 .addUse(SrcReg, 0, AMDGPU::sub0);
1849 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1850 .addDef(TmpReg1)
1851 .addUse(SrcReg, 0, AMDGPU::sub1);
1852 B.buildInstr(AMDGPU::REG_SEQUENCE)
1853 .addDef(DstReg)
1854 .addUse(TmpReg0)
1855 .addImm(AMDGPU::sub0)
1856 .addUse(TmpReg1)
1857 .addImm(AMDGPU::sub1);
1858
1859 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1860 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1861}
1862
1863/// Utility function for pushing dynamic vector indexes with a constant offset
1864/// into waterfall loops.
1865 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1866                                    MachineInstr &IdxUseInstr,
1867 unsigned OpIdx,
1868 unsigned ConstOffset) {
1869 MachineRegisterInfo &MRI = *B.getMRI();
1870 const LLT S32 = LLT::scalar(32);
1871 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1872 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1873
1874 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1875
1876 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1877 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1878 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1879 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1880}
1881
1882/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1883/// original 32-bit source value (to be inserted in the low part of the combined
1884/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1885/// value.
1886 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1887                                   Register Hi32Reg, Register Lo32Reg,
1888 unsigned ExtOpc,
1889 const RegisterBank &RegBank,
1890 bool IsBooleanSrc = false) {
1891 if (ExtOpc == AMDGPU::G_ZEXT) {
1892 B.buildConstant(Hi32Reg, 0);
1893 } else if (ExtOpc == AMDGPU::G_SEXT) {
1894 if (IsBooleanSrc) {
1895 // If we know the original source was an s1, the high half is the same as
1896 // the low.
1897 B.buildCopy(Hi32Reg, Lo32Reg);
1898 } else {
1899 // Replicate sign bit from 32-bit extended part.
1900 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1901 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1902 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1903 }
1904 } else {
1905 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1906 B.buildUndef(Hi32Reg);
1907 }
1908}
1909
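// Try to lower a dynamic extract_vector_elt into a chain of compare+select:
// the index is compared against each constant lane number and the matching
// element is selected. Profitability is decided by shouldExpandVectorDynExt.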
1910bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1911     MachineIRBuilder &B, MachineInstr &MI,
1912     const OperandsMapper &OpdMapper) const {
1913 MachineRegisterInfo &MRI = *B.getMRI();
1914
1915 Register VecReg = MI.getOperand(1).getReg();
1916 Register Idx = MI.getOperand(2).getReg();
1917
1918 const RegisterBank &IdxBank =
1919 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1920
1921 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1922
1923 LLT VecTy = MRI.getType(VecReg);
1924 unsigned EltSize = VecTy.getScalarSizeInBits();
1925 unsigned NumElem = VecTy.getNumElements();
1926
1927 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1928 IsDivergentIdx, &Subtarget))
1929 return false;
1930
1931 LLT S32 = LLT::scalar(32);
1932
1933 const RegisterBank &DstBank =
1934 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1935 const RegisterBank &SrcBank =
1936 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1937
1938 const RegisterBank &CCBank =
1939 (DstBank == AMDGPU::SGPRRegBank &&
1940 SrcBank == AMDGPU::SGPRRegBank &&
1941 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1942 : AMDGPU::VCCRegBank;
1943 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1944
1945 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1946 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1947 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1948 }
1949
1950 LLT EltTy = VecTy.getScalarType();
1951 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1952 unsigned NumLanes = DstRegs.size();
1953 if (!NumLanes)
1954 NumLanes = 1;
1955 else
1956 EltTy = MRI.getType(DstRegs[0]);
1957
1958 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1959 SmallVector<Register, 2> Res(NumLanes);
1960 for (unsigned L = 0; L < NumLanes; ++L)
1961 Res[L] = UnmergeToEltTy.getReg(L);
1962
1963 for (unsigned I = 1; I < NumElem; ++I) {
1964 auto IC = B.buildConstant(S32, I);
1965 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1966 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1967 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1968
1969 for (unsigned L = 0; L < NumLanes; ++L) {
1970 auto S = B.buildSelect(EltTy, Cmp,
1971 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1972
1973 for (unsigned N : { 0, 2, 3 })
1974 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1975
1976 Res[L] = S->getOperand(0).getReg();
1977 }
1978 }
1979
1980 for (unsigned L = 0; L < NumLanes; ++L) {
1981 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1982 B.buildCopy(DstReg, Res[L]);
1983 MRI.setRegBank(DstReg, DstBank);
1984 }
1985
1986 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1987 MI.eraseFromParent();
1988
1989 return true;
1990}
1991
1992// Insert a cross regbank copy for a register if it already has a bank that
1993// differs from the one we want to set.
1994 static Register constrainRegToBank(MachineRegisterInfo &MRI,
1995                                    MachineIRBuilder &B, Register &Reg,
1996                                    const RegisterBank &Bank) {
1997 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1998 if (CurrBank && *CurrBank != Bank) {
1999 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2000 MRI.setRegBank(Copy, Bank);
2001 return Copy;
2002 }
2003
2004 MRI.setRegBank(Reg, Bank);
2005 return Reg;
2006}
2007
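// Counterpart of foldExtractEltToCmpSelect for insert_vector_elt: every lane
// of the result is a select between the original element and the value to
// insert, keyed on whether the dynamic index equals that lane's position.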
2008bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2009     MachineIRBuilder &B, MachineInstr &MI,
2010     const OperandsMapper &OpdMapper) const {
2011
2012 MachineRegisterInfo &MRI = *B.getMRI();
2013 Register VecReg = MI.getOperand(1).getReg();
2014 Register Idx = MI.getOperand(3).getReg();
2015
2016 const RegisterBank &IdxBank =
2017 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2018
2019 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2020
2021 LLT VecTy = MRI.getType(VecReg);
2022 unsigned EltSize = VecTy.getScalarSizeInBits();
2023 unsigned NumElem = VecTy.getNumElements();
2024
2025 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2026 IsDivergentIdx, &Subtarget))
2027 return false;
2028
2029 LLT S32 = LLT::scalar(32);
2030
2031 const RegisterBank &DstBank =
2032 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2033 const RegisterBank &SrcBank =
2034 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2035 const RegisterBank &InsBank =
2036 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2037
2038 const RegisterBank &CCBank =
2039 (DstBank == AMDGPU::SGPRRegBank &&
2040 SrcBank == AMDGPU::SGPRRegBank &&
2041 InsBank == AMDGPU::SGPRRegBank &&
2042 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2043 : AMDGPU::VCCRegBank;
2044 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2045
2046 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2047 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2048 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2049 }
2050
2051 LLT EltTy = VecTy.getScalarType();
2052 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2053 unsigned NumLanes = InsRegs.size();
2054 if (!NumLanes) {
2055 NumLanes = 1;
2056 InsRegs.push_back(MI.getOperand(2).getReg());
2057 } else {
2058 EltTy = MRI.getType(InsRegs[0]);
2059 }
2060
2061 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2062 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2063
2064 for (unsigned I = 0; I < NumElem; ++I) {
2065 auto IC = B.buildConstant(S32, I);
2066 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2067 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2068 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2069
2070 for (unsigned L = 0; L < NumLanes; ++L) {
2071 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2072 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2073 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2074
2075 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2076 MRI.setRegBank(Select, DstBank);
2077
2078 Ops[I * NumLanes + L] = Select;
2079 }
2080 }
2081
2082 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2083 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2084 B.buildBuildVector(MI.getOperand(0), Ops);
2085 } else {
2086 auto Vec = B.buildBuildVector(MergeTy, Ops);
2087 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2088 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2089 }
2090
2091 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2092 MI.eraseFromParent();
2093
2094 return true;
2095}
2096
2097 void AMDGPURegisterBankInfo::applyMappingImpl(
2098     MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2099 MachineInstr &MI = OpdMapper.getMI();
2100 B.setInstrAndDebugLoc(MI);
2101 unsigned Opc = MI.getOpcode();
2102 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2103 switch (Opc) {
2104 case AMDGPU::G_CONSTANT:
2105 case AMDGPU::G_IMPLICIT_DEF: {
2106 Register DstReg = MI.getOperand(0).getReg();
2107 LLT DstTy = MRI.getType(DstReg);
2108 if (DstTy != LLT::scalar(1))
2109 break;
2110
2111 const RegisterBank *DstBank =
2112 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2113 if (DstBank == &AMDGPU::VCCRegBank)
2114 break;
2115 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2116 if (DefRegs.empty())
2117 DefRegs.push_back(DstReg);
2118
2119 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2120
2121 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2122 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2123
2124 MI.getOperand(0).setReg(NewDstReg);
2125 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2126 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2127 MI.getOperand(1).setCImm(
2128         ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2129     }
2130
2131 MRI.setRegBank(NewDstReg, *DstBank);
2132 B.buildTrunc(DefRegs[0], NewDstReg);
2133 return;
2134 }
2135 case AMDGPU::G_PHI: {
2136 Register DstReg = MI.getOperand(0).getReg();
2137 LLT DstTy = MRI.getType(DstReg);
2138 if (DstTy != LLT::scalar(1))
2139 break;
2140
2141 const LLT S32 = LLT::scalar(32);
2142 const RegisterBank *DstBank =
2143 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2144 if (DstBank == &AMDGPU::VCCRegBank) {
2145 applyDefaultMapping(OpdMapper);
2146 // The standard handling only considers the result register bank for
2147 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2148 // produce an invalid copy. We can only copy with some kind of compare to
2149 // get a vector boolean result. Insert a register bank copy that will be
2150 // correctly lowered to a compare.
2151 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2152 Register SrcReg = MI.getOperand(I).getReg();
2153 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2154
2155 if (SrcBank != &AMDGPU::VCCRegBank) {
2156 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2157 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2158
2159 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2160 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2161 MI.getOperand(I).setReg(Copy.getReg(0));
2162 }
2163 }
2164
2165 return;
2166 }
2167
2168 // Phi handling is strange and only considers the bank of the destination.
2169 substituteSimpleCopyRegs(OpdMapper, 0);
2170
2171 // Promote SGPR/VGPR booleans to s32
2172 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2173 B.setInsertPt(B.getMBB(), MI);
2174 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2175
2176 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2177 llvm_unreachable("widen scalar should have succeeded");
2178
2179 return;
2180 }
2181 case AMDGPU::G_FCMP:
2182     if (!Subtarget.hasSALUFloatInsts())
2183       break;
2184     [[fallthrough]];
2185 case AMDGPU::G_ICMP:
2186 case AMDGPU::G_UADDO:
2187 case AMDGPU::G_USUBO:
2188 case AMDGPU::G_UADDE:
2189 case AMDGPU::G_SADDE:
2190 case AMDGPU::G_USUBE:
2191 case AMDGPU::G_SSUBE: {
2192 unsigned BoolDstOp =
2193 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2194 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2195
2196 const RegisterBank *DstBank =
2197 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2198 if (DstBank != &AMDGPU::SGPRRegBank)
2199 break;
2200
2201 const bool HasCarryIn = MI.getNumOperands() == 5;
2202
2203 // If this is a scalar compare, promote the result to s32, as the selection
2204 // will end up using a copy to a 32-bit vreg.
2205 const LLT S32 = LLT::scalar(32);
2206 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2207 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2208 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2209
2210 if (HasCarryIn) {
2211 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2212 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2213 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2214 MI.getOperand(4).setReg(NewSrcReg);
2215 }
2216
2217 MachineBasicBlock *MBB = MI.getParent();
2218 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2219
2220 // If we had a constrained VCC result register, a copy was inserted to VCC
2221 // from SGPR.
2222 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2223 if (DefRegs.empty())
2224 DefRegs.push_back(DstReg);
2225 B.buildTrunc(DefRegs[0], NewDstReg);
2226 return;
2227 }
2228 case AMDGPU::G_SELECT: {
2229 Register DstReg = MI.getOperand(0).getReg();
2230 LLT DstTy = MRI.getType(DstReg);
2231
2232 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2233 if (CondRegs.empty())
2234 CondRegs.push_back(MI.getOperand(1).getReg());
2235 else {
2236 assert(CondRegs.size() == 1);
2237 }
2238
2239 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2240 if (CondBank == &AMDGPU::SGPRRegBank) {
2241 const LLT S32 = LLT::scalar(32);
2242 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2243 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2244
2245 MI.getOperand(1).setReg(NewCondReg);
2246 B.buildZExt(NewCondReg, CondRegs[0]);
2247 }
2248
2249 if (DstTy.getSizeInBits() != 64)
2250 break;
2251
2252 LLT HalfTy = getHalfSizedType(DstTy);
2253
2254 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2255 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2256 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2257
2258 // All inputs are SGPRs, nothing special to do.
2259 if (DefRegs.empty()) {
2260 assert(Src1Regs.empty() && Src2Regs.empty());
2261 break;
2262 }
2263
2264 if (Src1Regs.empty())
2265 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2266 else {
2267 setRegsToType(MRI, Src1Regs, HalfTy);
2268 }
2269
2270 if (Src2Regs.empty())
2271 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2272 else
2273 setRegsToType(MRI, Src2Regs, HalfTy);
2274
2275 setRegsToType(MRI, DefRegs, HalfTy);
2276
2277 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2278 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2279
2280 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2281 MI.eraseFromParent();
2282 return;
2283 }
2284 case AMDGPU::G_BRCOND: {
2285 Register CondReg = MI.getOperand(0).getReg();
2286 // FIXME: Should use legalizer helper, but should change bool ext type.
2287 const RegisterBank *CondBank =
2288 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2289
2290 if (CondBank == &AMDGPU::SGPRRegBank) {
2291 const LLT S32 = LLT::scalar(32);
2292 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2293 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2294
2295 MI.getOperand(0).setReg(NewCondReg);
2296 B.buildZExt(NewCondReg, CondReg);
2297 return;
2298 }
2299
2300 break;
2301 }
2302 case AMDGPU::G_AND:
2303 case AMDGPU::G_OR:
2304 case AMDGPU::G_XOR: {
2305 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2306 // there is a VGPR input.
2307 Register DstReg = MI.getOperand(0).getReg();
2308 LLT DstTy = MRI.getType(DstReg);
2309
2310 if (DstTy.getSizeInBits() == 1) {
2311 const RegisterBank *DstBank =
2312 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2313 if (DstBank == &AMDGPU::VCCRegBank)
2314 break;
2315
2316 MachineFunction *MF = MI.getParent()->getParent();
2317 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2318 LegalizerHelper Helper(*MF, ApplyBank, B);
2319
2320 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2321           LegalizerHelper::Legalized)
2322         llvm_unreachable("widen scalar should have succeeded");
2323 return;
2324 }
2325
2326 if (DstTy.getSizeInBits() != 64)
2327 break;
2328
2329 LLT HalfTy = getHalfSizedType(DstTy);
2330 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2331 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2332 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2333
2334 // All inputs are SGPRs, nothing special to do.
2335 if (DefRegs.empty()) {
2336 assert(Src0Regs.empty() && Src1Regs.empty());
2337 break;
2338 }
2339
2340 assert(DefRegs.size() == 2);
2341 assert(Src0Regs.size() == Src1Regs.size() &&
2342 (Src0Regs.empty() || Src0Regs.size() == 2));
2343
2344 // Depending on where the source registers came from, the generic code may
2345 // have decided to split the inputs already or not. If not, we still need to
2346 // extract the values.
2347
2348 if (Src0Regs.empty())
2349 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2350 else
2351 setRegsToType(MRI, Src0Regs, HalfTy);
2352
2353 if (Src1Regs.empty())
2354 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2355 else
2356 setRegsToType(MRI, Src1Regs, HalfTy);
2357
2358 setRegsToType(MRI, DefRegs, HalfTy);
2359
2360 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2361 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2362
2363 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2364 MI.eraseFromParent();
2365 return;
2366 }
2367 case AMDGPU::G_ABS: {
2368 Register SrcReg = MI.getOperand(1).getReg();
2369 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2370
2371 // There is no VALU abs instruction so we need to replace it with a sub and
2372 // max combination.
2373 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2374 MachineFunction *MF = MI.getParent()->getParent();
2375 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2376 LegalizerHelper Helper(*MF, Apply, B);
2377
2378       if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2379         llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2380 return;
2381 }
2382 [[fallthrough]];
2383 }
2384 case AMDGPU::G_ADD:
2385 case AMDGPU::G_SUB:
2386 case AMDGPU::G_MUL:
2387 case AMDGPU::G_SHL:
2388 case AMDGPU::G_LSHR:
2389 case AMDGPU::G_ASHR:
2390 case AMDGPU::G_SMIN:
2391 case AMDGPU::G_SMAX:
2392 case AMDGPU::G_UMIN:
2393 case AMDGPU::G_UMAX: {
2394 Register DstReg = MI.getOperand(0).getReg();
2395 LLT DstTy = MRI.getType(DstReg);
2396
2397 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2398 // Packed 16-bit operations need to be scalarized and promoted.
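    // For example, an SGPR-destined <2 x s16> op is handled below by unpacking
    // each source into extended 32-bit halves, performing two s32 ops, and
    // repacking the results with G_BUILD_VECTOR_TRUNC.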
2399 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2400 break;
2401
2402 const RegisterBank *DstBank =
2403 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2404 if (DstBank == &AMDGPU::VGPRRegBank)
2405 break;
2406
2407 const LLT S32 = LLT::scalar(32);
2408 MachineBasicBlock *MBB = MI.getParent();
2409 MachineFunction *MF = MBB->getParent();
2410 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2411
2412 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2413 Register WideSrcLo, WideSrcHi;
2414
2415 std::tie(WideSrcLo, WideSrcHi) =
2416 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2417 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2418 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2419 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2420 MI.eraseFromParent();
2421 return;
2422 }
2423
2424 if (DstTy.isVector()) {
2425 Register WideSrc0Lo, WideSrc0Hi;
2426 Register WideSrc1Lo, WideSrc1Hi;
2427
2428 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2429 std::tie(WideSrc0Lo, WideSrc0Hi)
2430 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2431 std::tie(WideSrc1Lo, WideSrc1Hi)
2432 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2433 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2434 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2435 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2436 MI.eraseFromParent();
2437 } else {
2438 LegalizerHelper Helper(*MF, ApplySALU, B);
2439
2440 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2441 llvm_unreachable("widen scalar should have succeeded");
2442
2443 // FIXME: s16 shift amounts should be legal.
2444 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2445 Opc == AMDGPU::G_ASHR) {
2446 B.setInsertPt(*MBB, MI.getIterator());
2447 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2448 llvm_unreachable("widen scalar should have succeeded");
2449 }
2450 }
2451
2452 return;
2453 }
2454 case AMDGPU::G_SEXT_INREG: {
2455 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2456 if (SrcRegs.empty())
2457 break; // Nothing to repair
2458
2459 const LLT S32 = LLT::scalar(32);
2460 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2461
2462 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2463 // we would need to further expand, and doesn't let us directly set the
2464 // result registers.
2465 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2466
2467 int Amt = MI.getOperand(2).getImm();
2468 if (Amt <= 32) {
2469 // Downstream users have expectations for the high bit behavior, so freeze
2470 // incoming undefined bits.
2471 if (Amt == 32) {
2472 // The low bits are unchanged.
2473 B.buildFreeze(DstRegs[0], SrcRegs[0]);
2474 } else {
2475 auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2476 // Extend in the low bits and propagate the sign bit to the high half.
2477 B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2478 }
2479
2480 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2481 } else {
2482 // The low bits are unchanged, and extend in the high bits.
2483 // No freeze required
2484 B.buildCopy(DstRegs[0], SrcRegs[0]);
2485 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2486 }
2487
2488 Register DstReg = MI.getOperand(0).getReg();
2489 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2490 MI.eraseFromParent();
2491 return;
2492 }
2493 case AMDGPU::G_CTPOP:
2494 case AMDGPU::G_BITREVERSE: {
2495 const RegisterBank *DstBank =
2496 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2497 if (DstBank == &AMDGPU::SGPRRegBank)
2498 break;
2499
2500 Register SrcReg = MI.getOperand(1).getReg();
2501 const LLT S32 = LLT::scalar(32);
2502 LLT Ty = MRI.getType(SrcReg);
2503 if (Ty == S32)
2504 break;
2505
2506 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2507
2508 MachineFunction &MF = B.getMF();
2509 LegalizerHelper Helper(MF, ApplyVALU, B);
2510
2511 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2512 llvm_unreachable("narrowScalar should have succeeded");
2513 return;
2514 }
2515 case AMDGPU::G_AMDGPU_FFBH_U32:
2516 case AMDGPU::G_AMDGPU_FFBL_B32:
2517 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2518 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2519 const RegisterBank *DstBank =
2520 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2521 if (DstBank == &AMDGPU::SGPRRegBank)
2522 break;
2523
2524 Register SrcReg = MI.getOperand(1).getReg();
2525 const LLT S32 = LLT::scalar(32);
2526 LLT Ty = MRI.getType(SrcReg);
2527 if (Ty == S32)
2528 break;
2529
2530 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2531 // which return -1 when the input is zero:
2532 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2533 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2534 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2535     // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
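    // A plain add suffices for the _ZERO_UNDEF forms since a zero input is
    // undefined there; for ffbh/ffbl the saturating add keeps the all-ones
    // "not found" value intact, so the umin still yields -1 when both halves
    // are zero.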
2536 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2537 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2538 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2539 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2540 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2541 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2542 : Opc;
2543 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
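    // Idx picks the half that is searched first: the high half for ffbh
    // (counting from the MSB) and the low half for ffbl (counting from the
    // LSB); the other half gets the +32 adjustment below.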
2544 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2545 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2546 unsigned AddOpc =
2547 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2548 ? AMDGPU::G_ADD
2549 : AMDGPU::G_UADDSAT;
2550 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2551 Register DstReg = MI.getOperand(0).getReg();
2552 B.buildUMin(DstReg, X, Y);
2553 MI.eraseFromParent();
2554 return;
2555 }
2556 case AMDGPU::G_SEXT:
2557 case AMDGPU::G_ZEXT:
2558 case AMDGPU::G_ANYEXT: {
2559 Register SrcReg = MI.getOperand(1).getReg();
2560 LLT SrcTy = MRI.getType(SrcReg);
2561 const bool Signed = Opc == AMDGPU::G_SEXT;
2562
2563 assert(OpdMapper.getVRegs(1).empty());
2564
2565 const RegisterBank *SrcBank =
2566 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2567
2568 Register DstReg = MI.getOperand(0).getReg();
2569 LLT DstTy = MRI.getType(DstReg);
2570 if (DstTy.isScalar() &&
2571 SrcBank != &AMDGPU::SGPRRegBank &&
2572 SrcBank != &AMDGPU::VCCRegBank &&
2573 // FIXME: Should handle any type that round to s64 when irregular
2574 // breakdowns supported.
2575 DstTy.getSizeInBits() == 64 &&
2576 SrcTy.getSizeInBits() <= 32) {
2577 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2578
2579 // Extend to 32-bit, and then extend the low half.
2580 if (Signed) {
2581 // TODO: Should really be buildSExtOrCopy
2582 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2583 } else if (Opc == AMDGPU::G_ZEXT) {
2584 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2585 } else {
2586 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2587 }
2588
2589 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2590 MRI.setRegBank(DstReg, *SrcBank);
2591 MI.eraseFromParent();
2592 return;
2593 }
2594
2595 if (SrcTy != LLT::scalar(1))
2596 return;
2597
2598     // It is not legal to have a legalization artifact with a VCC source.
2599     // Rather than introducing a copy, directly emit the select that such a
2600     // copy would have been lowered to during instruction selection.
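    // Concretely, extending an s1 in the VCC bank becomes
    // select(cond, -1 or 1, 0): all-ones for sign extension, 1 otherwise,
    // with the constants placed in VGPRs.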
2601 if (SrcBank == &AMDGPU::VCCRegBank) {
2602 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2603
2604 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2605
2606 unsigned DstSize = DstTy.getSizeInBits();
2607 // 64-bit select is SGPR only
2608 const bool UseSel64 = DstSize > 32 &&
2609 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2610
2611 // TODO: Should s16 select be legal?
2612 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2613 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2614 auto False = B.buildConstant(SelType, 0);
2615
2616 MRI.setRegBank(True.getReg(0), *DstBank);
2617 MRI.setRegBank(False.getReg(0), *DstBank);
2618 MRI.setRegBank(DstReg, *DstBank);
2619
2620 if (DstSize > 32) {
2621 B.buildSelect(DefRegs[0], SrcReg, True, False);
2622 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2623 } else if (DstSize < 32) {
2624 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2625 MRI.setRegBank(Sel.getReg(0), *DstBank);
2626 B.buildTrunc(DstReg, Sel);
2627 } else {
2628 B.buildSelect(DstReg, SrcReg, True, False);
2629 }
2630
2631 MI.eraseFromParent();
2632 return;
2633 }
2634
2635 break;
2636 }
2637 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2638 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2639
2640 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2641
2642 Register DstReg = MI.getOperand(0).getReg();
2643 Register SrcReg = MI.getOperand(1).getReg();
2644
2645 const LLT S32 = LLT::scalar(32);
2646 LLT DstTy = MRI.getType(DstReg);
2647 LLT SrcTy = MRI.getType(SrcReg);
2648
2649 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2650 return;
2651
2652 const ValueMapping &DstMapping
2653 = OpdMapper.getInstrMapping().getOperandMapping(0);
2654 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2655 const RegisterBank *SrcBank =
2656 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2657 const RegisterBank *IdxBank =
2658 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2659
2660 Register BaseIdxReg;
2661 unsigned ConstOffset;
2662 std::tie(BaseIdxReg, ConstOffset) =
2663 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2664
2665 // See if the index is an add of a constant which will be foldable by moving
2666 // the base register of the index later if this is going to be executed in a
2667 // waterfall loop. This is essentially to reassociate the add of a constant
2668 // with the readfirstlane.
2669 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2670 ConstOffset > 0 &&
2671 ConstOffset < SrcTy.getNumElements();
2672
2673 // Move the base register. We'll re-insert the add later.
2674 if (ShouldMoveIndexIntoLoop)
2675 MI.getOperand(2).setReg(BaseIdxReg);
2676
2677 // If this is a VGPR result only because the index was a VGPR result, the
2678 // actual indexing will be done on the SGPR source vector, which will
2679 // produce a scalar result. We need to copy to the VGPR result inside the
2680 // waterfall loop.
2681 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2682 SrcBank == &AMDGPU::SGPRRegBank;
2683 if (DstRegs.empty()) {
2684 applyDefaultMapping(OpdMapper);
2685
2686       executeInWaterfallLoop(B, MI, {2});
2687
2688 if (NeedCopyToVGPR) {
2689 // We don't want a phi for this temporary reg.
2690 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2691 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2692 MI.getOperand(0).setReg(TmpReg);
2693 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2694
2695 // Use a v_mov_b32 here to make the exec dependency explicit.
2696 buildVCopy(B, DstReg, TmpReg);
2697 }
2698
2699 // Re-insert the constant offset add inside the waterfall loop.
2700 if (ShouldMoveIndexIntoLoop)
2701 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2702
2703 return;
2704 }
2705
2706 assert(DstTy.getSizeInBits() == 64);
2707
2708 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2709
2710 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2711 auto One = B.buildConstant(S32, 1);
2712
2713 MachineBasicBlock::iterator MII = MI.getIterator();
2714
2715 // Split the vector index into 32-bit pieces. Prepare to move all of the
2716 // new instructions into a waterfall loop if necessary.
2717 //
2718 // Don't put the bitcast or constant in the loop.
2719 MachineInstrSpan Span(MII, &B.getMBB());
2720
2721 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2722 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2723 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2724
2725 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2726 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2727
2728 MRI.setRegBank(DstReg, *DstBank);
2729 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2730 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2731 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2732 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2733
2734 SmallSet<Register, 4> OpsToWaterfall;
2735 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2736 MI.eraseFromParent();
2737 return;
2738 }
2739
2740 // Remove the original instruction to avoid potentially confusing the
2741 // waterfall loop logic.
2742 B.setInstr(*Span.begin());
2743 MI.eraseFromParent();
2744 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2745 OpsToWaterfall);
2746
2747 if (NeedCopyToVGPR) {
2748 MachineBasicBlock *LoopBB = Extract1->getParent();
2749 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2750 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2751 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2752 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2753
2754 Extract0->getOperand(0).setReg(TmpReg0);
2755 Extract1->getOperand(0).setReg(TmpReg1);
2756
2757 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2758
2759 buildVCopy(B, DstRegs[0], TmpReg0);
2760 buildVCopy(B, DstRegs[1], TmpReg1);
2761 }
2762
2763 if (ShouldMoveIndexIntoLoop)
2764 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2765
2766 return;
2767 }
2768 case AMDGPU::G_INSERT_VECTOR_ELT: {
2769 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2770
2771 Register DstReg = MI.getOperand(0).getReg();
2772 LLT VecTy = MRI.getType(DstReg);
2773
2774 assert(OpdMapper.getVRegs(0).empty());
2775 assert(OpdMapper.getVRegs(3).empty());
2776
2777 if (substituteSimpleCopyRegs(OpdMapper, 1))
2778 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2779
2780 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2781 return;
2782
2783 const RegisterBank *IdxBank =
2784 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2785
2786 Register SrcReg = MI.getOperand(1).getReg();
2787 Register InsReg = MI.getOperand(2).getReg();
2788 LLT InsTy = MRI.getType(InsReg);
2789 (void)InsTy;
2790
2791 Register BaseIdxReg;
2792 unsigned ConstOffset;
2793 std::tie(BaseIdxReg, ConstOffset) =
2794 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2795
2796 // See if the index is an add of a constant which will be foldable by moving
2797 // the base register of the index later if this is going to be executed in a
2798 // waterfall loop. This is essentially to reassociate the add of a constant
2799 // with the readfirstlane.
2800 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2801 ConstOffset > 0 &&
2802 ConstOffset < VecTy.getNumElements();
2803
2804 // Move the base register. We'll re-insert the add later.
2805 if (ShouldMoveIndexIntoLoop)
2806 MI.getOperand(3).setReg(BaseIdxReg);
2807
2808
2809 if (InsRegs.empty()) {
2810       executeInWaterfallLoop(B, MI, {3});
2811
2812 // Re-insert the constant offset add inside the waterfall loop.
2813 if (ShouldMoveIndexIntoLoop) {
2814 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2815 }
2816
2817 return;
2818 }
2819
2820 assert(InsTy.getSizeInBits() == 64);
2821
2822 const LLT S32 = LLT::scalar(32);
2823 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2824
2825 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2826 auto One = B.buildConstant(S32, 1);
2827
2828 // Split the vector index into 32-bit pieces. Prepare to move all of the
2829 // new instructions into a waterfall loop if necessary.
2830 //
2831 // Don't put the bitcast or constant in the loop.
2832     MachineInstrSpan Span(MI.getIterator(), &B.getMBB());
2833
2834 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2835 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2836 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2837
2838 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2839 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2840
2841 const RegisterBank *DstBank =
2842 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2843 const RegisterBank *SrcBank =
2844 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2845 const RegisterBank *InsSrcBank =
2846 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2847
2848 MRI.setRegBank(InsReg, *InsSrcBank);
2849 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2850 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2851 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2852 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2853 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2854 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2855
2856
2857 SmallSet<Register, 4> OpsToWaterfall;
2858 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2859 B.setInsertPt(B.getMBB(), MI);
2860 B.buildBitcast(DstReg, InsHi);
2861 MI.eraseFromParent();
2862 return;
2863 }
2864
2865 B.setInstr(*Span.begin());
2866 MI.eraseFromParent();
2867
2868 // Figure out the point after the waterfall loop before mangling the control
2869 // flow.
2870 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2871 OpsToWaterfall);
2872
2873 // The insertion point is now right after the original instruction.
2874 //
2875 // Keep the bitcast to the original vector type out of the loop. Doing this
2876     // saves an extra phi we don't need inside the loop.
2877 B.buildBitcast(DstReg, InsHi);
2878
2879 // Re-insert the constant offset add inside the waterfall loop.
2880 if (ShouldMoveIndexIntoLoop)
2881 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2882
2883 return;
2884 }
2885 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2886 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2887 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2888 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2889 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2890 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2891 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2892 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2893 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2894 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2895 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2896 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2897 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2898 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2899 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2900 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2901 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2902 applyDefaultMapping(OpdMapper);
2903 executeInWaterfallLoop(B, MI, {1, 4});
2904 return;
2905 }
2906 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2907 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2908 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2909 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2910 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2911 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2912 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2913 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2914 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2915 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2916 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2917 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2918 applyDefaultMapping(OpdMapper);
2919 executeInWaterfallLoop(B, MI, {2, 5});
2920 return;
2921 }
2922 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2923 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2924 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2925 applyDefaultMapping(OpdMapper);
2926 executeInWaterfallLoop(B, MI, {2, 5});
2927 return;
2928 }
2929 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2930 applyDefaultMapping(OpdMapper);
2931 executeInWaterfallLoop(B, MI, {3, 6});
2932 return;
2933 }
2934 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2935 applyMappingSBufferLoad(B, OpdMapper);
2936 return;
2937 }
2938 case AMDGPU::G_INTRINSIC:
2939 case AMDGPU::G_INTRINSIC_CONVERGENT: {
2940 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
2941 case Intrinsic::amdgcn_readlane: {
2942 substituteSimpleCopyRegs(OpdMapper, 2);
2943
2944 assert(OpdMapper.getVRegs(0).empty());
2945 assert(OpdMapper.getVRegs(3).empty());
2946
2947 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2948 // waterfall loop, so assume it's a uniform value.
2949 constrainOpWithReadfirstlane(B, MI, 3); // Index
2950 return;
2951 }
2952 case Intrinsic::amdgcn_writelane: {
2953 assert(OpdMapper.getVRegs(0).empty());
2954 assert(OpdMapper.getVRegs(2).empty());
2955 assert(OpdMapper.getVRegs(3).empty());
2956
2957 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2958 constrainOpWithReadfirstlane(B, MI, 2); // Source value
2959 constrainOpWithReadfirstlane(B, MI, 3); // Index
2960 return;
2961 }
2962 case Intrinsic::amdgcn_interp_p1:
2963 case Intrinsic::amdgcn_interp_p2:
2964 case Intrinsic::amdgcn_interp_mov:
2965 case Intrinsic::amdgcn_interp_p1_f16:
2966 case Intrinsic::amdgcn_interp_p2_f16:
2967 case Intrinsic::amdgcn_lds_param_load: {
2968 applyDefaultMapping(OpdMapper);
2969
2970 // Readlane for m0 value, which is always the last operand.
2971 // FIXME: Should this be a waterfall loop instead?
2972 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
2973 return;
2974 }
2975 case Intrinsic::amdgcn_interp_inreg_p10:
2976 case Intrinsic::amdgcn_interp_inreg_p2:
2977 case Intrinsic::amdgcn_interp_inreg_p10_f16:
2978 case Intrinsic::amdgcn_interp_inreg_p2_f16:
2979 applyDefaultMapping(OpdMapper);
2980 return;
2981 case Intrinsic::amdgcn_permlane16:
2982 case Intrinsic::amdgcn_permlanex16: {
2983 // Doing a waterfall loop over these wouldn't make any sense.
2984 substituteSimpleCopyRegs(OpdMapper, 2);
2985 substituteSimpleCopyRegs(OpdMapper, 3);
2986       constrainOpWithReadfirstlane(B, MI, 4);
2987       constrainOpWithReadfirstlane(B, MI, 5);
2988       return;
2989 }
2990 case Intrinsic::amdgcn_sbfe:
2991 applyMappingBFE(B, OpdMapper, true);
2992 return;
2993 case Intrinsic::amdgcn_ubfe:
2994 applyMappingBFE(B, OpdMapper, false);
2995 return;
2996 case Intrinsic::amdgcn_inverse_ballot:
2997 applyDefaultMapping(OpdMapper);
2998 constrainOpWithReadfirstlane(B, MI, 2); // Mask
2999 return;
3000 case Intrinsic::amdgcn_ballot:
3001 // Use default handling and insert copy to vcc source.
3002 break;
3003 }
3004 break;
3005 }
3006 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3007 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3008 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3009 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3010 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3011         AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
3012     assert(RSrcIntrin && RSrcIntrin->IsImage);
3013 // Non-images can have complications from operands that allow both SGPR
3014 // and VGPR. For now it's too complicated to figure out the final opcode
3015 // to derive the register bank from the MCInstrDesc.
3016 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3017 return;
3018 }
3019 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3020 unsigned N = MI.getNumExplicitOperands() - 2;
3021 applyDefaultMapping(OpdMapper);
3022     executeInWaterfallLoop(B, MI, {N});
3023     return;
3024 }
3025 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3026 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3027 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3028 switch (IntrID) {
3029 case Intrinsic::amdgcn_ds_ordered_add:
3030 case Intrinsic::amdgcn_ds_ordered_swap: {
3031 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3032 assert(OpdMapper.getVRegs(0).empty());
3033 substituteSimpleCopyRegs(OpdMapper, 3);
3034       constrainOpWithReadfirstlane(B, MI, 2); // M0
3035       return;
3036 }
3037 case Intrinsic::amdgcn_ds_gws_init:
3038 case Intrinsic::amdgcn_ds_gws_barrier:
3039 case Intrinsic::amdgcn_ds_gws_sema_br: {
3040       // Only the first lane executes, so readfirstlane is safe.
3041       substituteSimpleCopyRegs(OpdMapper, 1);
3042       constrainOpWithReadfirstlane(B, MI, 2); // M0
3043       return;
3044 }
3045 case Intrinsic::amdgcn_ds_gws_sema_v:
3046 case Intrinsic::amdgcn_ds_gws_sema_p:
3047 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3048       // Only the first lane executes, so readfirstlane is safe.
3049       constrainOpWithReadfirstlane(B, MI, 1); // M0
3050       return;
3051 }
3052 case Intrinsic::amdgcn_ds_append:
3053 case Intrinsic::amdgcn_ds_consume: {
3054       constrainOpWithReadfirstlane(B, MI, 2); // M0
3055       return;
3056 }
3057 case Intrinsic::amdgcn_s_sendmsg:
3058 case Intrinsic::amdgcn_s_sendmsghalt: {
3059 // FIXME: Should this use a waterfall loop?
3060       constrainOpWithReadfirstlane(B, MI, 2); // M0
3061       return;
3062 }
3063 case Intrinsic::amdgcn_s_setreg: {
3064       constrainOpWithReadfirstlane(B, MI, 2);
3065       return;
3066 }
3067 case Intrinsic::amdgcn_raw_buffer_load_lds:
3068 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3069 applyDefaultMapping(OpdMapper);
3070 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3071       constrainOpWithReadfirstlane(B, MI, 2); // M0
3072       constrainOpWithReadfirstlane(B, MI, 5); // soffset
3073 return;
3074 }
3075 case Intrinsic::amdgcn_struct_buffer_load_lds:
3076 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3077 applyDefaultMapping(OpdMapper);
3078 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3079       constrainOpWithReadfirstlane(B, MI, 2); // M0
3080       constrainOpWithReadfirstlane(B, MI, 6); // soffset
3081 return;
3082 }
3083 case Intrinsic::amdgcn_global_load_lds: {
3084 applyDefaultMapping(OpdMapper);
3085       constrainOpWithReadfirstlane(B, MI, 2); // M0
3086       return;
3087 }
3088 case Intrinsic::amdgcn_lds_direct_load: {
3089 applyDefaultMapping(OpdMapper);
3090 // Readlane for m0 value, which is always the last operand.
3091 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3092 return;
3093 }
3094 case Intrinsic::amdgcn_exp_row:
3095 applyDefaultMapping(OpdMapper);
3096       constrainOpWithReadfirstlane(B, MI, 8); // M0
3097       return;
3098 default: {
3099 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3100               AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3101         // Non-images can have complications from operands that allow both SGPR
3102 // and VGPR. For now it's too complicated to figure out the final opcode
3103 // to derive the register bank from the MCInstrDesc.
3104 if (RSrcIntrin->IsImage) {
3105 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3106 return;
3107 }
3108 }
3109
3110 break;
3111 }
3112 }
3113 break;
3114 }
3115 case AMDGPU::G_SI_CALL: {
3116 // Use a set to avoid extra readfirstlanes in the case where multiple
3117 // operands are the same register.
3118 SmallSet<Register, 4> SGPROperandRegs;
3119
3120 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3121 break;
3122
3123     // Move all copies to physical SGPRs that are used by the call instruction
3124     // into the loop block. Search backwards from the call for these copies
3125     // until the ADJCALLSTACKUP is reached.
3126 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3127 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3128
3129 // Move all non-copies before the copies, so that a complete range can be
3130 // moved into the waterfall loop.
3131 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3132 // Count of NonCopyInstrs found until the current LastCopy.
3133 unsigned NonCopyInstrsLen = 0;
3134     MachineBasicBlock::iterator Start(&MI);
3135     MachineBasicBlock::iterator LastCopy = Start;
3136 MachineBasicBlock *MBB = MI.getParent();
3137     const SIMachineFunctionInfo *Info =
3138         MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3139     while (Start->getOpcode() != FrameSetupOpcode) {
3140 --Start;
3141 bool IsCopy = false;
3142 if (Start->getOpcode() == AMDGPU::COPY) {
3143 auto &Dst = Start->getOperand(0);
3144 if (Dst.isReg()) {
3145 Register Reg = Dst.getReg();
3146 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3147 IsCopy = true;
3148 } else {
3149 // Also move the copy from the scratch rsrc descriptor into the loop
3150 // to allow it to be optimized away.
3151 auto &Src = Start->getOperand(1);
3152 if (Src.isReg()) {
3153 Reg = Src.getReg();
3154 IsCopy = Info->getScratchRSrcReg() == Reg;
3155 }
3156 }
3157 }
3158 }
3159
3160 if (IsCopy) {
3161 LastCopy = Start;
3162 NonCopyInstrsLen = NonCopyInstrs.size();
3163 } else {
3164 NonCopyInstrs.push_back(&*Start);
3165 }
3166 }
3167 NonCopyInstrs.resize(NonCopyInstrsLen);
3168
3169 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3170 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3171 }
3172 Start = LastCopy;
3173
3174 // Do the same for copies after the loop
3175 NonCopyInstrs.clear();
3176 NonCopyInstrsLen = 0;
3177     MachineBasicBlock::iterator End(&MI);
3178     LastCopy = End;
3179 while (End->getOpcode() != FrameDestroyOpcode) {
3180 ++End;
3181 bool IsCopy = false;
3182 if (End->getOpcode() == AMDGPU::COPY) {
3183 auto &Src = End->getOperand(1);
3184 if (Src.isReg()) {
3185 Register Reg = Src.getReg();
3186 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3187 }
3188 }
3189
3190 if (IsCopy) {
3191 LastCopy = End;
3192 NonCopyInstrsLen = NonCopyInstrs.size();
3193 } else {
3194 NonCopyInstrs.push_back(&*End);
3195 }
3196 }
3197 NonCopyInstrs.resize(NonCopyInstrsLen);
3198
3199 End = LastCopy;
3200 ++LastCopy;
3201 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3202 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3203 }
3204
3205 ++End;
3206 B.setInsertPt(B.getMBB(), Start);
3207 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3208 break;
3209 }
3210 case AMDGPU::G_LOAD:
3211 case AMDGPU::G_ZEXTLOAD:
3212 case AMDGPU::G_SEXTLOAD: {
3213 if (applyMappingLoad(B, OpdMapper, MI))
3214 return;
3215 break;
3216 }
3217 case AMDGPU::G_DYN_STACKALLOC:
3218 applyMappingDynStackAlloc(B, OpdMapper, MI);
3219 return;
3220 case AMDGPU::G_STACKRESTORE: {
3221 applyDefaultMapping(OpdMapper);
3222     constrainOpWithReadfirstlane(B, MI, 0);
3223     return;
3224 }
3225 case AMDGPU::G_SBFX:
3226 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3227 return;
3228 case AMDGPU::G_UBFX:
3229 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3230 return;
3231 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3232 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3233 applyMappingMAD_64_32(B, OpdMapper);
3234 return;
3235 default:
3236 break;
3237 }
3238
3239 return applyDefaultMapping(OpdMapper);
3240}
3241
3242// vgpr, sgpr -> vgpr
3243// vgpr, agpr -> vgpr
3244// agpr, agpr -> agpr
3245// agpr, sgpr -> vgpr
3246static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3247 if (RB0 == AMDGPU::InvalidRegBankID)
3248 return RB1;
3249 if (RB1 == AMDGPU::InvalidRegBankID)
3250 return RB0;
3251
3252 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3253 return AMDGPU::SGPRRegBankID;
3254
3255 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3256 return AMDGPU::AGPRRegBankID;
3257
3258 return AMDGPU::VGPRRegBankID;
3259}
3260
3261static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3262 if (RB0 == AMDGPU::InvalidRegBankID)
3263 return RB1;
3264 if (RB1 == AMDGPU::InvalidRegBankID)
3265 return RB0;
3266
3267 // vcc, vcc -> vcc
3268 // vcc, sgpr -> vcc
3269 // vcc, vgpr -> vcc
3270 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3271 return AMDGPU::VCCRegBankID;
3272
3273   // Otherwise fall back to the ordinary bank union (e.g. sgpr, vgpr -> vgpr).
3274 return regBankUnion(RB0, RB1);
3275}
3276
3277 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3278                                                 const MachineInstr &MI) const {
3279 unsigned RegBank = AMDGPU::InvalidRegBankID;
3280
3281 for (const MachineOperand &MO : MI.operands()) {
3282 if (!MO.isReg())
3283 continue;
3284 Register Reg = MO.getReg();
3285 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3286 RegBank = regBankUnion(RegBank, Bank->getID());
3287 if (RegBank == AMDGPU::VGPRRegBankID)
3288 break;
3289 }
3290 }
3291
3292 return RegBank;
3293}
3294
3295bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
 3296 const MachineFunction &MF = *MI.getParent()->getParent();
3297 const MachineRegisterInfo &MRI = MF.getRegInfo();
3298 for (const MachineOperand &MO : MI.operands()) {
3299 if (!MO.isReg())
3300 continue;
3301 Register Reg = MO.getReg();
3302 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3303 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3304 return false;
3305 }
3306 }
3307 return true;
3308}
3309
3310const RegisterBankInfo::InstructionMapping &
3311AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
 3312 const MachineFunction &MF = *MI.getParent()->getParent();
3313 const MachineRegisterInfo &MRI = MF.getRegInfo();
3314 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3315
3316 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3317 const MachineOperand &SrcOp = MI.getOperand(i);
3318 if (!SrcOp.isReg())
3319 continue;
3320
3321 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3322 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3323 }
3324 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3325 MI.getNumOperands());
3326}
3327
3328const RegisterBankInfo::InstructionMapping &
3329AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
 3330 const MachineFunction &MF = *MI.getParent()->getParent();
3331 const MachineRegisterInfo &MRI = MF.getRegInfo();
3332 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3333
3334 // Even though we technically could use SGPRs, this would require knowledge of
3335 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3336 //
3337 // TODO: Unary ops are trivially OK, so accept SGPRs?
3338 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3339 const MachineOperand &Src = MI.getOperand(i);
3340 if (!Src.isReg())
3341 continue;
3342
3343 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3344 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3345 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3346 }
3347
3348 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3349 MI.getNumOperands());
3350}
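// Minimal sketch of the mapping produced above, assuming a 32-bit G_FADD whose
// sources currently live in SGPRs: every register operand is reported as
// VGPRRegBank with its own size (dst, lhs, rhs all VGPR(32)), while an s1
// operand would be reported as VCCRegBank(1). The SGPR->VGPR copies that
// RegBankSelect inserts to satisfy this mapping are always legal.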
3351
3352const RegisterBankInfo::InstructionMapping &
3353AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
 3354 const MachineFunction &MF = *MI.getParent()->getParent();
3355 const MachineRegisterInfo &MRI = MF.getRegInfo();
3356 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3357
3358 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3359 const MachineOperand &Op = MI.getOperand(I);
3360 if (!Op.isReg())
3361 continue;
3362
3363 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3364 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3365 }
3366
3367 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3368 MI.getNumOperands());
3369}
3370
3371const RegisterBankInfo::InstructionMapping &
3372AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
 3373 const MachineInstr &MI,
3374 int RsrcIdx) const {
3375 // The reported argument index is relative to the IR intrinsic call arguments,
3376 // so we need to shift by the number of defs and the intrinsic ID.
3377 RsrcIdx += MI.getNumExplicitDefs() + 1;
3378
3379 const int NumOps = MI.getNumOperands();
3380 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3381
3382 // TODO: Should packed/unpacked D16 difference be reported here as part of
3383 // the value mapping?
3384 for (int I = 0; I != NumOps; ++I) {
3385 if (!MI.getOperand(I).isReg())
3386 continue;
3387
3388 Register OpReg = MI.getOperand(I).getReg();
3389 // We replace some dead address operands with $noreg
3390 if (!OpReg)
3391 continue;
3392
3393 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3394
3395 // FIXME: Probably need a new intrinsic register bank searchable table to
3396 // handle arbitrary intrinsics easily.
3397 //
3398 // If this has a sampler, it immediately follows rsrc.
3399 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3400
3401 if (MustBeSGPR) {
 3402 // If this must be an SGPR, report whatever bank it currently has as legal.
3403 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3404 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3405 } else {
3406 // Some operands must be VGPR, and these are easy to copy to.
3407 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3408 }
3409 }
3410
3411 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3412}
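// Worked example of the index shift above, for a hypothetical image intrinsic
// with one def whose rsrc descriptor is IR argument 2: RsrcIdx becomes
// 2 + 1 (def) + 1 (intrinsic ID) = 4, so MI operands 4 (rsrc) and 5 (sampler,
// if present) keep whatever bank they have (ideally SGPR) and every other
// register operand is forced to the VGPR bank.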
3413
3414/// Return the mapping for a pointer argument.
3415const RegisterBankInfo::ValueMapping *
3416AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
 3417 Register PtrReg) const {
3418 LLT PtrTy = MRI.getType(PtrReg);
3419 unsigned Size = PtrTy.getSizeInBits();
 3420 if (Subtarget.useFlatForGlobal() ||
 3421 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
 3422 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3423
3424 // If we're using MUBUF instructions for global memory, an SGPR base register
3425 // is possible. Otherwise this needs to be a VGPR.
3426 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3427 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3428}
3429
3430const RegisterBankInfo::InstructionMapping &
3431AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
 3432
 3433 const MachineFunction &MF = *MI.getParent()->getParent();
 3434 const MachineRegisterInfo &MRI = MF.getRegInfo();
 3435 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
 3436 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3437 Register PtrReg = MI.getOperand(1).getReg();
3438 LLT PtrTy = MRI.getType(PtrReg);
3439 unsigned AS = PtrTy.getAddressSpace();
3440 unsigned PtrSize = PtrTy.getSizeInBits();
3441
3442 const ValueMapping *ValMapping;
3443 const ValueMapping *PtrMapping;
3444
3445 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3446
3447 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3448 if (isScalarLoadLegal(MI)) {
3449 // We have a uniform instruction so we want to use an SMRD load
3450 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3451 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3452 } else {
3453 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3454
3455 // If we're using MUBUF instructions for global memory, an SGPR base
3456 // register is possible. Otherwise this needs to be a VGPR.
3457 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3458 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3459
3460 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3461 }
3462 } else {
3463 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3464 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3465 }
3466
3467 OpdsMapping[0] = ValMapping;
3468 OpdsMapping[1] = PtrMapping;
 3469 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
 3470 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3471 return Mapping;
3472
3473 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3474 // handle that during instruction selection?
3475}
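// A short example of the decision above, assuming a global load through a
// uniform 64-bit pointer held in SGPRs: if the load satisfies
// isScalarLoadLegal, both the value and the pointer map to SGPR and the load
// can select to a scalar (SMEM) instruction. Otherwise the value maps to
// VGPR, and the pointer may stay SGPR only when MUBUF addressing is used for
// global memory (i.e. not useFlatForGlobal); with flat instructions the
// pointer must be VGPR as well.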
3476
3477unsigned
3478AMDGPURegisterBankInfo::getRegBankID(Register Reg,
 3479 const MachineRegisterInfo &MRI,
3480 unsigned Default) const {
3481 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3482 return Bank ? Bank->getID() : Default;
3483}
3484
3485const RegisterBankInfo::ValueMapping *
3486AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
 3487 const MachineRegisterInfo &MRI,
3488 const TargetRegisterInfo &TRI) const {
 3489 // Lie and claim anything is legal, even though this needs to be an SGPR;
 3490 // applyMapping will have to deal with it as a waterfall loop.
3491 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3492 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3493 return AMDGPU::getValueMapping(Bank, Size);
3494}
3495
3496const RegisterBankInfo::ValueMapping *
3497AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
 3498 const MachineRegisterInfo &MRI,
3499 const TargetRegisterInfo &TRI) const {
3500 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3501 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3502}
3503
3504const RegisterBankInfo::ValueMapping *
3505AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
 3506 const MachineRegisterInfo &MRI,
3507 const TargetRegisterInfo &TRI) const {
3508 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3509 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3510}
3511
3512///
3513/// This function must return a legal mapping, because
3514/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3515/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3516/// VGPR-to-SGPR copy to be generated is illegal.
3517///
3518// Operands that must be SGPRs must accept potentially divergent VGPRs as
3519// legal. These will be dealt with in applyMappingImpl.
3520//
3521const RegisterBankInfo::InstructionMapping &
3522AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 3523 const MachineFunction &MF = *MI.getParent()->getParent();
3524 const MachineRegisterInfo &MRI = MF.getRegInfo();
3525
3526 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3527 // The default logic bothers to analyze impossible alternative mappings. We
3528 // want the most straightforward mapping, so just directly handle this.
3529 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3530 *TRI);
3531 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3532 *TRI);
3533 assert(SrcBank && "src bank should have been assigned already");
3534 if (!DstBank)
3535 DstBank = SrcBank;
3536
3537 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3538 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3539 cannotCopy(*DstBank, *SrcBank, Size))
 3540 return getInvalidInstructionMapping();
 3541
3542 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3543 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3544 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3545 OpdsMapping[0] = &ValMap;
3546 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3547 OpdsMapping[1] = &ValMap;
3548
3549 return getInstructionMapping(
3550 1, /*Cost*/ 1,
3551 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3552 }
3553
3554 if (MI.isRegSequence()) {
3555 // If any input is a VGPR, the result must be a VGPR. The default handling
3556 // assumes any copy between banks is legal.
3557 unsigned BankID = AMDGPU::SGPRRegBankID;
3558
3559 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3560 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3561 // It doesn't make sense to use vcc or scc banks here, so just ignore
3562 // them.
3563 if (OpBank != AMDGPU::SGPRRegBankID) {
3564 BankID = AMDGPU::VGPRRegBankID;
3565 break;
3566 }
3567 }
3568 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3569
3570 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3571 return getInstructionMapping(
3572 1, /*Cost*/ 1,
3573 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3574 }
3575
3576 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3577 // properly.
3578 //
3579 // TODO: There are additional exec masking dependencies to analyze.
3580 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3581 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3582 Register DstReg = MI.getOperand(0).getReg();
3583
3584 // Sometimes the result may have already been assigned a bank.
3585 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3586 ResultBank = DstBank->getID();
3587
3588 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3589 Register Reg = MI.getOperand(I).getReg();
3590 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3591
3592 // FIXME: Assuming VGPR for any undetermined inputs.
3593 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3594 ResultBank = AMDGPU::VGPRRegBankID;
3595 break;
3596 }
3597
3598 // FIXME: Need to promote SGPR case to s32
3599 unsigned OpBank = Bank->getID();
3600 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3601 }
3602
3603 assert(ResultBank != AMDGPU::InvalidRegBankID);
3604
3605 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3606
3607 const ValueMapping &ValMap =
3608 getValueMapping(0, Size, getRegBank(ResultBank));
3609 return getInstructionMapping(
3610 1, /*Cost*/ 1,
3611 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3612 }
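// Example of the phi bank resolution above: incoming banks {SGPR, SGPR} keep
// an SGPR result, {SGPR, VGPR} forces VGPR, and an incoming s1 value in the
// VCC bank makes the result VCC via regBankBoolUnion, since a divergent
// boolean has to stay in the lane-mask representation.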
3613
 3614 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
 3615 if (Mapping.isValid())
3616 return Mapping;
3617
3618 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3619
3620 switch (MI.getOpcode()) {
3621 default:
 3622 return getInvalidInstructionMapping();
 3623
3624 case AMDGPU::G_AND:
3625 case AMDGPU::G_OR:
3626 case AMDGPU::G_XOR: {
3627 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3628 if (Size == 1) {
3629 const RegisterBank *DstBank
3630 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3631
3632 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3633 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3634 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3635 if (DstBank) {
3636 TargetBankID = DstBank->getID();
3637 if (DstBank == &AMDGPU::VCCRegBank) {
3638 TargetBankID = AMDGPU::VCCRegBankID;
3639 BankLHS = AMDGPU::VCCRegBankID;
3640 BankRHS = AMDGPU::VCCRegBankID;
3641 } else {
3642 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3643 AMDGPU::SGPRRegBankID);
3644 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3645 AMDGPU::SGPRRegBankID);
3646 }
3647 } else {
3648 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3649 AMDGPU::VCCRegBankID);
3650 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3651 AMDGPU::VCCRegBankID);
3652
3653 // Both inputs should be true booleans to produce a boolean result.
3654 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3655 TargetBankID = AMDGPU::VGPRRegBankID;
3656 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3657 TargetBankID = AMDGPU::VCCRegBankID;
3658 BankLHS = AMDGPU::VCCRegBankID;
3659 BankRHS = AMDGPU::VCCRegBankID;
3660 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3661 TargetBankID = AMDGPU::SGPRRegBankID;
3662 }
3663 }
3664
3665 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3666 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3667 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3668 break;
3669 }
3670
3671 if (Size == 64) {
3672
3673 if (isSALUMapping(MI)) {
3674 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3675 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3676 } else {
3677 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3678 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3679 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3680
3681 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3682 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3683 }
3684
3685 break;
3686 }
3687
3688 [[fallthrough]];
3689 }
3690 case AMDGPU::G_PTR_ADD:
3691 case AMDGPU::G_PTRMASK:
3692 case AMDGPU::G_ADD:
3693 case AMDGPU::G_SUB:
3694 case AMDGPU::G_MUL:
3695 case AMDGPU::G_SHL:
3696 case AMDGPU::G_LSHR:
3697 case AMDGPU::G_ASHR:
3698 case AMDGPU::G_UADDO:
3699 case AMDGPU::G_USUBO:
3700 case AMDGPU::G_UADDE:
3701 case AMDGPU::G_SADDE:
3702 case AMDGPU::G_USUBE:
3703 case AMDGPU::G_SSUBE:
3704 case AMDGPU::G_SMIN:
3705 case AMDGPU::G_SMAX:
3706 case AMDGPU::G_UMIN:
3707 case AMDGPU::G_UMAX:
3708 case AMDGPU::G_ABS:
3709 case AMDGPU::G_SHUFFLE_VECTOR:
3710 case AMDGPU::G_SBFX:
3711 case AMDGPU::G_UBFX:
3712 if (isSALUMapping(MI))
3713 return getDefaultMappingSOP(MI);
3714 return getDefaultMappingVOP(MI);
3715 case AMDGPU::G_FADD:
3716 case AMDGPU::G_FSUB:
3717 case AMDGPU::G_FMUL:
3718 case AMDGPU::G_FMA:
3719 case AMDGPU::G_FFLOOR:
3720 case AMDGPU::G_FCEIL:
3721 case AMDGPU::G_FRINT:
3722 case AMDGPU::G_FMINNUM:
3723 case AMDGPU::G_FMAXNUM:
3724 case AMDGPU::G_INTRINSIC_TRUNC:
3725 case AMDGPU::G_STRICT_FADD:
3726 case AMDGPU::G_STRICT_FSUB:
3727 case AMDGPU::G_STRICT_FMUL:
3728 case AMDGPU::G_STRICT_FMA: {
3729 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3730 if (Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16) &&
 3731 isSALUMapping(MI))
 3732 return getDefaultMappingSOP(MI);
3733 return getDefaultMappingVOP(MI);
3734 }
3735 case AMDGPU::G_FPTOSI:
3736 case AMDGPU::G_FPTOUI:
3737 case AMDGPU::G_SITOFP:
3738 case AMDGPU::G_UITOFP: {
3739 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3740 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3741 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
 3742 isSALUMapping(MI))
 3743 return getDefaultMappingSOP(MI);
3744 return getDefaultMappingVOP(MI);
3745 }
3746 case AMDGPU::G_FPTRUNC:
3747 case AMDGPU::G_FPEXT: {
3748 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3749 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3750 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
 3751 isSALUMapping(MI))
 3752 return getDefaultMappingSOP(MI);
3753 return getDefaultMappingVOP(MI);
3754 }
3755 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3756 case AMDGPU::G_SSUBSAT:
3757 case AMDGPU::G_UADDSAT:
3758 case AMDGPU::G_USUBSAT:
3759 case AMDGPU::G_FMAD:
3760 case AMDGPU::G_FSQRT:
3761 case AMDGPU::G_FEXP2:
3762 case AMDGPU::G_FLOG2:
3763 case AMDGPU::G_FLDEXP:
3764 case AMDGPU::G_FMINNUM_IEEE:
3765 case AMDGPU::G_FMAXNUM_IEEE:
3766 case AMDGPU::G_FCANONICALIZE:
3767 case AMDGPU::G_STRICT_FLDEXP:
3768 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3769 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3770 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3771 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3772 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3773 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3774 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3775 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3776 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3777 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3778 case AMDGPU::G_AMDGPU_SMED3:
3779 case AMDGPU::G_AMDGPU_FMED3:
3780 return getDefaultMappingVOP(MI);
3781 case AMDGPU::G_UMULH:
3782 case AMDGPU::G_SMULH: {
 3783 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
 3784 return getDefaultMappingSOP(MI);
3785 return getDefaultMappingVOP(MI);
3786 }
3787 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3788 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3789 // Three possible mappings:
3790 //
3791 // - Default SOP
3792 // - Default VOP
3793 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3794 //
3795 // This allows instruction selection to keep the multiplication part of the
3796 // instruction on the SALU.
3797 bool AllSalu = true;
3798 bool MulSalu = true;
3799 for (unsigned i = 0; i < 5; ++i) {
3800 Register Reg = MI.getOperand(i).getReg();
3801 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3802 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3803 AllSalu = false;
3804 if (i == 2 || i == 3) {
3805 MulSalu = false;
3806 break;
3807 }
3808 }
3809 }
3810 }
3811
3812 if (AllSalu)
3813 return getDefaultMappingSOP(MI);
3814
3815 // If the multiply-add is full-rate in VALU, use that even if the
3816 // multiplication part is scalar. Accumulating separately on the VALU would
3817 // take two instructions.
3818 if (!MulSalu || Subtarget.hasFullRate64Ops())
3819 return getDefaultMappingVOP(MI);
3820
3821 // Keep the multiplication on the SALU, then accumulate on the VALU.
3822 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3823 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3824 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3825 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3826 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3827 break;
3828 }
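// Worked example of the split mapping above, assuming a subtarget without
// full-rate 64-bit ops: for
//   %dst:_(s64), %carry:_(s1) = G_AMDGPU_MAD_U64_U32 %a:_(s32), %b:_(s32), %c:_(s64)
// with %a and %b uniform but %c in VGPRs, operands 2 and 3 stay SGPR so the
// 32x32 multiply can remain on the SALU, while the 64-bit accumulate and the
// carry-out map to VGPR/VCC for the VALU add.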
3829 case AMDGPU::G_IMPLICIT_DEF: {
3830 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3831 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3832 break;
3833 }
3834 case AMDGPU::G_FCONSTANT:
3835 case AMDGPU::G_CONSTANT:
3836 case AMDGPU::G_GLOBAL_VALUE:
3837 case AMDGPU::G_BLOCK_ADDR:
3838 case AMDGPU::G_READCYCLECOUNTER: {
3839 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3840 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3841 break;
3842 }
3843 case AMDGPU::G_FRAME_INDEX: {
3844 // TODO: This should be the same as other constants, but eliminateFrameIndex
3845 // currently assumes VALU uses.
3846 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3847 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3848 break;
3849 }
3850 case AMDGPU::G_DYN_STACKALLOC: {
3851 // Result is always uniform, and a wave reduction is needed for the source.
3852 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3853 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3854 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3855 break;
3856 }
3857 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3858 // This case is weird because we expect a physical register in the source,
3859 // but need to set a bank anyway.
3860 //
3861 // TODO: We could select the result to SGPR or VGPR
3862 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3863 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3864 break;
3865 }
3866 case AMDGPU::G_INSERT: {
3867 unsigned BankID = getMappingType(MRI, MI);
3868 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3869 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3870 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3871 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3872 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3873 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3874 OpdsMapping[3] = nullptr;
3875 break;
3876 }
3877 case AMDGPU::G_EXTRACT: {
3878 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3879 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3880 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3881 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3882 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3883 OpdsMapping[2] = nullptr;
3884 break;
3885 }
3886 case AMDGPU::G_BUILD_VECTOR:
3887 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3888 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3889 if (DstTy == LLT::fixed_vector(2, 16)) {
3890 unsigned DstSize = DstTy.getSizeInBits();
3891 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3892 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3893 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3894 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3895
3896 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3897 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3898 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3899 break;
3900 }
3901
3902 [[fallthrough]];
3903 }
3904 case AMDGPU::G_MERGE_VALUES:
3905 case AMDGPU::G_CONCAT_VECTORS: {
3906 unsigned Bank = getMappingType(MRI, MI);
3907 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3908 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3909
3910 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3911 // Op1 and Dst should use the same register bank.
3912 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3913 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3914 break;
3915 }
3916 case AMDGPU::G_BITREVERSE:
3917 case AMDGPU::G_BITCAST:
3918 case AMDGPU::G_INTTOPTR:
3919 case AMDGPU::G_PTRTOINT:
3920 case AMDGPU::G_FABS:
3921 case AMDGPU::G_FNEG: {
3922 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3923 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3924 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3925 break;
3926 }
3927 case AMDGPU::G_AMDGPU_FFBH_U32:
3928 case AMDGPU::G_AMDGPU_FFBL_B32:
3929 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3930 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3931 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3932 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3933 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3934 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3935 break;
3936 }
3937 case AMDGPU::G_CTPOP: {
3938 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3939 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3940 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3941
3942 // This should really be getValueMappingSGPR64Only, but allowing the generic
3943 // code to handle the register split just makes using LegalizerHelper more
3944 // difficult.
3945 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3946 break;
3947 }
3948 case AMDGPU::G_TRUNC: {
3949 Register Dst = MI.getOperand(0).getReg();
3950 Register Src = MI.getOperand(1).getReg();
3951 unsigned Bank = getRegBankID(Src, MRI);
3952 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3953 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3954 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3955 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3956 break;
3957 }
3958 case AMDGPU::G_ZEXT:
3959 case AMDGPU::G_SEXT:
3960 case AMDGPU::G_ANYEXT:
3961 case AMDGPU::G_SEXT_INREG: {
3962 Register Dst = MI.getOperand(0).getReg();
3963 Register Src = MI.getOperand(1).getReg();
3964 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3965 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3966
3967 unsigned DstBank;
3968 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3969 assert(SrcBank);
3970 switch (SrcBank->getID()) {
3971 case AMDGPU::SGPRRegBankID:
3972 DstBank = AMDGPU::SGPRRegBankID;
3973 break;
3974 default:
3975 DstBank = AMDGPU::VGPRRegBankID;
3976 break;
3977 }
3978
3979 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3980 // 32-bits, and then to 64.
3981 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3982 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3983 SrcSize);
3984 break;
3985 }
3986 case AMDGPU::G_IS_FPCLASS: {
3987 Register SrcReg = MI.getOperand(1).getReg();
3988 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3989 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3990 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3991 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3992 break;
3993 }
3994 case AMDGPU::G_STORE: {
3995 assert(MI.getOperand(0).isReg());
3996 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3997
3998 // FIXME: We need to specify a different reg bank once scalar stores are
3999 // supported.
4000 const ValueMapping *ValMapping =
4001 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4002 OpdsMapping[0] = ValMapping;
4003 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4004 break;
4005 }
4006 case AMDGPU::G_ICMP:
4007 case AMDGPU::G_FCMP: {
4008 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4009
4010 // See if the result register has already been constrained to vcc, which may
4011 // happen due to control flow intrinsic lowering.
4012 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4013 AMDGPU::SGPRRegBankID);
4014 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4015 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4016
4017 auto canUseSCCICMP = [&]() {
4018 auto Pred =
4019 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4020 return Size == 32 ||
4021 (Size == 64 &&
4022 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
 4023 Subtarget.hasScalarCompareEq64());
 4024 };
4025 auto canUseSCCFCMP = [&]() {
4026 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4027 };
4028
4029 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4030 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4031 Op2Bank == AMDGPU::SGPRRegBankID &&
4032 Op3Bank == AMDGPU::SGPRRegBankID &&
4033 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4034
4035 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4036 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4037
4038 // TODO: Use 32-bit for scalar output size.
4039 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4040 const unsigned ResultSize = 1;
4041
4042 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4043 OpdsMapping[1] = nullptr; // Predicate Operand.
4044 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4045 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4046 break;
4047 }
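// Example of the compare mapping above: a 32-bit G_ICMP whose result has not
// already been constrained to VCC and whose operands are both SGPR can use a
// scalar compare writing SCC, so result and sources are reported as SGPR.
// Any divergent operand, a pre-assigned VCC result, or a 64-bit predicate the
// SALU cannot handle falls back to the VCC result / VGPR source mapping.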
4048 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
 4049 // A VGPR index can be used with a waterfall loop when indexing an SGPR vector.
4050 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4051 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4052 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4053 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4054 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4055 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4056
4057 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4058 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4059
 4060 // The index can be in either bank if the source vector is VGPR.
4061 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4062 break;
4063 }
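// Example for the extract mapping above: an SGPR vector indexed by an SGPR
// stays fully scalar, while a VGPR index makes the result VGPR (the
// regBankUnion of source and index banks); the divergent-index case is fixed
// up later in applyMappingImpl, e.g. with a waterfall loop.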
4064 case AMDGPU::G_INSERT_VECTOR_ELT: {
4065 unsigned OutputBankID = isSALUMapping(MI) ?
4066 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4067
4068 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4069 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4070 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4071 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4072 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4073
4074 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4075 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4076
4077 // This is a weird case, because we need to break down the mapping based on
4078 // the register bank of a different operand.
4079 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4080 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4081 InsertSize);
4082 } else {
4083 assert(InsertSize == 32 || InsertSize == 64);
4084 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4085 }
4086
 4087 // The index can be in either bank if the source vector is VGPR.
4088 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4089 break;
4090 }
4091 case AMDGPU::G_UNMERGE_VALUES: {
4092 unsigned Bank = getMappingType(MRI, MI);
4093
4094 // Op1 and Dst should use the same register bank.
4095 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4096 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4097 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4098 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4099 }
4100 break;
4101 }
4102 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4103 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4104 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4105 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4106 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4107 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4108 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4109 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4110 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4111 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4112 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4113 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4114 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4115 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4116 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4117 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4118 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4119 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4120
4121 // rsrc
4122 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4123
4124 // vindex
4125 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4126
4127 // voffset
4128 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4129
4130 // soffset
4131 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4132
4133 // Any remaining operands are immediates and were correctly null
4134 // initialized.
4135 break;
4136 }
4137 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4138 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4139 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4140 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4141 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4142 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4143 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4144 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4145 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4146 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4147 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4148 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4149 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4150 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4151 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4152 // vdata_out
4153 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4154
4155 // vdata_in
4156 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4157
4158 // rsrc
4159 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4160
4161 // vindex
4162 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4163
4164 // voffset
4165 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4166
4167 // soffset
4168 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4169
4170 // Any remaining operands are immediates and were correctly null
4171 // initialized.
4172 break;
4173 }
4174 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4175 // vdata_out
4176 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4177
4178 // vdata_in
4179 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4180
4181 // cmp
4182 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4183
4184 // rsrc
4185 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4186
4187 // vindex
4188 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4189
4190 // voffset
4191 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4192
4193 // soffset
4194 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4195
4196 // Any remaining operands are immediates and were correctly null
4197 // initialized.
4198 break;
4199 }
4200 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4201 // Lie and claim everything is legal, even though some need to be
4202 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4203 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4204 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4205
 4206 // We need to convert this to a MUBUF if either the resource or offset is
4207 // VGPR.
4208 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4209 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4210 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4211
4212 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4213 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4214 break;
4215 }
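// Example for the mapping just built: an S_BUFFER_LOAD with an SGPR resource
// and an SGPR offset keeps an SGPR result and can remain a scalar buffer
// load. If either input turns out to be divergent, the result bank becomes
// VGPR and the apply step rewrites the operation, conceptually into a
// MUBUF-style load executed in a waterfall loop over the offending operands.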
4216 case AMDGPU::G_INTRINSIC:
4217 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4218 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4219 default:
 4220 return getInvalidInstructionMapping();
 4221 case Intrinsic::amdgcn_div_fmas:
4222 case Intrinsic::amdgcn_div_fixup:
4223 case Intrinsic::amdgcn_trig_preop:
4224 case Intrinsic::amdgcn_sin:
4225 case Intrinsic::amdgcn_cos:
4226 case Intrinsic::amdgcn_log_clamp:
4227 case Intrinsic::amdgcn_log:
4228 case Intrinsic::amdgcn_exp2:
4229 case Intrinsic::amdgcn_rcp:
4230 case Intrinsic::amdgcn_rcp_legacy:
4231 case Intrinsic::amdgcn_sqrt:
4232 case Intrinsic::amdgcn_rsq:
4233 case Intrinsic::amdgcn_rsq_legacy:
4234 case Intrinsic::amdgcn_rsq_clamp:
4235 case Intrinsic::amdgcn_fmul_legacy:
4236 case Intrinsic::amdgcn_fma_legacy:
4237 case Intrinsic::amdgcn_frexp_mant:
4238 case Intrinsic::amdgcn_frexp_exp:
4239 case Intrinsic::amdgcn_fract:
4240 case Intrinsic::amdgcn_cvt_pknorm_i16:
4241 case Intrinsic::amdgcn_cvt_pknorm_u16:
4242 case Intrinsic::amdgcn_cvt_pk_i16:
4243 case Intrinsic::amdgcn_cvt_pk_u16:
4244 case Intrinsic::amdgcn_fmed3:
4245 case Intrinsic::amdgcn_cubeid:
4246 case Intrinsic::amdgcn_cubema:
4247 case Intrinsic::amdgcn_cubesc:
4248 case Intrinsic::amdgcn_cubetc:
4249 case Intrinsic::amdgcn_sffbh:
4250 case Intrinsic::amdgcn_fmad_ftz:
4251 case Intrinsic::amdgcn_mbcnt_lo:
4252 case Intrinsic::amdgcn_mbcnt_hi:
4253 case Intrinsic::amdgcn_mul_u24:
4254 case Intrinsic::amdgcn_mul_i24:
4255 case Intrinsic::amdgcn_mulhi_u24:
4256 case Intrinsic::amdgcn_mulhi_i24:
4257 case Intrinsic::amdgcn_lerp:
4258 case Intrinsic::amdgcn_sad_u8:
4259 case Intrinsic::amdgcn_msad_u8:
4260 case Intrinsic::amdgcn_sad_hi_u8:
4261 case Intrinsic::amdgcn_sad_u16:
4262 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4263 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4264 case Intrinsic::amdgcn_mqsad_u32_u8:
4265 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4266 case Intrinsic::amdgcn_alignbyte:
4267 case Intrinsic::amdgcn_perm:
4268 case Intrinsic::amdgcn_fdot2:
4269 case Intrinsic::amdgcn_sdot2:
4270 case Intrinsic::amdgcn_udot2:
4271 case Intrinsic::amdgcn_sdot4:
4272 case Intrinsic::amdgcn_udot4:
4273 case Intrinsic::amdgcn_sdot8:
4274 case Intrinsic::amdgcn_udot8:
4275 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4276 case Intrinsic::amdgcn_fdot2_f16_f16:
4277 case Intrinsic::amdgcn_fdot2_f32_bf16:
4278 case Intrinsic::amdgcn_sudot4:
4279 case Intrinsic::amdgcn_sudot8:
4280 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4281 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4282 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4283 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4284 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4285 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4286 return getDefaultMappingVOP(MI);
4287 case Intrinsic::amdgcn_sbfe:
4288 case Intrinsic::amdgcn_ubfe:
4289 if (isSALUMapping(MI))
4290 return getDefaultMappingSOP(MI);
4291 return getDefaultMappingVOP(MI);
4292 case Intrinsic::amdgcn_ds_swizzle:
4293 case Intrinsic::amdgcn_ds_permute:
4294 case Intrinsic::amdgcn_ds_bpermute:
4295 case Intrinsic::amdgcn_update_dpp:
4296 case Intrinsic::amdgcn_mov_dpp8:
4297 case Intrinsic::amdgcn_mov_dpp:
4298 case Intrinsic::amdgcn_strict_wwm:
4299 case Intrinsic::amdgcn_wwm:
4300 case Intrinsic::amdgcn_strict_wqm:
4301 case Intrinsic::amdgcn_wqm:
4302 case Intrinsic::amdgcn_softwqm:
4303 case Intrinsic::amdgcn_set_inactive:
4304 case Intrinsic::amdgcn_permlane64:
 4305 return getDefaultMappingAllVGPR(MI);
 4306 case Intrinsic::amdgcn_cvt_pkrtz:
 4307 if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
 4308 return getDefaultMappingSOP(MI);
4309 return getDefaultMappingVOP(MI);
4310 case Intrinsic::amdgcn_kernarg_segment_ptr:
4311 case Intrinsic::amdgcn_s_getpc:
4312 case Intrinsic::amdgcn_groupstaticsize:
4313 case Intrinsic::amdgcn_reloc_constant:
4314 case Intrinsic::returnaddress: {
4315 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4316 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4317 break;
4318 }
4319 case Intrinsic::amdgcn_wqm_vote: {
4320 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4321 OpdsMapping[0] = OpdsMapping[2]
4322 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4323 break;
4324 }
4325 case Intrinsic::amdgcn_ps_live: {
4326 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4327 break;
4328 }
4329 case Intrinsic::amdgcn_div_scale: {
4330 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4331 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4332 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4333 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4334
4335 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4336 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4337 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4338 break;
4339 }
4340 case Intrinsic::amdgcn_class: {
4341 Register Src0Reg = MI.getOperand(2).getReg();
4342 Register Src1Reg = MI.getOperand(3).getReg();
4343 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4344 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4345 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4346 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4347 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4348 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4349 break;
4350 }
4351 case Intrinsic::amdgcn_icmp:
4352 case Intrinsic::amdgcn_fcmp: {
4353 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4354 // This is not VCCRegBank because this is not used in boolean contexts.
4355 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4356 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4357 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4358 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4359 break;
4360 }
4361 case Intrinsic::amdgcn_readlane: {
4362 // This must be an SGPR, but accept a VGPR.
4363 Register IdxReg = MI.getOperand(3).getReg();
4364 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4365 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4366 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4367 [[fallthrough]];
4368 }
4369 case Intrinsic::amdgcn_readfirstlane: {
4370 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4371 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4372 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4373 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4374 break;
4375 }
4376 case Intrinsic::amdgcn_writelane: {
4377 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4378 Register SrcReg = MI.getOperand(2).getReg();
4379 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4380 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4381 Register IdxReg = MI.getOperand(3).getReg();
4382 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4383 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4384 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4385
4386 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4387 // to legalize.
4388 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4389 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4390 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4391 break;
4392 }
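// Example for the lane intrinsics above: the written value and the lane index
// of amdgcn_writelane (and the index of amdgcn_readlane) must ultimately be
// SGPRs, but whatever bank they currently have is reported as legal here; if
// they end up in VGPRs, the apply step makes them uniform later, e.g. by
// inserting readfirstlane.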
4393 case Intrinsic::amdgcn_if_break: {
4394 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4395 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4396 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4397 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4398 break;
4399 }
4400 case Intrinsic::amdgcn_permlane16:
4401 case Intrinsic::amdgcn_permlanex16: {
4402 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4403 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4404 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4405 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4406 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4407 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4408 break;
4409 }
4410 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4411 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4412 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4413 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4414 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4415 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4416 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4417 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4418 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4419 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4420 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4421 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4422 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4423 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4424 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4425 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4426 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4427 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4428 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4429 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4430 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4431 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4432 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4433 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4434 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4435 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4436 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4437 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4438 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4439 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4440 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4441 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4442 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4443 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4444 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4445 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4446 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4447 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4448 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4449 // Default for MAI intrinsics.
4450 // srcC can also be an immediate which can be folded later.
4451 // FIXME: Should we eventually add an alternative mapping with AGPR src
4452 // for srcA/srcB?
4453 //
4454 // vdst, srcA, srcB, srcC
 4455 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 4456 OpdsMapping[0] =
4457 Info->mayNeedAGPRs()
4458 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4459 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4460 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4461 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4462 OpdsMapping[4] =
4463 Info->mayNeedAGPRs()
4464 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4465 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4466 break;
4467 }
4468 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4469 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4470 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4471 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4472 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4473 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4474 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4475 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4476 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4477 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4478 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4479 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4480 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4481 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4482 // vdst, srcA, srcB, srcC, idx
4483 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4484 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4485 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4486 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4487 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4488 break;
4489 }
4490 case Intrinsic::amdgcn_interp_p1:
4491 case Intrinsic::amdgcn_interp_p2:
4492 case Intrinsic::amdgcn_interp_mov:
4493 case Intrinsic::amdgcn_interp_p1_f16:
4494 case Intrinsic::amdgcn_interp_p2_f16:
4495 case Intrinsic::amdgcn_lds_param_load: {
4496 const int M0Idx = MI.getNumOperands() - 1;
4497 Register M0Reg = MI.getOperand(M0Idx).getReg();
4498 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4499 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4500
4501 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4502 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4503 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4504
4505 // Must be SGPR, but we must take whatever the original bank is and fix it
4506 // later.
4507 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4508 break;
4509 }
4510 case Intrinsic::amdgcn_interp_inreg_p10:
4511 case Intrinsic::amdgcn_interp_inreg_p2:
4512 case Intrinsic::amdgcn_interp_inreg_p10_f16:
4513 case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4514 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4515 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4516 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4517 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4518 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4519 break;
4520 }
4521 case Intrinsic::amdgcn_ballot: {
4522 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4523 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4524 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4525 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4526 break;
4527 }
4528 case Intrinsic::amdgcn_inverse_ballot: {
4529 // This must be an SGPR, but accept a VGPR.
4530 Register MaskReg = MI.getOperand(2).getReg();
4531 unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
4532 unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
4533 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4534 OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
4535 break;
4536 }
4537 case Intrinsic::amdgcn_wave_reduce_umin:
4538 case Intrinsic::amdgcn_wave_reduce_umax: {
4539 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4540 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4541 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4542 auto regBankID =
4543 isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4544 OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
4545 break;
4546 }
4547 }
4548 break;
4549 }
4550 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4551 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4552 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4553 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4554 auto IntrID = AMDGPU::getIntrinsicID(MI);
4555 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4556 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4557 // Non-images can have complications from operands that allow both SGPR
4558 // and VGPR. For now it's too complicated to figure out the final opcode
4559 // to derive the register bank from the MCInstrDesc.
4560 assert(RSrcIntrin->IsImage);
4561 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4562 }
4563 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4564 unsigned N = MI.getNumExplicitOperands() - 2;
4565 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4566 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4567 if (N == 3) {
4568 // Sequential form: all operands combined into VGPR256/VGPR512
4569 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4570 if (Size > 256)
4571 Size = 512;
4572 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4573 } else {
4574 // NSA form
4575 for (unsigned I = 2; I < N; ++I) {
4576 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4577 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4578 }
4579 }
4580 break;
4581 }
4582 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
4583 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
4584 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
4585 switch (IntrID) {
4586 case Intrinsic::amdgcn_s_getreg:
4587 case Intrinsic::amdgcn_s_memtime:
4588 case Intrinsic::amdgcn_s_memrealtime:
4589 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4590 case Intrinsic::amdgcn_s_sendmsg_rtn: {
4591 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4592 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4593 break;
4594 }
4595 case Intrinsic::amdgcn_global_atomic_fadd:
4596 case Intrinsic::amdgcn_global_atomic_csub:
4597 case Intrinsic::amdgcn_global_atomic_fmin:
4598 case Intrinsic::amdgcn_global_atomic_fmax:
4599 case Intrinsic::amdgcn_flat_atomic_fadd:
4600 case Intrinsic::amdgcn_flat_atomic_fmin:
4601 case Intrinsic::amdgcn_flat_atomic_fmax:
4602 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4603 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
 4604 return getDefaultMappingAllVGPR(MI);
 4605 case Intrinsic::amdgcn_ds_ordered_add:
4606 case Intrinsic::amdgcn_ds_ordered_swap:
4607 case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4608 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4609 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4610 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4611 AMDGPU::SGPRRegBankID);
4612 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4613 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4614 break;
4615 }
4616 case Intrinsic::amdgcn_ds_append:
4617 case Intrinsic::amdgcn_ds_consume: {
4618 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4619 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4620 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4621 break;
4622 }
4623 case Intrinsic::amdgcn_exp_compr:
4624 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4625 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4626 break;
4627 case Intrinsic::amdgcn_exp:
4628 // FIXME: Could we support packed types here?
4629 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4630 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4631 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4632 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4633 break;
4634 case Intrinsic::amdgcn_exp_row:
4635 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4636 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4637 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4638 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4639 OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4640 break;
4641 case Intrinsic::amdgcn_s_sendmsg:
4642 case Intrinsic::amdgcn_s_sendmsghalt: {
4643 // This must be an SGPR, but accept a VGPR.
4644 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4645 AMDGPU::SGPRRegBankID);
4646 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4647 break;
4648 }
4649 case Intrinsic::amdgcn_s_setreg: {
4650 // This must be an SGPR, but accept a VGPR.
4651 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4652 AMDGPU::SGPRRegBankID);
4653 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4654 break;
4655 }
4656 case Intrinsic::amdgcn_end_cf: {
4657 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4658 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4659 break;
4660 }
4661 case Intrinsic::amdgcn_else: {
4662 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4663 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4664 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4665 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4666 break;
4667 }
4668 case Intrinsic::amdgcn_live_mask: {
4669 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4670 break;
4671 }
4672 case Intrinsic::amdgcn_wqm_demote:
4673 case Intrinsic::amdgcn_kill: {
4674 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4675 break;
4676 }
4677 case Intrinsic::amdgcn_raw_buffer_load:
4678 case Intrinsic::amdgcn_raw_ptr_buffer_load:
4679 case Intrinsic::amdgcn_raw_tbuffer_load:
4680 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
4681 // FIXME: Should make intrinsic ID the last operand of the instruction,
4682 // then this would be the same as store
4683 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4684 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4685 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4686 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4687 break;
4688 }
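  // The *_buffer_load_lds forms have no register result; the loaded data is
  // written directly to LDS. The resource descriptor and the LDS destination
  // pointer (operands 1 and 2) must be uniform, the per-lane offset stays in
  // a VGPR, and the scalar offset is again an SGPR.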
4689 case Intrinsic::amdgcn_raw_buffer_load_lds:
4690 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
4691 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4692 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4693 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4694 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4695 break;
4696 }
4697 case Intrinsic::amdgcn_raw_buffer_store:
4698 case Intrinsic::amdgcn_raw_ptr_buffer_store:
4699 case Intrinsic::amdgcn_raw_buffer_store_format:
4700 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
4701 case Intrinsic::amdgcn_raw_tbuffer_store:
4702 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
4703 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4704 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4705 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4706 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4707 break;
4708 }
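  // The structured variants add a per-lane vindex operand in front of the
  // offsets, so one extra VGPR operand is mapped; otherwise the SGPR/VGPR
  // split matches the raw forms above.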
4709 case Intrinsic::amdgcn_struct_buffer_load:
4710 case Intrinsic::amdgcn_struct_ptr_buffer_load:
4711 case Intrinsic::amdgcn_struct_tbuffer_load:
4712 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
4713 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4714 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4715 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4716 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4717 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4718 break;
4719 }
4720 case Intrinsic::amdgcn_struct_buffer_load_lds:
4721 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
4722 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4723 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4724 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4725 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4726 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4727 break;
4728 }
4729 case Intrinsic::amdgcn_struct_buffer_store:
4730 case Intrinsic::amdgcn_struct_ptr_buffer_store:
4731 case Intrinsic::amdgcn_struct_tbuffer_store:
4732 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
4733 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4734 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4735 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4736 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4737 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4738 break;
4739 }
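  // amdgcn_init_exec_from_input derives the initial exec mask from a scalar
  // input (normally a kernel argument register), so that operand is mapped
  // to the SGPR bank.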
4740 case Intrinsic::amdgcn_init_exec_from_input: {
4741 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4742 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4743 break;
4744 }
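  // GWS (global wave sync) operations take their data operand in a VGPR. The
  // resource/offset operand is ultimately read through M0 and therefore has
  // to be uniform; a VGPR mapping is still accepted here and repaired later
  // (typically with a readfirstlane).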
4745 case Intrinsic::amdgcn_ds_gws_init:
4746 case Intrinsic::amdgcn_ds_gws_barrier:
4747 case Intrinsic::amdgcn_ds_gws_sema_br: {
4748 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4749
4750 // This must be an SGPR, but accept a VGPR.
4751 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4752 AMDGPU::SGPRRegBankID);
4753 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4754 break;
4755 }
4756 case Intrinsic::amdgcn_ds_gws_sema_v:
4757 case Intrinsic::amdgcn_ds_gws_sema_p:
4758 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4759 // This must be an SGPR, but accept a VGPR.
4760 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4761 AMDGPU::SGPRRegBankID);
4762 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4763 break;
4764 }
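  // Global-to-LDS DMA: the global source address (operand 1) is a per-lane
  // VGPR pointer, while the LDS destination address (operand 2) must be
  // uniform and is mapped to the SGPR bank.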
4765 case Intrinsic::amdgcn_global_load_lds: {
4766 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4767 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4768 break;
4769 }
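  // lds_direct_load reads LDS at the address held in M0 and broadcasts the
  // value to the lanes; the M0 source is the intrinsic's last operand and is
  // expected in an SGPR, though a VGPR bank is tolerated at this point.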
4770 case Intrinsic::amdgcn_lds_direct_load: {
4771 const int M0Idx = MI.getNumOperands() - 1;
4772 Register M0Reg = MI.getOperand(M0Idx).getReg();
4773 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4774 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4775
4776 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);