AMDGPURegisterBankInfo.cpp
1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
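/// For example, an operation that needs a uniform (SGPR) operand but is handed
/// a VGPR value is wrapped in such a loop: each iteration picks one lane's
/// value with v_readfirstlane_b32, masks EXEC down to the lanes that share
/// that value, and executes the instruction for just those lanes (see
/// executeInWaterfallLoop below).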
27///
28/// \par Booleans
29///
30/// Booleans (s1 values) require special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
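/// For example, a uniform boolean AND can be selected as s_and_b32 on 32-bit
/// SGPR operands (with SCC holding the result), while a divergent boolean AND
/// stays an s1 operation on the VCC bank and operates on the lane mask.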
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
52/// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
53/// instructions). This is one unique SGPR, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
56/// the SGPR operands all being the same register. There is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
61/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
72
73#include "AMDGPU.h"
75#include "AMDGPUInstrInfo.h"
76#include "GCNSubtarget.h"
78#include "SIRegisterInfo.h"
84#include "llvm/IR/IntrinsicsAMDGPU.h"
85
86#define GET_TARGET_REGBANK_IMPL
87#include "AMDGPUGenRegisterBank.inc"
88
89// This file will be TableGen'ed at some point.
90#include "AMDGPUGenRegisterBankInfo.def"
91
92using namespace llvm;
93using namespace MIPatternMatch;
94
95namespace {
96
97// Observer to apply a register bank to new registers created by LegalizerHelper.
98class ApplyRegBankMapping final : public GISelChangeObserver {
99private:
100 const AMDGPURegisterBankInfo &RBI;
101 MachineRegisterInfo &MRI;
102 const RegisterBank *NewBank;
103 SmallVector<MachineInstr *, 4> NewInsts;
104
105public:
106 ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107 MachineRegisterInfo &MRI_, const RegisterBank *RB)
108 : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109
110 ~ApplyRegBankMapping() {
111 for (MachineInstr *MI : NewInsts)
112 applyBank(*MI);
113 }
114
115 /// Set any registers that don't have a set register class or bank to SALU.
116 void applyBank(MachineInstr &MI) {
117 const unsigned Opc = MI.getOpcode();
118 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119 Opc == AMDGPU::G_SEXT) {
120 // LegalizerHelper wants to use the basic legalization artifacts when
121 // widening etc. We don't handle selection with vcc in artifact sources,
122 // so we need to use a select instead to handle these properly.
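 // For example, %d:vgpr(s32) = G_ZEXT %c:vcc(s1) is rewritten here as
 // %d:vgpr(s32) = G_SELECT %c:vcc(s1), 1, 0.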
123 Register DstReg = MI.getOperand(0).getReg();
124 Register SrcReg = MI.getOperand(1).getReg();
125 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126 if (SrcBank == &AMDGPU::VCCRegBank) {
127 const LLT S32 = LLT::scalar(32);
128 assert(MRI.getType(SrcReg) == LLT::scalar(1));
129 assert(MRI.getType(DstReg) == S32);
130 assert(NewBank == &AMDGPU::VGPRRegBank);
131
132 // Replace the extension with a select, which really uses the boolean
133 // source.
134 MachineIRBuilder B(MI);
135 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136 auto False = B.buildConstant(S32, 0);
137 B.buildSelect(DstReg, SrcReg, True, False);
138 MRI.setRegBank(True.getReg(0), *NewBank);
139 MRI.setRegBank(False.getReg(0), *NewBank);
140 MI.eraseFromParent();
141 }
142
143 assert(!MRI.getRegClassOrRegBank(DstReg));
144 MRI.setRegBank(DstReg, *NewBank);
145 return;
146 }
147
148#ifndef NDEBUG
149 if (Opc == AMDGPU::G_TRUNC) {
150 Register DstReg = MI.getOperand(0).getReg();
151 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152 assert(DstBank != &AMDGPU::VCCRegBank);
153 }
154#endif
155
156 for (MachineOperand &Op : MI.operands()) {
157 if (!Op.isReg())
158 continue;
159
160 // We may see physical registers if building a real MI
161 Register Reg = Op.getReg();
162 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163 continue;
164
165 const RegisterBank *RB = NewBank;
166 if (MRI.getType(Reg) == LLT::scalar(1)) {
167 assert(NewBank == &AMDGPU::VGPRRegBank &&
168 "s1 operands should only be used for vector bools");
169 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171 "not expecting legalization artifacts here");
172 RB = &AMDGPU::VCCRegBank;
173 }
174
175 MRI.setRegBank(Reg, *RB);
176 }
177 }
178
179 void erasingInstr(MachineInstr &MI) override {}
180
181 void createdInstr(MachineInstr &MI) override {
182 // At this point, the instruction was just inserted and has no operands.
183 NewInsts.push_back(&MI);
184 }
185
186 void changingInstr(MachineInstr &MI) override {}
187 void changedInstr(MachineInstr &MI) override {
188 // FIXME: In principle we should probably add the instruction to NewInsts,
189 // but the way the LegalizerHelper uses the observer, we will always see the
190 // registers we need to set the regbank on also referenced in a new
191 // instruction.
192 }
193};
194
195}
196AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198 TII(Subtarget.getInstrInfo()) {
199
200 // HACK: Until this is fully tablegen'd.
201 static llvm::once_flag InitializeRegisterBankFlag;
202
203 static auto InitializeRegisterBankOnce = [this]() {
204 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207 (void)this;
208 };
209
210 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211}
212
213static bool isVectorRegisterBank(const RegisterBank &Bank) {
214 unsigned BankID = Bank.getID();
215 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216}
217
218unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219 const RegisterBank &Src,
220 unsigned Size) const {
221 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224 return std::numeric_limits<unsigned>::max();
225 }
226
227 // Bool values are tricky, because the meaning is based on context. The SCC
228 // and VCC banks are for the natural scalar and vector conditions produced by
229 // a compare.
230 //
231 // Legalization doesn't know about the necessary context, so an s1 use may
232 // have been a truncate from an arbitrary value, in which case a copy (lowered
233 // as a compare with 0) needs to be inserted.
234 if (Size == 1 &&
235 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236 (isVectorRegisterBank(Src) ||
237 Src.getID() == AMDGPU::SGPRRegBankID ||
238 Src.getID() == AMDGPU::VCCRegBankID))
239 return std::numeric_limits<unsigned>::max();
240
241 // There is no direct copy between AGPRs.
242 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243 Src.getID() == AMDGPU::AGPRRegBankID)
244 return 4;
245
246 return RegisterBankInfo::copyCost(Dst, Src, Size);
247}
248
249unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250 const ValueMapping &ValMapping,
251 const RegisterBank *CurBank) const {
252 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253 // VGPR.
254 // FIXME: Is there a better way to do this?
255 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256 return 10; // This is expensive.
257
258 assert(ValMapping.NumBreakDowns == 2 &&
259 ValMapping.BreakDown[0].Length == 32 &&
260 ValMapping.BreakDown[0].StartIdx == 0 &&
261 ValMapping.BreakDown[1].Length == 32 &&
262 ValMapping.BreakDown[1].StartIdx == 32 &&
263 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264
265 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267 // want.
268
269 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270 // alignment restrictions, but this probably isn't important.
271 return 1;
272}
273
274const RegisterBank &
275AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276 LLT Ty) const {
277 if (&RC == &AMDGPU::SReg_1RegClass)
278 return AMDGPU::VCCRegBank;
279
280 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281 // VCC-like use.
282 if (TRI->isSGPRClass(&RC)) {
283 // FIXME: This probably came from a copy from a physical register, which
284 // should be inferable from the copied to-type. We don't have many boolean
285 // physical register constraints so just assume a normal SGPR for now.
286 if (!Ty.isValid())
287 return AMDGPU::SGPRRegBank;
288
289 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290 }
291
292 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293}
294
295template <unsigned NumOps>
296RegisterBankInfo::InstructionMappings
297AMDGPURegisterBankInfo::addMappingFromTable(
298 const MachineInstr &MI, const MachineRegisterInfo &MRI,
299 const std::array<unsigned, NumOps> RegSrcOpIdx,
300 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301
302 InstructionMappings AltMappings;
303
303
304 SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
305
306 unsigned Sizes[NumOps];
307 for (unsigned I = 0; I < NumOps; ++I) {
308 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310 }
311
312 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315 }
316
317 // getInstrMapping's default mapping uses ID 1, so start at 2.
318 unsigned MappingID = 2;
319 for (const auto &Entry : Table) {
320 for (unsigned I = 0; I < NumOps; ++I) {
321 int OpIdx = RegSrcOpIdx[I];
322 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323 }
324
325 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326 getOperandsMapping(Operands),
327 Operands.size()));
328 }
329
330 return AltMappings;
331}
332
333RegisterBankInfo::InstructionMappings
334AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336 switch (MI.getIntrinsicID()) {
337 case Intrinsic::amdgcn_readlane: {
338 static const OpRegBankEntry<3> Table[2] = {
339 // Perfectly legal.
340 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341
342 // Need a readfirstlane for the index.
343 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344 };
345
346 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
348 }
349 case Intrinsic::amdgcn_writelane: {
350 static const OpRegBankEntry<4> Table[4] = {
351 // Perfectly legal.
352 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353
354 // Need readfirstlane of first op
355 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356
357 // Need readfirstlane of second op
358 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359
360 // Need readfirstlane of both ops
361 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362 };
363
364 // dst, value, lane select, vdst_in
365 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
367 }
368 default:
369 return RegisterBankInfo::getInstrAlternativeMappings(MI);
370 }
371}
372
373RegisterBankInfo::InstructionMappings
374AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376
377 switch (MI.getIntrinsicID()) {
378 case Intrinsic::amdgcn_s_buffer_load: {
379 static const OpRegBankEntry<2> Table[4] = {
380 // Perfectly legal.
381 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382
383 // Only need 1 register in loop
384 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385
386 // Have to waterfall the resource.
387 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388
389 // Have to waterfall the resource, and the offset.
390 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391 };
392
393 // rsrc, offset
394 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
396 }
397 case Intrinsic::amdgcn_ds_ordered_add:
398 case Intrinsic::amdgcn_ds_ordered_swap: {
399 // VGPR = M0, VGPR
400 static const OpRegBankEntry<3> Table[2] = {
401 // Perfectly legal.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
403
404 // Need a readfirstlane for m0
405 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406 };
407
408 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
410 }
411 case Intrinsic::amdgcn_s_sendmsg:
412 case Intrinsic::amdgcn_s_sendmsghalt: {
413 // FIXME: Should have no register for immediate
414 static const OpRegBankEntry<1> Table[2] = {
415 // Perfectly legal.
416 { { AMDGPU::SGPRRegBankID }, 1 },
417
418 // Need readlane
419 { { AMDGPU::VGPRRegBankID }, 3 }
420 };
421
422 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
424 }
425 default:
426 return RegisterBankInfo::getInstrAlternativeMappings(MI);
427 }
428}
429
430// FIXME: Returns uniform if there's no source value information. This is
431// probably wrong.
432static bool isScalarLoadLegal(const MachineInstr &MI) {
433 if (!MI.hasOneMemOperand())
434 return false;
435
436 const MachineMemOperand *MMO = *MI.memoperands_begin();
437 const unsigned AS = MMO->getAddrSpace();
438 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440 // Require 4-byte alignment.
441 return MMO->getAlign() >= Align(4) &&
442 // Can't do a scalar atomic load.
443 !MMO->isAtomic() &&
444 // Don't use scalar loads for volatile accesses to non-constant address
445 // spaces.
446 (IsConst || !MMO->isVolatile()) &&
447 // Memory must be known constant, or not written before this load.
448 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449 AMDGPUInstrInfo::isUniformMMO(MMO);
450}
451
452RegisterBankInfo::InstructionMappings
453AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454 const MachineInstr &MI) const {
455
456 const MachineFunction &MF = *MI.getParent()->getParent();
457 const MachineRegisterInfo &MRI = MF.getRegInfo();
458
459
460 InstructionMappings AltMappings;
461 switch (MI.getOpcode()) {
462 case TargetOpcode::G_CONSTANT:
463 case TargetOpcode::G_IMPLICIT_DEF: {
464 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
465 if (Size == 1) {
466 static const OpRegBankEntry<1> Table[3] = {
467 { { AMDGPU::VGPRRegBankID }, 1 },
468 { { AMDGPU::SGPRRegBankID }, 1 },
469 { { AMDGPU::VCCRegBankID }, 1 }
470 };
471
472 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
473 }
474
475 [[fallthrough]];
476 }
477 case TargetOpcode::G_FCONSTANT:
478 case TargetOpcode::G_FRAME_INDEX:
479 case TargetOpcode::G_GLOBAL_VALUE: {
480 static const OpRegBankEntry<1> Table[2] = {
481 { { AMDGPU::VGPRRegBankID }, 1 },
482 { { AMDGPU::SGPRRegBankID }, 1 }
483 };
484
485 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
486 }
487 case TargetOpcode::G_AND:
488 case TargetOpcode::G_OR:
489 case TargetOpcode::G_XOR: {
490 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
491
492 if (Size == 1) {
493 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
494 const InstructionMapping &SCCMapping = getInstructionMapping(
495 1, 1, getOperandsMapping(
496 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
498 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
499 3); // Num Operands
500 AltMappings.push_back(&SCCMapping);
501
502 const InstructionMapping &VCCMapping0 = getInstructionMapping(
503 2, 1, getOperandsMapping(
504 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
506 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
507 3); // Num Operands
508 AltMappings.push_back(&VCCMapping0);
509 return AltMappings;
510 }
511
512 if (Size != 64)
513 break;
514
515 const InstructionMapping &SSMapping = getInstructionMapping(
516 1, 1, getOperandsMapping(
517 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
519 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
520 3); // Num Operands
521 AltMappings.push_back(&SSMapping);
522
523 const InstructionMapping &VVMapping = getInstructionMapping(
524 2, 2, getOperandsMapping(
525 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
527 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
528 3); // Num Operands
529 AltMappings.push_back(&VVMapping);
530 break;
531 }
532 case TargetOpcode::G_LOAD:
533 case TargetOpcode::G_ZEXTLOAD:
534 case TargetOpcode::G_SEXTLOAD: {
535 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
536 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
537 unsigned PtrSize = PtrTy.getSizeInBits();
538 unsigned AS = PtrTy.getAddressSpace();
539
540 if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
541 AS != AMDGPUAS::PRIVATE_ADDRESS) &&
542 isScalarLoadLegal(MI)) {
543 const InstructionMapping &SSMapping = getInstructionMapping(
544 1, 1, getOperandsMapping(
545 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
546 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
547 2); // Num Operands
548 AltMappings.push_back(&SSMapping);
549 }
550
551 const InstructionMapping &VVMapping = getInstructionMapping(
552 2, 1,
553 getOperandsMapping(
554 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
555 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
556 2); // Num Operands
557 AltMappings.push_back(&VVMapping);
558
559 // It may be possible to have a vgpr = load sgpr mapping here, because
560 // the mubuf instructions support this kind of load, but probably only for
561 // gfx7 and older. However, the addressing mode matching in the instruction
562 // selector should be able to do a better job of detecting and selecting
563 // these kinds of loads from the vgpr = load vgpr mapping.
564
565 return AltMappings;
566
567 }
568 case TargetOpcode::G_SELECT: {
569 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
570 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
571 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
573 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
574 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
575 4); // Num Operands
576 AltMappings.push_back(&SSMapping);
577
578 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
579 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
580 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
581 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
582 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
583 4); // Num Operands
584 AltMappings.push_back(&VVMapping);
585
586 return AltMappings;
587 }
588 case TargetOpcode::G_UADDE:
589 case TargetOpcode::G_USUBE:
590 case TargetOpcode::G_SADDE:
591 case TargetOpcode::G_SSUBE: {
592 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
593 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
594 getOperandsMapping(
595 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
596 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
597 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
599 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
600 5); // Num Operands
601 AltMappings.push_back(&SSMapping);
602
603 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
604 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
605 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
606 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
608 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
609 5); // Num Operands
610 AltMappings.push_back(&VVMapping);
611 return AltMappings;
612 }
613 case AMDGPU::G_BRCOND: {
614 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
615
616 // TODO: Change type to 32 for scalar
617 const InstructionMapping &SMapping = getInstructionMapping(
618 1, 1, getOperandsMapping(
619 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
620 2); // Num Operands
621 AltMappings.push_back(&SMapping);
622
623 const InstructionMapping &VMapping = getInstructionMapping(
624 1, 1, getOperandsMapping(
625 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
626 2); // Num Operands
627 AltMappings.push_back(&VMapping);
628 return AltMappings;
629 }
630 case AMDGPU::G_INTRINSIC:
631 return getInstrAlternativeMappingsIntrinsic(MI, MRI);
632 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
633 return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
634 default:
635 break;
636 }
637 return RegisterBankInfo::getInstrAlternativeMappings(MI);
638}
639
643 LLT HalfTy,
644 Register Reg) const {
645 assert(HalfTy.getSizeInBits() == 32);
646 MachineRegisterInfo *MRI = B.getMRI();
647 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
648 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
649 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
650 MRI->setRegBank(LoLHS, *Bank);
651 MRI->setRegBank(HiLHS, *Bank);
652
653 Regs.push_back(LoLHS);
654 Regs.push_back(HiLHS);
655
656 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
657 .addDef(LoLHS)
658 .addDef(HiLHS)
659 .addUse(Reg);
660}
661
662/// Replace the current type each register in \p Regs has with \p NewTy
663static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
664 LLT NewTy) {
665 for (Register Reg : Regs) {
666 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
667 MRI.setType(Reg, NewTy);
668 }
669}
670
672 if (Ty.isVector()) {
675 Ty.getElementType());
676 }
677
678 assert(Ty.getScalarSizeInBits() % 2 == 0);
679 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
680}
681
682// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
683// source value into a scalar register.
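// Values wider than 32 bits are unmerged into 32-bit pieces, each piece is
// moved with its own V_READFIRSTLANE_B32, and the SGPR results are re-merged.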
684Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
685 MachineRegisterInfo &MRI,
686 Register Src) const {
687 LLT Ty = MRI.getType(Src);
688 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
689
690 if (Bank == &AMDGPU::SGPRRegBank)
691 return Src;
692
693 unsigned Bits = Ty.getSizeInBits();
694 assert(Bits % 32 == 0);
695
696 if (Bank != &AMDGPU::VGPRRegBank) {
697 // We need to copy from AGPR to VGPR
698 Src = B.buildCopy(Ty, Src).getReg(0);
699 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
700 }
701
702 LLT S32 = LLT::scalar(32);
703 unsigned NumParts = Bits / 32;
704 SmallVector<Register, 8> SrcParts;
705 SmallVector<Register, 8> DstParts;
706
707 if (Bits == 32) {
708 SrcParts.push_back(Src);
709 } else {
710 auto Unmerge = B.buildUnmerge(S32, Src);
711 for (unsigned i = 0; i < NumParts; ++i)
712 SrcParts.push_back(Unmerge.getReg(i));
713 }
714
715 for (unsigned i = 0; i < NumParts; ++i) {
716 Register SrcPart = SrcParts[i];
717 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
718 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
719
720 const TargetRegisterClass *Constrained =
721 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
722 (void)Constrained;
723 assert(Constrained && "Failed to constrain readfirstlane src reg");
724
725 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
726
727 DstParts.push_back(DstPart);
728 }
729
730 if (Bits == 32)
731 return DstParts[0];
732
733 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
734 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
735 return Dst;
736}
737
738/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
739/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
740/// execute the instruction for each unique combination of values in all lanes
741/// in the wave. The block will be split such that rest of the instructions are
742/// moved to a new block.
743///
744/// Essentially performs this loop:
745//
746/// Save Execution Mask
747/// For (Lane : Wavefront) {
748/// Enable Lane, Disable all other lanes
749/// SGPR = read SGPR value for current lane from VGPR
750/// VGPRResult[Lane] = use_op SGPR
751/// }
752/// Restore Execution Mask
753///
754/// There is additional complexity from the compares used to identify the
755/// unique values shared across lanes.
756bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
757 MachineIRBuilder &B,
758 iterator_range<MachineBasicBlock::iterator> Range,
759 SmallSet<Register, 4> &SGPROperandRegs,
760 MachineRegisterInfo &MRI) const {
761
762 // Track use registers which have already been expanded with a readfirstlane
763 // sequence. This may have multiple uses if moving a sequence.
764 DenseMap<Register, Register> WaterfalledRegMap;
765
766 MachineBasicBlock &MBB = B.getMBB();
767 MachineFunction *MF = &B.getMF();
768
769 const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
770 const unsigned MovExecOpc =
771 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
772 const unsigned MovExecTermOpc =
773 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
774
775 const unsigned XorTermOpc = Subtarget.isWave32() ?
776 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
777 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
778 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
779 const unsigned ExecReg = Subtarget.isWave32() ?
780 AMDGPU::EXEC_LO : AMDGPU::EXEC;
781
782#ifndef NDEBUG
783 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
784#endif
785
786 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
787 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
788
789 // Don't bother using generic instructions/registers for the exec mask.
790 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
791 .addDef(InitSaveExecReg);
792
793 Register PhiExec = MRI.createVirtualRegister(WaveRC);
794 Register NewExec = MRI.createVirtualRegister(WaveRC);
795
796 // To insert the loop we need to split the block. Move everything before this
797 // point to a new block, and insert a new empty block before this instruction.
798 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
799 MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
800 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
801 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
802 MachineFunction::iterator MBBI(MBB);
803 ++MBBI;
804 MF->insert(MBBI, LoopBB);
805 MF->insert(MBBI, BodyBB);
806 MF->insert(MBBI, RestoreExecBB);
807 MF->insert(MBBI, RemainderBB);
808
809 LoopBB->addSuccessor(BodyBB);
810 BodyBB->addSuccessor(RestoreExecBB);
811 BodyBB->addSuccessor(LoopBB);
812
813 // Move the rest of the block into a new block.
814 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
815 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
816
817 MBB.addSuccessor(LoopBB);
818 RestoreExecBB->addSuccessor(RemainderBB);
819
820 B.setInsertPt(*LoopBB, LoopBB->end());
821
822 B.buildInstr(TargetOpcode::PHI)
823 .addDef(PhiExec)
824 .addReg(InitSaveExecReg)
825 .addMBB(&MBB)
826 .addReg(NewExec)
827 .addMBB(BodyBB);
828
829 const DebugLoc &DL = B.getDL();
830
831 MachineInstr &FirstInst = *Range.begin();
832
833 // Move the instruction into the loop body. Note we moved everything after
834 // Range.end() already into a new block, so Range.end() is no longer valid.
835 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
836
837 // Figure out the iterator range after splicing the instructions.
838 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
839 auto NewEnd = BodyBB->end();
840
841 B.setMBB(*LoopBB);
842
843 LLT S1 = LLT::scalar(1);
844 Register CondReg;
845
846 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
847
848 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
849 for (MachineOperand &Op : MI.uses()) {
850 if (!Op.isReg() || Op.isDef())
851 continue;
852
853 Register OldReg = Op.getReg();
854 if (!SGPROperandRegs.count(OldReg))
855 continue;
856
857 // See if we already processed this register in another instruction in the
858 // sequence.
859 auto OldVal = WaterfalledRegMap.find(OldReg);
860 if (OldVal != WaterfalledRegMap.end()) {
861 Op.setReg(OldVal->second);
862 continue;
863 }
864
865 Register OpReg = Op.getReg();
866 LLT OpTy = MRI.getType(OpReg);
867
868 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
869 if (OpBank != &AMDGPU::VGPRRegBank) {
870 // Insert copy from AGPR to VGPR before the loop.
871 B.setMBB(MBB);
872 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
873 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
874 B.setMBB(*LoopBB);
875 }
876
877 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
878
879 // Build the comparison(s).
880 unsigned OpSize = OpTy.getSizeInBits();
881 bool Is64 = OpSize % 64 == 0;
882 unsigned PartSize = Is64 ? 64 : 32;
883 LLT PartTy = LLT::scalar(PartSize);
884 unsigned NumParts = OpSize / PartSize;
885 SmallVector<Register, 8> OpParts;
886 SmallVector<Register, 8> CurrentLaneParts;
887
888 if (NumParts == 1) {
889 OpParts.push_back(OpReg);
890 CurrentLaneParts.push_back(CurrentLaneReg);
891 } else {
892 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
893 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
894 for (unsigned i = 0; i < NumParts; ++i) {
895 OpParts.push_back(UnmergeOp.getReg(i));
896 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
897 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
898 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
899 }
900 }
901
902 for (unsigned i = 0; i < NumParts; ++i) {
903 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
904 OpParts[i]).getReg(0);
905 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
906
907 if (!CondReg) {
908 CondReg = CmpReg;
909 } else {
910 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
911 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
912 }
913 }
914
915 Op.setReg(CurrentLaneReg);
916
917 // Make sure we don't re-process this register again.
918 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
919 }
920 }
921
922 // The ballot becomes a no-op during instruction selection.
923 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
924 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
925 false)
926 .addReg(CondReg)
927 .getReg(0);
928 MRI.setRegClass(CondReg, WaveRC);
929
930 // Update EXEC, save the original EXEC value to VCC.
931 B.buildInstr(AndSaveExecOpc)
932 .addDef(NewExec)
933 .addReg(CondReg, RegState::Kill);
934
935 MRI.setSimpleHint(NewExec, CondReg);
936
937 B.setInsertPt(*BodyBB, BodyBB->end());
938
939 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
940 B.buildInstr(XorTermOpc)
941 .addDef(ExecReg)
942 .addReg(ExecReg)
943 .addReg(NewExec);
944
945 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
946 // s_cbranch_scc0?
947
948 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
949 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
950
951 // Save the EXEC mask before the loop.
952 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
953 .addReg(ExecReg);
954
955 // Restore the EXEC mask after the loop.
956 B.setMBB(*RestoreExecBB);
957 B.buildInstr(MovExecTermOpc)
958 .addDef(ExecReg)
959 .addReg(SaveExecReg);
960
961 // Set the insert point after the original instruction, so any new
962 // instructions will be in the remainder.
963 B.setInsertPt(*RemainderBB, RemainderBB->begin());
964
965 return true;
966}
967
968// Return any unique registers used by \p MI at \p OpIndices that need to be
969// handled in a waterfall loop. Returns these registers in \p
970// SGPROperandRegs. Returns true if there are any operands to handle and a
971// waterfall loop is necessary.
972bool AMDGPURegisterBankInfo::collectWaterfallOperands(
973 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
974 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
975 for (unsigned Op : OpIndices) {
976 assert(MI.getOperand(Op).isUse());
977 Register Reg = MI.getOperand(Op).getReg();
978 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
979 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
980 SGPROperandRegs.insert(Reg);
981 }
982
983 // No operands need to be replaced, so no need to loop.
984 return !SGPROperandRegs.empty();
985}
986
987bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
988 MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
989 ArrayRef<unsigned> OpIndices) const {
990 // Use a set to avoid extra readfirstlanes in the case where multiple operands
991 // are the same register.
992 SmallSet<Register, 4> SGPROperandRegs;
993
994 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
995 return false;
996
997 MachineBasicBlock::iterator I = MI.getIterator();
998 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
999 SGPROperandRegs, MRI);
1000}
1001
1002bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1003 MachineInstr &MI, MachineRegisterInfo &MRI,
1004 ArrayRef<unsigned> OpIndices) const {
1005 MachineIRBuilder B(MI);
1006 return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1007}
1008
1009// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1010void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1011 MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1012 Register Reg = MI.getOperand(OpIdx).getReg();
1013 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1014 if (Bank == &AMDGPU::SGPRRegBank)
1015 return;
1016
1017 MachineIRBuilder B(MI);
1018
1019 Reg = buildReadFirstLane(B, MRI, Reg);
1020 MI.getOperand(OpIdx).setReg(Reg);
1021}
1022
1023/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1024/// rest will be in the remainder.
1025static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1026 unsigned TotalSize = Ty.getSizeInBits();
1027 if (!Ty.isVector())
1028 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1029
1030 LLT EltTy = Ty.getElementType();
1031 unsigned EltSize = EltTy.getSizeInBits();
1032 assert(FirstSize % EltSize == 0);
1033
1034 unsigned FirstPartNumElts = FirstSize / EltSize;
1035 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1036
1037 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1038 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1039}
1040
1041static LLT widen96To128(LLT Ty) {
1042 if (!Ty.isVector())
1043 return LLT::scalar(128);
1044
1045 LLT EltTy = Ty.getElementType();
1046 assert(128 % EltTy.getSizeInBits() == 0);
1047 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1048}
1049
1050bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1051 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1052 MachineRegisterInfo &MRI) const {
1053 Register DstReg = MI.getOperand(0).getReg();
1054 const LLT LoadTy = MRI.getType(DstReg);
1055 unsigned LoadSize = LoadTy.getSizeInBits();
1056 const unsigned MaxNonSmrdLoadSize = 128;
1057
1058 const RegisterBank *DstBank =
1059 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1060 if (DstBank == &AMDGPU::SGPRRegBank) {
1061 // There are some special cases that we need to look at for 32 bit and 96
1062 // bit SGPR loads; otherwise we have nothing to do.
1063 if (LoadSize != 32 && LoadSize != 96)
1064 return false;
1065
1066 MachineMemOperand *MMO = *MI.memoperands_begin();
1067 const unsigned MemSize = 8 * MMO->getSize();
1068 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1069 // 32 bit. Check to see if we need to widen the memory access: 8- or 16-bit
1070 // scalar loads should have a load size of 32 but a memory access size of less
1071 // than 32.
1072 if (LoadSize == 32 &&
1073 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1074 return false;
1075
1076 Register PtrReg = MI.getOperand(1).getReg();
1077
1078 ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1079 MachineIRBuilder B(MI, O);
1080
1081 if (LoadSize == 32) {
1082 // This is an extending load from a sub-dword size. Widen the memory
1083 // access size to 4 bytes and clear the extra high bits appropriately
1084 const LLT S32 = LLT::scalar(32);
1085 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1086 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1087 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1088 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1089 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1090 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1091 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1092 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1093 } else
1094 // We do not need to touch the higher bits for regular loads.
1095 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1096 } else {
1097 // 96-bit loads are only available for vector loads. We need to split this
1098 // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1099 if (MMO->getAlign() < Align(16)) {
1100 MachineFunction *MF = MI.getParent()->getParent();
1101 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1102 MachineIRBuilder B(MI, ApplyBank);
1103 LegalizerHelper Helper(*MF, ApplyBank, B);
1104 LLT Part64, Part32;
1105 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1106 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1107 LegalizerHelper::Legalized)
1108 return false;
1109 return true;
1110 } else {
1111 LLT WiderTy = widen96To128(LoadTy);
1112 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1113 if (WiderTy.isScalar())
1114 B.buildTrunc(MI.getOperand(0), WideLoad);
1115 else {
1116 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1117 WideLoad);
1118 }
1119 }
1120 }
1121
1122 MI.eraseFromParent();
1123 return true;
1124 }
1125
1126 // 128-bit loads are supported for all instruction types.
1127 if (LoadSize <= MaxNonSmrdLoadSize)
1128 return false;
1129
1130 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1131 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1132
1133 if (SrcRegs.empty())
1134 SrcRegs.push_back(MI.getOperand(1).getReg());
1135
1136 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1137
1138 // RegBankSelect only emits scalar types, so we need to reset the pointer
1139 // operand to a pointer type.
1140 Register BasePtrReg = SrcRegs[0];
1141 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1142 MRI.setType(BasePtrReg, PtrTy);
1143
1144 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1145 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1146 ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1147 MachineIRBuilder B(MI, Observer);
1148 LegalizerHelper Helper(B.getMF(), Observer, B);
1149
1150 if (LoadTy.isVector()) {
1151 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1152 return false;
1153 } else {
1154 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1155 return false;
1156 }
1157
1158 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1159 return true;
1160}
1161
1165 MachineRegisterInfo &MRI) const {
1166 const MachineFunction &MF = *MI.getMF();
1167 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1168 const auto &TFI = *ST.getFrameLowering();
1169
1170 // Guard in case the stack growth direction ever changes with scratch
1171 // instructions.
1172 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1173 return false;
1174
1175 Register Dst = MI.getOperand(0).getReg();
1176 Register AllocSize = MI.getOperand(1).getReg();
1177 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1178
1179 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1180
1181 // TODO: Need to emit a wave reduction to get the maximum size.
1182 if (SizeBank != &AMDGPU::SGPRRegBank)
1183 return false;
1184
1185 LLT PtrTy = MRI.getType(Dst);
1186 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1187
1188 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1189 Register SPReg = Info->getStackPtrOffsetReg();
1190 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1191 MachineIRBuilder B(MI, ApplyBank);
1192
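 // The stack pointer counts scratch bytes for the whole wave, so the uniform
 // per-lane allocation size is scaled up by the wave size before being added.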
1193 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1194 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1195
1196 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1197 if (Alignment > TFI.getStackAlign()) {
1198 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1199 B.buildMaskLowPtrBits(Dst, PtrAdd,
1200 Log2(Alignment) + ST.getWavefrontSizeLog2());
1201 } else {
1202 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1203 }
1204
1205 MI.eraseFromParent();
1206 return true;
1207}
1208
1211 MachineRegisterInfo &MRI, int RsrcIdx) const {
1212 const int NumDefs = MI.getNumExplicitDefs();
1213
1214 // The reported argument index is relative to the IR intrinsic call arguments,
1215 // so we need to shift by the number of defs and the intrinsic ID.
1216 RsrcIdx += NumDefs + 1;
1217
1218 // Insert copies to VGPR arguments.
1219 applyDefaultMapping(OpdMapper);
1220
1221 // Fixup any SGPR arguments.
1222 SmallVector<unsigned, 4> SGPRIndexes;
1223 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1224 if (!MI.getOperand(I).isReg())
1225 continue;
1226
1227 // If this intrinsic has a sampler, it immediately follows rsrc.
1228 if (I == RsrcIdx || I == RsrcIdx + 1)
1229 SGPRIndexes.push_back(I);
1230 }
1231
1232 executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1233 return true;
1234}
1235
1236// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1237// the three offsets (voffset, soffset and instoffset)
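// For example, a fully constant combined offset is folded into soffset plus
// the immediate instoffset with voffset tied to 0 (the constant case below),
// while a VGPR base with a constant remainder becomes voffset + instoffset.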
1238unsigned AMDGPURegisterBankInfo::setBufferOffsets(
1239 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1240 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1241 const LLT S32 = LLT::scalar(32);
1242 MachineRegisterInfo *MRI = B.getMRI();
1243
1244 if (std::optional<int64_t> Imm =
1245 getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1246 uint32_t SOffset, ImmOffset;
1247 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1248 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1249 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1250 InstOffsetVal = ImmOffset;
1251
1252 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1253 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1254 return SOffset + ImmOffset;
1255 }
1256 }
1257
1258 Register Base;
1259 unsigned Offset;
1260
1261 std::tie(Base, Offset) =
1262 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1263
1264 uint32_t SOffset, ImmOffset;
1265 if ((int)Offset > 0 &&
1266 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1267 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1268 VOffsetReg = Base;
1269 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1270 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1271 InstOffsetVal = ImmOffset;
1272 return 0; // XXX - Why is this 0?
1273 }
1274
1275 // If we have SGPR base, we can use it for soffset.
1276 if (SOffset == 0) {
1277 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1278 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1279 SOffsetReg = Base;
1280 InstOffsetVal = ImmOffset;
1281 return 0; // XXX - Why is this 0?
1282 }
1283 }
1284
1285 // Handle the variable sgpr + vgpr case.
1286 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1287 if (Add && (int)Offset >= 0) {
1288 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1289 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1290
1291 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1292 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1293
1294 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1295 VOffsetReg = Src0;
1296 SOffsetReg = Src1;
1297 return 0;
1298 }
1299
1300 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1301 VOffsetReg = Src1;
1302 SOffsetReg = Src0;
1303 return 0;
1304 }
1305 }
1306
1307 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1308 // have an SGPR offset and a VGPR resource.
1309 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1310 VOffsetReg = CombinedOffset;
1311 } else {
1312 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1313 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1314 }
1315
1316 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1317 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1318 return 0;
1319}
1320
1321bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1322 const OperandsMapper &OpdMapper) const {
1323 MachineInstr &MI = OpdMapper.getMI();
1324 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1325
1326 const LLT S32 = LLT::scalar(32);
1327 Register Dst = MI.getOperand(0).getReg();
1328 LLT Ty = MRI.getType(Dst);
1329
1330 const RegisterBank *RSrcBank =
1331 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1332 const RegisterBank *OffsetBank =
1333 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1334 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1335 OffsetBank == &AMDGPU::SGPRRegBank)
1336 return true; // Legal mapping
1337
1338 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1339 // here but don't have an MMO.
1340
1341 unsigned LoadSize = Ty.getSizeInBits();
1342 int NumLoads = 1;
1343 if (LoadSize == 256 || LoadSize == 512) {
1344 NumLoads = LoadSize / 128;
1345 Ty = Ty.divide(NumLoads);
1346 }
1347
1348 // Use the alignment to ensure that the required offsets will fit into the
1349 // immediate offsets.
1350 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1351
1353 MachineFunction &MF = B.getMF();
1354
1355 Register SOffset;
1356 Register VOffset;
1357 int64_t ImmOffset = 0;
1358
1359 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1360 SOffset, ImmOffset, Alignment);
1361
1362 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1363 // can, but we need to track an MMO for that.
1364 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1365 const Align MemAlign(4); // FIXME: ABI type alignment?
1370 MemSize, MemAlign);
1371 if (MMOOffset != 0)
1372 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1373
1374 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1375 // assume that the buffer is unswizzled.
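 // The G_AMDGPU_BUFFER_LOAD built below takes the offset in a VGPR; if the
 // resource also turned out to be divergent, the loads are additionally
 // wrapped in a waterfall loop further down.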
1376
1377 Register RSrc = MI.getOperand(1).getReg();
1378 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1379 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1380
1381 SmallVector<Register, 4> LoadParts(NumLoads);
1382
1383 MachineBasicBlock::iterator MII = MI.getIterator();
1384 MachineInstrSpan Span(MII, &B.getMBB());
1385
1386 for (int i = 0; i < NumLoads; ++i) {
1387 if (NumLoads == 1) {
1388 LoadParts[i] = Dst;
1389 } else {
1390 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1391 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1392 }
1393
1394 MachineMemOperand *MMO = BaseMMO;
1395 if (i != 0)
1396 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1397
1398 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1399 .addDef(LoadParts[i]) // vdata
1400 .addUse(RSrc) // rsrc
1401 .addUse(VIndex) // vindex
1402 .addUse(VOffset) // voffset
1403 .addUse(SOffset) // soffset
1404 .addImm(ImmOffset + 16 * i) // offset(imm)
1405 .addImm(0) // cachepolicy, swizzled buffer(imm)
1406 .addImm(0) // idxen(imm)
1407 .addMemOperand(MMO);
1408 }
1409
1410 // TODO: If only the resource is a VGPR, it may be better to execute the
1411 // scalar load in the waterfall loop if the resource is expected to frequently
1412 // be dynamically uniform.
1413 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1414 // Remove the original instruction to avoid potentially confusing the
1415 // waterfall loop logic.
1416 B.setInstr(*Span.begin());
1417 MI.eraseFromParent();
1418
1419 SmallSet<Register, 4> OpsToWaterfall;
1420
1421 OpsToWaterfall.insert(RSrc);
1422 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1423 OpsToWaterfall, MRI);
1424 }
1425
1426 if (NumLoads != 1) {
1427 if (Ty.isVector())
1428 B.buildConcatVectors(Dst, LoadParts);
1429 else
1430 B.buildMergeLikeInstr(Dst, LoadParts);
1431 }
1432
1433 // We removed the instruction earlier with a waterfall loop.
1434 if (RSrcBank == &AMDGPU::SGPRRegBank)
1435 MI.eraseFromParent();
1436
1437 return true;
1438}
1439
1440bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1441 bool Signed) const {
1442 MachineInstr &MI = OpdMapper.getMI();
1443 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1444
1445 // Insert basic copies
1446 applyDefaultMapping(OpdMapper);
1447
1448 Register DstReg = MI.getOperand(0).getReg();
1449 LLT Ty = MRI.getType(DstReg);
1450
1451 const LLT S32 = LLT::scalar(32);
1452
1453 unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1454 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1455 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1456 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1457
1458 const RegisterBank *DstBank =
1459 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1460 if (DstBank == &AMDGPU::VGPRRegBank) {
1461 if (Ty == S32)
1462 return true;
1463
1464 // There is no 64-bit vgpr bitfield extract instruction, so the operation
1465 // is expanded to a sequence of instructions that implement the operation.
1466 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1467 MachineIRBuilder B(MI, ApplyBank);
1468
1469 const LLT S64 = LLT::scalar(64);
1470 // Shift the source operand so that extracted bits start at bit 0.
1471 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1472 : B.buildLShr(S64, SrcReg, OffsetReg);
1473 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1474
1475 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1476 // if the width is a constant.
1477 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1478 // Use the 32-bit bitfield extract instruction if the width is a constant.
1479 // Depending on the width size, use either the low or high 32-bits.
1480 auto Zero = B.buildConstant(S32, 0);
1481 auto WidthImm = ConstWidth->Value.getZExtValue();
1482 if (WidthImm <= 32) {
1483 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1484 // or clear the upper 32-bits.
1485 auto Extract =
1486 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1487 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1488 auto Extend =
1489 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1490 B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1491 } else {
1492 // Use bitfield extract on upper 32-bit source, and combine with lower
1493 // 32-bit source.
1494 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1495 auto Extract =
1496 Signed
1497 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1498 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1499 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1500 }
1501 MI.eraseFromParent();
1502 return true;
1503 }
1504
1505 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1506 // operations.
1507 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1508 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1509 if (Signed)
1510 B.buildAShr(S64, SignBit, ExtShift);
1511 else
1512 B.buildLShr(S64, SignBit, ExtShift);
1513 MI.eraseFromParent();
1514 return true;
1515 }
1516
1517 // The scalar form packs the offset and width in a single operand.
1518
1519 ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1520 MachineIRBuilder B(MI, ApplyBank);
1521
1522 // Ensure the high bits are clear to insert the offset.
1523 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1524 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1525
1526 // Zeros out the low bits, so don't bother clamping the input value.
1527 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1528
1529 // Transformation function, pack the offset and width of a BFE into
1530 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1531 // source, bits [5:0] contain the offset and bits [22:16] the width.
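 // For example, offset = 8 and width = 16 are packed as (16 << 16) | 8,
 // i.e. 0x100008.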
1532 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1533
1534 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1535 // register class constraints.
1536 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1537 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1538
1539 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1540 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1541 llvm_unreachable("failed to constrain BFE");
1542
1543 MI.eraseFromParent();
1544 return true;
1545}
1546
1547bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
1548 const OperandsMapper &OpdMapper) const {
1549 MachineInstr &MI = OpdMapper.getMI();
1550 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1551
1552 // Insert basic copies.
1553 applyDefaultMapping(OpdMapper);
1554
1555 Register Dst0 = MI.getOperand(0).getReg();
1556 Register Dst1 = MI.getOperand(1).getReg();
1557 Register Src0 = MI.getOperand(2).getReg();
1558 Register Src1 = MI.getOperand(3).getReg();
1559 Register Src2 = MI.getOperand(4).getReg();
1560
1561 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1562 return true;
1563
1564 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1565 LLT S1 = LLT::scalar(1);
1566 LLT S32 = LLT::scalar(32);
1567
1568 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1569 bool Accumulate = true;
1570
1571 if (!DstOnValu) {
1572 if (mi_match(Src2, MRI, m_ZeroInt()))
1573 Accumulate = false;
1574 }
1575
1576 // Keep the multiplication on the SALU.
1577 MachineIRBuilder B(MI);
1578
1579 Register DstHi;
1580 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1581 bool MulHiInVgpr = false;
1582
1583 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1584
1585 if (Subtarget.hasSMulHi()) {
1586 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1587 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1588 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1589 } else {
1590 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1591 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1592
1593 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1594 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1595
1596 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1597 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1598 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1599
1600 if (!DstOnValu) {
1601 DstHi = buildReadFirstLane(B, MRI, DstHi);
1602 } else {
1603 MulHiInVgpr = true;
1604 }
1605 }
1606
1607 // Accumulate and produce the "carry-out" bit.
1608 //
1609 // The "carry-out" is defined as bit 64 of the result when computed as a
1610 // big integer. For unsigned multiply-add, this matches the usual definition
1611 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1612 // result, which is determined as:
1613 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
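 // (taken modulo 2, i.e. the XOR of those three bits, which is how the carry
 // is assembled below).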
1614 LLT CarryType = DstOnValu ? S1 : S32;
1615 const RegisterBank &CarryBank =
1616 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1617 const RegisterBank &DstBank =
1618 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1619 Register Carry;
1620 Register Zero;
1621
1622 if (!IsUnsigned) {
1623 Zero = B.buildConstant(S32, 0).getReg(0);
1624 MRI.setRegBank(Zero,
1625 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1626
1627 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1628 .getReg(0);
1629 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1630 : AMDGPU::SGPRRegBank);
1631
1632 if (DstOnValu && !MulHiInVgpr) {
1633 Carry = B.buildTrunc(S1, Carry).getReg(0);
1634 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1635 }
1636 }
1637
1638 if (Accumulate) {
1639 if (DstOnValu) {
1640 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1641 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1642 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1643 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1644 }
1645
1646 auto Unmerge = B.buildUnmerge(S32, Src2);
1647 Register Src2Lo = Unmerge.getReg(0);
1648 Register Src2Hi = Unmerge.getReg(1);
1649 MRI.setRegBank(Src2Lo, DstBank);
1650 MRI.setRegBank(Src2Hi, DstBank);
1651
1652 if (!IsUnsigned) {
1653 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1654 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1655
1656 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1657 MRI.setRegBank(Carry, CarryBank);
1658 }
1659
1660 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1661 DstLo = AddLo.getReg(0);
1662 Register CarryLo = AddLo.getReg(1);
1663 MRI.setRegBank(DstLo, DstBank);
1664 MRI.setRegBank(CarryLo, CarryBank);
1665
1666 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1667 DstHi = AddHi.getReg(0);
1668 MRI.setRegBank(DstHi, DstBank);
1669
1670 Register CarryHi = AddHi.getReg(1);
1671 MRI.setRegBank(CarryHi, CarryBank);
1672
1673 if (IsUnsigned) {
1674 Carry = CarryHi;
1675 } else {
1676 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1677 MRI.setRegBank(Carry, CarryBank);
1678 }
1679 } else {
1680 if (IsUnsigned) {
1681 Carry = B.buildConstant(CarryType, 0).getReg(0);
1682 MRI.setRegBank(Carry, CarryBank);
1683 }
1684 }
1685
1686 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1687
1688 if (DstOnValu) {
1689 B.buildCopy(Dst1, Carry);
1690 } else {
1691 B.buildTrunc(Dst1, Carry);
1692 }
1693
1694 MI.eraseFromParent();
1695 return true;
1696}
1697
1698// Return a suitable opcode for extending the operands of Opc when widening.
1699static unsigned getExtendOp(unsigned Opc) {
1700 switch (Opc) {
1701 case TargetOpcode::G_ASHR:
1702 case TargetOpcode::G_SMIN:
1703 case TargetOpcode::G_SMAX:
1704 return TargetOpcode::G_SEXT;
1705 case TargetOpcode::G_LSHR:
1706 case TargetOpcode::G_UMIN:
1707 case TargetOpcode::G_UMAX:
1708 return TargetOpcode::G_ZEXT;
1709 default:
1710 return TargetOpcode::G_ANYEXT;
1711 }
1712}
1713
1714// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1715// any illegal vector extend or unmerge operations.
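// For example (assumed input value, for illustration only): a <2 x s16>
// holding 0xAAAA in element 0 and 0xBBBB in element 1 bitcasts to the s32
// 0xBBBBAAAA; with G_ZEXT this unpacks to (0x0000AAAA, 0x0000BBBB), while
// G_ANYEXT returns (0xBBBBAAAA, 0x0000BBBB) since the high bits of the low
// component are don't-care.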
1716static std::pair<Register, Register>
1717unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1718 const LLT S32 = LLT::scalar(32);
1719 auto Bitcast = B.buildBitcast(S32, Src);
1720
1721 if (ExtOpcode == TargetOpcode::G_SEXT) {
1722 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1723 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1724 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1725 }
1726
1727 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1728 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1729 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1730 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1731 }
1732
1733 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1734 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1735}
1736
1737 // For cases where only a single copy is inserted for matching register banks,
1738 // replace the register in the instruction operand.
1739 static bool substituteSimpleCopyRegs(
1740 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1741 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1742 if (!SrcReg.empty()) {
1743 assert(SrcReg.size() == 1);
1744 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1745 return true;
1746 }
1747
1748 return false;
1749}
1750
1751/// Handle register layout difference for f16 images for some subtargets.
1752 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1753 MachineRegisterInfo &MRI,
1754 Register Reg) const {
1755 if (!Subtarget.hasUnpackedD16VMem())
1756 return Reg;
1757
1758 const LLT S16 = LLT::scalar(16);
1759 LLT StoreVT = MRI.getType(Reg);
1760 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1761 return Reg;
1762
1763 auto Unmerge = B.buildUnmerge(S16, Reg);
1764
1765
1766 SmallVector<Register, 4> WideRegs;
1767 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1768 WideRegs.push_back(Unmerge.getReg(I));
1769
1770 const LLT S32 = LLT::scalar(32);
1771 int NumElts = StoreVT.getNumElements();
1772
1773 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1774 .getReg(0);
1775}
1776
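// The helper below peels a constant addend off a register. For instance
// (hypothetical virtual registers, for illustration): %off:_(s32) = G_ADD
// %base, 16 decomposes to {%base, 16}, a bare G_CONSTANT 16 becomes
// {Register(), 16}, and anything else is returned unchanged as {Reg, 0}.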
1777static std::pair<Register, unsigned>
1778 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1779 int64_t Const;
1780 if (mi_match(Reg, MRI, m_ICst(Const)))
1781 return std::pair(Register(), Const);
1782
1783 Register Base;
1784 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1785 return std::pair(Base, Const);
1786
1787 // TODO: Handle G_OR used for add case
1788 return std::pair(Reg, 0);
1789}
1790
1791std::pair<Register, unsigned>
1792 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1793 Register OrigOffset) const {
1794 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
1795 Register BaseReg;
1796 unsigned ImmOffset;
1797 const LLT S32 = LLT::scalar(32);
1798
1799 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1800 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1801 OrigOffset);
1802
1803 unsigned C1 = 0;
1804 if (ImmOffset != 0) {
1805 // If the immediate value is too big for the immoffset field, put only bits
1806 // that would normally fit in the immoffset field. The remaining value that
1807 // is copied/added for the voffset field is a large power of 2, and it
1808 // stands more chance of being CSEd with the copy/add for another similar
1809 // load/store.
1810 // However, do not do that rounding down if the offset is a negative
1811 // number, as it appears to be illegal to have a negative offset in the
1812 // vgpr, even if adding the immediate offset makes it positive.
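// As a worked example, assuming a subtarget where getMaxMUBUFImmOffset()
// is 4095: an incoming offset of 8200 splits into Overflow = 8192 for the
// voffset register (a CSE-friendly constant) and ImmOffset = 8 for the
// immediate field; a negative incoming offset ends up entirely in the
// voffset register with an immediate of 0.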
1813 unsigned Overflow = ImmOffset & ~MaxImm;
1814 ImmOffset -= Overflow;
1815 if ((int32_t)Overflow < 0) {
1816 Overflow += ImmOffset;
1817 ImmOffset = 0;
1818 }
1819
1820 C1 = ImmOffset;
1821 if (Overflow != 0) {
1822 if (!BaseReg)
1823 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1824 else {
1825 auto OverflowVal = B.buildConstant(S32, Overflow);
1826 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1827 }
1828 }
1829 }
1830
1831 if (!BaseReg)
1832 BaseReg = B.buildConstant(S32, 0).getReg(0);
1833
1834 return {BaseReg, C1};
1835}
1836
1837 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1838 Register SrcReg) const {
1839 MachineRegisterInfo &MRI = *B.getMRI();
1840 LLT SrcTy = MRI.getType(SrcReg);
1841 if (SrcTy.getSizeInBits() == 32) {
1842 // Use a v_mov_b32 here to make the exec dependency explicit.
1843 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1844 .addDef(DstReg)
1845 .addUse(SrcReg);
1846 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1847 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1848 }
1849
1850 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1851 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1852
1853 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1854 .addDef(TmpReg0)
1855 .addUse(SrcReg, 0, AMDGPU::sub0);
1856 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1857 .addDef(TmpReg1)
1858 .addUse(SrcReg, 0, AMDGPU::sub1);
1859 B.buildInstr(AMDGPU::REG_SEQUENCE)
1860 .addDef(DstReg)
1861 .addUse(TmpReg0)
1862 .addImm(AMDGPU::sub0)
1863 .addUse(TmpReg1)
1864 .addImm(AMDGPU::sub1);
1865
1866 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1867 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1868}
1869
1870/// Utility function for pushing dynamic vector indexes with a constant offset
1871/// into waterfall loops.
1872 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1873 MachineInstr &IdxUseInstr,
1874 unsigned OpIdx,
1875 unsigned ConstOffset) {
1876 MachineRegisterInfo &MRI = *B.getMRI();
1877 const LLT S32 = LLT::scalar(32);
1878 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1879 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1880
1881 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1882
1883 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1884 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1885 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1886 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1887}
1888
1889/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1890/// original 32-bit source value (to be inserted in the low part of the combined
1891/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1892/// value.
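/// For example (illustrative only): sign-extending a Lo32Reg holding
/// 0x80000000 produces Hi32Reg = ashr(0x80000000, 31) = 0xFFFFFFFF, while a
/// zero-extension simply materializes 0 in the high half.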
1893 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1894 Register Hi32Reg, Register Lo32Reg,
1895 unsigned ExtOpc,
1896 const RegisterBank &RegBank,
1897 bool IsBooleanSrc = false) {
1898 if (ExtOpc == AMDGPU::G_ZEXT) {
1899 B.buildConstant(Hi32Reg, 0);
1900 } else if (ExtOpc == AMDGPU::G_SEXT) {
1901 if (IsBooleanSrc) {
1902 // If we know the original source was an s1, the high half is the same as
1903 // the low.
1904 B.buildCopy(Hi32Reg, Lo32Reg);
1905 } else {
1906 // Replicate sign bit from 32-bit extended part.
1907 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1908 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1909 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1910 }
1911 } else {
1912 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1913 B.buildUndef(Hi32Reg);
1914 }
1915}
1916
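// Fold a dynamic G_EXTRACT_VECTOR_ELT into a compare/select chain. Roughly,
// for a 4-element vector this produces (a sketch in pseudo generic MIR, not
// the exact output):
//   %res = %vec[0]
//   %res = G_SELECT (G_ICMP eq %idx, 1), %vec[1], %res
//   %res = G_SELECT (G_ICMP eq %idx, 2), %vec[2], %res
//   %res = G_SELECT (G_ICMP eq %idx, 3), %vec[3], %res
// which avoids a waterfall loop when the vector is small enough.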
1917bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1918 MachineInstr &MI, MachineRegisterInfo &MRI,
1919 const OperandsMapper &OpdMapper) const {
1920
1921 Register VecReg = MI.getOperand(1).getReg();
1922 Register Idx = MI.getOperand(2).getReg();
1923
1924 const RegisterBank &IdxBank =
1925 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1926
1927 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1928
1929 LLT VecTy = MRI.getType(VecReg);
1930 unsigned EltSize = VecTy.getScalarSizeInBits();
1931 unsigned NumElem = VecTy.getNumElements();
1932
1933 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1934 IsDivergentIdx, &Subtarget))
1935 return false;
1936
1937 MachineIRBuilder B(MI);
1938 LLT S32 = LLT::scalar(32);
1939
1940 const RegisterBank &DstBank =
1941 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1942 const RegisterBank &SrcBank =
1943 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1944
1945 const RegisterBank &CCBank =
1946 (DstBank == AMDGPU::SGPRRegBank &&
1947 SrcBank == AMDGPU::SGPRRegBank &&
1948 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1949 : AMDGPU::VCCRegBank;
1950 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1951
1952 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1953 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1954 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1955 }
1956
1957 LLT EltTy = VecTy.getScalarType();
1958 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1959 unsigned NumLanes = DstRegs.size();
1960 if (!NumLanes)
1961 NumLanes = 1;
1962 else
1963 EltTy = MRI.getType(DstRegs[0]);
1964
1965 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1966 SmallVector<Register, 2> Res(NumLanes);
1967 for (unsigned L = 0; L < NumLanes; ++L)
1968 Res[L] = UnmergeToEltTy.getReg(L);
1969
1970 for (unsigned I = 1; I < NumElem; ++I) {
1971 auto IC = B.buildConstant(S32, I);
1972 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1973 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1974 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1975
1976 for (unsigned L = 0; L < NumLanes; ++L) {
1977 auto S = B.buildSelect(EltTy, Cmp,
1978 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1979
1980 for (unsigned N : { 0, 2, 3 })
1981 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1982
1983 Res[L] = S->getOperand(0).getReg();
1984 }
1985 }
1986
1987 for (unsigned L = 0; L < NumLanes; ++L) {
1988 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1989 B.buildCopy(DstReg, Res[L]);
1990 MRI.setRegBank(DstReg, DstBank);
1991 }
1992
1993 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1994 MI.eraseFromParent();
1995
1996 return true;
1997}
1998
1999// Insert a cross regbank copy for a register if it already has a bank that
2000// differs from the one we want to set.
2001 static Register constrainRegToBank(MachineRegisterInfo &MRI,
2002 MachineIRBuilder &B, Register &Reg,
2003 const RegisterBank &Bank) {
2004 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2005 if (CurrBank && *CurrBank != Bank) {
2006 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2007 MRI.setRegBank(Copy, Bank);
2008 return Copy;
2009 }
2010
2011 MRI.setRegBank(Reg, Bank);
2012 return Reg;
2013}
2014
2015bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2016 MachineInstr &MI, MachineRegisterInfo &MRI,
2017 const OperandsMapper &OpdMapper) const {
2018
2019 Register VecReg = MI.getOperand(1).getReg();
2020 Register Idx = MI.getOperand(3).getReg();
2021
2022 const RegisterBank &IdxBank =
2023 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2024
2025 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2026
2027 LLT VecTy = MRI.getType(VecReg);
2028 unsigned EltSize = VecTy.getScalarSizeInBits();
2029 unsigned NumElem = VecTy.getNumElements();
2030
2031 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2032 IsDivergentIdx, &Subtarget))
2033 return false;
2034
2036 LLT S32 = LLT::scalar(32);
2037
2038 const RegisterBank &DstBank =
2039 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2040 const RegisterBank &SrcBank =
2041 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2042 const RegisterBank &InsBank =
2043 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2044
2045 const RegisterBank &CCBank =
2046 (DstBank == AMDGPU::SGPRRegBank &&
2047 SrcBank == AMDGPU::SGPRRegBank &&
2048 InsBank == AMDGPU::SGPRRegBank &&
2049 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2050 : AMDGPU::VCCRegBank;
2051 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2052
2053 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2054 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2055 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2056 }
2057
2058 LLT EltTy = VecTy.getScalarType();
2059 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2060 unsigned NumLanes = InsRegs.size();
2061 if (!NumLanes) {
2062 NumLanes = 1;
2063 InsRegs.push_back(MI.getOperand(2).getReg());
2064 } else {
2065 EltTy = MRI.getType(InsRegs[0]);
2066 }
2067
2068 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2069 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2070
2071 for (unsigned I = 0; I < NumElem; ++I) {
2072 auto IC = B.buildConstant(S32, I);
2073 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2074 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2075 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2076
2077 for (unsigned L = 0; L < NumLanes; ++L) {
2078 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2079 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2080 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2081
2082 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2083 MRI.setRegBank(Select, DstBank);
2084
2085 Ops[I * NumLanes + L] = Select;
2086 }
2087 }
2088
2089 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2090 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2091 B.buildBuildVector(MI.getOperand(0), Ops);
2092 } else {
2093 auto Vec = B.buildBuildVector(MergeTy, Ops);
2094 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2095 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2096 }
2097
2098 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2099 MI.eraseFromParent();
2100
2101 return true;
2102}
2103
2104 void AMDGPURegisterBankInfo::applyMappingImpl(
2105 const OperandsMapper &OpdMapper) const {
2106 MachineInstr &MI = OpdMapper.getMI();
2107 unsigned Opc = MI.getOpcode();
2108 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2109 switch (Opc) {
2110 case AMDGPU::G_CONSTANT:
2111 case AMDGPU::G_IMPLICIT_DEF: {
2112 Register DstReg = MI.getOperand(0).getReg();
2113 LLT DstTy = MRI.getType(DstReg);
2114 if (DstTy != LLT::scalar(1))
2115 break;
2116
2117 const RegisterBank *DstBank =
2118 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2119 if (DstBank == &AMDGPU::VCCRegBank)
2120 break;
2121 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2122 if (DefRegs.empty())
2123 DefRegs.push_back(DstReg);
2124
2125 MachineIRBuilder B(MI);
2126 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2127
2128 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2129 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2130
2131 MI.getOperand(0).setReg(NewDstReg);
2132 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2133 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2134 MI.getOperand(1).setCImm(
2135 ConstantInt::get(IntegerType::get(Ctx, 32), ConstVal));
2136 }
2137
2138 MRI.setRegBank(NewDstReg, *DstBank);
2139 B.buildTrunc(DefRegs[0], NewDstReg);
2140 return;
2141 }
2142 case AMDGPU::G_PHI: {
2143 Register DstReg = MI.getOperand(0).getReg();
2144 LLT DstTy = MRI.getType(DstReg);
2145 if (DstTy != LLT::scalar(1))
2146 break;
2147
2148 const LLT S32 = LLT::scalar(32);
2149 const RegisterBank *DstBank =
2150 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2151 if (DstBank == &AMDGPU::VCCRegBank) {
2152 applyDefaultMapping(OpdMapper);
2153 // The standard handling only considers the result register bank for
2154 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2155 // produce an invalid copy. We can only copy with some kind of compare to
2156 // get a vector boolean result. Insert a register bank copy that will be
2157 // correctly lowered to a compare.
2158 MachineIRBuilder B(*MI.getParent()->getParent());
2159
2160 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2161 Register SrcReg = MI.getOperand(I).getReg();
2162 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2163
2164 if (SrcBank != &AMDGPU::VCCRegBank) {
2165 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2166 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2167
2168 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2169 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2170 MI.getOperand(I).setReg(Copy.getReg(0));
2171 }
2172 }
2173
2174 return;
2175 }
2176
2177 // Phi handling is strange and only considers the bank of the destination.
2178 substituteSimpleCopyRegs(OpdMapper, 0);
2179
2180 // Promote SGPR/VGPR booleans to s32
2181 MachineFunction *MF = MI.getParent()->getParent();
2182 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2183 MachineIRBuilder B(MI, ApplyBank);
2184 LegalizerHelper Helper(*MF, ApplyBank, B);
2185
2186 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2187 llvm_unreachable("widen scalar should have succeeded");
2188
2189 return;
2190 }
2191 case AMDGPU::G_ICMP:
2192 case AMDGPU::G_UADDO:
2193 case AMDGPU::G_USUBO:
2194 case AMDGPU::G_UADDE:
2195 case AMDGPU::G_SADDE:
2196 case AMDGPU::G_USUBE:
2197 case AMDGPU::G_SSUBE: {
2198 unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2199 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2200
2201 const RegisterBank *DstBank =
2202 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2203 if (DstBank != &AMDGPU::SGPRRegBank)
2204 break;
2205
2206 const bool HasCarryIn = MI.getNumOperands() == 5;
2207
2208 // If this is a scalar compare, promote the result to s32, as the selection
2209 // will end up using a copy to a 32-bit vreg.
2210 const LLT S32 = LLT::scalar(32);
2211 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2212 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2213 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2214 MachineIRBuilder B(MI);
2215
2216 if (HasCarryIn) {
2217 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2218 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2219 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2220 MI.getOperand(4).setReg(NewSrcReg);
2221 }
2222
2223 MachineBasicBlock *MBB = MI.getParent();
2224 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2225
2226 // If we had a constrained VCC result register, a copy was inserted to VCC
2227 // from SGPR.
2228 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2229 if (DefRegs.empty())
2230 DefRegs.push_back(DstReg);
2231 B.buildTrunc(DefRegs[0], NewDstReg);
2232 return;
2233 }
2234 case AMDGPU::G_SELECT: {
2235 Register DstReg = MI.getOperand(0).getReg();
2236 LLT DstTy = MRI.getType(DstReg);
2237
2238 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2239 if (CondRegs.empty())
2240 CondRegs.push_back(MI.getOperand(1).getReg());
2241 else {
2242 assert(CondRegs.size() == 1);
2243 }
2244
2245 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2246 if (CondBank == &AMDGPU::SGPRRegBank) {
2247 MachineIRBuilder B(MI);
2248 const LLT S32 = LLT::scalar(32);
2249 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2250 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2251
2252 MI.getOperand(1).setReg(NewCondReg);
2253 B.buildZExt(NewCondReg, CondRegs[0]);
2254 }
2255
2256 if (DstTy.getSizeInBits() != 64)
2257 break;
2258
2258
2259 MachineIRBuilder B(MI);
2260 LLT HalfTy = getHalfSizedType(DstTy);
2261
2262 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2263 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2264 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2265
2266 // All inputs are SGPRs, nothing special to do.
2267 if (DefRegs.empty()) {
2268 assert(Src1Regs.empty() && Src2Regs.empty());
2269 break;
2270 }
2271
2272 if (Src1Regs.empty())
2273 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2274 else {
2275 setRegsToType(MRI, Src1Regs, HalfTy);
2276 }
2277
2278 if (Src2Regs.empty())
2279 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2280 else
2281 setRegsToType(MRI, Src2Regs, HalfTy);
2282
2283 setRegsToType(MRI, DefRegs, HalfTy);
2284
2285 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2286 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2287
2288 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2289 MI.eraseFromParent();
2290 return;
2291 }
2292 case AMDGPU::G_BRCOND: {
2293 Register CondReg = MI.getOperand(0).getReg();
2294 // FIXME: Should use legalizer helper, but should change bool ext type.
2295 const RegisterBank *CondBank =
2296 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2297
2298 if (CondBank == &AMDGPU::SGPRRegBank) {
2299 MachineIRBuilder B(MI);
2300 const LLT S32 = LLT::scalar(32);
2301 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2302 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2303
2304 MI.getOperand(0).setReg(NewCondReg);
2305 B.buildZExt(NewCondReg, CondReg);
2306 return;
2307 }
2308
2309 break;
2310 }
2311 case AMDGPU::G_AND:
2312 case AMDGPU::G_OR:
2313 case AMDGPU::G_XOR: {
2314 // A 64-bit bitwise AND/OR/XOR is only available on the SALU, so split it
2315 // into 2 32-bit ops if there is a VGPR input.
2316 Register DstReg = MI.getOperand(0).getReg();
2317 LLT DstTy = MRI.getType(DstReg);
2318
2319 if (DstTy.getSizeInBits() == 1) {
2320 const RegisterBank *DstBank =
2321 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2322 if (DstBank == &AMDGPU::VCCRegBank)
2323 break;
2324
2325 MachineFunction *MF = MI.getParent()->getParent();
2326 ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2327 MachineIRBuilder B(MI, ApplyBank);
2328 LegalizerHelper Helper(*MF, ApplyBank, B);
2329
2330 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2332 llvm_unreachable("widen scalar should have succeeded");
2333 return;
2334 }
2335
2336 if (DstTy.getSizeInBits() != 64)
2337 break;
2338
2339 LLT HalfTy = getHalfSizedType(DstTy);
2340 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2341 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2342 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2343
2344 // All inputs are SGPRs, nothing special to do.
2345 if (DefRegs.empty()) {
2346 assert(Src0Regs.empty() && Src1Regs.empty());
2347 break;
2348 }
2349
2350 assert(DefRegs.size() == 2);
2351 assert(Src0Regs.size() == Src1Regs.size() &&
2352 (Src0Regs.empty() || Src0Regs.size() == 2));
2353
2354 // Depending on where the source registers came from, the generic code may
2355 // have decided to split the inputs already or not. If not, we still need to
2356 // extract the values.
2357 MachineIRBuilder B(MI);
2358
2359 if (Src0Regs.empty())
2360 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2361 else
2362 setRegsToType(MRI, Src0Regs, HalfTy);
2363
2364 if (Src1Regs.empty())
2365 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2366 else
2367 setRegsToType(MRI, Src1Regs, HalfTy);
2368
2369 setRegsToType(MRI, DefRegs, HalfTy);
2370
2371 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2372 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2373
2374 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2375 MI.eraseFromParent();
2376 return;
2377 }
2378 case AMDGPU::G_ABS: {
2379 Register SrcReg = MI.getOperand(1).getReg();
2380 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2381
2382 // There is no VALU abs instruction so we need to replace it with a sub and
2383 // max combination.
2384 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2385 MachineFunction *MF = MI.getParent()->getParent();
2386 ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2387 MachineIRBuilder B(MI, Apply);
2388 LegalizerHelper Helper(*MF, Apply, B);
2389
2391 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2392 return;
2393 }
2394 [[fallthrough]];
2395 }
2396 case AMDGPU::G_ADD:
2397 case AMDGPU::G_SUB:
2398 case AMDGPU::G_MUL:
2399 case AMDGPU::G_SHL:
2400 case AMDGPU::G_LSHR:
2401 case AMDGPU::G_ASHR:
2402 case AMDGPU::G_SMIN:
2403 case AMDGPU::G_SMAX:
2404 case AMDGPU::G_UMIN:
2405 case AMDGPU::G_UMAX: {
2406 Register DstReg = MI.getOperand(0).getReg();
2407 LLT DstTy = MRI.getType(DstReg);
2408
2409 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2410 // Packed 16-bit operations need to be scalarized and promoted.
2411 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2412 break;
2413
2414 const RegisterBank *DstBank =
2415 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2416 if (DstBank == &AMDGPU::VGPRRegBank)
2417 break;
2418
2419 const LLT S32 = LLT::scalar(32);
2420 MachineBasicBlock *MBB = MI.getParent();
2421 MachineFunction *MF = MBB->getParent();
2422 ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2423 MachineIRBuilder B(MI, ApplySALU);
2424
2425 if (DstTy.isVector()) {
2426 Register WideSrc0Lo, WideSrc0Hi;
2427 Register WideSrc1Lo, WideSrc1Hi;
2428
2429 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2430 std::tie(WideSrc0Lo, WideSrc0Hi)
2431 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2432 std::tie(WideSrc1Lo, WideSrc1Hi)
2433 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2434 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2435 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2436 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2437 MI.eraseFromParent();
2438 } else {
2439 LegalizerHelper Helper(*MF, ApplySALU, B);
2440
2441 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2442 llvm_unreachable("widen scalar should have succeeded");
2443
2444 // FIXME: s16 shift amounts should be legal.
2445 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2446 Opc == AMDGPU::G_ASHR) {
2447 B.setInsertPt(*MBB, MI.getIterator());
2448 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2449 llvm_unreachable("widen scalar should have succeeded");
2450 }
2451 }
2452
2453 return;
2454 }
2455 case AMDGPU::G_SEXT_INREG: {
2456 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2457 if (SrcRegs.empty())
2458 break; // Nothing to repair
2459
2460 const LLT S32 = LLT::scalar(32);
2462 ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2463 GISelObserverWrapper Observer(&O);
2464 B.setChangeObserver(Observer);
2465
2466 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2467 // we would need to further expand, and doesn't let us directly set the
2468 // result registers.
2469 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2470
2471 int Amt = MI.getOperand(2).getImm();
2472 if (Amt <= 32) {
2473 // Downstream users have expectations for the high bit behavior, so freeze
2474 // incoming undefined bits.
2475 if (Amt == 32) {
2476 // The low bits are unchanged.
2477 B.buildFreeze(DstRegs[0], SrcRegs[0]);
2478 } else {
2479 auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2480 // Extend in the low bits and propagate the sign bit to the high half.
2481 B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2482 }
2483
2484 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2485 } else {
2486 // The low bits are unchanged, and extend in the high bits.
2487 // No freeze required
2488 B.buildCopy(DstRegs[0], SrcRegs[0]);
2489 B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2490 }
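// For instance (illustrative values): a G_SEXT_INREG of a 64-bit value by
// Amt = 8 becomes a 32-bit G_SEXT_INREG by 8 on the frozen low half plus an
// arithmetic shift right by 31 to replicate the sign into the high half;
// with Amt = 40 the low half is copied unchanged and the high half is
// sign-extended in-register by 40 - 32 = 8 bits.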
2491
2492 Register DstReg = MI.getOperand(0).getReg();
2493 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2494 MI.eraseFromParent();
2495 return;
2496 }
2497 case AMDGPU::G_CTPOP:
2498 case AMDGPU::G_BITREVERSE: {
2499 const RegisterBank *DstBank =
2500 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2501 if (DstBank == &AMDGPU::SGPRRegBank)
2502 break;
2503
2504 Register SrcReg = MI.getOperand(1).getReg();
2505 const LLT S32 = LLT::scalar(32);
2506 LLT Ty = MRI.getType(SrcReg);
2507 if (Ty == S32)
2508 break;
2509
2510 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2511 MachineIRBuilder B(MI, ApplyVALU);
2512
2513 MachineFunction &MF = B.getMF();
2514 LegalizerHelper Helper(MF, ApplyVALU, B);
2515
2516 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2517 llvm_unreachable("narrowScalar should have succeeded");
2518 return;
2519 }
2520 case AMDGPU::G_AMDGPU_FFBH_U32:
2521 case AMDGPU::G_AMDGPU_FFBL_B32:
2522 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2523 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2524 const RegisterBank *DstBank =
2525 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2526 if (DstBank == &AMDGPU::SGPRRegBank)
2527 break;
2528
2529 Register SrcReg = MI.getOperand(1).getReg();
2530 const LLT S32 = LLT::scalar(32);
2531 LLT Ty = MRI.getType(SrcReg);
2532 if (Ty == S32)
2533 break;
2534
2535 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2536 // which return -1 when the input is zero:
2537 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2538 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2539 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2540 // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
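// As an illustrative check (example value assumed): for the 64-bit input
// 0x0000000080000000, ffbh(hi) on hi = 0 returns -1 (UINT_MAX for the
// unsigned min) and ffbh(lo) + 32 = 0 + 32 = 32, so the umin gives the
// expected ctlz result of 32.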
2541 ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2542 MachineIRBuilder B(MI, ApplyVALU);
2543 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2544 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2545 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2546 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2547 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2548 : Opc;
2549 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2550 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2551 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2552 unsigned AddOpc =
2553 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2554 ? AMDGPU::G_ADD
2555 : AMDGPU::G_UADDSAT;
2556 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2557 Register DstReg = MI.getOperand(0).getReg();
2558 B.buildUMin(DstReg, X, Y);
2559 MI.eraseFromParent();
2560 return;
2561 }
2562 case AMDGPU::G_SEXT:
2563 case AMDGPU::G_ZEXT:
2564 case AMDGPU::G_ANYEXT: {
2565 Register SrcReg = MI.getOperand(1).getReg();
2566 LLT SrcTy = MRI.getType(SrcReg);
2567 const bool Signed = Opc == AMDGPU::G_SEXT;
2568
2569 assert(OpdMapper.getVRegs(1).empty());
2570
2571 MachineIRBuilder B(MI);
2572 const RegisterBank *SrcBank =
2573 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2574
2575 Register DstReg = MI.getOperand(0).getReg();
2576 LLT DstTy = MRI.getType(DstReg);
2577 if (DstTy.isScalar() &&
2578 SrcBank != &AMDGPU::SGPRRegBank &&
2579 SrcBank != &AMDGPU::VCCRegBank &&
2580 // FIXME: Should handle any type that rounds to s64 when irregular
2581 // breakdowns are supported.
2582 DstTy.getSizeInBits() == 64 &&
2583 SrcTy.getSizeInBits() <= 32) {
2584 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2585
2586 // Extend to 32-bit, and then extend the low half.
2587 if (Signed) {
2588 // TODO: Should really be buildSExtOrCopy
2589 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2590 } else if (Opc == AMDGPU::G_ZEXT) {
2591 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2592 } else {
2593 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2594 }
2595
2596 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2597 MRI.setRegBank(DstReg, *SrcBank);
2598 MI.eraseFromParent();
2599 return;
2600 }
2601
2602 if (SrcTy != LLT::scalar(1))
2603 return;
2604
2605 // It is not legal to have a legalization artifact with a VCC source. Rather
2606 // than introducing a copy, insert the select that the copy would have been
2607 // selected to.
2608 if (SrcBank == &AMDGPU::VCCRegBank) {
2609 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2610
2611 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2612
2613 unsigned DstSize = DstTy.getSizeInBits();
2614 // 64-bit select is SGPR only
2615 const bool UseSel64 = DstSize > 32 &&
2616 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2617
2618 // TODO: Should s16 select be legal?
2619 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2620 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2621 auto False = B.buildConstant(SelType, 0);
2622
2623 MRI.setRegBank(True.getReg(0), *DstBank);
2624 MRI.setRegBank(False.getReg(0), *DstBank);
2625 MRI.setRegBank(DstReg, *DstBank);
2626
2627 if (DstSize > 32) {
2628 B.buildSelect(DefRegs[0], SrcReg, True, False);
2629 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2630 } else if (DstSize < 32) {
2631 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2632 MRI.setRegBank(Sel.getReg(0), *DstBank);
2633 B.buildTrunc(DstReg, Sel);
2634 } else {
2635 B.buildSelect(DstReg, SrcReg, True, False);
2636 }
2637
2638 MI.eraseFromParent();
2639 return;
2640 }
2641
2642 break;
2643 }
2644 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2645 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2646
2647 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2648
2649 Register DstReg = MI.getOperand(0).getReg();
2650 Register SrcReg = MI.getOperand(1).getReg();
2651
2652 const LLT S32 = LLT::scalar(32);
2653 LLT DstTy = MRI.getType(DstReg);
2654 LLT SrcTy = MRI.getType(SrcReg);
2655
2656 if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2657 return;
2658
2659 MachineIRBuilder B(MI);
2660
2661 const ValueMapping &DstMapping
2662 = OpdMapper.getInstrMapping().getOperandMapping(0);
2663 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2664 const RegisterBank *SrcBank =
2665 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2666 const RegisterBank *IdxBank =
2667 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2668
2669 Register BaseIdxReg;
2670 unsigned ConstOffset;
2671 std::tie(BaseIdxReg, ConstOffset) =
2672 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2673
2674 // See if the index is an add of a constant, which would be foldable by
2675 // moving the base register of the index later if this is going to be
2676 // executed in a waterfall loop. This essentially reassociates the add of a
2677 // constant with the readfirstlane.
2678 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2679 ConstOffset > 0 &&
2680 ConstOffset < SrcTy.getNumElements();
2681
2682 // Move the base register. We'll re-insert the add later.
2683 if (ShouldMoveIndexIntoLoop)
2684 MI.getOperand(2).setReg(BaseIdxReg);
2685
2686 // If this is a VGPR result only because the index was a VGPR result, the
2687 // actual indexing will be done on the SGPR source vector, which will
2688 // produce a scalar result. We need to copy to the VGPR result inside the
2689 // waterfall loop.
2690 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2691 SrcBank == &AMDGPU::SGPRRegBank;
2692 if (DstRegs.empty()) {
2693 applyDefaultMapping(OpdMapper);
2694
2695 executeInWaterfallLoop(MI, MRI, { 2 });
2696
2697 if (NeedCopyToVGPR) {
2698 // We don't want a phi for this temporary reg.
2699 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2700 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2701 MI.getOperand(0).setReg(TmpReg);
2702 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2703
2704 // Use a v_mov_b32 here to make the exec dependency explicit.
2705 buildVCopy(B, DstReg, TmpReg);
2706 }
2707
2708 // Re-insert the constant offset add inside the waterfall loop.
2709 if (ShouldMoveIndexIntoLoop)
2710 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2711
2712 return;
2713 }
2714
2715 assert(DstTy.getSizeInBits() == 64);
2716
2717 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2718
2719 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2720 auto One = B.buildConstant(S32, 1);
2721
2722 MachineBasicBlock::iterator MII = MI.getIterator();
2723
2724 // Split the vector index into 32-bit pieces. Prepare to move all of the
2725 // new instructions into a waterfall loop if necessary.
2726 //
2727 // Don't put the bitcast or constant in the loop.
2728 MachineInstrSpan Span(MII, &B.getMBB());
2729
2730 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
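// For example (illustrative): extracting element 1 of a <2 x s64> becomes
// reads of 32-bit elements 2 and 3 of the bitcast <4 x s32>, since
// IdxLo = 2 * 1 and IdxHi = IdxLo + 1.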
2731 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2732 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2733
2734 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2735 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2736
2737 MRI.setRegBank(DstReg, *DstBank);
2738 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2739 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2740 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2741 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2742
2743 SmallSet<Register, 4> OpsToWaterfall;
2744 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2745 MI.eraseFromParent();
2746 return;
2747 }
2748
2749 // Remove the original instruction to avoid potentially confusing the
2750 // waterfall loop logic.
2751 B.setInstr(*Span.begin());
2752 MI.eraseFromParent();
2753 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2754 OpsToWaterfall, MRI);
2755
2756 if (NeedCopyToVGPR) {
2757 MachineBasicBlock *LoopBB = Extract1->getParent();
2758 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2759 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2760 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2761 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2762
2763 Extract0->getOperand(0).setReg(TmpReg0);
2764 Extract1->getOperand(0).setReg(TmpReg1);
2765
2766 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2767
2768 buildVCopy(B, DstRegs[0], TmpReg0);
2769 buildVCopy(B, DstRegs[1], TmpReg1);
2770 }
2771
2772 if (ShouldMoveIndexIntoLoop)
2773 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2774
2775 return;
2776 }
2777 case AMDGPU::G_INSERT_VECTOR_ELT: {
2778 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2779
2780 Register DstReg = MI.getOperand(0).getReg();
2781 LLT VecTy = MRI.getType(DstReg);
2782
2783 assert(OpdMapper.getVRegs(0).empty());
2784 assert(OpdMapper.getVRegs(3).empty());
2785
2786 if (substituteSimpleCopyRegs(OpdMapper, 1))
2787 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2788
2789 if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2790 return;
2791
2792 const RegisterBank *IdxBank =
2793 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2794
2795 Register SrcReg = MI.getOperand(1).getReg();
2796 Register InsReg = MI.getOperand(2).getReg();
2797 LLT InsTy = MRI.getType(InsReg);
2798 (void)InsTy;
2799
2800 Register BaseIdxReg;
2801 unsigned ConstOffset;
2802 std::tie(BaseIdxReg, ConstOffset) =
2803 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2804
2805 // See if the index is an add of a constant, which would be foldable by
2806 // moving the base register of the index later if this is going to be
2807 // executed in a waterfall loop. This essentially reassociates the add of a
2808 // constant with the readfirstlane.
2809 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2810 ConstOffset > 0 &&
2811 ConstOffset < VecTy.getNumElements();
2812
2813 // Move the base register. We'll re-insert the add later.
2814 if (ShouldMoveIndexIntoLoop)
2815 MI.getOperand(3).setReg(BaseIdxReg);
2816
2817
2818 if (InsRegs.empty()) {
2819 executeInWaterfallLoop(MI, MRI, { 3 });
2820
2821 // Re-insert the constant offset add inside the waterfall loop.
2822 if (ShouldMoveIndexIntoLoop) {
2823 MachineIRBuilder B(MI);
2824 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2825 }
2826
2827 return;
2828 }
2829
2830
2831 assert(InsTy.getSizeInBits() == 64);
2832
2833 const LLT S32 = LLT::scalar(32);
2834 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2835
2836 MachineIRBuilder B(MI);
2837 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2838 auto One = B.buildConstant(S32, 1);
2839
2840 // Split the vector index into 32-bit pieces. Prepare to move all of the
2841 // new instructions into a waterfall loop if necessary.
2842 //
2843 // Don't put the bitcast or constant in the loop.
2845
2846 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2847 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2848 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2849
2850 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2851 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2852
2853 const RegisterBank *DstBank =
2854 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2855 const RegisterBank *SrcBank =
2856 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2857 const RegisterBank *InsSrcBank =
2858 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2859
2860 MRI.setRegBank(InsReg, *InsSrcBank);
2861 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2862 MRI.setRegBank(InsLo.getReg(0), *DstBank);
2863 MRI.setRegBank(InsHi.getReg(0), *DstBank);
2864 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2865 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2866 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2867
2868
2869 SmallSet<Register, 4> OpsToWaterfall;
2870 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2871 B.setInsertPt(B.getMBB(), MI);
2872 B.buildBitcast(DstReg, InsHi);
2873 MI.eraseFromParent();
2874 return;
2875 }
2876
2877 B.setInstr(*Span.begin());
2878 MI.eraseFromParent();
2879
2880 // Figure out the point after the waterfall loop before mangling the control
2881 // flow.
2882 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2883 OpsToWaterfall, MRI);
2884
2885 // The insertion point is now right after the original instruction.
2886 //
2887 // Keep the bitcast to the original vector type out of the loop. Doing this
2888 // saves an extra phi we don't need inside the loop.
2889 B.buildBitcast(DstReg, InsHi);
2890
2891 // Re-insert the constant offset add inside the waterfall loop.
2892 if (ShouldMoveIndexIntoLoop)
2893 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2894
2895 return;
2896 }
2897 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2898 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2899 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2900 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2901 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2902 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2903 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
2904 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2905 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2906 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2907 case AMDGPU::G_AMDGPU_BUFFER_STORE:
2908 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2909 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2910 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2911 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2912 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2913 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2914 applyDefaultMapping(OpdMapper);
2915 executeInWaterfallLoop(MI, MRI, {1, 4});
2916 return;
2917 }
2918 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2919 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2920 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2921 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2922 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2923 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2924 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2925 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2926 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2927 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2928 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2929 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2930 applyDefaultMapping(OpdMapper);
2931 executeInWaterfallLoop(MI, MRI, {2, 5});
2932 return;
2933 }
2934 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2935 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2936 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2937 applyDefaultMapping(OpdMapper);
2938 executeInWaterfallLoop(MI, MRI, {2, 5});
2939 return;
2940 }
2941 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2942 applyDefaultMapping(OpdMapper);
2943 executeInWaterfallLoop(MI, MRI, {3, 6});
2944 return;
2945 }
2946 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2947 applyMappingSBufferLoad(OpdMapper);
2948 return;
2949 }
2950 case AMDGPU::G_INTRINSIC: {
2951 switch (MI.getIntrinsicID()) {
2952 case Intrinsic::amdgcn_readlane: {
2953 substituteSimpleCopyRegs(OpdMapper, 2);
2954
2955 assert(OpdMapper.getVRegs(0).empty());
2956 assert(OpdMapper.getVRegs(3).empty());
2957
2958 // Make sure the index is an SGPR. It doesn't make sense to run this in a
2959 // waterfall loop, so assume it's a uniform value.
2960 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2961 return;
2962 }
2963 case Intrinsic::amdgcn_writelane: {
2964 assert(OpdMapper.getVRegs(0).empty());
2965 assert(OpdMapper.getVRegs(2).empty());
2966 assert(OpdMapper.getVRegs(3).empty());
2967
2968 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2969 constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2970 constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2971 return;
2972 }
2973 case Intrinsic::amdgcn_interp_p1:
2974 case Intrinsic::amdgcn_interp_p2:
2975 case Intrinsic::amdgcn_interp_mov:
2976 case Intrinsic::amdgcn_interp_p1_f16:
2977 case Intrinsic::amdgcn_interp_p2_f16:
2978 case Intrinsic::amdgcn_lds_param_load: {
2979 applyDefaultMapping(OpdMapper);
2980
2981 // Readlane for m0 value, which is always the last operand.
2982 // FIXME: Should this be a waterfall loop instead?
2983 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2984 return;
2985 }
2986 case Intrinsic::amdgcn_interp_inreg_p10:
2987 case Intrinsic::amdgcn_interp_inreg_p2:
2988 case Intrinsic::amdgcn_interp_inreg_p10_f16:
2989 case Intrinsic::amdgcn_interp_inreg_p2_f16:
2990 applyDefaultMapping(OpdMapper);
2991 return;
2992 case Intrinsic::amdgcn_permlane16:
2993 case Intrinsic::amdgcn_permlanex16: {
2994 // Doing a waterfall loop over these wouldn't make any sense.
2995 substituteSimpleCopyRegs(OpdMapper, 2);
2996 substituteSimpleCopyRegs(OpdMapper, 3);
2999 return;
3000 }
3001 case Intrinsic::amdgcn_sbfe:
3002 applyMappingBFE(OpdMapper, true);
3003 return;
3004 case Intrinsic::amdgcn_ubfe:
3005 applyMappingBFE(OpdMapper, false);
3006 return;
3007 case Intrinsic::amdgcn_ballot:
3008 // Use default handling and insert copy to vcc source.
3009 break;
3010 }
3011 break;
3012 }
3013 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3014 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3015 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3016 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3017 const AMDGPU::RsrcIntrinsic *RSrcIntrin
3018 = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
3019 assert(RSrcIntrin && RSrcIntrin->IsImage);
3020 // Non-images can have complications from operands that allow both SGPR
3021 // and VGPR. For now it's too complicated to figure out the final opcode
3022 // to derive the register bank from the MCInstrDesc.
3023 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3024 return;
3025 }
3026 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3027 unsigned N = MI.getNumExplicitOperands() - 2;
3028 applyDefaultMapping(OpdMapper);
3029 executeInWaterfallLoop(MI, MRI, {N});
3030 return;
3031 }
3032 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
3033 auto IntrID = MI.getIntrinsicID();
3034 switch (IntrID) {
3035 case Intrinsic::amdgcn_ds_ordered_add:
3036 case Intrinsic::amdgcn_ds_ordered_swap: {
3037 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3038 assert(OpdMapper.getVRegs(0).empty());
3039 substituteSimpleCopyRegs(OpdMapper, 3);
3041 return;
3042 }
3043 case Intrinsic::amdgcn_ds_gws_init:
3044 case Intrinsic::amdgcn_ds_gws_barrier:
3045 case Intrinsic::amdgcn_ds_gws_sema_br: {
3046 // Only the first lane executes, so readfirstlane is safe.
3047 substituteSimpleCopyRegs(OpdMapper, 1);
3049 return;
3050 }
3051 case Intrinsic::amdgcn_ds_gws_sema_v:
3052 case Intrinsic::amdgcn_ds_gws_sema_p:
3053 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3054 // Only the first lane executes, so readfirstlane is safe.
3056 return;
3057 }
3058 case Intrinsic::amdgcn_ds_append:
3059 case Intrinsic::amdgcn_ds_consume: {
3061 return;
3062 }
3063 case Intrinsic::amdgcn_s_sendmsg:
3064 case Intrinsic::amdgcn_s_sendmsghalt: {
3065 // FIXME: Should this use a waterfall loop?
3067 return;
3068 }
3069 case Intrinsic::amdgcn_s_setreg: {
3071 return;
3072 }
3073 case Intrinsic::amdgcn_raw_buffer_load_lds: {
3074 applyDefaultMapping(OpdMapper);
3075 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3077 constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3078 return;
3079 }
3080 case Intrinsic::amdgcn_struct_buffer_load_lds: {
3081 applyDefaultMapping(OpdMapper);
3082 constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3084 constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3085 return;
3086 }
3087 case Intrinsic::amdgcn_global_load_lds: {
3088 applyDefaultMapping(OpdMapper);
3090 return;
3091 }
3092 case Intrinsic::amdgcn_lds_direct_load: {
3093 applyDefaultMapping(OpdMapper);
3094 // Readlane for m0 value, which is always the last operand.
3095 constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
3096 return;
3097 }
3098 case Intrinsic::amdgcn_exp_row:
3099 applyDefaultMapping(OpdMapper);
3101 return;
3102 default: {
3103 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3104 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3105 // Non-images can have complications from operands that allow both SGPR
3106 // and VGPR. For now it's too complicated to figure out the final opcode
3107 // to derive the register bank from the MCInstrDesc.
3108 if (RSrcIntrin->IsImage) {
3109 applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3110 return;
3111 }
3112 }
3113
3114 break;
3115 }
3116 }
3117 break;
3118 }
3119 case AMDGPU::G_SI_CALL: {
3120 // Use a set to avoid extra readfirstlanes in the case where multiple
3121 // operands are the same register.
3122 SmallSet<Register, 4> SGPROperandRegs;
3123
3124 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3125 break;
3126
3127 // Move all copies to physical SGPRs that are used by the call instruction
3128 // into the loop block. Search backwards from the call for these copies,
3129 // stopping at the ADJCALLSTACKUP.
3130 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3131 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3132
3133 // Move all non-copies before the copies, so that a complete range can be
3134 // moved into the waterfall loop.
3135 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3136 // Count of NonCopyInstrs found until the current LastCopy.
3137 unsigned NonCopyInstrsLen = 0;
3138 MachineBasicBlock::iterator Start(&MI);
3139 MachineBasicBlock::iterator LastCopy = Start;
3140 MachineBasicBlock *MBB = MI.getParent();
3141 const SIMachineFunctionInfo *Info =
3142 MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3143 while (Start->getOpcode() != FrameSetupOpcode) {
3144 --Start;
3145 bool IsCopy = false;
3146 if (Start->getOpcode() == AMDGPU::COPY) {
3147 auto &Dst = Start->getOperand(0);
3148 if (Dst.isReg()) {
3149 Register Reg = Dst.getReg();
3150 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3151 IsCopy = true;
3152 } else {
3153 // Also move the copy from the scratch rsrc descriptor into the loop
3154 // to allow it to be optimized away.
3155 auto &Src = Start->getOperand(1);
3156 if (Src.isReg()) {
3157 Reg = Src.getReg();
3158 IsCopy = Info->getScratchRSrcReg() == Reg;
3159 }
3160 }
3161 }
3162 }
3163
3164 if (IsCopy) {
3165 LastCopy = Start;
3166 NonCopyInstrsLen = NonCopyInstrs.size();
3167 } else {
3168 NonCopyInstrs.push_back(&*Start);
3169 }
3170 }
3171 NonCopyInstrs.resize(NonCopyInstrsLen);
3172
3173 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3174 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3175 }
3176 Start = LastCopy;
3177
3178 // Do the same for copies after the loop
3179 NonCopyInstrs.clear();
3180 NonCopyInstrsLen = 0;
3181 MachineBasicBlock::iterator End(&MI);
3182 LastCopy = End;
3183 while (End->getOpcode() != FrameDestroyOpcode) {
3184 ++End;
3185 bool IsCopy = false;
3186 if (End->getOpcode() == AMDGPU::COPY) {
3187 auto &Src = End->getOperand(1);
3188 if (Src.isReg()) {
3189 Register Reg = Src.getReg();
3190 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3191 }
3192 }
3193
3194 if (IsCopy) {
3195 LastCopy = End;
3196 NonCopyInstrsLen = NonCopyInstrs.size();
3197 } else {
3198 NonCopyInstrs.push_back(&*End);
3199 }
3200 }
3201 NonCopyInstrs.resize(NonCopyInstrsLen);
3202
3203 End = LastCopy;
3204 ++LastCopy;
3205 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3206 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3207 }
3208
3209 ++End;
3210 MachineIRBuilder B(*Start);
3211 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3212 break;
3213 }
3214 case AMDGPU::G_LOAD:
3215 case AMDGPU::G_ZEXTLOAD:
3216 case AMDGPU::G_SEXTLOAD: {
3217 if (applyMappingLoad(MI, OpdMapper, MRI))
3218 return;
3219 break;
3220 }
3221 case AMDGPU::G_DYN_STACKALLOC:
3222 applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3223 return;
3224 case AMDGPU::G_SBFX:
3225 applyMappingBFE(OpdMapper, /*Signed*/ true);
3226 return;
3227 case AMDGPU::G_UBFX:
3228 applyMappingBFE(OpdMapper, /*Signed*/ false);
3229 return;
3230 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3231 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3232 applyMappingMAD_64_32(OpdMapper);
3233 return;
3234 default:
3235 break;
3236 }
3237
3238 return applyDefaultMapping(OpdMapper);
3239}
3240
3241// vgpr, sgpr -> vgpr
3242// vgpr, agpr -> vgpr
3243// agpr, agpr -> agpr
3244// agpr, sgpr -> vgpr
3245static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3246 if (RB0 == AMDGPU::InvalidRegBankID)
3247 return RB1;
3248 if (RB1 == AMDGPU::InvalidRegBankID)
3249 return RB0;
3250
3251 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3252 return AMDGPU::SGPRRegBankID;
3253
3254 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3255 return AMDGPU::AGPRRegBankID;
3256
3257 return AMDGPU::VGPRRegBankID;
3258}
3259
3260static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3261 if (RB0 == AMDGPU::InvalidRegBankID)
3262 return RB1;
3263 if (RB1 == AMDGPU::InvalidRegBankID)
3264 return RB0;
3265
3266 // vcc, vcc -> vcc
3267 // vcc, sgpr -> vcc
3268 // vcc, vgpr -> vcc
3269 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3270 return AMDGPU::VCCRegBankID;
3271
3272 // vcc, vgpr -> vgpr
3273 return regBankUnion(RB0, RB1);
3274}
3275
3277 const MachineInstr &MI) const {
3278 unsigned RegBank = AMDGPU::InvalidRegBankID;
3279
3280 for (const MachineOperand &MO : MI.operands()) {
3281 if (!MO.isReg())
3282 continue;
3283 Register Reg = MO.getReg();
3284 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3285 RegBank = regBankUnion(RegBank, Bank->getID());
3286 if (RegBank == AMDGPU::VGPRRegBankID)
3287 break;
3288 }
3289 }
3290
3291 return RegBank;
3292}
3293
3295 const MachineFunction &MF = *MI.getParent()->getParent();
3296 const MachineRegisterInfo &MRI = MF.getRegInfo();
3297 for (const MachineOperand &MO : MI.operands()) {
3298 if (!MO.isReg())
3299 continue;
3300 Register Reg = MO.getReg();
3301 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3302 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3303 return false;
3304 }
3305 }
3306 return true;
3307}
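// For illustration (not part of the original source): isSALUMapping is a
// purely structural check. For a hypothetical
//   %2:_(s32) = G_ADD %0:sgpr(s32), %1:sgpr(s32)
// it returns true, so callers below pick the scalar (SOP) mapping; any VGPR or
// VCC operand anywhere in the instruction makes it return false and the
// instruction falls back to a VALU mapping.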
3308
3309 const RegisterBankInfo::InstructionMapping &
3310 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3311 const MachineFunction &MF = *MI.getParent()->getParent();
3312 const MachineRegisterInfo &MRI = MF.getRegInfo();
3313 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3314
3315 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3316 const MachineOperand &SrcOp = MI.getOperand(i);
3317 if (!SrcOp.isReg())
3318 continue;
3319
3320 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3321 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3322 }
3323 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3324 MI.getNumOperands());
3325}
3326
3327 const RegisterBankInfo::InstructionMapping &
3328 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3329 const MachineFunction &MF = *MI.getParent()->getParent();
3330 const MachineRegisterInfo &MRI = MF.getRegInfo();
3331 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3332
3333 // Even though we technically could use SGPRs, this would require knowledge of
3334 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3335 //
3336 // TODO: Unary ops are trivially OK, so accept SGPRs?
3337 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3338 const MachineOperand &Src = MI.getOperand(i);
3339 if (!Src.isReg())
3340 continue;
3341
3342 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3343 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3344 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3345 }
3346
3347 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3348 MI.getNumOperands());
3349}
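// For illustration (not part of the original source): even for a hypothetical
//   %2:_(s32) = G_FADD %0:sgpr(s32), %1:sgpr(s32)
// this helper reports VGPR for both sources and the result. Leaving several
// distinct SGPR sources in place could violate the constant bus restriction,
// so the conservative choice is to let RegBankSelect insert SGPR->VGPR copies.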
3350
3351 const RegisterBankInfo::InstructionMapping &
3352 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3353 const MachineFunction &MF = *MI.getParent()->getParent();
3354 const MachineRegisterInfo &MRI = MF.getRegInfo();
3355 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3356
3357 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3358 const MachineOperand &Op = MI.getOperand(I);
3359 if (!Op.isReg())
3360 continue;
3361
3362 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3363 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3364 }
3365
3366 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3367 MI.getNumOperands());
3368}
3369
3370 const RegisterBankInfo::InstructionMapping &
3371 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3372 const MachineInstr &MI,
3373 int RsrcIdx) const {
3374 // The reported argument index is relative to the IR intrinsic call arguments,
3375 // so we need to shift by the number of defs and the intrinsic ID.
3376 RsrcIdx += MI.getNumExplicitDefs() + 1;
3377
3378 const int NumOps = MI.getNumOperands();
3379 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3380
3381 // TODO: Should packed/unpacked D16 difference be reported here as part of
3382 // the value mapping?
3383 for (int I = 0; I != NumOps; ++I) {
3384 if (!MI.getOperand(I).isReg())
3385 continue;
3386
3387 Register OpReg = MI.getOperand(I).getReg();
3388 // We replace some dead address operands with $noreg
3389 if (!OpReg)
3390 continue;
3391
3392 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3393
3394 // FIXME: Probably need a new intrinsic register bank searchable table to
3395 // handle arbitrary intrinsics easily.
3396 //
3397 // If this has a sampler, it immediately follows rsrc.
3398 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3399
3400 if (MustBeSGPR) {
3401 // This must be an SGPR, so we must report whatever it is as legal.
3402 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3403 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3404 } else {
3405 // Some operands must be VGPR, and these are easy to copy to.
3406 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3407 }
3408 }
3409
3410 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3411}
3412
3413/// Return the mapping for a pointer argument.
3414 const RegisterBankInfo::ValueMapping *
3415 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3416 Register PtrReg) const {
3417 LLT PtrTy = MRI.getType(PtrReg);
3418 unsigned Size = PtrTy.getSizeInBits();
3419 if (Subtarget.useFlatForGlobal() ||
3420 !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3421 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3422
3423 // If we're using MUBUF instructions for global memory, an SGPR base register
3424 // is possible. Otherwise this needs to be a VGPR.
3425 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3426 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3427}
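// For illustration (not part of the original source): a global (addrspace 1)
// pointer that is already uniform in an SGPR keeps its SGPR mapping here when
// the subtarget still selects MUBUF for global access, while the
// flat-for-global path (or any non flat/global address space) takes the early
// return above and forces the pointer into a VGPR.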
3428
3429 const RegisterBankInfo::InstructionMapping &
3430 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3431
3432 const MachineFunction &MF = *MI.getParent()->getParent();
3433 const MachineRegisterInfo &MRI = MF.getRegInfo();
3434 SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3435 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3436 Register PtrReg = MI.getOperand(1).getReg();
3437 LLT PtrTy = MRI.getType(PtrReg);
3438 unsigned AS = PtrTy.getAddressSpace();
3439 unsigned PtrSize = PtrTy.getSizeInBits();
3440
3441 const ValueMapping *ValMapping;
3442 const ValueMapping *PtrMapping;
3443
3444 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3445
3446 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3447 if (isScalarLoadLegal(MI)) {
3448 // We have a uniform instruction, so we want to use an SMRD load.
3449 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3450 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3451 } else {
3452 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3453
3454 // If we're using MUBUF instructions for global memory, an SGPR base
3455 // register is possible. Otherwise this needs to be a VGPR.
3456 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3457 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3458
3459 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3460 }
3461 } else {
3462 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3463 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3464 }
3465
3466 OpdsMapping[0] = ValMapping;
3467 OpdsMapping[1] = PtrMapping;
3468 const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3469 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3470 return Mapping;
3471
3472 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3473 // handle that during instruction selection?
3474}
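// For illustration (not part of the original source): a hypothetical uniform
// load such as
//   %1:_(s32) = G_LOAD %0:sgpr(p4) :: (load (s32), addrspace 4)
// that passes isScalarLoadLegal() gets the SGPR value/pointer mapping above
// and can later be selected as an SMEM load, while a divergent pointer (or a
// memory operand that is not scalar-load legal) falls through to the VGPR
// value mapping.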
3475
3476unsigned
3477 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3478 const MachineRegisterInfo &MRI,
3479 unsigned Default) const {
3480 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3481 return Bank ? Bank->getID() : Default;
3482}
3483
3484 const RegisterBankInfo::ValueMapping *
3485 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3486 const MachineRegisterInfo &MRI,
3487 const TargetRegisterInfo &TRI) const {
3488 // Lie and claim anything is legal, even though this needs to be an SGPR;
3489 // applyMapping will have to deal with it as a waterfall loop.
3490 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3491 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3492 return AMDGPU::getValueMapping(Bank, Size);
3493}
3494
3495 const RegisterBankInfo::ValueMapping *
3496 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3497 const MachineRegisterInfo &MRI,
3498 const TargetRegisterInfo &TRI) const {
3499 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3500 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3501}
3502
3503 const RegisterBankInfo::ValueMapping *
3504 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3505 const MachineRegisterInfo &MRI,
3506 const TargetRegisterInfo &TRI) const {
3507 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3508 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3509}
3510
3511///
3512/// This function must return a legal mapping, because
3513/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3514/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3515 /// VGPR-to-SGPR copy to be generated is illegal.
3516///
3517// Operands that must be SGPRs must accept potentially divergent VGPRs as
3518// legal. These will be dealt with in applyMappingImpl.
3519//
3520 const RegisterBankInfo::InstructionMapping &
3521 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3522 const MachineFunction &MF = *MI.getParent()->getParent();
3523 const MachineRegisterInfo &MRI = MF.getRegInfo();
3524
3525 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3526 // The default logic bothers to analyze impossible alternative mappings. We
3527 // want the most straightforward mapping, so just directly handle this.
3528 const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3529 *TRI);
3530 const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3531 *TRI);
3532 assert(SrcBank && "src bank should have been assigned already");
3533 if (!DstBank)
3534 DstBank = SrcBank;
3535
3536 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3537 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3538 cannotCopy(*DstBank, *SrcBank, Size))
3539 return getInvalidInstructionMapping();
3540
3541 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3542 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3543 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3544 OpdsMapping[0] = &ValMap;
3545 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3546 OpdsMapping[1] = &ValMap;
3547
3548 return getInstructionMapping(
3549 1, /*Cost*/ 1,
3550 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3551 }
3552
3553 if (MI.isRegSequence()) {
3554 // If any input is a VGPR, the result must be a VGPR. The default handling
3555 // assumes any copy between banks is legal.
3556 unsigned BankID = AMDGPU::SGPRRegBankID;
3557
3558 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3559 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3560 // It doesn't make sense to use vcc or scc banks here, so just ignore
3561 // them.
3562 if (OpBank != AMDGPU::SGPRRegBankID) {
3563 BankID = AMDGPU::VGPRRegBankID;
3564 break;
3565 }
3566 }
3567 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3568
3569 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3570 return getInstructionMapping(
3571 1, /*Cost*/ 1,
3572 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3573 }
3574
3575 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3576 // properly.
3577 //
3578 // TODO: There are additional exec masking dependencies to analyze.
3579 if (MI.getOpcode() == TargetOpcode::G_PHI) {
3580 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3581 Register DstReg = MI.getOperand(0).getReg();
3582
3583 // Sometimes the result may have already been assigned a bank.
3584 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3585 ResultBank = DstBank->getID();
3586
3587 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3588 Register Reg = MI.getOperand(I).getReg();
3589 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3590
3591 // FIXME: Assuming VGPR for any undetermined inputs.
3592 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3593 ResultBank = AMDGPU::VGPRRegBankID;
3594 break;
3595 }
3596
3597 // FIXME: Need to promote SGPR case to s32
3598 unsigned OpBank = Bank->getID();
3599 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3600 }
3601
3602 assert(ResultBank != AMDGPU::InvalidRegBankID);
3603
3604 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3605
3606 const ValueMapping &ValMap =
3607 getValueMapping(0, Size, getRegBank(ResultBank));
3608 return getInstructionMapping(
3609 1, /*Cost*/ 1,
3610 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3611 }
3612
3613 const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3614 if (Mapping.isValid())
3615 return Mapping;
3616
3617 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3618
3619 switch (MI.getOpcode()) {
3620 default:
3621 return getInvalidInstructionMapping();
3622
3623 case AMDGPU::G_AND:
3624 case AMDGPU::G_OR:
3625 case AMDGPU::G_XOR: {
3626 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3627 if (Size == 1) {
3628 const RegisterBank *DstBank
3629 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3630
3631 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3632 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3633 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3634 if (DstBank) {
3635 TargetBankID = DstBank->getID();
3636 if (DstBank == &AMDGPU::VCCRegBank) {
3637 TargetBankID = AMDGPU::VCCRegBankID;
3638 BankLHS = AMDGPU::VCCRegBankID;
3639 BankRHS = AMDGPU::VCCRegBankID;
3640 } else {
3641 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3642 AMDGPU::SGPRRegBankID);
3643 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3644 AMDGPU::SGPRRegBankID);
3645 }
3646 } else {
3647 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3648 AMDGPU::VCCRegBankID);
3649 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3650 AMDGPU::VCCRegBankID);
3651
3652 // Both inputs should be true booleans to produce a boolean result.
3653 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3654 TargetBankID = AMDGPU::VGPRRegBankID;
3655 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3656 TargetBankID = AMDGPU::VCCRegBankID;
3657 BankLHS = AMDGPU::VCCRegBankID;
3658 BankRHS = AMDGPU::VCCRegBankID;
3659 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3660 TargetBankID = AMDGPU::SGPRRegBankID;
3661 }
3662 }
3663
3664 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3665 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3666 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3667 break;
3668 }
3669
3670 if (Size == 64) {
3671
3672 if (isSALUMapping(MI)) {
3673 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3674 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3675 } else {
3676 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3677 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3678 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3679
3680 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3681 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3682 }
3683
3684 break;
3685 }
3686
3687 [[fallthrough]];
3688 }
3689 case AMDGPU::G_PTR_ADD:
3690 case AMDGPU::G_PTRMASK:
3691 case AMDGPU::G_ADD:
3692 case AMDGPU::G_SUB:
3693 case AMDGPU::G_MUL:
3694 case AMDGPU::G_SHL:
3695 case AMDGPU::G_LSHR:
3696 case AMDGPU::G_ASHR:
3697 case AMDGPU::G_UADDO:
3698 case AMDGPU::G_USUBO:
3699 case AMDGPU::G_UADDE:
3700 case AMDGPU::G_SADDE:
3701 case AMDGPU::G_USUBE:
3702 case AMDGPU::G_SSUBE:
3703 case AMDGPU::G_SMIN:
3704 case AMDGPU::G_SMAX:
3705 case AMDGPU::G_UMIN:
3706 case AMDGPU::G_UMAX:
3707 case AMDGPU::G_ABS:
3708 case AMDGPU::G_SHUFFLE_VECTOR:
3709 case AMDGPU::G_SBFX:
3710 case AMDGPU::G_UBFX:
3711 if (isSALUMapping(MI))
3712 return getDefaultMappingSOP(MI);
3713 [[fallthrough]];
3714
3715 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3716 case AMDGPU::G_SSUBSAT:
3717 case AMDGPU::G_UADDSAT:
3718 case AMDGPU::G_USUBSAT:
3719 case AMDGPU::G_FADD:
3720 case AMDGPU::G_FSUB:
3721 case AMDGPU::G_FPTOSI:
3722 case AMDGPU::G_FPTOUI:
3723 case AMDGPU::G_FMUL:
3724 case AMDGPU::G_FMA:
3725 case AMDGPU::G_FMAD:
3726 case AMDGPU::G_FSQRT:
3727 case AMDGPU::G_FFLOOR:
3728 case AMDGPU::G_FCEIL:
3729 case AMDGPU::G_FRINT:
3730 case AMDGPU::G_SITOFP:
3731 case AMDGPU::G_UITOFP:
3732 case AMDGPU::G_FPTRUNC:
3733 case AMDGPU::G_FPEXT:
3734 case AMDGPU::G_FEXP2:
3735 case AMDGPU::G_FLOG2:
3736 case AMDGPU::G_FMINNUM:
3737 case AMDGPU::G_FMAXNUM:
3738 case AMDGPU::G_FMINNUM_IEEE:
3739 case AMDGPU::G_FMAXNUM_IEEE:
3740 case AMDGPU::G_FCANONICALIZE:
3741 case AMDGPU::G_INTRINSIC_TRUNC:
3742 case AMDGPU::G_STRICT_FADD:
3743 case AMDGPU::G_STRICT_FSUB:
3744 case AMDGPU::G_STRICT_FMUL:
3745 case AMDGPU::G_STRICT_FMA:
3746 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3747 case AMDGPU::G_FSHR: // TODO: Expand for scalar
3748 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3749 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3750 case AMDGPU::G_AMDGPU_RCP_IFLAG:
3751 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3752 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3753 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3754 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3755 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3756 case AMDGPU::G_AMDGPU_SMED3:
3757 return getDefaultMappingVOP(MI);
3758 case AMDGPU::G_UMULH:
3759 case AMDGPU::G_SMULH: {
3760 if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3761 return getDefaultMappingSOP(MI);
3762 return getDefaultMappingVOP(MI);
3763 }
3764 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3765 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
3766 // Three possible mappings:
3767 //
3768 // - Default SOP
3769 // - Default VOP
3770 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
3771 //
3772 // This allows instruction selection to keep the multiplication part of the
3773 // instruction on the SALU.
3774 bool AllSalu = true;
3775 bool MulSalu = true;
3776 for (unsigned i = 0; i < 5; ++i) {
3777 Register Reg = MI.getOperand(i).getReg();
3778 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3779 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
3780 AllSalu = false;
3781 if (i == 2 || i == 3) {
3782 MulSalu = false;
3783 break;
3784 }
3785 }
3786 }
3787 }
3788
3789 if (AllSalu)
3790 return getDefaultMappingSOP(MI);
3791
3792 // If the multiply-add is full-rate in VALU, use that even if the
3793 // multiplication part is scalar. Accumulating separately on the VALU would
3794 // take two instructions.
3795 if (!MulSalu || Subtarget.hasFullRate64Ops())
3796 return getDefaultMappingVOP(MI);
3797
3798 // Keep the multiplication on the SALU, then accumulate on the VALU.
3799 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3800 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3801 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3802 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3803 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
3804 break;
3805 }
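  // For illustration (not part of the original source): with src0/src1 uniform
  // but srcC in a VGPR on a subtarget without full-rate 64-bit ops, the
  // partial mapping above corresponds to something like
  //   %dst:vgpr(s64), %carry:vcc(s1) =
  //       G_AMDGPU_MAD_U64_U32 %a:sgpr(s32), %b:sgpr(s32), %c:vgpr(s64)
  // keeping the 32x32 multiply on the SALU and only the 64-bit accumulate on
  // the VALU.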
3806 case AMDGPU::G_IMPLICIT_DEF: {
3807 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3808 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3809 break;
3810 }
3811 case AMDGPU::G_FCONSTANT:
3812 case AMDGPU::G_CONSTANT:
3813 case AMDGPU::G_GLOBAL_VALUE:
3814 case AMDGPU::G_BLOCK_ADDR:
3815 case AMDGPU::G_READCYCLECOUNTER: {
3816 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3817 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3818 break;
3819 }
3820 case AMDGPU::G_FRAME_INDEX: {
3821 // TODO: This should be the same as other constants, but eliminateFrameIndex
3822 // currently assumes VALU uses.
3823 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3824 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3825 break;
3826 }
3827 case AMDGPU::G_DYN_STACKALLOC: {
3828 // Result is always uniform, and a wave reduction is needed for the source.
3829 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3830 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3831 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3832 break;
3833 }
3834 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3835 // This case is weird because we expect a physical register in the source,
3836 // but need to set a bank anyway.
3837 //
3838 // We could select the result to SGPR or VGPR, but for the one current use
3839 // it's more practical to always use VGPR.
3840 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3841 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3842 break;
3843 }
3844 case AMDGPU::G_INSERT: {
3845 unsigned BankID = getMappingType(MRI, MI);
3846 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3847 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3848 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3849 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3850 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3851 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3852 OpdsMapping[3] = nullptr;
3853 break;
3854 }
3855 case AMDGPU::G_EXTRACT: {
3856 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3857 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3858 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3859 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3860 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3861 OpdsMapping[2] = nullptr;
3862 break;
3863 }
3864 case AMDGPU::G_BUILD_VECTOR:
3865 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3866 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3867 if (DstTy == LLT::fixed_vector(2, 16)) {
3868 unsigned DstSize = DstTy.getSizeInBits();
3869 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3870 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3871 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3872 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3873
3874 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3875 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3876 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3877 break;
3878 }
3879
3880 [[fallthrough]];
3881 }
3882 case AMDGPU::G_MERGE_VALUES:
3883 case AMDGPU::G_CONCAT_VECTORS: {
3884 unsigned Bank = getMappingType(MRI, MI);
3885 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3886 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3887
3888 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3889 // Op1 and Dst should use the same register bank.
3890 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3891 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3892 break;
3893 }
3894 case AMDGPU::G_BITREVERSE:
3895 case AMDGPU::G_BITCAST:
3896 case AMDGPU::G_INTTOPTR:
3897 case AMDGPU::G_PTRTOINT:
3898 case AMDGPU::G_FABS:
3899 case AMDGPU::G_FNEG: {
3900 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3901 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3902 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3903 break;
3904 }
3905 case AMDGPU::G_AMDGPU_FFBH_U32:
3906 case AMDGPU::G_AMDGPU_FFBL_B32:
3907 case AMDGPU::G_CTLZ_ZERO_UNDEF:
3908 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3909 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3910 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3911 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3912 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3913 break;
3914 }
3915 case AMDGPU::G_CTPOP: {
3916 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3917 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3918 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3919
3920 // This should really be getValueMappingSGPR64Only, but allowing the generic
3921 // code to handle the register split just makes using LegalizerHelper more
3922 // difficult.
3923 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3924 break;
3925 }
3926 case AMDGPU::G_TRUNC: {
3927 Register Dst = MI.getOperand(0).getReg();
3928 Register Src = MI.getOperand(1).getReg();
3929 unsigned Bank = getRegBankID(Src, MRI);
3930 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3931 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3932 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3933 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3934 break;
3935 }
3936 case AMDGPU::G_ZEXT:
3937 case AMDGPU::G_SEXT:
3938 case AMDGPU::G_ANYEXT:
3939 case AMDGPU::G_SEXT_INREG: {
3940 Register Dst = MI.getOperand(0).getReg();
3941 Register Src = MI.getOperand(1).getReg();
3942 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3943 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3944
3945 unsigned DstBank;
3946 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3947 assert(SrcBank);
3948 switch (SrcBank->getID()) {
3949 case AMDGPU::SGPRRegBankID:
3950 DstBank = AMDGPU::SGPRRegBankID;
3951 break;
3952 default:
3953 DstBank = AMDGPU::VGPRRegBankID;
3954 break;
3955 }
3956
3957 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3958 // 32-bits, and then to 64.
3959 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3960 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3961 SrcSize);
3962 break;
3963 }
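  // For illustration (not part of the original source): an SGPR s32 -> s64
  // sign extend can be selected as a single 64-bit scalar BFE, which is why
  // the SGPR side keeps one 64-bit part, while the VGPR side of the
  // SGPR64-only mapping is split so the value is extended to 32 bits first and
  // the high half is materialized separately.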
3964 case AMDGPU::G_FCMP: {
3965 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3966 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3967 OpdsMapping[1] = nullptr; // Predicate Operand.
3968 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3969 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3970 break;
3971 }
3972 case AMDGPU::G_IS_FPCLASS: {
3973 Register SrcReg = MI.getOperand(1).getReg();
3974 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
3975 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3976 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3977 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3978 break;
3979 }
3980 case AMDGPU::G_STORE: {
3981 assert(MI.getOperand(0).isReg());
3982 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3983
3984 // FIXME: We need to specify a different reg bank once scalar stores are
3985 // supported.
3986 const ValueMapping *ValMapping =
3987 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3988 OpdsMapping[0] = ValMapping;
3989 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3990 break;
3991 }
3992 case AMDGPU::G_ICMP: {
3993 auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3994 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3995
3996 // See if the result register has already been constrained to vcc, which may
3997 // happen due to control flow intrinsic lowering.
3998 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3999 AMDGPU::SGPRRegBankID);
4000 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4001 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4002
4003 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4004 Op2Bank == AMDGPU::SGPRRegBankID &&
4005 Op3Bank == AMDGPU::SGPRRegBankID &&
4006 (Size == 32 || (Size == 64 &&
4007 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4008 Subtarget.hasScalarCompareEq64()));
4009
4010 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4011 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4012
4013 // TODO: Use 32-bit for scalar output size.
4014 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4015 const unsigned ResultSize = 1;
4016
4017 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4018 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4019 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4020 break;
4021 }
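  // For illustration (not part of the original source): a 32-bit equality
  // compare whose destination and both sources are SGPRs takes the CanUseSCC
  // path and stays entirely on the SALU (an S_CMP_* writing SCC), while any
  // VGPR input demotes the sources to VGPR and maps the s1 result to the VCC
  // bank (a V_CMP_* producing a lane mask).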
4022 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4023 // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
4024 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4025 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4026 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4027 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4028 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4029 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4030
4031 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4032 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4033
4034 // The index can be in either bank if the source vector is VGPR.
4035 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4036 break;
4037 }
4038 case AMDGPU::G_INSERT_VECTOR_ELT: {
4039 unsigned OutputBankID = isSALUMapping(MI) ?
4040 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4041
4042 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4043 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4044 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4045 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4046 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4047
4048 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4049 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4050
4051 // This is a weird case, because we need to break down the mapping based on
4052 // the register bank of a different operand.
4053 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4054 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4055 InsertSize);
4056 } else {
4057 assert(InsertSize == 32 || InsertSize == 64);
4058 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4059 }
4060
4061 // The index can be in either bank if the source vector is VGPR.
4062 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4063 break;
4064 }
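  // For illustration (not part of the original source): inserting a 64-bit
  // element into a VGPR vector uses the split-64 mapping above because the
  // selected indirect register writes operate 32 bits at a time; when the
  // vector, element and index are all SGPRs the whole operation keeps the
  // SALU mapping.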
4065 case AMDGPU::G_UNMERGE_VALUES: {
4066 unsigned Bank = getMappingType(MRI, MI);
4067
4068 // Op1 and Dst should use the same register bank.
4069 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4070 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4071 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4072 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4073 }
4074 break;
4075 }
4076 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4077 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4078 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4079 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4080 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4081 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4082 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4083 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4084 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4085 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4086 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4087 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4088 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4089 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4090 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4091 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4092 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4093 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4094
4095 // rsrc
4096 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4097
4098 // vindex
4099 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4100
4101 // voffset
4102 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4103
4104 // soffset
4105 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4106
4107 // Any remaining operands are immediates and were correctly null
4108 // initialized.
4109 break;
4110 }
4111 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4112 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4113 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4114 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4115 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4116 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4117 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4118 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4119 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4120 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4121 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4122 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4123 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4124 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4125 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4126 // vdata_out
4127 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4128
4129 // vdata_in
4130 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4131
4132 // rsrc
4133 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4134
4135 // vindex
4136 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4137
4138 // voffset
4139 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4140
4141 // soffset
4142 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4143
4144 // Any remaining operands are immediates and were correctly null
4145 // initialized.
4146 break;
4147 }
4148 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4149 // vdata_out
4150 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4151
4152 // vdata_in
4153 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4154
4155 // cmp
4156 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4157
4158 // rsrc
4159 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4160
4161 // vindex
4162 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4163
4164 // voffset
4165 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4166
4167 // soffset
4168 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4169
4170 // Any remaining operands are immediates and were correctly null
4171 // initialized.
4172 break;
4173 }
4174 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4175 // Lie and claim everything is legal, even though some need to be
4176 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4177 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4178 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4179
4180 // We need to convert this to a MUBUF if either the resource or offset is
4181 // VGPR.
4182 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4183 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4184 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4185
4186 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4187 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4188 break;
4189 }
4190 case AMDGPU::G_INTRINSIC: {
4191 switch (MI.getIntrinsicID()) {
4192 default:
4193 return getInvalidInstructionMapping();
4194 case Intrinsic::amdgcn_div_fmas:
4195 case Intrinsic::amdgcn_div_fixup:
4196 case Intrinsic::amdgcn_trig_preop:
4197 case Intrinsic::amdgcn_sin:
4198 case Intrinsic::amdgcn_cos:
4199 case Intrinsic::amdgcn_log_clamp:
4200 case Intrinsic::amdgcn_rcp:
4201 case Intrinsic::amdgcn_rcp_legacy:
4202 case Intrinsic::amdgcn_sqrt:
4203 case Intrinsic::amdgcn_rsq:
4204 case Intrinsic::amdgcn_rsq_legacy:
4205 case Intrinsic::amdgcn_rsq_clamp:
4206 case Intrinsic::amdgcn_fmul_legacy:
4207 case Intrinsic::amdgcn_fma_legacy:
4208 case Intrinsic::amdgcn_ldexp:
4209 case Intrinsic::amdgcn_frexp_mant:
4210 case Intrinsic::amdgcn_frexp_exp:
4211 case Intrinsic::amdgcn_fract:
4212 case Intrinsic::amdgcn_cvt_pkrtz:
4213 case Intrinsic::amdgcn_cvt_pknorm_i16:
4214 case Intrinsic::amdgcn_cvt_pknorm_u16:
4215 case Intrinsic::amdgcn_cvt_pk_i16:
4216 case Intrinsic::amdgcn_cvt_pk_u16:
4217 case Intrinsic::amdgcn_fmed3:
4218 case Intrinsic::amdgcn_cubeid:
4219 case Intrinsic::amdgcn_cubema:
4220 case Intrinsic::amdgcn_cubesc:
4221 case Intrinsic::amdgcn_cubetc:
4222 case Intrinsic::amdgcn_sffbh:
4223 case Intrinsic::amdgcn_fmad_ftz:
4224 case Intrinsic::amdgcn_mbcnt_lo:
4225 case Intrinsic::amdgcn_mbcnt_hi:
4226 case Intrinsic::amdgcn_mul_u24:
4227 case Intrinsic::amdgcn_mul_i24:
4228 case Intrinsic::amdgcn_mulhi_u24:
4229 case Intrinsic::amdgcn_mulhi_i24:
4230 case Intrinsic::amdgcn_lerp:
4231 case Intrinsic::amdgcn_sad_u8:
4232 case Intrinsic::amdgcn_msad_u8:
4233 case Intrinsic::amdgcn_sad_hi_u8:
4234 case Intrinsic::amdgcn_sad_u16:
4235 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4236 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4237 case Intrinsic::amdgcn_mqsad_u32_u8:
4238 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4239 case Intrinsic::amdgcn_alignbyte:
4240 case Intrinsic::amdgcn_perm:
4241 case Intrinsic::amdgcn_fdot2:
4242 case Intrinsic::amdgcn_sdot2:
4243 case Intrinsic::amdgcn_udot2:
4244 case Intrinsic::amdgcn_sdot4:
4245 case Intrinsic::amdgcn_udot4:
4246 case Intrinsic::amdgcn_sdot8:
4247 case Intrinsic::amdgcn_udot8:
4248 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4249 case Intrinsic::amdgcn_fdot2_f16_f16:
4250 case Intrinsic::amdgcn_fdot2_f32_bf16:
4251 case Intrinsic::amdgcn_sudot4:
4252 case Intrinsic::amdgcn_sudot8:
4253 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4254 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4255 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4256 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4257 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4258 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4259 return getDefaultMappingVOP(MI);
4260 case Intrinsic::amdgcn_sbfe:
4261 case Intrinsic::amdgcn_ubfe:
4262 if (isSALUMapping(MI))
4263 return getDefaultMappingSOP(MI);
4264 return getDefaultMappingVOP(MI);
4265 case Intrinsic::amdgcn_ds_swizzle:
4266 case Intrinsic::amdgcn_ds_permute:
4267 case Intrinsic::amdgcn_ds_bpermute:
4268 case Intrinsic::amdgcn_update_dpp:
4269 case Intrinsic::amdgcn_mov_dpp8:
4270 case Intrinsic::amdgcn_mov_dpp:
4271 case Intrinsic::amdgcn_strict_wwm:
4272 case Intrinsic::amdgcn_wwm:
4273 case Intrinsic::amdgcn_strict_wqm:
4274 case Intrinsic::amdgcn_wqm:
4275 case Intrinsic::amdgcn_softwqm:
4276 case Intrinsic::amdgcn_set_inactive:
4277 case Intrinsic::amdgcn_permlane64:
4278 return getDefaultMappingAllVGPR(MI);
4279 case Intrinsic::amdgcn_kernarg_segment_ptr:
4280 case Intrinsic::amdgcn_s_getpc:
4281 case Intrinsic::amdgcn_groupstaticsize:
4282 case Intrinsic::amdgcn_reloc_constant:
4283 case Intrinsic::returnaddress: {
4284 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4285 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4286 break;
4287 }
4288 case Intrinsic::amdgcn_wqm_vote: {
4289 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4290 OpdsMapping[0] = OpdsMapping[2]
4291 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4292 break;
4293 }
4294 case Intrinsic::amdgcn_ps_live: {
4295 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4296 break;
4297 }
4298 case Intrinsic::amdgcn_div_scale: {
4299 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4300 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4301 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4302 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4303
4304 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4305 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4306 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4307 break;
4308 }
4309 case Intrinsic::amdgcn_class: {
4310 Register Src0Reg = MI.getOperand(2).getReg();
4311 Register Src1Reg = MI.getOperand(3).getReg();
4312 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4313 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4314 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4315 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4316 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4317 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4318 break;
4319 }
4320 case Intrinsic::amdgcn_icmp:
4321 case Intrinsic::amdgcn_fcmp: {
4322 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4323 // This is not VCCRegBank because this is not used in boolean contexts.
4324 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4325 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4326 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4327 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4328 break;
4329 }
4330 case Intrinsic::amdgcn_readlane: {
4331 // This must be an SGPR, but accept a VGPR.
4332 Register IdxReg = MI.getOperand(3).getReg();
4333 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4334 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4335 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4336 [[fallthrough]];
4337 }
4338 case Intrinsic::amdgcn_readfirstlane: {
4339 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4340 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4341 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4342 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4343 break;
4344 }
4345 case Intrinsic::amdgcn_writelane: {
4346 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4347 Register SrcReg = MI.getOperand(2).getReg();
4348 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4349 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4350 Register IdxReg = MI.getOperand(3).getReg();
4351 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4352 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4353 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4354
4355 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4356 // to legalize.
4357 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4358 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4359 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4360 break;
4361 }
4362 case Intrinsic::amdgcn_if_break: {
4363 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4364 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4365 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4366 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4367 break;
4368 }
4369 case Intrinsic::amdgcn_permlane16:
4370 case Intrinsic::amdgcn_permlanex16: {
4371 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4372 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4373 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4374 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4375 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4376 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4377 break;
4378 }
4379 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4380 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4381 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4382 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4383 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4384 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4385 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4386 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4387 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4388 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4389 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4390 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4391 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4392 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4393 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4394 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4395 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4396 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4397 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4398 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4399 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4400 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4401 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4402 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4403 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4404 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4405 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4406 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4407 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4408 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4409 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4410 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4411 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4412 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4413 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4414 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4415 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4416 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4417 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4418 // Default for MAI intrinsics.
4419 // srcC can also be an immediate which can be folded later.
4420 // FIXME: Should we eventually add an alternative mapping with AGPR src
4421 // for srcA/srcB?
4422 //
4423 // vdst, srcA, srcB, srcC
4424 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4425 OpdsMapping[0] =
4426 Info->mayNeedAGPRs()
4427 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4428 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4429 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4430 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4431 OpdsMapping[4] =
4432 Info->mayNeedAGPRs()
4433 ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4434 : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4435 break;
4436 }
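      // For illustration (not part of the original source): when
      // SIMachineFunctionInfo::mayNeedAGPRs() reports that the accumulator may
      // have to live in AGPRs, vdst and srcC get AGPR mappings so the
      // accumulator can stay in the AGPR file, while srcA/srcB are always read
      // from VGPRs; otherwise every operand stays in VGPRs.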
4437 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4438 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4439 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4440 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4441 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4442 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
4443 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
4444 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
4445 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
4446 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
4447 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
4448 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
4449 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4450 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4451 // vdst, srcA, srcB, srcC, idx
4452 OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4453 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4454 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4455 OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4456 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4457 break;
4458 }
4459 case Intrinsic::amdgcn_interp_p1:
4460 case Intrinsic::amdgcn_interp_p2:
4461 case Intrinsic::amdgcn_interp_mov:
4462 case Intrinsic::amdgcn_interp_p1_f16:
4463 case Intrinsic::amdgcn_interp_p2_f16:
4464 case Intrinsic::amdgcn_lds_param_load: {
4465 const int M0Idx = MI.getNumOperands() - 1;
4466 Register M0Reg = MI.getOperand(M0Idx).getReg();
4467 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4468 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4469
4470 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4471 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4472 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4473
4474 // Must be SGPR, but we must take whatever the original bank is and fix it
4475 // later.
4476 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4477 break;
4478 }
4479 case Intrinsic::amdgcn_interp_inreg_p10:
4480 case Intrinsic::amdgcn_interp_inreg_p2:
4481 case Intrinsic::amdgcn_interp_inreg_p10_f16:
4482 case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4483 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4484 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4485 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4486 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4487 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4488 break;
4489 }
4490 case Intrinsic::amdgcn_ballot: {
4491 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4492 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4493 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4494 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4495 break;
4496 }
4497 }
4498 break;
4499 }
4500 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4501 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4502 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4503 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4504 auto IntrID = MI.getIntrinsicID();
4505 const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4506 assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4507 // Non-images can have complications from operands that allow both SGPR
4508 // and VGPR. For now it's too complicated to figure out the final opcode
4509 // to derive the register bank from the MCInstrDesc.
4510 assert(RSrcIntrin->IsImage);
4511 return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4512 }
4513 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4514 unsigned N = MI.getNumExplicitOperands() - 2;
4515 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4516 OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4517 if (N == 3) {
4518 // Sequential form: all operands combined into VGPR256/VGPR512
4519 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4520 if (Size > 256)
4521 Size = 512;
4522 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4523 } else {
4524 // NSA form
4525 for (unsigned I = 2; I < N; ++I) {
4526 unsigned Size = MRI.getType(MI.getOperand(I).getReg()).getSizeInBits();
4527 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4528 }
4529 }
4530 break;
4531 }
4532 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4533 auto IntrID = MI.getIntrinsicID();
4534 switch (IntrID) {
4535 case Intrinsic::amdgcn_s_getreg:
4536 case Intrinsic::amdgcn_s_memtime:
4537 case Intrinsic::amdgcn_s_memrealtime:
4538 case Intrinsic::amdgcn_s_get_waveid_in_workgroup:
4539 case Intrinsic::amdgcn_s_sendmsg_rtn: {
4540 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4541 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4542 break;
4543 }
4544 case Intrinsic::amdgcn_global_atomic_fadd:
4545 case Intrinsic::amdgcn_global_atomic_csub:
4546 case Intrinsic::amdgcn_global_atomic_fmin:
4547 case Intrinsic::amdgcn_global_atomic_fmax:
4548 case Intrinsic::amdgcn_flat_atomic_fadd:
4549 case Intrinsic::amdgcn_flat_atomic_fmin:
4550 case Intrinsic::amdgcn_flat_atomic_fmax:
4551 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4552 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4553 return getDefaultMappingAllVGPR(MI);
4554 case Intrinsic::amdgcn_ds_ordered_add:
4555 case Intrinsic::amdgcn_ds_ordered_swap:
4556 case Intrinsic::amdgcn_ds_fadd_v2bf16: {
4557 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4558 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4559 unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4560 AMDGPU::SGPRRegBankID);
4561 OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4562 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4563 break;
4564 }
4565 case Intrinsic::amdgcn_ds_append:
4566 case Intrinsic::amdgcn_ds_consume: {
4567 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4568 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4569 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4570 break;
4571 }
4572 case Intrinsic::amdgcn_exp_compr:
4573 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4574 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4575 break;
4576 case Intrinsic::amdgcn_exp:
4577 // FIXME: Could we support packed types here?
4578 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4579 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4580 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4581 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4582 break;
4583 case Intrinsic::amdgcn_exp_row:
4584 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4585 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4586 OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4587 OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4588 OpdsMapping[8] = getSGPROpMapping(MI.getOperand(8).getReg(), MRI, *TRI);
4589 break;
4590 case Intrinsic::amdgcn_s_sendmsg:
4591 case Intrinsic::amdgcn_s_sendmsghalt: {
4592 // This must be an SGPR, but accept a VGPR.
4593 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4594 AMDGPU::SGPRRegBankID);
4595 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4596 break;
4597 }
4598 case Intrinsic::amdgcn_s_setreg: {
4599 // This must be an SGPR, but accept a VGPR.
4600 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4601 AMDGPU::SGPRRegBankID);
4602 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4603 break;
4604 }
4605 case Intrinsic::amdgcn_end_cf: {
4606 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4607 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4608 break;
4609 }
4610 case Intrinsic::amdgcn_else: {
4611 unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4612 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4613 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4614 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4615 break;
4616 }
4617 case Intrinsic::amdgcn_live_mask: {
4618 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4619 break;
4620 }
4621 case Intrinsic::amdgcn_wqm_demote:
4622 case Intrinsic::amdgcn_kill: {
4623 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4624 break;
4625 }
4626 case Intrinsic::amdgcn_raw_buffer_load:
4627 case Intrinsic::amdgcn_raw_tbuffer_load: {
4628 // FIXME: Should make intrinsic ID the last operand of the instruction,
4629 // then this would be the same as store
4630 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4631 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4632 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4633 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4634 break;
4635 }
4636 case Intrinsic::amdgcn_raw_buffer_load_lds: {
4637 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4638 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4639 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4640 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4641 break;
4642 }
4643 case Intrinsic::amdgcn_raw_buffer_store:
4644 case Intrinsic::amdgcn_raw_buffer_store_format:
4645 case Intrinsic::amdgcn_raw_tbuffer_store: {
4646 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4647 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4648 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4649 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4650 break;
4651 }
4652 case Intrinsic::amdgcn_struct_buffer_load:
4653 case Intrinsic::amdgcn_struct_tbuffer_load: {
4654 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4655 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4656 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4657 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4658 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4659 break;
4660 }
4661 case Intrinsic::amdgcn_struct_buffer_load_lds: {
4662 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4663 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4664 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4665 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4666 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4667 break;
4668 }
4669 case Intrinsic::amdgcn_struct_buffer_store:
4670 case Intrinsic::amdgcn_struct_tbuffer_store: {
4671 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4672 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4673 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4674 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4675 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4676 break;
4677 }
4678 case Intrinsic::amdgcn_init_exec_from_input: {
4679 unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4680 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4681 break;
4682 }
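// GWS operations take their value/offset operand in a VGPR; the other register
// operand ultimately feeds M0 and wants an SGPR, but any bank is recorded here
// and repaired during apply.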
4683 case Intrinsic::amdgcn_ds_gws_init:
4684 case Intrinsic::amdgcn_ds_gws_barrier:
4685 case Intrinsic::amdgcn_ds_gws_sema_br: {
4686 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4687
4688 // This must be an SGPR, but accept a VGPR.
4689 unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4690 AMDGPU::SGPRRegBankID);
4691 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4692 break;
4693 }
4694 case Intrinsic::amdgcn_ds_gws_sema_v:
4695 case Intrinsic::amdgcn_ds_gws_sema_p:
4696 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4697 // This must be an SGPR, but accept a VGPR.
4698 unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4699 AMDGPU::SGPRRegBankID);
4700 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4701 break;
4702 }
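// For the global->LDS load, the global address is a VGPR and the LDS
// destination pointer is treated as a scalar operand.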
4703 case Intrinsic::amdgcn_global_load_lds: {
4704 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4705 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4706 break;
4707 }
4708 case Intrinsic::amdgcn_lds_direct_load: {
4709 const int M0Idx = MI.getNumOperands() - 1;
4710 Register M0Reg = MI.getOperand(M0Idx).getReg();
4711 unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4712 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4713
4714 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4715 for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4716 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4717
4718 // This must be an SGPR, but take whatever the original bank is and fix it
4719 // later.
4720 OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4721 break;
4722 }
4723 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
4724 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn:
4725 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4726 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4727 break;
4728 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
4729 OpdsMapping[0] =
4730 getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI); // %vdst
4731 OpdsMapping[1] =
4732 getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI); // %addr
4733 OpdsMapping[3] =
4734 getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI); // %addr
4735 OpdsMapping[4] =
4736 getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI); // %data0
4737 OpdsMapping[5] =
4738 getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
4739 break;
4740 }
4741
4742 default:
4743 return getInvalidInstructionMapping();
4744 }
4745 break;
4746 }
4747 case AMDGPU::G_SELECT: {
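// A select can stay on the SALU only when both value operands are already
// SGPRs and the condition did not come back as a lane mask; the s1 condition
// then stays on the SGPR bank (to be widened to 32 bits later). Otherwise the
// condition is a VCC mask and the value operands are mapped to VGPRs.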
4748 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4749 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4750 AMDGPU::SGPRRegBankID);
4751 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4752 AMDGPU::SGPRRegBankID);
4753 bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4754 Op3Bank == AMDGPU::SGPRRegBankID;
4755
4756 unsigned CondBankDefault = SGPRSrcs ?
4757 AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4758 unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4759 CondBankDefault);
4760 if (CondBank == AMDGPU::SGPRRegBankID)
4761 CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4762 else if (CondBank == AMDGPU::VGPRRegBankID)
4763 CondBank = AMDGPU::VCCRegBankID;
4764
4765 unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4766 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4767
4768 assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4769
4770 // TODO: Should report 32-bit for scalar condition type.
4771 if (Size == 64) {
4772 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4773 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4774 OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4775 OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4776 } else {
4777 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4778 OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4779 OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4780 OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4781 }
4782
4783 break;
4784 }
4785
4786 case AMDGPU::G_SI_CALL: {
4787 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4788