LLVM 20.0.0git
AMDGPURegisterBankInfo.cpp
Go to the documentation of this file.
1//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the RegisterBankInfo class for
10/// AMDGPU.
11///
12/// \par
13///
14/// AMDGPU has unique register bank constraints that require special high level
15/// strategies to deal with. There are two main true physical register banks
16/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
17/// sort of pseudo-register bank needed to represent SGPRs used in a vector
18/// boolean context. There is also the AGPR bank, which is a special purpose
19/// physical register bank present on some subtargets.
20///
21/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22/// be uniform. It is generally not valid to legalize operands by inserting
23/// copies as on other targets. Operations which require uniform, SGPR operands
24/// generally require scalarization by repeatedly executing the instruction,
25/// activating each set of lanes using a unique set of input values. This is
26/// referred to as a waterfall loop.
27///
28/// \par Booleans
29///
30/// Booleans (s1 values) requires special consideration. A vector compare result
31/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32/// register. These are represented with the VCC bank. During selection, we need
33/// to be able to unambiguously go back from a register class to a register
34/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35/// bank, we need to know the use context type. An SGPR s1 value always means a
36/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38/// a 32-bit virtual register. Taken together, this means we need to adjust the
39/// type of boolean operations to be regbank legal. All SALU booleans need to be
40/// widened to 32-bits, and all VALU booleans need to be s1 values.
41///
42/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44/// bank. A non-boolean source (such as a truncate from a 1-bit load from
45/// memory) will require a copy to the VCC bank which will require clearing the
46/// high bits and inserting a compare.
47///
48/// \par Constant bus restriction
49///
50/// VALU instructions have a limitation known as the constant bus
51/// restriction. Most VALU instructions can use SGPR operands, but may read at
52/// most 1 SGPR or constant literal value (this to 2 in gfx10 for most
53/// instructions). This is one unique SGPR, so the same SGPR may be used for
54/// multiple operands. From a register bank perspective, any combination of
55/// operands should be legal as an SGPR, but this is contextually dependent on
56/// the SGPR operands all being the same register. There is therefore optimal to
57/// choose the SGPR with the most uses to minimize the number of copies.
58///
59/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60/// operation should have its source operands all mapped to VGPRs (except for
61/// VCC), inserting copies from any SGPR operands. This the most trivial legal
62/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63/// complicated to solve here. Every optimization pattern or instruction
64/// selected to multiple outputs would have to enforce this rule, and there
65/// would be additional complexity in tracking this rule for every G_*
66/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67/// picking the optimal operand combination from a post-isel optimization pass.
68///
69//===----------------------------------------------------------------------===//
70
72
73#include "AMDGPU.h"
75#include "AMDGPUInstrInfo.h"
76#include "GCNSubtarget.h"
78#include "SIRegisterInfo.h"
84#include "llvm/IR/IntrinsicsAMDGPU.h"
85
86#define GET_TARGET_REGBANK_IMPL
87#include "AMDGPUGenRegisterBank.inc"
88
89// This file will be TableGen'ed at some point.
90#include "AMDGPUGenRegisterBankInfo.def"
91
92using namespace llvm;
93using namespace MIPatternMatch;
94
95namespace {
96
97// Observer to apply a register bank to new registers created by LegalizerHelper.
98class ApplyRegBankMapping final : public GISelChangeObserver {
99private:
101 const AMDGPURegisterBankInfo &RBI;
103 const RegisterBank *NewBank;
105
106public:
107 ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
108 MachineRegisterInfo &MRI_, const RegisterBank *RB)
109 : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
110 assert(!B.isObservingChanges());
111 B.setChangeObserver(*this);
112 }
113
114 ~ApplyRegBankMapping() override {
115 for (MachineInstr *MI : NewInsts)
116 applyBank(*MI);
117
118 B.stopObservingChanges();
119 }
120
121 /// Set any registers that don't have a set register class or bank to SALU.
122 void applyBank(MachineInstr &MI) {
123 const unsigned Opc = MI.getOpcode();
124 if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
125 Opc == AMDGPU::G_SEXT) {
126 // LegalizerHelper wants to use the basic legalization artifacts when
127 // widening etc. We don't handle selection with vcc in artifact sources,
128 // so we need to use a select instead to handle these properly.
129 Register DstReg = MI.getOperand(0).getReg();
130 Register SrcReg = MI.getOperand(1).getReg();
131 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
132 if (SrcBank == &AMDGPU::VCCRegBank) {
133 const LLT S32 = LLT::scalar(32);
134 assert(MRI.getType(SrcReg) == LLT::scalar(1));
135 assert(MRI.getType(DstReg) == S32);
136 assert(NewBank == &AMDGPU::VGPRRegBank);
137
138 // Replace the extension with a select, which really uses the boolean
139 // source.
140 B.setInsertPt(*MI.getParent(), MI);
141
142 auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
143 auto False = B.buildConstant(S32, 0);
144 B.buildSelect(DstReg, SrcReg, True, False);
145 MRI.setRegBank(True.getReg(0), *NewBank);
146 MRI.setRegBank(False.getReg(0), *NewBank);
147 MI.eraseFromParent();
148 }
149
150 assert(!MRI.getRegClassOrRegBank(DstReg));
151 MRI.setRegBank(DstReg, *NewBank);
152 return;
153 }
154
155#ifndef NDEBUG
156 if (Opc == AMDGPU::G_TRUNC) {
157 Register DstReg = MI.getOperand(0).getReg();
158 const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
159 assert(DstBank != &AMDGPU::VCCRegBank);
160 }
161#endif
162
163 for (MachineOperand &Op : MI.operands()) {
164 if (!Op.isReg())
165 continue;
166
167 // We may see physical registers if building a real MI
168 Register Reg = Op.getReg();
169 if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
170 continue;
171
172 const RegisterBank *RB = NewBank;
173 if (MRI.getType(Reg) == LLT::scalar(1)) {
174 assert(NewBank == &AMDGPU::VGPRRegBank &&
175 "s1 operands should only be used for vector bools");
176 assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
177 MI.getOpcode() != AMDGPU::G_ANYEXT) &&
178 "not expecting legalization artifacts here");
179 RB = &AMDGPU::VCCRegBank;
180 }
181
182 MRI.setRegBank(Reg, *RB);
183 }
184 }
185
186 void erasingInstr(MachineInstr &MI) override {}
187
188 void createdInstr(MachineInstr &MI) override {
189 // At this point, the instruction was just inserted and has no operands.
190 NewInsts.push_back(&MI);
191 }
192
193 void changingInstr(MachineInstr &MI) override {}
194 void changedInstr(MachineInstr &MI) override {
195 // FIXME: In principle we should probably add the instruction to NewInsts,
196 // but the way the LegalizerHelper uses the observer, we will always see the
197 // registers we need to set the regbank on also referenced in a new
198 // instruction.
199 }
200};
201
202} // anonymous namespace
203
205 : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
206 TII(Subtarget.getInstrInfo()) {
207
208 // HACK: Until this is fully tablegen'd.
209 static llvm::once_flag InitializeRegisterBankFlag;
210
211 static auto InitializeRegisterBankOnce = [this]() {
212 assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
213 &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
214 &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
215 (void)this;
216 };
217
218 llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
219}
220
221static bool isVectorRegisterBank(const RegisterBank &Bank) {
222 unsigned BankID = Bank.getID();
223 return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
224}
225
227 return RB != &AMDGPU::SGPRRegBank;
228}
229
231 const RegisterBank &Src,
232 TypeSize Size) const {
233 // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
234 if (Dst.getID() == AMDGPU::SGPRRegBankID &&
235 (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
236 return std::numeric_limits<unsigned>::max();
237 }
238
239 // Bool values are tricky, because the meaning is based on context. The SCC
240 // and VCC banks are for the natural scalar and vector conditions produced by
241 // a compare.
242 //
243 // Legalization doesn't know about the necessary context, so an s1 use may
244 // have been a truncate from an arbitrary value, in which case a copy (lowered
245 // as a compare with 0) needs to be inserted.
246 if (Size == 1 &&
247 (Dst.getID() == AMDGPU::SGPRRegBankID) &&
248 (isVectorRegisterBank(Src) ||
249 Src.getID() == AMDGPU::SGPRRegBankID ||
250 Src.getID() == AMDGPU::VCCRegBankID))
251 return std::numeric_limits<unsigned>::max();
252
253 // There is no direct copy between AGPRs.
254 if (Dst.getID() == AMDGPU::AGPRRegBankID &&
255 Src.getID() == AMDGPU::AGPRRegBankID)
256 return 4;
257
258 return RegisterBankInfo::copyCost(Dst, Src, Size);
259}
260
262 const ValueMapping &ValMapping,
263 const RegisterBank *CurBank) const {
264 // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
265 // VGPR.
266 // FIXME: Is there a better way to do this?
267 if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
268 return 10; // This is expensive.
269
270 assert(ValMapping.NumBreakDowns == 2 &&
271 ValMapping.BreakDown[0].Length == 32 &&
272 ValMapping.BreakDown[0].StartIdx == 0 &&
273 ValMapping.BreakDown[1].Length == 32 &&
274 ValMapping.BreakDown[1].StartIdx == 32 &&
275 ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
276
277 // 32-bit extract of a 64-bit value is just access of a subregister, so free.
278 // TODO: Cost of 0 hits assert, though it's not clear it's what we really
279 // want.
280
281 // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
282 // alignment restrictions, but this probably isn't important.
283 return 1;
284}
285
286const RegisterBank &
288 LLT Ty) const {
289 if (&RC == &AMDGPU::SReg_1RegClass)
290 return AMDGPU::VCCRegBank;
291
292 // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
293 // VCC-like use.
294 if (TRI->isSGPRClass(&RC)) {
295 // FIXME: This probably came from a copy from a physical register, which
296 // should be inferable from the copied to-type. We don't have many boolean
297 // physical register constraints so just assume a normal SGPR for now.
298 if (!Ty.isValid())
299 return AMDGPU::SGPRRegBank;
300
301 return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
302 }
303
304 return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
305}
306
307template <unsigned NumOps>
310 const MachineInstr &MI, const MachineRegisterInfo &MRI,
311 const std::array<unsigned, NumOps> RegSrcOpIdx,
312 ArrayRef<OpRegBankEntry<NumOps>> Table) const {
313
314 InstructionMappings AltMappings;
315
317
318 unsigned Sizes[NumOps];
319 for (unsigned I = 0; I < NumOps; ++I) {
320 Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
321 Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
322 }
323
324 for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
325 unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
326 Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
327 }
328
329 // getInstrMapping's default mapping uses ID 1, so start at 2.
330 unsigned MappingID = 2;
331 for (const auto &Entry : Table) {
332 for (unsigned I = 0; I < NumOps; ++I) {
333 int OpIdx = RegSrcOpIdx[I];
334 Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
335 }
336
337 AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
339 Operands.size()));
340 }
341
342 return AltMappings;
343}
344
347 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
348 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
349 case Intrinsic::amdgcn_readlane: {
350 static const OpRegBankEntry<3> Table[2] = {
351 // Perfectly legal.
352 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
353
354 // Need a readfirstlane for the index.
355 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
356 };
357
358 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
359 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
360 }
361 case Intrinsic::amdgcn_writelane: {
362 static const OpRegBankEntry<4> Table[4] = {
363 // Perfectly legal.
364 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
365
366 // Need readfirstlane of first op
367 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
368
369 // Need readfirstlane of second op
370 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
371
372 // Need readfirstlane of both ops
373 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
374 };
375
376 // rsrc, voffset, offset
377 const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
378 return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, Table);
379 }
380 default:
382 }
383}
384
387 const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
388
389 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
390 case Intrinsic::amdgcn_s_buffer_load: {
391 static const OpRegBankEntry<2> Table[4] = {
392 // Perfectly legal.
393 { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
394
395 // Only need 1 register in loop
396 { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
397
398 // Have to waterfall the resource.
399 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
400
401 // Have to waterfall the resource, and the offset.
402 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
403 };
404
405 // rsrc, offset
406 const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
407 return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, Table);
408 }
409 case Intrinsic::amdgcn_ds_ordered_add:
410 case Intrinsic::amdgcn_ds_ordered_swap: {
411 // VGPR = M0, VGPR
412 static const OpRegBankEntry<3> Table[2] = {
413 // Perfectly legal.
414 { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
415
416 // Need a readfirstlane for m0
417 { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
418 };
419
420 const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
421 return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, Table);
422 }
423 case Intrinsic::amdgcn_s_sendmsg:
424 case Intrinsic::amdgcn_s_sendmsghalt: {
425 // FIXME: Should have no register for immediate
426 static const OpRegBankEntry<1> Table[2] = {
427 // Perfectly legal.
428 { { AMDGPU::SGPRRegBankID }, 1 },
429
430 // Need readlane
431 { { AMDGPU::VGPRRegBankID }, 3 }
432 };
433
434 const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
435 return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, Table);
436 }
437 default:
439 }
440}
441
442// FIXME: Returns uniform if there's no source value information. This is
443// probably wrong.
445 if (!MI.hasOneMemOperand())
446 return false;
447
448 const MachineMemOperand *MMO = *MI.memoperands_begin();
449 const unsigned AS = MMO->getAddrSpace();
450 const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
452 const unsigned MemSize = 8 * MMO->getSize().getValue();
453
454 // Require 4-byte alignment.
455 return (MMO->getAlign() >= Align(4) ||
457 ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
458 (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
459 // Can't do a scalar atomic load.
460 !MMO->isAtomic() &&
461 // Don't use scalar loads for volatile accesses to non-constant address
462 // spaces.
463 (IsConst || !MMO->isVolatile()) &&
464 // Memory must be known constant, or not written before this load.
465 (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
467}
468
471 const MachineInstr &MI) const {
472
473 const MachineFunction &MF = *MI.getParent()->getParent();
474 const MachineRegisterInfo &MRI = MF.getRegInfo();
475
476
477 InstructionMappings AltMappings;
478 switch (MI.getOpcode()) {
479 case TargetOpcode::G_CONSTANT:
480 case TargetOpcode::G_IMPLICIT_DEF: {
481 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
482 if (Size == 1) {
483 static const OpRegBankEntry<1> Table[3] = {
484 { { AMDGPU::VGPRRegBankID }, 1 },
485 { { AMDGPU::SGPRRegBankID }, 1 },
486 { { AMDGPU::VCCRegBankID }, 1 }
487 };
488
489 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
490 }
491
492 [[fallthrough]];
493 }
494 case TargetOpcode::G_FCONSTANT:
495 case TargetOpcode::G_FRAME_INDEX:
496 case TargetOpcode::G_GLOBAL_VALUE: {
497 static const OpRegBankEntry<1> Table[2] = {
498 { { AMDGPU::VGPRRegBankID }, 1 },
499 { { AMDGPU::SGPRRegBankID }, 1 }
500 };
501
502 return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
503 }
504 case TargetOpcode::G_AND:
505 case TargetOpcode::G_OR:
506 case TargetOpcode::G_XOR: {
507 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
508
509 if (Size == 1) {
510 // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
511 const InstructionMapping &SCCMapping = getInstructionMapping(
512 1, 1, getOperandsMapping(
513 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
514 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
515 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
516 3); // Num Operands
517 AltMappings.push_back(&SCCMapping);
518
519 const InstructionMapping &VCCMapping0 = getInstructionMapping(
520 2, 1, getOperandsMapping(
521 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
522 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
523 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
524 3); // Num Operands
525 AltMappings.push_back(&VCCMapping0);
526 return AltMappings;
527 }
528
529 if (Size != 64)
530 break;
531
532 const InstructionMapping &SSMapping = getInstructionMapping(
533 1, 1, getOperandsMapping(
534 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
535 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
536 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
537 3); // Num Operands
538 AltMappings.push_back(&SSMapping);
539
540 const InstructionMapping &VVMapping = getInstructionMapping(
541 2, 2, getOperandsMapping(
542 {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
543 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
544 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
545 3); // Num Operands
546 AltMappings.push_back(&VVMapping);
547 break;
548 }
549 case TargetOpcode::G_LOAD:
550 case TargetOpcode::G_ZEXTLOAD:
551 case TargetOpcode::G_SEXTLOAD: {
552 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
553 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
554 unsigned PtrSize = PtrTy.getSizeInBits();
555 unsigned AS = PtrTy.getAddressSpace();
556
560 const InstructionMapping &SSMapping = getInstructionMapping(
561 1, 1, getOperandsMapping(
562 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
563 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
564 2); // Num Operands
565 AltMappings.push_back(&SSMapping);
566 }
567
568 const InstructionMapping &VVMapping = getInstructionMapping(
569 2, 1,
571 {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
572 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
573 2); // Num Operands
574 AltMappings.push_back(&VVMapping);
575
576 // It may be possible to have a vgpr = load sgpr mapping here, because
577 // the mubuf instructions support this kind of load, but probably for only
578 // gfx7 and older. However, the addressing mode matching in the instruction
579 // selector should be able to do a better job of detecting and selecting
580 // these kinds of loads from the vgpr = load vgpr mapping.
581
582 return AltMappings;
583
584 }
585 case TargetOpcode::G_SELECT: {
586 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
587 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
588 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
589 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
590 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
591 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
592 4); // Num Operands
593 AltMappings.push_back(&SSMapping);
594
595 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
596 getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
597 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
598 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
599 AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
600 4); // Num Operands
601 AltMappings.push_back(&VVMapping);
602
603 return AltMappings;
604 }
605 case TargetOpcode::G_UADDE:
606 case TargetOpcode::G_USUBE:
607 case TargetOpcode::G_SADDE:
608 case TargetOpcode::G_SSUBE: {
609 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
610 const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
612 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
613 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
614 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
615 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
616 AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
617 5); // Num Operands
618 AltMappings.push_back(&SSMapping);
619
620 const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
621 getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
622 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
623 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
624 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
625 AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
626 5); // Num Operands
627 AltMappings.push_back(&VVMapping);
628 return AltMappings;
629 }
630 case AMDGPU::G_BRCOND: {
631 assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
632
633 // TODO: Change type to 32 for scalar
635 1, 1, getOperandsMapping(
636 {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
637 2); // Num Operands
638 AltMappings.push_back(&SMapping);
639
641 1, 1, getOperandsMapping(
642 {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
643 2); // Num Operands
644 AltMappings.push_back(&VMapping);
645 return AltMappings;
646 }
647 case AMDGPU::G_INTRINSIC:
648 case AMDGPU::G_INTRINSIC_CONVERGENT:
650 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
651 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
653 default:
654 break;
655 }
657}
658
662 LLT HalfTy,
663 Register Reg) const {
664 assert(HalfTy.getSizeInBits() == 32);
665 MachineRegisterInfo *MRI = B.getMRI();
666 Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
667 Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
668 const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
669 MRI->setRegBank(LoLHS, *Bank);
670 MRI->setRegBank(HiLHS, *Bank);
671
672 Regs.push_back(LoLHS);
673 Regs.push_back(HiLHS);
674
675 B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
676 .addDef(LoLHS)
677 .addDef(HiLHS)
678 .addUse(Reg);
679}
680
681/// Replace the current type each register in \p Regs has with \p NewTy
683 LLT NewTy) {
684 for (Register Reg : Regs) {
685 assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
686 MRI.setType(Reg, NewTy);
687 }
688}
689
691 if (Ty.isVector()) {
694 Ty.getElementType());
695 }
696
697 assert(Ty.getScalarSizeInBits() % 2 == 0);
698 return LLT::scalar(Ty.getScalarSizeInBits() / 2);
699}
700
701// Build one or more V_READFIRSTLANE_B32 instructions to move the given vector
702// source value into a scalar register.
705 Register Src) const {
706 LLT Ty = MRI.getType(Src);
707 const RegisterBank *Bank = getRegBank(Src, MRI, *TRI);
708
709 if (Bank == &AMDGPU::SGPRRegBank)
710 return Src;
711
712 unsigned Bits = Ty.getSizeInBits();
713 assert(Bits % 32 == 0);
714
715 if (Bank != &AMDGPU::VGPRRegBank) {
716 // We need to copy from AGPR to VGPR
717 Src = B.buildCopy(Ty, Src).getReg(0);
718 MRI.setRegBank(Src, AMDGPU::VGPRRegBank);
719 }
720
721 LLT S32 = LLT::scalar(32);
722 unsigned NumParts = Bits / 32;
725
726 if (Bits == 32) {
727 SrcParts.push_back(Src);
728 } else {
729 auto Unmerge = B.buildUnmerge(S32, Src);
730 for (unsigned i = 0; i < NumParts; ++i)
731 SrcParts.push_back(Unmerge.getReg(i));
732 }
733
734 for (unsigned i = 0; i < NumParts; ++i) {
735 Register SrcPart = SrcParts[i];
736 Register DstPart = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
737 MRI.setType(DstPart, NumParts == 1 ? Ty : S32);
738
739 const TargetRegisterClass *Constrained =
740 constrainGenericRegister(SrcPart, AMDGPU::VGPR_32RegClass, MRI);
741 (void)Constrained;
742 assert(Constrained && "Failed to constrain readfirstlane src reg");
743
744 B.buildInstr(AMDGPU::V_READFIRSTLANE_B32, {DstPart}, {SrcPart});
745
746 DstParts.push_back(DstPart);
747 }
748
749 if (Bits == 32)
750 return DstParts[0];
751
752 Register Dst = B.buildMergeLikeInstr(Ty, DstParts).getReg(0);
753 MRI.setRegBank(Dst, AMDGPU::SGPRRegBank);
754 return Dst;
755}
756
757/// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
758/// any of the required SGPR operands are VGPRs, perform a waterfall loop to
759/// execute the instruction for each unique combination of values in all lanes
760/// in the wave. The block will be split such that rest of the instructions are
761/// moved to a new block.
762///
763/// Essentially performs this loop:
764//
765/// Save Execution Mask
766/// For (Lane : Wavefront) {
767/// Enable Lane, Disable all other lanes
768/// SGPR = read SGPR value for current lane from VGPR
769/// VGPRResult[Lane] = use_op SGPR
770/// }
771/// Restore Execution Mask
772///
773/// There is additional complexity to try for compare values to identify the
774/// unique values used.
777 SmallSet<Register, 4> &SGPROperandRegs) const {
778 // Track use registers which have already been expanded with a readfirstlane
779 // sequence. This may have multiple uses if moving a sequence.
780 DenseMap<Register, Register> WaterfalledRegMap;
781
782 MachineBasicBlock &MBB = B.getMBB();
783 MachineFunction *MF = &B.getMF();
784
786 const unsigned MovExecOpc =
787 Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788 const unsigned MovExecTermOpc =
789 Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790
791 const unsigned XorTermOpc = Subtarget.isWave32() ?
792 AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
793 const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
794 AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
795 const unsigned ExecReg = Subtarget.isWave32() ?
796 AMDGPU::EXEC_LO : AMDGPU::EXEC;
797
798#ifndef NDEBUG
799 const int OrigRangeSize = std::distance(Range.begin(), Range.end());
800#endif
801
802 MachineRegisterInfo &MRI = *B.getMRI();
803 Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
804 Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
805
806 // Don't bother using generic instructions/registers for the exec mask.
807 B.buildInstr(TargetOpcode::IMPLICIT_DEF)
808 .addDef(InitSaveExecReg);
809
810 Register PhiExec = MRI.createVirtualRegister(WaveRC);
811 Register NewExec = MRI.createVirtualRegister(WaveRC);
812
813 // To insert the loop we need to split the block. Move everything before this
814 // point to a new block, and insert a new empty block before this instruction.
817 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
818 MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
820 ++MBBI;
821 MF->insert(MBBI, LoopBB);
822 MF->insert(MBBI, BodyBB);
823 MF->insert(MBBI, RestoreExecBB);
824 MF->insert(MBBI, RemainderBB);
825
826 LoopBB->addSuccessor(BodyBB);
827 BodyBB->addSuccessor(RestoreExecBB);
828 BodyBB->addSuccessor(LoopBB);
829
830 // Move the rest of the block into a new block.
832 RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
833
834 MBB.addSuccessor(LoopBB);
835 RestoreExecBB->addSuccessor(RemainderBB);
836
837 B.setInsertPt(*LoopBB, LoopBB->end());
838
839 B.buildInstr(TargetOpcode::PHI)
840 .addDef(PhiExec)
841 .addReg(InitSaveExecReg)
842 .addMBB(&MBB)
843 .addReg(NewExec)
844 .addMBB(BodyBB);
845
846 const DebugLoc &DL = B.getDL();
847
848 MachineInstr &FirstInst = *Range.begin();
849
850 // Move the instruction into the loop body. Note we moved everything after
851 // Range.end() already into a new block, so Range.end() is no longer valid.
852 BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
853
854 // Figure out the iterator range after splicing the instructions.
855 MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
856 auto NewEnd = BodyBB->end();
857
858 B.setMBB(*LoopBB);
859
860 LLT S1 = LLT::scalar(1);
861 Register CondReg;
862
863 assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
864
865 for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
866 for (MachineOperand &Op : MI.all_uses()) {
867 Register OldReg = Op.getReg();
868 if (!SGPROperandRegs.count(OldReg))
869 continue;
870
871 // See if we already processed this register in another instruction in the
872 // sequence.
873 auto OldVal = WaterfalledRegMap.find(OldReg);
874 if (OldVal != WaterfalledRegMap.end()) {
875 Op.setReg(OldVal->second);
876 continue;
877 }
878
879 Register OpReg = Op.getReg();
880 LLT OpTy = MRI.getType(OpReg);
881
882 const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
883 if (OpBank != &AMDGPU::VGPRRegBank) {
884 // Insert copy from AGPR to VGPR before the loop.
885 B.setMBB(MBB);
886 OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
887 MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
888 B.setMBB(*LoopBB);
889 }
890
891 Register CurrentLaneReg = buildReadFirstLane(B, MRI, OpReg);
892
893 // Build the comparison(s).
894 unsigned OpSize = OpTy.getSizeInBits();
895 bool Is64 = OpSize % 64 == 0;
896 unsigned PartSize = Is64 ? 64 : 32;
897 LLT PartTy = LLT::scalar(PartSize);
898 unsigned NumParts = OpSize / PartSize;
900 SmallVector<Register, 8> CurrentLaneParts;
901
902 if (NumParts == 1) {
903 OpParts.push_back(OpReg);
904 CurrentLaneParts.push_back(CurrentLaneReg);
905 } else {
906 auto UnmergeOp = B.buildUnmerge(PartTy, OpReg);
907 auto UnmergeCurrentLane = B.buildUnmerge(PartTy, CurrentLaneReg);
908 for (unsigned i = 0; i < NumParts; ++i) {
909 OpParts.push_back(UnmergeOp.getReg(i));
910 CurrentLaneParts.push_back(UnmergeCurrentLane.getReg(i));
911 MRI.setRegBank(OpParts[i], AMDGPU::VGPRRegBank);
912 MRI.setRegBank(CurrentLaneParts[i], AMDGPU::SGPRRegBank);
913 }
914 }
915
916 for (unsigned i = 0; i < NumParts; ++i) {
917 auto CmpReg = B.buildICmp(CmpInst::ICMP_EQ, S1, CurrentLaneParts[i],
918 OpParts[i]).getReg(0);
919 MRI.setRegBank(CmpReg, AMDGPU::VCCRegBank);
920
921 if (!CondReg) {
922 CondReg = CmpReg;
923 } else {
924 CondReg = B.buildAnd(S1, CondReg, CmpReg).getReg(0);
925 MRI.setRegBank(CondReg, AMDGPU::VCCRegBank);
926 }
927 }
928
929 Op.setReg(CurrentLaneReg);
930
931 // Make sure we don't re-process this register again.
932 WaterfalledRegMap.insert(std::pair(OldReg, Op.getReg()));
933 }
934 }
935
936 // The ballot becomes a no-op during instruction selection.
937 CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
938 {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
939 .addReg(CondReg)
940 .getReg(0);
941 MRI.setRegClass(CondReg, WaveRC);
942
943 // Update EXEC, save the original EXEC value to VCC.
944 B.buildInstr(AndSaveExecOpc)
945 .addDef(NewExec)
946 .addReg(CondReg, RegState::Kill);
947
948 MRI.setSimpleHint(NewExec, CondReg);
949
950 B.setInsertPt(*BodyBB, BodyBB->end());
951
952 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
953 B.buildInstr(XorTermOpc)
954 .addDef(ExecReg)
955 .addReg(ExecReg)
956 .addReg(NewExec);
957
958 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959 // s_cbranch_scc0?
960
961 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
962 B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
963
964 // Save the EXEC mask before the loop.
965 BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966 .addReg(ExecReg);
967
968 // Restore the EXEC mask after the loop.
969 B.setMBB(*RestoreExecBB);
970 B.buildInstr(MovExecTermOpc)
971 .addDef(ExecReg)
972 .addReg(SaveExecReg);
973
974 // Set the insert point after the original instruction, so any new
975 // instructions will be in the remainder.
976 B.setInsertPt(*RemainderBB, RemainderBB->begin());
977
978 return true;
979}
980
981// Return any unique registers used by \p MI at \p OpIndices that need to be
982// handled in a waterfall loop. Returns these registers in \p
983// SGPROperandRegs. Returns true if there are any operands to handle and a
984// waterfall loop is necessary.
986 SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
987 MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
988 for (unsigned Op : OpIndices) {
989 assert(MI.getOperand(Op).isUse());
990 Register Reg = MI.getOperand(Op).getReg();
991 const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
992 if (OpBank->getID() != AMDGPU::SGPRRegBankID)
993 SGPROperandRegs.insert(Reg);
994 }
995
996 // No operands need to be replaced, so no need to loop.
997 return !SGPROperandRegs.empty();
998}
999
1001 MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
1002 // Use a set to avoid extra readfirstlanes in the case where multiple operands
1003 // are the same register.
1004 SmallSet<Register, 4> SGPROperandRegs;
1005
1006 if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
1007 return false;
1008
1009 MachineBasicBlock::iterator I = MI.getIterator();
1010 return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1011 SGPROperandRegs);
1012}
1013
1014// Legalize an operand that must be an SGPR by inserting a readfirstlane.
1016 MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
1017 Register Reg = MI.getOperand(OpIdx).getReg();
1018 MachineRegisterInfo &MRI = *B.getMRI();
1019 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1020 if (Bank == &AMDGPU::SGPRRegBank)
1021 return;
1022
1023 Reg = buildReadFirstLane(B, MRI, Reg);
1024 MI.getOperand(OpIdx).setReg(Reg);
1025}
1026
1027/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1028/// rest will be in the remainder.
1029static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1030 unsigned TotalSize = Ty.getSizeInBits();
1031 if (!Ty.isVector())
1032 return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1033
1034 LLT EltTy = Ty.getElementType();
1035 unsigned EltSize = EltTy.getSizeInBits();
1036 assert(FirstSize % EltSize == 0);
1037
1038 unsigned FirstPartNumElts = FirstSize / EltSize;
1039 unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1040
1041 return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1042 LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1043}
1044
1046 if (!Ty.isVector())
1047 return LLT::scalar(128);
1048
1049 LLT EltTy = Ty.getElementType();
1050 assert(128 % EltTy.getSizeInBits() == 0);
1051 return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1052}
1053
1057 MachineInstr &MI) const {
1058 MachineRegisterInfo &MRI = *B.getMRI();
1059 Register DstReg = MI.getOperand(0).getReg();
1060 const LLT LoadTy = MRI.getType(DstReg);
1061 unsigned LoadSize = LoadTy.getSizeInBits();
1062 MachineMemOperand *MMO = *MI.memoperands_begin();
1063 const unsigned MaxNonSmrdLoadSize = 128;
1064
1065 const RegisterBank *DstBank =
1066 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1067 if (DstBank == &AMDGPU::SGPRRegBank) {
1068 // There are some special cases that we need to look at for 32 bit and 96
1069 // bit SGPR loads otherwise we have nothing to do.
1070 if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
1071 return false;
1072
1073 const unsigned MemSize = 8 * MMO->getSize().getValue();
1074 // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1075 // 32 bit. Check to see if we need to widen the memory access, 8 or 16 bit
1076 // scalar loads should have a load size of 32 but memory access size of less
1077 // than 32.
1078 if (LoadSize == 32 &&
1079 (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1080 return false;
1081
1082 if (LoadSize == 32 &&
1083 ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
1084 (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
1087 return false;
1088
1089 Register PtrReg = MI.getOperand(1).getReg();
1090
1091 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
1092
1093 if (LoadSize == 32) {
1094 // This is an extending load from a sub-dword size. Widen the memory
1095 // access size to 4 bytes and clear the extra high bits appropriately
1096 const LLT S32 = LLT::scalar(32);
1097 if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1098 // Must extend the sign bit into higher bits for a G_SEXTLOAD
1099 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1100 B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1101 } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1102 // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1103 auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1104 B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1105 } else
1106 // We do not need to touch the higher bits for regular loads.
1107 B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1108 } else {
1109 // 96-bit loads are only available for vector loads. We need to split this
1110 // into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
1111 if (MMO->getAlign() < Align(16)) {
1112 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
1113 LLT Part64, Part32;
1114 std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1115 if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1117 return false;
1118 return true;
1119 }
1120 LLT WiderTy = widen96To128(LoadTy);
1121 auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1122 if (WiderTy.isScalar()) {
1123 B.buildTrunc(MI.getOperand(0), WideLoad);
1124 } else {
1125 B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1126 WideLoad);
1127 }
1128 }
1129
1130 MI.eraseFromParent();
1131 return true;
1132 }
1133
1134 // 128-bit loads are supported for all instruction types.
1135 if (LoadSize <= MaxNonSmrdLoadSize)
1136 return false;
1137
1138 SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1139 SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1140
1141 if (SrcRegs.empty())
1142 SrcRegs.push_back(MI.getOperand(1).getReg());
1143
1144 // RegBankSelect only emits scalar types, so we need to reset the pointer
1145 // operand to a pointer type.
1146 Register BasePtrReg = SrcRegs[0];
1147 LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1148 MRI.setType(BasePtrReg, PtrTy);
1149
1150 // The following are the loads not splitted enough during legalization
1151 // because it was not clear they are smem-load or vmem-load
1154 assert(LoadSize % MaxNonSmrdLoadSize == 0);
1155 unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1156 const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1157 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
1158 LegalizerHelper Helper(B.getMF(), O, B);
1159 if (LoadTy.isVector()) {
1160 if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) !=
1162 return false;
1163 } else {
1164 if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1165 return false;
1166 }
1167 }
1168
1169 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1170 return true;
1171}
1172
1176 MachineInstr &MI) const {
1177 MachineRegisterInfo &MRI = *B.getMRI();
1178 const MachineFunction &MF = B.getMF();
1179 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1180 const auto &TFI = *ST.getFrameLowering();
1181
1182 // Guard in case the stack growth direction ever changes with scratch
1183 // instructions.
1184 if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1185 return false;
1186
1187 Register Dst = MI.getOperand(0).getReg();
1188 Register AllocSize = MI.getOperand(1).getReg();
1189 Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1190
1191 const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1192
1193 // TODO: Need to emit a wave reduction to get the maximum size.
1194 if (SizeBank != &AMDGPU::SGPRRegBank)
1195 return false;
1196
1197 LLT PtrTy = MRI.getType(Dst);
1198 LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1199
1201 Register SPReg = Info->getStackPtrOffsetReg();
1202 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1203
1204 auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1205 auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1206
1207 auto SPCopy = B.buildCopy(PtrTy, SPReg);
1208 if (Alignment > TFI.getStackAlign()) {
1209 auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1210 B.buildMaskLowPtrBits(Dst, PtrAdd,
1211 Log2(Alignment) + ST.getWavefrontSizeLog2());
1212 } else {
1213 B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1214 }
1215
1216 MI.eraseFromParent();
1217 return true;
1218}
1219
1223 int RsrcIdx) const {
1224 const int NumDefs = MI.getNumExplicitDefs();
1225
1226 // The reported argument index is relative to the IR intrinsic call arguments,
1227 // so we need to shift by the number of defs and the intrinsic ID.
1228 RsrcIdx += NumDefs + 1;
1229
1230 // Insert copies to VGPR arguments.
1231 applyDefaultMapping(OpdMapper);
1232
1233 // Fixup any SGPR arguments.
1234 SmallVector<unsigned, 4> SGPRIndexes;
1235 for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1236 if (!MI.getOperand(I).isReg())
1237 continue;
1238
1239 // If this intrinsic has a sampler, it immediately follows rsrc.
1240 if (I == RsrcIdx || I == RsrcIdx + 1)
1241 SGPRIndexes.push_back(I);
1242 }
1243
1244 executeInWaterfallLoop(B, MI, SGPRIndexes);
1245 return true;
1246}
1247
1248// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1249// the three offsets (voffset, soffset and instoffset)
1251 MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
1252 Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
1253 const LLT S32 = LLT::scalar(32);
1254 MachineRegisterInfo *MRI = B.getMRI();
1255
1256 if (std::optional<int64_t> Imm =
1257 getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1258 uint32_t SOffset, ImmOffset;
1259 if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
1260 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1261 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1262 InstOffsetVal = ImmOffset;
1263
1264 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1265 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1266 return SOffset + ImmOffset;
1267 }
1268 }
1269
1270 Register Base;
1271 unsigned Offset;
1272
1273 std::tie(Base, Offset) =
1274 AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1275
1276 uint32_t SOffset, ImmOffset;
1277 if ((int)Offset > 0 &&
1278 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
1279 if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1280 VOffsetReg = Base;
1281 SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1282 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1283 InstOffsetVal = ImmOffset;
1284 return 0; // XXX - Why is this 0?
1285 }
1286
1287 // If we have SGPR base, we can use it for soffset.
1288 if (SOffset == 0) {
1289 VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1290 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1291 SOffsetReg = Base;
1292 InstOffsetVal = ImmOffset;
1293 return 0; // XXX - Why is this 0?
1294 }
1295 }
1296
1297 // Handle the variable sgpr + vgpr case.
1298 MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1299 if (Add && (int)Offset >= 0) {
1300 Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
1301 Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
1302
1303 const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
1304 const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
1305
1306 if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1307 VOffsetReg = Src0;
1308 SOffsetReg = Src1;
1309 return 0;
1310 }
1311
1312 if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1313 VOffsetReg = Src1;
1314 SOffsetReg = Src0;
1315 return 0;
1316 }
1317 }
1318
1319 // Ensure we have a VGPR for the combined offset. This could be an issue if we
1320 // have an SGPR offset and a VGPR resource.
1321 if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
1322 VOffsetReg = CombinedOffset;
1323 } else {
1324 VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1325 B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1326 }
1327
1328 SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1329 B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1330 return 0;
1331}
1332
1334 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1335 MachineInstr &MI = OpdMapper.getMI();
1336 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1337
1338 const LLT S32 = LLT::scalar(32);
1339 Register Dst = MI.getOperand(0).getReg();
1340 LLT Ty = MRI.getType(Dst);
1341
1342 const RegisterBank *RSrcBank =
1343 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1344 const RegisterBank *OffsetBank =
1345 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1346 if (RSrcBank == &AMDGPU::SGPRRegBank &&
1347 OffsetBank == &AMDGPU::SGPRRegBank)
1348 return true; // Legal mapping
1349
1350 // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1351 // here but don't have an MMO.
1352
1353 unsigned LoadSize = Ty.getSizeInBits();
1354 int NumLoads = 1;
1355 if (LoadSize == 256 || LoadSize == 512) {
1356 NumLoads = LoadSize / 128;
1357 Ty = Ty.divide(NumLoads);
1358 }
1359
1360 // Use the alignment to ensure that the required offsets will fit into the
1361 // immediate offsets.
1362 const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1363
1364 MachineFunction &MF = B.getMF();
1365
1366 Register SOffset;
1367 Register VOffset;
1368 int64_t ImmOffset = 0;
1369
1370 unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
1371 SOffset, ImmOffset, Alignment);
1372
1373 // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1374 // can, but we need to track an MMO for that.
1375 const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1376 const Align MemAlign(4); // FIXME: ABI type alignment?
1381 MemSize, MemAlign);
1382 if (MMOOffset != 0)
1383 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1384
1385 // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1386 // assume that the buffer is unswizzled.
1387
1388 Register RSrc = MI.getOperand(1).getReg();
1389 Register VIndex = B.buildConstant(S32, 0).getReg(0);
1390 B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1391
1392 SmallVector<Register, 4> LoadParts(NumLoads);
1393
1394 MachineBasicBlock::iterator MII = MI.getIterator();
1395 MachineInstrSpan Span(MII, &B.getMBB());
1396
1397 for (int i = 0; i < NumLoads; ++i) {
1398 if (NumLoads == 1) {
1399 LoadParts[i] = Dst;
1400 } else {
1401 LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1402 MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1403 }
1404
1405 MachineMemOperand *MMO = BaseMMO;
1406 if (i != 0)
1407 BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1408
1409 B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1410 .addDef(LoadParts[i]) // vdata
1411 .addUse(RSrc) // rsrc
1412 .addUse(VIndex) // vindex
1413 .addUse(VOffset) // voffset
1414 .addUse(SOffset) // soffset
1415 .addImm(ImmOffset + 16 * i) // offset(imm)
1416 .addImm(0) // cachepolicy, swizzled buffer(imm)
1417 .addImm(0) // idxen(imm)
1418 .addMemOperand(MMO);
1419 }
1420
1421 // TODO: If only the resource is a VGPR, it may be better to execute the
1422 // scalar load in the waterfall loop if the resource is expected to frequently
1423 // be dynamically uniform.
1424 if (RSrcBank != &AMDGPU::SGPRRegBank) {
1425 // Remove the original instruction to avoid potentially confusing the
1426 // waterfall loop logic.
1427 B.setInstr(*Span.begin());
1428 MI.eraseFromParent();
1429
1430 SmallSet<Register, 4> OpsToWaterfall;
1431
1432 OpsToWaterfall.insert(RSrc);
1433 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1434 OpsToWaterfall);
1435 }
1436
1437 if (NumLoads != 1) {
1438 if (Ty.isVector())
1439 B.buildConcatVectors(Dst, LoadParts);
1440 else
1441 B.buildMergeLikeInstr(Dst, LoadParts);
1442 }
1443
1444 // We removed the instruction earlier with a waterfall loop.
1445 if (RSrcBank == &AMDGPU::SGPRRegBank)
1446 MI.eraseFromParent();
1447
1448 return true;
1449}
1450
1452 const OperandsMapper &OpdMapper,
1453 bool Signed) const {
1454 MachineInstr &MI = OpdMapper.getMI();
1455 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1456
1457 // Insert basic copies
1458 applyDefaultMapping(OpdMapper);
1459
1460 Register DstReg = MI.getOperand(0).getReg();
1461 LLT Ty = MRI.getType(DstReg);
1462
1463 const LLT S32 = LLT::scalar(32);
1464
1465 unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
1466 Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1467 Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1468 Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1469
1470 const RegisterBank *DstBank =
1471 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1472 if (DstBank == &AMDGPU::VGPRRegBank) {
1473 if (Ty == S32)
1474 return true;
1475
1476 // There is no 64-bit vgpr bitfield extract instructions so the operation
1477 // is expanded to a sequence of instructions that implement the operation.
1478 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
1479
1480 const LLT S64 = LLT::scalar(64);
1481 // Shift the source operand so that extracted bits start at bit 0.
1482 auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1483 : B.buildLShr(S64, SrcReg, OffsetReg);
1484 auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1485
1486 // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1487 // if the width is a constant.
1488 if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1489 // Use the 32-bit bitfield extract instruction if the width is a constant.
1490 // Depending on the width size, use either the low or high 32-bits.
1491 auto Zero = B.buildConstant(S32, 0);
1492 auto WidthImm = ConstWidth->Value.getZExtValue();
1493 if (WidthImm <= 32) {
1494 // Use bitfield extract on the lower 32-bit source, and then sign-extend
1495 // or clear the upper 32-bits.
1496 auto Extract =
1497 Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1498 : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1499 auto Extend =
1500 Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1501 B.buildMergeLikeInstr(DstReg, {Extract, Extend});
1502 } else {
1503 // Use bitfield extract on upper 32-bit source, and combine with lower
1504 // 32-bit source.
1505 auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1506 auto Extract =
1507 Signed
1508 ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1509 : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1510 B.buildMergeLikeInstr(DstReg, {UnmergeSOffset.getReg(0), Extract});
1511 }
1512 MI.eraseFromParent();
1513 return true;
1514 }
1515
1516 // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1517 // operations.
1518 auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1519 auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1520 if (Signed)
1521 B.buildAShr(S64, SignBit, ExtShift);
1522 else
1523 B.buildLShr(S64, SignBit, ExtShift);
1524 MI.eraseFromParent();
1525 return true;
1526 }
1527
1528 // The scalar form packs the offset and width in a single operand.
1529
1530 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
1531
1532 // Ensure the high bits are clear to insert the offset.
1533 auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1534 auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1535
1536 // Zeros out the low bits, so don't bother clamping the input value.
1537 auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1538
1539 // Transformation function, pack the offset and width of a BFE into
1540 // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1541 // source, bits [5:0] contain the offset and bits [22:16] the width.
1542 auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1543
1544 // TODO: It might be worth using a pseudo here to avoid scc clobber and
1545 // register class constraints.
1546 unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1547 (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1548
1549 auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1550 if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1551 llvm_unreachable("failed to constrain BFE");
1552
1553 MI.eraseFromParent();
1554 return true;
1555}
1556
1558 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
1559 MachineInstr &MI = OpdMapper.getMI();
1560 MachineRegisterInfo &MRI = OpdMapper.getMRI();
1561
1562 // Insert basic copies.
1563 applyDefaultMapping(OpdMapper);
1564
1565 Register Dst0 = MI.getOperand(0).getReg();
1566 Register Dst1 = MI.getOperand(1).getReg();
1567 Register Src0 = MI.getOperand(2).getReg();
1568 Register Src1 = MI.getOperand(3).getReg();
1569 Register Src2 = MI.getOperand(4).getReg();
1570
1571 if (MRI.getRegBankOrNull(Src0) == &AMDGPU::VGPRRegBank)
1572 return true;
1573
1574 bool IsUnsigned = MI.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
1575 LLT S1 = LLT::scalar(1);
1576 LLT S32 = LLT::scalar(32);
1577
1578 bool DstOnValu = MRI.getRegBankOrNull(Src2) == &AMDGPU::VGPRRegBank;
1579 bool Accumulate = true;
1580
1581 if (!DstOnValu) {
1582 if (mi_match(Src2, MRI, m_ZeroInt()))
1583 Accumulate = false;
1584 }
1585
1586 // Keep the multiplication on the SALU.
1587 Register DstHi;
1588 Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
1589 bool MulHiInVgpr = false;
1590
1591 MRI.setRegBank(DstLo, AMDGPU::SGPRRegBank);
1592
1593 if (Subtarget.hasSMulHi()) {
1594 DstHi = IsUnsigned ? B.buildUMulH(S32, Src0, Src1).getReg(0)
1595 : B.buildSMulH(S32, Src0, Src1).getReg(0);
1596 MRI.setRegBank(DstHi, AMDGPU::SGPRRegBank);
1597 } else {
1598 Register VSrc0 = B.buildCopy(S32, Src0).getReg(0);
1599 Register VSrc1 = B.buildCopy(S32, Src1).getReg(0);
1600
1601 MRI.setRegBank(VSrc0, AMDGPU::VGPRRegBank);
1602 MRI.setRegBank(VSrc1, AMDGPU::VGPRRegBank);
1603
1604 DstHi = IsUnsigned ? B.buildUMulH(S32, VSrc0, VSrc1).getReg(0)
1605 : B.buildSMulH(S32, VSrc0, VSrc1).getReg(0);
1606 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1607
1608 if (!DstOnValu) {
1609 DstHi = buildReadFirstLane(B, MRI, DstHi);
1610 } else {
1611 MulHiInVgpr = true;
1612 }
1613 }
1614
1615 // Accumulate and produce the "carry-out" bit.
1616 //
1617 // The "carry-out" is defined as bit 64 of the result when computed as a
1618 // big integer. For unsigned multiply-add, this matches the usual definition
1619 // of carry-out. For signed multiply-add, bit 64 is the sign bit of the
1620 // result, which is determined as:
1621 // sign(Src0 * Src1) + sign(Src2) + carry-out from unsigned 64-bit add
1622 LLT CarryType = DstOnValu ? S1 : S32;
1623 const RegisterBank &CarryBank =
1624 DstOnValu ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
1625 const RegisterBank &DstBank =
1626 DstOnValu ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank;
1627 Register Carry;
1628 Register Zero;
1629
1630 if (!IsUnsigned) {
1631 Zero = B.buildConstant(S32, 0).getReg(0);
1632 MRI.setRegBank(Zero,
1633 MulHiInVgpr ? AMDGPU::VGPRRegBank : AMDGPU::SGPRRegBank);
1634
1635 Carry = B.buildICmp(CmpInst::ICMP_SLT, MulHiInVgpr ? S1 : S32, DstHi, Zero)
1636 .getReg(0);
1637 MRI.setRegBank(Carry, MulHiInVgpr ? AMDGPU::VCCRegBank
1638 : AMDGPU::SGPRRegBank);
1639
1640 if (DstOnValu && !MulHiInVgpr) {
1641 Carry = B.buildTrunc(S1, Carry).getReg(0);
1642 MRI.setRegBank(Carry, AMDGPU::VCCRegBank);
1643 }
1644 }
1645
1646 if (Accumulate) {
1647 if (DstOnValu) {
1648 DstLo = B.buildCopy(S32, DstLo).getReg(0);
1649 DstHi = B.buildCopy(S32, DstHi).getReg(0);
1650 MRI.setRegBank(DstLo, AMDGPU::VGPRRegBank);
1651 MRI.setRegBank(DstHi, AMDGPU::VGPRRegBank);
1652 }
1653
1654 auto Unmerge = B.buildUnmerge(S32, Src2);
1655 Register Src2Lo = Unmerge.getReg(0);
1656 Register Src2Hi = Unmerge.getReg(1);
1657 MRI.setRegBank(Src2Lo, DstBank);
1658 MRI.setRegBank(Src2Hi, DstBank);
1659
1660 if (!IsUnsigned) {
1661 auto Src2Sign = B.buildICmp(CmpInst::ICMP_SLT, CarryType, Src2Hi, Zero);
1662 MRI.setRegBank(Src2Sign.getReg(0), CarryBank);
1663
1664 Carry = B.buildXor(CarryType, Carry, Src2Sign).getReg(0);
1665 MRI.setRegBank(Carry, CarryBank);
1666 }
1667
1668 auto AddLo = B.buildUAddo(S32, CarryType, DstLo, Src2Lo);
1669 DstLo = AddLo.getReg(0);
1670 Register CarryLo = AddLo.getReg(1);
1671 MRI.setRegBank(DstLo, DstBank);
1672 MRI.setRegBank(CarryLo, CarryBank);
1673
1674 auto AddHi = B.buildUAdde(S32, CarryType, DstHi, Src2Hi, CarryLo);
1675 DstHi = AddHi.getReg(0);
1676 MRI.setRegBank(DstHi, DstBank);
1677
1678 Register CarryHi = AddHi.getReg(1);
1679 MRI.setRegBank(CarryHi, CarryBank);
1680
1681 if (IsUnsigned) {
1682 Carry = CarryHi;
1683 } else {
1684 Carry = B.buildXor(CarryType, Carry, CarryHi).getReg(0);
1685 MRI.setRegBank(Carry, CarryBank);
1686 }
1687 } else {
1688 if (IsUnsigned) {
1689 Carry = B.buildConstant(CarryType, 0).getReg(0);
1690 MRI.setRegBank(Carry, CarryBank);
1691 }
1692 }
1693
1694 B.buildMergeLikeInstr(Dst0, {DstLo, DstHi});
1695
1696 if (DstOnValu) {
1697 B.buildCopy(Dst1, Carry);
1698 } else {
1699 B.buildTrunc(Dst1, Carry);
1700 }
1701
1702 MI.eraseFromParent();
1703 return true;
1704}
1705
1706// Return a suitable opcode for extending the operands of Opc when widening.
1707static unsigned getExtendOp(unsigned Opc) {
1708 switch (Opc) {
1709 case TargetOpcode::G_ASHR:
1710 case TargetOpcode::G_SMIN:
1711 case TargetOpcode::G_SMAX:
1712 return TargetOpcode::G_SEXT;
1713 case TargetOpcode::G_LSHR:
1714 case TargetOpcode::G_UMIN:
1715 case TargetOpcode::G_UMAX:
1716 return TargetOpcode::G_ZEXT;
1717 default:
1718 return TargetOpcode::G_ANYEXT;
1719 }
1720}
1721
1722// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1723// any illegal vector extend or unmerge operations.
1724static std::pair<Register, Register>
1725unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1726 const LLT S32 = LLT::scalar(32);
1727 auto Bitcast = B.buildBitcast(S32, Src);
1728
1729 if (ExtOpcode == TargetOpcode::G_SEXT) {
1730 auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1731 auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1732 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1733 }
1734
1735 auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1736 if (ExtOpcode == TargetOpcode::G_ZEXT) {
1737 auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1738 return std::pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1739 }
1740
1741 assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1742 return std::pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1743}
1744
1745// For cases where only a single copy is inserted for matching register banks.
1746// Replace the register in the instruction operand
1748 const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1749 SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1750 if (!SrcReg.empty()) {
1751 assert(SrcReg.size() == 1);
1752 OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1753 return true;
1754 }
1755
1756 return false;
1757}
1758
1759/// Handle register layout difference for f16 images for some subtargets.
1762 Register Reg) const {
1764 return Reg;
1765
1766 const LLT S16 = LLT::scalar(16);
1767 LLT StoreVT = MRI.getType(Reg);
1768 if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1769 return Reg;
1770
1771 auto Unmerge = B.buildUnmerge(S16, Reg);
1772
1773
1774 SmallVector<Register, 4> WideRegs;
1775 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1776 WideRegs.push_back(Unmerge.getReg(I));
1777
1778 const LLT S32 = LLT::scalar(32);
1779 int NumElts = StoreVT.getNumElements();
1780
1781 return B.buildMergeLikeInstr(LLT::fixed_vector(NumElts, S32), WideRegs)
1782 .getReg(0);
1783}
1784
1785static std::pair<Register, unsigned>
1787 int64_t Const;
1788 if (mi_match(Reg, MRI, m_ICst(Const)))
1789 return std::pair(Register(), Const);
1790
1791 Register Base;
1792 if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1793 return std::pair(Base, Const);
1794
1795 // TODO: Handle G_OR used for add case
1796 return std::pair(Reg, 0);
1797}
1798
1799std::pair<Register, unsigned>
1801 Register OrigOffset) const {
1802 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
1803 Register BaseReg;
1804 unsigned ImmOffset;
1805 const LLT S32 = LLT::scalar(32);
1806
1807 // TODO: Use AMDGPU::getBaseWithConstantOffset() instead.
1808 std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1809 OrigOffset);
1810
1811 unsigned C1 = 0;
1812 if (ImmOffset != 0) {
1813 // If the immediate value is too big for the immoffset field, put only bits
1814 // that would normally fit in the immoffset field. The remaining value that
1815 // is copied/added for the voffset field is a large power of 2, and it
1816 // stands more chance of being CSEd with the copy/add for another similar
1817 // load/store.
1818 // However, do not do that rounding down if that is a negative
1819 // number, as it appears to be illegal to have a negative offset in the
1820 // vgpr, even if adding the immediate offset makes it positive.
1821 unsigned Overflow = ImmOffset & ~MaxImm;
1822 ImmOffset -= Overflow;
1823 if ((int32_t)Overflow < 0) {
1824 Overflow += ImmOffset;
1825 ImmOffset = 0;
1826 }
1827
1828 C1 = ImmOffset;
1829 if (Overflow != 0) {
1830 if (!BaseReg)
1831 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1832 else {
1833 auto OverflowVal = B.buildConstant(S32, Overflow);
1834 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1835 }
1836 }
1837 }
1838
1839 if (!BaseReg)
1840 BaseReg = B.buildConstant(S32, 0).getReg(0);
1841
1842 return {BaseReg, C1};
1843}
1844
1846 Register SrcReg) const {
1847 MachineRegisterInfo &MRI = *B.getMRI();
1848 LLT SrcTy = MRI.getType(SrcReg);
1849 if (SrcTy.getSizeInBits() == 32) {
1850 // Use a v_mov_b32 here to make the exec dependency explicit.
1851 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1852 .addDef(DstReg)
1853 .addUse(SrcReg);
1854 return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1855 constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1856 }
1857
1858 Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1859 Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1860
1861 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1862 .addDef(TmpReg0)
1863 .addUse(SrcReg, 0, AMDGPU::sub0);
1864 B.buildInstr(AMDGPU::V_MOV_B32_e32)
1865 .addDef(TmpReg1)
1866 .addUse(SrcReg, 0, AMDGPU::sub1);
1867 B.buildInstr(AMDGPU::REG_SEQUENCE)
1868 .addDef(DstReg)
1869 .addUse(TmpReg0)
1870 .addImm(AMDGPU::sub0)
1871 .addUse(TmpReg1)
1872 .addImm(AMDGPU::sub1);
1873
1874 return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1875 constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1876}
1877
1878/// Utility function for pushing dynamic vector indexes with a constant offset
1879/// into waterfall loops.
1881 MachineInstr &IdxUseInstr,
1882 unsigned OpIdx,
1883 unsigned ConstOffset) {
1884 MachineRegisterInfo &MRI = *B.getMRI();
1885 const LLT S32 = LLT::scalar(32);
1886 Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1887 B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1888
1889 auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1890
1891 auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1892 MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1893 MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1894 IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1895}
1896
1897/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1898/// original 32-bit source value (to be inserted in the low part of the combined
1899/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1900/// value.
1902 Register Hi32Reg, Register Lo32Reg,
1903 unsigned ExtOpc,
1904 const RegisterBank &RegBank,
1905 bool IsBooleanSrc = false) {
1906 if (ExtOpc == AMDGPU::G_ZEXT) {
1907 B.buildConstant(Hi32Reg, 0);
1908 } else if (ExtOpc == AMDGPU::G_SEXT) {
1909 if (IsBooleanSrc) {
1910 // If we know the original source was an s1, the high half is the same as
1911 // the low.
1912 B.buildCopy(Hi32Reg, Lo32Reg);
1913 } else {
1914 // Replicate sign bit from 32-bit extended part.
1915 auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1916 B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1917 B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1918 }
1919 } else {
1920 assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1921 B.buildUndef(Hi32Reg);
1922 }
1923}
1924
1925bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1927 const OperandsMapper &OpdMapper) const {
1928 MachineRegisterInfo &MRI = *B.getMRI();
1929
1930 Register VecReg = MI.getOperand(1).getReg();
1931 Register Idx = MI.getOperand(2).getReg();
1932
1933 const RegisterBank &IdxBank =
1934 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1935
1936 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1937
1938 LLT VecTy = MRI.getType(VecReg);
1939 unsigned EltSize = VecTy.getScalarSizeInBits();
1940 unsigned NumElem = VecTy.getNumElements();
1941
1942 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1943 IsDivergentIdx, &Subtarget))
1944 return false;
1945
1946 LLT S32 = LLT::scalar(32);
1947
1948 const RegisterBank &DstBank =
1949 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1950 const RegisterBank &SrcBank =
1951 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1952
1953 const RegisterBank &CCBank =
1954 (DstBank == AMDGPU::SGPRRegBank &&
1955 SrcBank == AMDGPU::SGPRRegBank &&
1956 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1957 : AMDGPU::VCCRegBank;
1958 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1959
1960 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1961 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1962 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1963 }
1964
1965 LLT EltTy = VecTy.getScalarType();
1966 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1967 unsigned NumLanes = DstRegs.size();
1968 if (!NumLanes)
1969 NumLanes = 1;
1970 else
1971 EltTy = MRI.getType(DstRegs[0]);
1972
1973 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1974 SmallVector<Register, 2> Res(NumLanes);
1975 for (unsigned L = 0; L < NumLanes; ++L)
1976 Res[L] = UnmergeToEltTy.getReg(L);
1977
1978 for (unsigned I = 1; I < NumElem; ++I) {
1979 auto IC = B.buildConstant(S32, I);
1980 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1981 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1982 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1983
1984 for (unsigned L = 0; L < NumLanes; ++L) {
1985 auto S = B.buildSelect(EltTy, Cmp,
1986 UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1987
1988 for (unsigned N : { 0, 2, 3 })
1989 MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1990
1991 Res[L] = S->getOperand(0).getReg();
1992 }
1993 }
1994
1995 for (unsigned L = 0; L < NumLanes; ++L) {
1996 Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1997 B.buildCopy(DstReg, Res[L]);
1998 MRI.setRegBank(DstReg, DstBank);
1999 }
2000
2001 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2002 MI.eraseFromParent();
2003
2004 return true;
2005}
2006
2007// Insert a cross regbank copy for a register if it already has a bank that
2008// differs from the one we want to set.
2011 const RegisterBank &Bank) {
2012 const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
2013 if (CurrBank && *CurrBank != Bank) {
2014 Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
2015 MRI.setRegBank(Copy, Bank);
2016 return Copy;
2017 }
2018
2019 MRI.setRegBank(Reg, Bank);
2020 return Reg;
2021}
2022
2023bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
2025 const OperandsMapper &OpdMapper) const {
2026
2027 MachineRegisterInfo &MRI = *B.getMRI();
2028 Register VecReg = MI.getOperand(1).getReg();
2029 Register Idx = MI.getOperand(3).getReg();
2030
2031 const RegisterBank &IdxBank =
2032 *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2033
2034 bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
2035
2036 LLT VecTy = MRI.getType(VecReg);
2037 unsigned EltSize = VecTy.getScalarSizeInBits();
2038 unsigned NumElem = VecTy.getNumElements();
2039
2040 if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
2041 IsDivergentIdx, &Subtarget))
2042 return false;
2043
2044 LLT S32 = LLT::scalar(32);
2045
2046 const RegisterBank &DstBank =
2047 *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2048 const RegisterBank &SrcBank =
2049 *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2050 const RegisterBank &InsBank =
2051 *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2052
2053 const RegisterBank &CCBank =
2054 (DstBank == AMDGPU::SGPRRegBank &&
2055 SrcBank == AMDGPU::SGPRRegBank &&
2056 InsBank == AMDGPU::SGPRRegBank &&
2057 IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
2058 : AMDGPU::VCCRegBank;
2059 LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
2060
2061 if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
2062 Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
2063 MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
2064 }
2065
2066 LLT EltTy = VecTy.getScalarType();
2067 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2068 unsigned NumLanes = InsRegs.size();
2069 if (!NumLanes) {
2070 NumLanes = 1;
2071 InsRegs.push_back(MI.getOperand(2).getReg());
2072 } else {
2073 EltTy = MRI.getType(InsRegs[0]);
2074 }
2075
2076 auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2077 SmallVector<Register, 16> Ops(NumElem * NumLanes);
2078
2079 for (unsigned I = 0; I < NumElem; ++I) {
2080 auto IC = B.buildConstant(S32, I);
2081 MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2082 auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2083 MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2084
2085 for (unsigned L = 0; L < NumLanes; ++L) {
2086 Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2087 Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2088 Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2089
2090 Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2091 MRI.setRegBank(Select, DstBank);
2092
2093 Ops[I * NumLanes + L] = Select;
2094 }
2095 }
2096
2097 LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2098 if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2099 B.buildBuildVector(MI.getOperand(0), Ops);
2100 } else {
2101 auto Vec = B.buildBuildVector(MergeTy, Ops);
2102 MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2103 B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2104 }
2105
2106 MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2107 MI.eraseFromParent();
2108
2109 return true;
2110}
2111
2112// Break s_mul_u64 into 32-bit vector operations.
2114 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2115 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2116 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2117 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2118
2119 // All inputs are SGPRs, nothing special to do.
2120 if (DefRegs.empty()) {
2121 assert(Src0Regs.empty() && Src1Regs.empty());
2122 applyDefaultMapping(OpdMapper);
2123 return;
2124 }
2125
2126 assert(DefRegs.size() == 2);
2127 assert(Src0Regs.size() == Src1Regs.size() &&
2128 (Src0Regs.empty() || Src0Regs.size() == 2));
2129
2130 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2131 MachineInstr &MI = OpdMapper.getMI();
2132 Register DstReg = MI.getOperand(0).getReg();
2133 LLT HalfTy = LLT::scalar(32);
2134
2135 // Depending on where the source registers came from, the generic code may
2136 // have decided to split the inputs already or not. If not, we still need to
2137 // extract the values.
2138
2139 if (Src0Regs.empty())
2140 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2141 else
2142 setRegsToType(MRI, Src0Regs, HalfTy);
2143
2144 if (Src1Regs.empty())
2145 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2146 else
2147 setRegsToType(MRI, Src1Regs, HalfTy);
2148
2149 setRegsToType(MRI, DefRegs, HalfTy);
2150
2151 // The multiplication is done as follows:
2152 //
2153 // Op1H Op1L
2154 // * Op0H Op0L
2155 // --------------------
2156 // Op1H*Op0L Op1L*Op0L
2157 // + Op1H*Op0H Op1L*Op0H
2158 // -----------------------------------------
2159 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
2160 //
2161 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
2162 // value and that would overflow.
2163 // The low 32-bit value is Op1L*Op0L.
2164 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from
2165 // Op1L*Op0L).
2166
2167 ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
2168
2169 Register Hi = B.buildUMulH(HalfTy, Src0Regs[0], Src1Regs[0]).getReg(0);
2170 Register MulLoHi = B.buildMul(HalfTy, Src0Regs[0], Src1Regs[1]).getReg(0);
2171 Register Add = B.buildAdd(HalfTy, Hi, MulLoHi).getReg(0);
2172 Register MulHiLo = B.buildMul(HalfTy, Src0Regs[1], Src1Regs[0]).getReg(0);
2173 B.buildAdd(DefRegs[1], Add, MulHiLo);
2174 B.buildMul(DefRegs[0], Src0Regs[0], Src1Regs[0]);
2175
2176 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2177 MI.eraseFromParent();
2178}
2179
2181 MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
2182 MachineInstr &MI = OpdMapper.getMI();
2183 B.setInstrAndDebugLoc(MI);
2184 unsigned Opc = MI.getOpcode();
2185 MachineRegisterInfo &MRI = OpdMapper.getMRI();
2186 switch (Opc) {
2187 case AMDGPU::G_CONSTANT:
2188 case AMDGPU::G_IMPLICIT_DEF: {
2189 Register DstReg = MI.getOperand(0).getReg();
2190 LLT DstTy = MRI.getType(DstReg);
2191 if (DstTy != LLT::scalar(1))
2192 break;
2193
2194 const RegisterBank *DstBank =
2195 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2196 if (DstBank == &AMDGPU::VCCRegBank)
2197 break;
2198 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2199 if (DefRegs.empty())
2200 DefRegs.push_back(DstReg);
2201
2202 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2203
2204 Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
2205 LLVMContext &Ctx = B.getMF().getFunction().getContext();
2206
2207 MI.getOperand(0).setReg(NewDstReg);
2208 if (Opc != AMDGPU::G_IMPLICIT_DEF) {
2209 uint64_t ConstVal = MI.getOperand(1).getCImm()->getZExtValue();
2210 MI.getOperand(1).setCImm(
2211 ConstantInt::get(IntegerType::getInt32Ty(Ctx), ConstVal));
2212 }
2213
2214 MRI.setRegBank(NewDstReg, *DstBank);
2215 B.buildTrunc(DefRegs[0], NewDstReg);
2216 return;
2217 }
2218 case AMDGPU::G_PHI: {
2219 Register DstReg = MI.getOperand(0).getReg();
2220 LLT DstTy = MRI.getType(DstReg);
2221 if (DstTy != LLT::scalar(1))
2222 break;
2223
2224 const LLT S32 = LLT::scalar(32);
2225 const RegisterBank *DstBank =
2226 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2227 if (DstBank == &AMDGPU::VCCRegBank) {
2228 applyDefaultMapping(OpdMapper);
2229 // The standard handling only considers the result register bank for
2230 // phis. For VCC, blindly inserting a copy when the phi is lowered will
2231 // produce an invalid copy. We can only copy with some kind of compare to
2232 // get a vector boolean result. Insert a register bank copy that will be
2233 // correctly lowered to a compare.
2234 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2235 Register SrcReg = MI.getOperand(I).getReg();
2236 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2237
2238 if (SrcBank != &AMDGPU::VCCRegBank) {
2239 MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2240 B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2241
2242 auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2243 MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2244 MI.getOperand(I).setReg(Copy.getReg(0));
2245 }
2246 }
2247
2248 return;
2249 }
2250
2251 // Phi handling is strange and only considers the bank of the destination.
2252 substituteSimpleCopyRegs(OpdMapper, 0);
2253
2254 // Promote SGPR/VGPR booleans to s32
2255 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2256 B.setInsertPt(B.getMBB(), MI);
2257 LegalizerHelper Helper(B.getMF(), ApplyBank, B);
2258
2259 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2260 llvm_unreachable("widen scalar should have succeeded");
2261
2262 return;
2263 }
2264 case AMDGPU::G_FCMP:
2266 break;
2267 [[fallthrough]];
2268 case AMDGPU::G_ICMP:
2269 case AMDGPU::G_UADDO:
2270 case AMDGPU::G_USUBO:
2271 case AMDGPU::G_UADDE:
2272 case AMDGPU::G_SADDE:
2273 case AMDGPU::G_USUBE:
2274 case AMDGPU::G_SSUBE: {
2275 unsigned BoolDstOp =
2276 (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
2277 Register DstReg = MI.getOperand(BoolDstOp).getReg();
2278
2279 const RegisterBank *DstBank =
2280 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2281 if (DstBank != &AMDGPU::SGPRRegBank)
2282 break;
2283
2284 const bool HasCarryIn = MI.getNumOperands() == 5;
2285
2286 // If this is a scalar compare, promote the result to s32, as the selection
2287 // will end up using a copy to a 32-bit vreg.
2288 const LLT S32 = LLT::scalar(32);
2289 Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2290 MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2291 MI.getOperand(BoolDstOp).setReg(NewDstReg);
2292
2293 if (HasCarryIn) {
2294 Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2295 MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2296 B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2297 MI.getOperand(4).setReg(NewSrcReg);
2298 }
2299
2300 MachineBasicBlock *MBB = MI.getParent();
2301 B.setInsertPt(*MBB, std::next(MI.getIterator()));
2302
2303 // If we had a constrained VCC result register, a copy was inserted to VCC
2304 // from SGPR.
2305 SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2306 if (DefRegs.empty())
2307 DefRegs.push_back(DstReg);
2308 B.buildTrunc(DefRegs[0], NewDstReg);
2309 return;
2310 }
2311 case AMDGPU::G_SELECT: {
2312 Register DstReg = MI.getOperand(0).getReg();
2313 LLT DstTy = MRI.getType(DstReg);
2314
2315 SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2316 if (CondRegs.empty())
2317 CondRegs.push_back(MI.getOperand(1).getReg());
2318 else {
2319 assert(CondRegs.size() == 1);
2320 }
2321
2322 const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2323 if (CondBank == &AMDGPU::SGPRRegBank) {
2324 const LLT S32 = LLT::scalar(32);
2325 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2326 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2327
2328 MI.getOperand(1).setReg(NewCondReg);
2329 B.buildZExt(NewCondReg, CondRegs[0]);
2330 }
2331
2332 if (DstTy.getSizeInBits() != 64)
2333 break;
2334
2335 LLT HalfTy = getHalfSizedType(DstTy);
2336
2337 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2338 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2339 SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2340
2341 // All inputs are SGPRs, nothing special to do.
2342 if (DefRegs.empty()) {
2343 assert(Src1Regs.empty() && Src2Regs.empty());
2344 break;
2345 }
2346
2347 if (Src1Regs.empty())
2348 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2349 else {
2350 setRegsToType(MRI, Src1Regs, HalfTy);
2351 }
2352
2353 if (Src2Regs.empty())
2354 split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2355 else
2356 setRegsToType(MRI, Src2Regs, HalfTy);
2357
2358 setRegsToType(MRI, DefRegs, HalfTy);
2359
2360 B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2361 B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2362
2363 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2364 MI.eraseFromParent();
2365 return;
2366 }
2367 case AMDGPU::G_BRCOND: {
2368 Register CondReg = MI.getOperand(0).getReg();
2369 // FIXME: Should use legalizer helper, but should change bool ext type.
2370 const RegisterBank *CondBank =
2371 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2372
2373 if (CondBank == &AMDGPU::SGPRRegBank) {
2374 const LLT S32 = LLT::scalar(32);
2375 Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2376 MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2377
2378 MI.getOperand(0).setReg(NewCondReg);
2379 B.buildZExt(NewCondReg, CondReg);
2380 return;
2381 }
2382
2383 break;
2384 }
2385 case AMDGPU::G_AND:
2386 case AMDGPU::G_OR:
2387 case AMDGPU::G_XOR: {
2388 // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2389 // there is a VGPR input.
2390 Register DstReg = MI.getOperand(0).getReg();
2391 LLT DstTy = MRI.getType(DstReg);
2392
2393 if (DstTy.getSizeInBits() == 1) {
2394 const RegisterBank *DstBank =
2395 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2396 if (DstBank == &AMDGPU::VCCRegBank)
2397 break;
2398
2399 MachineFunction *MF = MI.getParent()->getParent();
2400 ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
2401 LegalizerHelper Helper(*MF, ApplyBank, B);
2402
2403 if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2405 llvm_unreachable("widen scalar should have succeeded");
2406 return;
2407 }
2408
2409 if (DstTy.getSizeInBits() != 64)
2410 break;
2411
2412 LLT HalfTy = getHalfSizedType(DstTy);
2413 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2414 SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2415 SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2416
2417 // All inputs are SGPRs, nothing special to do.
2418 if (DefRegs.empty()) {
2419 assert(Src0Regs.empty() && Src1Regs.empty());
2420 break;
2421 }
2422
2423 assert(DefRegs.size() == 2);
2424 assert(Src0Regs.size() == Src1Regs.size() &&
2425 (Src0Regs.empty() || Src0Regs.size() == 2));
2426
2427 // Depending on where the source registers came from, the generic code may
2428 // have decided to split the inputs already or not. If not, we still need to
2429 // extract the values.
2430
2431 if (Src0Regs.empty())
2432 split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2433 else
2434 setRegsToType(MRI, Src0Regs, HalfTy);
2435
2436 if (Src1Regs.empty())
2437 split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2438 else
2439 setRegsToType(MRI, Src1Regs, HalfTy);
2440
2441 setRegsToType(MRI, DefRegs, HalfTy);
2442
2443 B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2444 B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2445
2446 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2447 MI.eraseFromParent();
2448 return;
2449 }
2450 case AMDGPU::G_ABS: {
2451 Register SrcReg = MI.getOperand(1).getReg();
2452 const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2453
2454 // There is no VALU abs instruction so we need to replace it with a sub and
2455 // max combination.
2456 if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2457 MachineFunction *MF = MI.getParent()->getParent();
2458 ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
2459 LegalizerHelper Helper(*MF, Apply, B);
2460
2462 llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2463 return;
2464 }
2465 [[fallthrough]];
2466 }
2467 case AMDGPU::G_ADD:
2468 case AMDGPU::G_SUB:
2469 case AMDGPU::G_MUL:
2470 case AMDGPU::G_SHL:
2471 case AMDGPU::G_LSHR:
2472 case AMDGPU::G_ASHR:
2473 case AMDGPU::G_SMIN:
2474 case AMDGPU::G_SMAX:
2475 case AMDGPU::G_UMIN:
2476 case AMDGPU::G_UMAX: {
2477 Register DstReg = MI.getOperand(0).getReg();
2478 LLT DstTy = MRI.getType(DstReg);
2479
2480 // Special case for s_mul_u64. There is not a vector equivalent of
2481 // s_mul_u64. Hence, we have to break down s_mul_u64 into 32-bit vector
2482 // multiplications.
2483 if (Opc == AMDGPU::G_MUL && DstTy.getSizeInBits() == 64) {
2484 applyMappingSMULU64(B, OpdMapper);
2485 return;
2486 }
2487
2488 // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2489 // Packed 16-bit operations need to be scalarized and promoted.
2490 if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2491 break;
2492
2493 const RegisterBank *DstBank =
2494 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2495 if (DstBank == &AMDGPU::VGPRRegBank)
2496 break;
2497
2498 const LLT S32 = LLT::scalar(32);
2499 MachineBasicBlock *MBB = MI.getParent();
2500 MachineFunction *MF = MBB->getParent();
2501 ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
2502
2503 if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
2504 Register WideSrcLo, WideSrcHi;
2505
2506 std::tie(WideSrcLo, WideSrcHi) =
2507 unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
2508 auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
2509 auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
2510 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2511 MI.eraseFromParent();
2512 return;
2513 }
2514
2515 if (DstTy.isVector()) {
2516 Register WideSrc0Lo, WideSrc0Hi;
2517 Register WideSrc1Lo, WideSrc1Hi;
2518
2519 unsigned ExtendOp = getExtendOp(MI.getOpcode());
2520 std::tie(WideSrc0Lo, WideSrc0Hi)
2521 = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2522 std::tie(WideSrc1Lo, WideSrc1Hi)
2523 = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2524 auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2525 auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2526 B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2527 MI.eraseFromParent();
2528 } else {
2529 LegalizerHelper Helper(*MF, ApplySALU, B);
2530
2531 if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2532 llvm_unreachable("widen scalar should have succeeded");
2533
2534 // FIXME: s16 shift amounts should be legal.
2535 if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2536 Opc == AMDGPU::G_ASHR) {
2537 B.setInsertPt(*MBB, MI.getIterator());
2538 if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2539 llvm_unreachable("widen scalar should have succeeded");
2540 }
2541 }
2542
2543 return;
2544 }
2545 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
2546 case AMDGPU::G_AMDGPU_S_MUL_U64_U32: {
2547 // This is a special case for s_mul_u64. We use
2548 // G_AMDGPU_S_MUL_I64_I32 opcode to represent an s_mul_u64 operation
2549 // where the 33 higher bits are sign-extended and
2550 // G_AMDGPU_S_MUL_U64_U32 opcode to represent an s_mul_u64 operation
2551 // where the 32 higher bits are zero-extended. In case scalar registers are
2552 // selected, both opcodes are lowered as s_mul_u64. If the vector registers
2553 // are selected, then G_AMDGPU_S_MUL_I64_I32 and
2554 // G_AMDGPU_S_MUL_U64_U32 are lowered with a vector mad instruction.
2555
2556 // Insert basic copies.
2557 applyDefaultMapping(OpdMapper);
2558
2559 Register DstReg = MI.getOperand(0).getReg();
2560 Register SrcReg0 = MI.getOperand(1).getReg();
2561 Register SrcReg1 = MI.getOperand(2).getReg();
2562 const LLT S32 = LLT::scalar(32);
2563 const LLT S64 = LLT::scalar(64);
2564 assert(MRI.getType(DstReg) == S64 && "This is a special case for s_mul_u64 "
2565 "that handles only 64-bit operands.");
2566 const RegisterBank *DstBank =
2567 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2568
2569 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2570 // with s_mul_u64 operation.
2571 if (DstBank == &AMDGPU::SGPRRegBank) {
2572 MI.setDesc(TII->get(AMDGPU::S_MUL_U64));
2573 MRI.setRegClass(DstReg, &AMDGPU::SGPR_64RegClass);
2574 MRI.setRegClass(SrcReg0, &AMDGPU::SGPR_64RegClass);
2575 MRI.setRegClass(SrcReg1, &AMDGPU::SGPR_64RegClass);
2576 return;
2577 }
2578
2579 // Replace G_AMDGPU_S_MUL_I64_I32 and G_AMDGPU_S_MUL_U64_U32
2580 // with a vector mad.
2581 assert(MRI.getRegBankOrNull(DstReg) == &AMDGPU::VGPRRegBank &&
2582 "The destination operand should be in vector registers.");
2583
2584 DebugLoc DL = MI.getDebugLoc();
2585
2586 // Extract the lower subregister from the first operand.
2587 Register Op0L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2588 MRI.setRegClass(Op0L, &AMDGPU::VGPR_32RegClass);
2589 MRI.setType(Op0L, S32);
2590 B.buildTrunc(Op0L, SrcReg0);
2591
2592 // Extract the lower subregister from the second operand.
2593 Register Op1L = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2594 MRI.setRegClass(Op1L, &AMDGPU::VGPR_32RegClass);
2595 MRI.setType(Op1L, S32);
2596 B.buildTrunc(Op1L, SrcReg1);
2597
2598 unsigned NewOpc = Opc == AMDGPU::G_AMDGPU_S_MUL_U64_U32
2599 ? AMDGPU::G_AMDGPU_MAD_U64_U32
2600 : AMDGPU::G_AMDGPU_MAD_I64_I32;
2601
2603 Register Zero64 = B.buildConstant(S64, 0).getReg(0);
2604 MRI.setRegClass(Zero64, &AMDGPU::VReg_64RegClass);
2605 Register CarryOut = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2606 MRI.setRegClass(CarryOut, &AMDGPU::VReg_64RegClass);
2607 B.buildInstr(NewOpc, {DstReg, CarryOut}, {Op0L, Op1L, Zero64});
2608 MI.eraseFromParent();
2609 return;
2610 }
2611 case AMDGPU::G_SEXT_INREG: {
2612 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2613 if (SrcRegs.empty())
2614 break; // Nothing to repair
2615
2616 const LLT S32 = LLT::scalar(32);
2617 ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
2618
2619 // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2620 // we would need to further expand, and doesn't let us directly set the
2621 // result registers.
2622 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2623
2624 int Amt = MI.getOperand(2).getImm();
2625 if (Amt <= 32) {
2626 // Downstream users have expectations for the high bit behavior, so freeze
2627 // incoming undefined bits.
2628 if (Amt == 32) {
2629 // The low bits are unchanged.
2630 B.buildFreeze(DstRegs[0], SrcRegs[0]);
2631 } else {
2632 auto Freeze = B.buildFreeze(S32, SrcRegs[0]);
2633 // Extend in the low bits and propagate the sign bit to the high half.
2634 B.buildSExtInReg(DstRegs[0], Freeze, Amt);
2635 }
2636
2637 B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2638 } else {
2639 // The low bits are unchanged, and extend in the high bits.
2640 // No freeze required
2641 B.buildCopy(DstRegs[0], SrcRegs[0]);
2642 B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2643 }
2644
2645 Register DstReg = MI.getOperand(0).getReg();
2646 MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2647 MI.eraseFromParent();
2648 return;
2649 }
2650 case AMDGPU::G_CTPOP:
2651 case AMDGPU::G_BITREVERSE: {
2652 const RegisterBank *DstBank =
2653 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2654 if (DstBank == &AMDGPU::SGPRRegBank)
2655 break;
2656
2657 Register SrcReg = MI.getOperand(1).getReg();
2658 const LLT S32 = LLT::scalar(32);
2659 LLT Ty = MRI.getType(SrcReg);
2660 if (Ty == S32)
2661 break;
2662
2663 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2664
2665 MachineFunction &MF = B.getMF();
2666 LegalizerHelper Helper(MF, ApplyVALU, B);
2667
2668 if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2669 llvm_unreachable("narrowScalar should have succeeded");
2670 return;
2671 }
2672 case AMDGPU::G_AMDGPU_FFBH_U32:
2673 case AMDGPU::G_AMDGPU_FFBL_B32:
2674 case AMDGPU::G_CTLZ_ZERO_UNDEF:
2675 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2676 const RegisterBank *DstBank =
2677 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2678 if (DstBank == &AMDGPU::SGPRRegBank)
2679 break;
2680
2681 Register SrcReg = MI.getOperand(1).getReg();
2682 const LLT S32 = LLT::scalar(32);
2683 LLT Ty = MRI.getType(SrcReg);
2684 if (Ty == S32)
2685 break;
2686
2687 // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2688 // which return -1 when the input is zero:
2689 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2690 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2691 // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2692 // (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
2693 ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
2694 SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2695 unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2696 ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2697 : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2698 ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2699 : Opc;
2700 unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2701 auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2702 auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2703 unsigned AddOpc =
2704 Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2705 ? AMDGPU::G_ADD
2706 : AMDGPU::G_UADDSAT;
2707 Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2708 Register DstReg = MI.getOperand(0).getReg();
2709 B.buildUMin(DstReg, X, Y);
2710 MI.eraseFromParent();
2711 return;
2712 }
2713 case AMDGPU::G_SEXT:
2714 case AMDGPU::G_ZEXT:
2715 case AMDGPU::G_ANYEXT: {
2716 Register SrcReg = MI.getOperand(1).getReg();
2717 LLT SrcTy = MRI.getType(SrcReg);
2718 const bool Signed = Opc == AMDGPU::G_SEXT;
2719
2720 assert(OpdMapper.getVRegs(1).empty());
2721
2722 const RegisterBank *SrcBank =
2723 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2724
2725 Register DstReg = MI.getOperand(0).getReg();
2726 LLT DstTy = MRI.getType(DstReg);
2727 if (DstTy.isScalar() &&
2728 SrcBank != &AMDGPU::SGPRRegBank &&
2729 SrcBank != &AMDGPU::VCCRegBank &&
2730 // FIXME: Should handle any type that round to s64 when irregular
2731 // breakdowns supported.
2732 DstTy.getSizeInBits() == 64 &&
2733 SrcTy.getSizeInBits() <= 32) {
2734 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2735
2736 // Extend to 32-bit, and then extend the low half.
2737 if (Signed) {
2738 // TODO: Should really be buildSExtOrCopy
2739 B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2740 } else if (Opc == AMDGPU::G_ZEXT) {
2741 B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2742 } else {
2743 B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2744 }
2745
2746 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2747 MRI.setRegBank(DstReg, *SrcBank);
2748 MI.eraseFromParent();
2749 return;
2750 }
2751
2752 if (SrcTy != LLT::scalar(1))
2753 return;
2754
2755 // It is not legal to have a legalization artifact with a VCC source. Rather
2756 // than introducing a copy, insert the select we would have to select the
2757 // copy to.
2758 if (SrcBank == &AMDGPU::VCCRegBank) {
2759 SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2760
2761 const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2762
2763 unsigned DstSize = DstTy.getSizeInBits();
2764 // 64-bit select is SGPR only
2765 const bool UseSel64 = DstSize > 32 &&
2766 SrcBank->getID() == AMDGPU::SGPRRegBankID;
2767
2768 // TODO: Should s16 select be legal?
2769 LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2770 auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2771 auto False = B.buildConstant(SelType, 0);
2772
2773 MRI.setRegBank(True.getReg(0), *DstBank);
2774 MRI.setRegBank(False.getReg(0), *DstBank);
2775 MRI.setRegBank(DstReg, *DstBank);
2776
2777 if (DstSize > 32) {
2778 B.buildSelect(DefRegs[0], SrcReg, True, False);
2779 extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2780 } else if (DstSize < 32) {
2781 auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2782 MRI.setRegBank(Sel.getReg(0), *DstBank);
2783 B.buildTrunc(DstReg, Sel);
2784 } else {
2785 B.buildSelect(DstReg, SrcReg, True, False);
2786 }
2787
2788 MI.eraseFromParent();
2789 return;
2790 }
2791
2792 break;
2793 }
2794 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2795 SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2796
2797 assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2798
2799 Register DstReg = MI.getOperand(0).getReg();
2800 Register SrcReg = MI.getOperand(1).getReg();
2801
2802 const LLT S32 = LLT::scalar(32);
2803 LLT DstTy = MRI.getType(DstReg);
2804 LLT SrcTy = MRI.getType(SrcReg);
2805
2806 if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
2807 return;
2808
2809 const ValueMapping &DstMapping
2810 = OpdMapper.getInstrMapping().getOperandMapping(0);
2811 const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2812 const RegisterBank *SrcBank =
2813 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2814 const RegisterBank *IdxBank =
2815 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2816
2817 Register BaseIdxReg;
2818 unsigned ConstOffset;
2819 std::tie(BaseIdxReg, ConstOffset) =
2820 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2821
2822 // See if the index is an add of a constant which will be foldable by moving
2823 // the base register of the index later if this is going to be executed in a
2824 // waterfall loop. This is essentially to reassociate the add of a constant
2825 // with the readfirstlane.
2826 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2827 ConstOffset > 0 &&
2828 ConstOffset < SrcTy.getNumElements();
2829
2830 // Move the base register. We'll re-insert the add later.
2831 if (ShouldMoveIndexIntoLoop)
2832 MI.getOperand(2).setReg(BaseIdxReg);
2833
2834 // If this is a VGPR result only because the index was a VGPR result, the
2835 // actual indexing will be done on the SGPR source vector, which will
2836 // produce a scalar result. We need to copy to the VGPR result inside the
2837 // waterfall loop.
2838 const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2839 SrcBank == &AMDGPU::SGPRRegBank;
2840 if (DstRegs.empty()) {
2841 applyDefaultMapping(OpdMapper);
2842
2844
2845 if (NeedCopyToVGPR) {
2846 // We don't want a phi for this temporary reg.
2847 Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2848 MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2849 MI.getOperand(0).setReg(TmpReg);
2850 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2851
2852 // Use a v_mov_b32 here to make the exec dependency explicit.
2853 buildVCopy(B, DstReg, TmpReg);
2854 }
2855
2856 // Re-insert the constant offset add inside the waterfall loop.
2857 if (ShouldMoveIndexIntoLoop)
2858 reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2859
2860 return;
2861 }
2862
2863 assert(DstTy.getSizeInBits() == 64);
2864
2865 LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2866
2867 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2868 auto One = B.buildConstant(S32, 1);
2869
2870 MachineBasicBlock::iterator MII = MI.getIterator();
2871
2872 // Split the vector index into 32-bit pieces. Prepare to move all of the
2873 // new instructions into a waterfall loop if necessary.
2874 //
2875 // Don't put the bitcast or constant in the loop.
2876 MachineInstrSpan Span(MII, &B.getMBB());
2877
2878 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2879 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2880 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2881
2882 auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2883 auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2884
2885 MRI.setRegBank(DstReg, *DstBank);
2886 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2887 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2888 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2889 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2890
2891 SmallSet<Register, 4> OpsToWaterfall;
2892 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2893 MI.eraseFromParent();
2894 return;
2895 }
2896
2897 // Remove the original instruction to avoid potentially confusing the
2898 // waterfall loop logic.
2899 B.setInstr(*Span.begin());
2900 MI.eraseFromParent();
2901 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2902 OpsToWaterfall);
2903
2904 if (NeedCopyToVGPR) {
2905 MachineBasicBlock *LoopBB = Extract1->getParent();
2906 Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2907 Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2908 MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2909 MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2910
2911 Extract0->getOperand(0).setReg(TmpReg0);
2912 Extract1->getOperand(0).setReg(TmpReg1);
2913
2914 B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2915
2916 buildVCopy(B, DstRegs[0], TmpReg0);
2917 buildVCopy(B, DstRegs[1], TmpReg1);
2918 }
2919
2920 if (ShouldMoveIndexIntoLoop)
2921 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2922
2923 return;
2924 }
2925 case AMDGPU::G_INSERT_VECTOR_ELT: {
2926 SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2927
2928 Register DstReg = MI.getOperand(0).getReg();
2929 LLT VecTy = MRI.getType(DstReg);
2930
2931 assert(OpdMapper.getVRegs(0).empty());
2932 assert(OpdMapper.getVRegs(3).empty());
2933
2934 if (substituteSimpleCopyRegs(OpdMapper, 1))
2935 MRI.setType(MI.getOperand(1).getReg(), VecTy);
2936
2937 if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
2938 return;
2939
2940 const RegisterBank *IdxBank =
2941 OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2942
2943 Register SrcReg = MI.getOperand(1).getReg();
2944 Register InsReg = MI.getOperand(2).getReg();
2945 LLT InsTy = MRI.getType(InsReg);
2946 (void)InsTy;
2947
2948 Register BaseIdxReg;
2949 unsigned ConstOffset;
2950 std::tie(BaseIdxReg, ConstOffset) =
2951 AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2952
2953 // See if the index is an add of a constant which will be foldable by moving
2954 // the base register of the index later if this is going to be executed in a
2955 // waterfall loop. This is essentially to reassociate the add of a constant
2956 // with the readfirstlane.
2957 bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2958 ConstOffset > 0 &&
2959 ConstOffset < VecTy.getNumElements();
2960
2961 // Move the base register. We'll re-insert the add later.
2962 if (ShouldMoveIndexIntoLoop)
2963 MI.getOperand(3).setReg(BaseIdxReg);
2964
2965
2966 if (InsRegs.empty()) {
2968
2969 // Re-insert the constant offset add inside the waterfall loop.
2970 if (ShouldMoveIndexIntoLoop) {
2971 reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2972 }
2973
2974 return;
2975 }
2976
2977 assert(InsTy.getSizeInBits() == 64);
2978
2979 const LLT S32 = LLT::scalar(32);
2980 LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2981
2982 auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2983 auto One = B.buildConstant(S32, 1);
2984
2985 // Split the vector index into 32-bit pieces. Prepare to move all of the
2986 // new instructions into a waterfall loop if necessary.
2987 //
2988 // Don't put the bitcast or constant in the loop.
2990
2991 // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2992 auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2993 auto IdxHi = B.buildAdd(S32, IdxLo, One);
2994
2995 auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2996 auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2997
2998 const RegisterBank *DstBank =
2999 OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
3000 const RegisterBank *SrcBank =
3001 OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
3002 const RegisterBank *InsSrcBank =
3003 OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
3004
3005 MRI.setRegBank(InsReg, *InsSrcBank);
3006 MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
3007 MRI.setRegBank(InsLo.getReg(0), *DstBank);
3008 MRI.setRegBank(InsHi.getReg(0), *DstBank);
3009 MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
3010 MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
3011 MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
3012
3013
3014 SmallSet<Register, 4> OpsToWaterfall;
3015 if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
3016 B.setInsertPt(B.getMBB(), MI);
3017 B.buildBitcast(DstReg, InsHi);
3018 MI.eraseFromParent();
3019 return;
3020 }
3021
3022 B.setInstr(*Span.begin());
3023 MI.eraseFromParent();
3024
3025 // Figure out the point after the waterfall loop before mangling the control
3026 // flow.
3027 executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
3028 OpsToWaterfall);
3029
3030 // The insertion point is now right after the original instruction.
3031 //
3032 // Keep the bitcast to the original vector type out of the loop. Doing this
3033 // saved an extra phi we don't need inside the loop.
3034 B.buildBitcast(DstReg, InsHi);
3035
3036 // Re-insert the constant offset add inside the waterfall loop.
3037 if (ShouldMoveIndexIntoLoop)
3038 reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
3039
3040 return;
3041 }
3042 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3043 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3044 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3045 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3046 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3047 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
3048 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
3049 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
3050 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
3051 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
3052 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3053 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
3054 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3055 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3056 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3057 case AMDGPU::G_AMDGPU_BUFFER_STORE:
3058 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3059 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3060 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3061 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
3062 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3063 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
3064 applyDefaultMapping(OpdMapper);
3065 executeInWaterfallLoop(B, MI, {1, 4});
3066 return;
3067 }
3068 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3069 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3070 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3071 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3072 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3073 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3074 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3075 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3076 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3077 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3078 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3079 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
3080 applyDefaultMapping(OpdMapper);
3081 executeInWaterfallLoop(B, MI, {2, 5});
3082 return;
3083 }
3084 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3085 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3086 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3087 applyDefaultMapping(OpdMapper);
3088 executeInWaterfallLoop(B, MI, {2, 5});
3089 return;
3090 }
3091 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3092 applyDefaultMapping(OpdMapper);
3093 executeInWaterfallLoop(B, MI, {3, 6});
3094 return;
3095 }
3096 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
3097 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
3098 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
3099 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
3100 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
3101 applyMappingSBufferLoad(B, OpdMapper);
3102 return;
3103 }
3104 case AMDGPU::G_INTRINSIC:
3105 case AMDGPU::G_INTRINSIC_CONVERGENT: {
3106 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3107 case Intrinsic::amdgcn_readlane: {
3108 substituteSimpleCopyRegs(OpdMapper, 2);
3109
3110 assert(OpdMapper.getVRegs(0).empty());
3111 assert(OpdMapper.getVRegs(3).empty());
3112
3113 // Make sure the index is an SGPR. It doesn't make sense to run this in a
3114 // waterfall loop, so assume it's a uniform value.
3115 constrainOpWithReadfirstlane(B, MI, 3); // Index
3116 return;
3117 }
3118 case Intrinsic::amdgcn_writelane: {
3119 assert(OpdMapper.getVRegs(0).empty());
3120 assert(OpdMapper.getVRegs(2).empty());
3121 assert(OpdMapper.getVRegs(3).empty());
3122
3123 substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
3124 constrainOpWithReadfirstlane(B, MI, 2); // Source value
3125 constrainOpWithReadfirstlane(B, MI, 3); // Index
3126 return;
3127 }
3128 case Intrinsic::amdgcn_interp_p1:
3129 case Intrinsic::amdgcn_interp_p2:
3130 case Intrinsic::amdgcn_interp_mov:
3131 case Intrinsic::amdgcn_interp_p1_f16:
3132 case Intrinsic::amdgcn_interp_p2_f16:
3133 case Intrinsic::amdgcn_lds_param_load: {
3134 applyDefaultMapping(OpdMapper);
3135
3136 // Readlane for m0 value, which is always the last operand.
3137 // FIXME: Should this be a waterfall loop instead?
3138 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3139 return;
3140 }
3141 case Intrinsic::amdgcn_interp_inreg_p10:
3142 case Intrinsic::amdgcn_interp_inreg_p2:
3143 case Intrinsic::amdgcn_interp_inreg_p10_f16:
3144 case Intrinsic::amdgcn_interp_inreg_p2_f16:
3145 case Intrinsic::amdgcn_interp_p10_rtz_f16:
3146 case Intrinsic::amdgcn_interp_p2_rtz_f16:
3147 applyDefaultMapping(OpdMapper);
3148 return;
3149 case Intrinsic::amdgcn_permlane16:
3150 case Intrinsic::amdgcn_permlanex16: {
3151 // Doing a waterfall loop over these wouldn't make any sense.
3152 substituteSimpleCopyRegs(OpdMapper, 2);
3153 substituteSimpleCopyRegs(OpdMapper, 3);
3156 return;
3157 }
3158 case Intrinsic::amdgcn_sbfe:
3159 applyMappingBFE(B, OpdMapper, true);
3160 return;
3161 case Intrinsic::amdgcn_ubfe:
3162 applyMappingBFE(B, OpdMapper, false);
3163 return;
3164 case Intrinsic::amdgcn_inverse_ballot:
3165 case Intrinsic::amdgcn_s_bitreplicate:
3166 case Intrinsic::amdgcn_s_quadmask:
3167 case Intrinsic::amdgcn_s_wqm:
3168 applyDefaultMapping(OpdMapper);
3169 constrainOpWithReadfirstlane(B, MI, 2); // Mask
3170 return;
3171 case Intrinsic::amdgcn_ballot:
3172 // Use default handling and insert copy to vcc source.
3173 break;
3174 }
3175 break;
3176 }
3177 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3178 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3179 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
3180 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3181 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3182 const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3184 assert(RSrcIntrin && RSrcIntrin->IsImage);
3185 // Non-images can have complications from operands that allow both SGPR
3186 // and VGPR. For now it's too complicated to figure out the final opcode
3187 // to derive the register bank from the MCInstrDesc.
3188 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3189 return;
3190 }
3191 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
3192 unsigned N = MI.getNumExplicitOperands() - 2;
3193 applyDefaultMapping(OpdMapper);
3195 return;
3196 }
3197 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
3198 case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
3199 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
3200 switch (IntrID) {
3201 case Intrinsic::amdgcn_ds_ordered_add:
3202 case Intrinsic::amdgcn_ds_ordered_swap: {
3203 // This is only allowed to execute with 1 lane, so readfirstlane is safe.
3204 assert(OpdMapper.getVRegs(0).empty());
3205 substituteSimpleCopyRegs(OpdMapper, 3);
3207 return;
3208 }
3209 case Intrinsic::amdgcn_ds_gws_init:
3210 case Intrinsic::amdgcn_ds_gws_barrier:
3211 case Intrinsic::amdgcn_ds_gws_sema_br: {
3212 // Only the first lane is executes, so readfirstlane is safe.
3213 substituteSimpleCopyRegs(OpdMapper, 1);
3215 return;
3216 }
3217 case Intrinsic::amdgcn_ds_gws_sema_v:
3218 case Intrinsic::amdgcn_ds_gws_sema_p:
3219 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
3220 // Only the first lane is executes, so readfirstlane is safe.
3222 return;
3223 }
3224 case Intrinsic::amdgcn_ds_append:
3225 case Intrinsic::amdgcn_ds_consume: {
3227 return;
3228 }
3229 case Intrinsic::amdgcn_s_sendmsg:
3230 case Intrinsic::amdgcn_s_sendmsghalt: {
3231 // FIXME: Should this use a waterfall loop?
3233 return;
3234 }
3235 case Intrinsic::amdgcn_s_setreg: {
3237 return;
3238 }
3239 case Intrinsic::amdgcn_s_ttracedata:
3241 return;
3242 case Intrinsic::amdgcn_raw_buffer_load_lds:
3243 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
3244 applyDefaultMapping(OpdMapper);
3245 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3247 constrainOpWithReadfirstlane(B, MI, 5); // soffset
3248 return;
3249 }
3250 case Intrinsic::amdgcn_struct_buffer_load_lds:
3251 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
3252 applyDefaultMapping(OpdMapper);
3253 constrainOpWithReadfirstlane(B, MI, 1); // rsrc
3255 constrainOpWithReadfirstlane(B, MI, 6); // soffset
3256 return;
3257 }
3258 case Intrinsic::amdgcn_global_load_lds: {
3259 applyDefaultMapping(OpdMapper);
3261 return;
3262 }
3263 case Intrinsic::amdgcn_lds_direct_load: {
3264 applyDefaultMapping(OpdMapper);
3265 // Readlane for m0 value, which is always the last operand.
3266 constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
3267 return;
3268 }
3269 case Intrinsic::amdgcn_exp_row:
3270 applyDefaultMapping(OpdMapper);
3272 return;
3273 case Intrinsic::amdgcn_s_sleep_var:
3274 assert(OpdMapper.getVRegs(1).empty());
3276 return;
3277 case Intrinsic::amdgcn_s_barrier_signal_var:
3278 case Intrinsic::amdgcn_s_barrier_join:
3279 case Intrinsic::amdgcn_s_wakeup_barrier:
3281 return;
3282 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
3284 return;
3285 case Intrinsic::amdgcn_s_barrier_init:
3288 return;
3289 case Intrinsic::amdgcn_s_get_barrier_state: {
3291 return;
3292 }
3293 default: {
3294 if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3296 // Non-images can have complications from operands that allow both SGPR
3297 // and VGPR. For now it's too complicated to figure out the final opcode
3298 // to derive the register bank from the MCInstrDesc.
3299 if (RSrcIntrin->IsImage) {
3300 applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
3301 return;
3302 }
3303 }
3304
3305 break;
3306 }
3307 }
3308 break;
3309 }
3310 case AMDGPU::G_SI_CALL: {
3311 // Use a set to avoid extra readfirstlanes in the case where multiple
3312 // operands are the same register.
3313 SmallSet<Register, 4> SGPROperandRegs;
3314
3315 if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3316 break;
3317
3318 // Move all copies to physical SGPRs that are used by the call instruction
3319 // into the loop block. Start searching for these copies until the
3320 // ADJCALLSTACKUP.
3321 unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3322 unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3323
3324 // Move all non-copies before the copies, so that a complete range can be
3325 // moved into the waterfall loop.
3326 SmallVector<MachineInstr *, 4> NonCopyInstrs;
3327 // Count of NonCopyInstrs found until the current LastCopy.
3328 unsigned NonCopyInstrsLen = 0;
3330 MachineBasicBlock::iterator LastCopy = Start;
3331 MachineBasicBlock *MBB = MI.getParent();
3334 while (Start->getOpcode() != FrameSetupOpcode) {
3335 --Start;
3336 bool IsCopy = false;
3337 if (Start->getOpcode() == AMDGPU::COPY) {
3338 auto &Dst = Start->getOperand(0);
3339 if (Dst.isReg()) {
3340 Register Reg = Dst.getReg();
3341 if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3342 IsCopy = true;
3343 } else {
3344 // Also move the copy from the scratch rsrc descriptor into the loop
3345 // to allow it to be optimized away.
3346 auto &Src = Start->getOperand(1);
3347 if (Src.isReg()) {
3348 Reg = Src.getReg();
3349 IsCopy = Info->getScratchRSrcReg() == Reg;
3350 }
3351 }
3352 }
3353 }
3354
3355 if (IsCopy) {
3356 LastCopy = Start;
3357 NonCopyInstrsLen = NonCopyInstrs.size();
3358 } else {
3359 NonCopyInstrs.push_back(&*Start);
3360 }
3361 }
3362 NonCopyInstrs.resize(NonCopyInstrsLen);
3363
3364 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3365 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3366 }
3367 Start = LastCopy;
3368
3369 // Do the same for copies after the loop
3370 NonCopyInstrs.clear();
3371 NonCopyInstrsLen = 0;
3373 LastCopy = End;
3374 while (End->getOpcode() != FrameDestroyOpcode) {
3375 ++End;
3376 bool IsCopy = false;
3377 if (End->getOpcode() == AMDGPU::COPY) {
3378 auto &Src = End->getOperand(1);
3379 if (Src.isReg()) {
3380 Register Reg = Src.getReg();
3381 IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3382 }
3383 }
3384
3385 if (IsCopy) {
3386 LastCopy = End;
3387 NonCopyInstrsLen = NonCopyInstrs.size();
3388 } else {
3389 NonCopyInstrs.push_back(&*End);
3390 }
3391 }
3392 NonCopyInstrs.resize(NonCopyInstrsLen);
3393
3394 End = LastCopy;
3395 ++LastCopy;
3396 for (auto *NonCopy : reverse(NonCopyInstrs)) {
3397 MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3398 }
3399
3400 ++End;
3401 B.setInsertPt(B.getMBB(), Start);
3402 executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
3403 break;
3404 }
3405 case AMDGPU::G_LOAD:
3406 case AMDGPU::G_ZEXTLOAD:
3407 case AMDGPU::G_SEXTLOAD: {
3408 if (applyMappingLoad(B, OpdMapper, MI))
3409 return;
3410 break;
3411 }
3412 case AMDGPU::G_DYN_STACKALLOC:
3413 applyMappingDynStackAlloc(B, OpdMapper, MI);
3414 return;
3415 case AMDGPU::G_STACKRESTORE: {
3416 applyDefaultMapping(OpdMapper);
3418 return;
3419 }
3420 case AMDGPU::G_SBFX:
3421 applyMappingBFE(B, OpdMapper, /*Signed*/ true);
3422 return;
3423 case AMDGPU::G_UBFX:
3424 applyMappingBFE(B, OpdMapper, /*Signed*/ false);
3425 return;
3426 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3427 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3428 applyMappingMAD_64_32(B, OpdMapper);
3429 return;
3430 case AMDGPU::G_PREFETCH: {
3431 if (!Subtarget.hasPrefetch()) {
3432 MI.eraseFromParent();
3433 return;
3434 }
3435 Register PtrReg = MI.getOperand(0).getReg();
3436 unsigned PtrBank = getRegBankID(PtrReg, MRI, AMDGPU::SGPRRegBankID);
3437 if (PtrBank == AMDGPU::VGPRRegBankID) {
3438 MI.eraseFromParent();
3439 return;
3440 }
3441 unsigned AS = MRI.getType(PtrReg).getAddressSpace();
3444 MI.eraseFromParent();
3445 return;
3446 }
3447 applyDefaultMapping(OpdMapper);
3448 return;
3449 }
3450 default:
3451 break;
3452 }
3453
3454 return applyDefaultMapping(OpdMapper);
3455}
3456
3457// vgpr, sgpr -> vgpr
3458// vgpr, agpr -> vgpr
3459// agpr, agpr -> agpr
3460// agpr, sgpr -> vgpr
3461static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3462 if (RB0 == AMDGPU::InvalidRegBankID)
3463 return RB1;
3464 if (RB1 == AMDGPU::InvalidRegBankID)
3465 return RB0;
3466
3467 if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3468 return AMDGPU::SGPRRegBankID;
3469
3470 if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3471 return AMDGPU::AGPRRegBankID;
3472
3473 return AMDGPU::VGPRRegBankID;
3474}
3475
3476static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3477 if (RB0 == AMDGPU::InvalidRegBankID)
3478 return RB1;
3479 if (RB1 == AMDGPU::InvalidRegBankID)
3480 return RB0;
3481
3482 // vcc, vcc -> vcc
3483 // vcc, sgpr -> vcc
3484 // vcc, vgpr -> vcc
3485 if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3486 return AMDGPU::VCCRegBankID;
3487
3488 // vcc, vgpr -> vgpr
3489 return regBankUnion(RB0, RB1);
3490}
3491
3493 const MachineInstr &MI) const {
3494 unsigned RegBank = AMDGPU::InvalidRegBankID;
3495
3496 for (const MachineOperand &MO : MI.operands()) {
3497 if (!MO.isReg())
3498 continue;
3499 Register Reg = MO.getReg();
3500 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3501 RegBank = regBankUnion(RegBank, Bank->getID());
3502 if (RegBank == AMDGPU::VGPRRegBankID)
3503 break;
3504 }
3505 }
3506
3507 return RegBank;
3508}
3509
3511 const MachineFunction &MF = *MI.getParent()->getParent();
3512 const MachineRegisterInfo &MRI = MF.getRegInfo();
3513 for (const MachineOperand &MO : MI.operands()) {
3514 if (!MO.isReg())
3515 continue;
3516 Register Reg = MO.getReg();
3517 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3518 if (Bank->getID() != AMDGPU::SGPRRegBankID)
3519 return false;
3520 }
3521 }
3522 return true;
3523}
3524
3527 const MachineFunction &MF = *MI.getParent()->getParent();
3528 const MachineRegisterInfo &MRI = MF.getRegInfo();
3529 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3530
3531 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3532 const MachineOperand &SrcOp = MI.getOperand(i);
3533 if (!SrcOp.isReg())
3534 continue;
3535
3536 unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3537 OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3538 }
3539 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3540 MI.getNumOperands());
3541}
3542
3545 const MachineFunction &MF = *MI.getParent()->getParent();
3546 const MachineRegisterInfo &MRI = MF.getRegInfo();
3547 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3548
3549 // Even though we technically could use SGPRs, this would require knowledge of
3550 // the constant bus restriction. Force all sources to VGPR (except for VCC).
3551 //
3552 // TODO: Unary ops are trivially OK, so accept SGPRs?
3553 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3554 const MachineOperand &Src = MI.getOperand(i);
3555 if (!Src.isReg())
3556 continue;
3557
3558 unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3559 unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3560 OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3561 }
3562
3563 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3564 MI.getNumOperands());
3565}
3566
3569 const MachineFunction &MF = *MI.getParent()->getParent();
3570 const MachineRegisterInfo &MRI = MF.getRegInfo();
3571 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3572
3573 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3574 const MachineOperand &Op = MI.getOperand(I);
3575 if (!Op.isReg())
3576 continue;
3577
3578 unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3579 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3580 }
3581
3582 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3583 MI.getNumOperands());
3584}
3585
3588 const MachineInstr &MI,
3589 int RsrcIdx) const {
3590 // The reported argument index is relative to the IR intrinsic call arguments,
3591 // so we need to shift by the number of defs and the intrinsic ID.
3592 RsrcIdx += MI.getNumExplicitDefs() + 1;
3593
3594 const int NumOps = MI.getNumOperands();
3595 SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3596
3597 // TODO: Should packed/unpacked D16 difference be reported here as part of
3598 // the value mapping?
3599 for (int I = 0; I != NumOps; ++I) {
3600 if (!MI.getOperand(I).isReg())
3601 continue;
3602
3603 Register OpReg = MI.getOperand(I).getReg();
3604 // We replace some dead address operands with $noreg
3605 if (!OpReg)
3606 continue;
3607
3608 unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3609
3610 // FIXME: Probably need a new intrinsic register bank searchable table to
3611 // handle arbitrary intrinsics easily.
3612 //
3613 // If this has a sampler, it immediately follows rsrc.
3614 const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3615
3616 if (MustBeSGPR) {
3617 // If this must be an SGPR, so we must report whatever it is as legal.
3618 unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3619 OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3620 } else {
3621 // Some operands must be VGPR, and these are easy to copy to.
3622 OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3623 }
3624 }
3625
3626 return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3627}
3628
3629/// Return the mapping for a pointer argument.
3632 Register PtrReg) const {
3633 LLT PtrTy = MRI.getType(PtrReg);
3634 unsigned Size = PtrTy.getSizeInBits();
3637 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3638
3639 // If we're using MUBUF instructions for global memory, an SGPR base register
3640 // is possible. Otherwise this needs to be a VGPR.
3641 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3642 return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3643}
3644
3647
3648 const MachineFunction &MF = *MI.getParent()->getParent();
3649 const MachineRegisterInfo &MRI = MF.getRegInfo();
3651 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3652 Register PtrReg = MI.getOperand(1).getReg();
3653 LLT PtrTy = MRI.getType(PtrReg);
3654 unsigned AS = PtrTy.getAddressSpace();
3655 unsigned PtrSize = PtrTy.getSizeInBits();
3656
3657 const ValueMapping *ValMapping;
3658 const ValueMapping *PtrMapping;
3659
3660 const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3661
3662 if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3663 if (isScalarLoadLegal(MI)) {
3664 // We have a uniform instruction so we want to use an SMRD load
3665 ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3666 PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3667 } else {
3668 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3669
3670 // If we're using MUBUF instructions for global memory, an SGPR base
3671 // register is possible. Otherwise this needs to be a VGPR.
3672 unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3673 AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3674
3675 PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3676 }
3677 } else {
3678 ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3679 PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3680 }
3681
3682 OpdsMapping[0] = ValMapping;
3683 OpdsMapping[1] = PtrMapping;
3685 1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3686 return Mapping;
3687
3688 // FIXME: Do we want to add a mapping for FLAT load, or should we just
3689 // handle that during instruction selection?
3690}
3691
3692unsigned
3694 const MachineRegisterInfo &MRI,
3695 unsigned Default) const {
3696 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3697 return Bank ? Bank->getID() : Default;
3698}
3699
3702 const MachineRegisterInfo &MRI,
3703 const TargetRegisterInfo &TRI) const {
3704 // Lie and claim anything is legal, even though this needs to be an SGPR
3705 // applyMapping will have to deal with it as a waterfall loop.
3706 unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3707 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3708 return AMDGPU::getValueMapping(Bank, Size);
3709}
3710
3713 const MachineRegisterInfo &MRI,
3714 const TargetRegisterInfo &TRI) const {
3715 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3716 return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3717}
3718
3721 const MachineRegisterInfo &MRI,
3722 const TargetRegisterInfo &TRI) const {
3723 unsigned Size = getSizeInBits(Reg, MRI, TRI);
3724 return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3725}
3726
3727///
3728/// This function must return a legal mapping, because
3729/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3730/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3731/// VGPR to SGPR generated is illegal.
3732///
3733// Operands that must be SGPRs must accept potentially divergent VGPRs as
3734// legal. These will be dealt with in applyMappingImpl.
3735//
3738 const MachineFunction &MF = *MI.getParent()->getParent();
3739 const MachineRegisterInfo &MRI = MF.getRegInfo();
3740
3741 if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3742 Register DstReg = MI.getOperand(0).getReg();
3743 Register SrcReg = MI.getOperand(1).getReg();
3744
3745 // The default logic bothers to analyze impossible alternative mappings. We
3746 // want the most straightforward mapping, so just directly handle this.
3747 const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
3748 const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
3749 assert(SrcBank && "src bank should have been assigned already");
3750
3751 // For COPY between a physical reg and an s1, there is no type associated so
3752 // we need to take the virtual register's type as a hint on how to interpret
3753 // s1 values.
3754 if (!SrcReg.isVirtual() && !DstBank &&
3755 MRI.getType(DstReg) == LLT::scalar(1))
3756 DstBank = &AMDGPU::VCCRegBank;
3757 else if (!DstReg.isVirtual() && MRI.getType(SrcReg) == LLT::scalar(1))
3758 DstBank = &AMDGPU::VCCRegBank;
3759
3760 if (!DstBank)
3761 DstBank = SrcBank;
3762
3763 unsigned Size = getSizeInBits(DstReg, MRI, *TRI);
3764 if (MI.getOpcode() != AMDGPU::G_FREEZE &&
3765 cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
3767
3768 const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3769 unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3770 SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3771 OpdsMapping[0] = &ValMap;
3772 if (MI.getOpcode() == AMDGPU::G_FREEZE)
3773 OpdsMapping[1] = &ValMap;
3774
3775 return getInstructionMapping(
3776 1, /*Cost*/ 1,
3777 /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3778 }
3779
3780 if (MI.isRegSequence()) {
3781 // If any input is a VGPR, the result must be a VGPR. The default handling
3782 // assumes any copy between banks is legal.
3783 unsigned BankID = AMDGPU::SGPRRegBankID;
3784
3785 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3786 auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3787 // It doesn't make sense to use vcc or scc banks here, so just ignore
3788 // them.
3789 if (OpBank != AMDGPU::SGPRRegBankID) {
3790 BankID = AMDGPU::VGPRRegBankID;
3791 break;
3792 }
3793 }
3794 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3795
3796 const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3797 return getInstructionMapping(
3798 1, /*Cost*/ 1,
3799 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3800 }
3801
3802 // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3803 // properly.
3804 //
3805 // TODO: There are additional exec masking dependencies to analyze.
3806 if (auto *PHI = dyn_cast<GPhi>(&MI)) {
3807 unsigned ResultBank = AMDGPU::InvalidRegBankID;
3808 Register DstReg = PHI->getReg(0);
3809
3810 // Sometimes the result may have already been assigned a bank.
3811 if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3812 ResultBank = DstBank->getID();
3813
3814 for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
3815 Register Reg = PHI->getIncomingValue(I);
3816 const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3817
3818 // FIXME: Assuming VGPR for any undetermined inputs.
3819 if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3820 ResultBank = AMDGPU::VGPRRegBankID;
3821 break;
3822 }
3823
3824 // FIXME: Need to promote SGPR case to s32
3825 unsigned OpBank = Bank->getID();
3826 ResultBank = regBankBoolUnion(ResultBank, OpBank);
3827 }
3828
3829 assert(ResultBank != AMDGPU::InvalidRegBankID);
3830
3831 unsigned Size = MRI.getType(DstReg).getSizeInBits();
3832
3833 const ValueMapping &ValMap =
3834 getValueMapping(0, Size, getRegBank(ResultBank));
3835 return getInstructionMapping(
3836 1, /*Cost*/ 1,
3837 /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3838 }
3839
3841 if (Mapping.isValid())
3842 return Mapping;
3843
3844 SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3845
3846 switch (MI.getOpcode()) {
3847 default:
3849
3850 case AMDGPU::G_AND:
3851 case AMDGPU::G_OR:
3852 case AMDGPU::G_XOR:
3853 case AMDGPU::G_MUL: {
3854 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3855 if (Size == 1) {
3856 const RegisterBank *DstBank
3857 = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3858
3859 unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3860 unsigned BankLHS = AMDGPU::InvalidRegBankID;
3861 unsigned BankRHS = AMDGPU::InvalidRegBankID;
3862 if (DstBank) {
3863 TargetBankID = DstBank->getID();
3864 if (DstBank == &AMDGPU::VCCRegBank) {
3865 TargetBankID = AMDGPU::VCCRegBankID;
3866 BankLHS = AMDGPU::VCCRegBankID;
3867 BankRHS = AMDGPU::VCCRegBankID;
3868 } else {
3869 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3870 AMDGPU::SGPRRegBankID);
3871 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3872 AMDGPU::SGPRRegBankID);
3873 }
3874 } else {
3875 BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3876 AMDGPU::VCCRegBankID);
3877 BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3878 AMDGPU::VCCRegBankID);
3879
3880 // Both inputs should be true booleans to produce a boolean result.
3881 if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3882 TargetBankID = AMDGPU::VGPRRegBankID;
3883 } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3884 TargetBankID = AMDGPU::VCCRegBankID;
3885 BankLHS = AMDGPU::VCCRegBankID;
3886 BankRHS = AMDGPU::VCCRegBankID;
3887 } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3888 TargetBankID = AMDGPU::SGPRRegBankID;
3889 }
3890 }
3891
3892 OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3893 OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3894 OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3895 break;
3896 }
3897
3898 if (Size == 64) {
3899
3900 if (isSALUMapping(MI)) {
3901 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3902 OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3903 } else {
3904 OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3905 unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3906 OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3907
3908 unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3909 OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3910 }
3911
3912 break;
3913 }
3914
3915 [[fallthrough]];
3916 }
3917 case AMDGPU::G_PTR_ADD:
3918 case AMDGPU::G_PTRMASK:
3919 case AMDGPU::G_ADD:
3920 case AMDGPU::G_SUB:
3921 case AMDGPU::G_SHL:
3922 case AMDGPU::G_LSHR:
3923 case AMDGPU::G_ASHR:
3924 case AMDGPU::G_UADDO:
3925 case AMDGPU::G_USUBO:
3926 case AMDGPU::G_UADDE:
3927 case AMDGPU::G_SADDE:
3928 case AMDGPU::G_USUBE:
3929 case AMDGPU::G_SSUBE:
3930 case AMDGPU::G_SMIN:
3931 case AMDGPU::G_SMAX:
3932 case AMDGPU::G_UMIN:
3933 case AMDGPU::G_UMAX:
3934 case AMDGPU::G_ABS:
3935 case AMDGPU::G_SHUFFLE_VECTOR:
3936 case AMDGPU::G_SBFX:
3937 case AMDGPU::G_UBFX:
3938 case AMDGPU::G_AMDGPU_S_MUL_I64_I32:
3939 case AMDGPU::G_AMDGPU_S_MUL_U64_U32:
3940 if (isSALUMapping(MI))
3941 return getDefaultMappingSOP(MI);
3942 return getDefaultMappingVOP(MI);
3943 case AMDGPU::G_FADD:
3944 case AMDGPU::G_FSUB:
3945 case AMDGPU::G_FMUL:
3946 case AMDGPU::G_FMA:
3947 case AMDGPU::G_FFLOOR:
3948 case AMDGPU::G_FCEIL:
3949 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
3950 case AMDGPU::G_FMINNUM:
3951 case AMDGPU::G_FMAXNUM:
3952 case AMDGPU::G_FMINIMUM:
3953 case AMDGPU::G_FMAXIMUM:
3954 case AMDGPU::G_INTRINSIC_TRUNC:
3955 case AMDGPU::G_STRICT_FADD:
3956 case AMDGPU::G_STRICT_FSUB:
3957 case AMDGPU::G_STRICT_FMUL:
3958 case AMDGPU::G_STRICT_FMA: {
3959 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3960 unsigned Size = Ty.getSizeInBits();
3961 if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
3962 (Size == 32 || Size == 16) && isSALUMapping(MI))
3963 return getDefaultMappingSOP(MI);
3964 return getDefaultMappingVOP(MI);
3965 }
3966 case AMDGPU::G_FPTOSI:
3967 case AMDGPU::G_FPTOUI:
3968 case AMDGPU::G_SITOFP:
3969 case AMDGPU::G_UITOFP: {
3970 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3971 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3972 if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
3974 return getDefaultMappingSOP(MI);
3975 return getDefaultMappingVOP(MI);
3976 }
3977 case AMDGPU::G_FPTRUNC:
3978 case AMDGPU::G_FPEXT: {
3979 unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3980 unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3981 if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
3983 return getDefaultMappingSOP(MI);
3984 return getDefaultMappingVOP(MI);
3985 }
3986 case AMDGPU::G_FSQRT:
3987 case AMDGPU::G_FEXP2:
3988 case AMDGPU::G_FLOG2: {
3989 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3990 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
3992 return getDefaultMappingSOP(MI);
3993 return getDefaultMappingVOP(MI);
3994 }
3995 case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3996 case AMDGPU::G_SSUBSAT:
3997 case AMDGPU::G_UADDSAT:
3998 case AMDGPU::G_USUBSAT:
3999 case AMDGPU::G_FMAD:
4000 case AMDGPU::G_FLDEXP:
4001 case AMDGPU::G_FMINNUM_IEEE:
4002 case AMDGPU::G_FMAXNUM_IEEE:
4003 case AMDGPU::G_FCANONICALIZE:
4004 case AMDGPU::G_STRICT_FLDEXP:
4005 case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
4006 case AMDGPU::G_FSHR: // TODO: Expand for scalar
4007 case AMDGPU::G_AMDGPU_FMIN_LEGACY:
4008 case AMDGPU::G_AMDGPU_FMAX_LEGACY:
4009 case AMDGPU::G_AMDGPU_RCP_IFLAG:
4010 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
4011 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
4012 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
4013 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
4014 case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
4015 case AMDGPU::G_AMDGPU_SMED3:
4016 case AMDGPU::G_AMDGPU_FMED3:
4017 return getDefaultMappingVOP(MI);
4018 case AMDGPU::G_UMULH:
4019 case AMDGPU::G_SMULH: {
4021 return getDefaultMappingSOP(MI);
4022 return getDefaultMappingVOP(MI);
4023 }
4024 case AMDGPU::G_AMDGPU_MAD_U64_U32:
4025 case AMDGPU::G_AMDGPU_MAD_I64_I32: {
4026 // Three possible mappings:
4027 //
4028 // - Default SOP
4029 // - Default VOP
4030 // - Scalar multiply: src0 and src1 are SGPRs, the rest is VOP.
4031 //
4032 // This allows instruction selection to keep the multiplication part of the
4033 // instruction on the SALU.
4034 bool AllSalu = true;
4035 bool MulSalu = true;
4036 for (unsigned i = 0; i < 5; ++i) {
4037 Register Reg = MI.getOperand(i).getReg();
4038 if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
4039 if (Bank->getID() != AMDGPU::SGPRRegBankID) {
4040 AllSalu = false;
4041 if (i == 2 || i == 3) {
4042 MulSalu = false;
4043 break;
4044 }
4045 }
4046 }
4047 }
4048
4049 if (AllSalu)
4050 return getDefaultMappingSOP(MI);
4051
4052 // If the multiply-add is full-rate in VALU, use that even if the
4053 // multiplication part is scalar. Accumulating separately on the VALU would
4054 // take two instructions.
4055 if (!MulSalu || Subtarget.hasFullRate64Ops())
4056 return getDefaultMappingVOP(MI);
4057
4058 // Keep the multiplication on the SALU, then accumulate on the VALU.
4059 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4060 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4061 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4062 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4063 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
4064 break;
4065 }
4066 case AMDGPU::G_IMPLICIT_DEF: {
4067 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4068 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4069 break;
4070 }
4071 case AMDGPU::G_FCONSTANT:
4072 case AMDGPU::G_CONSTANT:
4073 case AMDGPU::G_GLOBAL_VALUE:
4074 case AMDGPU::G_FRAME_INDEX:
4075 case AMDGPU::G_BLOCK_ADDR:
4076 case AMDGPU::G_READSTEADYCOUNTER:
4077 case AMDGPU::G_READCYCLECOUNTER: {
4078 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4079 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4080 break;
4081 }
4082 case AMDGPU::G_DYN_STACKALLOC: {
4083 // Result is always uniform, and a wave reduction is needed for the source.
4084 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4085 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4086 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
4087 break;
4088 }
4089 case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
4090 // This case is weird because we expect a physical register in the source,
4091 // but need to set a bank anyway.
4092 //
4093 // TODO: We could select the result to SGPR or VGPR
4094 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4095 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
4096 break;
4097 }
4098 case AMDGPU::G_INSERT: {
4099 unsigned BankID = getMappingType(MRI, MI);
4100 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4101 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4102 unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
4103 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4104 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4105 OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
4106 OpdsMapping[3] = nullptr;
4107 break;
4108 }
4109 case AMDGPU::G_EXTRACT: {
4110 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4111 unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4112 unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4113 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
4114 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
4115 OpdsMapping[2] = nullptr;
4116 break;
4117 }
4118 case AMDGPU::G_BUILD_VECTOR:
4119 case AMDGPU::G_BUILD_VECTOR_TRUNC: {
4120 LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4121 if (DstTy == LLT::fixed_vector(2, 16)) {
4122 unsigned DstSize = DstTy.getSizeInBits();
4123 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4124 unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4125 unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4126 unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
4127
4128 OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
4129 OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
4130 OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
4131 break;
4132 }
4133
4134 [[fallthrough]];
4135 }
4136 case AMDGPU::G_MERGE_VALUES:
4137 case AMDGPU::G_CONCAT_VECTORS: {
4138 unsigned Bank = getMappingType(MRI, MI);
4139 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4140 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4141
4142 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4143 // Op1 and Dst should use the same register bank.
4144 for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
4145 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
4146 break;
4147 }
4148 case AMDGPU::G_BITREVERSE:
4149 case AMDGPU::G_BITCAST:
4150 case AMDGPU::G_INTTOPTR:
4151 case AMDGPU::G_PTRTOINT:
4152 case AMDGPU::G_FABS:
4153 case AMDGPU::G_FNEG: {
4154 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4155 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4156 OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4157 break;
4158 }
4159 case AMDGPU::G_AMDGPU_FFBH_U32:
4160 case AMDGPU::G_AMDGPU_FFBL_B32:
4161 case AMDGPU::G_CTLZ_ZERO_UNDEF:
4162 case AMDGPU::G_CTTZ_ZERO_UNDEF: {
4163 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4164 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4165 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4166 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
4167 break;
4168 }
4169 case AMDGPU::G_CTPOP: {
4170 unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4171 unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4172 OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
4173
4174 // This should really be getValueMappingSGPR64Only, but allowing the generic
4175 // code to handle the register split just makes using LegalizerHelper more
4176 // difficult.
4177 OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
4178 break;
4179 }
4180 case AMDGPU::G_TRUNC: {
4181 Register Dst = MI.getOperand(0).getReg();
4182 Register Src = MI.getOperand(1).getReg();
4183 unsigned Bank = getRegBankID(Src, MRI);
4184 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4185 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4186 OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
4187 OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
4188 break;
4189 }
4190 case AMDGPU::G_ZEXT:
4191 case AMDGPU::G_SEXT:
4192 case AMDGPU::G_ANYEXT:
4193 case AMDGPU::G_SEXT_INREG: {
4194 Register Dst = MI.getOperand(0).getReg();
4195 Register Src = MI.getOperand(1).getReg();
4196 unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
4197 unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
4198
4199 unsigned DstBank;
4200 const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
4201 assert(SrcBank);
4202 switch (SrcBank->getID()) {
4203 case AMDGPU::SGPRRegBankID:
4204 DstBank = AMDGPU::SGPRRegBankID;
4205 break;
4206 default:
4207 DstBank = AMDGPU::VGPRRegBankID;
4208 break;
4209 }
4210
4211 // Scalar extend can use 64-bit BFE, but VGPRs require extending to
4212 // 32-bits, and then to 64.
4213 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
4214 OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
4215 SrcSize);
4216 break;
4217 }
4218 case AMDGPU::G_IS_FPCLASS: {
4219 Register SrcReg = MI.getOperand(1).getReg();
4220 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4221 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4222 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4223 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4224 break;
4225 }
4226 case AMDGPU::G_STORE: {
4227 assert(MI.getOperand(0).isReg());
4228 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4229
4230 // FIXME: We need to specify a different reg bank once scalar stores are
4231 // supported.
4232 const ValueMapping *ValMapping =
4233 AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4234 OpdsMapping[0] = ValMapping;
4235 OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4236 break;
4237 }
4238 case AMDGPU::G_ICMP:
4239 case AMDGPU::G_FCMP: {
4240 unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4241
4242 // See if the result register has already been constrained to vcc, which may
4243 // happen due to control flow intrinsic lowering.
4244 unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4245 AMDGPU::SGPRRegBankID);
4246 unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4247 unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
4248
4249 auto canUseSCCICMP = [&]() {
4250 auto Pred =
4251 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
4252 return Size == 32 ||
4253 (Size == 64 &&
4254 (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
4256 };
4257 auto canUseSCCFCMP = [&]() {
4258 return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
4259 };
4260
4261 bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
4262 bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
4263 Op2Bank == AMDGPU::SGPRRegBankID &&
4264 Op3Bank == AMDGPU::SGPRRegBankID &&
4265 (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
4266
4267 DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4268 unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4269
4270 // TODO: Use 32-bit for scalar output size.
4271 // SCC results will need to be copied to a 32-bit SGPR virtual register.
4272 const unsigned ResultSize = 1;
4273
4274 OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
4275 OpdsMapping[1] = nullptr; // Predicate Operand.
4276 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
4277 OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
4278 break;
4279 }
4280 case AMDGPU::G_EXTRACT_VECTOR_ELT: {
4281 // VGPR index can be used for waterfall when indexing a SGPR vector.
4282 unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
4283 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4284 unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4285 unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4286 unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
4287 unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
4288
4289 OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
4290 OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
4291
4292 // The index can be either if the source vector is VGPR.
4293 OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4294 break;
4295 }
4296 case AMDGPU::G_INSERT_VECTOR_ELT: {
4297 unsigned OutputBankID = isSALUMapping(MI) ?
4298 AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4299
4300 unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4301 unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4302 unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4303 unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
4304 unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
4305
4306 OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4307 OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
4308
4309 // This is a weird case, because we need to break down the mapping based on
4310 // the register bank of a different operand.
4311 if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
4312 OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
4313 InsertSize);
4314 } else {
4315 assert(InsertSize == 32 || InsertSize == 64);
4316 OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
4317 }
4318
4319 // The index can be either if the source vector is VGPR.
4320 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
4321 break;
4322 }
4323 case AMDGPU::G_UNMERGE_VALUES: {
4324 unsigned Bank = getMappingType(MRI, MI);
4325
4326 // Op1 and Dst should use the same register bank.
4327 // FIXME: Shouldn't this be the default? Why do we need to handle this?
4328 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4329 unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
4330 OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
4331 }
4332 break;
4333 }
4334 case AMDGPU::G_AMDGPU_BUFFER_LOAD:
4335 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4336 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4337 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4338 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4339 case AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE:
4340 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE:
4341 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE_TFE:
4342 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE:
4343 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT_TFE:
4344 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
4345 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE:
4346 case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
4347 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
4348 case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
4349 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
4350 case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
4351 case AMDGPU::G_AMDGPU_BUFFER_STORE:
4352 case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
4353 case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
4354 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
4355 case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
4356 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4357
4358 // rsrc
4359 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4360
4361 // vindex
4362 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4363
4364 // voffset
4365 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4366
4367 // soffset
4368 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4369
4370 // Any remaining operands are immediates and were correctly null
4371 // initialized.
4372 break;
4373 }
4374 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
4375 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
4376 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
4377 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
4378 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
4379 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
4380 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
4381 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
4382 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
4383 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
4384 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
4385 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
4386 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
4387 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
4388 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
4389 // vdata_out
4390 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4391
4392 // vdata_in
4393 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4394
4395 // rsrc
4396 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4397
4398 // vindex
4399 OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4400
4401 // voffset
4402 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4403
4404 // soffset
4405 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4406
4407 // Any remaining operands are immediates and were correctly null
4408 // initialized.
4409 break;
4410 }
4411 case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4412 // vdata_out
4413 OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4414
4415 // vdata_in
4416 OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4417
4418 // cmp
4419 OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4420
4421 // rsrc
4422 OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4423
4424 // vindex
4425 OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4426
4427 // voffset
4428 OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4429
4430 // soffset
4431 OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4432
4433 // Any remaining operands are immediates and were correctly null
4434 // initialized.
4435 break;
4436 }
4437 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
4438 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
4439 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
4440 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
4441 case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
4442 // Lie and claim everything is legal, even though some need to be
4443 // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4444 OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4445 OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4446
4447 // We need to convert this to a MUBUF if either the resource of offset is
4448 // VGPR.
4449 unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4450 unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4451 unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4452
4453 unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4454 OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4455 break;
4456 }
4457 case AMDGPU::G_INTRINSIC:
4458 case AMDGPU::G_INTRINSIC_CONVERGENT: {
4459 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
4460 default:
4462 case Intrinsic::amdgcn_div_fmas:
4463 case Intrinsic::amdgcn_div_fixup:
4464 case Intrinsic::amdgcn_trig_preop:
4465 case Intrinsic::amdgcn_sin:
4466 case Intrinsic::amdgcn_cos:
4467 case Intrinsic::amdgcn_log_clamp:
4468 case Intrinsic::amdgcn_rcp_legacy:
4469 case Intrinsic::amdgcn_rsq_legacy:
4470 case Intrinsic::amdgcn_rsq_clamp:
4471 case Intrinsic::amdgcn_fmul_legacy:
4472 case Intrinsic::amdgcn_fma_legacy:
4473 case Intrinsic::amdgcn_frexp_mant:
4474 case Intrinsic::amdgcn_frexp_exp:
4475 case Intrinsic::amdgcn_fract:
4476 case Intrinsic::amdgcn_cvt_pknorm_i16:
4477 case Intrinsic::amdgcn_cvt_pknorm_u16:
4478 case Intrinsic::amdgcn_cvt_pk_i16:
4479 case Intrinsic::amdgcn_cvt_pk_u16:
4480 case Intrinsic::amdgcn_fmed3:
4481 case Intrinsic::amdgcn_cubeid:
4482 case Intrinsic::amdgcn_cubema:
4483 case Intrinsic::amdgcn_cubesc:
4484 case Intrinsic::amdgcn_cubetc:
4485 case Intrinsic::amdgcn_sffbh:
4486 case Intrinsic::amdgcn_fmad_ftz:
4487 case Intrinsic::amdgcn_mbcnt_lo:
4488 case Intrinsic::amdgcn_mbcnt_hi:
4489 case Intrinsic::amdgcn_mul_u24:
4490 case Intrinsic::amdgcn_mul_i24:
4491 case Intrinsic::amdgcn_mulhi_u24:
4492 case Intrinsic::amdgcn_mulhi_i24:
4493 case Intrinsic::amdgcn_lerp:
4494 case Intrinsic::amdgcn_sad_u8:
4495 case Intrinsic::amdgcn_msad_u8:
4496 case Intrinsic::amdgcn_sad_hi_u8:
4497 case Intrinsic::amdgcn_sad_u16:
4498 case Intrinsic::amdgcn_qsad_pk_u16_u8:
4499 case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4500 case Intrinsic::amdgcn_mqsad_u32_u8:
4501 case Intrinsic::amdgcn_cvt_pk_u8_f32:
4502 case Intrinsic::amdgcn_alignbyte:
4503 case Intrinsic::amdgcn_perm:
4504 case Intrinsic::amdgcn_fdot2:
4505 case Intrinsic::amdgcn_sdot2:
4506 case Intrinsic::amdgcn_udot2:
4507 case Intrinsic::amdgcn_sdot4:
4508 case Intrinsic::amdgcn_udot4:
4509 case Intrinsic::amdgcn_sdot8:
4510 case Intrinsic::amdgcn_udot8:
4511 case Intrinsic::amdgcn_fdot2_bf16_bf16:
4512 case Intrinsic::amdgcn_fdot2_f16_f16:
4513 case Intrinsic::amdgcn_fdot2_f32_bf16:
4514 case Intrinsic::amdgcn_sudot4:
4515 case Intrinsic::amdgcn_sudot8:
4516 case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
4517 case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
4518 case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
4519 case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
4520 case Intrinsic::amdgcn_cvt_f32_fp8:
4521 case Intrinsic::amdgcn_cvt_f32_bf8:
4522 case Intrinsic::amdgcn_cvt_pk_f32_fp8:
4523 case Intrinsic::amdgcn_cvt_pk_f32_bf8:
4524 case Intrinsic::amdgcn_cvt_pk_fp8_f32:
4525 case Intrinsic::amdgcn_cvt_pk_bf8_f32:
4526 case Intrinsic::amdgcn_cvt_sr_fp8_f32:
4527 case Intrinsic::amdgcn_cvt_sr_bf8_f32:
4528 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
4529 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
4530 case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
4531 case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
4532 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
4533 case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
4534 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
4535 case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
4536 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8:
4537 case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8:
4538 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8:
4539 case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8:
4540 case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4:
4541 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
4542 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
4543 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
4544 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
4545 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
4546 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
4547 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4:
4548 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
4549 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
4550 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
4551 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8:
4552 return getDefaultMappingVOP(MI);
4553 case Intrinsic::amdgcn_log:
4554 case Intrinsic::amdgcn_exp2:
4555 case Intrinsic::amdgcn_rcp:
4556 case Intrinsic::amdgcn_rsq:
4557 case Intrinsic::amdgcn_sqrt: {
4558 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4559 if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
4561 return getDefaultMappingSOP(MI);
4562 return getDefaultMappingVOP(MI);
4563 }
4564 case Intrinsic::amdgcn_sbfe:
4565 case Intrinsic::amdgcn_ubfe:
4566 if (isSALUMapping(MI))
4567 return getDefaultMappingSOP(MI);
4568 return getDefaultMappingVOP(MI);
4569 case Intrinsic::amdgcn_ds_swizzle:
4570 case Intrinsic::amdgcn_ds_permute:
4571 case Intrinsic::amdgcn_ds_bpermute:
4572 case Intrinsic::amdgcn_update_dpp:
4573 case Intrinsic::amdgcn_mov_dpp8:
4574 case Intrinsic::amdgcn_mov_dpp:
4575 case Intrinsic::amdgcn_strict_wwm:
4576 case Intrinsic::amdgcn_wwm:
4577 case Intrinsic::amdgcn_strict_wqm:
4578 case Intrinsic::amdgcn_wqm:
4579 case Intrinsic::amdgcn_softwqm:
4580 case Intrinsic::amdgcn_set_inactive:
4581 case Intrinsic::amdgcn_set_inactive_chain_arg:
4582 case Intrinsic::amdgcn_permlane64:
4584 case Intrinsic::amdgcn_cvt_pkrtz:
4586 return getDefaultMappingSOP(MI);
4587 return getDefaultMappingVOP(MI);
4588 case Intrinsic::amdgcn_kernarg_segment_ptr:
4589 case Intrinsic::amdgcn_s_getpc:
4590 case Intrinsic::amdgcn_groupstaticsize:
4591 case Intrinsic::amdgcn_reloc_constant:
4592 case Intrinsic::returnaddress: {
4593 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4594 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4595 break;
4596 }
4597 case Intrinsic::amdgcn_wqm_vote: {
4598 unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4599 OpdsMapping[0] = OpdsMapping[2]
4600 = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4601 break;
4602 }
4603 case Intrinsic::amdgcn_ps_live: {
4604 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4605 break;
4606 }
4607 case Intrinsic::amdgcn_div_scale: {
4608 unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4609 unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4610 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4611 OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4612
4613 unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4614 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4615 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4616 break;
4617 }
4618 case Intrinsic::amdgcn_class: {
4619 Register Src0Reg = MI.getOperand(2).getReg();
4620 Register Src1Reg = MI.getOperand(3).getReg();
4621 unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4622 unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4623 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4624 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4625 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4626 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4627 break;
4628 }
4629 case Intrinsic::amdgcn_icmp:
4630 case Intrinsic::amdgcn_fcmp: {
4631 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4632 // This is not VCCRegBank because this is not used in boolean contexts.
4633 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4634 unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4635 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4636 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4637 break;
4638 }
4639 case Intrinsic::amdgcn_readlane: {
4640 // This must be an SGPR, but accept a VGPR.
4641 Register IdxReg = MI.getOperand(3).getReg();
4642 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4643 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4644 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4645 [[fallthrough]];
4646 }
4647 case Intrinsic::amdgcn_readfirstlane: {
4648 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4649 unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4650 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4651 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4652 break;
4653 }
4654 case Intrinsic::amdgcn_writelane: {
4655 unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4656 Register SrcReg = MI.getOperand(2).getReg();
4657 unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4658 unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4659 Register IdxReg = MI.getOperand(3).getReg();
4660 unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4661 unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4662 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4663
4664 // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4665 // to legalize.
4666 OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4667 OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4668 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4669 break;
4670 }
4671 case Intrinsic::amdgcn_if_break: {
4672 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4673 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4674 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4675 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4676 break;
4677 }
4678 case Intrinsic::amdgcn_permlane16:
4679 case Intrinsic::amdgcn_permlanex16: {
4680 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4681 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4682 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4683 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4684 OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4685 OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4686 break;
4687 }
4688 case Intrinsic::amdgcn_permlane16_var:
4689 case Intrinsic::amdgcn_permlanex16_var: {
4690 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4691 OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4692 OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4693 OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4694 OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4695 break;
4696 }
4697 case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4698 case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4699 case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4700 case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4701 case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4702 case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4703 case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4704 case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4705 case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4706 case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4707 case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4708 case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4709 case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4710 case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4711 case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4712 case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4713 case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4714 case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4715 case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4716 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4717 case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4718 case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4719 case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4720 case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4721 case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4722 case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4723 case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4724 case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4725 case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4726 case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4727 case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32:
4728 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_bf8:
4729 case Intrinsic::amdgcn_mfma_f32_16x16x32_bf8_fp8:
4730 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_bf8:
4731 case Intrinsic::amdgcn_mfma_f32_16x16x32_fp8_fp8:
4732 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_bf8:
4733 case Intrinsic::amdgcn_mfma_f32_32x32x16_bf8_fp8:
4734 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_bf8:
4735 case Intrinsic::amdgcn_mfma_f32_32x32x16_fp8_fp8: {
4736 // Default for MAI intrinsics.
4737 // srcC can also be an immediate which can be folded later.
4738 // FIXME: Should we eventually add an alternative mapping with AGPR src
4739 // for srcA/srcB?
4740 //
4741 // vdst, srcA, srcB, srcC
4743 OpdsMapping[0] =
4744 Info->mayNeedAGPRs()
4745 ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4746 : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4747 OpdsMapping[2] = getVGPROpMappin