1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32 bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
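///
/// As a rough illustrative sketch of the two boolean forms (not literal pass
/// output; the virtual register names are hypothetical), a VALU boolean stays
/// an s1 value on the VCC bank, while an SALU boolean is carried as a widened
/// 32-bit SGPR value:
///
///   %vbool:vcc(s1)   = G_AND %va:vcc(s1), %vb:vcc(s1)        ; VALU boolean
///   %sbool:sgpr(s32) = G_AND %sa:sgpr(s32), %sb:sgpr(s32)    ; widened SALU boolean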
47 ///
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for
53 /// most instructions). This is one unique SGPR, so the same SGPR may be used
54 /// for multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
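///
/// For example (illustrative assembly, assuming the pre-gfx10 limit of a
/// single constant bus read per instruction):
///
///   v_fma_f32 v0, s0, s0, v1   ; ok: one unique SGPR, read twice
///   v_fma_f32 v0, s0, s1, v1   ; violates the constant bus restriction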
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
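///
/// A sketch of that trivial mapping (illustrative only, with hypothetical
/// virtual registers): a divergent G_FADD whose inputs were assigned to SGPRs
/// simply has VGPR copies inserted for its sources:
///
///   %a:sgpr(s32)   = ...
///   %b:sgpr(s32)   = ...
///   %a_v:vgpr(s32) = COPY %a
///   %b_v:vgpr(s32) = COPY %b
///   %r:vgpr(s32)   = G_FADD %a_v, %b_v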
68 ///
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
80 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
81 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
82 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
83 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
84 #include "llvm/IR/IntrinsicsAMDGPU.h"
85 
86 #define GET_TARGET_REGBANK_IMPL
87 #include "AMDGPUGenRegisterBank.inc"
88 
89 // This file will be TableGen'ed at some point.
90 #include "AMDGPUGenRegisterBankInfo.def"
91 
92 using namespace llvm;
93 using namespace MIPatternMatch;
94 
95 namespace {
96 
97 // Observer to apply a register bank to new registers created by LegalizerHelper.
98 class ApplyRegBankMapping final : public GISelChangeObserver {
99 private:
100  const AMDGPURegisterBankInfo &RBI;
101  MachineRegisterInfo &MRI;
102  const RegisterBank *NewBank;
103  SmallVector<MachineInstr *, 4> NewInsts;
104 
105 public:
106  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
107  MachineRegisterInfo &MRI_, const RegisterBank *RB)
108  : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
109 
110  ~ApplyRegBankMapping() {
111  for (MachineInstr *MI : NewInsts)
112  applyBank(*MI);
113  }
114 
115  /// Set any registers that don't have a set register class or bank to SALU.
116  void applyBank(MachineInstr &MI) {
117  const unsigned Opc = MI.getOpcode();
118  if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
119  Opc == AMDGPU::G_SEXT) {
120  // LegalizerHelper wants to use the basic legalization artifacts when
121  // widening etc. We don't handle selection with vcc in artifact sources,
122  // so we need to use a select instead to handle these properly.
123  Register DstReg = MI.getOperand(0).getReg();
124  Register SrcReg = MI.getOperand(1).getReg();
125  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
126  if (SrcBank == &AMDGPU::VCCRegBank) {
127  const LLT S32 = LLT::scalar(32);
128  assert(MRI.getType(SrcReg) == LLT::scalar(1));
129  assert(MRI.getType(DstReg) == S32);
130  assert(NewBank == &AMDGPU::VGPRRegBank);
131 
132  // Replace the extension with a select, which really uses the boolean
133  // source.
134  MachineIRBuilder B(MI);
135  auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
136  auto False = B.buildConstant(S32, 0);
137  B.buildSelect(DstReg, SrcReg, True, False);
138  MRI.setRegBank(True.getReg(0), *NewBank);
139  MRI.setRegBank(False.getReg(0), *NewBank);
140  MI.eraseFromParent();
141  }
142 
143  assert(!MRI.getRegClassOrRegBank(DstReg));
144  MRI.setRegBank(DstReg, *NewBank);
145  return;
146  }
147 
148 #ifndef NDEBUG
149  if (Opc == AMDGPU::G_TRUNC) {
150  Register DstReg = MI.getOperand(0).getReg();
151  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
152  assert(DstBank != &AMDGPU::VCCRegBank);
153  }
154 #endif
155 
156  for (MachineOperand &Op : MI.operands()) {
157  if (!Op.isReg())
158  continue;
159 
160  // We may see physical registers if building a real MI
161  Register Reg = Op.getReg();
162  if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
163  continue;
164 
165  const RegisterBank *RB = NewBank;
166  if (MRI.getType(Reg) == LLT::scalar(1)) {
167  assert(NewBank == &AMDGPU::VGPRRegBank &&
168  "s1 operands should only be used for vector bools");
169  assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
170  MI.getOpcode() != AMDGPU::G_ANYEXT) &&
171  "not expecting legalization artifacts here");
172  RB = &AMDGPU::VCCRegBank;
173  }
174 
175  MRI.setRegBank(Reg, *RB);
176  }
177  }
178 
179  void erasingInstr(MachineInstr &MI) override {}
180 
181  void createdInstr(MachineInstr &MI) override {
182  // At this point, the instruction was just inserted and has no operands.
183  NewInsts.push_back(&MI);
184  }
185 
186  void changingInstr(MachineInstr &MI) override {}
187  void changedInstr(MachineInstr &MI) override {
188  // FIXME: In principle we should probably add the instruction to NewInsts,
189  // but the way the LegalizerHelper uses the observer, we will always see the
190  // registers we need to set the regbank on also referenced in a new
191  // instruction.
192  }
193 };
194 
195 }
196 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
197  : Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
198  TII(Subtarget.getInstrInfo()) {
199 
200  // HACK: Until this is fully tablegen'd.
201  static llvm::once_flag InitializeRegisterBankFlag;
202 
203  static auto InitializeRegisterBankOnce = [this]() {
204  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
205  &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
206  &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
207  (void)this;
208  };
209 
210  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
211 }
212 
213 static bool isVectorRegisterBank(const RegisterBank &Bank) {
214  unsigned BankID = Bank.getID();
215  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
216 }
217 
218 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
219  const RegisterBank &Src,
220  unsigned Size) const {
221  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
222  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
223  (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
224  return std::numeric_limits<unsigned>::max();
225  }
226 
227  // Bool values are tricky, because the meaning is based on context. The SCC
228  // and VCC banks are for the natural scalar and vector conditions produced by
229  // a compare.
230  //
231  // Legalization doesn't know about the necessary context, so an s1 use may
232  // have been a truncate from an arbitrary value, in which case a copy (lowered
233  // as a compare with 0) needs to be inserted.
234  if (Size == 1 &&
235  (Dst.getID() == AMDGPU::SGPRRegBankID) &&
236  (isVectorRegisterBank(Src) ||
237  Src.getID() == AMDGPU::SGPRRegBankID ||
238  Src.getID() == AMDGPU::VCCRegBankID))
239  return std::numeric_limits<unsigned>::max();
240 
241  // There is no direct copy between AGPRs.
242  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
243  Src.getID() == AMDGPU::AGPRRegBankID)
244  return 4;
245 
246  return RegisterBankInfo::copyCost(Dst, Src, Size);
247 }
248 
249 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
250  const ValueMapping &ValMapping,
251  const RegisterBank *CurBank) const {
252  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
253  // VGPR.
254  // FIXME: Is there a better way to do this?
255  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
256  return 10; // This is expensive.
257 
258  assert(ValMapping.NumBreakDowns == 2 &&
259  ValMapping.BreakDown[0].Length == 32 &&
260  ValMapping.BreakDown[0].StartIdx == 0 &&
261  ValMapping.BreakDown[1].Length == 32 &&
262  ValMapping.BreakDown[1].StartIdx == 32 &&
263  ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
264 
265  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
266  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
267  // want.
268 
269  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
270  // alignment restrictions, but this probably isn't important.
271  return 1;
272 }
273 
274 const RegisterBank &
275 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
276  LLT Ty) const {
277  if (&RC == &AMDGPU::SReg_1RegClass)
278  return AMDGPU::VCCRegBank;
279 
280  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
281  // VCC-like use.
282  if (TRI->isSGPRClass(&RC)) {
283  // FIXME: This probably came from a copy from a physical register, which
284  // should be inferable from the copied to-type. We don't have many boolean
285  // physical register constraints so just assume a normal SGPR for now.
286  if (!Ty.isValid())
287  return AMDGPU::SGPRRegBank;
288 
289  return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
290  }
291 
292  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
293 }
294 
295 template <unsigned NumOps>
296 RegisterBankInfo::InstructionMappings
297 AMDGPURegisterBankInfo::addMappingFromTable(
298  const MachineInstr &MI, const MachineRegisterInfo &MRI,
299  const std::array<unsigned, NumOps> RegSrcOpIdx,
300  ArrayRef<OpRegBankEntry<NumOps>> Table) const {
301 
302  InstructionMappings AltMappings;
303 
305 
306  unsigned Sizes[NumOps];
307  for (unsigned I = 0; I < NumOps; ++I) {
308  Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
309  Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
310  }
311 
312  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
313  unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
314  Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
315  }
316 
317  // getInstrMapping's default mapping uses ID 1, so start at 2.
318  unsigned MappingID = 2;
319  for (const auto &Entry : Table) {
320  for (unsigned I = 0; I < NumOps; ++I) {
321  int OpIdx = RegSrcOpIdx[I];
322  Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
323  }
324 
325  AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
326  getOperandsMapping(Operands),
327  Operands.size()));
328  }
329 
330  return AltMappings;
331 }
332 
333 RegisterBankInfo::InstructionMappings
334 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
335  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
336  switch (MI.getIntrinsicID()) {
337  case Intrinsic::amdgcn_readlane: {
338  static const OpRegBankEntry<3> Table[2] = {
339  // Perfectly legal.
340  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
341 
342  // Need a readfirstlane for the index.
343  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
344  };
345 
346  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
347  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
348  }
349  case Intrinsic::amdgcn_writelane: {
350  static const OpRegBankEntry<4> Table[4] = {
351  // Perfectly legal.
352  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
353 
354  // Need readfirstlane of first op
355  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
356 
357  // Need readfirstlane of second op
358  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
359 
360  // Need readfirstlane of both ops
361  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
362  };
363 
364  // dst, value, lane select, previous value (vdst_in)
365  const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
366  return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
367  }
368  default:
369  return RegisterBankInfo::getInstrAlternativeMappings(MI);
370  }
371 }
372 
373 RegisterBankInfo::InstructionMappings
374 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
375  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
376 
377  switch (MI.getIntrinsicID()) {
378  case Intrinsic::amdgcn_s_buffer_load: {
379  static const OpRegBankEntry<2> Table[4] = {
380  // Perfectly legal.
381  { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
382 
383  // Only need 1 register in loop
384  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
385 
386  // Have to waterfall the resource.
387  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
388 
389  // Have to waterfall the resource, and the offset.
390  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
391  };
392 
393  // rsrc, offset
394  const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
395  return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
396  }
397  case Intrinsic::amdgcn_ds_ordered_add:
398  case Intrinsic::amdgcn_ds_ordered_swap: {
399  // VGPR = M0, VGPR
400  static const OpRegBankEntry<3> Table[2] = {
401  // Perfectly legal.
402  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
403 
404  // Need a readfirstlane for m0
405  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
406  };
407 
408  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
409  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
410  }
411  case Intrinsic::amdgcn_s_sendmsg:
412  case Intrinsic::amdgcn_s_sendmsghalt: {
413  // FIXME: Should have no register for immediate
414  static const OpRegBankEntry<1> Table[2] = {
415  // Perfectly legal.
416  { { AMDGPU::SGPRRegBankID }, 1 },
417 
418  // Need readlane
419  { { AMDGPU::VGPRRegBankID }, 3 }
420  };
421 
422  const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
423  return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
424  }
425  default:
426  return RegisterBankInfo::getInstrAlternativeMappings(MI);
427  }
428 }
429 
430 // FIXME: Returns uniform if there's no source value information. This is
431 // probably wrong.
432 static bool isScalarLoadLegal(const MachineInstr &MI) {
433  if (!MI.hasOneMemOperand())
434  return false;
435 
436  const MachineMemOperand *MMO = *MI.memoperands_begin();
437  const unsigned AS = MMO->getAddrSpace();
438  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
439  AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
440  // Require 4-byte alignment.
441  return MMO->getAlign() >= Align(4) &&
442  // Can't do a scalar atomic load.
443  !MMO->isAtomic() &&
444  // Don't use scalar loads for volatile accesses to non-constant address
445  // spaces.
446  (IsConst || !MMO->isVolatile()) &&
447  // Memory must be known constant, or not written before this load.
448  (IsConst || MMO->isInvariant() || (MMO->getFlags() & MONoClobber)) &&
449  AMDGPUInstrInfo::isUniformMMO(MMO);
450 }
451 
452 RegisterBankInfo::InstructionMappings
453 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
454  const MachineInstr &MI) const {
455 
456  const MachineFunction &MF = *MI.getParent()->getParent();
457  const MachineRegisterInfo &MRI = MF.getRegInfo();
458 
459 
460  InstructionMappings AltMappings;
461  switch (MI.getOpcode()) {
462  case TargetOpcode::G_CONSTANT: {
463  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
464  if (Size == 1) {
465  static const OpRegBankEntry<1> Table[3] = {
466  { { AMDGPU::VGPRRegBankID }, 1 },
467  { { AMDGPU::SGPRRegBankID }, 1 },
468  { { AMDGPU::VCCRegBankID }, 1 }
469  };
470 
471  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
472  }
473 
475  }
476  case TargetOpcode::G_FCONSTANT:
477  case TargetOpcode::G_FRAME_INDEX:
478  case TargetOpcode::G_GLOBAL_VALUE: {
479  static const OpRegBankEntry<1> Table[2] = {
480  { { AMDGPU::VGPRRegBankID }, 1 },
481  { { AMDGPU::SGPRRegBankID }, 1 }
482  };
483 
484  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
485  }
486  case TargetOpcode::G_AND:
487  case TargetOpcode::G_OR:
488  case TargetOpcode::G_XOR: {
489  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
490 
491  if (Size == 1) {
492  // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
493  const InstructionMapping &SCCMapping = getInstructionMapping(
494  1, 1, getOperandsMapping(
495  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
496  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
497  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
498  3); // Num Operands
499  AltMappings.push_back(&SCCMapping);
500 
501  const InstructionMapping &VCCMapping0 = getInstructionMapping(
502  2, 1, getOperandsMapping(
503  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
504  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
505  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
506  3); // Num Operands
507  AltMappings.push_back(&VCCMapping0);
508  return AltMappings;
509  }
510 
511  if (Size != 64)
512  break;
513 
514  const InstructionMapping &SSMapping = getInstructionMapping(
515  1, 1, getOperandsMapping(
516  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
517  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
518  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
519  3); // Num Operands
520  AltMappings.push_back(&SSMapping);
521 
522  const InstructionMapping &VVMapping = getInstructionMapping(
523  2, 2, getOperandsMapping(
524  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
525  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
526  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
527  3); // Num Operands
528  AltMappings.push_back(&VVMapping);
529  break;
530  }
531  case TargetOpcode::G_LOAD:
532  case TargetOpcode::G_ZEXTLOAD:
533  case TargetOpcode::G_SEXTLOAD: {
534  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
535  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
536  unsigned PtrSize = PtrTy.getSizeInBits();
537  unsigned AS = PtrTy.getAddressSpace();
538 
540  AS != AMDGPUAS::PRIVATE_ADDRESS) &&
542  const InstructionMapping &SSMapping = getInstructionMapping(
543  1, 1, getOperandsMapping(
544  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
545  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
546  2); // Num Operands
547  AltMappings.push_back(&SSMapping);
548  }
549 
550  const InstructionMapping &VVMapping = getInstructionMapping(
551  2, 1,
553  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
554  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
555  2); // Num Operands
556  AltMappings.push_back(&VVMapping);
557 
558  // It may be possible to have a vgpr = load sgpr mapping here, because
559  // the mubuf instructions support this kind of load, but probably for only
560  // gfx7 and older. However, the addressing mode matching in the instruction
561  // selector should be able to do a better job of detecting and selecting
562  // these kinds of loads from the vgpr = load vgpr mapping.
563 
564  return AltMappings;
565 
566  }
567  case TargetOpcode::G_SELECT: {
568  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
569  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
570  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
571  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
572  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
573  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
574  4); // Num Operands
575  AltMappings.push_back(&SSMapping);
576 
577  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
578  getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
579  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
580  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
581  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
582  4); // Num Operands
583  AltMappings.push_back(&VVMapping);
584 
585  return AltMappings;
586  }
587  case TargetOpcode::G_UADDE:
588  case TargetOpcode::G_USUBE:
589  case TargetOpcode::G_SADDE:
590  case TargetOpcode::G_SSUBE: {
591  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
592  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
594  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
595  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
596  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
597  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
598  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
599  5); // Num Operands
600  AltMappings.push_back(&SSMapping);
601 
602  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
603  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
604  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
605  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
606  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
607  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
608  5); // Num Operands
609  AltMappings.push_back(&VVMapping);
610  return AltMappings;
611  }
612  case AMDGPU::G_BRCOND: {
613  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
614 
615  // TODO: Change type to 32 for scalar
616  const InstructionMapping &SMapping = getInstructionMapping(
617  1, 1, getOperandsMapping(
618  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
619  2); // Num Operands
620  AltMappings.push_back(&SMapping);
621 
622  const InstructionMapping &VMapping = getInstructionMapping(
623  1, 1, getOperandsMapping(
624  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
625  2); // Num Operands
626  AltMappings.push_back(&VMapping);
627  return AltMappings;
628  }
629  case AMDGPU::G_INTRINSIC:
630  return getInstrAlternativeMappingsIntrinsic(MI, MRI);
631  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
632  return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
633  default:
634  break;
635  }
636  return RegisterBankInfo::getInstrAlternativeMappings(MI);
637 }
638 
642  LLT HalfTy,
643  Register Reg) const {
644  assert(HalfTy.getSizeInBits() == 32);
645  MachineRegisterInfo *MRI = B.getMRI();
646  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
647  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
648  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
649  MRI->setRegBank(LoLHS, *Bank);
650  MRI->setRegBank(HiLHS, *Bank);
651 
652  Regs.push_back(LoLHS);
653  Regs.push_back(HiLHS);
654 
655  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
656  .addDef(LoLHS)
657  .addDef(HiLHS)
658  .addUse(Reg);
659 }
660 
661 /// Replace the current type each register in \p Regs has with \p NewTy
663  LLT NewTy) {
664  for (Register Reg : Regs) {
666  MRI.setType(Reg, NewTy);
667  }
668 }
669 
671  if (Ty.isVector()) {
674  Ty.getElementType());
675  }
676 
677  assert(Ty.getScalarSizeInBits() % 2 == 0);
678  return LLT::scalar(Ty.getScalarSizeInBits() / 2);
679 }
680 
681 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
682 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
683 /// execute the instruction for each unique combination of values in all lanes
684 /// in the wave. The block will be split such that rest of the instructions are
685 /// moved to a new block.
686 ///
687 /// Essentially performs this loop:
688 ///
689 /// Save Execution Mask
690 /// For (Lane : Wavefront) {
691 ///   Enable Lane, Disable all other lanes
692 ///   SGPR = read SGPR value for current lane from VGPR
693 ///   VGPRResult[Lane] = use_op SGPR
694 /// }
695 /// Restore Execution Mask
696 ///
697 /// There is additional complexity to try for compare values to identify the
698 /// unique values used.
699 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
700  MachineIRBuilder &B,
701  iterator_range<MachineBasicBlock::iterator> Range,
702  SmallSet<Register, 4> &SGPROperandRegs,
703  MachineRegisterInfo &MRI) const {
704 
705  // Track use registers which have already been expanded with a readfirstlane
706  // sequence. This may have multiple uses if moving a sequence.
707  DenseMap<Register, Register> WaterfalledRegMap;
708 
709  MachineBasicBlock &MBB = B.getMBB();
710  MachineFunction *MF = &B.getMF();
711 
712  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
713  const unsigned WaveAndOpc = Subtarget.isWave32() ?
714  AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
715  const unsigned MovExecOpc =
716  Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
717  const unsigned MovExecTermOpc =
718  Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
719 
720  const unsigned XorTermOpc = Subtarget.isWave32() ?
721  AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
722  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
723  AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
724  const unsigned ExecReg = Subtarget.isWave32() ?
725  AMDGPU::EXEC_LO : AMDGPU::EXEC;
726 
727 #ifndef NDEBUG
728  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
729 #endif
730 
731  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
732  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
733 
734  // Don't bother using generic instructions/registers for the exec mask.
735  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
736  .addDef(InitSaveExecReg);
737 
738  Register PhiExec = MRI.createVirtualRegister(WaveRC);
739  Register NewExec = MRI.createVirtualRegister(WaveRC);
740 
741  // To insert the loop we need to split the block. Move everything before this
742  // point to a new block, and insert a new empty block before this instruction.
743  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
744  MachineBasicBlock *BodyBB = MF->CreateMachineBasicBlock();
745  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
746  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
747  MachineFunction::iterator MBBI(MBB);
748  ++MBBI;
749  MF->insert(MBBI, LoopBB);
750  MF->insert(MBBI, BodyBB);
751  MF->insert(MBBI, RestoreExecBB);
752  MF->insert(MBBI, RemainderBB);
753 
754  LoopBB->addSuccessor(BodyBB);
755  BodyBB->addSuccessor(RestoreExecBB);
756  BodyBB->addSuccessor(LoopBB);
757 
758  // Move the rest of the block into a new block.
759  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
760  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
761 
762  MBB.addSuccessor(LoopBB);
763  RestoreExecBB->addSuccessor(RemainderBB);
764 
765  B.setInsertPt(*LoopBB, LoopBB->end());
766 
767  B.buildInstr(TargetOpcode::PHI)
768  .addDef(PhiExec)
769  .addReg(InitSaveExecReg)
770  .addMBB(&MBB)
771  .addReg(NewExec)
772  .addMBB(BodyBB);
773 
774  const DebugLoc &DL = B.getDL();
775 
776  MachineInstr &FirstInst = *Range.begin();
777 
778  // Move the instruction into the loop body. Note we moved everything after
779  // Range.end() already into a new block, so Range.end() is no longer valid.
780  BodyBB->splice(BodyBB->end(), &MBB, Range.begin(), MBB.end());
781 
782  // Figure out the iterator range after splicing the instructions.
783  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
784  auto NewEnd = BodyBB->end();
785 
786  MachineBasicBlock::iterator I = LoopBB->end();
787  B.setMBB(*LoopBB);
788 
789  Register CondReg;
790 
791  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
792 
793  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
794  for (MachineOperand &Op : MI.uses()) {
795  if (!Op.isReg() || Op.isDef())
796  continue;
797 
798  Register OldReg = Op.getReg();
799  if (!SGPROperandRegs.count(OldReg))
800  continue;
801 
802  // See if we already processed this register in another instruction in the
803  // sequence.
804  auto OldVal = WaterfalledRegMap.find(OldReg);
805  if (OldVal != WaterfalledRegMap.end()) {
806  Op.setReg(OldVal->second);
807  continue;
808  }
809 
810  Register OpReg = Op.getReg();
811  LLT OpTy = MRI.getType(OpReg);
812 
813  const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
814  if (OpBank != &AMDGPU::VGPRRegBank) {
815  // Insert copy from AGPR to VGPR before the loop.
816  B.setMBB(MBB);
817  OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
818  MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
819  B.setMBB(*LoopBB);
820  }
821 
822  unsigned OpSize = OpTy.getSizeInBits();
823 
824  // Can only do a readlane of 32-bit pieces.
825  if (OpSize == 32) {
826  // Avoid extra copies in the simple case of one 32-bit register.
827  Register CurrentLaneOpReg
828  = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
829  MRI.setType(CurrentLaneOpReg, OpTy);
830 
831  constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
832  // Read the next variant <- also loop target.
833  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
834  CurrentLaneOpReg)
835  .addReg(OpReg);
836 
837  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
838  bool First = CondReg == AMDGPU::NoRegister;
839  if (First)
840  CondReg = NewCondReg;
841 
842  // Compare the just read M0 value to all possible Idx values.
843  B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
844  .addDef(NewCondReg)
845  .addReg(CurrentLaneOpReg)
846  .addReg(OpReg);
847  Op.setReg(CurrentLaneOpReg);
848 
849  if (!First) {
850  Register AndReg = MRI.createVirtualRegister(WaveRC);
851 
852  // If there are multiple operands to consider, AND the conditions together.
853  B.buildInstr(WaveAndOpc)
854  .addDef(AndReg)
855  .addReg(NewCondReg)
856  .addReg(CondReg);
857  CondReg = AndReg;
858  }
859  } else {
860  LLT S32 = LLT::scalar(32);
861  SmallVector<Register, 8> ReadlanePieces;
862 
863  // The compares can be done as 64-bit, but the extract needs to be done
864  // in 32-bit pieces.
865 
866  bool Is64 = OpSize % 64 == 0;
867 
868  unsigned UnmergeTySize = Is64 ? 64 : 32;
869  unsigned CmpOp =
870  Is64 ? AMDGPU::V_CMP_EQ_U64_e64 : AMDGPU::V_CMP_EQ_U32_e64;
871 
872  // Insert the unmerge before the loop.
873 
874  B.setMBB(MBB);
875  unsigned NumPieces = OpSize / UnmergeTySize;
876  SmallVector<Register, 8> UnmergePieces;
877  if (NumPieces == 1) {
878  UnmergePieces.push_back(OpReg);
879  } else {
880  LLT UnmergeTy = LLT::scalar(UnmergeTySize);
881  MachineInstrBuilder Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
882  for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx)
883  UnmergePieces.push_back(Unmerge.getReg(PieceIdx));
884  }
885  B.setMBB(*LoopBB);
886 
887  for (Register UnmergePiece : UnmergePieces) {
888  Register CurrentLaneOpReg;
889  if (Is64) {
890  Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
891  Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
892 
893  MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
894  MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
895  MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
896 
897  // Read the next variant <- also loop target.
898  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
899  CurrentLaneOpRegLo)
900  .addReg(UnmergePiece, 0, AMDGPU::sub0);
901 
902  // Read the next variant <- also loop target.
903  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
904  CurrentLaneOpRegHi)
905  .addReg(UnmergePiece, 0, AMDGPU::sub1);
906 
907  CurrentLaneOpReg =
908  B.buildMerge(LLT::scalar(64),
909  {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
910  .getReg(0);
911 
912  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
913 
914  if (OpTy.getScalarSizeInBits() == 64) {
915  // If we need to produce a 64-bit element vector, use the
916  // merged pieces.
917  ReadlanePieces.push_back(CurrentLaneOpReg);
918  } else {
919  // 32-bit element type.
920  ReadlanePieces.push_back(CurrentLaneOpRegLo);
921  ReadlanePieces.push_back(CurrentLaneOpRegHi);
922  }
923  } else {
924  CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
925  MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
926  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
927 
928  // Read the next variant <- also loop target.
929  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930  CurrentLaneOpReg)
931  .addReg(UnmergePiece);
932  ReadlanePieces.push_back(CurrentLaneOpReg);
933  }
934 
935  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
936  bool First = CondReg == AMDGPU::NoRegister;
937  if (First)
938  CondReg = NewCondReg;
939 
940  B.buildInstr(CmpOp)
941  .addDef(NewCondReg)
942  .addReg(CurrentLaneOpReg)
943  .addReg(UnmergePiece);
944 
945  if (!First) {
946  Register AndReg = MRI.createVirtualRegister(WaveRC);
947 
948  // If there are multiple operands to consider, AND the conditions together.
949  B.buildInstr(WaveAndOpc)
950  .addDef(AndReg)
951  .addReg(NewCondReg)
952  .addReg(CondReg);
953  CondReg = AndReg;
954  }
955  }
956 
957  // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
958  // BUILD_VECTOR
959  if (OpTy.isVector()) {
960  auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
961  Op.setReg(Merge.getReg(0));
962  MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
963  } else if (ReadlanePieces.size() > 1) {
964  auto Merge = B.buildMerge(OpTy, ReadlanePieces);
965  Op.setReg(Merge.getReg(0));
966  MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
967  } else {
968  Op.setReg(ReadlanePieces[0]);
969  }
970  }
971 
972  // Make sure we don't re-process this register again.
973  WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
974  }
975  }
976 
977  // Update EXEC, save the original EXEC value to VCC.
978  B.buildInstr(AndSaveExecOpc)
979  .addDef(NewExec)
980  .addReg(CondReg, RegState::Kill);
981 
982  MRI.setSimpleHint(NewExec, CondReg);
983 
984  B.setInsertPt(*BodyBB, BodyBB->end());
985 
986  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
987  B.buildInstr(XorTermOpc)
988  .addDef(ExecReg)
989  .addReg(ExecReg)
990  .addReg(NewExec);
991 
992  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
993  // s_cbranch_scc0?
994 
995  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
996  B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
997 
998  // Save the EXEC mask before the loop.
999  BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
1000  .addReg(ExecReg);
1001 
1002  // Restore the EXEC mask after the loop.
1003  B.setMBB(*RestoreExecBB);
1004  B.buildInstr(MovExecTermOpc)
1005  .addDef(ExecReg)
1006  .addReg(SaveExecReg);
1007 
1008  // Set the insert point after the original instruction, so any new
1009  // instructions will be in the remainder.
1010  B.setInsertPt(*RemainderBB, RemainderBB->begin());
1011 
1012  return true;
1013 }
1014 
1015 // Return any unique registers used by \p MI at \p OpIndices that need to be
1016 // handled in a waterfall loop. Returns these registers in \p
1017 // SGPROperandRegs. Returns true if there are any operands to handle and a
1018 // waterfall loop is necessary.
1019 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1020  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1021  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1022  for (unsigned Op : OpIndices) {
1023  assert(MI.getOperand(Op).isUse());
1024  Register Reg = MI.getOperand(Op).getReg();
1025  const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1026  if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1027  SGPROperandRegs.insert(Reg);
1028  }
1029 
1030  // No operands need to be replaced, so no need to loop.
1031  return !SGPROperandRegs.empty();
1032 }
1033 
1036  ArrayRef<unsigned> OpIndices) const {
1037  // Use a set to avoid extra readfirstlanes in the case where multiple operands
1038  // are the same register.
1039  SmallSet<Register, 4> SGPROperandRegs;
1040 
1041  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1042  return false;
1043 
1044  MachineBasicBlock::iterator I = MI.getIterator();
1045  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1046  SGPROperandRegs, MRI);
1047 }
1048 
1051  ArrayRef<unsigned> OpIndices) const {
1053  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1054 }
1055 
1056 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1057 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1058  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1059  Register Reg = MI.getOperand(OpIdx).getReg();
1060  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1061  if (Bank == &AMDGPU::SGPRRegBank)
1062  return;
1063 
1064  LLT Ty = MRI.getType(Reg);
1065  MachineIRBuilder B(MI);
1066 
1067  if (Bank != &AMDGPU::VGPRRegBank) {
1068  // We need to copy from AGPR to VGPR
1069  Reg = B.buildCopy(Ty, Reg).getReg(0);
1070  MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1071  }
1072 
1073  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1074  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1075  .addDef(SGPR)
1076  .addReg(Reg);
1077 
1078  MRI.setType(SGPR, Ty);
1079 
1080  const TargetRegisterClass *Constrained =
1081  constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1082  (void)Constrained;
1083  assert(Constrained && "Failed to constrain readfirstlane src reg");
1084 
1085  MI.getOperand(OpIdx).setReg(SGPR);
1086 }
1087 
1088 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1089 /// rest will be in the remainder.
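/// For example (values chosen purely for illustration), splitting s96 with
/// \p FirstSize = 64 yields {s64, s32}, and splitting <3 x s32> the same way
/// yields {<2 x s32>, s32}.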
1090 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1091  unsigned TotalSize = Ty.getSizeInBits();
1092  if (!Ty.isVector())
1093  return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1094 
1095  LLT EltTy = Ty.getElementType();
1096  unsigned EltSize = EltTy.getSizeInBits();
1097  assert(FirstSize % EltSize == 0);
1098 
1099  unsigned FirstPartNumElts = FirstSize / EltSize;
1100  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1101 
1102  return {LLT::scalarOrVector(ElementCount::getFixed(FirstPartNumElts), EltTy),
1103  LLT::scalarOrVector(ElementCount::getFixed(RemainderElts), EltTy)};
1104 }
1105 
1106 static LLT widen96To128(LLT Ty) {
1107  if (!Ty.isVector())
1108  return LLT::scalar(128);
1109 
1110  LLT EltTy = Ty.getElementType();
1111  assert(128 % EltTy.getSizeInBits() == 0);
1112  return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
1113 }
1114 
1116  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1117  MachineRegisterInfo &MRI) const {
1118  Register DstReg = MI.getOperand(0).getReg();
1119  const LLT LoadTy = MRI.getType(DstReg);
1120  unsigned LoadSize = LoadTy.getSizeInBits();
1121  const unsigned MaxNonSmrdLoadSize = 128;
1122 
1123  const RegisterBank *DstBank =
1124  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1125  if (DstBank == &AMDGPU::SGPRRegBank) {
1126  // There are some special cases that we need to look at for 32-bit and 96-bit
1127  // SGPR loads; otherwise we have nothing to do.
1128  if (LoadSize != 32 && LoadSize != 96)
1129  return false;
1130 
1131  MachineMemOperand *MMO = *MI.memoperands_begin();
1132  const unsigned MemSize = 8 * MMO->getSize();
1133  // Scalar loads of size 8 or 16 bit with proper alignment may be widened to
1134  // 32 bit. Check to see if we need to widen the memory access: 8- or 16-bit
1135  // scalar loads should have a load size of 32 but a memory access size of
1136  // less than 32.
1137  if (LoadSize == 32 &&
1138  (MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
1139  return false;
1140 
1141  Register PtrReg = MI.getOperand(1).getReg();
1142 
1143  ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1144  MachineIRBuilder B(MI, O);
1145 
1146  if (LoadSize == 32) {
1147  // This is an extending load from a sub-dword size. Widen the memory
1148  // access size to 4 bytes and clear the extra high bits appropriately
1149  const LLT S32 = LLT::scalar(32);
1150  if (MI.getOpcode() == AMDGPU::G_SEXTLOAD) {
1151  // Must extend the sign bit into higher bits for a G_SEXTLOAD
1152  auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1153  B.buildSExtInReg(MI.getOperand(0), WideLoad, MemSize);
1154  } else if (MI.getOpcode() == AMDGPU::G_ZEXTLOAD) {
1155  // Must extend zero into higher bits with an AND for a G_ZEXTLOAD
1156  auto WideLoad = B.buildLoadFromOffset(S32, PtrReg, *MMO, 0);
1157  B.buildZExtInReg(MI.getOperand(0), WideLoad, MemSize);
1158  } else
1159  // We do not need to touch the higher bits for regular loads.
1160  B.buildLoadFromOffset(MI.getOperand(0), PtrReg, *MMO, 0);
1161  } else {
1162  // 96-bit loads are only available for vector loads. We need to split this
1163  // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1164  if (MMO->getAlign() < Align(16)) {
1165  MachineFunction *MF = MI.getParent()->getParent();
1166  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
1167  MachineIRBuilder B(MI, ApplyBank);
1168  LegalizerHelper Helper(*MF, ApplyBank, B);
1169  LLT Part64, Part32;
1170  std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1171  if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
1172  LegalizerHelper::Legalized)
1173  return false;
1174  return true;
1175  } else {
1176  LLT WiderTy = widen96To128(LoadTy);
1177  auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1178  if (WiderTy.isScalar())
1179  B.buildTrunc(MI.getOperand(0), WideLoad);
1180  else {
1181  B.buildDeleteTrailingVectorElements(MI.getOperand(0).getReg(),
1182  WideLoad);
1183  }
1184  }
1185  }
1186 
1187  MI.eraseFromParent();
1188  return true;
1189  }
1190 
1191  // 128-bit loads are supported for all instruction types.
1192  if (LoadSize <= MaxNonSmrdLoadSize)
1193  return false;
1194 
1195  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1196  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1197 
1198  if (SrcRegs.empty())
1199  SrcRegs.push_back(MI.getOperand(1).getReg());
1200 
1201  assert(LoadSize % MaxNonSmrdLoadSize == 0);
1202 
1203  // RegBankSelect only emits scalar types, so we need to reset the pointer
1204  // operand to a pointer type.
1205  Register BasePtrReg = SrcRegs[0];
1206  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1207  MRI.setType(BasePtrReg, PtrTy);
1208 
1209  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1210  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1211  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1212  MachineIRBuilder B(MI, Observer);
1213  LegalizerHelper Helper(B.getMF(), Observer, B);
1214 
1215  if (LoadTy.isVector()) {
1216  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1217  return false;
1218  } else {
1219  if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1220  return false;
1221  }
1222 
1223  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1224  return true;
1225 }
1226 
1227 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1228  MachineInstr &MI,
1229  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1230  MachineRegisterInfo &MRI) const {
1231  const MachineFunction &MF = *MI.getMF();
1232  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1233  const auto &TFI = *ST.getFrameLowering();
1234 
1235  // Guard in case the stack growth direction ever changes with scratch
1236  // instructions.
1237  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1238  return false;
1239 
1240  Register Dst = MI.getOperand(0).getReg();
1241  Register AllocSize = MI.getOperand(1).getReg();
1242  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1243 
1244  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1245 
1246  // TODO: Need to emit a wave reduction to get the maximum size.
1247  if (SizeBank != &AMDGPU::SGPRRegBank)
1248  return false;
1249 
1250  LLT PtrTy = MRI.getType(Dst);
1251  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1252 
1253  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1254  Register SPReg = Info->getStackPtrOffsetReg();
1255  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1256  MachineIRBuilder B(MI, ApplyBank);
1257 
1258  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1259  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1260 
1261  auto SPCopy = B.buildCopy(PtrTy, SPReg);
1262  if (Alignment > TFI.getStackAlign()) {
1263  auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1264  B.buildMaskLowPtrBits(Dst, PtrAdd,
1265  Log2(Alignment) + ST.getWavefrontSizeLog2());
1266  } else {
1267  B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1268  }
1269 
1270  MI.eraseFromParent();
1271  return true;
1272 }
1273 
1276  MachineRegisterInfo &MRI, int RsrcIdx) const {
1277  const int NumDefs = MI.getNumExplicitDefs();
1278 
1279  // The reported argument index is relative to the IR intrinsic call arguments,
1280  // so we need to shift by the number of defs and the intrinsic ID.
1281  RsrcIdx += NumDefs + 1;
1282 
1283  // Insert copies to VGPR arguments.
1284  applyDefaultMapping(OpdMapper);
1285 
1286  // Fixup any SGPR arguments.
1287  SmallVector<unsigned, 4> SGPRIndexes;
1288  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1289  if (!MI.getOperand(I).isReg())
1290  continue;
1291 
1292  // If this intrinsic has a sampler, it immediately follows rsrc.
1293  if (I == RsrcIdx || I == RsrcIdx + 1)
1294  SGPRIndexes.push_back(I);
1295  }
1296 
1297  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1298  return true;
1299 }
1300 
1301 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1302  Register Reg) {
1303  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1304  if (!Def)
1305  return Reg;
1306 
1307  // TODO: Guard against this being an implicit def
1308  return Def->getOperand(0).getReg();
1309 }
1310 
1311 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1312 // the three offsets (voffset, soffset and instoffset)
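// As an illustrative example (hypothetical, uniform input): a combined offset
// known to be the constant 4100 could be split as SOffsetReg = 4096,
// InstOffsetVal = 4 and VOffsetReg = 0, so the immediate part fits the
// instruction's offset field.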
1313 static unsigned setBufferOffsets(MachineIRBuilder &B,
1314  const AMDGPURegisterBankInfo &RBI,
1315  Register CombinedOffset, Register &VOffsetReg,
1316  Register &SOffsetReg, int64_t &InstOffsetVal,
1317  Align Alignment) {
1318  const LLT S32 = LLT::scalar(32);
1319  MachineRegisterInfo *MRI = B.getMRI();
1320 
1321  if (Optional<int64_t> Imm = getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
1322  uint32_t SOffset, ImmOffset;
1323  if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1324  Alignment)) {
1325  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1326  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1327  InstOffsetVal = ImmOffset;
1328 
1329  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1330  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1331  return SOffset + ImmOffset;
1332  }
1333  }
1334 
1335  Register Base;
1336  unsigned Offset;
1337 
1338  std::tie(Base, Offset) =
1339  AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1340 
1341  uint32_t SOffset, ImmOffset;
1342  if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1343  &RBI.Subtarget, Alignment)) {
1344  if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1345  VOffsetReg = Base;
1346  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1347  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1348  InstOffsetVal = ImmOffset;
1349  return 0; // XXX - Why is this 0?
1350  }
1351 
1352  // If we have SGPR base, we can use it for soffset.
1353  if (SOffset == 0) {
1354  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1355  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1356  SOffsetReg = Base;
1357  InstOffsetVal = ImmOffset;
1358  return 0; // XXX - Why is this 0?
1359  }
1360  }
1361 
1362  // Handle the variable sgpr + vgpr case.
1363  MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
1364  if (Add && (int)Offset >= 0) {
1365  Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1366  Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1367 
1368  const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1369  const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1370 
1371  if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1372  VOffsetReg = Src0;
1373  SOffsetReg = Src1;
1374  return 0;
1375  }
1376 
1377  if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1378  VOffsetReg = Src1;
1379  SOffsetReg = Src0;
1380  return 0;
1381  }
1382  }
1383 
1384  // Ensure we have a VGPR for the combined offset. This could be an issue if we
1385  // have an SGPR offset and a VGPR resource.
1386  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1387  VOffsetReg = CombinedOffset;
1388  } else {
1389  VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1390  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1391  }
1392 
1393  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1394  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1395  return 0;
1396 }
1397 
1399  const OperandsMapper &OpdMapper) const {
1400  MachineInstr &MI = OpdMapper.getMI();
1401  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1402 
1403  const LLT S32 = LLT::scalar(32);
1404  Register Dst = MI.getOperand(0).getReg();
1405  LLT Ty = MRI.getType(Dst);
1406 
1407  const RegisterBank *RSrcBank =
1408  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1409  const RegisterBank *OffsetBank =
1410  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1411  if (RSrcBank == &AMDGPU::SGPRRegBank &&
1412  OffsetBank == &AMDGPU::SGPRRegBank)
1413  return true; // Legal mapping
1414 
1415  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1416  // here but don't have an MMO.
1417 
1418  unsigned LoadSize = Ty.getSizeInBits();
1419  int NumLoads = 1;
1420  if (LoadSize == 256 || LoadSize == 512) {
1421  NumLoads = LoadSize / 128;
1422  Ty = Ty.divide(NumLoads);
1423  }
1424 
1425  // Use the alignment to ensure that the required offsets will fit into the
1426  // immediate offsets.
1427  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1428  MachineIRBuilder B(MI);
1428 
1430  MachineFunction &MF = B.getMF();
1431 
1432  Register SOffset;
1433  Register VOffset;
1434  int64_t ImmOffset = 0;
1435 
1436  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1437  VOffset, SOffset, ImmOffset, Alignment);
1438 
1439  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1440  // can, but we need to track an MMO for that.
1441  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1442  const Align MemAlign(4); // FIXME: ABI type alignment?
1447  MemSize, MemAlign);
1448  if (MMOOffset != 0)
1449  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1450 
1451  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1452  // assume that the buffer is unswizzled.
1453 
1454  Register RSrc = MI.getOperand(1).getReg();
1455  Register VIndex = B.buildConstant(S32, 0).getReg(0);
1456  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1457 
1458  SmallVector<Register, 4> LoadParts(NumLoads);
1459 
1460  MachineBasicBlock::iterator MII = MI.getIterator();
1461  MachineInstrSpan Span(MII, &B.getMBB());
1462 
1463  for (int i = 0; i < NumLoads; ++i) {
1464  if (NumLoads == 1) {
1465  LoadParts[i] = Dst;
1466  } else {
1467  LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1468  MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1469  }
1470 
1471  MachineMemOperand *MMO = BaseMMO;
1472  if (i != 0)
1473  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1474 
1475  B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1476  .addDef(LoadParts[i]) // vdata
1477  .addUse(RSrc) // rsrc
1478  .addUse(VIndex) // vindex
1479  .addUse(VOffset) // voffset
1480  .addUse(SOffset) // soffset
1481  .addImm(ImmOffset + 16 * i) // offset(imm)
1482  .addImm(0) // cachepolicy, swizzled buffer(imm)
1483  .addImm(0) // idxen(imm)
1484  .addMemOperand(MMO);
1485  }
1486 
1487  // TODO: If only the resource is a VGPR, it may be better to execute the
1488  // scalar load in the waterfall loop if the resource is expected to frequently
1489  // be dynamically uniform.
1490  if (RSrcBank != &AMDGPU::SGPRRegBank) {
1491  // Remove the original instruction to avoid potentially confusing the
1492  // waterfall loop logic.
1493  B.setInstr(*Span.begin());
1494  MI.eraseFromParent();
1495 
1496  SmallSet<Register, 4> OpsToWaterfall;
1497 
1498  OpsToWaterfall.insert(RSrc);
1499  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1500  OpsToWaterfall, MRI);
1501  }
1502 
1503  if (NumLoads != 1) {
1504  if (Ty.isVector())
1505  B.buildConcatVectors(Dst, LoadParts);
1506  else
1507  B.buildMerge(Dst, LoadParts);
1508  }
1509 
1510  // We removed the instruction earlier with a waterfall loop.
1511  if (RSrcBank == &AMDGPU::SGPRRegBank)
1512  MI.eraseFromParent();
1513 
1514  return true;
1515 }
1516 
1517 bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
1518  bool Signed) const {
1519  MachineInstr &MI = OpdMapper.getMI();
1520  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1521 
1522  // Insert basic copies
1523  applyDefaultMapping(OpdMapper);
1524 
1525  Register DstReg = MI.getOperand(0).getReg();
1526  LLT Ty = MRI.getType(DstReg);
1527 
1528  const LLT S32 = LLT::scalar(32);
1529 
1530  unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
1531  Register SrcReg = MI.getOperand(FirstOpnd).getReg();
1532  Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
1533  Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
1534 
1535  const RegisterBank *DstBank =
1536  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1537  if (DstBank == &AMDGPU::VGPRRegBank) {
1538  if (Ty == S32)
1539  return true;
1540 
1541  // There is no 64-bit vgpr bitfield extract instructions so the operation
1542  // is expanded to a sequence of instructions that implement the operation.
1543  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
1544  MachineIRBuilder B(MI, ApplyBank);
1545 
1546  const LLT S64 = LLT::scalar(64);
1547  // Shift the source operand so that extracted bits start at bit 0.
1548  auto ShiftOffset = Signed ? B.buildAShr(S64, SrcReg, OffsetReg)
1549  : B.buildLShr(S64, SrcReg, OffsetReg);
1550  auto UnmergeSOffset = B.buildUnmerge({S32, S32}, ShiftOffset);
1551 
1552  // A 64-bit bitfield extract uses the 32-bit bitfield extract instructions
1553  // if the width is a constant.
1554  if (auto ConstWidth = getIConstantVRegValWithLookThrough(WidthReg, MRI)) {
1555  // Use the 32-bit bitfield extract instruction if the width is a constant.
1556  // Depending on the width size, use either the low or high 32-bits.
1557  auto Zero = B.buildConstant(S32, 0);
1558  auto WidthImm = ConstWidth->Value.getZExtValue();
1559  if (WidthImm <= 32) {
1560  // Use bitfield extract on the lower 32-bit source, and then sign-extend
1561  // or clear the upper 32-bits.
1562  auto Extract =
1563  Signed ? B.buildSbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg)
1564  : B.buildUbfx(S32, UnmergeSOffset.getReg(0), Zero, WidthReg);
1565  auto Extend =
1566  Signed ? B.buildAShr(S32, Extract, B.buildConstant(S32, 31)) : Zero;
1567  B.buildMerge(DstReg, {Extract, Extend});
1568  } else {
1569  // Use bitfield extract on upper 32-bit source, and combine with lower
1570  // 32-bit source.
1571  auto UpperWidth = B.buildConstant(S32, WidthImm - 32);
1572  auto Extract =
1573  Signed
1574  ? B.buildSbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth)
1575  : B.buildUbfx(S32, UnmergeSOffset.getReg(1), Zero, UpperWidth);
1576  B.buildMerge(DstReg, {UnmergeSOffset.getReg(0), Extract});
1577  }
1578  MI.eraseFromParent();
1579  return true;
1580  }
1581 
1582  // Expand to Src >> Offset << (64 - Width) >> (64 - Width) using 64-bit
1583  // operations.
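 // e.g. with Offset = 8 and Width = 16, ExtShift is 48: the shift left moves
 // the extracted field into bits [63:48] and the final shift right extends it.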
1584  auto ExtShift = B.buildSub(S32, B.buildConstant(S32, 64), WidthReg);
1585  auto SignBit = B.buildShl(S64, ShiftOffset, ExtShift);
1586  if (Signed)
1587  B.buildAShr(S64, SignBit, ExtShift);
1588  else
1589  B.buildLShr(S64, SignBit, ExtShift);
1590  MI.eraseFromParent();
1591  return true;
1592  }
1593 
1594  // The scalar form packs the offset and width in a single operand.
1595 
1596  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1597  MachineIRBuilder B(MI, ApplyBank);
1598 
1599  // Ensure the high bits are clear to insert the offset.
1600  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1601  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1602 
1603  // Zeros out the low bits, so don't bother clamping the input value.
1604  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1605 
1606  // Pack the offset and width of a BFE into the format expected by
1607  // S_BFE_I32 / S_BFE_U32: in the second source operand, bits [5:0] contain
1608  // the offset and bits [22:16] the width.
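 // For example, Offset = 5 and Width = 8 pack to (5 & 0x3f) | (8 << 16) =
 // 0x00080005.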
1609  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1610 
1611  // TODO: It might be worth using a pseudo here to avoid scc clobber and
1612  // register class constraints.
1613  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1614  (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1615 
1616  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1617  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1618  llvm_unreachable("failed to constrain BFE");
1619 
1620  MI.eraseFromParent();
1621  return true;
1622 }
1623 
1624 // Return a suitable opcode for extending the operands of Opc when widening.
1625 static unsigned getExtendOp(unsigned Opc) {
1626  switch (Opc) {
1627  case TargetOpcode::G_ASHR:
1628  case TargetOpcode::G_SMIN:
1629  case TargetOpcode::G_SMAX:
1630  return TargetOpcode::G_SEXT;
1631  case TargetOpcode::G_LSHR:
1632  case TargetOpcode::G_UMIN:
1633  case TargetOpcode::G_UMAX:
1634  return TargetOpcode::G_ZEXT;
1635  default:
1636  return TargetOpcode::G_ANYEXT;
1637  }
1638 }
1639 
1640 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1641 // any illegal vector extend or unmerge operations.
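// The low component is taken from the low 16 bits of a 32-bit bitcast (sign- or
// zero-extended as requested); the high component is produced with a 16-bit
// right shift of the same bitcast.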
1642 static std::pair<Register, Register>
1643 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1644  const LLT S32 = LLT::scalar(32);
1645  auto Bitcast = B.buildBitcast(S32, Src);
1646 
1647  if (ExtOpcode == TargetOpcode::G_SEXT) {
1648  auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1649  auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1650  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1651  }
1652 
1653  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1654  if (ExtOpcode == TargetOpcode::G_ZEXT) {
1655  auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1656  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1657  }
1658 
1659  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1660  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1661 }
1662 
1663 // For cases where only a single copy is inserted for matching register banks,
1664 // replace the register in the instruction operand.
1665 static bool substituteSimpleCopyRegs(
1666  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1667  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1668  if (!SrcReg.empty()) {
1669  assert(SrcReg.size() == 1);
1670  OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1671  return true;
1672  }
1673 
1674  return false;
1675 }
1676 
1677 /// Handle register layout difference for f16 images for some subtargets.
1678 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1679  MachineRegisterInfo &MRI,
1680  Register Reg) const {
1681  if (!Subtarget.hasUnpackedD16VMem())
1682  return Reg;
1683 
1684  const LLT S16 = LLT::scalar(16);
1685  LLT StoreVT = MRI.getType(Reg);
1686  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1687  return Reg;
1688 
1689  auto Unmerge = B.buildUnmerge(S16, Reg);
1690 
1691 
1692  SmallVector<Register, 4> WideRegs;
1693  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1694  WideRegs.push_back(Unmerge.getReg(I));
1695 
1696  const LLT S32 = LLT::scalar(32);
1697  int NumElts = StoreVT.getNumElements();
1698 
1699  return B.buildMerge(LLT::fixed_vector(NumElts, S32), WideRegs).getReg(0);
1700 }
1701 
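// Match a constant, or a constant added to a base register. Returns the base
// (a null register for a plain constant) and the constant offset.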
1702 static std::pair<Register, unsigned>
1703 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1704  int64_t Const;
1705  if (mi_match(Reg, MRI, m_ICst(Const)))
1706  return std::make_pair(Register(), Const);
1707 
1708  Register Base;
1709  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1710  return std::make_pair(Base, Const);
1711 
1712  // TODO: Handle G_OR used for add case
1713  return std::make_pair(Reg, 0);
1714 }
1715 
1716 std::pair<Register, unsigned>
1717 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1718  Register OrigOffset) const {
1719  const unsigned MaxImm = 4095;
1720  Register BaseReg;
1721  unsigned ImmOffset;
1722  const LLT S32 = LLT::scalar(32);
1723 
1724  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1725  OrigOffset);
1726 
1727  unsigned C1 = 0;
1728  if (ImmOffset != 0) {
1729  // If the immediate value is too big for the immoffset field, put the value
1730  // and -4096 into the immoffset field so that the value that is copied/added
1731  // for the voffset field is a multiple of 4096, and it stands more chance
1732  // of being CSEd with the copy/add for another similar load/store.
1733  // However, do not do that rounding down to a multiple of 4096 if that is a
1734  // negative number, as it appears to be illegal to have a negative offset
1735  // in the vgpr, even if adding the immediate offset makes it positive.
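 // For example, an original offset of 4100 is split into ImmOffset = 4 with an
 // Overflow of 4096 added to the base register.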
1736  unsigned Overflow = ImmOffset & ~MaxImm;
1737  ImmOffset -= Overflow;
1738  if ((int32_t)Overflow < 0) {
1739  Overflow += ImmOffset;
1740  ImmOffset = 0;
1741  }
1742 
1743  C1 = ImmOffset;
1744  if (Overflow != 0) {
1745  if (!BaseReg)
1746  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1747  else {
1748  auto OverflowVal = B.buildConstant(S32, Overflow);
1749  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1750  }
1751  }
1752  }
1753 
1754  if (!BaseReg)
1755  BaseReg = B.buildConstant(S32, 0).getReg(0);
1756 
1757  return {BaseReg, C1};
1758 }
1759 
1760 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1761  Register SrcReg) const {
1762  MachineRegisterInfo &MRI = *B.getMRI();
1763  LLT SrcTy = MRI.getType(SrcReg);
1764  if (SrcTy.getSizeInBits() == 32) {
1765  // Use a v_mov_b32 here to make the exec dependency explicit.
1766  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1767  .addDef(DstReg)
1768  .addUse(SrcReg);
1769  return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1770  constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1771  }
1772 
1773  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1774  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1775 
1776  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1777  .addDef(TmpReg0)
1778  .addUse(SrcReg, 0, AMDGPU::sub0);
1779  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1780  .addDef(TmpReg1)
1781  .addUse(SrcReg, 0, AMDGPU::sub1);
1782  B.buildInstr(AMDGPU::REG_SEQUENCE)
1783  .addDef(DstReg)
1784  .addUse(TmpReg0)
1785  .addImm(AMDGPU::sub0)
1786  .addUse(TmpReg1)
1787  .addImm(AMDGPU::sub1);
1788 
1789  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1790  constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1791 }
1792 
1793 /// Utility function for pushing dynamic vector indexes with a constant offset
1794 /// into waterfall loops.
1795 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1796  MachineInstr &IdxUseInstr,
1797  unsigned OpIdx,
1798  unsigned ConstOffset) {
1799  MachineRegisterInfo &MRI = *B.getMRI();
1800  const LLT S32 = LLT::scalar(32);
1801  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1802  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1803 
1804  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1805 
1806  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1807  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1808  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1809  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1810 }
1811 
1812 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1813 /// original 32-bit source value (to be inserted in the low part of the combined
1814 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1815 /// value.
1816 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1817  Register Hi32Reg, Register Lo32Reg,
1818  unsigned ExtOpc,
1819  const RegisterBank &RegBank,
1820  bool IsBooleanSrc = false) {
1821  if (ExtOpc == AMDGPU::G_ZEXT) {
1822  B.buildConstant(Hi32Reg, 0);
1823  } else if (ExtOpc == AMDGPU::G_SEXT) {
1824  if (IsBooleanSrc) {
1825  // If we know the original source was an s1, the high half is the same as
1826  // the low.
1827  B.buildCopy(Hi32Reg, Lo32Reg);
1828  } else {
1829  // Replicate sign bit from 32-bit extended part.
1830  auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1831  B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1832  B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1833  }
1834  } else {
1835  assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1836  B.buildUndef(Hi32Reg);
1837  }
1838 }
1839 
1840 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1841  MachineInstr &MI, MachineRegisterInfo &MRI,
1842  const OperandsMapper &OpdMapper) const {
1843 
1844  Register VecReg = MI.getOperand(1).getReg();
1845  Register Idx = MI.getOperand(2).getReg();
1846 
1847  const RegisterBank &IdxBank =
1848  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1849 
1850  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1851 
1852  LLT VecTy = MRI.getType(VecReg);
1853  unsigned EltSize = VecTy.getScalarSizeInBits();
1854  unsigned NumElem = VecTy.getNumElements();
1855 
1856  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1857  IsDivergentIdx))
1858  return false;
1859 
1860  MachineIRBuilder B(MI);
1861  LLT S32 = LLT::scalar(32);
1862 
1863  const RegisterBank &DstBank =
1864  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1865  const RegisterBank &SrcBank =
1866  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1867 
1868  const RegisterBank &CCBank =
1869  (DstBank == AMDGPU::SGPRRegBank &&
1870  SrcBank == AMDGPU::SGPRRegBank &&
1871  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1872  : AMDGPU::VCCRegBank;
1873  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1874 
1875  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1876  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1877  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1878  }
1879 
1880  LLT EltTy = VecTy.getScalarType();
1881  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1882  unsigned NumLanes = DstRegs.size();
1883  if (!NumLanes)
1884  NumLanes = 1;
1885  else
1886  EltTy = MRI.getType(DstRegs[0]);
1887 
1888  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1889  SmallVector<Register, 2> Res(NumLanes);
1890  for (unsigned L = 0; L < NumLanes; ++L)
1891  Res[L] = UnmergeToEltTy.getReg(L);
1892 
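 // Lower the dynamic extract to a chain of compares and selects: start with
 // element 0, then for each remaining element select it when the index matches.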
1893  for (unsigned I = 1; I < NumElem; ++I) {
1894  auto IC = B.buildConstant(S32, I);
1895  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1896  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1897  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1898 
1899  for (unsigned L = 0; L < NumLanes; ++L) {
1900  auto S = B.buildSelect(EltTy, Cmp,
1901  UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1902 
1903  for (unsigned N : { 0, 2, 3 })
1904  MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1905 
1906  Res[L] = S->getOperand(0).getReg();
1907  }
1908  }
1909 
1910  for (unsigned L = 0; L < NumLanes; ++L) {
1911  Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1912  B.buildCopy(DstReg, Res[L]);
1913  MRI.setRegBank(DstReg, DstBank);
1914  }
1915 
1916  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1917  MI.eraseFromParent();
1918 
1919  return true;
1920 }
1921 
1922 // Insert a cross regbank copy for a register if it already has a bank that
1923 // differs from the one we want to set.
1924 static Register constrainRegToBank(MachineRegisterInfo &MRI, MachineIRBuilder &B,
1925  Register &Reg,
1926  const RegisterBank &Bank) {
1927  const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg);
1928  if (CurrBank && *CurrBank != Bank) {
1929  Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0);
1930  MRI.setRegBank(Copy, Bank);
1931  return Copy;
1932  }
1933 
1934  MRI.setRegBank(Reg, Bank);
1935  return Reg;
1936 }
1937 
1938 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1939  MachineInstr &MI, MachineRegisterInfo &MRI,
1940  const OperandsMapper &OpdMapper) const {
1941 
1942  Register VecReg = MI.getOperand(1).getReg();
1943  Register Idx = MI.getOperand(3).getReg();
1944 
1945  const RegisterBank &IdxBank =
1946  *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1947 
1948  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1949 
1950  LLT VecTy = MRI.getType(VecReg);
1951  unsigned EltSize = VecTy.getScalarSizeInBits();
1952  unsigned NumElem = VecTy.getNumElements();
1953 
1954  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1955  IsDivergentIdx))
1956  return false;
1957 
1958  MachineIRBuilder B(MI);
1959  LLT S32 = LLT::scalar(32);
1960 
1961  const RegisterBank &DstBank =
1962  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1963  const RegisterBank &SrcBank =
1964  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1965  const RegisterBank &InsBank =
1966  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1967 
1968  const RegisterBank &CCBank =
1969  (DstBank == AMDGPU::SGPRRegBank &&
1970  SrcBank == AMDGPU::SGPRRegBank &&
1971  InsBank == AMDGPU::SGPRRegBank &&
1972  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1973  : AMDGPU::VCCRegBank;
1974  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1975 
1976  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1977  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1978  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1979  }
1980 
1981  LLT EltTy = VecTy.getScalarType();
1982  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1983  unsigned NumLanes = InsRegs.size();
1984  if (!NumLanes) {
1985  NumLanes = 1;
1986  InsRegs.push_back(MI.getOperand(2).getReg());
1987  } else {
1988  EltTy = MRI.getType(InsRegs[0]);
1989  }
1990 
1991  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1992  SmallVector<Register, 16> Ops(NumElem * NumLanes);
1993 
1994  for (unsigned I = 0; I < NumElem; ++I) {
1995  auto IC = B.buildConstant(S32, I);
1996  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1997  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1998  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1999 
2000  for (unsigned L = 0; L < NumLanes; ++L) {
2001  Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2002  Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2003  Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
2004 
2005  Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2006  MRI.setRegBank(Select, DstBank);
2007 
2008  Ops[I * NumLanes + L] = Select;
2009  }
2010  }
2011 
2012  LLT MergeTy = LLT::fixed_vector(Ops.size(), EltTy);
2013  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2014  B.buildBuildVector(MI.getOperand(0), Ops);
2015  } else {
2016  auto Vec = B.buildBuildVector(MergeTy, Ops);
2017  MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2018  B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2019  }
2020 
2021  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2022  MI.eraseFromParent();
2023 
2024  return true;
2025 }
2026 
2027 void AMDGPURegisterBankInfo::applyMappingImpl(
2028  const OperandsMapper &OpdMapper) const {
2029  MachineInstr &MI = OpdMapper.getMI();
2030  unsigned Opc = MI.getOpcode();
2031  MachineRegisterInfo &MRI = OpdMapper.getMRI();
2032  switch (Opc) {
2033  case AMDGPU::G_PHI: {
2034  Register DstReg = MI.getOperand(0).getReg();
2035  LLT DstTy = MRI.getType(DstReg);
2036  if (DstTy != LLT::scalar(1))
2037  break;
2038 
2039  const LLT S32 = LLT::scalar(32);
2040  const RegisterBank *DstBank =
2041  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2042  if (DstBank == &AMDGPU::VCCRegBank) {
2043  applyDefaultMapping(OpdMapper);
2044  // The standard handling only considers the result register bank for
2045  // phis. For VCC, blindly inserting a copy when the phi is lowered will
2046  // produce an invalid copy. We can only copy with some kind of compare to
2047  // get a vector boolean result. Insert a register bank copy that will be
2048  // correctly lowered to a compare.
2049  MachineIRBuilder B(*MI.getParent()->getParent());
2050 
2051  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2052  Register SrcReg = MI.getOperand(I).getReg();
2053  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2054 
2055  if (SrcBank != &AMDGPU::VCCRegBank) {
2056  MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2057  B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2058 
2059  auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2060  MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2061  MI.getOperand(I).setReg(Copy.getReg(0));
2062  }
2063  }
2064 
2065  return;
2066  }
2067 
2068  // Phi handling is strange and only considers the bank of the destination.
2069  substituteSimpleCopyRegs(OpdMapper, 0);
2070 
2071  // Promote SGPR/VGPR booleans to s32
2072  MachineFunction *MF = MI.getParent()->getParent();
2073  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2074  MachineIRBuilder B(MI, ApplyBank);
2075  LegalizerHelper Helper(*MF, ApplyBank, B);
2076 
2077  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2078  llvm_unreachable("widen scalar should have succeeded");
2079 
2080  return;
2081  }
2082  case AMDGPU::G_ICMP:
2083  case AMDGPU::G_UADDO:
2084  case AMDGPU::G_USUBO:
2085  case AMDGPU::G_UADDE:
2086  case AMDGPU::G_SADDE:
2087  case AMDGPU::G_USUBE:
2088  case AMDGPU::G_SSUBE: {
2089  unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2090  Register DstReg = MI.getOperand(BoolDstOp).getReg();
2091 
2092  const RegisterBank *DstBank =
2093  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2094  if (DstBank != &AMDGPU::SGPRRegBank)
2095  break;
2096 
2097  const bool HasCarryIn = MI.getNumOperands() == 5;
2098 
2099  // If this is a scalar compare, promote the result to s32, as the selection
2100  // will end up using a copy to a 32-bit vreg.
2101  const LLT S32 = LLT::scalar(32);
2102  Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2103  MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2104  MI.getOperand(BoolDstOp).setReg(NewDstReg);
2105  MachineIRBuilder B(MI);
2106 
2107  if (HasCarryIn) {
2108  Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2109  MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2110  B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2111  MI.getOperand(4).setReg(NewSrcReg);
2112  }
2113 
2114  MachineBasicBlock *MBB = MI.getParent();
2115  B.setInsertPt(*MBB, std::next(MI.getIterator()));
2116 
2117  // If we had a constrained VCC result register, a copy was inserted to VCC
2118  // from SGPR.
2119  SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2120  if (DefRegs.empty())
2121  DefRegs.push_back(DstReg);
2122  B.buildTrunc(DefRegs[0], NewDstReg);
2123  return;
2124  }
2125  case AMDGPU::G_SELECT: {
2126  Register DstReg = MI.getOperand(0).getReg();
2127  LLT DstTy = MRI.getType(DstReg);
2128 
2129  SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2130  if (CondRegs.empty())
2131  CondRegs.push_back(MI.getOperand(1).getReg());
2132  else {
2133  assert(CondRegs.size() == 1);
2134  }
2135 
2136  const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2137  if (CondBank == &AMDGPU::SGPRRegBank) {
2138  MachineIRBuilder B(MI);
2139  const LLT S32 = LLT::scalar(32);
2140  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2141  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2142 
2143  MI.getOperand(1).setReg(NewCondReg);
2144  B.buildZExt(NewCondReg, CondRegs[0]);
2145  }
2146 
2147  if (DstTy.getSizeInBits() != 64)
2148  break;
2149 
2150  MachineIRBuilder B(MI);
2151  LLT HalfTy = getHalfSizedType(DstTy);
2152 
2153  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2154  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2155  SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2156 
2157  // All inputs are SGPRs, nothing special to do.
2158  if (DefRegs.empty()) {
2159  assert(Src1Regs.empty() && Src2Regs.empty());
2160  break;
2161  }
2162 
2163  if (Src1Regs.empty())
2164  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2165  else {
2166  setRegsToType(MRI, Src1Regs, HalfTy);
2167  }
2168 
2169  if (Src2Regs.empty())
2170  split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2171  else
2172  setRegsToType(MRI, Src2Regs, HalfTy);
2173 
2174  setRegsToType(MRI, DefRegs, HalfTy);
2175 
2176  B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2177  B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2178 
2179  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2180  MI.eraseFromParent();
2181  return;
2182  }
2183  case AMDGPU::G_BRCOND: {
2184  Register CondReg = MI.getOperand(0).getReg();
2185  // FIXME: Should use legalizer helper, but should change bool ext type.
2186  const RegisterBank *CondBank =
2187  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2188 
2189  if (CondBank == &AMDGPU::SGPRRegBank) {
2190  MachineIRBuilder B(MI);
2191  const LLT S32 = LLT::scalar(32);
2192  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2193  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2194 
2195  MI.getOperand(0).setReg(NewCondReg);
2196  B.buildZExt(NewCondReg, CondReg);
2197  return;
2198  }
2199 
2200  break;
2201  }
2202  case AMDGPU::G_AND:
2203  case AMDGPU::G_OR:
2204  case AMDGPU::G_XOR: {
2205  // 64-bit and/or/xor is only available on the SALU, so split into 2 32-bit
2206  // ops if there is a VGPR input.
2207  Register DstReg = MI.getOperand(0).getReg();
2208  LLT DstTy = MRI.getType(DstReg);
2209 
2210  if (DstTy.getSizeInBits() == 1) {
2211  const RegisterBank *DstBank =
2212  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2213  if (DstBank == &AMDGPU::VCCRegBank)
2214  break;
2215 
2216  MachineFunction *MF = MI.getParent()->getParent();
2217  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2218  MachineIRBuilder B(MI, ApplyBank);
2219  LegalizerHelper Helper(*MF, ApplyBank, B);
2220 
2221  if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2222  LegalizerHelper::Legalized)
2223  llvm_unreachable("widen scalar should have succeeded");
2224  return;
2225  }
2226 
2227  if (DstTy.getSizeInBits() != 64)
2228  break;
2229 
2230  LLT HalfTy = getHalfSizedType(DstTy);
2231  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2232  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2233  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2234 
2235  // All inputs are SGPRs, nothing special to do.
2236  if (DefRegs.empty()) {
2237  assert(Src0Regs.empty() && Src1Regs.empty());
2238  break;
2239  }
2240 
2241  assert(DefRegs.size() == 2);
2242  assert(Src0Regs.size() == Src1Regs.size() &&
2243  (Src0Regs.empty() || Src0Regs.size() == 2));
2244 
2245  // Depending on where the source registers came from, the generic code may
2246  // have decided to split the inputs already or not. If not, we still need to
2247  // extract the values.
2248  MachineIRBuilder B(MI);
2249 
2250  if (Src0Regs.empty())
2251  split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2252  else
2253  setRegsToType(MRI, Src0Regs, HalfTy);
2254 
2255  if (Src1Regs.empty())
2256  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2257  else
2258  setRegsToType(MRI, Src1Regs, HalfTy);
2259 
2260  setRegsToType(MRI, DefRegs, HalfTy);
2261 
2262  B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2263  B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2264 
2265  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2266  MI.eraseFromParent();
2267  return;
2268  }
2269  case AMDGPU::G_ABS: {
2270  Register SrcReg = MI.getOperand(1).getReg();
2271  const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
2272 
2273  // There is no VALU abs instruction so we need to replace it with a sub and
2274  // max combination.
2275  if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
2276  MachineFunction *MF = MI.getParent()->getParent();
2277  ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
2278  MachineIRBuilder B(MI, Apply);
2279  LegalizerHelper Helper(*MF, Apply, B);
2280 
2281  if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
2282  llvm_unreachable("lowerAbsToMaxNeg should have succeeded");
2283  return;
2284  }
2286  }
2287  case AMDGPU::G_ADD:
2288  case AMDGPU::G_SUB:
2289  case AMDGPU::G_MUL:
2290  case AMDGPU::G_SHL:
2291  case AMDGPU::G_LSHR:
2292  case AMDGPU::G_ASHR:
2293  case AMDGPU::G_SMIN:
2294  case AMDGPU::G_SMAX:
2295  case AMDGPU::G_UMIN:
2296  case AMDGPU::G_UMAX: {
2297  Register DstReg = MI.getOperand(0).getReg();
2298  LLT DstTy = MRI.getType(DstReg);
2299 
2300  // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2301  // Packed 16-bit operations need to be scalarized and promoted.
2302  if (DstTy != LLT::scalar(16) && DstTy != LLT::fixed_vector(2, 16))
2303  break;
2304 
2305  const RegisterBank *DstBank =
2306  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2307  if (DstBank == &AMDGPU::VGPRRegBank)
2308  break;
2309 
2310  const LLT S32 = LLT::scalar(32);
2311  MachineBasicBlock *MBB = MI.getParent();
2312  MachineFunction *MF = MBB->getParent();
2313  ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2314  MachineIRBuilder B(MI, ApplySALU);
2315 
2316  if (DstTy.isVector()) {
2317  Register WideSrc0Lo, WideSrc0Hi;
2318  Register WideSrc1Lo, WideSrc1Hi;
2319 
2320  unsigned ExtendOp = getExtendOp(MI.getOpcode());
2321  std::tie(WideSrc0Lo, WideSrc0Hi)
2322  = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2323  std::tie(WideSrc1Lo, WideSrc1Hi)
2324  = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2325  auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2326  auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2327  B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2328  MI.eraseFromParent();
2329  } else {
2330  LegalizerHelper Helper(*MF, ApplySALU, B);
2331 
2332  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2333  llvm_unreachable("widen scalar should have succeeded");
2334 
2335  // FIXME: s16 shift amounts should be legal.
2336  if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2337  Opc == AMDGPU::G_ASHR) {
2338  B.setInsertPt(*MBB, MI.getIterator());
2339  if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2340  llvm_unreachable("widen scalar should have succeeded");
2341  }
2342  }
2343 
2344  return;
2345  }
2346  case AMDGPU::G_SEXT_INREG: {
2347  SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2348  if (SrcRegs.empty())
2349  break; // Nothing to repair
2350 
2351  const LLT S32 = LLT::scalar(32);
2352  MachineIRBuilder B(MI);
2353  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2354  GISelObserverWrapper Observer(&O);
2355  B.setChangeObserver(Observer);
2356 
2357  // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2358  // we would need to further expand, and doesn't let us directly set the
2359  // result registers.
2360  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2361 
2362  int Amt = MI.getOperand(2).getImm();
2363  if (Amt <= 32) {
2364  if (Amt == 32) {
2365  // The low bits are unchanged.
2366  B.buildCopy(DstRegs[0], SrcRegs[0]);
2367  } else {
2368  // Extend in the low bits and propagate the sign bit to the high half.
2369  B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2370  }
2371 
2372  B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2373  } else {
2374  // The low bits are unchanged, and extend in the high bits.
2375  B.buildCopy(DstRegs[0], SrcRegs[0]);
2376  B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
2377  }
2378 
2379  Register DstReg = MI.getOperand(0).getReg();
2380  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2381  MI.eraseFromParent();
2382  return;
2383  }
2384  case AMDGPU::G_CTPOP:
2385  case AMDGPU::G_BITREVERSE: {
2386  const RegisterBank *DstBank =
2387  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2388  if (DstBank == &AMDGPU::SGPRRegBank)
2389  break;
2390 
2391  Register SrcReg = MI.getOperand(1).getReg();
2392  const LLT S32 = LLT::scalar(32);
2393  LLT Ty = MRI.getType(SrcReg);
2394  if (Ty == S32)
2395  break;
2396 
2397  ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2398  MachineIRBuilder B(MI, ApplyVALU);
2399 
2400  MachineFunction &MF = B.getMF();
2401  LegalizerHelper Helper(MF, ApplyVALU, B);
2402 
2403  if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2404  llvm_unreachable("narrowScalar should have succeeded");
2405  return;
2406  }
2407  case AMDGPU::G_AMDGPU_FFBH_U32:
2408  case AMDGPU::G_AMDGPU_FFBL_B32:
2409  case AMDGPU::G_CTLZ_ZERO_UNDEF:
2410  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2411  const RegisterBank *DstBank =
2412  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2413  if (DstBank == &AMDGPU::SGPRRegBank)
2414  break;
2415 
2416  Register SrcReg = MI.getOperand(1).getReg();
2417  const LLT S32 = LLT::scalar(32);
2418  LLT Ty = MRI.getType(SrcReg);
2419  if (Ty == S32)
2420  break;
2421 
2422  // We can narrow this more efficiently than Helper can by using ffbh/ffbl
2423  // which return -1 when the input is zero:
2424  // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2425  // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2426  // (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
2427  // (ffbl hi:lo) -> (umin (uaddsat (ffbl hi), 32), (ffbl lo))
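 // The saturating add keeps the -1 produced for a zero half from wrapping, so
 // the umin still picks the result from the other half; the zero_undef variants
 // can use a plain add because a zero input is already undefined for them.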
2428  ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2429  MachineIRBuilder B(MI, ApplyVALU);
2430  SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2431  unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
2432  ? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
2433  : Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2434  ? (unsigned)AMDGPU::G_AMDGPU_FFBL_B32
2435  : Opc;
2436  unsigned Idx = NewOpc == AMDGPU::G_AMDGPU_FFBH_U32;
2437  auto X = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx]});
2438  auto Y = B.buildInstr(NewOpc, {S32}, {SrcRegs[Idx ^ 1]});
2439  unsigned AddOpc =
2440  Opc == AMDGPU::G_CTLZ_ZERO_UNDEF || Opc == AMDGPU::G_CTTZ_ZERO_UNDEF
2441  ? AMDGPU::G_ADD
2442  : AMDGPU::G_UADDSAT;
2443  Y = B.buildInstr(AddOpc, {S32}, {Y, B.buildConstant(S32, 32)});
2444  Register DstReg = MI.getOperand(0).getReg();
2445  B.buildUMin(DstReg, X, Y);
2446  MI.eraseFromParent();
2447  return;
2448  }
2449  case AMDGPU::G_SEXT:
2450  case AMDGPU::G_ZEXT:
2451  case AMDGPU::G_ANYEXT: {
2452  Register SrcReg = MI.getOperand(1).getReg();
2453  LLT SrcTy = MRI.getType(SrcReg);
2454  const bool Signed = Opc == AMDGPU::G_SEXT;
2455 
2456  assert(empty(OpdMapper.getVRegs(1)));
2457 
2458  MachineIRBuilder B(MI);
2459  const RegisterBank *SrcBank =
2460  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2461 
2462  Register DstReg = MI.getOperand(0).getReg();
2463  LLT DstTy = MRI.getType(DstReg);
2464  if (DstTy.isScalar() &&
2465  SrcBank != &AMDGPU::SGPRRegBank &&
2466  SrcBank != &AMDGPU::VCCRegBank &&
2467  // FIXME: Should handle any type that round to s64 when irregular
2468  // breakdowns supported.
2469  DstTy.getSizeInBits() == 64 &&
2470  SrcTy.getSizeInBits() <= 32) {
2471  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2472 
2473  // Extend to 32-bit, and then extend the low half.
2474  if (Signed) {
2475  // TODO: Should really be buildSExtOrCopy
2476  B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2477  } else if (Opc == AMDGPU::G_ZEXT) {
2478  B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2479  } else {
2480  B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2481  }
2482 
2483  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2484  MRI.setRegBank(DstReg, *SrcBank);
2485  MI.eraseFromParent();
2486  return;
2487  }
2488 
2489  if (SrcTy != LLT::scalar(1))
2490  return;
2491 
2492  // It is not legal to have a legalization artifact with a VCC source. Rather
2493  // than introducing a copy, insert the select we would have to select the
2494  // copy to.
2495  if (SrcBank == &AMDGPU::VCCRegBank) {
2496  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2497 
2498  const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2499 
2500  unsigned DstSize = DstTy.getSizeInBits();
2501  // 64-bit select is SGPR only
2502  const bool UseSel64 = DstSize > 32 &&
2503  SrcBank->getID() == AMDGPU::SGPRRegBankID;
2504 
2505  // TODO: Should s16 select be legal?
2506  LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2507  auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2508  auto False = B.buildConstant(SelType, 0);
2509 
2510  MRI.setRegBank(True.getReg(0), *DstBank);
2511  MRI.setRegBank(False.getReg(0), *DstBank);
2512  MRI.setRegBank(DstReg, *DstBank);
2513 
2514  if (DstSize > 32) {
2515  B.buildSelect(DefRegs[0], SrcReg, True, False);
2516  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2517  } else if (DstSize < 32) {
2518  auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2519  MRI.setRegBank(Sel.getReg(0), *DstBank);
2520  B.buildTrunc(DstReg, Sel);
2521  } else {
2522  B.buildSelect(DstReg, SrcReg, True, False);
2523  }
2524 
2525  MI.eraseFromParent();
2526  return;
2527  }
2528 
2529  break;
2530  }
2531  case AMDGPU::G_BUILD_VECTOR:
2532  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2533  Register DstReg = MI.getOperand(0).getReg();
2534  LLT DstTy = MRI.getType(DstReg);
2535  if (DstTy != LLT::fixed_vector(2, 16))
2536  break;
2537 
2538  assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2539  substituteSimpleCopyRegs(OpdMapper, 1);
2540  substituteSimpleCopyRegs(OpdMapper, 2);
2541 
2542  const RegisterBank *DstBank =
2543  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2544  if (DstBank == &AMDGPU::SGPRRegBank)
2545  break; // Can use S_PACK_* instructions.
2546 
2547  MachineIRBuilder B(MI);
2548 
2549  Register Lo = MI.getOperand(1).getReg();
2550  Register Hi = MI.getOperand(2).getReg();
2551  const LLT S32 = LLT::scalar(32);
2552 
2553  const RegisterBank *BankLo =
2554  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2555  const RegisterBank *BankHi =
2556  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2557 
2558  Register ZextLo;
2559  Register ShiftHi;
2560 
2561  if (Opc == AMDGPU::G_BUILD_VECTOR) {
2562  ZextLo = B.buildZExt(S32, Lo).getReg(0);
2563  MRI.setRegBank(ZextLo, *BankLo);
2564 
2565  Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2566  MRI.setRegBank(ZextHi, *BankHi);
2567 
2568  auto ShiftAmt = B.buildConstant(S32, 16);
2569  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2570 
2571  ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2572  MRI.setRegBank(ShiftHi, *BankHi);
2573  } else {
2574  Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2575  MRI.setRegBank(MaskLo, *BankLo);
2576 
2577  auto ShiftAmt = B.buildConstant(S32, 16);
2578  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2579 
2580  ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2581  MRI.setRegBank(ShiftHi, *BankHi);
2582 
2583  ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2584  MRI.setRegBank(ZextLo, *BankLo);
2585  }
2586 
2587  auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2588  MRI.setRegBank(Or.getReg(0), *DstBank);
2589 
2590  B.buildBitcast(DstReg, Or);
2591  MI.eraseFromParent();
2592  return;
2593  }
2594  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2595  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2596 
2597  assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2598 
2599  Register DstReg = MI.getOperand(0).getReg();
2600  Register SrcReg = MI.getOperand(1).getReg();
2601 
2602  const LLT S32 = LLT::scalar(32);
2603  LLT DstTy = MRI.getType(DstReg);
2604  LLT SrcTy = MRI.getType(SrcReg);
2605 
2606  if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2607  return;
2608 
2609  MachineIRBuilder B(MI);
2610 
2611  const ValueMapping &DstMapping
2612  = OpdMapper.getInstrMapping().getOperandMapping(0);
2613  const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2614  const RegisterBank *SrcBank =
2615  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2616  const RegisterBank *IdxBank =
2617  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2618 
2619  Register BaseIdxReg;
2620  unsigned ConstOffset;
2621  std::tie(BaseIdxReg, ConstOffset) =
2622  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2623 
2624  // See if the index is an add of a constant which will be foldable by moving
2625  // the base register of the index later if this is going to be executed in a
2626  // waterfall loop. This is essentially to reassociate the add of a constant
2627  // with the readfirstlane.
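 // i.e. readfirstlane(Idx + C) becomes readfirstlane(Idx) + C, with the
 // constant add re-inserted on the now-uniform index inside the loop.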
2628  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2629  ConstOffset > 0 &&
2630  ConstOffset < SrcTy.getNumElements();
2631 
2632  // Move the base register. We'll re-insert the add later.
2633  if (ShouldMoveIndexIntoLoop)
2634  MI.getOperand(2).setReg(BaseIdxReg);
2635 
2636  // If this is a VGPR result only because the index was a VGPR result, the
2637  // actual indexing will be done on the SGPR source vector, which will
2638  // produce a scalar result. We need to copy to the VGPR result inside the
2639  // waterfall loop.
2640  const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2641  SrcBank == &AMDGPU::SGPRRegBank;
2642  if (DstRegs.empty()) {
2643  applyDefaultMapping(OpdMapper);
2644 
2645  executeInWaterfallLoop(MI, MRI, { 2 });
2646 
2647  if (NeedCopyToVGPR) {
2648  // We don't want a phi for this temporary reg.
2649  Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2650  MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2651  MI.getOperand(0).setReg(TmpReg);
2652  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2653 
2654  // Use a v_mov_b32 here to make the exec dependency explicit.
2655  buildVCopy(B, DstReg, TmpReg);
2656  }
2657 
2658  // Re-insert the constant offset add inside the waterfall loop.
2659  if (ShouldMoveIndexIntoLoop)
2660  reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2661 
2662  return;
2663  }
2664 
2665  assert(DstTy.getSizeInBits() == 64);
2666 
2667  LLT Vec32 = LLT::fixed_vector(2 * SrcTy.getNumElements(), 32);
2668 
2669  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2670  auto One = B.buildConstant(S32, 1);
2671 
2672  MachineBasicBlock::iterator MII = MI.getIterator();
2673 
2674  // Split the vector index into 32-bit pieces. Prepare to move all of the
2675  // new instructions into a waterfall loop if necessary.
2676  //
2677  // Don't put the bitcast or constant in the loop.
2678  MachineInstrSpan Span(MII, &B.getMBB());
2679 
2680  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2681  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2682  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2683 
2684  auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2685  auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2686 
2687  MRI.setRegBank(DstReg, *DstBank);
2688  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2689  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2690  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2691  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2692 
2693  SmallSet<Register, 4> OpsToWaterfall;
2694  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2695  MI.eraseFromParent();
2696  return;
2697  }
2698 
2699  // Remove the original instruction to avoid potentially confusing the
2700  // waterfall loop logic.
2701  B.setInstr(*Span.begin());
2702  MI.eraseFromParent();
2703  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2704  OpsToWaterfall, MRI);
2705 
2706  if (NeedCopyToVGPR) {
2707  MachineBasicBlock *LoopBB = Extract1->getParent();
2708  Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2709  Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2710  MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2711  MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2712 
2713  Extract0->getOperand(0).setReg(TmpReg0);
2714  Extract1->getOperand(0).setReg(TmpReg1);
2715 
2716  B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2717 
2718  buildVCopy(B, DstRegs[0], TmpReg0);
2719  buildVCopy(B, DstRegs[1], TmpReg1);
2720  }
2721 
2722  if (ShouldMoveIndexIntoLoop)
2723  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2724 
2725  return;
2726  }
2727  case AMDGPU::G_INSERT_VECTOR_ELT: {
2728  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2729 
2730  Register DstReg = MI.getOperand(0).getReg();
2731  LLT VecTy = MRI.getType(DstReg);
2732 
2733  assert(OpdMapper.getVRegs(0).empty());
2734  assert(OpdMapper.getVRegs(3).empty());
2735 
2736  if (substituteSimpleCopyRegs(OpdMapper, 1))
2737  MRI.setType(MI.getOperand(1).getReg(), VecTy);
2738 
2739  if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2740  return;
2741 
2742  const RegisterBank *IdxBank =
2743  OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2744 
2745  Register SrcReg = MI.getOperand(1).getReg();
2746  Register InsReg = MI.getOperand(2).getReg();
2747  LLT InsTy = MRI.getType(InsReg);
2748  (void)InsTy;
2749 
2750  Register BaseIdxReg;
2751  unsigned ConstOffset;
2752  std::tie(BaseIdxReg, ConstOffset) =
2753  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2754 
2755  // See if the index is an add of a constant which will be foldable by moving
2756  // the base register of the index later if this is going to be executed in a
2757  // waterfall loop. This is essentially to reassociate the add of a constant
2758  // with the readfirstlane.
2759  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2760  ConstOffset > 0 &&
2761  ConstOffset < VecTy.getNumElements();
2762 
2763  // Move the base register. We'll re-insert the add later.
2764  if (ShouldMoveIndexIntoLoop)
2765  MI.getOperand(3).setReg(BaseIdxReg);
2766 
2767 
2768  if (InsRegs.empty()) {
2769  executeInWaterfallLoop(MI, MRI, { 3 });
2770 
2771  // Re-insert the constant offset add inside the waterfall loop.
2772  if (ShouldMoveIndexIntoLoop) {
2773  MachineIRBuilder B(MI);
2774  reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2775  }
2776 
2777  return;
2778  }
2779 
2780 
2781  assert(InsTy.getSizeInBits() == 64);
2782 
2783  const LLT S32 = LLT::scalar(32);
2784  LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
2785 
2786  MachineIRBuilder B(MI);
2787  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2788  auto One = B.buildConstant(S32, 1);
2789 
2790  // Split the vector index into 32-bit pieces. Prepare to move all of the
2791  // new instructions into a waterfall loop if necessary.
2792  //
2793  // Don't put the bitcast or constant in the loop.
2794  MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2795 
2796  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2797  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2798  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2799 
2800  auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2801  auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2802 
2803  const RegisterBank *DstBank =
2804  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2805  const RegisterBank *SrcBank =
2806  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2807  const RegisterBank *InsSrcBank =
2808  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2809 
2810  MRI.setRegBank(InsReg, *InsSrcBank);
2811  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2812  MRI.setRegBank(InsLo.getReg(0), *DstBank);
2813  MRI.setRegBank(InsHi.getReg(0), *DstBank);
2814  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2815  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2816  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2817 
2818 
2819  SmallSet<Register, 4> OpsToWaterfall;
2820  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2821  B.setInsertPt(B.getMBB(), MI);
2822  B.buildBitcast(DstReg, InsHi);
2823  MI.eraseFromParent();
2824  return;
2825  }
2826 
2827  B.setInstr(*Span.begin());
2828  MI.eraseFromParent();
2829 
2830  // Figure out the point after the waterfall loop before mangling the control
2831  // flow.
2832  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2833  OpsToWaterfall, MRI);
2834 
2835  // The insertion point is now right after the original instruction.
2836  //
2837  // Keep the bitcast to the original vector type out of the loop. Doing this
2838  // saves an extra phi that would otherwise be needed inside the loop.
2839  B.buildBitcast(DstReg, InsHi);
2840 
2841  // Re-insert the constant offset add inside the waterfall loop.
2842  if (ShouldMoveIndexIntoLoop)
2843  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2844 
2845  return;
2846  }
2847  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2848  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2849  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2850  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2851  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2852  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2853  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2854  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2855  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2856  case AMDGPU::G_AMDGPU_BUFFER_STORE:
2857  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2858  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2859  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2860  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2861  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2862  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2863  applyDefaultMapping(OpdMapper);
2864  executeInWaterfallLoop(MI, MRI, {1, 4});
2865  return;
2866  }
2867  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2868  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2869  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2870  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2871  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2872  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2873  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2874  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2875  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2876  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2877  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2878  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2879  applyDefaultMapping(OpdMapper);
2880  executeInWaterfallLoop(MI, MRI, {2, 5});
2881  return;
2882  }
2883  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2884  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2885  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2886  applyDefaultMapping(OpdMapper);
2887  executeInWaterfallLoop(MI, MRI, {2, 5});
2888  return;
2889  }
2890  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2891  applyDefaultMapping(OpdMapper);
2892  executeInWaterfallLoop(MI, MRI, {3, 6});
2893  return;
2894  }
2895  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2896  applyMappingSBufferLoad(OpdMapper);
2897  return;
2898  }
2899  case AMDGPU::G_INTRINSIC: {
2900  switch (MI.getIntrinsicID()) {
2901  case Intrinsic::amdgcn_readlane: {
2902  substituteSimpleCopyRegs(OpdMapper, 2);
2903 
2904  assert(OpdMapper.getVRegs(0).empty());
2905  assert(OpdMapper.getVRegs(3).empty());
2906 
2907  // Make sure the index is an SGPR. It doesn't make sense to run this in a
2908  // waterfall loop, so assume it's a uniform value.
2909  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2910  return;
2911  }
2912  case Intrinsic::amdgcn_writelane: {
2913  assert(OpdMapper.getVRegs(0).empty());
2914  assert(OpdMapper.getVRegs(2).empty());
2915  assert(OpdMapper.getVRegs(3).empty());
2916 
2917  substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2918  constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2919  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2920  return;
2921  }
2922  case Intrinsic::amdgcn_interp_p1:
2923  case Intrinsic::amdgcn_interp_p2:
2924  case Intrinsic::amdgcn_interp_mov:
2925  case Intrinsic::amdgcn_interp_p1_f16:
2926  case Intrinsic::amdgcn_interp_p2_f16: {
2927  applyDefaultMapping(OpdMapper);
2928 
2929  // Readlane for m0 value, which is always the last operand.
2930  // FIXME: Should this be a waterfall loop instead?
2931  constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2932  return;
2933  }
2934  case Intrinsic::amdgcn_permlane16:
2935  case Intrinsic::amdgcn_permlanex16: {
2936  // Doing a waterfall loop over these wouldn't make any sense.
2937  substituteSimpleCopyRegs(OpdMapper, 2);
2938  substituteSimpleCopyRegs(OpdMapper, 3);
2939  constrainOpWithReadfirstlane(MI, MRI, 4);
2940  constrainOpWithReadfirstlane(MI, MRI, 5);
2941  return;
2942  }
2943  case Intrinsic::amdgcn_sbfe:
2944  applyMappingBFE(OpdMapper, true);
2945  return;
2946  case Intrinsic::amdgcn_ubfe:
2947  applyMappingBFE(OpdMapper, false);
2948  return;
2949  case Intrinsic::amdgcn_ballot:
2950  // Use default handling and insert copy to vcc source.
2951  break;
2952  }
2953  break;
2954  }
2955  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2956  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
2957  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
2958  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
2959  const AMDGPU::RsrcIntrinsic *RSrcIntrin
2960  = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2961  assert(RSrcIntrin && RSrcIntrin->IsImage);
2962  // Non-images can have complications from operands that allow both SGPR
2963  // and VGPR. For now it's too complicated to figure out the final opcode
2964  // to derive the register bank from the MCInstrDesc.
2965  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2966  return;
2967  }
2968  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2969  unsigned N = MI.getNumExplicitOperands() - 2;
2970  applyDefaultMapping(OpdMapper);
2971  executeInWaterfallLoop(MI, MRI, { N });
2972  return;
2973  }
2974  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2975  auto IntrID = MI.getIntrinsicID();
2976  switch (IntrID) {
2977  case Intrinsic::amdgcn_ds_ordered_add:
2978  case Intrinsic::amdgcn_ds_ordered_swap: {
2979  // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2980  assert(OpdMapper.getVRegs(0).empty());
2981  substituteSimpleCopyRegs(OpdMapper, 3);
2982  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2983  return;
2984  }
2985  case Intrinsic::amdgcn_ds_gws_init:
2986  case Intrinsic::amdgcn_ds_gws_barrier:
2987  case Intrinsic::amdgcn_ds_gws_sema_br: {
2988  // Only the first lane executes, so readfirstlane is safe.
2989  substituteSimpleCopyRegs(OpdMapper, 1);
2990  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2991  return;
2992  }
2993  case Intrinsic::amdgcn_ds_gws_sema_v:
2994  case Intrinsic::amdgcn_ds_gws_sema_p:
2995  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2996  // Only the first lane executes, so readfirstlane is safe.
2997  constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2998  return;
2999  }
3000  case Intrinsic::amdgcn_ds_append:
3001  case Intrinsic::amdgcn_ds_consume: {
3002  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3003  return;
3004  }
3005  case Intrinsic::amdgcn_s_sendmsg:
3006  case Intrinsic::amdgcn_s_sendmsghalt: {
3007  // FIXME: Should this use a waterfall loop?
3008  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3009  return;
3010  }
3011  case Intrinsic::amdgcn_s_setreg: {
3012  constrainOpWithReadfirstlane(MI, MRI, 2);
3013  return;
3014  }
3015  case Intrinsic::amdgcn_raw_buffer_load_lds: {
3016  applyDefaultMapping(OpdMapper);
3017  constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3018  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3019  constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
3020  return;
3021  }
3022  case Intrinsic::amdgcn_struct_buffer_load_lds: {
3023  applyDefaultMapping(OpdMapper);
3024  constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
3025  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3026  constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
3027  return;
3028  }
3029  case Intrinsic::amdgcn_global_load_lds: {
3030  applyDefaultMapping(OpdMapper);
3031  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
3032  return;
3033  }
3034  default: {
3035  if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
3036  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
3037  // Non-images can have complications from operands that allow both SGPR
3038  // and VGPR. For now it's too complicated to figure out the final opcode
3039  // to derive the register bank from the MCInstrDesc.
3040  if (RSrcIntrin->IsImage) {
3041  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
3042  return;
3043  }
3044  }
3045 
3046  break;
3047  }
3048  }
3049  break;
3050  }
3051  case AMDGPU::G_SI_CALL: {
3052  // Use a set to avoid extra readfirstlanes in the case where multiple
3053  // operands are the same register.
3054  SmallSet<Register, 4> SGPROperandRegs;
3055 
3056  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, {1}))
3057  break;
3058 
3059  // Move all copies to physical SGPRs that are used by the call instruction
3060  // into the loop block. Search backwards from the call until the
3061  // ADJCALLSTACKUP is found.
3062  unsigned FrameSetupOpcode = AMDGPU::ADJCALLSTACKUP;
3063  unsigned FrameDestroyOpcode = AMDGPU::ADJCALLSTACKDOWN;
3064 
3065  // Move all non-copies before the copies, so that a complete range can be
3066  // moved into the waterfall loop.
3067  SmallVector<MachineInstr *, 4> NonCopyInstrs;
3068  // Count of NonCopyInstrs found until the current LastCopy.
3069  unsigned NonCopyInstrsLen = 0;
3070  MachineBasicBlock::iterator Start(&MI);
3071  MachineBasicBlock::iterator LastCopy = Start;
3072  MachineBasicBlock *MBB = MI.getParent();
3073  const SIMachineFunctionInfo *Info =
3074  MBB->getParent()->getInfo<SIMachineFunctionInfo>();
3075  while (Start->getOpcode() != FrameSetupOpcode) {
3076  --Start;
3077  bool IsCopy = false;
3078  if (Start->getOpcode() == AMDGPU::COPY) {
3079  auto &Dst = Start->getOperand(0);
3080  if (Dst.isReg()) {
3081  Register Reg = Dst.getReg();
3082  if (Reg.isPhysical() && MI.readsRegister(Reg, TRI)) {
3083  IsCopy = true;
3084  } else {
3085  // Also move the copy from the scratch rsrc descriptor into the loop
3086  // to allow it to be optimized away.
3087  auto &Src = Start->getOperand(1);
3088  if (Src.isReg()) {
3089  Reg = Src.getReg();
3090  IsCopy = Info->getScratchRSrcReg() == Reg;
3091  }
3092  }
3093  }
3094  }
3095 
3096  if (IsCopy) {
3097  LastCopy = Start;
3098  NonCopyInstrsLen = NonCopyInstrs.size();
3099  } else {
3100  NonCopyInstrs.push_back(&*Start);
3101  }
3102  }
3103  NonCopyInstrs.resize(NonCopyInstrsLen);
3104 
3105  for (auto *NonCopy : reverse(NonCopyInstrs)) {
3106  MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3107  }
3108  Start = LastCopy;
3109 
3110  // Do the same for copies after the loop
3111  NonCopyInstrs.clear();
3112  NonCopyInstrsLen = 0;
3113  MachineBasicBlock::iterator End(&MI);
3114  LastCopy = End;
3115  while (End->getOpcode() != FrameDestroyOpcode) {
3116  ++End;
3117  bool IsCopy = false;
3118  if (End->getOpcode() == AMDGPU::COPY) {
3119  auto &Src = End->getOperand(1);
3120  if (Src.isReg()) {
3121  Register Reg = Src.getReg();
3122  IsCopy = Reg.isPhysical() && MI.modifiesRegister(Reg, TRI);
3123  }
3124  }
3125 
3126  if (IsCopy) {
3127  LastCopy = End;
3128  NonCopyInstrsLen = NonCopyInstrs.size();
3129  } else {
3130  NonCopyInstrs.push_back(&*End);
3131  }
3132  }
3133  NonCopyInstrs.resize(NonCopyInstrsLen);
3134 
3135  End = LastCopy;
3136  ++LastCopy;
3137  for (auto *NonCopy : reverse(NonCopyInstrs)) {
3138  MBB->splice(LastCopy, MBB, NonCopy->getIterator());
3139  }
3140 
3141  ++End;
3142  MachineIRBuilder B(*Start);
3143  executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
3144  break;
3145  }
3146  case AMDGPU::G_LOAD:
3147  case AMDGPU::G_ZEXTLOAD:
3148  case AMDGPU::G_SEXTLOAD: {
3149  if (applyMappingLoad(MI, OpdMapper, MRI))
3150  return;
3151  break;
3152  }
3153  case AMDGPU::G_DYN_STACKALLOC:
3154  applyMappingDynStackAlloc(MI, OpdMapper, MRI);
3155  return;
3156  case AMDGPU::G_SBFX:
3157  applyMappingBFE(OpdMapper, /*Signed*/ true);
3158  return;
3159  case AMDGPU::G_UBFX:
3160  applyMappingBFE(OpdMapper, /*Signed*/ false);
3161  return;
3162  default:
3163  break;
3164  }
3165 
3166  return applyDefaultMapping(OpdMapper);
3167 }
3168 
3169 // vgpr, sgpr -> vgpr
3170 // vgpr, agpr -> vgpr
3171 // agpr, agpr -> agpr
3172 // agpr, sgpr -> vgpr
3173 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3174  if (RB0 == AMDGPU::InvalidRegBankID)
3175  return RB1;
3176  if (RB1 == AMDGPU::InvalidRegBankID)
3177  return RB0;
3178 
3179  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3180  return AMDGPU::SGPRRegBankID;
3181 
3182  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3183  return AMDGPU::AGPRRegBankID;
3184 
3185  return AMDGPU::VGPRRegBankID;
3186 }
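// As a quick illustration of the table above, using the bank IDs referenced
// throughout this file, the union behaves like:
//   regBankUnion(AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID) == AMDGPU::SGPRRegBankID
//   regBankUnion(AMDGPU::AGPRRegBankID, AMDGPU::AGPRRegBankID) == AMDGPU::AGPRRegBankID
//   regBankUnion(AMDGPU::AGPRRegBankID, AMDGPU::SGPRRegBankID) == AMDGPU::VGPRRegBankID
//   regBankUnion(AMDGPU::InvalidRegBankID, RB)                 == RB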
3187 
3188 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3189  if (RB0 == AMDGPU::InvalidRegBankID)
3190  return RB1;
3191  if (RB1 == AMDGPU::InvalidRegBankID)
3192  return RB0;
3193 
3194  // vcc, vcc -> vcc
3195  // vcc, sgpr -> vcc
3196  // vcc, vgpr -> vcc
3197  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3198  return AMDGPU::VCCRegBankID;
3199 
3200  // vcc, vgpr -> vgpr
3201  return regBankUnion(RB0, RB1);
3202 }
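// For booleans the VCC bank dominates any other valid bank; only when neither
// side is VCC does the plain union above apply, e.g.:
//   regBankBoolUnion(AMDGPU::VCCRegBankID,  AMDGPU::SGPRRegBankID) == AMDGPU::VCCRegBankID
//   regBankBoolUnion(AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID) == AMDGPU::VGPRRegBankID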
3203 
3204 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3205  const MachineInstr &MI) const {
3206  unsigned RegBank = AMDGPU::InvalidRegBankID;
3207 
3208  for (const MachineOperand &MO : MI.operands()) {
3209  if (!MO.isReg())
3210  continue;
3211  Register Reg = MO.getReg();
3212  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3213  RegBank = regBankUnion(RegBank, Bank->getID());
3214  if (RegBank == AMDGPU::VGPRRegBankID)
3215  break;
3216  }
3217  }
3218 
3219  return RegBank;
3220 }
3221 
3222 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3223  const MachineFunction &MF = *MI.getParent()->getParent();
3224  const MachineRegisterInfo &MRI = MF.getRegInfo();
3225  for (const MachineOperand &MO : MI.operands()) {
3226  if (!MO.isReg())
3227  continue;
3228  Register Reg = MO.getReg();
3229  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3230  if (Bank->getID() != AMDGPU::SGPRRegBankID)
3231  return false;
3232  }
3233  }
3234  return true;
3235 }
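// In other words, an instruction only qualifies for a scalar (SALU) mapping if
// every register operand that already has a bank assigned is in the SGPR bank;
// a single VGPR input is enough to push it to the VALU mappings below.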
3236 
3237 const RegisterBankInfo::InstructionMapping &
3238 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3239  const MachineFunction &MF = *MI.getParent()->getParent();
3240  const MachineRegisterInfo &MRI = MF.getRegInfo();
3241  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3242 
3243  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3244  const MachineOperand &SrcOp = MI.getOperand(i);
3245  if (!SrcOp.isReg())
3246  continue;
3247 
3248  unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3249  OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3250  }
3251  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3252  MI.getNumOperands());
3253 }
3254 
3255 const RegisterBankInfo::InstructionMapping &
3256 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3257  const MachineFunction &MF = *MI.getParent()->getParent();
3258  const MachineRegisterInfo &MRI = MF.getRegInfo();
3259  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3260 
3261  // Even though we technically could use SGPRs, this would require knowledge of
3262  // the constant bus restriction. Force all sources to VGPR (except for VCC).
3263  //
3264  // TODO: Unary ops are trivially OK, so accept SGPRs?
3265  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3266  const MachineOperand &Src = MI.getOperand(i);
3267  if (!Src.isReg())
3268  continue;
3269 
3270  unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3271  unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3272  OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3273  }
3274 
3275  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3276  MI.getNumOperands());
3277 }
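// As a sketch of what the loop above produces for a plain 32-bit VALU op such
// as G_FADD, every register operand ends up with
//   AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32)
// while an s1 operand (a carry or condition bit) is mapped to the VCC bank
// instead.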
3278 
3279 const RegisterBankInfo::InstructionMapping &
3280 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3281  const MachineFunction &MF = *MI.getParent()->getParent();
3282  const MachineRegisterInfo &MRI = MF.getRegInfo();
3283  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3284 
3285  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3286  const MachineOperand &Op = MI.getOperand(I);
3287  if (!Op.isReg())
3288  continue;
3289 
3290  unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3291  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3292  }
3293 
3294  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3295  MI.getNumOperands());
3296 }
3297 
3298 const RegisterBankInfo::InstructionMapping &
3299 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3300  const MachineInstr &MI,
3301  int RsrcIdx) const {
3302  // The reported argument index is relative to the IR intrinsic call arguments,
3303  // so we need to shift by the number of defs and the intrinsic ID.
3304  RsrcIdx += MI.getNumExplicitDefs() + 1;
3305 
3306  const int NumOps = MI.getNumOperands();
3307  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3308 
3309  // TODO: Should packed/unpacked D16 difference be reported here as part of
3310  // the value mapping?
3311  for (int I = 0; I != NumOps; ++I) {
3312  if (!MI.getOperand(I).isReg())
3313  continue;
3314 
3315  Register OpReg = MI.getOperand(I).getReg();
3316  // We replace some dead address operands with $noreg
3317  if (!OpReg)
3318  continue;
3319 
3320  unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3321 
3322  // FIXME: Probably need a new intrinsic register bank searchable table to
3323  // handle arbitrary intrinsics easily.
3324  //
3325  // If this has a sampler, it immediately follows rsrc.
3326  const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3327 
3328  if (MustBeSGPR) {
3329  // If this must be an SGPR, we must report whatever it is as legal.
3330  unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3331  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3332  } else {
3333  // Some operands must be VGPR, and these are easy to copy to.
3334  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3335  }
3336  }
3337 
3338  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3339 }
3340 
3341 /// Return the mapping for a pointer argument.
3342 const RegisterBankInfo::ValueMapping *
3343 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3344  Register PtrReg) const {
3345  LLT PtrTy = MRI.getType(PtrReg);
3346  unsigned Size = PtrTy.getSizeInBits();
3347  if (Subtarget.useFlatForGlobal() ||
3348  !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3349  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3350 
3351  // If we're using MUBUF instructions for global memory, an SGPR base register
3352  // is possible. Otherwise this needs to be a VGPR.
3353  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3354  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3355 }
3356 
3357 const RegisterBankInfo::InstructionMapping &
3358 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3359 
3360  const MachineFunction &MF = *MI.getParent()->getParent();
3361  const MachineRegisterInfo &MRI = MF.getRegInfo();
3362  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3363  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3364  Register PtrReg = MI.getOperand(1).getReg();
3365  LLT PtrTy = MRI.getType(PtrReg);
3366  unsigned AS = PtrTy.getAddressSpace();
3367  unsigned PtrSize = PtrTy.getSizeInBits();
3368 
3369  const ValueMapping *ValMapping;
3370  const ValueMapping *PtrMapping;
3371 
3372  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3373 
3374  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3375  if (isScalarLoadLegal(MI)) {
3376  // We have a uniform instruction so we want to use an SMRD load
3377  ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3378  PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3379  } else {
3380  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3381 
3382  // If we're using MUBUF instructions for global memory, an SGPR base
3383  // register is possible. Otherwise this needs to be a VGPR.
3384  unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3385  AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3386 
3387  PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3388  }
3389  } else {
3390  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3391  PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3392  }
3393 
3394  OpdsMapping[0] = ValMapping;
3395  OpdsMapping[1] = PtrMapping;
3396  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3397  1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3398  return Mapping;
3399 
3400  // FIXME: Do we want to add a mapping for FLAT load, or should we just
3401  // handle that during instruction selection?
3402 }
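// A rough summary of the cases handled above for a flat/global address space:
//   SGPR pointer, isScalarLoadLegal(MI)   -> value SGPR, pointer SGPR (SMRD-style)
//   SGPR pointer, !isScalarLoadLegal(MI)  -> value VGPR, pointer SGPR unless
//                                            useFlatForGlobal() forces a VGPR base
//   anything else                         -> value VGPR, pointer VGPR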
3403 
3404 unsigned
3405 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3406  const MachineRegisterInfo &MRI,
3407  unsigned Default) const {
3408  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3409  return Bank ? Bank->getID() : Default;
3410 }
3411 
3412 const RegisterBankInfo::ValueMapping *
3413 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3414  const MachineRegisterInfo &MRI,
3415  const TargetRegisterInfo &TRI) const {
3416  // Lie and claim anything is legal, even though this needs to be an SGPR.
3417  // applyMapping will have to deal with it as a waterfall loop.
3418  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3419  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3420  return AMDGPU::getValueMapping(Bank, Size);
3421 }
3422 
3423 const RegisterBankInfo::ValueMapping *
3424 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3425  const MachineRegisterInfo &MRI,
3426  const TargetRegisterInfo &TRI) const {
3427  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3428  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3429 }
3430 
3431 const RegisterBankInfo::ValueMapping *
3432 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3433  const MachineRegisterInfo &MRI,
3434  const TargetRegisterInfo &TRI) const {
3435  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3436  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3437 }
3438 
3439 ///
3440 /// This function must return a legal mapping, because
3441 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3442 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3443 /// VGPR-to-SGPR copy to be generated is illegal.
3444 ///
3445 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3446 // legal. These will be dealt with in applyMappingImpl.
3447 //
3448 const RegisterBankInfo::InstructionMapping &
3449 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3450  const MachineFunction &MF = *MI.getParent()->getParent();
3451  const MachineRegisterInfo &MRI = MF.getRegInfo();
3452 
3453  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3454  // The default logic bothers to analyze impossible alternative mappings. We
3455  // want the most straightforward mapping, so just directly handle this.
3456  const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3457  *TRI);
3458  const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3459  *TRI);
3460  assert(SrcBank && "src bank should have been assigned already");
3461  if (!DstBank)
3462  DstBank = SrcBank;
3463 
3464  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3465  if (cannotCopy(*DstBank, *SrcBank, Size))
3466  return getInvalidInstructionMapping();
3467 
3468  const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3469  unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3470  SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3471  OpdsMapping[0] = &ValMap;
3472  if (MI.getOpcode() == AMDGPU::G_FREEZE)
3473  OpdsMapping[1] = &ValMap;
3474 
3475  return getInstructionMapping(
3476  1, /*Cost*/ 1,
3477  /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3478  }
3479 
3480  if (MI.isRegSequence()) {
3481  // If any input is a VGPR, the result must be a VGPR. The default handling
3482  // assumes any copy between banks is legal.
3483  unsigned BankID = AMDGPU::SGPRRegBankID;
3484 
3485  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3486  auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3487  // It doesn't make sense to use vcc or scc banks here, so just ignore
3488  // them.
3489  if (OpBank != AMDGPU::SGPRRegBankID) {
3490  BankID = AMDGPU::VGPRRegBankID;
3491  break;
3492  }
3493  }
3494  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3495 
3496  const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3497  return getInstructionMapping(
3498  1, /*Cost*/ 1,
3499  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3500  }
3501 
3502  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3503  // properly.
3504  //
3505  // TODO: There are additional exec masking dependencies to analyze.
3506  if (MI.getOpcode() == TargetOpcode::G_PHI) {
3507  unsigned ResultBank = AMDGPU::InvalidRegBankID;
3508  Register DstReg = MI.getOperand(0).getReg();
3509 
3510  // Sometimes the result may have already been assigned a bank.
3511  if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3512  ResultBank = DstBank->getID();
3513 
3514  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3515  Register Reg = MI.getOperand(I).getReg();
3516  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3517 
3518  // FIXME: Assuming VGPR for any undetermined inputs.
3519  if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3520  ResultBank = AMDGPU::VGPRRegBankID;
3521  break;
3522  }
3523 
3524  // FIXME: Need to promote SGPR case to s32
3525  unsigned OpBank = Bank->getID();
3526  ResultBank = regBankBoolUnion(ResultBank, OpBank);
3527  }
3528 
3529  assert(ResultBank != AMDGPU::InvalidRegBankID);
3530 
3531  unsigned Size = MRI.getType(DstReg).getSizeInBits();
3532 
3533  const ValueMapping &ValMap =
3534  getValueMapping(0, Size, getRegBank(ResultBank));
3535  return getInstructionMapping(
3536  1, /*Cost*/ 1,
3537  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3538  }
3539 
3540  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3541  if (Mapping.isValid())
3542  return Mapping;
3543 
3544  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3545 
3546  switch (MI.getOpcode()) {
3547  default:
3548  return getInvalidInstructionMapping();
3549 
3550  case AMDGPU::G_AND:
3551  case AMDGPU::G_OR:
3552  case AMDGPU::G_XOR: {
3553  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3554  if (Size == 1) {
3555  const RegisterBank *DstBank
3556  = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3557 
3558  unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3559  unsigned BankLHS = AMDGPU::InvalidRegBankID;
3560  unsigned BankRHS = AMDGPU::InvalidRegBankID;
3561  if (DstBank) {
3562  TargetBankID = DstBank->getID();
3563  if (DstBank == &AMDGPU::VCCRegBank) {
3564  TargetBankID = AMDGPU::VCCRegBankID;
3565  BankLHS = AMDGPU::VCCRegBankID;
3566  BankRHS = AMDGPU::VCCRegBankID;
3567  } else {
3568  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3569  AMDGPU::SGPRRegBankID);
3570  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3571  AMDGPU::SGPRRegBankID);
3572  }
3573  } else {
3574  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3575  AMDGPU::VCCRegBankID);
3576  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3577  AMDGPU::VCCRegBankID);
3578 
3579  // Both inputs should be true booleans to produce a boolean result.
3580  if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3581  TargetBankID = AMDGPU::VGPRRegBankID;
3582  } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3583  TargetBankID = AMDGPU::VCCRegBankID;
3584  BankLHS = AMDGPU::VCCRegBankID;
3585  BankRHS = AMDGPU::VCCRegBankID;
3586  } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3587  TargetBankID = AMDGPU::SGPRRegBankID;
3588  }
3589  }
3590 
3591  OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3592  OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3593  OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3594  break;
3595  }
3596 
3597  if (Size == 64) {
3598 
3599  if (isSALUMapping(MI)) {
3600  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3601  OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3602  } else {
3603  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3604  unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3605  OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3606 
3607  unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3608  OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3609  }
3610 
3611  break;
3612  }
3613 
3614  LLVM_FALLTHROUGH;
3615  }
3616  case AMDGPU::G_PTR_ADD:
3617  case AMDGPU::G_PTRMASK:
3618  case AMDGPU::G_ADD:
3619  case AMDGPU::G_SUB:
3620  case AMDGPU::G_MUL:
3621  case AMDGPU::G_SHL:
3622  case AMDGPU::G_LSHR:
3623  case AMDGPU::G_ASHR:
3624  case AMDGPU::G_UADDO:
3625  case AMDGPU::G_USUBO:
3626  case AMDGPU::G_UADDE:
3627  case AMDGPU::G_SADDE:
3628  case AMDGPU::G_USUBE:
3629  case AMDGPU::G_SSUBE:
3630  case AMDGPU::G_SMIN:
3631  case AMDGPU::G_SMAX:
3632  case AMDGPU::G_UMIN:
3633  case AMDGPU::G_UMAX:
3634  case AMDGPU::G_ABS:
3635  case AMDGPU::G_SHUFFLE_VECTOR:
3636  case AMDGPU::G_SBFX:
3637  case AMDGPU::G_UBFX:
3638  if (isSALUMapping(MI))
3639  return getDefaultMappingSOP(MI);
3640  return getDefaultMappingVOP(MI);
3641 
3642  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3643  case AMDGPU::G_SSUBSAT:
3644  case AMDGPU::G_UADDSAT:
3645  case AMDGPU::G_USUBSAT:
3646  case AMDGPU::G_FADD:
3647  case AMDGPU::G_FSUB:
3648  case AMDGPU::G_FPTOSI:
3649  case AMDGPU::G_FPTOUI:
3650  case AMDGPU::G_FMUL:
3651  case AMDGPU::G_FMA:
3652  case AMDGPU::G_FMAD:
3653  case AMDGPU::G_FSQRT:
3654  case AMDGPU::G_FFLOOR:
3655  case AMDGPU::G_FCEIL:
3656  case AMDGPU::G_FRINT:
3657  case AMDGPU::G_SITOFP:
3658  case AMDGPU::G_UITOFP:
3659  case AMDGPU::G_FPTRUNC:
3660  case AMDGPU::G_FPEXT:
3661  case AMDGPU::G_FEXP2:
3662  case AMDGPU::G_FLOG2:
3663  case AMDGPU::G_FMINNUM:
3664  case AMDGPU::G_FMAXNUM:
3665  case AMDGPU::G_FMINNUM_IEEE:
3666  case AMDGPU::G_FMAXNUM_IEEE:
3667  case AMDGPU::G_FCANONICALIZE:
3668  case AMDGPU::G_INTRINSIC_TRUNC:
3669  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3670  case AMDGPU::G_FSHR: // TODO: Expand for scalar
3671  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3672  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3673  case AMDGPU::G_AMDGPU_RCP_IFLAG:
3674  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3675  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3676  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3677  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3678  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3679  case AMDGPU::G_AMDGPU_SMED3:
3680  return getDefaultMappingVOP(MI);
3681  case AMDGPU::G_UMULH:
3682  case AMDGPU::G_SMULH: {
3683  if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3684  return getDefaultMappingSOP(MI);
3685  return getDefaultMappingVOP(MI);
3686  }
3687  case AMDGPU::G_IMPLICIT_DEF: {
3688  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3689  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3690  break;
3691  }
3692  case AMDGPU::G_FCONSTANT:
3693  case AMDGPU::G_CONSTANT:
3694  case AMDGPU::G_GLOBAL_VALUE:
3695  case AMDGPU::G_BLOCK_ADDR:
3696  case AMDGPU::G_READCYCLECOUNTER: {
3697  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3698  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3699  break;
3700  }
3701  case AMDGPU::G_FRAME_INDEX: {
3702  // TODO: This should be the same as other constants, but eliminateFrameIndex
3703  // currently assumes VALU uses.
3704  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3705  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3706  break;
3707  }
3708  case AMDGPU::G_DYN_STACKALLOC: {
3709  // Result is always uniform, and a wave reduction is needed for the source.
3710  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3711  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3712  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3713  break;
3714  }
3715  case AMDGPU::G_AMDGPU_WAVE_ADDRESS: {
3716  // This case is weird because we expect a physical register in the source,
3717  // but need to set a bank anyway.
3718  //
3719  // We could select the result to SGPR or VGPR, but for the one current use
3720  // it's more practical to always use VGPR.
3721  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
3722  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3723  break;
3724  }
3725  case AMDGPU::G_INSERT: {
3726  unsigned BankID = getMappingType(MRI, MI);
3727  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3728  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3729  unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3730  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3731  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3732  OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3733  OpdsMapping[3] = nullptr;
3734  break;
3735  }
3736  case AMDGPU::G_EXTRACT: {
3737  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3738  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3739  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3740  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3741  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3742  OpdsMapping[2] = nullptr;
3743  break;
3744  }
3745  case AMDGPU::G_BUILD_VECTOR:
3746  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3747  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3748  if (DstTy == LLT::fixed_vector(2, 16)) {
3749  unsigned DstSize = DstTy.getSizeInBits();
3750  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3751  unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3752  unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3753  unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3754 
3755  OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3756  OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3757  OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3758  break;
3759  }
3760 
3761  LLVM_FALLTHROUGH;
3762  }
3763  case AMDGPU::G_MERGE_VALUES:
3764  case AMDGPU::G_CONCAT_VECTORS: {
3765  unsigned Bank = getMappingType(MRI, MI);
3766  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3767  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3768 
3769  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3770  // Op1 and Dst should use the same register bank.
3771  for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3772  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3773  break;
3774  }
3775  case AMDGPU::G_BITREVERSE:
3776  case AMDGPU::G_BITCAST:
3777  case AMDGPU::G_INTTOPTR:
3778  case AMDGPU::G_PTRTOINT:
3779  case AMDGPU::G_FABS:
3780  case AMDGPU::G_FNEG: {
3781  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3782  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3783  OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3784  break;
3785  }
3786  case AMDGPU::G_AMDGPU_FFBH_U32:
3787  case AMDGPU::G_AMDGPU_FFBL_B32:
3788  case AMDGPU::G_CTLZ_ZERO_UNDEF:
3789  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
3790  unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3791  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3792  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3793  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(BankID, Size);
3794  break;
3795  }
3796  case AMDGPU::G_CTPOP: {
3797  unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3798  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3799  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3800 
3801  // This should really be getValueMappingSGPR64Only, but allowing the generic
3802  // code to handle the register split just makes using LegalizerHelper more
3803  // difficult.
3804  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3805  break;
3806  }
3807  case AMDGPU::G_TRUNC: {
3808  Register Dst = MI.getOperand(0).getReg();
3809  Register Src = MI.getOperand(1).getReg();
3810  unsigned Bank = getRegBankID(Src, MRI);
3811  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3812  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3813  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3814  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3815  break;
3816  }
3817  case AMDGPU::G_ZEXT:
3818  case AMDGPU::G_SEXT:
3819  case AMDGPU::G_ANYEXT:
3820  case AMDGPU::G_SEXT_INREG: {
3821  Register Dst = MI.getOperand(0).getReg();
3822  Register Src = MI.getOperand(1).getReg();
3823  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3824  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3825 
3826  unsigned DstBank;
3827  const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3828  assert(SrcBank);
3829  switch (SrcBank->getID()) {
3830  case AMDGPU::SGPRRegBankID:
3831  DstBank = AMDGPU::SGPRRegBankID;
3832  break;
3833  default:
3834  DstBank = AMDGPU::VGPRRegBankID;
3835  break;
3836  }
3837 
3838  // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3839  // 32-bits, and then to 64.
3840  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3841  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3842  SrcSize);
3843  break;
3844  }
3845  case AMDGPU::G_FCMP: {
3846  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3847  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3848  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3849  OpdsMapping[1] = nullptr; // Predicate Operand.
3850  OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3851  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3852  break;
3853  }
3854  case AMDGPU::G_STORE: {
3855  assert(MI.getOperand(0).isReg());
3856  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3857 
3858  // FIXME: We need to specify a different reg bank once scalar stores are
3859  // supported.
3860  const ValueMapping *ValMapping =
3861  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3862  OpdsMapping[0] = ValMapping;
3863  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3864  break;
3865  }
3866  case AMDGPU::G_ICMP: {
3867  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3868  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3869 
3870  // See if the result register has already been constrained to vcc, which may
3871  // happen due to control flow intrinsic lowering.
3872  unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3873  AMDGPU::SGPRRegBankID);
3874  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3875  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3876 
3877  bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3878  Op2Bank == AMDGPU::SGPRRegBankID &&
3879  Op3Bank == AMDGPU::SGPRRegBankID &&
3880  (Size == 32 || (Size == 64 &&
3881  (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3882  Subtarget.hasScalarCompareEq64()));
3883 
3884  DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3885  unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3886 
3887  // TODO: Use 32-bit for scalar output size.
3888  // SCC results will need to be copied to a 32-bit SGPR virtual register.
3889  const unsigned ResultSize = 1;
3890 
3891  OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3892  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3893  OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3894  break;
3895  }
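// For instance, a 32-bit G_ICMP eq whose destination and both sources are
// currently SGPRs keeps the scalar (SCC) form: the result and sources stay in
// the SGPR bank. Any VGPR input, or a 64-bit compare the subtarget cannot do
// on the SALU, falls back to a VCC-bank result with VGPR sources.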
3896  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3897  // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
3898  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3899  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3900  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3901  unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3902  unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3903  unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
3904 
3905  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3906  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3907 
3908  // The index can be in either bank if the source vector is VGPR.
3909  OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3910  break;
3911  }
3912  case AMDGPU::G_INSERT_VECTOR_ELT: {
3913  unsigned OutputBankID = isSALUMapping(MI) ?
3914  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3915 
3916  unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3917  unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3918  unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3919  unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3920  unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3921 
3922  OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3923  OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3924 
3925  // This is a weird case, because we need to break down the mapping based on
3926  // the register bank of a different operand.
3927  if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3928  OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3929  InsertSize);
3930  } else {
3931  assert(InsertSize == 32 || InsertSize == 64);
3932  OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3933  }
3934 
3935  // The index can be in either bank if the source vector is VGPR.
3936  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3937  break;
3938  }
3939  case AMDGPU::G_UNMERGE_VALUES: {
3940  unsigned Bank = getMappingType(MRI, MI);
3941 
3942  // Op1 and Dst should use the same register bank.
3943  // FIXME: Shouldn't this be the default? Why do we need to handle this?
3944  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3945  unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3946  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3947  }
3948  break;
3949  }
3950  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3951  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3952  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3953  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3954  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3955  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3956  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3957  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3958  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3959  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3960  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3961  case AMDGPU::G_AMDGPU_BUFFER_STORE:
3962  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3963  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3964  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3965  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3966  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3967 
3968  // rsrc
3969  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3970 
3971  // vindex
3972  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3973 
3974  // voffset
3975  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3976 
3977  // soffset
3978  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3979 
3980  // Any remaining operands are immediates and were correctly null
3981  // initialized.
3982  break;
3983  }
3984  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3985  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3986  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3987  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3988  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3989  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3990  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3991  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3992  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3993  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3994  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3995  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3996  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3997  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3998  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3999  // vdata_out
4000  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4001 
4002  // vdata_in
4003  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4004 
4005  // rsrc
4006  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4007 
4008  // vindex
4009  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4010 
4011  // voffset
4012  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4013 
4014  // soffset
4015  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4016 
4017  // Any remaining operands are immediates and were correctly null
4018  // initialized.
4019  break;
4020  }
4021  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
4022  // vdata_out
4023  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4024 
4025  // vdata_in
4026  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4027 
4028  // cmp
4029  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4030 
4031  // rsrc
4032  OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4033 
4034  // vindex
4035  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4036 
4037  // voffset
4038  OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4039 
4040  // soffset
4041  OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4042 
4043  // Any remaining operands are immediates and were correctly null
4044  // initialized.
4045  break;
4046  }
4047  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
4048  // Lie and claim everything is legal, even though some need to be
4049  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4050  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4051  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4052 
4053  // We need to convert this to a MUBUF if either the resource or offset is
4054  // VGPR.
4055  unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
4056  unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
4057  unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
4058 
4059  unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4060  OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
4061  break;
4062  }
4063  case AMDGPU::G_INTRINSIC: {
4064  switch (MI.getIntrinsicID()) {
4065  default:
4066  return getInvalidInstructionMapping();
4067  case Intrinsic::amdgcn_div_fmas:
4068  case Intrinsic::amdgcn_div_fixup:
4069  case Intrinsic::amdgcn_trig_preop:
4070  case Intrinsic::amdgcn_sin:
4071  case Intrinsic::amdgcn_cos:
4072  case Intrinsic::amdgcn_log_clamp:
4073  case Intrinsic::amdgcn_rcp:
4074  case Intrinsic::amdgcn_rcp_legacy:
4075  case Intrinsic::amdgcn_sqrt:
4076  case Intrinsic::amdgcn_rsq:
4077  case Intrinsic::amdgcn_rsq_legacy:
4078  case Intrinsic::amdgcn_rsq_clamp:
4079  case Intrinsic::amdgcn_fmul_legacy:
4080  case Intrinsic::amdgcn_fma_legacy:
4081  case Intrinsic::amdgcn_ldexp:
4082  case Intrinsic::amdgcn_frexp_mant:
4083  case Intrinsic::amdgcn_frexp_exp:
4084  case Intrinsic::amdgcn_fract:
4085  case Intrinsic::amdgcn_cvt_pkrtz:
4086  case Intrinsic::amdgcn_cvt_pknorm_i16:
4087  case Intrinsic::amdgcn_cvt_pknorm_u16:
4088  case Intrinsic::amdgcn_cvt_pk_i16:
4089  case Intrinsic::amdgcn_cvt_pk_u16:
4090  case Intrinsic::amdgcn_fmed3:
4091  case Intrinsic::amdgcn_cubeid:
4092  case Intrinsic::amdgcn_cubema:
4093  case Intrinsic::amdgcn_cubesc:
4094  case Intrinsic::amdgcn_cubetc:
4095  case Intrinsic::amdgcn_sffbh:
4096  case Intrinsic::amdgcn_fmad_ftz:
4097  case Intrinsic::amdgcn_mbcnt_lo:
4098  case Intrinsic::amdgcn_mbcnt_hi:
4099  case Intrinsic::amdgcn_mul_u24:
4100  case Intrinsic::amdgcn_mul_i24:
4101  case Intrinsic::amdgcn_mulhi_u24:
4102  case Intrinsic::amdgcn_mulhi_i24:
4103  case Intrinsic::amdgcn_lerp:
4104  case Intrinsic::amdgcn_sad_u8:
4105  case Intrinsic::amdgcn_msad_u8:
4106  case Intrinsic::amdgcn_sad_hi_u8:
4107  case Intrinsic::amdgcn_sad_u16:
4108  case Intrinsic::amdgcn_qsad_pk_u16_u8:
4109  case Intrinsic::amdgcn_mqsad_pk_u16_u8:
4110  case Intrinsic::amdgcn_mqsad_u32_u8:
4111  case Intrinsic::amdgcn_cvt_pk_u8_f32:
4112  case Intrinsic::amdgcn_alignbyte:
4113  case Intrinsic::amdgcn_perm:
4114  case Intrinsic::amdgcn_fdot2:
4115  case Intrinsic::amdgcn_sdot2:
4116  case Intrinsic::amdgcn_udot2:
4117  case Intrinsic::amdgcn_sdot4:
4118  case Intrinsic::amdgcn_udot4:
4119  case Intrinsic::amdgcn_sdot8:
4120  case Intrinsic::amdgcn_udot8:
4121  return getDefaultMappingVOP(MI);
4122  case Intrinsic::amdgcn_sbfe:
4123  case Intrinsic::amdgcn_ubfe:
4124  if (isSALUMapping(MI))
4125  return getDefaultMappingSOP(MI);
4126  return getDefaultMappingVOP(MI);
4127  case Intrinsic::amdgcn_ds_swizzle:
4128  case Intrinsic::amdgcn_ds_permute:
4129  case Intrinsic::amdgcn_ds_bpermute:
4130  case Intrinsic::amdgcn_update_dpp:
4131  case Intrinsic::amdgcn_mov_dpp8:
4132  case Intrinsic::amdgcn_mov_dpp:
4133  case Intrinsic::amdgcn_strict_wwm:
4134  case Intrinsic::amdgcn_wwm:
4135  case Intrinsic::amdgcn_strict_wqm:
4136  case Intrinsic::amdgcn_wqm:
4137  case Intrinsic::amdgcn_softwqm:
4138  case Intrinsic::amdgcn_set_inactive:
4139  return getDefaultMappingAllVGPR(MI);
4140  case Intrinsic::amdgcn_kernarg_segment_ptr:
4141  case Intrinsic::amdgcn_s_getpc:
4142  case Intrinsic::amdgcn_groupstaticsize:
4143  case Intrinsic::amdgcn_reloc_constant:
4144  case Intrinsic::returnaddress: {
4145  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4146  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4147  break;
4148  }
4149  case Intrinsic::amdgcn_wqm_vote: {
4150  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4151  OpdsMapping[0] = OpdsMapping[2]
4152  = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
4153  break;
4154  }
4155  case Intrinsic::amdgcn_ps_live: {
4156  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4157  break;
4158  }
4159  case Intrinsic::amdgcn_div_scale: {
4160  unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4161  unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
4162  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
4163  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
4164 
4165  unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
4166  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4167  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4168  break;
4169  }
4170  case Intrinsic::amdgcn_class: {
4171  Register Src0Reg = MI.getOperand(2).getReg();
4172  Register Src1Reg = MI.getOperand(3).getReg();
4173  unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
4174  unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
4175  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4176  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
4177  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
4178  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
4179  break;
4180  }
4181  case Intrinsic::amdgcn_icmp:
4182  case Intrinsic::amdgcn_fcmp: {
4183  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4184  // This is not VCCRegBank because this is not used in boolean contexts.
4185  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4186  unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4187  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4188  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4189  break;
4190  }
4191  case Intrinsic::amdgcn_readlane: {
4192  // This must be an SGPR, but accept a VGPR.
4193  Register IdxReg = MI.getOperand(3).getReg();
4194  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4195  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4196  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4197  LLVM_FALLTHROUGH;
4198  }
4199  case Intrinsic::amdgcn_readfirstlane: {
4200  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4201  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4202  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4203  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4204  break;
4205  }
4206  case Intrinsic::amdgcn_writelane: {
4207  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4208  Register SrcReg = MI.getOperand(2).getReg();
4209  unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4210  unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4211  Register IdxReg = MI.getOperand(3).getReg();
4212  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4213  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4214  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4215 
4216  // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4217  // to legalize.
4218  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4219  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4220  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4221  break;
4222  }
4223  case Intrinsic::amdgcn_if_break: {
4224  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4225  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4226  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4227  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4228  break;
4229  }
4230  case Intrinsic::amdgcn_permlane16:
4231  case Intrinsic::amdgcn_permlanex16: {
4232  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4233  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4234  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4235  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4236  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4237  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4238  break;
4239  }
4240  case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4241  case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4242  case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4243  case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4244  case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4245  case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4246  case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4247  case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4248  case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4249  case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4250  case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4251  case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4252  case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4253  case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4254  case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4255  case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4256  case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4257  case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4258  case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4259  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4260  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4261  case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4262  case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4263  case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4264  case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4265  case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4266  case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4267  case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4268  case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4269  case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4270  case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
4271  // Default for MAI intrinsics.
4272  // srcC can also be an immediate which can be folded later.
4273  // FIXME: Should we eventually add an alternative mapping with AGPR src
4274  // for srcA/srcB?
4275  //
4276  // vdst, srcA, srcB, srcC
4277  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4278  OpdsMapping[0] =
4279  Info->mayNeedAGPRs()
4280  ? getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI)
4281  : getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4282  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4283  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4284  OpdsMapping[4] =
4285  Info->mayNeedAGPRs()
4286  ? getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI)
4287  : getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4288  break;
4289  }
4290  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
4291  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
4292  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
4293  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
4294  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
4295  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8: {
4296  // vdst, srcA, srcB, srcC, idx
4297  OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4298  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4299  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4300  OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4301  OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4302  break;
4303  }
4304  case Intrinsic::amdgcn_interp_p1:
4305  case Intrinsic::amdgcn_interp_p2:
4306  case Intrinsic::amdgcn_interp_mov:
4307  case Intrinsic::amdgcn_interp_p1_f16:
4308  case Intrinsic::amdgcn_interp_p2_f16: {
4309  const int M0Idx = MI.getNumOperands() - 1;
4310  Register M0Reg = MI.getOperand(M0Idx).getReg();
4311  unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4312  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4313 
4314  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4315  for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4316  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4317 
4318  // Must be SGPR, but we must take whatever the original bank is and fix it
4319  // later.
4320  OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4321  break;
4322  }
4323  case Intrinsic::amdgcn_ballot: {
4324  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4325  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4326  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4327  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4328  break;
4329  }
4330  }
4331  break;
4332  }
4333  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4334  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4335  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4336  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4337  auto IntrID = MI.getIntrinsicID();
4338  const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4339  assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4340  // Non-images can have complications from operands that allow both SGPR
4341  // and VGPR. For now it's too complicated to figure out the final opcode
4342  // to derive the register bank from the MCInstrDesc.
4343  assert(RSrcIntrin->IsImage);
4344  return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4345  }
4346  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4347  unsigned N = MI.getNumExplicitOperands() - 2;
4348  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4349  OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4350  if (N == 3) {
4351  // Sequential form: all operands combined into VGPR256/VGPR512
4352  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4353  if (Size > 256)
4354  Size = 512;
4355  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4356  } else {
4357  // NSA form
4358  for (unsigned I = 2; I < N; ++I)
4359  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4360  }
4361  break;
4362  }
4363  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4364  auto IntrID = MI.getIntrinsicID();
4365  switch (IntrID) {
4366  case Intrinsic::amdgcn_s_getreg:
4367  case Intrinsic::amdgcn_s_memtime:
4368  case Intrinsic::amdgcn_s_memrealtime:
4369  case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4370  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4371  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4372  break;
4373  }
4374  case Intrinsic::amdgcn_global_atomic_fadd:
4375  case Intrinsic::amdgcn_global_atomic_csub:
4376  case Intrinsic::amdgcn_global_atomic_fmin:
4377  case Intrinsic::amdgcn_global_atomic_fmax:
4378  case Intrinsic::amdgcn_flat_atomic_fadd:
4379  case Intrinsic::amdgcn_flat_atomic_fmin:
4380  case Intrinsic::amdgcn_flat_atomic_fmax:
4381  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
4382  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
4383  return getDefaultMappingAllVGPR(MI);
4384  case Intrinsic::amdgcn_ds_ordered_add:
4385  case Intrinsic::amdgcn_ds_ordered_swap: {
4386  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4387  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4388  unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4389  AMDGPU::SGPRRegBankID);
4390  OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4391  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4392  break;
4393  }
4394  case Intrinsic::amdgcn_ds_append:
4395  case Intrinsic::amdgcn_ds_consume: {
4396  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4397  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4398  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4399  break;
4400  }
4401  case Intrinsic::amdgcn_exp_compr:
4402  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4403  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4404  break;
4405  case Intrinsic::amdgcn_exp:
4406  // FIXME: Could we support packed types here?
4407  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4408  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4409  OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4410  OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4411  break;
4412  case Intrinsic::amdgcn_s_sendmsg:
4413  case Intrinsic::amdgcn_s_sendmsghalt: {
4414  // This must be an SGPR, but accept a VGPR.
4415  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4416  AMDGPU::SGPRRegBankID);
4417  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4418  break;
4419  }
4420  case Intrinsic::amdgcn_s_setreg: {
4421  // This must be an SGPR, but accept a VGPR.
4422  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4423  AMDGPU::SGPRRegBankID);
4424  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4425  break;
4426  }
4427  case Intrinsic::amdgcn_end_cf: {
4428  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4429  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4430  break;
4431  }
4432  case Intrinsic::amdgcn_else: {
4433  unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4434  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4435  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4436  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4437  break;
4438  }
4439  case Intrinsic::amdgcn_live_mask: {
4440  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4441  break;
4442  }
4443  case Intrinsic::amdgcn_wqm_demote:
4444  case Intrinsic::amdgcn_kill: {
4445  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4446  break;
4447  }
4448  case Intrinsic::amdgcn_raw_buffer_load:
4449  case Intrinsic::amdgcn_raw_tbuffer_load: {
4450  // FIXME: Should make intrinsic ID the last operand of the instruction,
4451  // then this would be the same as store
4452  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4453  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4454  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4455  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4456  break;
4457  }
4458  case Intrinsic::amdgcn_raw_buffer_load_lds: {
4459  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4460  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4461  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4462  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4463  break;
4464  }
4465  case Intrinsic::amdgcn_raw_buffer_store:
4466  case Intrinsic::amdgcn_raw_buffer_store_format:
4467  case Intrinsic::amdgcn_raw_tbuffer_store: {
4468  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4469  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4470  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4471  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4472  break;
4473  }
4474  case Intrinsic::amdgcn_struct_buffer_load:
4475  case Intrinsic::amdgcn_struct_tbuffer_load: {
4476  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4477  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4478  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4479  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4480  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4481  break;
4482  }
4483  case Intrinsic::amdgcn_struct_buffer_load_lds: {
4484  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4485  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4486  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4487  OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4488  OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
4489  break;
4490  }
4491  case Intrinsic::amdgcn_struct_buffer_store:
4492  case Intrinsic::amdgcn_struct_tbuffer_store: {
4493  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4494  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4495  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4496  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4497  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4498  break;
4499  }
4500  case Intrinsic::amdgcn_init_exec_from_input: {
4501  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4502  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4503  break;
4504  }
4505  case Intrinsic::amdgcn_ds_gws_init:
4506  case Intrinsic::amdgcn_ds_gws_barrier:
4507  case Intrinsic::amdgcn_ds_gws_sema_br: {
4508  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4509 
4510  // This must be an SGPR, but accept a VGPR.
4511  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4512  AMDGPU::SGPRRegBankID);
4513  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4514  break;
4515  }
4516  case Intrinsic::amdgcn_ds_gws_sema_v:
4517  case Intrinsic::amdgcn_ds_gws_sema_p:
4518  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4519  // This must be an SGPR, but accept a VGPR.
4520  unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4521  AMDGPU::SGPRRegBankID);
4522  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4523  break;
4524  }
4525  case Intrinsic::amdgcn_global_load_lds: {
4526  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4527  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4528  break;
4529  }
4530  default:
4531  return getInvalidInstructionMapping();
4532  }
4533  break;
4534  }
4535  case AMDGPU::G_SELECT: {
4536  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4537  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4538  AMDGPU::SGPRRegBankID);
4539  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4540  AMDGPU::SGPRRegBankID);
4541  bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4542  Op3Bank == AMDGPU::SGPRRegBankID;
4543 
4544  unsigned CondBankDefault = SGPRSrcs ?
4545  AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4546  unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4547  CondBankDefault);
4548  if (CondBank == AMDGPU::SGPRRegBankID)
4549  CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4550  else if (CondBank == AMDGPU::VGPRRegBankID)
4551  CondBank = AMDGPU::VCCRegBankID;
4552 
4553  unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4554  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4555 
4556  assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4557 
4558  // TODO: Should report 32-bit for scalar condition type.
4559  if (Size == 64) {
4560  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4561  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4562  OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4563  OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4564  } else {
4565  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4566  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4567  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4568  OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4569  }
4570 
4571  break;
4572  }
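// Illustrative summary of the resolution above: with both value operands in
// SGPRs and a condition that is SGPR (or still unassigned), the select stays
// fully scalar (SGPR result, SGPR condition). If either value operand is a
// VGPR, or the condition is already in the VGPR or VCC bank, the condition is
// reported as VCC and the result and value operands as VGPRs.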
4573 
4574  case AMDGPU::G_SI_CALL: {
4575  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
4576  // Lie and claim everything is legal, even though some need to be
4577  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
4578  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4579 
4580  // Allow anything for implicit arguments
4581  for (unsigned I = 4; I < MI.getNumOperands(); ++I) {
4582  if (MI.getOperand(I).isReg()) {
4583  Register Reg = MI.getOperand(I).getReg();
4584  auto OpBank = getRegBankID(Reg, MRI);
4585  unsigned Size = getSizeInBits(Reg, MRI, *TRI);
4586  OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
4587  }
4588  }
4589  break;
4590  }
4591  case AMDGPU::G_LOAD:
4592  case AMDGPU::G_ZEXTLOAD:
4593  case AMDGPU::G_SEXTLOAD:
4594  return getInstrMappingForLoad(MI);
4595 
4596  case AMDGPU::G_ATOMICRMW_XCHG:
4597  case AMDGPU::G_ATOMICRMW_ADD:
4598  case AMDGPU::G_ATOMICRMW_SUB:
4599  case AMDGPU::G_ATOMICRMW_AND:
4600  case AMDGPU::G_ATOMICRMW_OR:
4601  case AMDGPU::G_ATOMICRMW_XOR:
4602  case AMDGPU::G_ATOMICRMW_MAX:
4603  case AMDGPU::G_ATOMICRMW_MIN:
4604  case AMDGPU::G_ATOMICRMW_UMAX:
4605  case AMDGPU::G_ATOMICRMW_UMIN:
4606  case AMDGPU::G_ATOMICRMW_FADD:
4607  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4608  case AMDGPU::G_AMDGPU_ATOMIC_INC:
4609  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4610  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4611  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4612  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4613  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4614  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4615  break;
4616  }
4617  case AMDGPU::G_ATOMIC_CMPXCHG: {
4618  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4619  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4620  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4621  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4622  break;
4623  }
4624  case AMDGPU::G_BRCOND: {
4625  unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4626  AMDGPU::SGPRRegBankID);
4627  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4628  if (Bank != AMDGPU::SGPRRegBankID)
4629  Bank = AMDGPU::VCCRegBankID;
4630 
4631  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4632  break;
4633  }
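// Note: the s1 branch condition keeps an SGPR mapping only when its bank is
// (or defaults to) SGPR, i.e. a scalar condition; any other bank is normalized
// to VCC so the branch consumes a per-lane mask.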
4634  case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
4635  case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
4636  return getDefaultMappingVOP(MI);
4637  }
4638 
4639  return getInstructionMapping(/*ID*/1, /*Cost*/1,
4640  getOperandsMapping(OpdsMapping),
4641  MI.getNumOperands());
4642 }