1 //===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the RegisterBankInfo class for
10 /// AMDGPU.
11 ///
12 /// \par
13 ///
14 /// AMDGPU has unique register bank constraints that require special high level
15 /// strategies to deal with. There are two main true physical register banks:
16 /// VGPR (vector) and SGPR (scalar). Additionally, the VCC register bank is a
17 /// sort of pseudo-register bank needed to represent SGPRs used in a vector
18 /// boolean context. There is also the AGPR bank, which is a special purpose
19 /// physical register bank present on some subtargets.
20 ///
21 /// Copying from VGPR to SGPR is generally illegal, unless the value is known to
22 /// be uniform. It is generally not valid to legalize operands by inserting
23 /// copies as on other targets. Operations which require uniform, SGPR operands
24 /// generally require scalarization by repeatedly executing the instruction,
25 /// activating each set of lanes using a unique set of input values. This is
26 /// referred to as a waterfall loop.
27 ///
28 /// \par Booleans
29 ///
30 /// Booleans (s1 values) require special consideration. A vector compare result
31 /// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
32 /// register. These are represented with the VCC bank. During selection, we need
33 /// to be able to unambiguously go back from a register class to a register
34 /// bank. To distinguish whether an SGPR should use the SGPR or VCC register
35 /// bank, we need to know the use context type. An SGPR s1 value always means a
36 /// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
37 /// SCC, which is a 1-bit unaddressable register. This will need to be copied to
38 /// a 32-bit virtual register. Taken together, this means we need to adjust the
39 /// type of boolean operations to be regbank legal. All SALU booleans need to be
40 /// widened to 32-bits, and all VALU booleans need to be s1 values.
41 ///
42 /// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
43 /// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
44 /// bank. A non-boolean source (such as a truncate from a 1-bit load from
45 /// memory) will require a copy to the VCC bank which will require clearing the
46 /// high bits and inserting a compare.
47 ///
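/// As a minimal illustrative sketch (not code from this pass), assuming
/// hypothetical registers UniformA/UniformB (SGPR) and DivA/DivB (VGPR), a
/// MachineIRBuilder B and a MachineRegisterInfo MRI, the two kinds of booleans
/// end up mapped as:
///
/// \code
///   // Uniform boolean: the SALU compare result is widened to a 32-bit SGPR.
///   auto UniCmp = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(32),
///                             UniformA, UniformB);
///   MRI.setRegBank(UniCmp.getReg(0), AMDGPU::SGPRRegBank);
///
///   // Divergent boolean: the s1 compare result is a lane mask in the VCC bank.
///   auto DivCmp = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), DivA, DivB);
///   MRI.setRegBank(DivCmp.getReg(0), AMDGPU::VCCRegBank);
/// \endcode
///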
48 /// \par Constant bus restriction
49 ///
50 /// VALU instructions have a limitation known as the constant bus
51 /// restriction. Most VALU instructions can use SGPR operands, but may read at
52 /// most 1 SGPR or constant literal value (this increases to 2 in gfx10 for most
53 /// instructions). This is one unique SGPR, so the same SGPR may be used for
54 /// multiple operands. From a register bank perspective, any combination of
55 /// operands should be legal as an SGPR, but this is contextually dependent on
56 /// the SGPR operands all being the same register. It is therefore optimal to
57 /// choose the SGPR with the most uses to minimize the number of copies.
58 ///
59 /// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
60 /// operation should have its source operands all mapped to VGPRs (except for
61 /// VCC), inserting copies from any SGPR operands. This is the most trivial legal
62 /// mapping. Anything beyond the simplest 1:1 instruction selection would be too
63 /// complicated to solve here. Every optimization pattern or instruction
64 /// selected to multiple outputs would have to enforce this rule, and there
65 /// would be additional complexity in tracking this rule for every G_*
66 /// operation. By forcing all inputs to VGPRs, it also simplifies the task of
67 /// picking the optimal operand combination from a post-isel optimization pass.
68 ///
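/// As a concrete illustration (not taken from this file), "v_add_f32 v0, s0, s1"
/// in its VOP3 form reads two distinct SGPRs and would violate the single
/// constant bus read limit on older subtargets, while "v_add_f32 v0, s0, s0"
/// (one unique SGPR) or "v_add_f32 v0, s0, v1" are fine.
///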
69 //===----------------------------------------------------------------------===//
70 
71 #include "AMDGPURegisterBankInfo.h"
72 
73 #include "AMDGPU.h"
74 #include "AMDGPUGlobalISelUtils.h"
75 #include "AMDGPUInstrInfo.h"
76 #include "GCNSubtarget.h"
77 #include "SIMachineFunctionInfo.h"
78 #include "SIRegisterInfo.h"
79 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
80 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
81 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
82 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
83 #include "llvm/IR/IntrinsicsAMDGPU.h"
84 
85 #define GET_TARGET_REGBANK_IMPL
86 #include "AMDGPUGenRegisterBank.inc"
87 
88 // This file will be TableGen'ed at some point.
89 #include "AMDGPUGenRegisterBankInfo.def"
90 
91 using namespace llvm;
92 using namespace MIPatternMatch;
93 
94 namespace {
95 
96 // Observer to apply a register bank to new registers created by LegalizerHelper.
97 class ApplyRegBankMapping final : public GISelChangeObserver {
98 private:
99  const AMDGPURegisterBankInfo &RBI;
100  MachineRegisterInfo &MRI;
101  const RegisterBank *NewBank;
102  SmallVector<MachineInstr *, 4> NewInsts;
103 
104 public:
105  ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
106  MachineRegisterInfo &MRI_, const RegisterBank *RB)
107  : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
108 
109  ~ApplyRegBankMapping() {
110  for (MachineInstr *MI : NewInsts)
111  applyBank(*MI);
112  }
113 
114  /// Set any registers that don't have a set register class or bank to SALU.
115  void applyBank(MachineInstr &MI) {
116  const unsigned Opc = MI.getOpcode();
117  if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
118  Opc == AMDGPU::G_SEXT) {
119  // LegalizerHelper wants to use the basic legalization artifacts when
120  // widening etc. We don't handle selection with vcc in artifact sources,
121  // so we need to use a select instead to handle these properly.
122  Register DstReg = MI.getOperand(0).getReg();
123  Register SrcReg = MI.getOperand(1).getReg();
124  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
125  if (SrcBank == &AMDGPU::VCCRegBank) {
126  const LLT S32 = LLT::scalar(32);
127  assert(MRI.getType(SrcReg) == LLT::scalar(1));
128  assert(MRI.getType(DstReg) == S32);
129  assert(NewBank == &AMDGPU::VGPRRegBank);
130 
131  // Replace the extension with a select, which really uses the boolean
132  // source.
133  MachineIRBuilder B(MI);
134  auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
135  auto False = B.buildConstant(S32, 0);
136  B.buildSelect(DstReg, SrcReg, True, False);
137  MRI.setRegBank(True.getReg(0), *NewBank);
138  MRI.setRegBank(False.getReg(0), *NewBank);
139  MI.eraseFromParent();
140  }
141 
142  assert(!MRI.getRegClassOrRegBank(DstReg));
143  MRI.setRegBank(DstReg, *NewBank);
144  return;
145  }
146 
147 #ifndef NDEBUG
148  if (Opc == AMDGPU::G_TRUNC) {
149  Register DstReg = MI.getOperand(0).getReg();
150  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
151  assert(DstBank != &AMDGPU::VCCRegBank);
152  }
153 #endif
154 
155  for (MachineOperand &Op : MI.operands()) {
156  if (!Op.isReg())
157  continue;
158 
159  // We may see physical registers if building a real MI
160  Register Reg = Op.getReg();
161  if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
162  continue;
163 
164  const RegisterBank *RB = NewBank;
165  if (MRI.getType(Reg) == LLT::scalar(1)) {
166  assert(NewBank == &AMDGPU::VGPRRegBank &&
167  "s1 operands should only be used for vector bools");
168  assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
169  MI.getOpcode() != AMDGPU::G_ANYEXT) &&
170  "not expecting legalization artifacts here");
171  RB = &AMDGPU::VCCRegBank;
172  }
173 
174  MRI.setRegBank(Reg, *RB);
175  }
176  }
177 
178  void erasingInstr(MachineInstr &MI) override {}
179 
180  void createdInstr(MachineInstr &MI) override {
181  // At this point, the instruction was just inserted and has no operands.
182  NewInsts.push_back(&MI);
183  }
184 
185  void changingInstr(MachineInstr &MI) override {}
186  void changedInstr(MachineInstr &MI) override {
187  // FIXME: In principle we should probably add the instruction to NewInsts,
188  // but the way the LegalizerHelper uses the observer, we will always see the
189  // registers we need to set the regbank on also referenced in a new
190  // instruction.
191  }
192 };
193 
194 }
195 AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
196  : AMDGPUGenRegisterBankInfo(),
197  Subtarget(ST),
198  TRI(Subtarget.getRegisterInfo()),
199  TII(Subtarget.getInstrInfo()) {
200 
201  // HACK: Until this is fully tablegen'd.
202  static llvm::once_flag InitializeRegisterBankFlag;
203 
204  static auto InitializeRegisterBankOnce = [this]() {
205  assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
206  &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
207  &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
208  (void)this;
209  };
210 
211  llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
212 }
213 
214 static bool isVectorRegisterBank(const RegisterBank &Bank) {
215  unsigned BankID = Bank.getID();
216  return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
217 }
218 
219 unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
220  const RegisterBank &Src,
221  unsigned Size) const {
222  // TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
223  if (Dst.getID() == AMDGPU::SGPRRegBankID &&
224  (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
225  return std::numeric_limits<unsigned>::max();
226  }
227 
228  // Bool values are tricky, because the meaning is based on context. The SCC
229  // and VCC banks are for the natural scalar and vector conditions produced by
230  // a compare.
231  //
232  // Legalization doesn't know about the necessary context, so an s1 use may
233  // have been a truncate from an arbitrary value, in which case a copy (lowered
234  // as a compare with 0) needs to be inserted.
235  if (Size == 1 &&
236  (Dst.getID() == AMDGPU::SGPRRegBankID) &&
237  (isVectorRegisterBank(Src) ||
238  Src.getID() == AMDGPU::SGPRRegBankID ||
239  Src.getID() == AMDGPU::VCCRegBankID))
240  return std::numeric_limits<unsigned>::max();
241 
242  // There is no direct copy between AGPRs.
243  if (Dst.getID() == AMDGPU::AGPRRegBankID &&
244  Src.getID() == AMDGPU::AGPRRegBankID)
245  return 4;
246 
247  return RegisterBankInfo::copyCost(Dst, Src, Size);
248 }
249 
250 unsigned AMDGPURegisterBankInfo::getBreakDownCost(
251  const ValueMapping &ValMapping,
252  const RegisterBank *CurBank) const {
253  // Check if this is a breakdown for G_LOAD to move the pointer from SGPR to
254  // VGPR.
255  // FIXME: Is there a better way to do this?
256  if (ValMapping.NumBreakDowns >= 2 || ValMapping.BreakDown[0].Length >= 64)
257  return 10; // This is expensive.
258 
259  assert(ValMapping.NumBreakDowns == 2 &&
260  ValMapping.BreakDown[0].Length == 32 &&
261  ValMapping.BreakDown[0].StartIdx == 0 &&
262  ValMapping.BreakDown[1].Length == 32 &&
263  ValMapping.BreakDown[1].StartIdx == 32 &&
264  ValMapping.BreakDown[0].RegBank == ValMapping.BreakDown[1].RegBank);
265 
266  // 32-bit extract of a 64-bit value is just access of a subregister, so free.
267  // TODO: Cost of 0 hits assert, though it's not clear it's what we really
268  // want.
269 
270  // TODO: 32-bit insert to a 64-bit SGPR may incur a non-free copy due to SGPR
271  // alignment restrictions, but this probably isn't important.
272  return 1;
273 }
274 
275 const RegisterBank &
276 AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
277  LLT Ty) const {
278  if (&RC == &AMDGPU::SReg_1RegClass)
279  return AMDGPU::VCCRegBank;
280 
281  // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
282  // VCC-like use.
283  if (TRI->isSGPRClass(&RC)) {
284  // FIXME: This probably came from a copy from a physical register, which
285  // should be inferable from the copied to-type. We don't have many boolean
286  // physical register constraints so just assume a normal SGPR for now.
287  if (!Ty.isValid())
288  return AMDGPU::SGPRRegBank;
289 
290  return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
291  }
292 
293  return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
294 }
295 
296 template <unsigned NumOps>
297 RegisterBankInfo::InstructionMappings
298 AMDGPURegisterBankInfo::addMappingFromTable(
299  const MachineInstr &MI, const MachineRegisterInfo &MRI,
300  const std::array<unsigned, NumOps> RegSrcOpIdx,
301  ArrayRef<OpRegBankEntry<NumOps>> Table) const {
302 
303  InstructionMappings AltMappings;
304 
305  SmallVector<const ValueMapping *, 10> Operands(MI.getNumOperands());
306 
307  unsigned Sizes[NumOps];
308  for (unsigned I = 0; I < NumOps; ++I) {
309  Register Reg = MI.getOperand(RegSrcOpIdx[I]).getReg();
310  Sizes[I] = getSizeInBits(Reg, MRI, *TRI);
311  }
312 
313  for (unsigned I = 0, E = MI.getNumExplicitDefs(); I != E; ++I) {
314  unsigned SizeI = getSizeInBits(MI.getOperand(I).getReg(), MRI, *TRI);
315  Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
316  }
317 
318  // getInstrMapping's default mapping uses ID 1, so start at 2.
319  unsigned MappingID = 2;
320  for (const auto &Entry : Table) {
321  for (unsigned I = 0; I < NumOps; ++I) {
322  int OpIdx = RegSrcOpIdx[I];
323  Operands[OpIdx] = AMDGPU::getValueMapping(Entry.RegBanks[I], Sizes[I]);
324  }
325 
326  AltMappings.push_back(&getInstructionMapping(MappingID++, Entry.Cost,
327  getOperandsMapping(Operands),
328  Operands.size()));
329  }
330 
331  return AltMappings;
332 }
333 
334 RegisterBankInfo::InstructionMappings
335 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
336  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
337  switch (MI.getIntrinsicID()) {
338  case Intrinsic::amdgcn_readlane: {
339  static const OpRegBankEntry<3> Table[2] = {
340  // Perfectly legal.
341  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
342 
343  // Need a readfirstlane for the index.
344  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
345  };
346 
347  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
348  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
349  }
350  case Intrinsic::amdgcn_writelane: {
351  static const OpRegBankEntry<4> Table[4] = {
352  // Perfectly legal.
353  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
354 
355  // Need readfirstlane of first op
356  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
357 
358  // Need readfirstlane of second op
359  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 },
360 
361  // Need readfirstlane of both ops
362  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 3 }
363  };
364 
365  // rsrc, voffset, offset
366  const std::array<unsigned, 4> RegSrcOpIdx = { { 0, 2, 3, 4 } };
367  return addMappingFromTable<4>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
368  }
369  default:
370  return RegisterBankInfo::getInstrAlternativeMappings(MI);
371  }
372 }
373 
374 RegisterBankInfo::InstructionMappings
375 AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
376  const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
377 
378  switch (MI.getIntrinsicID()) {
379  case Intrinsic::amdgcn_s_buffer_load: {
380  static const OpRegBankEntry<2> Table[4] = {
381  // Perfectly legal.
382  { { AMDGPU::SGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
383 
384  // Only need 1 register in loop
385  { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 300 },
386 
387  // Have to waterfall the resource.
388  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
389 
390  // Have to waterfall the resource, and the offset.
391  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1500 }
392  };
393 
394  // rsrc, offset
395  const std::array<unsigned, 2> RegSrcOpIdx = { { 2, 3 } };
396  return addMappingFromTable<2>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
397  }
398  case Intrinsic::amdgcn_ds_ordered_add:
399  case Intrinsic::amdgcn_ds_ordered_swap: {
400  // VGPR = M0, VGPR
401  static const OpRegBankEntry<3> Table[2] = {
402  // Perfectly legal.
403  { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
404 
405  // Need a readfirstlane for m0
406  { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 2 }
407  };
408 
409  const std::array<unsigned, 3> RegSrcOpIdx = { { 0, 2, 3 } };
410  return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
411  }
412  case Intrinsic::amdgcn_s_sendmsg:
413  case Intrinsic::amdgcn_s_sendmsghalt: {
414  // FIXME: Should have no register for immediate
415  static const OpRegBankEntry<1> Table[2] = {
416  // Perfectly legal.
417  { { AMDGPU::SGPRRegBankID }, 1 },
418 
419  // Need readlane
420  { { AMDGPU::VGPRRegBankID }, 3 }
421  };
422 
423  const std::array<unsigned, 1> RegSrcOpIdx = { { 2 } };
424  return addMappingFromTable<1>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
425  }
426  default:
427  return RegisterBankInfo::getInstrAlternativeMappings(MI);
428  }
429 }
430 
431 static bool memOpHasNoClobbered(const MachineMemOperand *MMO) {
432  const Instruction *I = dyn_cast_or_null<Instruction>(MMO->getValue());
433  return I && I->getMetadata("amdgpu.noclobber");
434 }
435 
436 // FIXME: Returns uniform if there's no source value information. This is
437 // probably wrong.
438 static bool isScalarLoadLegal(const MachineInstr &MI) {
439  if (!MI.hasOneMemOperand())
440  return false;
441 
442  const MachineMemOperand *MMO = *MI.memoperands_begin();
443  const unsigned AS = MMO->getAddrSpace();
444  const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
445  AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
446 
447  // There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
448  return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
449  // Can't do a scalar atomic load.
450  !MMO->isAtomic() &&
451  // Don't use scalar loads for volatile accesses to non-constant address
452  // spaces.
453  (IsConst || !MMO->isVolatile()) &&
454  // Memory must be known constant, or not written before this load.
455  (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
456  AMDGPUInstrInfo::isUniformMMO(MMO);
457 }
458 
459 RegisterBankInfo::InstructionMappings
460 AMDGPURegisterBankInfo::getInstrAlternativeMappings(
461  const MachineInstr &MI) const {
462 
463  const MachineFunction &MF = *MI.getParent()->getParent();
464  const MachineRegisterInfo &MRI = MF.getRegInfo();
465 
466 
467  InstructionMappings AltMappings;
468  switch (MI.getOpcode()) {
469  case TargetOpcode::G_CONSTANT: {
470  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
471  if (Size == 1) {
472  static const OpRegBankEntry<1> Table[3] = {
473  { { AMDGPU::VGPRRegBankID }, 1 },
474  { { AMDGPU::SGPRRegBankID }, 1 },
475  { { AMDGPU::VCCRegBankID }, 1 }
476  };
477 
478  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
479  }
480 
481  LLVM_FALLTHROUGH;
482  }
483  case TargetOpcode::G_FCONSTANT:
484  case TargetOpcode::G_FRAME_INDEX:
485  case TargetOpcode::G_GLOBAL_VALUE: {
486  static const OpRegBankEntry<1> Table[2] = {
487  { { AMDGPU::VGPRRegBankID }, 1 },
488  { { AMDGPU::SGPRRegBankID }, 1 }
489  };
490 
491  return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
492  }
493  case TargetOpcode::G_AND:
494  case TargetOpcode::G_OR:
495  case TargetOpcode::G_XOR: {
496  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
497 
498  if (Size == 1) {
499  // s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
500  const InstructionMapping &SCCMapping = getInstructionMapping(
501  1, 1, getOperandsMapping(
502  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
503  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
504  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
505  3); // Num Operands
506  AltMappings.push_back(&SCCMapping);
507 
508  const InstructionMapping &VCCMapping0 = getInstructionMapping(
509  2, 1, getOperandsMapping(
510  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
511  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
512  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
513  3); // Num Operands
514  AltMappings.push_back(&VCCMapping0);
515  return AltMappings;
516  }
517 
518  if (Size != 64)
519  break;
520 
521  const InstructionMapping &SSMapping = getInstructionMapping(
522  1, 1, getOperandsMapping(
523  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
524  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
525  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
526  3); // Num Operands
527  AltMappings.push_back(&SSMapping);
528 
529  const InstructionMapping &VVMapping = getInstructionMapping(
530  2, 2, getOperandsMapping(
531  {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
532  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
533  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
534  3); // Num Operands
535  AltMappings.push_back(&VVMapping);
536  break;
537  }
538  case TargetOpcode::G_LOAD:
539  case TargetOpcode::G_ZEXTLOAD:
540  case TargetOpcode::G_SEXTLOAD: {
541  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
542  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
543  unsigned PtrSize = PtrTy.getSizeInBits();
544  unsigned AS = PtrTy.getAddressSpace();
545 
546  if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
547  AS != AMDGPUAS::PRIVATE_ADDRESS) &&
548  isScalarLoadLegal(MI)) {
549  const InstructionMapping &SSMapping = getInstructionMapping(
550  1, 1, getOperandsMapping(
551  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
552  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
553  2); // Num Operands
554  AltMappings.push_back(&SSMapping);
555  }
556 
557  const InstructionMapping &VVMapping = getInstructionMapping(
558  2, 1,
559  getOperandsMapping(
560  {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
561  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
562  2); // Num Operands
563  AltMappings.push_back(&VVMapping);
564 
565  // It may be possible to have a vgpr = load sgpr mapping here, because
566  // the mubuf instructions support this kind of load, but probably for only
567  // gfx7 and older. However, the addressing mode matching in the instruction
568  // selector should be able to do a better job of detecting and selecting
569  // these kinds of loads from the vgpr = load vgpr mapping.
570 
571  return AltMappings;
572 
573  }
574  case TargetOpcode::G_SELECT: {
575  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
576  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
577  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
578  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
579  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
580  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
581  4); // Num Operands
582  AltMappings.push_back(&SSMapping);
583 
584  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
585  getOperandsMapping({AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
586  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
587  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
588  AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
589  4); // Num Operands
590  AltMappings.push_back(&VVMapping);
591 
592  return AltMappings;
593  }
594  case TargetOpcode::G_UADDE:
595  case TargetOpcode::G_USUBE:
596  case TargetOpcode::G_SADDE:
597  case TargetOpcode::G_SSUBE: {
598  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
599  const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
600  getOperandsMapping(
601  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
602  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
603  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
604  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
605  AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
606  5); // Num Operands
607  AltMappings.push_back(&SSMapping);
608 
609  const InstructionMapping &VVMapping = getInstructionMapping(2, 1,
610  getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
611  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
612  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
613  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
614  AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1)}),
615  5); // Num Operands
616  AltMappings.push_back(&VVMapping);
617  return AltMappings;
618  }
619  case AMDGPU::G_BRCOND: {
620  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
621 
622  // TODO: Change type to 32 for scalar
623  const InstructionMapping &SMapping = getInstructionMapping(
624  1, 1, getOperandsMapping(
625  {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
626  2); // Num Operands
627  AltMappings.push_back(&SMapping);
628 
629  const InstructionMapping &VMapping = getInstructionMapping(
630  1, 1, getOperandsMapping(
631  {AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1), nullptr }),
632  2); // Num Operands
633  AltMappings.push_back(&VMapping);
634  return AltMappings;
635  }
636  case AMDGPU::G_INTRINSIC:
637  return getInstrAlternativeMappingsIntrinsic(MI, MRI);
638  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
639  return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
640  default:
641  break;
642  }
643  return RegisterBankInfo::getInstrAlternativeMappings(MI);
644 }
645 
646 void AMDGPURegisterBankInfo::split64BitValueForMapping(
647  MachineIRBuilder &B,
648  SmallVector<Register, 2> &Regs,
649  LLT HalfTy,
650  Register Reg) const {
651  assert(HalfTy.getSizeInBits() == 32);
652  MachineRegisterInfo *MRI = B.getMRI();
653  Register LoLHS = MRI->createGenericVirtualRegister(HalfTy);
654  Register HiLHS = MRI->createGenericVirtualRegister(HalfTy);
655  const RegisterBank *Bank = getRegBank(Reg, *MRI, *TRI);
656  MRI->setRegBank(LoLHS, *Bank);
657  MRI->setRegBank(HiLHS, *Bank);
658 
659  Regs.push_back(LoLHS);
660  Regs.push_back(HiLHS);
661 
662  B.buildInstr(AMDGPU::G_UNMERGE_VALUES)
663  .addDef(LoLHS)
664  .addDef(HiLHS)
665  .addUse(Reg);
666 }
667 
668 /// Replace the current type each register in \p Regs has with \p NewTy
669 static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef<Register> Regs,
670  LLT NewTy) {
671  for (Register Reg : Regs) {
672  assert(MRI.getType(Reg).getSizeInBits() == NewTy.getSizeInBits());
673  MRI.setType(Reg, NewTy);
674  }
675 }
676 
677 static LLT getHalfSizedType(LLT Ty) {
678  if (Ty.isVector()) {
679  assert(Ty.getNumElements() % 2 == 0);
680  return LLT::scalarOrVector(Ty.getNumElements() / 2, Ty.getElementType());
681  }
682 
683  assert(Ty.getSizeInBits() % 2 == 0);
684  return LLT::scalar(Ty.getSizeInBits() / 2);
685 }
686 
687 /// Legalize instruction \p MI where operands in \p OpIndices must be SGPRs. If
688 /// any of the required SGPR operands are VGPRs, perform a waterfall loop to
689 /// execute the instruction for each unique combination of values in all lanes
690 /// in the wave. The block will be split such that rest of the instructions are
691 /// moved to a new block.
692 ///
693 /// Essentially performs this loop:
694 //
695 /// Save Execution Mask
696 /// For (Lane : Wavefront) {
697 /// Enable Lane, Disable all other lanes
698 /// SGPR = read SGPR value for current lane from VGPR
699 /// VGPRResult[Lane] = use_op SGPR
700 /// }
701 /// Restore Execution Mask
702 ///
703 /// There is additional complexity in comparing the values across lanes to
704 /// identify the unique values used.
705 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
706  MachineIRBuilder &B,
707  iterator_range<MachineBasicBlock::iterator> Range,
708  SmallSet<Register, 4> &SGPROperandRegs,
709  MachineRegisterInfo &MRI) const {
710  SmallVector<Register, 4> ResultRegs;
711  SmallVector<Register, 4> InitResultRegs;
712  SmallVector<Register, 4> PhiRegs;
713 
714  // Track use registers which have already been expanded with a readfirstlane
715  // sequence. This may have multiple uses if moving a sequence.
716  DenseMap<Register, Register> WaterfalledRegMap;
717 
718  MachineBasicBlock &MBB = B.getMBB();
719  MachineFunction *MF = &B.getMF();
720 
721  const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
722  const unsigned WaveAndOpc = Subtarget.isWave32() ?
723  AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
724  const unsigned MovTermOpc = Subtarget.isWave32() ?
725  AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
726  const unsigned XorTermOpc = Subtarget.isWave32() ?
727  AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
728  const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
729  AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
730  const unsigned ExecReg = Subtarget.isWave32() ?
731  AMDGPU::EXEC_LO : AMDGPU::EXEC;
732 
733 #ifndef NDEBUG
734  const int OrigRangeSize = std::distance(Range.begin(), Range.end());
735 #endif
736 
737  for (MachineInstr &MI : Range) {
738  for (MachineOperand &Def : MI.defs()) {
739  if (MRI.use_nodbg_empty(Def.getReg()))
740  continue;
741 
742  LLT ResTy = MRI.getType(Def.getReg());
743  const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
744  ResultRegs.push_back(Def.getReg());
745  Register InitReg = B.buildUndef(ResTy).getReg(0);
746  Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
747  InitResultRegs.push_back(InitReg);
748  PhiRegs.push_back(PhiReg);
749  MRI.setRegBank(PhiReg, *DefBank);
750  MRI.setRegBank(InitReg, *DefBank);
751  }
752  }
753 
754  Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
755  Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
756 
757  // Don't bother using generic instructions/registers for the exec mask.
758  B.buildInstr(TargetOpcode::IMPLICIT_DEF)
759  .addDef(InitSaveExecReg);
760 
761  Register PhiExec = MRI.createVirtualRegister(WaveRC);
762  Register NewExec = MRI.createVirtualRegister(WaveRC);
763 
764  // To insert the loop we need to split the block. Move everything before this
765  // point to a new block, and insert a new empty block before this instruction.
766  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
767  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
768  MachineBasicBlock *RestoreExecBB = MF->CreateMachineBasicBlock();
769  MachineFunction::iterator MBBI(MBB);
770  ++MBBI;
771  MF->insert(MBBI, LoopBB);
772  MF->insert(MBBI, RestoreExecBB);
773  MF->insert(MBBI, RemainderBB);
774 
775  LoopBB->addSuccessor(RestoreExecBB);
776  LoopBB->addSuccessor(LoopBB);
777 
778  // Move the rest of the block into a new block.
779  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
780  RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
781 
782  MBB.addSuccessor(LoopBB);
783  RestoreExecBB->addSuccessor(RemainderBB);
784 
785  B.setInsertPt(*LoopBB, LoopBB->end());
786 
787  B.buildInstr(TargetOpcode::PHI)
788  .addDef(PhiExec)
789  .addReg(InitSaveExecReg)
790  .addMBB(&MBB)
791  .addReg(NewExec)
792  .addMBB(LoopBB);
793 
794  for (auto Result : zip(InitResultRegs, ResultRegs, PhiRegs)) {
795  B.buildInstr(TargetOpcode::G_PHI)
796  .addDef(std::get<2>(Result))
797  .addReg(std::get<0>(Result)) // Initial value / implicit_def
798  .addMBB(&MBB)
799  .addReg(std::get<1>(Result)) // Mid-loop value.
800  .addMBB(LoopBB);
801  }
802 
803  const DebugLoc &DL = B.getDL();
804 
805  MachineInstr &FirstInst = *Range.begin();
806 
807  // Move the instruction into the loop. Note we moved everything after
808  // Range.end() already into a new block, so Range.end() is no longer valid.
809  LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
810 
811  // Figure out the iterator range after splicing the instructions.
812  MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
813  auto NewEnd = LoopBB->end();
814 
815  MachineBasicBlock::iterator I = Range.begin();
816  B.setInsertPt(*LoopBB, I);
817 
818  Register CondReg;
819 
820  assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
821 
822  for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
823  for (MachineOperand &Op : MI.uses()) {
824  if (!Op.isReg() || Op.isDef())
825  continue;
826 
827  Register OldReg = Op.getReg();
828  if (!SGPROperandRegs.count(OldReg))
829  continue;
830 
831  // See if we already processed this register in another instruction in the
832  // sequence.
833  auto OldVal = WaterfalledRegMap.find(OldReg);
834  if (OldVal != WaterfalledRegMap.end()) {
835  Op.setReg(OldVal->second);
836  continue;
837  }
838 
839  Register OpReg = Op.getReg();
840  LLT OpTy = MRI.getType(OpReg);
841 
842  const RegisterBank *OpBank = getRegBank(OpReg, MRI, *TRI);
843  if (OpBank != &AMDGPU::VGPRRegBank) {
844  // Insert copy from AGPR to VGPR before the loop.
845  B.setMBB(MBB);
846  OpReg = B.buildCopy(OpTy, OpReg).getReg(0);
847  MRI.setRegBank(OpReg, AMDGPU::VGPRRegBank);
848  B.setInstr(*I);
849  }
850 
851  unsigned OpSize = OpTy.getSizeInBits();
852 
853  // Can only do a readlane of 32-bit pieces.
854  if (OpSize == 32) {
855  // Avoid extra copies in the simple case of one 32-bit register.
856  Register CurrentLaneOpReg
857  = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
858  MRI.setType(CurrentLaneOpReg, OpTy);
859 
860  constrainGenericRegister(OpReg, AMDGPU::VGPR_32RegClass, MRI);
861  // Read the next variant <- also loop target.
862  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
863  CurrentLaneOpReg)
864  .addReg(OpReg);
865 
866  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
867  bool First = CondReg == AMDGPU::NoRegister;
868  if (First)
869  CondReg = NewCondReg;
870 
871  // Compare the just read M0 value to all possible Idx values.
872  B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
873  .addDef(NewCondReg)
874  .addReg(CurrentLaneOpReg)
875  .addReg(OpReg);
876  Op.setReg(CurrentLaneOpReg);
877 
878  if (!First) {
879  Register AndReg = MRI.createVirtualRegister(WaveRC);
880 
881  // If there are multiple operands to consider, AND the conditions together.
882  B.buildInstr(WaveAndOpc)
883  .addDef(AndReg)
884  .addReg(NewCondReg)
885  .addReg(CondReg);
886  CondReg = AndReg;
887  }
888  } else {
889  LLT S32 = LLT::scalar(32);
890  SmallVector<Register, 8> ReadlanePieces;
891 
892  // The compares can be done as 64-bit, but the extract needs to be done
893  // in 32-bit pieces.
894 
895  bool Is64 = OpSize % 64 == 0;
896 
897  LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
898  unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
899  : AMDGPU::V_CMP_EQ_U32_e64;
900 
904  // Insert the unmerge before the loop.
905 
906  B.setMBB(MBB);
907  auto Unmerge = B.buildUnmerge(UnmergeTy, OpReg);
908  B.setInstr(*I);
909 
910  unsigned NumPieces = Unmerge->getNumOperands() - 1;
911  for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
912  Register UnmergePiece = Unmerge.getReg(PieceIdx);
913 
914  Register CurrentLaneOpReg;
915  if (Is64) {
916  Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
917  Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
918 
919  MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
920  MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
921  MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
922 
923  // Read the next variant <- also loop target.
924  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
925  CurrentLaneOpRegLo)
926  .addReg(UnmergePiece, 0, AMDGPU::sub0);
927 
928  // Read the next variant <- also loop target.
929  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
930  CurrentLaneOpRegHi)
931  .addReg(UnmergePiece, 0, AMDGPU::sub1);
932 
933  CurrentLaneOpReg =
934  B.buildMerge(LLT::scalar(64),
935  {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
936  .getReg(0);
937 
938  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
939 
940  if (OpTy.getScalarSizeInBits() == 64) {
941  // If we need to produce a 64-bit element vector, use the
942  // merged pieces.
943  ReadlanePieces.push_back(CurrentLaneOpReg);
944  } else {
945  // 32-bit element type.
946  ReadlanePieces.push_back(CurrentLaneOpRegLo);
947  ReadlanePieces.push_back(CurrentLaneOpRegHi);
948  }
949  } else {
950  CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
951  MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
952  MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
953 
954  // Read the next variant <- also loop target.
955  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
956  CurrentLaneOpReg)
957  .addReg(UnmergePiece);
958  ReadlanePieces.push_back(CurrentLaneOpReg);
959  }
960 
961  Register NewCondReg = MRI.createVirtualRegister(WaveRC);
962  bool First = CondReg == AMDGPU::NoRegister;
963  if (First)
964  CondReg = NewCondReg;
965 
966  B.buildInstr(CmpOp)
967  .addDef(NewCondReg)
968  .addReg(CurrentLaneOpReg)
969  .addReg(UnmergePiece);
970 
971  if (!First) {
972  Register AndReg = MRI.createVirtualRegister(WaveRC);
973 
974  // If there are multiple operands to consider, AND the conditions together.
975  B.buildInstr(WaveAndOpc)
976  .addDef(AndReg)
977  .addReg(NewCondReg)
978  .addReg(CondReg);
979  CondReg = AndReg;
980  }
981  }
982 
983  // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
984  // BUILD_VECTOR
985  if (OpTy.isVector()) {
986  auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
987  Op.setReg(Merge.getReg(0));
988  } else {
989  auto Merge = B.buildMerge(OpTy, ReadlanePieces);
990  Op.setReg(Merge.getReg(0));
991  }
992 
993  MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
994  }
995 
996  // Make sure we don't re-process this register again.
997  WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
998  }
999  }
1000 
1001  B.setInsertPt(*LoopBB, LoopBB->end());
1002 
1003  // Update EXEC, save the original EXEC value to VCC.
1004  B.buildInstr(AndSaveExecOpc)
1005  .addDef(NewExec)
1006  .addReg(CondReg, RegState::Kill);
1007 
1008  MRI.setSimpleHint(NewExec, CondReg);
1009 
1010  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
1011  B.buildInstr(XorTermOpc)
1012  .addDef(ExecReg)
1013  .addReg(ExecReg)
1014  .addReg(NewExec);
1015 
1016  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
1017  // s_cbranch_scc0?
1018 
1019  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
1020  B.buildInstr(AMDGPU::S_CBRANCH_EXECNZ)
1021  .addMBB(LoopBB);
1022 
1023  // Save the EXEC mask before the loop.
1024  BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
1025  .addReg(ExecReg);
1026 
1027  // Restore the EXEC mask after the loop.
1028  B.setMBB(*RestoreExecBB);
1029  B.buildInstr(MovTermOpc)
1030  .addDef(ExecReg)
1031  .addReg(SaveExecReg);
1032 
1033  // Set the insert point after the original instruction, so any new
1034  // instructions will be in the remainder.
1035  B.setInsertPt(*RemainderBB, RemainderBB->begin());
1036 
1037  return true;
1038 }
1039 
1040 // Return any unique registers used by \p MI at \p OpIndices that need to be
1041 // handled in a waterfall loop. Returns these registers in \p
1042 // SGPROperandRegs. Returns true if there are any operands to handle and a
1043 // waterfall loop is necessary.
1044 bool AMDGPURegisterBankInfo::collectWaterfallOperands(
1045  SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
1046  MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
1047  for (unsigned Op : OpIndices) {
1048  assert(MI.getOperand(Op).isUse());
1049  Register Reg = MI.getOperand(Op).getReg();
1050  const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
1051  if (OpBank->getID() != AMDGPU::SGPRRegBankID)
1052  SGPROperandRegs.insert(Reg);
1053  }
1054 
1055  // No operands need to be replaced, so no need to loop.
1056  return !SGPROperandRegs.empty();
1057 }
1058 
1059 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1060  MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
1061  ArrayRef<unsigned> OpIndices) const {
1062  // Use a set to avoid extra readfirstlanes in the case where multiple operands
1063  // are the same register.
1064  SmallSet<Register, 4> SGPROperandRegs;
1065 
1066  if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
1067  return false;
1068 
1069  MachineBasicBlock::iterator I = MI.getIterator();
1070  return executeInWaterfallLoop(B, make_range(I, std::next(I)),
1071  SGPROperandRegs, MRI);
1072 }
1073 
1074 bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
1075  MachineInstr &MI, MachineRegisterInfo &MRI,
1076  ArrayRef<unsigned> OpIndices) const {
1077  MachineIRBuilder B(MI);
1078  return executeInWaterfallLoop(B, MI, MRI, OpIndices);
1079 }
1080 
1081 // Legalize an operand that must be an SGPR by inserting a readfirstlane.
1082 void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
1083  MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
1084  Register Reg = MI.getOperand(OpIdx).getReg();
1085  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
1086  if (Bank == &AMDGPU::SGPRRegBank)
1087  return;
1088 
1089  LLT Ty = MRI.getType(Reg);
1090  MachineIRBuilder B(MI);
1091 
1092  if (Bank != &AMDGPU::VGPRRegBank) {
1093  // We need to copy from AGPR to VGPR
1094  Reg = B.buildCopy(Ty, Reg).getReg(0);
1095  MRI.setRegBank(Reg, AMDGPU::VGPRRegBank);
1096  }
1097 
1098  Register SGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
1099  B.buildInstr(AMDGPU::V_READFIRSTLANE_B32)
1100  .addDef(SGPR)
1101  .addReg(Reg);
1102 
1103  MRI.setType(SGPR, Ty);
1104 
1105  const TargetRegisterClass *Constrained =
1106  constrainGenericRegister(Reg, AMDGPU::VGPR_32RegClass, MRI);
1107  (void)Constrained;
1108  assert(Constrained && "Failed to constrain readfirstlane src reg");
1109 
1110  MI.getOperand(OpIdx).setReg(SGPR);
1111 }
1112 
1113 /// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
1114 /// rest will be in the remainder.
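/// For example (illustrative): splitUnequalType(<3 x s32>, 64) returns
/// {<2 x s32>, s32}, and splitUnequalType(s96, 64) returns {s64, s32}.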
1115 static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
1116  unsigned TotalSize = Ty.getSizeInBits();
1117  if (!Ty.isVector())
1118  return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
1119 
1120  LLT EltTy = Ty.getElementType();
1121  unsigned EltSize = EltTy.getSizeInBits();
1122  assert(FirstSize % EltSize == 0);
1123 
1124  unsigned FirstPartNumElts = FirstSize / EltSize;
1125  unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
1126 
1127  return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
1128  LLT::scalarOrVector(RemainderElts, EltTy)};
1129 }
1130 
1131 static LLT widen96To128(LLT Ty) {
1132  if (!Ty.isVector())
1133  return LLT::scalar(128);
1134 
1135  LLT EltTy = Ty.getElementType();
1136  assert(128 % EltTy.getSizeInBits() == 0);
1137  return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
1138 }
1139 
1140 bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
1141  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1142  MachineRegisterInfo &MRI) const {
1143  Register DstReg = MI.getOperand(0).getReg();
1144  const LLT LoadTy = MRI.getType(DstReg);
1145  unsigned LoadSize = LoadTy.getSizeInBits();
1146  const unsigned MaxNonSmrdLoadSize = 128;
1147 
1148  const RegisterBank *PtrBank =
1149  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1150  if (PtrBank == &AMDGPU::SGPRRegBank) {
1151  // If the pointer is an SGPR, we ordinarily have nothing to do.
1152  if (LoadSize != 96)
1153  return false;
1154 
1155  MachineMemOperand *MMO = *MI.memoperands_begin();
1156  Register PtrReg = MI.getOperand(1).getReg();
1157  // 96-bit loads are only available for vector loads. We need to split this
1158  // into a 64-bit part and a 32-bit part (unless we can widen to a 128-bit load).
1159 
1160  ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
1161  MachineIRBuilder B(MI, O);
1162 
1163  if (MMO->getAlign() < Align(16)) {
1164  LLT Part64, Part32;
1165  std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
1166  auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
1167  auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
1168 
1169  auto Undef = B.buildUndef(LoadTy);
1170  auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
1171  B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
1172  } else {
1173  LLT WiderTy = widen96To128(LoadTy);
1174  auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
1175  B.buildExtract(MI.getOperand(0), WideLoad, 0);
1176  }
1177 
1178  MI.eraseFromParent();
1179  return true;
1180  }
1181 
1182  // 128-bit loads are supported for all instruction types.
1183  if (LoadSize <= MaxNonSmrdLoadSize)
1184  return false;
1185 
1186  SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
1187  SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
1188 
1189  if (SrcRegs.empty())
1190  SrcRegs.push_back(MI.getOperand(1).getReg());
1191 
1192  assert(LoadSize % MaxNonSmrdLoadSize == 0);
1193 
1194  // RegBankSelect only emits scalar types, so we need to reset the pointer
1195  // operand to a pointer type.
1196  Register BasePtrReg = SrcRegs[0];
1197  LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
1198  MRI.setType(BasePtrReg, PtrTy);
1199 
1200  unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
1201  const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
1202  ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
1203  MachineIRBuilder B(MI, Observer);
1204  LegalizerHelper Helper(B.getMF(), Observer, B);
1205 
1206  if (LoadTy.isVector()) {
1207  if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1208  return false;
1209  } else {
1210  if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
1211  return false;
1212  }
1213 
1214  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
1215  return true;
1216 }
1217 
1218 bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
1219  MachineInstr &MI,
1220  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1221  MachineRegisterInfo &MRI) const {
1222  const MachineFunction &MF = *MI.getMF();
1223  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1224  const auto &TFI = *ST.getFrameLowering();
1225 
1226  // Guard in case the stack growth direction ever changes with scratch
1227  // instructions.
1228  if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
1229  return false;
1230 
1231  Register Dst = MI.getOperand(0).getReg();
1232  Register AllocSize = MI.getOperand(1).getReg();
1233  Align Alignment = assumeAligned(MI.getOperand(2).getImm());
1234 
1235  const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
1236 
1237  // TODO: Need to emit a wave reduction to get the maximum size.
1238  if (SizeBank != &AMDGPU::SGPRRegBank)
1239  return false;
1240 
1241  LLT PtrTy = MRI.getType(Dst);
1242  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
1243 
1244  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1245  Register SPReg = Info->getStackPtrOffsetReg();
1246  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1247  MachineIRBuilder B(MI, ApplyBank);
1248 
1249  auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
1250  auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
1251 
1252  auto SPCopy = B.buildCopy(PtrTy, SPReg);
1253  if (Alignment > TFI.getStackAlign()) {
1254  auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
1255  B.buildMaskLowPtrBits(Dst, PtrAdd,
1256  Log2(Alignment) + ST.getWavefrontSizeLog2());
1257  } else {
1258  B.buildPtrAdd(Dst, SPCopy, ScaledSize);
1259  }
1260 
1261  MI.eraseFromParent();
1262  return true;
1263 }
1264 
1265 bool AMDGPURegisterBankInfo::applyMappingImage(
1266  MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
1267  MachineRegisterInfo &MRI, int RsrcIdx) const {
1268  const int NumDefs = MI.getNumExplicitDefs();
1269 
1270  // The reported argument index is relative to the IR intrinsic call arguments,
1271  // so we need to shift by the number of defs and the intrinsic ID.
1272  RsrcIdx += NumDefs + 1;
1273 
1274  // Insert copies to VGPR arguments.
1275  applyDefaultMapping(OpdMapper);
1276 
1277  // Fixup any SGPR arguments.
1278  SmallVector<unsigned, 4> SGPRIndexes;
1279  for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
1280  if (!MI.getOperand(I).isReg())
1281  continue;
1282 
1283  // If this intrinsic has a sampler, it immediately follows rsrc.
1284  if (I == RsrcIdx || I == RsrcIdx + 1)
1285  SGPRIndexes.push_back(I);
1286  }
1287 
1288  executeInWaterfallLoop(MI, MRI, SGPRIndexes);
1289  return true;
1290 }
1291 
1292 static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
1293  Register Reg) {
1294  MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
1295  if (!Def)
1296  return Reg;
1297 
1298  // TODO: Guard against this being an implicit def
1299  return Def->getOperand(0).getReg();
1300 }
1301 
1302 // Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
1303 // the three offsets (voffset, soffset and instoffset)
1304 static unsigned setBufferOffsets(MachineIRBuilder &B,
1305  const AMDGPURegisterBankInfo &RBI,
1306  Register CombinedOffset, Register &VOffsetReg,
1307  Register &SOffsetReg, int64_t &InstOffsetVal,
1308  Align Alignment) {
1309  const LLT S32 = LLT::scalar(32);
1310  MachineRegisterInfo *MRI = B.getMRI();
1311 
1312  if (Optional<int64_t> Imm = getConstantVRegSExtVal(CombinedOffset, *MRI)) {
1313  uint32_t SOffset, ImmOffset;
1314  if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
1315  Alignment)) {
1316  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1317  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1318  InstOffsetVal = ImmOffset;
1319 
1320  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1321  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1322  return SOffset + ImmOffset;
1323  }
1324  }
1325 
1326  Register Base;
1327  unsigned Offset;
1328 
1329  std::tie(Base, Offset) =
1330  AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
1331 
1332  uint32_t SOffset, ImmOffset;
1333  if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
1334  &RBI.Subtarget, Alignment)) {
1335  if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1336  VOffsetReg = Base;
1337  SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
1338  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1339  InstOffsetVal = ImmOffset;
1340  return 0; // XXX - Why is this 0?
1341  }
1342 
1343  // If we have an SGPR base, we can use it for soffset.
1344  if (SOffset == 0) {
1345  VOffsetReg = B.buildConstant(S32, 0).getReg(0);
1346  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1347  SOffsetReg = Base;
1348  InstOffsetVal = ImmOffset;
1349  return 0; // XXX - Why is this 0?
1350  }
1351  }
1352 
1353  // Handle the variable sgpr + vgpr case.
1354  if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
1355  Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
1356  Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
1357 
1358  const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
1359  const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
1360 
1361  if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
1362  VOffsetReg = Src0;
1363  SOffsetReg = Src1;
1364  return 0;
1365  }
1366 
1367  if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
1368  VOffsetReg = Src1;
1369  SOffsetReg = Src0;
1370  return 0;
1371  }
1372  }
1373 
1374  // Ensure we have a VGPR for the combined offset. This could be an issue if we
1375  // have an SGPR offset and a VGPR resource.
1376  if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
1377  VOffsetReg = CombinedOffset;
1378  } else {
1379  VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
1380  B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
1381  }
1382 
1383  SOffsetReg = B.buildConstant(S32, 0).getReg(0);
1384  B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
1385  return 0;
1386 }
1387 
1388 bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
1389  const OperandsMapper &OpdMapper) const {
1390  MachineInstr &MI = OpdMapper.getMI();
1391  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1392 
1393  const LLT S32 = LLT::scalar(32);
1394  Register Dst = MI.getOperand(0).getReg();
1395  LLT Ty = MRI.getType(Dst);
1396 
1397  const RegisterBank *RSrcBank =
1398  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1399  const RegisterBank *OffsetBank =
1400  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1401  if (RSrcBank == &AMDGPU::SGPRRegBank &&
1402  OffsetBank == &AMDGPU::SGPRRegBank)
1403  return true; // Legal mapping
1404 
1405  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
1406  // here but don't have an MMO.
1407 
1408  unsigned LoadSize = Ty.getSizeInBits();
1409  int NumLoads = 1;
1410  if (LoadSize == 256 || LoadSize == 512) {
1411  NumLoads = LoadSize / 128;
1412  Ty = Ty.divide(NumLoads);
1413  }
1414 
1415  // Use the alignment to ensure that the required offsets will fit into the
1416  // immediate offsets.
1417  const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
1418 
1420  MachineFunction &MF = B.getMF();
1421 
1422  Register SOffset;
1423  Register VOffset;
1424  int64_t ImmOffset = 0;
1425 
1426  unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
1427  VOffset, SOffset, ImmOffset, Alignment);
1428 
1429  // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
1430  // can, but we need to track an MMO for that.
1431  const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
1432  const Align MemAlign(4); // FIXME: ABI type alignment?
1437  MemSize, MemAlign);
1438  if (MMOOffset != 0)
1439  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
1440 
1441  // If only the offset is divergent, emit a MUBUF buffer load instead. We can
1442  // assume that the buffer is unswizzled.
1443 
1444  Register RSrc = MI.getOperand(1).getReg();
1445  Register VIndex = B.buildConstant(S32, 0).getReg(0);
1446  B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
1447 
1448  SmallVector<Register, 4> LoadParts(NumLoads);
1449 
1450  MachineBasicBlock::iterator MII = MI.getIterator();
1451  MachineInstrSpan Span(MII, &B.getMBB());
1452 
1453  for (int i = 0; i < NumLoads; ++i) {
1454  if (NumLoads == 1) {
1455  LoadParts[i] = Dst;
1456  } else {
1457  LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
1458  MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
1459  }
1460 
1461  MachineMemOperand *MMO = BaseMMO;
1462  if (i != 0)
1463  BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
1464 
1465  B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
1466  .addDef(LoadParts[i]) // vdata
1467  .addUse(RSrc) // rsrc
1468  .addUse(VIndex) // vindex
1469  .addUse(VOffset) // voffset
1470  .addUse(SOffset) // soffset
1471  .addImm(ImmOffset + 16 * i) // offset(imm)
1472  .addImm(0) // cachepolicy, swizzled buffer(imm)
1473  .addImm(0) // idxen(imm)
1474  .addMemOperand(MMO);
1475  }
1476 
1477  // TODO: If only the resource is a VGPR, it may be better to execute the
1478  // scalar load in the waterfall loop if the resource is expected to frequently
1479  // be dynamically uniform.
1480  if (RSrcBank != &AMDGPU::SGPRRegBank) {
1481  // Remove the original instruction to avoid potentially confusing the
1482  // waterfall loop logic.
1483  B.setInstr(*Span.begin());
1484  MI.eraseFromParent();
1485 
1486  SmallSet<Register, 4> OpsToWaterfall;
1487 
1488  OpsToWaterfall.insert(RSrc);
1489  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
1490  OpsToWaterfall, MRI);
1491  }
1492 
1493  if (NumLoads != 1) {
1494  if (Ty.isVector())
1495  B.buildConcatVectors(Dst, LoadParts);
1496  else
1497  B.buildMerge(Dst, LoadParts);
1498  }
1499 
1500  // We removed the instruction earlier with a waterfall loop.
1501  if (RSrcBank == &AMDGPU::SGPRRegBank)
1502  MI.eraseFromParent();
1503 
1504  return true;
1505 }
1506 
1507 bool AMDGPURegisterBankInfo::applyMappingBFE(
1508  const OperandsMapper &OpdMapper, bool Signed) const {
1509  MachineInstr &MI = OpdMapper.getMI();
1510  MachineRegisterInfo &MRI = OpdMapper.getMRI();
1511 
1512  // Insert basic copies
1513  applyDefaultMapping(OpdMapper);
1514 
1515  Register DstReg = MI.getOperand(0).getReg();
1516  LLT Ty = MRI.getType(DstReg);
1517 
1518  const LLT S32 = LLT::scalar(32);
1519 
1520  const RegisterBank *DstBank =
1521  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1522  if (DstBank == &AMDGPU::VGPRRegBank) {
1523  if (Ty == S32)
1524  return true;
1525 
1526  // TODO: 64-bit version is scalar only, so we need to expand this.
1527  return false;
1528  }
1529 
1530  Register SrcReg = MI.getOperand(2).getReg();
1531  Register OffsetReg = MI.getOperand(3).getReg();
1532  Register WidthReg = MI.getOperand(4).getReg();
1533 
1534  // The scalar form packs the offset and width in a single operand.
1535 
1536  ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
1537  MachineIRBuilder B(MI, ApplyBank);
1538 
1539  // Ensure the high bits are clear to insert the offset.
1540  auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
1541  auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
1542 
1543  // Zeros out the low bits, so don't bother clamping the input value.
1544  auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
1545 
1546  // Transformation function, pack the offset and width of a BFE into
1547  // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
1548  // source, bits [5:0] contain the offset and bits [22:16] the width.
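  // For example (illustrative): offset = 5 and width = 8 pack to
  // (8 << 16) | 5 = 0x00080005.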
1549  auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
1550 
1551  // TODO: It might be worth using a pseudo here to avoid scc clobber and
1552  // register class constraints.
1553  unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
1554  (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
1555 
1556  auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
1557  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1558  llvm_unreachable("failed to constrain BFE");
1559 
1560  MI.eraseFromParent();
1561  return true;
1562 }
1563 
1564 // Return a suitable opcode for extending the operands of Opc when widening.
1565 static unsigned getExtendOp(unsigned Opc) {
1566  switch (Opc) {
1567  case TargetOpcode::G_ASHR:
1568  case TargetOpcode::G_SMIN:
1569  case TargetOpcode::G_SMAX:
1570  return TargetOpcode::G_SEXT;
1571  case TargetOpcode::G_LSHR:
1572  case TargetOpcode::G_UMIN:
1573  case TargetOpcode::G_UMAX:
1574  return TargetOpcode::G_ZEXT;
1575  default:
1576  return TargetOpcode::G_ANYEXT;
1577  }
1578 }
1579 
1580 // Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
1581 // any illegal vector extend or unmerge operations.
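// For example (illustrative): given a <2 x s16> source <a, b>, this returns the
// pair (ext(a), ext(b)) as two s32 values, where ext is the requested extension.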
1582 static std::pair<Register, Register>
1583 unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
1584  const LLT S32 = LLT::scalar(32);
1585  auto Bitcast = B.buildBitcast(S32, Src);
1586 
1587  if (ExtOpcode == TargetOpcode::G_SEXT) {
1588  auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
1589  auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
1590  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1591  }
1592 
1593  auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
1594  if (ExtOpcode == TargetOpcode::G_ZEXT) {
1595  auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
1596  return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
1597  }
1598 
1599  assert(ExtOpcode == TargetOpcode::G_ANYEXT);
1600  return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
1601 }
1602 
1603 // For cases where only a single copy is inserted for matching register banks.
1604 // Replace the register in the instruction operand.
1605 static bool substituteSimpleCopyRegs(
1606  const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
1607  SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
1608  if (!SrcReg.empty()) {
1609  assert(SrcReg.size() == 1);
1610  OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
1611  return true;
1612  }
1613 
1614  return false;
1615 }
1616 
1617 /// Handle register layout difference for f16 images for some subtargets.
1618 Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
1619  MachineRegisterInfo &MRI,
1620  Register Reg) const {
1621  if (!Subtarget.hasUnpackedD16VMem())
1622  return Reg;
1623 
1624  const LLT S16 = LLT::scalar(16);
1625  LLT StoreVT = MRI.getType(Reg);
1626  if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
1627  return Reg;
1628 
1629  auto Unmerge = B.buildUnmerge(S16, Reg);
1630 
1631 
1632  SmallVector<Register, 4> WideRegs;
1633  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
1634  WideRegs.push_back(Unmerge.getReg(I));
1635 
1636  const LLT S32 = LLT::scalar(32);
1637  int NumElts = StoreVT.getNumElements();
1638 
1639  return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
1640 }
1641 
1642 static std::pair<Register, unsigned>
1643 getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
1644  int64_t Const;
1645  if (mi_match(Reg, MRI, m_ICst(Const)))
1646  return std::make_pair(Register(), Const);
1647 
1648  Register Base;
1649  if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
1650  return std::make_pair(Base, Const);
1651 
1652  // TODO: Handle G_OR used for add case
1653  return std::make_pair(Reg, 0);
1654 }
1655 
1656 std::pair<Register, unsigned>
1657 AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
1658  Register OrigOffset) const {
1659  const unsigned MaxImm = 4095;
1660  Register BaseReg;
1661  unsigned ImmOffset;
1662  const LLT S32 = LLT::scalar(32);
1663 
1664  std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
1665  OrigOffset);
1666 
1667  unsigned C1 = 0;
1668  if (ImmOffset != 0) {
1669  // If the immediate value is too big for the immoffset field, keep only the
1670  // low 12 bits in the immoffset field so that the value that is copied/added
1671  // for the voffset field is a multiple of 4096, and it stands a better chance
1672  // of being CSEd with the copy/add for another similar load/store.
1673  // However, do not do that rounding down to a multiple of 4096 if that is a
1674  // negative number, as it appears to be illegal to have a negative offset
1675  // in the vgpr, even if adding the immediate offset makes it positive.
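 // For example, a combined offset of 8196 is split into ImmOffset = 4, with
 // 8192 (a multiple of 4096) folded into the base/voffset register.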
1676  unsigned Overflow = ImmOffset & ~MaxImm;
1677  ImmOffset -= Overflow;
1678  if ((int32_t)Overflow < 0) {
1679  Overflow += ImmOffset;
1680  ImmOffset = 0;
1681  }
1682 
1683  C1 = ImmOffset;
1684  if (Overflow != 0) {
1685  if (!BaseReg)
1686  BaseReg = B.buildConstant(S32, Overflow).getReg(0);
1687  else {
1688  auto OverflowVal = B.buildConstant(S32, Overflow);
1689  BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
1690  }
1691  }
1692  }
1693 
1694  if (!BaseReg)
1695  BaseReg = B.buildConstant(S32, 0).getReg(0);
1696 
1697  return {BaseReg, C1};
1698 }
1699 
1700 static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
1701  int64_t C;
1702  return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
1703 }
1704 
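 // Helpers to split the packed cache policy operand of the buffer intrinsics:
 // the low bits hold the CPol flags and bit 3 holds the swizzle (swz) flag.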
1705 static unsigned extractCPol(unsigned CachePolicy) {
1706  return CachePolicy & AMDGPU::CPol::ALL;
1707 }
1708 
1709 static unsigned extractSWZ(unsigned CachePolicy) {
1710  return (CachePolicy >> 3) & 1;
1711 }
1712 
1713 
1714 MachineInstr *
1715 AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
1716  MachineInstr &MI) const {
1717  MachineRegisterInfo &MRI = *B.getMRI();
1718  executeInWaterfallLoop(B, MI, MRI, {2, 4});
1719 
1720  // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
1721 
1722  Register VData = MI.getOperand(1).getReg();
1723  LLT Ty = MRI.getType(VData);
1724 
1725  int EltSize = Ty.getScalarSizeInBits();
1726  int Size = Ty.getSizeInBits();
1727 
1728  // FIXME: Broken integer truncstore.
1729  if (EltSize != 32)
1730  report_fatal_error("unhandled intrinsic store");
1731 
1732  // FIXME: Verifier should enforce 1 MMO for these intrinsics.
1733  const int MemSize = (*MI.memoperands_begin())->getSize();
1734 
1735 
1736  Register RSrc = MI.getOperand(2).getReg();
1737  Register VOffset = MI.getOperand(3).getReg();
1738  Register SOffset = MI.getOperand(4).getReg();
1739  unsigned CachePolicy = MI.getOperand(5).getImm();
1740 
1741  unsigned ImmOffset;
1742  std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
1743 
1744  const bool Offen = !isZero(VOffset, MRI);
1745 
1746  unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
1747  switch (8 * MemSize) {
1748  case 8:
1749  Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
1750  AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
1751  break;
1752  case 16:
1753  Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
1754  AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
1755  break;
1756  default:
1757  Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
1758  AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
1759  if (Size > 32)
1760  Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
1761  break;
1762  }
1763 
1764 
1765  // Set the insertion point back to the instruction in case it was moved into a
1766  // loop.
1767  B.setInstr(MI);
1768 
1769  MachineInstrBuilder MIB = B.buildInstr(Opc)
1770  .addUse(VData);
1771 
1772  if (Offen)
1773  MIB.addUse(VOffset);
1774 
1775  MIB.addUse(RSrc)
1776  .addUse(SOffset)
1777  .addImm(ImmOffset)
1778  .addImm(extractCPol(CachePolicy))
1779  .addImm(0) // tfe: FIXME: Remove from inst
1780  .addImm(extractSWZ(CachePolicy))
1781  .cloneMemRefs(MI);
1782 
1783  // FIXME: We need a way to report failure from applyMappingImpl.
1784  // Insert constrain copies before inserting the loop.
1785  if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
1786  report_fatal_error("failed to constrain selected store intrinsic");
1787 
1788  return MIB;
1789 }
1790 
1791 bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
1792  Register SrcReg) const {
1793  MachineRegisterInfo &MRI = *B.getMRI();
1794  LLT SrcTy = MRI.getType(SrcReg);
1795  if (SrcTy.getSizeInBits() == 32) {
1796  // Use a v_mov_b32 here to make the exec dependency explicit.
1797  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1798  .addDef(DstReg)
1799  .addUse(SrcReg);
1800  return constrainGenericRegister(DstReg, AMDGPU::VGPR_32RegClass, MRI) &&
1801  constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI);
1802  }
1803 
1804  Register TmpReg0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1805  Register TmpReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1806 
1807  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1808  .addDef(TmpReg0)
1809  .addUse(SrcReg, 0, AMDGPU::sub0);
1810  B.buildInstr(AMDGPU::V_MOV_B32_e32)
1811  .addDef(TmpReg1)
1812  .addUse(SrcReg, 0, AMDGPU::sub1);
1813  B.buildInstr(AMDGPU::REG_SEQUENCE)
1814  .addDef(DstReg)
1815  .addUse(TmpReg0)
1816  .addImm(AMDGPU::sub0)
1817  .addUse(TmpReg1)
1818  .addImm(AMDGPU::sub1);
1819 
1820  return constrainGenericRegister(SrcReg, AMDGPU::SReg_64RegClass, MRI) &&
1821  constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
1822 }
1823 
1824 /// Utility function for pushing dynamic vector indexes with a constant offset
1825 /// into waterfall loops.
1826 static void reinsertVectorIndexAdd(MachineIRBuilder &B,
1827  MachineInstr &IdxUseInstr,
1828  unsigned OpIdx,
1829  unsigned ConstOffset) {
1830  MachineRegisterInfo &MRI = *B.getMRI();
1831  const LLT S32 = LLT::scalar(32);
1832  Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
1833  B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
1834 
1835  auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
1836 
1837  auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
1838  MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
1839  MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
1840  IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
1841 }
1842 
1843 /// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
1844 /// original 32-bit source value (to be inserted in the low part of the combined
1845 /// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
1846 /// value.
1847 static void extendLow32IntoHigh32(MachineIRBuilder &B,
1848  Register Hi32Reg, Register Lo32Reg,
1849  unsigned ExtOpc,
1850  const RegisterBank &RegBank,
1851  bool IsBooleanSrc = false) {
1852  if (ExtOpc == AMDGPU::G_ZEXT) {
1853  B.buildConstant(Hi32Reg, 0);
1854  } else if (ExtOpc == AMDGPU::G_SEXT) {
1855  if (IsBooleanSrc) {
1856  // If we know the original source was an s1, the high half is the same as
1857  // the low.
1858  B.buildCopy(Hi32Reg, Lo32Reg);
1859  } else {
1860  // Replicate sign bit from 32-bit extended part.
1861  auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
1862  B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
1863  B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
1864  }
1865  } else {
1866  assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
1867  B.buildUndef(Hi32Reg);
1868  }
1869 }
1870 
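// Try to expand a dynamically indexed vector extract into a per-element
// compare + select chain when SITargetLowering::shouldExpandVectorDynExt
// says it is profitable, instead of a waterfall loop over the index.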
1871 bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
1872  MachineInstr &MI, MachineRegisterInfo &MRI,
1873  const OperandsMapper &OpdMapper) const {
1874 
1875  Register VecReg = MI.getOperand(1).getReg();
1876  Register Idx = MI.getOperand(2).getReg();
1877 
1878  const RegisterBank &IdxBank =
1879  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1880 
1881  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1882 
1883  LLT VecTy = MRI.getType(VecReg);
1884  unsigned EltSize = VecTy.getScalarSizeInBits();
1885  unsigned NumElem = VecTy.getNumElements();
1886 
1887  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1888  IsDivergentIdx))
1889  return false;
1890 
1891  MachineIRBuilder B(MI);
1892  LLT S32 = LLT::scalar(32);
1893 
1894  const RegisterBank &DstBank =
1895  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1896  const RegisterBank &SrcBank =
1897  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1898 
1899  const RegisterBank &CCBank =
1900  (DstBank == AMDGPU::SGPRRegBank &&
1901  SrcBank == AMDGPU::SGPRRegBank &&
1902  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1903  : AMDGPU::VCCRegBank;
1904  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1905 
1906  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1907  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1908  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1909  }
1910 
1911  LLT EltTy = VecTy.getScalarType();
1912  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
1913  unsigned NumLanes = DstRegs.size();
1914  if (!NumLanes)
1915  NumLanes = 1;
1916  else
1917  EltTy = MRI.getType(DstRegs[0]);
1918 
1919  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
1920  SmallVector<Register, 2> Res(NumLanes);
1921  for (unsigned L = 0; L < NumLanes; ++L)
1922  Res[L] = UnmergeToEltTy.getReg(L);
1923 
1924  for (unsigned I = 1; I < NumElem; ++I) {
1925  auto IC = B.buildConstant(S32, I);
1926  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
1927  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
1928  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
1929 
1930  for (unsigned L = 0; L < NumLanes; ++L) {
1931  auto S = B.buildSelect(EltTy, Cmp,
1932  UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
1933 
1934  for (unsigned N : { 0, 2, 3 })
1935  MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
1936 
1937  Res[L] = S->getOperand(0).getReg();
1938  }
1939  }
1940 
1941  for (unsigned L = 0; L < NumLanes; ++L) {
1942  Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
1943  B.buildCopy(DstReg, Res[L]);
1944  MRI.setRegBank(DstReg, DstBank);
1945  }
1946 
1947  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
1948  MI.eraseFromParent();
1949 
1950  return true;
1951 }
1952 
1953 bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
1954  MachineInstr &MI, MachineRegisterInfo &MRI,
1955  const OperandsMapper &OpdMapper) const {
1956 
1957  Register VecReg = MI.getOperand(1).getReg();
1958  Register Idx = MI.getOperand(3).getReg();
1959 
1960  const RegisterBank &IdxBank =
1961  *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
1962 
1963  bool IsDivergentIdx = IdxBank != AMDGPU::SGPRRegBank;
1964 
1965  LLT VecTy = MRI.getType(VecReg);
1966  unsigned EltSize = VecTy.getScalarSizeInBits();
1967  unsigned NumElem = VecTy.getNumElements();
1968 
1969  if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
1970  IsDivergentIdx))
1971  return false;
1972 
1973  MachineIRBuilder B(MI);
1974  LLT S32 = LLT::scalar(32);
1975 
1976  const RegisterBank &DstBank =
1977  *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
1978  const RegisterBank &SrcBank =
1979  *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
1980  const RegisterBank &InsBank =
1981  *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
1982 
1983  const RegisterBank &CCBank =
1984  (DstBank == AMDGPU::SGPRRegBank &&
1985  SrcBank == AMDGPU::SGPRRegBank &&
1986  InsBank == AMDGPU::SGPRRegBank &&
1987  IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
1988  : AMDGPU::VCCRegBank;
1989  LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
1990 
1991  if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
1992  Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
1993  MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
1994  }
1995 
1996  LLT EltTy = VecTy.getScalarType();
1997  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
1998  unsigned NumLanes = InsRegs.size();
1999  if (!NumLanes) {
2000  NumLanes = 1;
2001  InsRegs.push_back(MI.getOperand(2).getReg());
2002  } else {
2003  EltTy = MRI.getType(InsRegs[0]);
2004  }
2005 
2006  auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
2007  SmallVector<Register, 16> Ops(NumElem * NumLanes);
2008 
2009  for (unsigned I = 0; I < NumElem; ++I) {
2010  auto IC = B.buildConstant(S32, I);
2011  MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
2012  auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
2013  MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
2014 
2015  for (unsigned L = 0; L < NumLanes; ++L) {
2016  auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2017  UnmergeToEltTy.getReg(I * NumLanes + L));
2018 
2019  for (unsigned N : { 0, 2, 3 })
2020  MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2021 
2022  Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2023  }
2024  }
2025 
2026  LLT MergeTy = LLT::vector(Ops.size(), EltTy);
2027  if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
2028  B.buildBuildVector(MI.getOperand(0), Ops);
2029  } else {
2030  auto Vec = B.buildBuildVector(MergeTy, Ops);
2031  MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
2032  B.buildBitcast(MI.getOperand(0).getReg(), Vec);
2033  }
2034 
2035  MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
2036  MI.eraseFromParent();
2037 
2038  return true;
2039 }
2040 
2041 void AMDGPURegisterBankInfo::applyMappingImpl(
2042  const OperandsMapper &OpdMapper) const {
2043  MachineInstr &MI = OpdMapper.getMI();
2044  unsigned Opc = MI.getOpcode();
2045  MachineRegisterInfo &MRI = OpdMapper.getMRI();
2046  switch (Opc) {
2047  case AMDGPU::G_PHI: {
2048  Register DstReg = MI.getOperand(0).getReg();
2049  LLT DstTy = MRI.getType(DstReg);
2050  if (DstTy != LLT::scalar(1))
2051  break;
2052 
2053  const LLT S32 = LLT::scalar(32);
2054  const RegisterBank *DstBank =
2055  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2056  if (DstBank == &AMDGPU::VCCRegBank) {
2057  applyDefaultMapping(OpdMapper);
2058  // The standard handling only considers the result register bank for
2059  // phis. For VCC, blindly inserting a copy when the phi is lowered will
2060  // produce an invalid copy. We can only copy with some kind of compare to
2061  // get a vector boolean result. Insert a register bank copy that will be
2062  // correctly lowered to a compare.
2063  MachineIRBuilder B(*MI.getParent()->getParent());
2064 
2065  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
2066  Register SrcReg = MI.getOperand(I).getReg();
2067  const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
2068 
2069  if (SrcBank != &AMDGPU::VCCRegBank) {
2070  MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
2071  B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
2072 
2073  auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
2074  MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
2075  MI.getOperand(I).setReg(Copy.getReg(0));
2076  }
2077  }
2078 
2079  return;
2080  }
2081 
2082  // Phi handling is strange and only considers the bank of the destination.
2083  substituteSimpleCopyRegs(OpdMapper, 0);
2084 
2085  // Promote SGPR/VGPR booleans to s32
2086  MachineFunction *MF = MI.getParent()->getParent();
2087  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2088  MachineIRBuilder B(MI, ApplyBank);
2089  LegalizerHelper Helper(*MF, ApplyBank, B);
2090 
2091  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2092  llvm_unreachable("widen scalar should have succeeded");
2093 
2094  return;
2095  }
2096  case AMDGPU::G_ICMP:
2097  case AMDGPU::G_UADDO:
2098  case AMDGPU::G_USUBO:
2099  case AMDGPU::G_UADDE:
2100  case AMDGPU::G_SADDE:
2101  case AMDGPU::G_USUBE:
2102  case AMDGPU::G_SSUBE: {
2103  unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
2104  Register DstReg = MI.getOperand(BoolDstOp).getReg();
2105 
2106  const RegisterBank *DstBank =
2107  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2108  if (DstBank != &AMDGPU::SGPRRegBank)
2109  break;
2110 
2111  const bool HasCarryIn = MI.getNumOperands() == 5;
2112 
2113  // If this is a scalar compare, promote the result to s32, as the selection
2114  // will end up using a copy to a 32-bit vreg.
2115  const LLT S32 = LLT::scalar(32);
2116  Register NewDstReg = MRI.createGenericVirtualRegister(S32);
2117  MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
2118  MI.getOperand(BoolDstOp).setReg(NewDstReg);
2119  MachineIRBuilder B(MI);
2120 
2121  if (HasCarryIn) {
2122  Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
2123  MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
2124  B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
2125  MI.getOperand(4).setReg(NewSrcReg);
2126  }
2127 
2128  MachineBasicBlock *MBB = MI.getParent();
2129  B.setInsertPt(*MBB, std::next(MI.getIterator()));
2130 
2131  // If we had a constrained VCC result register, a copy was inserted to VCC
2132  // from SGPR.
2133  SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
2134  if (DefRegs.empty())
2135  DefRegs.push_back(DstReg);
2136  B.buildTrunc(DefRegs[0], NewDstReg);
2137  return;
2138  }
2139  case AMDGPU::G_SELECT: {
2140  Register DstReg = MI.getOperand(0).getReg();
2141  LLT DstTy = MRI.getType(DstReg);
2142 
2143  SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
2144  if (CondRegs.empty())
2145  CondRegs.push_back(MI.getOperand(1).getReg());
2146  else {
2147  assert(CondRegs.size() == 1);
2148  }
2149 
2150  const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
2151  if (CondBank == &AMDGPU::SGPRRegBank) {
2152  MachineIRBuilder B(MI);
2153  const LLT S32 = LLT::scalar(32);
2154  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2155  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2156 
2157  MI.getOperand(1).setReg(NewCondReg);
2158  B.buildZExt(NewCondReg, CondRegs[0]);
2159  }
2160 
2161  if (DstTy.getSizeInBits() != 64)
2162  break;
2163 
2164  MachineIRBuilder B(MI);
2165  LLT HalfTy = getHalfSizedType(DstTy);
2166 
2167  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2168  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2169  SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
2170 
2171  // All inputs are SGPRs, nothing special to do.
2172  if (DefRegs.empty()) {
2173  assert(Src1Regs.empty() && Src2Regs.empty());
2174  break;
2175  }
2176 
2177  if (Src1Regs.empty())
2178  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2179  else {
2180  setRegsToType(MRI, Src1Regs, HalfTy);
2181  }
2182 
2183  if (Src2Regs.empty())
2184  split64BitValueForMapping(B, Src2Regs, HalfTy, MI.getOperand(3).getReg());
2185  else
2186  setRegsToType(MRI, Src2Regs, HalfTy);
2187 
2188  setRegsToType(MRI, DefRegs, HalfTy);
2189 
2190  B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
2191  B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
2192 
2193  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2194  MI.eraseFromParent();
2195  return;
2196  }
2197  case AMDGPU::G_BRCOND: {
2198  Register CondReg = MI.getOperand(0).getReg();
2199  // FIXME: Should use legalizer helper, but should change bool ext type.
2200  const RegisterBank *CondBank =
2201  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2202 
2203  if (CondBank == &AMDGPU::SGPRRegBank) {
2204  MachineIRBuilder B(MI);
2205  const LLT S32 = LLT::scalar(32);
2206  Register NewCondReg = MRI.createGenericVirtualRegister(S32);
2207  MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
2208 
2209  MI.getOperand(0).setReg(NewCondReg);
2210  B.buildZExt(NewCondReg, CondReg);
2211  return;
2212  }
2213 
2214  break;
2215  }
2216  case AMDGPU::G_AND:
2217  case AMDGPU::G_OR:
2218  case AMDGPU::G_XOR: {
2219  // 64-bit and is only available on the SALU, so split into 2 32-bit ops if
2220  // there is a VGPR input.
2221  Register DstReg = MI.getOperand(0).getReg();
2222  LLT DstTy = MRI.getType(DstReg);
2223 
2224  if (DstTy.getSizeInBits() == 1) {
2225  const RegisterBank *DstBank =
2226  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2227  if (DstBank == &AMDGPU::VCCRegBank)
2228  break;
2229 
2230  MachineFunction *MF = MI.getParent()->getParent();
2231  ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
2232  MachineIRBuilder B(MI, ApplyBank);
2233  LegalizerHelper Helper(*MF, ApplyBank, B);
2234 
2235  if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
2236  LegalizerHelper::Legalized)
2237  llvm_unreachable("widen scalar should have succeeded");
2238  return;
2239  }
2240 
2241  if (DstTy.getSizeInBits() != 64)
2242  break;
2243 
2244  LLT HalfTy = getHalfSizedType(DstTy);
2245  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2246  SmallVector<Register, 2> Src0Regs(OpdMapper.getVRegs(1));
2247  SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
2248 
2249  // All inputs are SGPRs, nothing special to do.
2250  if (DefRegs.empty()) {
2251  assert(Src0Regs.empty() && Src1Regs.empty());
2252  break;
2253  }
2254 
2255  assert(DefRegs.size() == 2);
2256  assert(Src0Regs.size() == Src1Regs.size() &&
2257  (Src0Regs.empty() || Src0Regs.size() == 2));
2258 
2259  // Depending on where the source registers came from, the generic code may
2260  // have decided to split the inputs already or not. If not, we still need to
2261  // extract the values.
2262  MachineIRBuilder B(MI);
2263 
2264  if (Src0Regs.empty())
2265  split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
2266  else
2267  setRegsToType(MRI, Src0Regs, HalfTy);
2268 
2269  if (Src1Regs.empty())
2270  split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
2271  else
2272  setRegsToType(MRI, Src1Regs, HalfTy);
2273 
2274  setRegsToType(MRI, DefRegs, HalfTy);
2275 
2276  B.buildInstr(Opc, {DefRegs[0]}, {Src0Regs[0], Src1Regs[0]});
2277  B.buildInstr(Opc, {DefRegs[1]}, {Src0Regs[1], Src1Regs[1]});
2278 
2279  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2280  MI.eraseFromParent();
2281  return;
2282  }
2283  case AMDGPU::G_ADD:
2284  case AMDGPU::G_SUB:
2285  case AMDGPU::G_MUL:
2286  case AMDGPU::G_SHL:
2287  case AMDGPU::G_LSHR:
2288  case AMDGPU::G_ASHR:
2289  case AMDGPU::G_SMIN:
2290  case AMDGPU::G_SMAX:
2291  case AMDGPU::G_UMIN:
2292  case AMDGPU::G_UMAX: {
2293  Register DstReg = MI.getOperand(0).getReg();
2294  LLT DstTy = MRI.getType(DstReg);
2295 
2296  // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
2297  // Packed 16-bit operations need to be scalarized and promoted.
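 // For example, an SGPR <2 x s16> add is unpacked below into two s32
 // operations and the results are repacked with G_BUILD_VECTOR_TRUNC.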
2298  if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
2299  break;
2300 
2301  const RegisterBank *DstBank =
2302  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2303  if (DstBank == &AMDGPU::VGPRRegBank)
2304  break;
2305 
2306  const LLT S32 = LLT::scalar(32);
2307  MachineBasicBlock *MBB = MI.getParent();
2308  MachineFunction *MF = MBB->getParent();
2309  ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
2310  MachineIRBuilder B(MI, ApplySALU);
2311 
2312  if (DstTy.isVector()) {
2313  Register WideSrc0Lo, WideSrc0Hi;
2314  Register WideSrc1Lo, WideSrc1Hi;
2315 
2316  unsigned ExtendOp = getExtendOp(MI.getOpcode());
2317  std::tie(WideSrc0Lo, WideSrc0Hi)
2318  = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), ExtendOp);
2319  std::tie(WideSrc1Lo, WideSrc1Hi)
2320  = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), ExtendOp);
2321  auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
2322  auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
2323  B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
2324  MI.eraseFromParent();
2325  } else {
2326  LegalizerHelper Helper(*MF, ApplySALU, B);
2327 
2328  if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
2329  llvm_unreachable("widen scalar should have succeeded");
2330 
2331  // FIXME: s16 shift amounts should be legal.
2332  if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
2333  Opc == AMDGPU::G_ASHR) {
2334  B.setInsertPt(*MBB, MI.getIterator());
2335  if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2336  llvm_unreachable("widen scalar should have succeeded");
2337  }
2338  }
2339 
2340  return;
2341  }
2342  case AMDGPU::G_SEXT_INREG: {
2343  SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
2344  if (SrcRegs.empty())
2345  break; // Nothing to repair
2346 
2347  const LLT S32 = LLT::scalar(32);
2348  MachineIRBuilder B(MI);
2349  ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
2350  GISelObserverWrapper Observer(&O);
2351  B.setChangeObserver(Observer);
2352 
2353  // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
2354  // we would need to further expand, and doesn't let us directly set the
2355  // result registers.
2356  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2357 
2358  int Amt = MI.getOperand(2).getImm();
2359  if (Amt <= 32) {
2360  if (Amt == 32) {
2361  // The low bits are unchanged.
2362  B.buildCopy(DstRegs[0], SrcRegs[0]);
2363  } else {
2364  // Extend in the low bits and propagate the sign bit to the high half.
2365  B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
2366  }
2367 
2368  B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
2369  } else {
2370  // The low bits are unchanged, and extend in the high bits.
2371  B.buildCopy(DstRegs[0], SrcRegs[0]);
2372  B.buildSExtInReg(DstRegs[1], SrcRegs[1], Amt - 32);
2373  }
2374 
2375  Register DstReg = MI.getOperand(0).getReg();
2376  MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
2377  MI.eraseFromParent();
2378  return;
2379  }
2380  case AMDGPU::G_CTPOP:
2381  case AMDGPU::G_BITREVERSE:
2382  case AMDGPU::G_CTLZ_ZERO_UNDEF:
2383  case AMDGPU::G_CTTZ_ZERO_UNDEF: {
2384  const RegisterBank *DstBank =
2385  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2386  if (DstBank == &AMDGPU::SGPRRegBank)
2387  break;
2388 
2389  Register SrcReg = MI.getOperand(1).getReg();
2390  const LLT S32 = LLT::scalar(32);
2391  LLT Ty = MRI.getType(SrcReg);
2392  if (Ty == S32)
2393  break;
2394 
2395  ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
2396  MachineIRBuilder B(MI, ApplyVALU);
2397 
2398  MachineFunction &MF = B.getMF();
2399  LegalizerHelper Helper(MF, ApplyVALU, B);
2400 
2401  if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
2402  llvm_unreachable("narrowScalar should have succeeded");
2403  return;
2404  }
2405  case AMDGPU::G_SEXT:
2406  case AMDGPU::G_ZEXT:
2407  case AMDGPU::G_ANYEXT: {
2408  Register SrcReg = MI.getOperand(1).getReg();
2409  LLT SrcTy = MRI.getType(SrcReg);
2410  const bool Signed = Opc == AMDGPU::G_SEXT;
2411 
2412  assert(empty(OpdMapper.getVRegs(1)));
2413 
2414  MachineIRBuilder B(MI);
2415  const RegisterBank *SrcBank =
2416  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2417 
2418  Register DstReg = MI.getOperand(0).getReg();
2419  LLT DstTy = MRI.getType(DstReg);
2420  if (DstTy.isScalar() &&
2421  SrcBank != &AMDGPU::SGPRRegBank &&
2422  SrcBank != &AMDGPU::VCCRegBank &&
2423  // FIXME: Should handle any type that round to s64 when irregular
2424  // breakdowns supported.
2425  DstTy.getSizeInBits() == 64 &&
2426  SrcTy.getSizeInBits() <= 32) {
2427  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2428 
2429  // Extend to 32-bit, and then extend the low half.
2430  if (Signed) {
2431  // TODO: Should really be buildSExtOrCopy
2432  B.buildSExtOrTrunc(DefRegs[0], SrcReg);
2433  } else if (Opc == AMDGPU::G_ZEXT) {
2434  B.buildZExtOrTrunc(DefRegs[0], SrcReg);
2435  } else {
2436  B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
2437  }
2438 
2439  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
2440  MRI.setRegBank(DstReg, *SrcBank);
2441  MI.eraseFromParent();
2442  return;
2443  }
2444 
2445  if (SrcTy != LLT::scalar(1))
2446  return;
2447 
2448  // It is not legal to have a legalization artifact with a VCC source. Rather
2449  // than introducing a copy, insert the select that the copy would have been
2450  // selected to.
2451  if (SrcBank == &AMDGPU::VCCRegBank) {
2452  SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
2453 
2454  const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
2455 
2456  unsigned DstSize = DstTy.getSizeInBits();
2457  // 64-bit select is SGPR only
2458  const bool UseSel64 = DstSize > 32 &&
2459  SrcBank->getID() == AMDGPU::SGPRRegBankID;
2460 
2461  // TODO: Should s16 select be legal?
2462  LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
2463  auto True = B.buildConstant(SelType, Signed ? -1 : 1);
2464  auto False = B.buildConstant(SelType, 0);
2465 
2466  MRI.setRegBank(True.getReg(0), *DstBank);
2467  MRI.setRegBank(False.getReg(0), *DstBank);
2468  MRI.setRegBank(DstReg, *DstBank);
2469 
2470  if (DstSize > 32) {
2471  B.buildSelect(DefRegs[0], SrcReg, True, False);
2472  extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
2473  } else if (DstSize < 32) {
2474  auto Sel = B.buildSelect(SelType, SrcReg, True, False);
2475  MRI.setRegBank(Sel.getReg(0), *DstBank);
2476  B.buildTrunc(DstReg, Sel);
2477  } else {
2478  B.buildSelect(DstReg, SrcReg, True, False);
2479  }
2480 
2481  MI.eraseFromParent();
2482  return;
2483  }
2484 
2485  break;
2486  }
2487  case AMDGPU::G_BUILD_VECTOR:
2488  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
2489  Register DstReg = MI.getOperand(0).getReg();
2490  LLT DstTy = MRI.getType(DstReg);
2491  if (DstTy != LLT::vector(2, 16))
2492  break;
2493 
2494  assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
2495  substituteSimpleCopyRegs(OpdMapper, 1);
2496  substituteSimpleCopyRegs(OpdMapper, 2);
2497 
2498  const RegisterBank *DstBank =
2499  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2500  if (DstBank == &AMDGPU::SGPRRegBank)
2501  break; // Can use S_PACK_* instructions.
2502 
2503  MachineIRBuilder B(MI);
2504 
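 // VGPR case: pack the two 16-bit halves manually as (Hi << 16) | (Lo & 0xffff)
 // and bitcast the 32-bit result back to <2 x s16>.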
2505  Register Lo = MI.getOperand(1).getReg();
2506  Register Hi = MI.getOperand(2).getReg();
2507  const LLT S32 = LLT::scalar(32);
2508 
2509  const RegisterBank *BankLo =
2510  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2511  const RegisterBank *BankHi =
2512  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2513 
2514  Register ZextLo;
2515  Register ShiftHi;
2516 
2517  if (Opc == AMDGPU::G_BUILD_VECTOR) {
2518  ZextLo = B.buildZExt(S32, Lo).getReg(0);
2519  MRI.setRegBank(ZextLo, *BankLo);
2520 
2521  Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
2522  MRI.setRegBank(ZextHi, *BankHi);
2523 
2524  auto ShiftAmt = B.buildConstant(S32, 16);
2525  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2526 
2527  ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
2528  MRI.setRegBank(ShiftHi, *BankHi);
2529  } else {
2530  Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
2531  MRI.setRegBank(MaskLo, *BankLo);
2532 
2533  auto ShiftAmt = B.buildConstant(S32, 16);
2534  MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
2535 
2536  ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
2537  MRI.setRegBank(ShiftHi, *BankHi);
2538 
2539  ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
2540  MRI.setRegBank(ZextLo, *BankLo);
2541  }
2542 
2543  auto Or = B.buildOr(S32, ZextLo, ShiftHi);
2544  MRI.setRegBank(Or.getReg(0), *DstBank);
2545 
2546  B.buildBitcast(DstReg, Or);
2547  MI.eraseFromParent();
2548  return;
2549  }
2550  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
2551  SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
2552 
2553  assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
2554 
2555  Register DstReg = MI.getOperand(0).getReg();
2556  Register SrcReg = MI.getOperand(1).getReg();
2557 
2558  const LLT S32 = LLT::scalar(32);
2559  LLT DstTy = MRI.getType(DstReg);
2560  LLT SrcTy = MRI.getType(SrcReg);
2561 
2562  if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
2563  return;
2564 
2565  MachineIRBuilder B(MI);
2566 
2567  const ValueMapping &DstMapping
2568  = OpdMapper.getInstrMapping().getOperandMapping(0);
2569  const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
2570  const RegisterBank *SrcBank =
2571  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2572  const RegisterBank *IdxBank =
2573  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2574 
2575  Register BaseIdxReg;
2576  unsigned ConstOffset;
2577  std::tie(BaseIdxReg, ConstOffset) =
2578  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
2579 
2580  // See if the index is an add of a constant which will be foldable by moving
2581  // the base register of the index later if this is going to be executed in a
2582  // waterfall loop. This is essentially to reassociate the add of a constant
2583  // with the readfirstlane.
2584  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2585  ConstOffset > 0 &&
2586  ConstOffset < SrcTy.getNumElements();
2587 
2588  // Move the base register. We'll re-insert the add later.
2589  if (ShouldMoveIndexIntoLoop)
2590  MI.getOperand(2).setReg(BaseIdxReg);
2591 
2592  // If this is a VGPR result only because the index was a VGPR result, the
2593  // actual indexing will be done on the SGPR source vector, which will
2594  // produce a scalar result. We need to copy to the VGPR result inside the
2595  // waterfall loop.
2596  const bool NeedCopyToVGPR = DstBank == &AMDGPU::VGPRRegBank &&
2597  SrcBank == &AMDGPU::SGPRRegBank;
2598  if (DstRegs.empty()) {
2599  applyDefaultMapping(OpdMapper);
2600 
2601  executeInWaterfallLoop(MI, MRI, { 2 });
2602 
2603  if (NeedCopyToVGPR) {
2604  // We don't want a phi for this temporary reg.
2605  Register TmpReg = MRI.createGenericVirtualRegister(DstTy);
2606  MRI.setRegBank(TmpReg, AMDGPU::SGPRRegBank);
2607  MI.getOperand(0).setReg(TmpReg);
2608  B.setInsertPt(*MI.getParent(), ++MI.getIterator());
2609 
2610  // Use a v_mov_b32 here to make the exec dependency explicit.
2611  buildVCopy(B, DstReg, TmpReg);
2612  }
2613 
2614  // Re-insert the constant offset add inside the waterfall loop.
2615  if (ShouldMoveIndexIntoLoop)
2616  reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
2617 
2618  return;
2619  }
2620 
2621  assert(DstTy.getSizeInBits() == 64);
2622 
2623  LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
2624 
2625  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2626  auto One = B.buildConstant(S32, 1);
2627 
2628  MachineBasicBlock::iterator MII = MI.getIterator();
2629 
2630  // Split the vector index into 32-bit pieces. Prepare to move all of the
2631  // new instructions into a waterfall loop if necessary.
2632  //
2633  // Don't put the bitcast or constant in the loop.
2634  MachineInstrSpan Span(MII, &B.getMBB());
2635 
2636  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
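 // For example, 64-bit element 3 is read as 32-bit elements 6 and 7 of the
 // bitcast vector.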
2637  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2638  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2639 
2640  auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
2641  auto Extract1 = B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
2642 
2643  MRI.setRegBank(DstReg, *DstBank);
2644  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2645  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2646  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2647  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2648 
2649  SmallSet<Register, 4> OpsToWaterfall;
2650  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
2651  MI.eraseFromParent();
2652  return;
2653  }
2654 
2655  // Remove the original instruction to avoid potentially confusing the
2656  // waterfall loop logic.
2657  B.setInstr(*Span.begin());
2658  MI.eraseFromParent();
2659  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2660  OpsToWaterfall, MRI);
2661 
2662  if (NeedCopyToVGPR) {
2663  MachineBasicBlock *LoopBB = Extract1->getParent();
2664  Register TmpReg0 = MRI.createGenericVirtualRegister(S32);
2665  Register TmpReg1 = MRI.createGenericVirtualRegister(S32);
2666  MRI.setRegBank(TmpReg0, AMDGPU::SGPRRegBank);
2667  MRI.setRegBank(TmpReg1, AMDGPU::SGPRRegBank);
2668 
2669  Extract0->getOperand(0).setReg(TmpReg0);
2670  Extract1->getOperand(0).setReg(TmpReg1);
2671 
2672  B.setInsertPt(*LoopBB, ++Extract1->getIterator());
2673 
2674  buildVCopy(B, DstRegs[0], TmpReg0);
2675  buildVCopy(B, DstRegs[1], TmpReg1);
2676  }
2677 
2678  if (ShouldMoveIndexIntoLoop)
2679  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2680 
2681  return;
2682  }
2683  case AMDGPU::G_INSERT_VECTOR_ELT: {
2684  SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
2685 
2686  Register DstReg = MI.getOperand(0).getReg();
2687  LLT VecTy = MRI.getType(DstReg);
2688 
2689  assert(OpdMapper.getVRegs(0).empty());
2690  assert(OpdMapper.getVRegs(3).empty());
2691 
2692  if (substituteSimpleCopyRegs(OpdMapper, 1))
2693  MRI.setType(MI.getOperand(1).getReg(), VecTy);
2694 
2695  if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
2696  return;
2697 
2698  const RegisterBank *IdxBank =
2699  OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
2700 
2701  Register SrcReg = MI.getOperand(1).getReg();
2702  Register InsReg = MI.getOperand(2).getReg();
2703  LLT InsTy = MRI.getType(InsReg);
2704  (void)InsTy;
2705 
2706  Register BaseIdxReg;
2707  unsigned ConstOffset;
2708  std::tie(BaseIdxReg, ConstOffset) =
2709  AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
2710 
2711  // See if the index is an add of a constant which will be foldable by moving
2712  // the base register of the index later if this is going to be executed in a
2713  // waterfall loop. This is essentially to reassociate the add of a constant
2714  // with the readfirstlane.
2715  bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
2716  ConstOffset > 0 &&
2717  ConstOffset < VecTy.getNumElements();
2718 
2719  // Move the base register. We'll re-insert the add later.
2720  if (ShouldMoveIndexIntoLoop)
2721  MI.getOperand(3).setReg(BaseIdxReg);
2722 
2723 
2724  if (InsRegs.empty()) {
2725  executeInWaterfallLoop(MI, MRI, { 3 });
2726 
2727  // Re-insert the constant offset add inside the waterfall loop.
2728  if (ShouldMoveIndexIntoLoop) {
2729  MachineIRBuilder B(MI);
2730  reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
2731  }
2732 
2733  return;
2734  }
2735 
2736 
2737  assert(InsTy.getSizeInBits() == 64);
2738 
2739  const LLT S32 = LLT::scalar(32);
2740  LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
2741  MachineIRBuilder B(MI);
2741 
2743  auto CastSrc = B.buildBitcast(Vec32, SrcReg);
2744  auto One = B.buildConstant(S32, 1);
2745 
2746  // Split the vector index into 32-bit pieces. Prepare to move all of the
2747  // new instructions into a waterfall loop if necessary.
2748  //
2749  // Don't put the bitcast or constant in the loop.
2750  MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
2751 
2752  // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
2753  auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
2754  auto IdxHi = B.buildAdd(S32, IdxLo, One);
2755 
2756  auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
2757  auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
2758 
2759  const RegisterBank *DstBank =
2760  OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
2761  const RegisterBank *SrcBank =
2762  OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
2763  const RegisterBank *InsSrcBank =
2764  OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
2765 
2766  MRI.setRegBank(InsReg, *InsSrcBank);
2767  MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
2768  MRI.setRegBank(InsLo.getReg(0), *DstBank);
2769  MRI.setRegBank(InsHi.getReg(0), *DstBank);
2770  MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
2771  MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
2772  MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
2773 
2774 
2775  SmallSet<Register, 4> OpsToWaterfall;
2776  if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
2777  B.setInsertPt(B.getMBB(), MI);
2778  B.buildBitcast(DstReg, InsHi);
2779  MI.eraseFromParent();
2780  return;
2781  }
2782 
2783  B.setInstr(*Span.begin());
2784  MI.eraseFromParent();
2785 
2786  // Figure out the point after the waterfall loop before mangling the control
2787  // flow.
2788  executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
2789  OpsToWaterfall, MRI);
2790 
2791  // The insertion point is now right after the original instruction.
2792  //
2793  // Keep the bitcast to the original vector type out of the loop. Doing this
2794  // saves an extra phi we don't need inside the loop.
2795  B.buildBitcast(DstReg, InsHi);
2796 
2797  // Re-insert the constant offset add inside the waterfall loop.
2798  if (ShouldMoveIndexIntoLoop)
2799  reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
2800 
2801  return;
2802  }
2803  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
2804  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
2805  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
2806  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
2807  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
2808  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
2809  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
2810  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
2811  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
2812  case AMDGPU::G_AMDGPU_BUFFER_STORE:
2813  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
2814  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
2815  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
2816  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
2817  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
2818  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
2819  applyDefaultMapping(OpdMapper);
2820  executeInWaterfallLoop(MI, MRI, {1, 4});
2821  return;
2822  }
2823  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
2824  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
2825  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
2826  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
2827  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
2828  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
2829  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
2830  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
2831  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
2832  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
2833  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
2834  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
2835  applyDefaultMapping(OpdMapper);
2836  executeInWaterfallLoop(MI, MRI, {2, 5});
2837  return;
2838  }
2839  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
2840  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
2841  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
2842  applyDefaultMapping(OpdMapper);
2843  executeInWaterfallLoop(MI, MRI, {2, 5});
2844  return;
2845  }
2846  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
2847  applyDefaultMapping(OpdMapper);
2848  executeInWaterfallLoop(MI, MRI, {3, 6});
2849  return;
2850  }
2851  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
2852  applyMappingSBufferLoad(OpdMapper);
2853  return;
2854  }
2855  case AMDGPU::G_INTRINSIC: {
2856  switch (MI.getIntrinsicID()) {
2857  case Intrinsic::amdgcn_readlane: {
2858  substituteSimpleCopyRegs(OpdMapper, 2);
2859 
2860  assert(OpdMapper.getVRegs(0).empty());
2861  assert(OpdMapper.getVRegs(3).empty());
2862 
2863  // Make sure the index is an SGPR. It doesn't make sense to run this in a
2864  // waterfall loop, so assume it's a uniform value.
2865  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2866  return;
2867  }
2868  case Intrinsic::amdgcn_writelane: {
2869  assert(OpdMapper.getVRegs(0).empty());
2870  assert(OpdMapper.getVRegs(2).empty());
2871  assert(OpdMapper.getVRegs(3).empty());
2872 
2873  substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
2874  constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
2875  constrainOpWithReadfirstlane(MI, MRI, 3); // Index
2876  return;
2877  }
2878  case Intrinsic::amdgcn_interp_p1:
2879  case Intrinsic::amdgcn_interp_p2:
2880  case Intrinsic::amdgcn_interp_mov:
2881  case Intrinsic::amdgcn_interp_p1_f16:
2882  case Intrinsic::amdgcn_interp_p2_f16: {
2883  applyDefaultMapping(OpdMapper);
2884 
2885  // Readlane for m0 value, which is always the last operand.
2886  // FIXME: Should this be a waterfall loop instead?
2887  constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
2888  return;
2889  }
2890  case Intrinsic::amdgcn_permlane16:
2891  case Intrinsic::amdgcn_permlanex16: {
2892  // Doing a waterfall loop over these wouldn't make any sense.
2893  substituteSimpleCopyRegs(OpdMapper, 2);
2894  substituteSimpleCopyRegs(OpdMapper, 3);
2895  constrainOpWithReadfirstlane(MI, MRI, 4);
2896  constrainOpWithReadfirstlane(MI, MRI, 5);
2897  return;
2898  }
2899  case Intrinsic::amdgcn_sbfe:
2900  applyMappingBFEIntrinsic(OpdMapper, true);
2901  return;
2902  case Intrinsic::amdgcn_ubfe:
2903  applyMappingBFEIntrinsic(OpdMapper, false);
2904  return;
2905  case Intrinsic::amdgcn_ballot:
2906  // Use default handling and insert copy to vcc source.
2907  break;
2908  }
2909  break;
2910  }
2911  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2912  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2913  const AMDGPU::RsrcIntrinsic *RSrcIntrin
2914  = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
2915  assert(RSrcIntrin && RSrcIntrin->IsImage);
2916  // Non-images can have complications from operands that allow both SGPR
2917  // and VGPR. For now it's too complicated to figure out the final opcode
2918  // to derive the register bank from the MCInstrDesc.
2919  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2920  return;
2921  }
2922  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
2923  unsigned N = MI.getNumExplicitOperands() - 2;
2924  executeInWaterfallLoop(MI, MRI, { N });
2925  return;
2926  }
2927  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
2928  auto IntrID = MI.getIntrinsicID();
2929  switch (IntrID) {
2930  case Intrinsic::amdgcn_ds_ordered_add:
2931  case Intrinsic::amdgcn_ds_ordered_swap: {
2932  // This is only allowed to execute with 1 lane, so readfirstlane is safe.
2933  assert(OpdMapper.getVRegs(0).empty());
2934  substituteSimpleCopyRegs(OpdMapper, 3);
2935  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2936  return;
2937  }
2938  case Intrinsic::amdgcn_ds_gws_init:
2939  case Intrinsic::amdgcn_ds_gws_barrier:
2940  case Intrinsic::amdgcn_ds_gws_sema_br: {
2941  // Only the first lane executes, so readfirstlane is safe.
2942  substituteSimpleCopyRegs(OpdMapper, 1);
2943  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2944  return;
2945  }
2946  case Intrinsic::amdgcn_ds_gws_sema_v:
2947  case Intrinsic::amdgcn_ds_gws_sema_p:
2948  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
2949  // Only the first lane executes, so readfirstlane is safe.
2950  constrainOpWithReadfirstlane(MI, MRI, 1); // M0
2951  return;
2952  }
2953  case Intrinsic::amdgcn_ds_append:
2954  case Intrinsic::amdgcn_ds_consume: {
2955  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2956  return;
2957  }
2958  case Intrinsic::amdgcn_s_sendmsg:
2959  case Intrinsic::amdgcn_s_sendmsghalt: {
2960  // FIXME: Should this use a waterfall loop?
2961  constrainOpWithReadfirstlane(MI, MRI, 2); // M0
2962  return;
2963  }
2964  case Intrinsic::amdgcn_s_setreg: {
2965  constrainOpWithReadfirstlane(MI, MRI, 2);
2966  return;
2967  }
2968  default: {
2969  if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
2970  AMDGPU::lookupRsrcIntrinsic(IntrID)) {
2971  // Non-images can have complications from operands that allow both SGPR
2972  // and VGPR. For now it's too complicated to figure out the final opcode
2973  // to derive the register bank from the MCInstrDesc.
2974  if (RSrcIntrin->IsImage) {
2975  applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
2976  return;
2977  }
2978  }
2979 
2980  break;
2981  }
2982  }
2983  break;
2984  }
2985  case AMDGPU::G_LOAD:
2986  case AMDGPU::G_ZEXTLOAD:
2987  case AMDGPU::G_SEXTLOAD: {
2988  if (applyMappingLoad(MI, OpdMapper, MRI))
2989  return;
2990  break;
2991  }
2992  case AMDGPU::G_DYN_STACKALLOC:
2993  applyMappingDynStackAlloc(MI, OpdMapper, MRI);
2994  return;
2995  default:
2996  break;
2997  }
2998 
2999  return applyDefaultMapping(OpdMapper);
3000 }
3001 
3002 // vgpr, sgpr -> vgpr
3003 // vgpr, agpr -> vgpr
3004 // agpr, agpr -> agpr
3005 // agpr, sgpr -> vgpr
3006 static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
3007  if (RB0 == AMDGPU::InvalidRegBankID)
3008  return RB1;
3009  if (RB1 == AMDGPU::InvalidRegBankID)
3010  return RB0;
3011 
3012  if (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID)
3013  return AMDGPU::SGPRRegBankID;
3014 
3015  if (RB0 == AMDGPU::AGPRRegBankID && RB1 == AMDGPU::AGPRRegBankID)
3016  return AMDGPU::AGPRRegBankID;
3017 
3018  return AMDGPU::VGPRRegBankID;
3019 }
3020 
3021 static unsigned regBankBoolUnion(unsigned RB0, unsigned RB1) {
3022  if (RB0 == AMDGPU::InvalidRegBankID)
3023  return RB1;
3024  if (RB1 == AMDGPU::InvalidRegBankID)
3025  return RB0;
3026 
3027  // vcc, vcc -> vcc
3028  // vcc, sgpr -> vcc
3029  // vcc, vgpr -> vcc
3030  if (RB0 == AMDGPU::VCCRegBankID || RB1 == AMDGPU::VCCRegBankID)
3031  return AMDGPU::VCCRegBankID;
3032 
3033  // vcc, vgpr -> vgpr
3034  return regBankUnion(RB0, RB1);
3035 }
3036 
3037 unsigned AMDGPURegisterBankInfo::getMappingType(const MachineRegisterInfo &MRI,
3038  const MachineInstr &MI) const {
3039  unsigned RegBank = AMDGPU::InvalidRegBankID;
3040 
3041  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3042  if (!MI.getOperand(i).isReg())
3043  continue;
3044  Register Reg = MI.getOperand(i).getReg();
3045  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3046  RegBank = regBankUnion(RegBank, Bank->getID());
3047  if (RegBank == AMDGPU::VGPRRegBankID)
3048  break;
3049  }
3050  }
3051 
3052  return RegBank;
3053 }
3054 
3055 bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
3056  const MachineFunction &MF = *MI.getParent()->getParent();
3057  const MachineRegisterInfo &MRI = MF.getRegInfo();
3058  for (unsigned i = 0, e = MI.getNumOperands();i != e; ++i) {
3059  if (!MI.getOperand(i).isReg())
3060  continue;
3061  Register Reg = MI.getOperand(i).getReg();
3062  if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
3063  if (Bank->getID() != AMDGPU::SGPRRegBankID)
3064  return false;
3065  }
3066  }
3067  return true;
3068 }
3069 
3070 const RegisterBankInfo::InstructionMapping &
3071 AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
3072  const MachineFunction &MF = *MI.getParent()->getParent();
3073  const MachineRegisterInfo &MRI = MF.getRegInfo();
3074  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3075 
3076  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3077  const MachineOperand &SrcOp = MI.getOperand(i);
3078  if (!SrcOp.isReg())
3079  continue;
3080 
3081  unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
3082  OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3083  }
3084  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3085  MI.getNumOperands());
3086 }
3087 
3088 const RegisterBankInfo::InstructionMapping &
3089 AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
3090  const MachineFunction &MF = *MI.getParent()->getParent();
3091  const MachineRegisterInfo &MRI = MF.getRegInfo();
3092  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3093 
3094  // Even though we technically could use SGPRs, this would require knowledge of
3095  // the constant bus restriction. Force all sources to VGPR (except for VCC).
3096  //
3097  // TODO: Unary ops are trivially OK, so accept SGPRs?
3098  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3099  const MachineOperand &Src = MI.getOperand(i);
3100  if (!Src.isReg())
3101  continue;
3102 
3103  unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
3104  unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
3105  OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
3106  }
3107 
3108  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3109  MI.getNumOperands());
3110 }
3111 
3112 const RegisterBankInfo::InstructionMapping &
3113 AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
3114  const MachineFunction &MF = *MI.getParent()->getParent();
3115  const MachineRegisterInfo &MRI = MF.getRegInfo();
3116  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3117 
3118  for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
3119  const MachineOperand &Op = MI.getOperand(I);
3120  if (!Op.isReg())
3121  continue;
3122 
3123  unsigned Size = getSizeInBits(Op.getReg(), MRI, *TRI);
3124  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3125  }
3126 
3127  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
3128  MI.getNumOperands());
3129 }
3130 
3131 const RegisterBankInfo::InstructionMapping &
3132 AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
3133  const MachineInstr &MI,
3134  int RsrcIdx) const {
3135  // The reported argument index is relative to the IR intrinsic call arguments,
3136  // so we need to shift by the number of defs and the intrinsic ID.
3137  RsrcIdx += MI.getNumExplicitDefs() + 1;
3138 
3139  const int NumOps = MI.getNumOperands();
3140  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
3141 
3142  // TODO: Should packed/unpacked D16 difference be reported here as part of
3143  // the value mapping?
3144  for (int I = 0; I != NumOps; ++I) {
3145  if (!MI.getOperand(I).isReg())
3146  continue;
3147 
3148  Register OpReg = MI.getOperand(I).getReg();
3149  // We replace some dead address operands with $noreg
3150  if (!OpReg)
3151  continue;
3152 
3153  unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
3154 
3155  // FIXME: Probably need a new intrinsic register bank searchable table to
3156  // handle arbitrary intrinsics easily.
3157  //
3158  // If this has a sampler, it immediately follows rsrc.
3159  const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
3160 
3161  if (MustBeSGPR) {
3162  // This must be an SGPR, so we must report whatever it is as legal.
3163  unsigned NewBank = getRegBankID(OpReg, MRI, AMDGPU::SGPRRegBankID);
3164  OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
3165  } else {
3166  // Some operands must be VGPR, and these are easy to copy to.
3167  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3168  }
3169  }
3170 
3171  return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
3172 }
3173 
3174 /// Return the mapping for a pointer argument.
3175 const RegisterBankInfo::ValueMapping *
3176 AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
3177  Register PtrReg) const {
3178  LLT PtrTy = MRI.getType(PtrReg);
3179  unsigned Size = PtrTy.getSizeInBits();
3180  if (Subtarget.useFlatForGlobal() ||
3181  !AMDGPU::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
3182  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3183 
3184  // If we're using MUBUF instructions for global memory, an SGPR base register
3185  // is possible. Otherwise this needs to be a VGPR.
3186  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3187  return AMDGPU::getValueMapping(PtrBank->getID(), Size);
3188 }
3189 
3190 const RegisterBankInfo::InstructionMapping &
3191 AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
3192 
3193  const MachineFunction &MF = *MI.getParent()->getParent();
3194  const MachineRegisterInfo &MRI = MF.getRegInfo();
3195  SmallVector<const ValueMapping*, 2> OpdsMapping(2);
3196  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3197  Register PtrReg = MI.getOperand(1).getReg();
3198  LLT PtrTy = MRI.getType(PtrReg);
3199  unsigned AS = PtrTy.getAddressSpace();
3200  unsigned PtrSize = PtrTy.getSizeInBits();
3201 
3202  const ValueMapping *ValMapping;
3203  const ValueMapping *PtrMapping;
3204 
3205  const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
3206 
3207  if (PtrBank == &AMDGPU::SGPRRegBank && AMDGPU::isFlatGlobalAddrSpace(AS)) {
3208  if (isScalarLoadLegal(MI)) {
3209  // We have a uniform instruction so we want to use an SMRD load
3210  ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3211  PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
3212  } else {
3213  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3214 
3215  // If we're using MUBUF instructions for global memory, an SGPR base
3216  // register is possible. Otherwise this needs to be a VGPR.
3217  unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
3218  AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
3219 
3220  PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
3221  }
3222  } else {
3223  ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3224  PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
3225  }
3226 
3227  OpdsMapping[0] = ValMapping;
3228  OpdsMapping[1] = PtrMapping;
3229  const RegisterBankInfo::InstructionMapping &Mapping = getInstructionMapping(
3230  1, 1, getOperandsMapping(OpdsMapping), MI.getNumOperands());
3231  return Mapping;
3232 
3233  // FIXME: Do we want to add a mapping for FLAT load, or should we just
3234  // handle that during instruction selection?
3235 }
3236 
3237 unsigned
3238 AMDGPURegisterBankInfo::getRegBankID(Register Reg,
3239  const MachineRegisterInfo &MRI,
3240  unsigned Default) const {
3241  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3242  return Bank ? Bank->getID() : Default;
3243 }
3244 
3245 const RegisterBankInfo::ValueMapping *
3246 AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
3247  const MachineRegisterInfo &MRI,
3248  const TargetRegisterInfo &TRI) const {
3249  // Lie and claim anything is legal, even though this needs to be an SGPR;
3250  // applyMapping will have to deal with it as a waterfall loop.
3251  unsigned Bank = getRegBankID(Reg, MRI, AMDGPU::SGPRRegBankID);
3252  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3253  return AMDGPU::getValueMapping(Bank, Size);
3254 }
3255 
3256 const RegisterBankInfo::ValueMapping *
3257 AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
3258  const MachineRegisterInfo &MRI,
3259  const TargetRegisterInfo &TRI) const {
3260  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3261  return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3262 }
3263 
3264 const RegisterBankInfo::ValueMapping *
3265 AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
3266  const MachineRegisterInfo &MRI,
3267  const TargetRegisterInfo &TRI) const {
3268  unsigned Size = getSizeInBits(Reg, MRI, TRI);
3269  return AMDGPU::getValueMapping(AMDGPU::AGPRRegBankID, Size);
3270 }
3271 
3272 ///
3273 /// This function must return a legal mapping, because
3274 /// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
3275 /// in RegBankSelect::Mode::Fast. Any mapping that would cause a
3276 /// VGPR-to-SGPR copy to be generated is illegal.
3277 ///
3278 // Operands that must be SGPRs must accept potentially divergent VGPRs as
3279 // legal. These will be dealt with in applyMappingImpl.
3280 //
3281 const RegisterBankInfo::InstructionMapping &
3282 AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
3283  const MachineFunction &MF = *MI.getParent()->getParent();
3284  const MachineRegisterInfo &MRI = MF.getRegInfo();
3285 
3286  if (MI.isCopy() || MI.getOpcode() == AMDGPU::G_FREEZE) {
3287  // The default logic bothers to analyze impossible alternative mappings. We
3288  // want the most straightforward mapping, so just directly handle this.
3289  const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
3290  *TRI);
3291  const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
3292  *TRI);
3293  assert(SrcBank && "src bank should have been assigned already");
3294  if (!DstBank)
3295  DstBank = SrcBank;
3296 
3297  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3298  if (cannotCopy(*DstBank, *SrcBank, Size))
3299  return getInvalidInstructionMapping();
3300 
3301  const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
3302  unsigned OpdsMappingSize = MI.isCopy() ? 1 : 2;
3303  SmallVector<const ValueMapping *, 1> OpdsMapping(OpdsMappingSize);
3304  OpdsMapping[0] = &ValMap;
3305  if (MI.getOpcode() == AMDGPU::G_FREEZE)
3306  OpdsMapping[1] = &ValMap;
3307 
3308  return getInstructionMapping(
3309  1, /*Cost*/ 1,
3310  /*OperandsMapping*/ getOperandsMapping(OpdsMapping), OpdsMappingSize);
3311  }
3312 
3313  if (MI.isRegSequence()) {
3314  // If any input is a VGPR, the result must be a VGPR. The default handling
3315  // assumes any copy between banks is legal.
3316  unsigned BankID = AMDGPU::SGPRRegBankID;
3317 
3318  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3319  auto OpBank = getRegBankID(MI.getOperand(I).getReg(), MRI);
3320  // It doesn't make sense to use vcc or scc banks here, so just ignore
3321  // them.
3322  if (OpBank != AMDGPU::SGPRRegBankID) {
3323  BankID = AMDGPU::VGPRRegBankID;
3324  break;
3325  }
3326  }
3327  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3328 
3329  const ValueMapping &ValMap = getValueMapping(0, Size, getRegBank(BankID));
3330  return getInstructionMapping(
3331  1, /*Cost*/ 1,
3332  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3333  }
3334 
3335  // The default handling is broken and doesn't handle illegal SGPR->VGPR copies
3336  // properly.
3337  //
3338  // TODO: There are additional exec masking dependencies to analyze.
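  // A phi may only stay scalar if its result and every incoming value are
  // already in a scalar bank; any VGPR or unassigned input forces the whole
  // phi into the VGPR bank.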
3339  if (MI.getOpcode() == TargetOpcode::G_PHI) {
3340  unsigned ResultBank = AMDGPU::InvalidRegBankID;
3341  Register DstReg = MI.getOperand(0).getReg();
3342 
3343  // Sometimes the result may have already been assigned a bank.
3344  if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
3345  ResultBank = DstBank->getID();
3346 
3347  for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3348  Register Reg = MI.getOperand(I).getReg();
3349  const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
3350 
3351  // FIXME: Assuming VGPR for any undetermined inputs.
3352  if (!Bank || Bank->getID() == AMDGPU::VGPRRegBankID) {
3353  ResultBank = AMDGPU::VGPRRegBankID;
3354  break;
3355  }
3356 
3357  // FIXME: Need to promote SGPR case to s32
3358  unsigned OpBank = Bank->getID();
3359  ResultBank = regBankBoolUnion(ResultBank, OpBank);
3360  }
3361 
3362  assert(ResultBank != AMDGPU::InvalidRegBankID);
3363 
3364  unsigned Size = MRI.getType(DstReg).getSizeInBits();
3365 
3366  const ValueMapping &ValMap =
3367  getValueMapping(0, Size, getRegBank(ResultBank));
3368  return getInstructionMapping(
3369  1, /*Cost*/ 1,
3370  /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
3371  }
3372 
3373  const RegisterBankInfo::InstructionMapping &Mapping = getInstrMappingImpl(MI);
3374  if (Mapping.isValid())
3375  return Mapping;
3376 
3377  SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
3378 
3379  switch (MI.getOpcode()) {
3380  default:
3381  return getInvalidInstructionMapping();
3382 
3383  case AMDGPU::G_AND:
3384  case AMDGPU::G_OR:
3385  case AMDGPU::G_XOR: {
3386  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3387  if (Size == 1) {
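  // For s1 values the bank is inferred from context: a VCC destination keeps
  // everything in VCC, any VGPR input makes this a VALU op, and all-SGPR
  // inputs keep it on the SALU.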
3388  const RegisterBank *DstBank
3389  = getRegBank(MI.getOperand(0).getReg(), MRI, *TRI);
3390 
3391  unsigned TargetBankID = AMDGPU::InvalidRegBankID;
3392  unsigned BankLHS = AMDGPU::InvalidRegBankID;
3393  unsigned BankRHS = AMDGPU::InvalidRegBankID;
3394  if (DstBank) {
3395  TargetBankID = DstBank->getID();
3396  if (DstBank == &AMDGPU::VCCRegBank) {
3397  TargetBankID = AMDGPU::VCCRegBankID;
3398  BankLHS = AMDGPU::VCCRegBankID;
3399  BankRHS = AMDGPU::VCCRegBankID;
3400  } else {
3401  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3402  AMDGPU::SGPRRegBankID);
3403  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3404  AMDGPU::SGPRRegBankID);
3405  }
3406  } else {
3407  BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI,
3408  AMDGPU::VCCRegBankID);
3409  BankRHS = getRegBankID(MI.getOperand(2).getReg(), MRI,
3410  AMDGPU::VCCRegBankID);
3411 
3412  // Both inputs should be true booleans to produce a boolean result.
3413  if (BankLHS == AMDGPU::VGPRRegBankID || BankRHS == AMDGPU::VGPRRegBankID) {
3414  TargetBankID = AMDGPU::VGPRRegBankID;
3415  } else if (BankLHS == AMDGPU::VCCRegBankID || BankRHS == AMDGPU::VCCRegBankID) {
3416  TargetBankID = AMDGPU::VCCRegBankID;
3417  BankLHS = AMDGPU::VCCRegBankID;
3418  BankRHS = AMDGPU::VCCRegBankID;
3419  } else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
3420  TargetBankID = AMDGPU::SGPRRegBankID;
3421  }
3422  }
3423 
3424  OpdsMapping[0] = AMDGPU::getValueMapping(TargetBankID, Size);
3425  OpdsMapping[1] = AMDGPU::getValueMapping(BankLHS, Size);
3426  OpdsMapping[2] = AMDGPU::getValueMapping(BankRHS, Size);
3427  break;
3428  }
3429 
3430  if (Size == 64) {
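  // A 64-bit logic op is a single instruction on the SALU, but on the VALU it
  // must be split into two 32-bit halves; getValueMappingSGPR64Only encodes
  // exactly that difference.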
3431 
3432  if (isSALUMapping(MI)) {
3433  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size);
3434  OpdsMapping[1] = OpdsMapping[2] = OpdsMapping[0];
3435  } else {
3436  OpdsMapping[0] = getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size);
3437  unsigned Bank1 = getRegBankID(MI.getOperand(1).getReg(), MRI /*, DefaultBankID*/);
3438  OpdsMapping[1] = AMDGPU::getValueMapping(Bank1, Size);
3439 
3440  unsigned Bank2 = getRegBankID(MI.getOperand(2).getReg(), MRI /*, DefaultBankID*/);
3441  OpdsMapping[2] = AMDGPU::getValueMapping(Bank2, Size);
3442  }
3443 
3444  break;
3445  }
3446 
3447  LLVM_FALLTHROUGH;
3448  }
3449  case AMDGPU::G_PTR_ADD:
3450  case AMDGPU::G_PTRMASK:
3451  case AMDGPU::G_ADD:
3452  case AMDGPU::G_SUB:
3453  case AMDGPU::G_MUL:
3454  case AMDGPU::G_SHL:
3455  case AMDGPU::G_LSHR:
3456  case AMDGPU::G_ASHR:
3457  case AMDGPU::G_UADDO:
3458  case AMDGPU::G_USUBO:
3459  case AMDGPU::G_UADDE:
3460  case AMDGPU::G_SADDE:
3461  case AMDGPU::G_USUBE:
3462  case AMDGPU::G_SSUBE:
3463  case AMDGPU::G_SMIN:
3464  case AMDGPU::G_SMAX:
3465  case AMDGPU::G_UMIN:
3466  case AMDGPU::G_UMAX:
3467  case AMDGPU::G_SHUFFLE_VECTOR:
3468  if (isSALUMapping(MI))
3469  return getDefaultMappingSOP(MI);
3470  LLVM_FALLTHROUGH;
3471 
3472  case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
3473  case AMDGPU::G_SSUBSAT:
3474  case AMDGPU::G_UADDSAT:
3475  case AMDGPU::G_USUBSAT:
3476  case AMDGPU::G_FADD:
3477  case AMDGPU::G_FSUB:
3478  case AMDGPU::G_FPTOSI:
3479  case AMDGPU::G_FPTOUI:
3480  case AMDGPU::G_FMUL:
3481  case AMDGPU::G_FMA:
3482  case AMDGPU::G_FMAD:
3483  case AMDGPU::G_FSQRT:
3484  case AMDGPU::G_FFLOOR:
3485  case AMDGPU::G_FCEIL:
3486  case AMDGPU::G_FRINT:
3487  case AMDGPU::G_SITOFP:
3488  case AMDGPU::G_UITOFP:
3489  case AMDGPU::G_FPTRUNC:
3490  case AMDGPU::G_FPEXT:
3491  case AMDGPU::G_FEXP2:
3492  case AMDGPU::G_FLOG2:
3493  case AMDGPU::G_FMINNUM:
3494  case AMDGPU::G_FMAXNUM:
3495  case AMDGPU::G_FMINNUM_IEEE:
3496  case AMDGPU::G_FMAXNUM_IEEE:
3497  case AMDGPU::G_FCANONICALIZE:
3498  case AMDGPU::G_INTRINSIC_TRUNC:
3499  case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
3500  case AMDGPU::G_FSHR: // TODO: Expand for scalar
3501  case AMDGPU::G_AMDGPU_FFBH_U32:
3502  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
3503  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
3504  case AMDGPU::G_AMDGPU_RCP_IFLAG:
3505  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
3506  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
3507  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
3508  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
3509  case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
3510  case AMDGPU::G_AMDGPU_MED3:
3511  return getDefaultMappingVOP(MI);
3512  case AMDGPU::G_UMULH:
3513  case AMDGPU::G_SMULH: {
3514  if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
3515  return getDefaultMappingSOP(MI);
3516  return getDefaultMappingVOP(MI);
3517  }
3518  case AMDGPU::G_IMPLICIT_DEF: {
3519  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3520  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3521  break;
3522  }
3523  case AMDGPU::G_FCONSTANT:
3524  case AMDGPU::G_CONSTANT:
3525  case AMDGPU::G_GLOBAL_VALUE:
3526  case AMDGPU::G_BLOCK_ADDR:
3527  case AMDGPU::G_READCYCLECOUNTER: {
3528  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3529  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3530  break;
3531  }
3532  case AMDGPU::G_FRAME_INDEX: {
3533  // TODO: This should be the same as other constants, but eliminateFrameIndex
3534  // currently assumes VALU uses.
3535  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3536  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3537  break;
3538  }
3539  case AMDGPU::G_DYN_STACKALLOC: {
3540  // Result is always uniform, and a wave reduction is needed for the source.
3541  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
3542  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3543  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
3544  break;
3545  }
3546  case AMDGPU::G_INSERT: {
3547  unsigned BankID = getMappingType(MRI, MI);
3548  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3549  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3550  unsigned EltSize = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
3551  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3552  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3553  OpdsMapping[2] = AMDGPU::getValueMapping(BankID, EltSize);
3554  OpdsMapping[3] = nullptr;
3555  break;
3556  }
3557  case AMDGPU::G_EXTRACT: {
3558  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3559  unsigned DstSize = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
3560  unsigned SrcSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
3561  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, DstSize);
3562  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, SrcSize);
3563  OpdsMapping[2] = nullptr;
3564  break;
3565  }
3566  case AMDGPU::G_BUILD_VECTOR:
3567  case AMDGPU::G_BUILD_VECTOR_TRUNC: {
3568  LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
3569  if (DstTy == LLT::vector(2, 16)) {
3570  unsigned DstSize = DstTy.getSizeInBits();
3571  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3572  unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3573  unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3574  unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
3575 
3576  OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
3577  OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
3578  OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
3579  break;
3580  }
3581 
3582  LLVM_FALLTHROUGH;
3583  }
3584  case AMDGPU::G_MERGE_VALUES:
3585  case AMDGPU::G_CONCAT_VECTORS: {
3586  unsigned Bank = getMappingType(MRI, MI);
3587  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3588  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3589 
3590  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3591  // Op1 and Dst should use the same register bank.
3592  for (unsigned i = 1, e = MI.getNumOperands(); i != e; ++i)
3593  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, SrcSize);
3594  break;
3595  }
3596  case AMDGPU::G_BITREVERSE:
3597  case AMDGPU::G_BITCAST:
3598  case AMDGPU::G_INTTOPTR:
3599  case AMDGPU::G_PTRTOINT:
3600  case AMDGPU::G_FABS:
3601  case AMDGPU::G_FNEG: {
3602  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3603  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3604  OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3605  break;
3606  }
3607  case AMDGPU::G_CTLZ_ZERO_UNDEF:
3608  case AMDGPU::G_CTTZ_ZERO_UNDEF:
3609  case AMDGPU::G_CTPOP: {
3610  unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3611  unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3612  OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
3613 
3614  // This should really be getValueMappingSGPR64Only, but allowing the generic
3615  // code to handle the register split just makes using LegalizerHelper more
3616  // difficult.
3617  OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
3618  break;
3619  }
3620  case AMDGPU::G_TRUNC: {
3621  Register Dst = MI.getOperand(0).getReg();
3622  Register Src = MI.getOperand(1).getReg();
3623  unsigned Bank = getRegBankID(Src, MRI);
3624  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3625  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3626  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
3627  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
3628  break;
3629  }
3630  case AMDGPU::G_ZEXT:
3631  case AMDGPU::G_SEXT:
3632  case AMDGPU::G_ANYEXT:
3633  case AMDGPU::G_SEXT_INREG: {
3634  Register Dst = MI.getOperand(0).getReg();
3635  Register Src = MI.getOperand(1).getReg();
3636  unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
3637  unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
3638 
3639  unsigned DstBank;
3640  const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
3641  assert(SrcBank);
3642  switch (SrcBank->getID()) {
3643  case AMDGPU::SGPRRegBankID:
3644  DstBank = AMDGPU::SGPRRegBankID;
3645  break;
3646  default:
3647  DstBank = AMDGPU::VGPRRegBankID;
3648  break;
3649  }
3650 
3651  // Scalar extend can use 64-bit BFE, but VGPRs require extending to
3652  // 32-bits, and then to 64.
3653  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
3654  OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
3655  SrcSize);
3656  break;
3657  }
3658  case AMDGPU::G_FCMP: {
3659  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3660  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3661  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3662  OpdsMapping[1] = nullptr; // Predicate Operand.
3663  OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
3664  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3665  break;
3666  }
3667  case AMDGPU::G_STORE: {
3668  assert(MI.getOperand(0).isReg());
3669  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3670 
3671  // FIXME: We need to specify a different reg bank once scalar stores are
3672  // supported.
3673  const ValueMapping *ValMapping =
3674  AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
3675  OpdsMapping[0] = ValMapping;
3676  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
3677  break;
3678  }
3679  case AMDGPU::G_ICMP: {
3680  auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
3681  unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3682 
3683  // See if the result register has already been constrained to vcc, which may
3684  // happen due to control flow intrinsic lowering.
3685  unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI,
3686  AMDGPU::SGPRRegBankID);
3687  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3688  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
3689 
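  // A scalar (S_CMP/SCC) compare is only possible when the result and both
  // sources are uniform; 64-bit scalar compares exist only for EQ/NE and only
  // on subtargets that have them (hasScalarCompareEq64).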
3690  bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
3691  Op2Bank == AMDGPU::SGPRRegBankID &&
3692  Op3Bank == AMDGPU::SGPRRegBankID &&
3693  (Size == 32 || (Size == 64 &&
3694  (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
3695  Subtarget.hasScalarCompareEq64()));
3696 
3697  DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
3698  unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3699 
3700  // TODO: Use 32-bit for scalar output size.
3701  // SCC results will need to be copied to a 32-bit SGPR virtual register.
3702  const unsigned ResultSize = 1;
3703 
3704  OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
3705  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
3706  OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
3707  break;
3708  }
3709  case AMDGPU::G_EXTRACT_VECTOR_ELT: {
3710  // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
3711  unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI);
3712  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3713  unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3714  unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3715  unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI);
3716  unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
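  // The extracted element is divergent if either the vector or the index is
  // divergent, hence the union of the two banks.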
3717 
3718  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
3719  OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
3720 
3721  // The index can be in either bank if the source vector is a VGPR.
3722  OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
3723  break;
3724  }
3725  case AMDGPU::G_INSERT_VECTOR_ELT: {
3726  unsigned OutputBankID = isSALUMapping(MI) ?
3727  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
3728 
3729  unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3730  unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3731  unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3732  unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), MRI);
3733  unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI);
3734 
3735  OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3736  OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
3737 
3738  // This is a weird case, because we need to break down the mapping based on
3739  // the register bank of a different operand.
3740  if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
3741  OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
3742  InsertSize);
3743  } else {
3744  assert(InsertSize == 32 || InsertSize == 64);
3745  OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
3746  }
3747 
3748  // The index can be in either bank if the source vector is a VGPR.
3749  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
3750  break;
3751  }
3752  case AMDGPU::G_UNMERGE_VALUES: {
3753  unsigned Bank = getMappingType(MRI, MI);
3754 
3755  // Op1 and Dst should use the same register bank.
3756  // FIXME: Shouldn't this be the default? Why do we need to handle this?
3757  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3758  unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
3759  OpdsMapping[i] = AMDGPU::getValueMapping(Bank, Size);
3760  }
3761  break;
3762  }
3763  case AMDGPU::G_AMDGPU_BUFFER_LOAD:
3764  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
3765  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
3766  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
3767  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
3768  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
3769  case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
3770  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
3771  case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
3772  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
3773  case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
3774  case AMDGPU::G_AMDGPU_BUFFER_STORE:
3775  case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
3776  case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
3777  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
3778  case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
3779  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3780 
3781  // rsrc
3782  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3783 
3784  // vindex
3785  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3786 
3787  // voffset
3788  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3789 
3790  // soffset
3791  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3792 
3793  // Any remaining operands are immediates and were correctly null
3794  // initialized.
3795  break;
3796  }
3797  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
3798  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
3799  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
3800  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
3801  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
3802  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
3803  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
3804  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
3805  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
3806  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
3807  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
3808  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
3809  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3810  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
3811  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
3812  // vdata_out
3813  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3814 
3815  // vdata_in
3816  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3817 
3818  // rsrc
3819  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3820 
3821  // vindex
3822  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3823 
3824  // voffset
3825  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3826 
3827  // soffset
3828  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3829 
3830  // Any remaining operands are immediates and were correctly null
3831  // initialized.
3832  break;
3833  }
3834  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
3835  // vdata_out
3836  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
3837 
3838  // vdata_in
3839  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3840 
3841  // cmp
3842  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3843 
3844  // rsrc
3845  OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
3846 
3847  // vindex
3848  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
3849 
3850  // voffset
3851  OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
3852 
3853  // soffset
3854  OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
3855 
3856  // Any remaining operands are immediates and were correctly null
3857  // initialized.
3858  break;
3859  }
3860  case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
3861  // Lie and claim everything is legal, even though some need to be
3862  // SGPRs. applyMapping will have to deal with it as a waterfall loop.
3863  OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
3864  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
3865 
3866  // We need to convert this to a MUBUF if either the resource or the offset is
3867  // a VGPR.
3868  unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
3869  unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
3870  unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
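  // If that happens the MUBUF result is a VGPR, so the destination bank is the
  // union of the rsrc and soffset banks.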
3871 
3872  unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3873  OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
3874  break;
3875  }
3876  case AMDGPU::G_INTRINSIC: {
3877  switch (MI.getIntrinsicID()) {
3878  default:
3879  return getInvalidInstructionMapping();
3880  case Intrinsic::amdgcn_div_fmas:
3881  case Intrinsic::amdgcn_div_fixup:
3882  case Intrinsic::amdgcn_trig_preop:
3883  case Intrinsic::amdgcn_sin:
3884  case Intrinsic::amdgcn_cos:
3885  case Intrinsic::amdgcn_log_clamp:
3886  case Intrinsic::amdgcn_rcp:
3887  case Intrinsic::amdgcn_rcp_legacy:
3888  case Intrinsic::amdgcn_sqrt:
3889  case Intrinsic::amdgcn_rsq:
3890  case Intrinsic::amdgcn_rsq_legacy:
3891  case Intrinsic::amdgcn_rsq_clamp:
3892  case Intrinsic::amdgcn_fmul_legacy:
3893  case Intrinsic::amdgcn_fma_legacy:
3894  case Intrinsic::amdgcn_ldexp:
3895  case Intrinsic::amdgcn_frexp_mant:
3896  case Intrinsic::amdgcn_frexp_exp:
3897  case Intrinsic::amdgcn_fract:
3898  case Intrinsic::amdgcn_cvt_pkrtz:
3899  case Intrinsic::amdgcn_cvt_pknorm_i16:
3900  case Intrinsic::amdgcn_cvt_pknorm_u16:
3901  case Intrinsic::amdgcn_cvt_pk_i16:
3902  case Intrinsic::amdgcn_cvt_pk_u16:
3903  case Intrinsic::amdgcn_fmed3:
3904  case Intrinsic::amdgcn_cubeid:
3905  case Intrinsic::amdgcn_cubema:
3906  case Intrinsic::amdgcn_cubesc:
3907  case Intrinsic::amdgcn_cubetc:
3908  case Intrinsic::amdgcn_sffbh:
3909  case Intrinsic::amdgcn_fmad_ftz:
3910  case Intrinsic::amdgcn_mbcnt_lo:
3911  case Intrinsic::amdgcn_mbcnt_hi:
3912  case Intrinsic::amdgcn_mul_u24:
3913  case Intrinsic::amdgcn_mul_i24:
3914  case Intrinsic::amdgcn_lerp:
3915  case Intrinsic::amdgcn_sad_u8:
3916  case Intrinsic::amdgcn_msad_u8:
3917  case Intrinsic::amdgcn_sad_hi_u8:
3918  case Intrinsic::amdgcn_sad_u16:
3919  case Intrinsic::amdgcn_qsad_pk_u16_u8:
3920  case Intrinsic::amdgcn_mqsad_pk_u16_u8:
3921  case Intrinsic::amdgcn_mqsad_u32_u8:
3922  case Intrinsic::amdgcn_cvt_pk_u8_f32:
3923  case Intrinsic::amdgcn_alignbit:
3924  case Intrinsic::amdgcn_alignbyte:
3925  case Intrinsic::amdgcn_fdot2:
3926  case Intrinsic::amdgcn_sdot2:
3927  case Intrinsic::amdgcn_udot2:
3928  case Intrinsic::amdgcn_sdot4:
3929  case Intrinsic::amdgcn_udot4:
3930  case Intrinsic::amdgcn_sdot8:
3931  case Intrinsic::amdgcn_udot8:
3932  return getDefaultMappingVOP(MI);
3933  case Intrinsic::amdgcn_sbfe:
3934  case Intrinsic::amdgcn_ubfe:
3935  if (isSALUMapping(MI))
3936  return getDefaultMappingSOP(MI);
3937  return getDefaultMappingVOP(MI);
3938  case Intrinsic::amdgcn_ds_swizzle:
3939  case Intrinsic::amdgcn_ds_permute:
3940  case Intrinsic::amdgcn_ds_bpermute:
3941  case Intrinsic::amdgcn_update_dpp:
3942  case Intrinsic::amdgcn_mov_dpp8:
3943  case Intrinsic::amdgcn_mov_dpp:
3944  case Intrinsic::amdgcn_strict_wwm:
3945  case Intrinsic::amdgcn_wwm:
3946  case Intrinsic::amdgcn_strict_wqm:
3947  case Intrinsic::amdgcn_wqm:
3948  case Intrinsic::amdgcn_softwqm:
3949  case Intrinsic::amdgcn_set_inactive:
3950  return getDefaultMappingAllVGPR(MI);
3951  case Intrinsic::amdgcn_kernarg_segment_ptr:
3952  case Intrinsic::amdgcn_s_getpc:
3953  case Intrinsic::amdgcn_groupstaticsize:
3954  case Intrinsic::amdgcn_reloc_constant:
3955  case Intrinsic::returnaddress: {
3956  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3957  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
3958  break;
3959  }
3960  case Intrinsic::amdgcn_wqm_vote: {
3961  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3962  OpdsMapping[0] = OpdsMapping[2]
3963  = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
3964  break;
3965  }
3966  case Intrinsic::amdgcn_ps_live: {
3967  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
3968  break;
3969  }
3970  case Intrinsic::amdgcn_div_scale: {
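  // div_scale produces two results: the scaled value in a VGPR and a VCC
  // flag that is later consumed by div_fmas.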
3971  unsigned Dst0Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3972  unsigned Dst1Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
3973  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Dst0Size);
3974  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
3975 
3976  unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
3977  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3978  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
3979  break;
3980  }
3981  case Intrinsic::amdgcn_class: {
3982  Register Src0Reg = MI.getOperand(2).getReg();
3983  Register Src1Reg = MI.getOperand(3).getReg();
3984  unsigned Src0Size = MRI.getType(Src0Reg).getSizeInBits();
3985  unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
3986  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3987  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
3988  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
3989  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
3990  break;
3991  }
3992  case Intrinsic::amdgcn_icmp:
3993  case Intrinsic::amdgcn_fcmp: {
3994  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
3995  // This is not VCCRegBank because this is not used in boolean contexts.
3996  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
3997  unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
3998  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
3999  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
4000  break;
4001  }
4002  case Intrinsic::amdgcn_readlane: {
4003  // This must be an SGPR, but accept a VGPR.
4004  Register IdxReg = MI.getOperand(3).getReg();
4005  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4006  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4007  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
4008  LLVM_FALLTHROUGH;
4009  }
4010  case Intrinsic::amdgcn_readfirstlane: {
4011  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4012  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4013  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4014  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4015  break;
4016  }
4017  case Intrinsic::amdgcn_writelane: {
4018  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4019  Register SrcReg = MI.getOperand(2).getReg();
4020  unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
4021  unsigned SrcBank = getRegBankID(SrcReg, MRI, AMDGPU::SGPRRegBankID);
4022  Register IdxReg = MI.getOperand(3).getReg();
4023  unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
4024  unsigned IdxBank = getRegBankID(IdxReg, MRI, AMDGPU::SGPRRegBankID);
4025  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4026 
4027  // These 2 must be SGPRs, but accept VGPRs. Readfirstlane will be inserted
4028  // to legalize.
4029  OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, SrcSize);
4030  OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
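  // Operand 4 carries the old value of the destination VGPR for the lanes
  // that are not written.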
4031  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
4032  break;
4033  }
4034  case Intrinsic::amdgcn_if_break: {
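  // if_break combines a VCC-bank condition (operand 2) with the current loop
  // mask (operand 3) and returns an updated wave-sized scalar mask.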
4035  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4036  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4037  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4038  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4039  break;
4040  }
4041  case Intrinsic::amdgcn_permlane16:
4042  case Intrinsic::amdgcn_permlanex16: {
4043  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
4044  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4045  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4046  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
4047  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4048  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4049  break;
4050  }
4051  case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
4052  case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
4053  case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
4054  case Intrinsic::amdgcn_mfma_f32_4x4x2bf16:
4055  case Intrinsic::amdgcn_mfma_f32_16x16x1f32:
4056  case Intrinsic::amdgcn_mfma_f32_16x16x4f32:
4057  case Intrinsic::amdgcn_mfma_f32_16x16x4f16:
4058  case Intrinsic::amdgcn_mfma_f32_16x16x16f16:
4059  case Intrinsic::amdgcn_mfma_i32_16x16x4i8:
4060  case Intrinsic::amdgcn_mfma_i32_16x16x16i8:
4061  case Intrinsic::amdgcn_mfma_f32_16x16x2bf16:
4062  case Intrinsic::amdgcn_mfma_f32_16x16x8bf16:
4063  case Intrinsic::amdgcn_mfma_f32_32x32x1f32:
4064  case Intrinsic::amdgcn_mfma_f32_32x32x2f32:
4065  case Intrinsic::amdgcn_mfma_f32_32x32x4f16:
4066  case Intrinsic::amdgcn_mfma_f32_32x32x8f16:
4067  case Intrinsic::amdgcn_mfma_i32_32x32x4i8:
4068  case Intrinsic::amdgcn_mfma_i32_32x32x8i8:
4069  case Intrinsic::amdgcn_mfma_f32_32x32x2bf16:
4070  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16:
4071  case Intrinsic::amdgcn_mfma_f32_32x32x4bf16_1k:
4072  case Intrinsic::amdgcn_mfma_f32_16x16x4bf16_1k:
4073  case Intrinsic::amdgcn_mfma_f32_4x4x4bf16_1k:
4074  case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
4075  case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
4076  case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4077  case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4078  // Default for MAI intrinsics.
4079  // srcC can also be an immediate which can be folded later.
4080  // FIXME: Should we eventually add an alternative mapping with AGPR src
4081  // for srcA/srcB?
4082  //
4083  // vdst, srcA, srcB, srcC
4084  OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4085  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4086  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4087  OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4088  break;
4089  }
4090  case Intrinsic::amdgcn_interp_p1:
4091  case Intrinsic::amdgcn_interp_p2:
4092  case Intrinsic::amdgcn_interp_mov:
4093  case Intrinsic::amdgcn_interp_p1_f16:
4094  case Intrinsic::amdgcn_interp_p2_f16: {
4095  const int M0Idx = MI.getNumOperands() - 1;
4096  Register M0Reg = MI.getOperand(M0Idx).getReg();
4097  unsigned M0Bank = getRegBankID(M0Reg, MRI, AMDGPU::SGPRRegBankID);
4098  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4099 
4100  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4101  for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
4102  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4103 
4104  // Must be SGPR, but we must take whatever the original bank is and fix it
4105  // later.
4106  OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
4107  break;
4108  }
4109  case Intrinsic::amdgcn_ballot: {
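  // ballot converts a VCC-bank condition into a wave-wide lane mask that is
  // uniform, so the result is reported as SGPR.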
4110  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4111  unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
4112  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
4113  OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
4114  break;
4115  }
4116  }
4117  break;
4118  }
4119  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4120  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
4121  auto IntrID = MI.getIntrinsicID();
4122  const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
4123  assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
4124  // Non-images can have complications from operands that allow both SGPR
4125  // and VGPR. For now it's too complicated to figure out the final opcode
4126  // to derive the register bank from the MCInstrDesc.
4127  assert(RSrcIntrin->IsImage);
4128  return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
4129  }
4130  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
4131  unsigned N = MI.getNumExplicitOperands() - 2;
4132  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 128);
4133  OpdsMapping[N] = getSGPROpMapping(MI.getOperand(N).getReg(), MRI, *TRI);
4134  for (unsigned I = 2; I < N; ++I)
4135  OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4136  break;
4137  }
4138  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
4139  auto IntrID = MI.getIntrinsicID();
4140  switch (IntrID) {
4141  case Intrinsic::amdgcn_s_getreg:
4142  case Intrinsic::amdgcn_s_memtime:
4143  case Intrinsic::amdgcn_s_memrealtime:
4144  case Intrinsic::amdgcn_s_get_waveid_in_workgroup: {
4145  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4146  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4147  break;
4148  }
4149  case Intrinsic::amdgcn_global_atomic_fadd:
4150  case Intrinsic::amdgcn_global_atomic_csub:
4151  case Intrinsic::amdgcn_global_atomic_fmin:
4152  case Intrinsic::amdgcn_global_atomic_fmax:
4153  case Intrinsic::amdgcn_flat_atomic_fadd:
4154  case Intrinsic::amdgcn_flat_atomic_fmin:
4155  case Intrinsic::amdgcn_flat_atomic_fmax:
4156  return getDefaultMappingAllVGPR(MI);
4157  case Intrinsic::amdgcn_ds_ordered_add:
4158  case Intrinsic::amdgcn_ds_ordered_swap: {
4159  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4160  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4161  unsigned M0Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4162  AMDGPU::SGPRRegBankID);
4163  OpdsMapping[2] = AMDGPU::getValueMapping(M0Bank, 32);
4164  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4165  break;
4166  }
4167  case Intrinsic::amdgcn_ds_append:
4168  case Intrinsic::amdgcn_ds_consume: {
4169  unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4170  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
4171  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4172  break;
4173  }
4174  case Intrinsic::amdgcn_exp_compr:
4175  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4176  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4177  break;
4178  case Intrinsic::amdgcn_exp:
4179  // FIXME: Could we support packed types here?
4180  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4181  OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4182  OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4183  OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4184  break;
4185  case Intrinsic::amdgcn_s_sendmsg:
4186  case Intrinsic::amdgcn_s_sendmsghalt: {
4187  // This must be an SGPR, but accept a VGPR.
4188  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4189  AMDGPU::SGPRRegBankID);
4190  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4191  break;
4192  }
4193  case Intrinsic::amdgcn_s_setreg: {
4194  // This must be an SGPR, but accept a VGPR.
4195  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4196  AMDGPU::SGPRRegBankID);
4197  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4198  break;
4199  }
4200  case Intrinsic::amdgcn_end_cf: {
4201  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4202  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4203  break;
4204  }
4205  case Intrinsic::amdgcn_else: {
4206  unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4207  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4208  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4209  OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
4210  break;
4211  }
4212  case Intrinsic::amdgcn_live_mask: {
4213  OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4214  break;
4215  }
4216  case Intrinsic::amdgcn_wqm_demote:
4217  case Intrinsic::amdgcn_kill: {
4218  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
4219  break;
4220  }
4221  case Intrinsic::amdgcn_raw_buffer_load:
4222  case Intrinsic::amdgcn_raw_tbuffer_load: {
4223  // FIXME: Should make intrinsic ID the last operand of the instruction,
4224  // then this would be the same as store
4225  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4226  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4227  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4228  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4229  break;
4230  }
4231  case Intrinsic::amdgcn_raw_buffer_store:
4232  case Intrinsic::amdgcn_raw_buffer_store_format:
4233  case Intrinsic::amdgcn_raw_tbuffer_store: {
4234  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4235  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4236  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4237  OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4238  break;
4239  }
4240  case Intrinsic::amdgcn_struct_buffer_load:
4241  case Intrinsic::amdgcn_struct_tbuffer_load: {
4242  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4243  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4244  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4245  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4246  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4247  break;
4248  }
4249  case Intrinsic::amdgcn_struct_buffer_store:
4250  case Intrinsic::amdgcn_struct_tbuffer_store: {
4251  OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4252  OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4253  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4254  OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
4255  OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
4256  break;
4257  }
4258  case Intrinsic::amdgcn_init_exec_from_input: {
4259  unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
4260  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
4261  break;
4262  }
4263  case Intrinsic::amdgcn_ds_gws_init:
4264  case Intrinsic::amdgcn_ds_gws_barrier:
4265  case Intrinsic::amdgcn_ds_gws_sema_br: {
4266  OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
4267 
4268  // This must be an SGPR, but accept a VGPR.
4269  unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4270  AMDGPU::SGPRRegBankID);
4271  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
4272  break;
4273  }
4274  case Intrinsic::amdgcn_ds_gws_sema_v:
4275  case Intrinsic::amdgcn_ds_gws_sema_p:
4276  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
4277  // This must be an SGPR, but accept a VGPR.
4278  unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4279  AMDGPU::SGPRRegBankID);
4280  OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
4281  break;
4282  }
4283  default:
4284  return getInvalidInstructionMapping();
4285  }
4286  break;
4287  }
4288  case AMDGPU::G_SELECT: {
4289  unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
4290  unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI,
4291  AMDGPU::SGPRRegBankID);
4292  unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI,
4293  AMDGPU::SGPRRegBankID);
4294  bool SGPRSrcs = Op2Bank == AMDGPU::SGPRRegBankID &&
4295  Op3Bank == AMDGPU::SGPRRegBankID;
4296 
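  // An all-scalar select can use S_CSELECT with an SCC condition; once any
  // value operand is divergent it becomes V_CNDMASK and the condition must be
  // in VCC.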
4297  unsigned CondBankDefault = SGPRSrcs ?
4298  AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4299  unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI,
4300  CondBankDefault);
4301  if (CondBank == AMDGPU::SGPRRegBankID)
4302  CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
4303  else if (CondBank == AMDGPU::VGPRRegBankID)
4304  CondBank = AMDGPU::VCCRegBankID;
4305 
4306  unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
4307  AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
4308 
4309  assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
4310 
4311  // TODO: Should report 32-bit for scalar condition type.
4312  if (Size == 64) {
4313  OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4314  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4315  OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4316  OpdsMapping[3] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
4317  } else {
4318  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, Size);
4319  OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
4320  OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Size);
4321  OpdsMapping[3] = AMDGPU::getValueMapping(Bank, Size);
4322  }
4323 
4324  break;
4325  }
4326 
4327  case AMDGPU::G_LOAD:
4328  case AMDGPU::G_ZEXTLOAD:
4329  case AMDGPU::G_SEXTLOAD:
4330  return getInstrMappingForLoad(MI);
4331 
4332  case AMDGPU::G_ATOMICRMW_XCHG:
4333  case AMDGPU::G_ATOMICRMW_ADD:
4334  case AMDGPU::G_ATOMICRMW_SUB:
4335  case AMDGPU::G_ATOMICRMW_AND:
4336  case AMDGPU::G_ATOMICRMW_OR:
4337  case AMDGPU::G_ATOMICRMW_XOR:
4338  case AMDGPU::G_ATOMICRMW_MAX:
4339  case AMDGPU::G_ATOMICRMW_MIN:
4340  case AMDGPU::G_ATOMICRMW_UMAX:
4341  case AMDGPU::G_ATOMICRMW_UMIN:
4342  case AMDGPU::G_ATOMICRMW_FADD:
4343  case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
4344  case AMDGPU::G_AMDGPU_ATOMIC_INC:
4345  case AMDGPU::G_AMDGPU_ATOMIC_DEC:
4346  case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
4347  case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
4348  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4349  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4350  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4351  break;
4352  }
4353  case AMDGPU::G_ATOMIC_CMPXCHG: {
4354  OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
4355  OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
4356  OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4357  OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
4358  break;
4359  }
4360  case AMDGPU::G_BRCOND: {
4361  unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI,
4362  AMDGPU::SGPRRegBankID);
4363  assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
4364  if (Bank != AMDGPU::SGPRRegBankID)
4365  Bank = AMDGPU::VCCRegBankID;
4366 
4367  OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
4368  break;
4369  }
4370  }
4371 
4372  return getInstructionMapping(/*ID*/1, /*Cost*/1,
4373  getOperandsMapping(OpdsMapping),
4374  MI.getNumOperands());
4375 }
Definition: RegisterBank.h:28
llvm::MachineRegisterInfo::setType
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
Definition: MachineRegisterInfo.cpp:182
GCNSubtarget.h
llvm::AMDGPURegisterBankInfo::applyMappingLoad
bool applyMappingLoad(MachineInstr &MI, const OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const
Definition: AMDGPURegisterBankInfo.cpp:1140
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::AMDGPURegisterBankInfo::getAGPROpMapping
const ValueMapping * getAGPROpMapping(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Definition: AMDGPURegisterBankInfo.cpp:3265
llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition: MachineMemOperand.h:218
llvm::GCNSubtarget::useFlatForGlobal
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:460
llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition: MachineFunction.h:653
llvm::LLT::getSizeInBits
unsigned getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelTypeImpl.h:109
llvm::MachineMemOperand::getValue
const Value * getValue() const
Return the base address of the memory access.
Definition: MachineMemOperand.h:200
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:488
llvm::AMDGPU::getMUBUFOpcode
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
Definition: AMDGPUBaseInfo.cpp:235
llvm::Log2
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition: Alignment.h:217
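For illustration, Log2 converts an Align into a shift amount; the values below are made up.

    #include "llvm/Support/Alignment.h"
    #include <cassert>

    void alignExample() {
      llvm::Align A(16);              // 16-byte alignment
      unsigned Shift = llvm::Log2(A); // log2(16) == 4
      assert(Shift == 4);
    }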
llvm::LegalizerHelper::narrowScalar
LegalizeResult narrowScalar(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy)
Legalize an instruction by reducing the width of the underlying scalar type.
Definition: LegalizerHelper.cpp:780
llvm::AMDGPURegisterBankInfo::addMappingFromTable
InstructionMappings addMappingFromTable(const MachineInstr &MI, const MachineRegisterInfo &MRI, const std::array< unsigned, NumOps > RegSrcOpIdx, ArrayRef< OpRegBankEntry< NumOps >> Table) const
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
TII
const HexagonInstrInfo * TII
Definition: HexagonCopyToCombine.cpp:129
llvm::AMDGPURegisterBankInfo::getValueMappingForPtr
const ValueMapping * getValueMappingForPtr(const MachineRegisterInfo &MRI, Register Ptr) const
Return the mapping for a pointer argument.
Definition: AMDGPURegisterBankInfo.cpp:3176
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition: MachineOperand.h:49
llvm::Instruction
Definition: Instruction.h:45
llvm::GCNSubtarget::isWave32
bool isWave32() const
Definition: GCNSubtarget.h:1074
llvm::AMDGPURegisterBankInfo::OpRegBankEntry
Definition: AMDGPURegisterBankInfo.h:131
llvm::report_fatal_error
LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:140
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:47
llvm::MachineInstrSpan::begin
MachineBasicBlock::iterator begin()
Definition: MachineBasicBlock.h:1105
llvm::LLT::vector
static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
Definition: LowLevelTypeImpl.h:58
llvm::AMDGPURegisterBankInfo::constrainOpWithReadfirstlane
void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const
Definition: AMDGPURegisterBankInfo.cpp:1082
isVectorRegisterBank
static bool isVectorRegisterBank(const RegisterBank &Bank)
Definition: AMDGPURegisterBankInfo.cpp:214
llvm::AMDGPURegisterBankInfo::collectWaterfallOperands
bool collectWaterfallOperands(SmallSet< Register, 4 > &SGPROperandRegs, MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef< unsigned > OpIndices) const
Definition: AMDGPURegisterBankInfo.cpp:1044
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:26
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:374
llvm::MachineInstrSpan::end
MachineBasicBlock::iterator end()
Definition: MachineBasicBlock.h:1108
llvm::RegState::Kill
@ Kill
The last use of a register.
Definition: MachineInstrBuilder.h:49
llvm::RegisterBankInfo::getInstrAlternativeMappings
virtual InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const
Get the alternative mappings for MI.
Definition: RegisterBankInfo.cpp:433
llvm::RegisterBankInfo::getSizeInBits
unsigned getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition: RegisterBankInfo.cpp:493
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition: RegisterBankInfo.cpp:132
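A hedged sketch of constraining a generic virtual register to a concrete register class, e.g. after choosing a target instruction for it; the wrapper is hypothetical and the include path reflects this LLVM version.

    #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"

    // Returns false if Reg cannot be constrained to RC.
    static bool pinToRegClass(llvm::Register Reg,
                              const llvm::TargetRegisterClass &RC,
                              llvm::MachineRegisterInfo &MRI) {
      return llvm::RegisterBankInfo::constrainGenericRegister(Reg, RC, MRI) != nullptr;
    }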
llvm::LegalizerHelper::fewerElementsVector
LegalizeResult fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, LLT NarrowTy)
Legalize a vector instruction by splitting into multiple components, each acting on the same scalar t...
Definition: LegalizerHelper.cpp:4079
llvm::MachineBasicBlock
Definition: MachineBasicBlock.h:95
llvm::AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects
RegisterBankInfo::InstructionMappings getInstrAlternativeMappingsIntrinsicWSideEffects(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
Definition: AMDGPURegisterBankInfo.cpp:375
llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition: MachineInstrBuilder.h:214
llvm::RegisterBankInfo::OperandsMapper
Helper class used to get/create the virtual registers that will be used to replace the MachineOperand...
Definition: RegisterBankInfo.h:280
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:378
setRegsToType
static void setRegsToType(MachineRegisterInfo &MRI, ArrayRef< Register > Regs, LLT NewTy)
Replace the current type each register in Regs has with NewTy.
Definition: AMDGPURegisterBankInfo.cpp:669
Operands
mir Rename Register Operands
Definition: MIRNamerPass.cpp:78
llvm::LLT::getAddressSpace
unsigned getAddressSpace() const
Definition: LowLevelTypeImpl.h:178
llvm::RegisterBankInfo::PartialMapping::StartIdx
unsigned StartIdx
Number of bits at which this partial mapping starts in the original value.
Definition: RegisterBankInfo.h:52
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition: MachineFunction.h:555
extractCPol
static unsigned extractCPol(unsigned CachePolicy)
Definition: AMDGPURegisterBankInfo.cpp:1705
llvm::AMDGPURegisterBankInfo::splitBufferOffsets
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register Offset) const
Definition: AMDGPURegisterBankInfo.cpp:1657
llvm::RISCVFenceField::O
@ O
Definition: RISCVBaseInfo.h:128
llvm::SmallSet::count
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:164
AMDGPURegisterBankInfo.h
llvm::AMDGPURegisterBankInfo::getRegBankFromRegClass
const RegisterBank & getRegBankFromRegClass(const TargetRegisterClass &RC, LLT) const override
Get a register bank that covers RC.
Definition: AMDGPURegisterBankInfo.cpp:276
llvm::LLT::divide
LLT divide(int Factor) const
Return a type that is Factor times smaller.
Definition: LowLevelTypeImpl.h:150
llvm::RegisterBankInfo::OperandsMapper::getVRegs
iterator_range< SmallVectorImpl< Register >::const_iterator > getVRegs(unsigned OpIdx, bool ForDebug=false) const
Get all the virtual registers required to map the OpIdx-th operand of the instruction.
Definition: RegisterBankInfo.cpp:732
llvm::MachineIRBuilder
Helper class to build MachineInstr.
Definition: MachineIRBuilder.h:220
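As an illustration of MachineIRBuilder, a sketch that materializes Src + 1 with generic opcodes; it assumes B already has a valid insertion point and that Src is an s32 value.

    #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"

    static llvm::Register buildAddOne(llvm::MachineIRBuilder &B, llvm::Register Src) {
      llvm::LLT S32 = llvm::LLT::scalar(32);
      auto One = B.buildConstant(S32, 1);         // G_CONSTANT 1
      return B.buildAdd(S32, Src, One).getReg(0); // G_ADD, new s32 result vreg
    }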
llvm::AMDGPURegisterBankInfo::Subtarget
const GCNSubtarget & Subtarget
Definition: AMDGPURegisterBankInfo.h:44
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition: MachineOperand.h:318
llvm::MachineInstr
Representation of each machine instruction.
Definition: MachineInstr.h:64
llvm::MachineInstrBuilder
Definition: MachineInstrBuilder.h:70
llvm::AMDGPURegisterBankInfo::getBreakDownCost
unsigned getBreakDownCost(const ValueMapping &ValMapping, const RegisterBank *CurBank=nullptr) const override
Get the cost of using ValMapping to decompose a register.
Definition: AMDGPURegisterBankInfo.cpp:250
llvm::RegisterBankInfo::InstructionMapping
Helper class that represents how the value of an instruction may be mapped and what is the related co...
Definition: RegisterBankInfo.h:189
llvm::MachineRegisterInfo::setRegBank
void setRegBank(Register Reg, const RegisterBank &RegBank)
Set the register bank to RegBank for Reg.
Definition: MachineRegisterInfo.cpp:63
llvm::AMDGPU::getBaseWithConstantOffset
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg)
Returns base register and constant offset.
Definition: AMDGPUGlobalISelUtils.cpp:17
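A sketch (with a hypothetical caller) of peeling a constant offset off an address with AMDGPU::getBaseWithConstantOffset; if nothing folds, the original register typically comes back with a zero offset.

    #include "AMDGPUGlobalISelUtils.h" // AMDGPU target-local header
    #include <tuple>

    static void splitAddress(llvm::MachineRegisterInfo &MRI, llvm::Register Reg) {
      llvm::Register Base;
      unsigned ImmOffset;
      std::tie(Base, ImmOffset) = llvm::AMDGPU::getBaseWithConstantOffset(MRI, Reg);
      // ... reuse Base and fold ImmOffset into the addressing mode ...
    }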
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::LegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition: LegalizerInfo.h:72
llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition: MachineMemOperand.h:37
llvm::numbers::e
constexpr double e
Definition: MathExtras.h:57
llvm::assumeAligned
Align assumeAligned(uint64_t Value)
Treats the value 0 as a 1, so Align is always at least 1.
Definition: Alignment.h:113
llvm::DenseMap
Definition: DenseMap.h:714
llvm::RegisterBankInfo::cannotCopy
bool cannotCopy(const RegisterBank &Dst, const RegisterBank &Src, unsigned Size) const
Definition: RegisterBankInfo.h:624
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::RegisterBankInfo::getOperandsMapping
const ValueMapping * getOperandsMapping(Iterator Begin, Iterator End) const
Get the uniquely generated array of ValueMapping for the elements of between Begin and End.
Definition: RegisterBankInfo.cpp:332
llvm::LLT::isVector
bool isVector() const
Definition: LowLevelTypeImpl.h:96
llvm::LLT::getNumElements
uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
Definition: LowLevelTypeImpl.h:100
llvm::MachineFunction::CreateMachineBasicBlock
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *bb=nullptr)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Definition: MachineFunction.cpp:414
AMDGPUGlobalISelUtils.h
llvm::AMDGPURegisterBankInfo::selectStoreIntrinsic
MachineInstr * selectStoreIntrinsic(MachineIRBuilder &B, MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:1715
llvm::AMDGPURegisterBankInfo::getMappingType
unsigned getMappingType(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3037
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::find
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:150
llvm::getConstantVRegSExtVal
Optional< int64_t > getConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition: Utils.cpp:281
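A minimal sketch using getConstantVRegSExtVal to test whether a vreg is a known constant; the helper name is made up.

    #include "llvm/CodeGen/GlobalISel/Utils.h"

    static bool isKnownConstant(llvm::Register VReg,
                                const llvm::MachineRegisterInfo &MRI,
                                int64_t &Val) {
      if (llvm::Optional<int64_t> C = llvm::getConstantVRegSExtVal(VReg, MRI)) {
        Val = *C;
        return true;
      }
      return false;
    }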
llvm::MachineRegisterInfo::setSimpleHint
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
Definition: MachineRegisterInfo.h:781
llvm::AMDGPURegisterBankInfo::split64BitValueForMapping
void split64BitValueForMapping(MachineIRBuilder &B, SmallVector< Register, 2 > &Regs, LLT HalfTy, Register Reg) const
Split 64-bit value Reg into two 32-bit halves and populate them into Regs.
Definition: AMDGPURegisterBankInfo.cpp:646
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::RegisterBankInfo::OperandsMapper::getMI
MachineInstr & getMI() const
Definition: RegisterBankInfo.h:329
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition: MachineBasicBlock.h:225
llvm::AMDGPU::isFlatGlobalAddrSpace
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:420
llvm::AMDGPURegisterBankInfo::getDefaultMappingSOP
const InstructionMapping & getDefaultMappingSOP(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3071
llvm::MachineRegisterInfo::createGenericVirtualRegister
Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
Definition: MachineRegisterInfo.cpp:188
llvm::AMDGPURegisterBankInfo::applyMappingImage
bool applyMappingImage(MachineInstr &MI, const OperandsMapper &OpdMapper, MachineRegisterInfo &MRI, int RSrcIdx) const
Definition: AMDGPURegisterBankInfo.cpp:1265
llvm::AMDGPU::CPol::ALL
@ ALL
Definition: SIDefines.h:286
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition: MachineInstrBuilder.h:98
llvm::MachineInstrBuilder::addUse
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
Definition: MachineInstrBuilder.h:124
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition: MachineOperand.h:357
llvm::RegisterBankInfo::PartialMapping::RegBank
const RegisterBank * RegBank
Register bank where the partial value lives.
Definition: RegisterBankInfo.h:60
llvm::zip
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&... args)
zip iterator for two or more iterable types.
Definition: STLExtras.h:728
llvm::LLT::isScalar
bool isScalar() const
Definition: LowLevelTypeImpl.h:92
llvm::LegalizerHelper::widenScalar
LegalizeResult widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy)
Legalize an instruction by performing the operation on a wider scalar type (for example a 16-bit addi...
Definition: LegalizerHelper.cpp:1886
llvm::SIRegisterInfo::isSGPRClass
bool isSGPRClass(const TargetRegisterClass *RC) const
Definition: SIRegisterInfo.h:154
isReg
static bool isReg(const MCInst &MI, unsigned OpNo)
Definition: MipsInstPrinter.cpp:31
llvm::MachineRegisterInfo::getRegClassOrRegBank
const RegClassOrRegBank & getRegClassOrRegBank(Register Reg) const
Return the register bank or register class of Reg.
Definition: MachineRegisterInfo.h:668
llvm::MachineFunction
Definition: MachineFunction.h:227
llvm::RegisterBankInfo::getInstructionMapping
const InstructionMapping & getInstructionMapping(unsigned ID, unsigned Cost, const ValueMapping *OperandsMapping, unsigned NumOperands) const
Method to get a uniquely generated InstructionMapping.
Definition: RegisterBankInfo.h:526
llvm::MachineRegisterInfo::use_nodbg_empty
bool use_nodbg_empty(Register RegNo) const
use_nodbg_empty - Return true if there are no non-Debug instructions using the specified register.
Definition: MachineRegisterInfo.h:566
llvm::MachineBasicBlock::getFirstTerminator
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
Definition: MachineBasicBlock.cpp:241
llvm::RegisterBankInfo::ValueMapping
Helper struct that represents how a value is mapped through different register banks.
Definition: RegisterBankInfo.h:145
llvm::iterator_range::end
IteratorT end() const
Definition: iterator_range.h:45
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsConst
constexpr char IsConst[]
Key for Kernel::Arg::Metadata::mIsConst.
Definition: AMDGPUMetadata.h:190
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::AMDGPU::RsrcIntrinsic::IsImage
bool IsImage
Definition: AMDGPUInstrInfo.h:40
llvm::SrcOp::getReg
Register getReg() const
Definition: MachineIRBuilder.h:171
llvm::AMDGPU::lookupRsrcIntrinsic
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
llvm::MachineBasicBlock::splice
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Definition: MachineBasicBlock.h:863
AMDGPU.h
MBBI
MachineBasicBlock MachineBasicBlock::iterator MBBI
Definition: AArch64SLSHardening.cpp:75
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
extractSWZ
static unsigned extractSWZ(unsigned CachePolicy)
Definition: AMDGPURegisterBankInfo.cpp:1709
llvm::AMDGPURegisterBankInfo::applyMappingDynStackAlloc
bool applyMappingDynStackAlloc(MachineInstr &MI, const OperandsMapper &OpdMapper, MachineRegisterInfo &MRI) const
Definition: AMDGPURegisterBankInfo.cpp:1218
uint32_t
reinsertVectorIndexAdd
static void reinsertVectorIndexAdd(MachineIRBuilder &B, MachineInstr &IdxUseInstr, unsigned OpIdx, unsigned ConstOffset)
Utility function for pushing dynamic vector indexes with a constant offset into waterfall loops.
Definition: AMDGPURegisterBankInfo.cpp:1826
llvm::SITargetLowering::shouldExpandVectorDynExt
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
Definition: SIISelLowering.cpp:10031
llvm::ilist_node_impl::getIterator
self_iterator getIterator()
Definition: ilist_node.h:81
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::once_flag
std::once_flag once_flag
Definition: Threading.h:89
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:286
llvm::SmallSet::insert
std::pair< NoneType, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:180
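An illustrative use of SmallSet::insert and the flag it returns; the function is hypothetical.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallSet.h"
    #include "llvm/CodeGen/Register.h"

    // Count duplicate occurrences of registers in Regs.
    static unsigned countDuplicates(llvm::ArrayRef<llvm::Register> Regs) {
      llvm::SmallSet<llvm::Register, 4> Seen;
      unsigned Dups = 0;
      for (llvm::Register R : Regs)
        if (!Seen.insert(R).second) // false means R was already in the set
          ++Dups;
      return Dups;
    }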
llvm::AMDGPURegisterBankInfo::buildVCopy
bool buildVCopy(MachineIRBuilder &B, Register DstReg, Register SrcReg) const
Definition: AMDGPURegisterBankInfo.cpp:1791
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:281
llvm::GISelChangeObserver
Abstract class that contains various methods for clients to notify about changes.
Definition: GISelChangeObserver.h:29
llvm::AMDGPUGenRegisterBankInfo
This class provides the information for the target register banks.
Definition: AMDGPURegisterBankInfo.h:34
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::insert
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:207
llvm::RegisterBankInfo::PartialMapping::Length
unsigned Length
Length of this mapping in bits.
Definition: RegisterBankInfo.h:57
llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition: MachineMemOperand.h:134
MRI
unsigned const MachineRegisterInfo * MRI
Definition: AArch64AdvSIMDScalarPass.cpp:105
llvm::AMDGPURegisterBankInfo::handleD16VData
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg) const
Handle register layout difference for f16 images for some subtargets.
Definition: AMDGPURegisterBankInfo.cpp:1618
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
extendLow32IntoHigh32
static void extendLow32IntoHigh32(MachineIRBuilder &B, Register Hi32Reg, Register Lo32Reg, unsigned ExtOpc, const RegisterBank &RegBank, bool IsBooleanSrc=false)
Implement extending a 32-bit value to a 64-bit value.
Definition: AMDGPURegisterBankInfo.cpp:1847
llvm::AMDGPU::RsrcIntrinsic::RsrcArg
uint8_t RsrcArg
Definition: AMDGPUInstrInfo.h:39
llvm::AMDGPURegisterBankInfo::executeInWaterfallLoop
bool executeInWaterfallLoop(MachineIRBuilder &B, iterator_range< MachineBasicBlock::iterator > Range, SmallSet< Register, 4 > &SGPROperandRegs, MachineRegisterInfo &MRI) const
Legalize instruction MI where operands in OpIndices must be SGPRs.
Definition: AMDGPURegisterBankInfo.cpp:705
isZero
static bool isZero(Register Reg, MachineRegisterInfo &MRI)
Definition: AMDGPURegisterBankInfo.cpp:1700
MBB
MachineBasicBlock & MBB
Definition: AArch64SLSHardening.cpp:74
llvm::MachineMemOperand::isVolatile
bool isVolatile() const
Definition: MachineMemOperand.h:268
AMDGPUInstrInfo.h
regBankUnion
static unsigned regBankUnion(unsigned RB0, unsigned RB1)
Definition: AMDGPURegisterBankInfo.cpp:3006
llvm::empty
constexpr bool empty(const T &RangeOrContainer)
Test whether RangeOrContainer is empty. Similar to C++17 std::empty.
Definition: STLExtras.h:263
llvm::call_once
void call_once(once_flag &flag, Function &&F, Args &&... ArgList)
Execute the function specified as a parameter once.
Definition: Threading.h:119
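llvm::call_once mirrors std::call_once; a minimal, self-contained sketch:

    #include "llvm/Support/Threading.h"

    static llvm::once_flag InitFlag;

    static void expensiveInit() {
      // Runs at most once, even with concurrent callers.
    }

    void ensureInitialized() {
      llvm::call_once(InitFlag, expensiveInit);
    }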
llvm::MIPatternMatch::m_GAdd
BinaryOp_match< LHS, RHS, TargetOpcode::G_ADD, true > m_GAdd(const LHS &L, const RHS &R)
Definition: MIPatternMatch.h:234
llvm::DenseMapBase< DenseMap< KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >, KeyT, ValueT, DenseMapInfo< KeyT >, llvm::detail::DenseMapPair< KeyT, ValueT > >::end
iterator end()
Definition: DenseMap.h:83
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:314
llvm::GCNSubtarget::hasUnpackedD16VMem
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:631
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::ilist_iterator
Iterator for intrusive lists based on ilist_node.
Definition: ilist_iterator.h:57
llvm::MachineMemOperand::getSize
uint64_t getSize() const
Return the size in bytes of the memory reference.
Definition: MachineMemOperand.h:221
widen96To128
static LLT widen96To128(LLT Ty)
Definition: AMDGPURegisterBankInfo.cpp:1131
llvm::RegState::Undef
@ Undef
Value of the register doesn't matter.
Definition: MachineInstrBuilder.h:53
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:377
llvm::RegisterBankInfo::getInvalidInstructionMapping
const InstructionMapping & getInvalidInstructionMapping() const
Method to get a uniquely generated invalid InstructionMapping.
Definition: RegisterBankInfo.h:534
LegalizerHelper.h
llvm::AMDGPURegisterBankInfo::getInstrMappingForLoad
const RegisterBankInfo::InstructionMapping & getInstrMappingForLoad(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3191
llvm::AMDGPURegisterBankInfo::getDefaultMappingVOP
const InstructionMapping & getDefaultMappingVOP(const MachineInstr &MI) const
Definition: AMDGPURegisterBankInfo.cpp:3089
llvm::AMDGPU::splitMUBUFOffset
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, const GCNSubtarget *Subtarget, Align Alignment)
Definition: AMDGPUBaseInfo.cpp:1852
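A hedged sketch of splitMUBUFOffset splitting a raw byte offset into the SOffset/ImmOffset pair a MUBUF instruction can encode; the include paths, alignment, and wrapper are assumptions.

    #include "GCNSubtarget.h"         // AMDGPU target-local headers
    #include "Utils/AMDGPUBaseInfo.h"

    static bool canEncodeOffset(uint32_t Imm, const llvm::GCNSubtarget *ST) {
      uint32_t SOffset = 0, ImmOffset = 0;
      return llvm::AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, ST,
                                            llvm::Align(4));
    }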
llvm::AMDGPURegisterBankInfo::getInstrAlternativeMappings
InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override
Get the alternative mappings for MI.
Definition: AMDGPURegisterBankInfo.cpp:460
llvm::RegisterBankInfo::copyCost
virtual unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const
Get the cost of a copy from B to A, or put differently, get the cost of A = COPY B.
Definition: RegisterBankInfo.h:614
llvm::MachineMemOperand::isAtomic
bool isAtomic() const
Returns true if this operation has an atomic ordering requirement of unordered or higher,...
Definition: MachineMemOperand.h:275
llvm::MIPatternMatch::m_ICst
ConstantMatch m_ICst(int64_t &Cst)
Definition: MIPatternMatch.h:69
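These matchers combine with mi_match; a sketch (hypothetical helper) that recognizes a G_ADD of a register and a constant, much like the base-plus-offset splitting used in this file:

    #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"

    static bool matchAddOfConstant(llvm::Register Reg,
                                   const llvm::MachineRegisterInfo &MRI,
                                   llvm::Register &Base, int64_t &Offset) {
      using namespace llvm::MIPatternMatch;
      return mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Offset)));
    }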
llvm::makeArrayRef
ArrayRef< T > makeArrayRef(const T &OneElt)
Construct an ArrayRef from a single element.
Definition: ArrayRef.h:474
llvm::RegisterBankInfo::ValueMapping::NumBreakDowns
unsigned NumBreakDowns
Number of partial mapping to break down this value.
Definition: RegisterBankInfo.h:150
memOpHasNoClobbered
static bool memOpHasNoClobbered(const MachineMemOperand *MMO)
Definition: AMDGPURegisterBankInfo.cpp:431
llvm::MachineBasicBlock::begin
iterator begin()
Definition: MachineBasicBlock.h:268
llvm::MachineRegisterInfo::getType
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
Definition: MachineRegisterInfo.h:732
RegisterBank.h
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition: MachineInstrBuilder.h:329
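A short sketch of the BuildMI pattern used throughout this file: emit an instruction before an insertion point and attach operands with the fluent builder. The opcode and registers are placeholders.

    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstrBuilder.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"

    static void emitMoveLike(llvm::MachineBasicBlock &MBB,
                             llvm::MachineBasicBlock::iterator InsertPt,
                             const llvm::DebugLoc &DL,
                             const llvm::TargetInstrInfo &TII, unsigned Opcode,
                             llvm::Register DstReg, llvm::Register SrcReg) {
      llvm::BuildMI(MBB, InsertPt, DL, TII.get(Opcode), DstReg)
          .addReg(SrcReg, llvm::RegState::Kill);
    }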
N
#define N
llvm::MachineOperand::setReg
void setReg(Register Reg)
Change the register this operand corresponds to.