1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
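// (ds_read2/ds_write2 offsets are encoded in units of the element size, so
// the byte offsets 16 and 32 above become offset0:4 and offset1:8 for the
// 4-byte elements of ds_read_b32.)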
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads has offsets that are too large to fit in the 8-bit
55 // offset fields, but are close enough together that their differences do
56 // fit, we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73  UNKNOWN,
74  DS_READ,
75  DS_WRITE,
76  S_BUFFER_LOAD_IMM,
77  S_BUFFER_LOAD_SGPR_IMM,
78  S_LOAD_IMM,
79  BUFFER_LOAD,
80  BUFFER_STORE,
81  MIMG,
82  TBUFFER_LOAD,
83  TBUFFER_STORE,
84  GLOBAL_LOAD_SADDR,
85  GLOBAL_STORE_SADDR,
86  FLAT_LOAD,
87  FLAT_STORE,
88  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89  GLOBAL_STORE // any CombineInfo, they are only ever returned by
90  // getCommonInstClass.
91 };
92 
93 struct AddressRegs {
94  unsigned char NumVAddrs = 0;
95  bool SBase = false;
96  bool SRsrc = false;
97  bool SOffset = false;
98  bool SAddr = false;
99  bool VAddr = false;
100  bool Addr = false;
101  bool SSamp = false;
102 };
103 
104 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
106 
107 class SILoadStoreOptimizer : public MachineFunctionPass {
108  struct CombineInfo {
109  MachineBasicBlock::iterator I;
110  unsigned EltSize;
111  unsigned Offset;
112  unsigned Width;
113  unsigned Format;
114  unsigned BaseOff;
115  unsigned DMask;
116  InstClassEnum InstClass;
117  unsigned CPol = 0;
118  bool IsAGPR;
119  bool UseST64;
120  int AddrIdx[MaxAddressRegs];
121  const MachineOperand *AddrReg[MaxAddressRegs];
122  unsigned NumAddresses;
123  unsigned Order;
124 
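 // Return true if this instruction and \p CI address memory through the same
 // base operands: every address operand pair must be either equal immediates
 // or the same register and subregister.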
125  bool hasSameBaseAddress(const CombineInfo &CI) {
126  if (NumAddresses != CI.NumAddresses)
127  return false;
128 
129  const MachineInstr &MI = *CI.I;
130  for (unsigned i = 0; i < NumAddresses; i++) {
131  const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132 
133  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136  return false;
137  }
138  continue;
139  }
140 
141  // Check same base pointer. Be careful of subregisters, which can occur
142  // with vectors of pointers.
143  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145  return false;
146  }
147  }
148  return true;
149  }
150 
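 // Return true if every address operand is a plausible candidate for merging:
 // an immediate, or a virtual register with more than one use. Physical
 // registers and single-use addresses are rejected.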
151  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152  for (unsigned i = 0; i < NumAddresses; ++i) {
153  const MachineOperand *AddrOp = AddrReg[i];
154  // Immediates are always OK.
155  if (AddrOp->isImm())
156  continue;
157 
158  // Don't try to merge addresses that aren't either immediates or registers.
159  // TODO: Should be possible to merge FrameIndexes and maybe some other
160  // non-register operands.
161  if (!AddrOp->isReg())
162  return false;
163 
164  // TODO: We should be able to merge physical reg addresses.
165  if (AddrOp->getReg().isPhysical())
166  return false;
167 
168  // If an address has only one use then there will be no other
169  // instructions with the same address, so we can't merge this one.
170  if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
171  return false;
172  }
173  return true;
174  }
175 
176  void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
177 
178  // Compare by pointer order.
179  bool operator<(const CombineInfo& Other) const {
180  return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
181  }
182  };
183 
184  struct BaseRegisters {
185  Register LoReg;
186  Register HiReg;
187 
188  unsigned LoSubReg = 0;
189  unsigned HiSubReg = 0;
190  };
191 
192  struct MemAddress {
193  BaseRegisters Base;
194  int64_t Offset = 0;
195  };
196 
197  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
198 
199 private:
200  const GCNSubtarget *STM = nullptr;
201  const SIInstrInfo *TII = nullptr;
202  const SIRegisterInfo *TRI = nullptr;
203  MachineRegisterInfo *MRI = nullptr;
204  AliasAnalysis *AA = nullptr;
205  bool OptimizeAgain;
206 
207  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
208  const DenseSet<Register> &ARegUses,
209  const MachineInstr &A, const MachineInstr &B) const;
210  static bool dmasksCanBeCombined(const CombineInfo &CI,
211  const SIInstrInfo &TII,
212  const CombineInfo &Paired);
213  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
214  CombineInfo &Paired, bool Modify = false);
215  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
216  const CombineInfo &Paired);
217  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
218  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
219  const CombineInfo &Paired);
220  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
221  const CombineInfo &Paired);
222  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
223 
224  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
225 
226  unsigned read2Opcode(unsigned EltSize) const;
227  unsigned read2ST64Opcode(unsigned EltSize) const;
228  MachineBasicBlock::iterator
229  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
230  MachineBasicBlock::iterator InsertBefore);
231 
232  unsigned write2Opcode(unsigned EltSize) const;
233  unsigned write2ST64Opcode(unsigned EltSize) const;
234  MachineBasicBlock::iterator
235  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
236  MachineBasicBlock::iterator InsertBefore);
237  MachineBasicBlock::iterator
238  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
239  MachineBasicBlock::iterator InsertBefore);
240  MachineBasicBlock::iterator
241  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
242  MachineBasicBlock::iterator InsertBefore);
243  MachineBasicBlock::iterator
244  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245  MachineBasicBlock::iterator InsertBefore);
246  MachineBasicBlock::iterator
247  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248  MachineBasicBlock::iterator InsertBefore);
249  MachineBasicBlock::iterator
250  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
251  MachineBasicBlock::iterator InsertBefore);
252  MachineBasicBlock::iterator
253  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
254  MachineBasicBlock::iterator InsertBefore);
255  MachineBasicBlock::iterator
256  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
257  MachineBasicBlock::iterator InsertBefore);
258  MachineBasicBlock::iterator
259  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
260  MachineBasicBlock::iterator InsertBefore);
261 
262  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
263  int32_t NewOffset) const;
264  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
265  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
266  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
267  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
268  /// Promotes constant offset to the immediate by adjusting the base. It
269  /// tries to use a base from the nearby instructions that allows it to have
270  /// a 13-bit constant offset which gets promoted to the immediate.
271  bool promoteConstantOffsetToImm(MachineInstr &CI,
272  MemInfoMap &Visited,
273  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
274  void addInstToMergeableList(const CombineInfo &CI,
275  std::list<std::list<CombineInfo> > &MergeableInsts) const;
276 
277  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
278  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
279  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
280  std::list<std::list<CombineInfo>> &MergeableInsts) const;
281 
282  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
283  const CombineInfo &Paired);
284 
285  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
286  const CombineInfo &Paired);
287 
288 public:
289  static char ID;
290 
291  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
292  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
293  }
294 
295  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
296  bool &OptimizeListAgain);
297  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
298 
299  bool runOnMachineFunction(MachineFunction &MF) override;
300 
301  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
302 
303  void getAnalysisUsage(AnalysisUsage &AU) const override {
304  AU.setPreservesCFG();
305  AU.addRequired<AAResultsWrapperPass>();
306 
307  MachineFunctionPass::getAnalysisUsage(AU);
308  }
309 
310  MachineFunctionProperties getRequiredProperties() const override {
311  return MachineFunctionProperties()
312  .set(MachineFunctionProperties::Property::IsSSA);
313  }
314 };
315 
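// Return the width of the memory access in dwords (for MIMG, the number of
// enabled dmask channels), or 0 if the opcode is not handled by this pass.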
316 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
317  const unsigned Opc = MI.getOpcode();
318 
319  if (TII.isMUBUF(Opc)) {
320  // FIXME: Handle d16 correctly
321  return AMDGPU::getMUBUFElements(Opc);
322  }
323  if (TII.isMIMG(MI)) {
324  uint64_t DMaskImm =
325  TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
326  return countPopulation(DMaskImm);
327  }
328  if (TII.isMTBUF(Opc)) {
329  return AMDGPU::getMTBUFElements(Opc);
330  }
331 
332  switch (Opc) {
333  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
334  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
335  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
336  case AMDGPU::S_LOAD_DWORD_IMM:
337  case AMDGPU::GLOBAL_LOAD_DWORD:
338  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
339  case AMDGPU::GLOBAL_STORE_DWORD:
340  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
341  case AMDGPU::FLAT_LOAD_DWORD:
342  case AMDGPU::FLAT_STORE_DWORD:
343  return 1;
344  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
345  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
346  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347  case AMDGPU::S_LOAD_DWORDX2_IMM:
348  case AMDGPU::GLOBAL_LOAD_DWORDX2:
349  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350  case AMDGPU::GLOBAL_STORE_DWORDX2:
351  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352  case AMDGPU::FLAT_LOAD_DWORDX2:
353  case AMDGPU::FLAT_STORE_DWORDX2:
354  return 2;
355  case AMDGPU::GLOBAL_LOAD_DWORDX3:
356  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
357  case AMDGPU::GLOBAL_STORE_DWORDX3:
358  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
359  case AMDGPU::FLAT_LOAD_DWORDX3:
360  case AMDGPU::FLAT_STORE_DWORDX3:
361  return 3;
362  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
363  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
364  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
365  case AMDGPU::S_LOAD_DWORDX4_IMM:
366  case AMDGPU::GLOBAL_LOAD_DWORDX4:
367  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
368  case AMDGPU::GLOBAL_STORE_DWORDX4:
369  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
370  case AMDGPU::FLAT_LOAD_DWORDX4:
371  case AMDGPU::FLAT_STORE_DWORDX4:
372  return 4;
373  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
374  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
375  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
376  case AMDGPU::S_LOAD_DWORDX8_IMM:
377  return 8;
378  case AMDGPU::DS_READ_B32: [[fallthrough]];
379  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
380  case AMDGPU::DS_WRITE_B32: [[fallthrough]];
381  case AMDGPU::DS_WRITE_B32_gfx9:
382  return 1;
383  case AMDGPU::DS_READ_B64: [[fallthrough]];
384  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
385  case AMDGPU::DS_WRITE_B64: [[fallthrough]];
386  case AMDGPU::DS_WRITE_B64_gfx9:
387  return 2;
388  default:
389  return 0;
390  }
391 }
392 
393 /// Maps instruction opcode to enum InstClassEnum.
394 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
395  switch (Opc) {
396  default:
397  if (TII.isMUBUF(Opc)) {
398  switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
399  default:
400  return UNKNOWN;
401  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
402  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
403  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
404  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
405  return BUFFER_LOAD;
406  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
407  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
408  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
409  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
410  return BUFFER_STORE;
411  }
412  }
413  if (TII.isMIMG(Opc)) {
414  // Ignore instructions encoded without vaddr.
415  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
416  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
417  return UNKNOWN;
418  // Ignore BVH instructions
419  if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
420  return UNKNOWN;
421  // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
422  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
423  TII.isGather4(Opc))
424  return UNKNOWN;
425  return MIMG;
426  }
427  if (TII.isMTBUF(Opc)) {
428  switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
429  default:
430  return UNKNOWN;
431  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
432  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
433  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
434  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
435  return TBUFFER_LOAD;
436  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
437  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
438  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
439  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
440  return TBUFFER_STORE;
441  }
442  }
443  return UNKNOWN;
444  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
445  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
446  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
447  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
448  return S_BUFFER_LOAD_IMM;
449  // For the purposes of this optimization SGPR variants of buffer loads
450  // are considered to be zero-offsetted SGPR_IMM loads.
451  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
452  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
453  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
454  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
455  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
456  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
457  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
458  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
459  return S_BUFFER_LOAD_SGPR_IMM;
460  case AMDGPU::S_LOAD_DWORD_IMM:
461  case AMDGPU::S_LOAD_DWORDX2_IMM:
462  case AMDGPU::S_LOAD_DWORDX4_IMM:
463  case AMDGPU::S_LOAD_DWORDX8_IMM:
464  return S_LOAD_IMM;
465  case AMDGPU::DS_READ_B32:
466  case AMDGPU::DS_READ_B32_gfx9:
467  case AMDGPU::DS_READ_B64:
468  case AMDGPU::DS_READ_B64_gfx9:
469  return DS_READ;
470  case AMDGPU::DS_WRITE_B32:
471  case AMDGPU::DS_WRITE_B32_gfx9:
472  case AMDGPU::DS_WRITE_B64:
473  case AMDGPU::DS_WRITE_B64_gfx9:
474  return DS_WRITE;
475  case AMDGPU::GLOBAL_LOAD_DWORD:
476  case AMDGPU::GLOBAL_LOAD_DWORDX2:
477  case AMDGPU::GLOBAL_LOAD_DWORDX3:
478  case AMDGPU::GLOBAL_LOAD_DWORDX4:
479  case AMDGPU::FLAT_LOAD_DWORD:
480  case AMDGPU::FLAT_LOAD_DWORDX2:
481  case AMDGPU::FLAT_LOAD_DWORDX3:
482  case AMDGPU::FLAT_LOAD_DWORDX4:
483  return FLAT_LOAD;
484  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
485  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
486  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
487  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
488  return GLOBAL_LOAD_SADDR;
489  case AMDGPU::GLOBAL_STORE_DWORD:
490  case AMDGPU::GLOBAL_STORE_DWORDX2:
491  case AMDGPU::GLOBAL_STORE_DWORDX3:
492  case AMDGPU::GLOBAL_STORE_DWORDX4:
493  case AMDGPU::FLAT_STORE_DWORD:
494  case AMDGPU::FLAT_STORE_DWORDX2:
495  case AMDGPU::FLAT_STORE_DWORDX3:
496  case AMDGPU::FLAT_STORE_DWORDX4:
497  return FLAT_STORE;
498  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
499  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
500  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
501  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
502  return GLOBAL_STORE_SADDR;
503  }
504 }
505 
506 /// Determines instruction subclass from opcode. Only instructions
507 /// of the same subclass can be merged together. The merged instruction may have
508 /// a different subclass but must have the same class.
509 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
510  switch (Opc) {
511  default:
512  if (TII.isMUBUF(Opc))
513  return AMDGPU::getMUBUFBaseOpcode(Opc);
514  if (TII.isMIMG(Opc)) {
515  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
516  assert(Info);
517  return Info->BaseOpcode;
518  }
519  if (TII.isMTBUF(Opc))
520  return AMDGPU::getMTBUFBaseOpcode(Opc);
521  return -1;
522  case AMDGPU::DS_READ_B32:
523  case AMDGPU::DS_READ_B32_gfx9:
524  case AMDGPU::DS_READ_B64:
525  case AMDGPU::DS_READ_B64_gfx9:
526  case AMDGPU::DS_WRITE_B32:
527  case AMDGPU::DS_WRITE_B32_gfx9:
528  case AMDGPU::DS_WRITE_B64:
529  case AMDGPU::DS_WRITE_B64_gfx9:
530  return Opc;
531  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
532  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
533  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
534  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
535  return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
536  // For the purposes of this optimization SGPR variants of buffer loads
537  // are considered to be zero-offsetted SGPR_IMM loads.
538  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
539  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
540  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
541  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
542  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
543  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
544  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
545  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
546  return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
547  case AMDGPU::S_LOAD_DWORD_IMM:
548  case AMDGPU::S_LOAD_DWORDX2_IMM:
549  case AMDGPU::S_LOAD_DWORDX4_IMM:
550  case AMDGPU::S_LOAD_DWORDX8_IMM:
551  return AMDGPU::S_LOAD_DWORD_IMM;
552  case AMDGPU::GLOBAL_LOAD_DWORD:
553  case AMDGPU::GLOBAL_LOAD_DWORDX2:
554  case AMDGPU::GLOBAL_LOAD_DWORDX3:
555  case AMDGPU::GLOBAL_LOAD_DWORDX4:
556  case AMDGPU::FLAT_LOAD_DWORD:
557  case AMDGPU::FLAT_LOAD_DWORDX2:
558  case AMDGPU::FLAT_LOAD_DWORDX3:
559  case AMDGPU::FLAT_LOAD_DWORDX4:
560  return AMDGPU::FLAT_LOAD_DWORD;
561  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
562  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
563  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
564  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
565  return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
566  case AMDGPU::GLOBAL_STORE_DWORD:
567  case AMDGPU::GLOBAL_STORE_DWORDX2:
568  case AMDGPU::GLOBAL_STORE_DWORDX3:
569  case AMDGPU::GLOBAL_STORE_DWORDX4:
570  case AMDGPU::FLAT_STORE_DWORD:
571  case AMDGPU::FLAT_STORE_DWORDX2:
572  case AMDGPU::FLAT_STORE_DWORDX3:
573  case AMDGPU::FLAT_STORE_DWORDX4:
574  return AMDGPU::FLAT_STORE_DWORD;
575  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
576  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
577  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
578  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
579  return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
580  }
581 }
582 
583 // GLOBAL loads and stores are classified as FLAT initially. If both combined
584 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
585 // If either or both instructions are non-segment-specific FLAT, the resulting
586 // combined operation will be FLAT, potentially promoting one of the GLOBAL
587 // operations to FLAT.
588 // For other instructions return the original unmodified class.
589 InstClassEnum
590 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
591  const CombineInfo &Paired) {
592  assert(CI.InstClass == Paired.InstClass);
593 
594  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
595  SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
596  return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
597 
598  return CI.InstClass;
599 }
600 
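// Determine which named address operands (vaddr, srsrc, soffset, saddr, ...)
// the given opcode carries; setMI() records these so that hasSameBaseAddress()
// and hasMergeableAddress() can compare them when looking for pairs.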
601 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
602  AddressRegs Result;
603 
604  if (TII.isMUBUF(Opc)) {
605  if (AMDGPU::getMUBUFHasVAddr(Opc))
606  Result.VAddr = true;
607  if (AMDGPU::getMUBUFHasSrsrc(Opc))
608  Result.SRsrc = true;
609  if (AMDGPU::getMUBUFHasSoffset(Opc))
610  Result.SOffset = true;
611 
612  return Result;
613  }
614 
615  if (TII.isMIMG(Opc)) {
616  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
617  if (VAddr0Idx >= 0) {
618  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
619  Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
620  } else {
621  Result.VAddr = true;
622  }
623  Result.SRsrc = true;
624  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
625  if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
626  Result.SSamp = true;
627 
628  return Result;
629  }
630  if (TII.isMTBUF(Opc)) {
631  if (AMDGPU::getMTBUFHasVAddr(Opc))
632  Result.VAddr = true;
633  if (AMDGPU::getMTBUFHasSrsrc(Opc))
634  Result.SRsrc = true;
635  if (AMDGPU::getMTBUFHasSoffset(Opc))
636  Result.SOffset = true;
637 
638  return Result;
639  }
640 
641  switch (Opc) {
642  default:
643  return Result;
644  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
645  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
646  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
647  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
648  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
649  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
650  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
651  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
652  Result.SOffset = true;
653  [[fallthrough]];
654  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
655  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
656  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
657  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
658  case AMDGPU::S_LOAD_DWORD_IMM:
659  case AMDGPU::S_LOAD_DWORDX2_IMM:
660  case AMDGPU::S_LOAD_DWORDX4_IMM:
661  case AMDGPU::S_LOAD_DWORDX8_IMM:
662  Result.SBase = true;
663  return Result;
664  case AMDGPU::DS_READ_B32:
665  case AMDGPU::DS_READ_B64:
666  case AMDGPU::DS_READ_B32_gfx9:
667  case AMDGPU::DS_READ_B64_gfx9:
668  case AMDGPU::DS_WRITE_B32:
669  case AMDGPU::DS_WRITE_B64:
670  case AMDGPU::DS_WRITE_B32_gfx9:
671  case AMDGPU::DS_WRITE_B64_gfx9:
672  Result.Addr = true;
673  return Result;
674  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
679  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
680  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
681  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
682  Result.SAddr = true;
683  [[fallthrough]];
684  case AMDGPU::GLOBAL_LOAD_DWORD:
685  case AMDGPU::GLOBAL_LOAD_DWORDX2:
686  case AMDGPU::GLOBAL_LOAD_DWORDX3:
687  case AMDGPU::GLOBAL_LOAD_DWORDX4:
688  case AMDGPU::GLOBAL_STORE_DWORD:
689  case AMDGPU::GLOBAL_STORE_DWORDX2:
690  case AMDGPU::GLOBAL_STORE_DWORDX3:
691  case AMDGPU::GLOBAL_STORE_DWORDX4:
692  case AMDGPU::FLAT_LOAD_DWORD:
693  case AMDGPU::FLAT_LOAD_DWORDX2:
694  case AMDGPU::FLAT_LOAD_DWORDX3:
695  case AMDGPU::FLAT_LOAD_DWORDX4:
696  case AMDGPU::FLAT_STORE_DWORD:
697  case AMDGPU::FLAT_STORE_DWORDX2:
698  case AMDGPU::FLAT_STORE_DWORDX3:
699  case AMDGPU::FLAT_STORE_DWORDX4:
700  Result.VAddr = true;
701  return Result;
702  }
703 }
704 
705 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
706  const SILoadStoreOptimizer &LSO) {
707  I = MI;
708  unsigned Opc = MI->getOpcode();
709  InstClass = getInstClass(Opc, *LSO.TII);
710 
711  if (InstClass == UNKNOWN)
712  return;
713 
714  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
715 
716  switch (InstClass) {
717  case DS_READ:
718  EltSize =
719  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
720  : 4;
721  break;
722  case DS_WRITE:
723  EltSize =
724  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
725  : 4;
726  break;
727  case S_BUFFER_LOAD_IMM:
728  case S_BUFFER_LOAD_SGPR_IMM:
729  case S_LOAD_IMM:
730  EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
731  break;
732  default:
733  EltSize = 4;
734  break;
735  }
736 
737  if (InstClass == MIMG) {
738  DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
739  // Offset is not considered for MIMG instructions.
740  Offset = 0;
741  } else {
742  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
743  Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
744  }
745 
746  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
747  Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
748 
749  Width = getOpcodeWidth(*I, *LSO.TII);
750 
751  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
752  Offset &= 0xffff;
753  } else if (InstClass != MIMG) {
754  CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
755  }
756 
757  AddressRegs Regs = getRegs(Opc, *LSO.TII);
758 
759  NumAddresses = 0;
760  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
761  AddrIdx[NumAddresses++] =
762  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
763  if (Regs.Addr)
764  AddrIdx[NumAddresses++] =
765  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
766  if (Regs.SBase)
767  AddrIdx[NumAddresses++] =
768  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
769  if (Regs.SRsrc)
770  AddrIdx[NumAddresses++] =
771  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
772  if (Regs.SOffset)
773  AddrIdx[NumAddresses++] =
774  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
775  if (Regs.SAddr)
776  AddrIdx[NumAddresses++] =
777  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
778  if (Regs.VAddr)
779  AddrIdx[NumAddresses++] =
780  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
781  if (Regs.SSamp)
782  AddrIdx[NumAddresses++] =
783  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
784  assert(NumAddresses <= MaxAddressRegs);
785 
786  for (unsigned J = 0; J < NumAddresses; J++)
787  AddrReg[J] = &I->getOperand(AddrIdx[J]);
788 }
789 
790 } // end anonymous namespace.
791 
792 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
793  "SI Load Store Optimizer", false, false)
794 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
795 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
796  false, false)
797 
798 char SILoadStoreOptimizer::ID = 0;
799 
800 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
801 
802 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
803  return new SILoadStoreOptimizer();
804 }
805 
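// Record every register defined or read by \p MI. canSwapInstructions() uses
// these sets to detect register dependences when moving one memory operation
// across another.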
806 static void addDefsUsesToList(const MachineInstr &MI,
807  DenseSet<Register> &RegDefs,
808  DenseSet<Register> &RegUses) {
809  for (const auto &Op : MI.operands()) {
810  if (!Op.isReg())
811  continue;
812  if (Op.isDef())
813  RegDefs.insert(Op.getReg());
814  if (Op.readsReg())
815  RegUses.insert(Op.getReg());
816  }
817 }
818 
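// Return true if \p B can safely be moved across \p A (whose defs and uses are
// given in ARegDefs/ARegUses): the two must not have an aliasing memory
// dependence, and B must not touch a register that A defines or define one
// that A uses.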
819 bool SILoadStoreOptimizer::canSwapInstructions(
820  const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
821  const MachineInstr &A, const MachineInstr &B) const {
822  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
823  (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
824  return false;
825  for (const auto &BOp : B.operands()) {
826  if (!BOp.isReg())
827  continue;
828  if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
829  return false;
830  if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
831  return false;
832  }
833  return true;
834 }
835 
836 // Given that \p CI and \p Paired are adjacent memory operations produce a new
837 // MMO for the combined operation with a new access size.
838 MachineMemOperand *
839 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
840  const CombineInfo &Paired) {
841  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
842  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
843 
844  unsigned Size = MMOa->getSize() + MMOb->getSize();
845 
846  // A base pointer for the combined operation is the same as the leading
847  // operation's pointer.
848  if (Paired < CI)
849  std::swap(MMOa, MMOb);
850 
851  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
852  // If merging FLAT and GLOBAL set address space to FLAT.
853  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
854  PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
855 
856  MachineFunction *MF = CI.I->getMF();
857  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
858 }
859 
860 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
861  const SIInstrInfo &TII,
862  const CombineInfo &Paired) {
863  assert(CI.InstClass == MIMG);
864 
865  // Ignore instructions with tfe/lwe set.
866  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
867  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
868 
869  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
870  return false;
871 
872  // Check other optional immediate operands for equality.
873  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
874  AMDGPU::OpName::unorm, AMDGPU::OpName::da,
875  AMDGPU::OpName::r128, AMDGPU::OpName::a16};
876 
877  for (auto op : OperandsToMatch) {
878  int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
879  if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
880  return false;
881  if (Idx != -1 &&
882  CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
883  return false;
884  }
885 
886  // Check DMask for overlaps.
887  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
888  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
889 
890  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
891  if ((1u << AllowedBitsForMin) <= MinMask)
892  return false;
893 
894  return true;
895 }
896 
897 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
898  unsigned ComponentCount,
899  const GCNSubtarget &STI) {
900  if (ComponentCount > 4)
901  return 0;
902 
903  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
904  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
905  if (!OldFormatInfo)
906  return 0;
907 
908  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
909  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
910  ComponentCount,
911  OldFormatInfo->NumFormat, STI);
912 
913  if (!NewFormatInfo)
914  return 0;
915 
916  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
917  NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
918 
919  return NewFormatInfo->Format;
920 }
921 
922 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
923 // highest power of two. Note that the result is well defined for all inputs
924 // including corner cases like:
925 // - if Lo == Hi, return that value
926 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
927 // - if Lo > Hi, return 0 (as if the range wrapped around)
928 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
929  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
930 }
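// For example, mostAlignedValueInRange(5, 10) == 8: 8 is the value in [5, 10]
// that is aligned to the highest power of two.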
931 
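// Check whether the offsets of CI and Paired can be encoded in a single merged
// instruction. For DS this may mean using the ST64 forms or introducing a new
// common base (BaseOff); when \p Modify is true the CombineInfos are updated
// in place with the chosen encoding.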
932 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
933  const GCNSubtarget &STI,
934  CombineInfo &Paired,
935  bool Modify) {
936  assert(CI.InstClass != MIMG);
937 
938  // XXX - Would the same offset be OK? Is there any reason this would happen or
939  // be useful?
940  if (CI.Offset == Paired.Offset)
941  return false;
942 
943  // This won't be valid if the offset isn't aligned.
944  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
945  return false;
946 
947  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
948 
949  const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
950  llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
951  if (!Info0)
952  return false;
953  const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
954  llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
955  if (!Info1)
956  return false;
957 
958  if (Info0->BitsPerComp != Info1->BitsPerComp ||
959  Info0->NumFormat != Info1->NumFormat)
960  return false;
961 
962  // TODO: Should be possible to support more formats, but if format loads
963  // are not dword-aligned, the merged load might not be valid.
964  if (Info0->BitsPerComp != 32)
965  return false;
966 
967  if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
968  return false;
969  }
970 
971  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
972  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
973  CI.UseST64 = false;
974  CI.BaseOff = 0;
975 
976  // Handle all non-DS instructions.
977  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
978  return (EltOffset0 + CI.Width == EltOffset1 ||
979  EltOffset1 + Paired.Width == EltOffset0) &&
980  CI.CPol == Paired.CPol;
981  }
982 
983  // If the offset in elements doesn't fit in 8-bits, we might be able to use
984  // the stride 64 versions.
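// E.g. with EltSize == 4, byte offsets 0x1000 and 0x1800 become element
// offsets 1024 and 1536; both are multiples of 64, so the ST64 form encodes
// them as 16 and 24.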
985  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
986  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
987  if (Modify) {
988  CI.Offset = EltOffset0 / 64;
989  Paired.Offset = EltOffset1 / 64;
990  CI.UseST64 = true;
991  }
992  return true;
993  }
994 
995  // Check if the new offsets fit in the reduced 8-bit range.
996  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
997  if (Modify) {
998  CI.Offset = EltOffset0;
999  Paired.Offset = EltOffset1;
1000  }
1001  return true;
1002  }
1003 
1004  // Try to shift base address to decrease offsets.
1005  uint32_t Min = std::min(EltOffset0, EltOffset1);
1006  uint32_t Max = std::max(EltOffset0, EltOffset1);
1007 
1008  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1009  if (((Max - Min) & ~Mask) == 0) {
1010  if (Modify) {
1011  // From the range of values we could use for BaseOff, choose the one that
1012  // is aligned to the highest power of two, to maximise the chance that
1013  // the same offset can be reused for other load/store pairs.
1014  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1015  // Copy the low bits of the offsets, so that when we adjust them by
1016  // subtracting BaseOff they will be multiples of 64.
1017  BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1018  CI.BaseOff = BaseOff * CI.EltSize;
1019  CI.Offset = (EltOffset0 - BaseOff) / 64;
1020  Paired.Offset = (EltOffset1 - BaseOff) / 64;
1021  CI.UseST64 = true;
1022  }
1023  return true;
1024  }
1025 
1026  if (isUInt<8>(Max - Min)) {
1027  if (Modify) {
1028  // From the range of values we could use for BaseOff, choose the one that
1029  // is aligned to the highest power of two, to maximise the chance that
1030  // the same offset can be reused for other load/store pairs.
1031  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1032  CI.BaseOff = BaseOff * CI.EltSize;
1033  CI.Offset = EltOffset0 - BaseOff;
1034  Paired.Offset = EltOffset1 - BaseOff;
1035  }
1036  return true;
1037  }
1038 
1039  return false;
1040 }
1041 
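// Check that the combined width is one we can select: at most 4 (and only 3
// when the target has dwordx3 load/stores) for most classes, and exactly 2, 4
// or 8 for the SMEM load classes.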
1042 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1043  const CombineInfo &CI,
1044  const CombineInfo &Paired) {
1045  const unsigned Width = (CI.Width + Paired.Width);
1046  switch (CI.InstClass) {
1047  default:
1048  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1049  case S_BUFFER_LOAD_IMM:
1050  case S_BUFFER_LOAD_SGPR_IMM:
1051  case S_LOAD_IMM:
1052  switch (Width) {
1053  default:
1054  return false;
1055  case 2:
1056  case 4:
1057  case 8:
1058  return true;
1059  }
1060  }
1061 }
1062 
1063 const TargetRegisterClass *
1064 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1065  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1066  return TRI->getRegClassForReg(*MRI, Dst->getReg());
1067  }
1068  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1069  return TRI->getRegClassForReg(*MRI, Src->getReg());
1070  }
1071  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1072  return TRI->getRegClassForReg(*MRI, Src->getReg());
1073  }
1074  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1075  return TRI->getRegClassForReg(*MRI, Dst->getReg());
1076  }
1077  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1078  return TRI->getRegClassForReg(*MRI, Src->getReg());
1079  }
1080  return nullptr;
1081 }
1082 
1083 /// This function assumes that CI comes before Paired in a basic block. Return
1084 /// an insertion point for the merged instruction or nullptr on failure.
1085 SILoadStoreOptimizer::CombineInfo *
1086 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1087  CombineInfo &Paired) {
1088  // If another instruction has already been merged into CI, it may now be a
1089  // type that we can't do any further merging into.
1090  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1091  return nullptr;
1092  assert(CI.InstClass == Paired.InstClass);
1093 
1094  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1095  getInstSubclass(Paired.I->getOpcode(), *TII))
1096  return nullptr;
1097 
1098  // Check both offsets (or masks for MIMG) can be combined and fit in the
1099  // reduced range.
1100  if (CI.InstClass == MIMG) {
1101  if (!dmasksCanBeCombined(CI, *TII, Paired))
1102  return nullptr;
1103  } else {
1104  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1105  return nullptr;
1106  }
1107 
1108  DenseSet<Register> RegDefs;
1109  DenseSet<Register> RegUses;
1110  CombineInfo *Where;
1111  if (CI.I->mayLoad()) {
1112  // Try to hoist Paired up to CI.
1113  addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1114  for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1115  if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1116  return nullptr;
1117  }
1118  Where = &CI;
1119  } else {
1120  // Try to sink CI down to Paired.
1121  addDefsUsesToList(*CI.I, RegDefs, RegUses);
1122  for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1123  if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1124  return nullptr;
1125  }
1126  Where = &Paired;
1127  }
1128 
1129  // Call offsetsCanBeCombined with modify = true so that the offsets are
1130  // correct for the new instruction. This should return true, because
1131  // this function should only be called on CombineInfo objects that
1132  // have already been confirmed to be mergeable.
1133  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1134  offsetsCanBeCombined(CI, *STM, Paired, true);
1135  return Where;
1136 }
1137 
1138 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1139  if (STM->ldsRequiresM0Init())
1140  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1141  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1142 }
1143 
1144 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1145  if (STM->ldsRequiresM0Init())
1146  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1147 
1148  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1149  : AMDGPU::DS_READ2ST64_B64_gfx9;
1150 }
1151 
1152 MachineBasicBlock::iterator
1153 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1154  MachineBasicBlock::iterator InsertBefore) {
1155  MachineBasicBlock *MBB = CI.I->getParent();
1156 
1157  // Be careful, since the addresses could be subregisters themselves in weird
1158  // cases, like vectors of pointers.
1159  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1160 
1161  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1162  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1163 
1164  unsigned NewOffset0 = CI.Offset;
1165  unsigned NewOffset1 = Paired.Offset;
1166  unsigned Opc =
1167  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1168 
1169  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1170  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1171 
1172  if (NewOffset0 > NewOffset1) {
1173  // Canonicalize the merged instruction so the smaller offset comes first.
1174  std::swap(NewOffset0, NewOffset1);
1175  std::swap(SubRegIdx0, SubRegIdx1);
1176  }
1177 
1178  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1179  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1180 
1181  const MCInstrDesc &Read2Desc = TII->get(Opc);
1182 
1183  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1184  Register DestReg = MRI->createVirtualRegister(SuperRC);
1185 
1186  DebugLoc DL = CI.I->getDebugLoc();
1187 
1188  Register BaseReg = AddrReg->getReg();
1189  unsigned BaseSubReg = AddrReg->getSubReg();
1190  unsigned BaseRegFlags = 0;
1191  if (CI.BaseOff) {
1192  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1193  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1194  .addImm(CI.BaseOff);
1195 
1196  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1197  BaseRegFlags = RegState::Kill;
1198 
1199  TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1200  .addReg(ImmReg)
1201  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1202  .addImm(0); // clamp bit
1203  BaseSubReg = 0;
1204  }
1205 
1206  MachineInstrBuilder Read2 =
1207  BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1208  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1209  .addImm(NewOffset0) // offset0
1210  .addImm(NewOffset1) // offset1
1211  .addImm(0) // gds
1212  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1213 
1214  (void)Read2;
1215 
1216  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1217 
1218  // Copy to the old destination registers.
1219  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1220  .add(*Dest0) // Copy to same destination including flags and sub reg.
1221  .addReg(DestReg, 0, SubRegIdx0);
1222  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1223  .add(*Dest1)
1224  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1225 
1226  CI.I->eraseFromParent();
1227  Paired.I->eraseFromParent();
1228 
1229  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1230  return Read2;
1231 }
1232 
1233 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1234  if (STM->ldsRequiresM0Init())
1235  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1236  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1237  : AMDGPU::DS_WRITE2_B64_gfx9;
1238 }
1239 
1240 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1241  if (STM->ldsRequiresM0Init())
1242  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1243  : AMDGPU::DS_WRITE2ST64_B64;
1244 
1245  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1246  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1247 }
1248 
1249 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1250  CombineInfo &CI, CombineInfo &Paired,
1251  MachineBasicBlock::iterator InsertBefore) {
1252  MachineBasicBlock *MBB = CI.I->getParent();
1253 
1254  // Be sure to use .add() and not .addReg() with these. We want to be
1255  // sure we preserve the subregister index and any register flags set on them.
1256  const MachineOperand *AddrReg =
1257  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1258  const MachineOperand *Data0 =
1259  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1260  const MachineOperand *Data1 =
1261  TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1262 
1263  unsigned NewOffset0 = CI.Offset;
1264  unsigned NewOffset1 = Paired.Offset;
1265  unsigned Opc =
1266  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1267 
1268  if (NewOffset0 > NewOffset1) {
1269  // Canonicalize the merged instruction so the smaller offset comes first.
1270  std::swap(NewOffset0, NewOffset1);
1271  std::swap(Data0, Data1);
1272  }
1273 
1274  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1275  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1276 
1277  const MCInstrDesc &Write2Desc = TII->get(Opc);
1278  DebugLoc DL = CI.I->getDebugLoc();
1279 
1280  Register BaseReg = AddrReg->getReg();
1281  unsigned BaseSubReg = AddrReg->getSubReg();
1282  unsigned BaseRegFlags = 0;
1283  if (CI.BaseOff) {
1284  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1285  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1286  .addImm(CI.BaseOff);
1287 
1288  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1289  BaseRegFlags = RegState::Kill;
1290 
1291  TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1292  .addReg(ImmReg)
1293  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1294  .addImm(0); // clamp bit
1295  BaseSubReg = 0;
1296  }
1297 
1298  MachineInstrBuilder Write2 =
1299  BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1300  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1301  .add(*Data0) // data0
1302  .add(*Data1) // data1
1303  .addImm(NewOffset0) // offset0
1304  .addImm(NewOffset1) // offset1
1305  .addImm(0) // gds
1306  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1307 
1308  CI.I->eraseFromParent();
1309  Paired.I->eraseFromParent();
1310 
1311  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1312  return Write2;
1313 }
1314 
1315 MachineBasicBlock::iterator
1316 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1317  MachineBasicBlock::iterator InsertBefore) {
1318  MachineBasicBlock *MBB = CI.I->getParent();
1319  DebugLoc DL = CI.I->getDebugLoc();
1320  const unsigned Opcode = getNewOpcode(CI, Paired);
1321 
1322  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1323 
1324  Register DestReg = MRI->createVirtualRegister(SuperRC);
1325  unsigned MergedDMask = CI.DMask | Paired.DMask;
1326  unsigned DMaskIdx =
1327  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1328 
1329  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1330  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1331  if (I == DMaskIdx)
1332  MIB.addImm(MergedDMask);
1333  else
1334  MIB.add((*CI.I).getOperand(I));
1335  }
1336 
1337  // It shouldn't be possible to get this far if the two instructions
1338  // don't have a single memoperand, because MachineInstr::mayAlias()
1339  // will return true if this is the case.
1340  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1341 
1342  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1343 
1344  unsigned SubRegIdx0, SubRegIdx1;
1345  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1346 
1347  // Copy to the old destination registers.
1348  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1349  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1350  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1351 
1352  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1353  .add(*Dest0) // Copy to same destination including flags and sub reg.
1354  .addReg(DestReg, 0, SubRegIdx0);
1355  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1356  .add(*Dest1)
1357  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1358 
1359  CI.I->eraseFromParent();
1360  Paired.I->eraseFromParent();
1361  return New;
1362 }
1363 
1364 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1365  CombineInfo &CI, CombineInfo &Paired,
1366  MachineBasicBlock::iterator InsertBefore) {
1367  MachineBasicBlock *MBB = CI.I->getParent();
1368  DebugLoc DL = CI.I->getDebugLoc();
1369  const unsigned Opcode = getNewOpcode(CI, Paired);
1370 
1371  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1372 
1373  Register DestReg = MRI->createVirtualRegister(SuperRC);
1374  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1375 
1376  // It shouldn't be possible to get this far if the two instructions
1377  // don't have a single memoperand, because MachineInstr::mayAlias()
1378  // will return true if this is the case.
1379  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1380 
1381  MachineInstrBuilder New =
1382  BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1383  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1384  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1385  New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1386  // For convenience, when SGPR_IMM buffer loads are merged into a
1387  // zero-offset load, we generate its SGPR variant.
1388  if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset) != -1)
1389  New.addImm(MergedOffset);
1390  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1391 
1392  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1393  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1394  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1395 
1396  // Copy to the old destination registers.
1397  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1398  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1399  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1400 
1401  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1402  .add(*Dest0) // Copy to same destination including flags and sub reg.
1403  .addReg(DestReg, 0, SubRegIdx0);
1404  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1405  .add(*Dest1)
1406  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1407 
1408  CI.I->eraseFromParent();
1409  Paired.I->eraseFromParent();
1410  return New;
1411 }
1412 
1413 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1414  CombineInfo &CI, CombineInfo &Paired,
1415  MachineBasicBlock::iterator InsertBefore) {
1416  MachineBasicBlock *MBB = CI.I->getParent();
1417  DebugLoc DL = CI.I->getDebugLoc();
1418 
1419  const unsigned Opcode = getNewOpcode(CI, Paired);
1420 
1421  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1422 
1423  // Copy to the new source register.
1424  Register DestReg = MRI->createVirtualRegister(SuperRC);
1425  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1426 
1427  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1428 
1429  AddressRegs Regs = getRegs(Opcode, *TII);
1430 
1431  if (Regs.VAddr)
1432  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1433 
1434  // It shouldn't be possible to get this far if the two instructions
1435  // don't have a single memoperand, because MachineInstr::mayAlias()
1436  // will return true if this is the case.
1437  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1438 
1439  MachineInstr *New =
1440  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1441  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1442  .addImm(MergedOffset) // offset
1443  .addImm(CI.CPol) // cpol
1444  .addImm(0) // tfe
1445  .addImm(0) // swz
1446  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1447 
1448  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1449  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1450  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1451 
1452  // Copy to the old destination registers.
1453  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1454  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1455  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1456 
1457  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1458  .add(*Dest0) // Copy to same destination including flags and sub reg.
1459  .addReg(DestReg, 0, SubRegIdx0);
1460  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1461  .add(*Dest1)
1462  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1463 
1464  CI.I->eraseFromParent();
1465  Paired.I->eraseFromParent();
1466  return New;
1467 }
1468 
1469 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1470  CombineInfo &CI, CombineInfo &Paired,
1471  MachineBasicBlock::iterator InsertBefore) {
1472  MachineBasicBlock *MBB = CI.I->getParent();
1473  DebugLoc DL = CI.I->getDebugLoc();
1474 
1475  const unsigned Opcode = getNewOpcode(CI, Paired);
1476 
1477  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1478 
1479  // Copy to the new source register.
1480  Register DestReg = MRI->createVirtualRegister(SuperRC);
1481  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1482 
1483  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1484 
1485  AddressRegs Regs = getRegs(Opcode, *TII);
1486 
1487  if (Regs.VAddr)
1488  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1489 
1490  unsigned JoinedFormat =
1491  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1492 
1493  // It shouldn't be possible to get this far if the two instructions
1494  // don't have a single memoperand, because MachineInstr::mayAlias()
1495  // will return true if this is the case.
1496  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1497 
1498  MachineInstr *New =
1499  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1500  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1501  .addImm(MergedOffset) // offset
1502  .addImm(JoinedFormat) // format
1503  .addImm(CI.CPol) // cpol
1504  .addImm(0) // tfe
1505  .addImm(0) // swz
1506  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1507 
1508  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1509  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1510  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1511 
1512  // Copy to the old destination registers.
1513  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1514  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1515  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1516 
1517  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1518  .add(*Dest0) // Copy to same destination including flags and sub reg.
1519  .addReg(DestReg, 0, SubRegIdx0);
1520  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1521  .add(*Dest1)
1522  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1523 
1524  CI.I->eraseFromParent();
1525  Paired.I->eraseFromParent();
1526  return New;
1527 }
1528 
1529 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1530  CombineInfo &CI, CombineInfo &Paired,
1531  MachineBasicBlock::iterator InsertBefore) {
1532  MachineBasicBlock *MBB = CI.I->getParent();
1533  DebugLoc DL = CI.I->getDebugLoc();
1534 
1535  const unsigned Opcode = getNewOpcode(CI, Paired);
1536 
1537  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1538  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1539  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1540 
1541  // Copy to the new source register.
1542  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1543  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1544 
1545  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1546  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1547 
1548  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1549  .add(*Src0)
1550  .addImm(SubRegIdx0)
1551  .add(*Src1)
1552  .addImm(SubRegIdx1);
1553 
1554  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1555  .addReg(SrcReg, RegState::Kill);
1556 
1557  AddressRegs Regs = getRegs(Opcode, *TII);
1558 
1559  if (Regs.VAddr)
1560  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1561 
1562  unsigned JoinedFormat =
1563  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1564 
1565  // It shouldn't be possible to get this far if the two instructions
1566  // don't have a single memoperand, because MachineInstr::mayAlias()
1567  // will return true if this is the case.
1568  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1569 
1570  MachineInstr *New =
1571  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1572  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1573  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1574  .addImm(JoinedFormat) // format
1575  .addImm(CI.CPol) // cpol
1576  .addImm(0) // tfe
1577  .addImm(0) // swz
1578  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1579 
1580  CI.I->eraseFromParent();
1581  Paired.I->eraseFromParent();
1582  return New;
1583 }
1584 
1585 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1586  CombineInfo &CI, CombineInfo &Paired,
1587  MachineBasicBlock::iterator InsertBefore) {
1588  MachineBasicBlock *MBB = CI.I->getParent();
1589  DebugLoc DL = CI.I->getDebugLoc();
1590 
1591  const unsigned Opcode = getNewOpcode(CI, Paired);
1592 
1593  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1594  Register DestReg = MRI->createVirtualRegister(SuperRC);
1595 
1596  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1597 
1598  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1599  MIB.add(*SAddr);
1600 
1601  MachineInstr *New =
1602  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1603  .addImm(std::min(CI.Offset, Paired.Offset))
1604  .addImm(CI.CPol)
1605  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1606 
1607  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1608  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1609  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1610 
1611  // Copy to the old destination registers.
1612  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1613  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1614  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1615 
1616  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1617  .add(*Dest0) // Copy to same destination including flags and sub reg.
1618  .addReg(DestReg, 0, SubRegIdx0);
1619  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1620  .add(*Dest1)
1621  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1622 
1623  CI.I->eraseFromParent();
1624  Paired.I->eraseFromParent();
1625  return New;
1626 }
1627 
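// A minimal standalone sketch (not LLVM code) of what merging two adjacent
// 32-bit loads into one 64-bit load means: read once at the lower offset and
// split the wide result back into the two original destinations. This assumes
// sub0 maps to the lower-addressed dword.
#include <cstdint>
#include <cstring>

static void sketchMergedLoad(const uint8_t *Mem, uint32_t Off,
                             uint32_t &Dst0, uint32_t &Dst1) {
  uint64_t Wide;
  std::memcpy(&Wide, Mem + Off, sizeof(Wide)); // one DWORDX2-style access
  Dst0 = static_cast<uint32_t>(Wide);          // sub0 -> first original vdst
  Dst1 = static_cast<uint32_t>(Wide >> 32);    // sub1 -> second original vdst
}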
1628 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1629  CombineInfo &CI, CombineInfo &Paired,
1630  MachineBasicBlock::iterator InsertBefore) {
1631  MachineBasicBlock *MBB = CI.I->getParent();
1632  DebugLoc DL = CI.I->getDebugLoc();
1633 
1634  const unsigned Opcode = getNewOpcode(CI, Paired);
1635 
1636  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1637  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1638  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1639 
1640  // Copy to the new source register.
1641  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1642  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1643 
1644  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1645  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1646 
1647  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1648  .add(*Src0)
1649  .addImm(SubRegIdx0)
1650  .add(*Src1)
1651  .addImm(SubRegIdx1);
1652 
1653  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1654  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1655  .addReg(SrcReg, RegState::Kill);
1656 
1657  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1658  MIB.add(*SAddr);
1659 
1660  MachineInstr *New =
1661  MIB.addImm(std::min(CI.Offset, Paired.Offset))
1662  .addImm(CI.CPol)
1663  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1664 
1665  CI.I->eraseFromParent();
1666  Paired.I->eraseFromParent();
1667  return New;
1668 }
1669 
1670 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1671  const CombineInfo &Paired) {
1672  const unsigned Width = CI.Width + Paired.Width;
1673 
1674  switch (getCommonInstClass(CI, Paired)) {
1675  default:
1676  assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1677  // FIXME: Handle d16 correctly
1678  return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1679  Width);
1680  case TBUFFER_LOAD:
1681  case TBUFFER_STORE:
1682  return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1683  Width);
1684 
1685  case UNKNOWN:
1686  llvm_unreachable("Unknown instruction class");
1687  case S_BUFFER_LOAD_IMM:
1688  switch (Width) {
1689  default:
1690  return 0;
1691  case 2:
1692  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1693  case 4:
1694  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1695  case 8:
1696  return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1697  }
1698  case S_BUFFER_LOAD_SGPR_IMM:
1699  switch (Width) {
1700  default:
1701  return 0;
1702  case 2:
1703  return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
1704  : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1705  case 4:
1706  return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
1707  : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1708  case 8:
1709  return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
1710  : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1711  }
1712  case S_LOAD_IMM:
1713  switch (Width) {
1714  default:
1715  return 0;
1716  case 2:
1717  return AMDGPU::S_LOAD_DWORDX2_IMM;
1718  case 4:
1719  return AMDGPU::S_LOAD_DWORDX4_IMM;
1720  case 8:
1721  return AMDGPU::S_LOAD_DWORDX8_IMM;
1722  }
1723  case GLOBAL_LOAD:
1724  switch (Width) {
1725  default:
1726  return 0;
1727  case 2:
1728  return AMDGPU::GLOBAL_LOAD_DWORDX2;
1729  case 3:
1730  return AMDGPU::GLOBAL_LOAD_DWORDX3;
1731  case 4:
1732  return AMDGPU::GLOBAL_LOAD_DWORDX4;
1733  }
1734  case GLOBAL_LOAD_SADDR:
1735  switch (Width) {
1736  default:
1737  return 0;
1738  case 2:
1739  return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1740  case 3:
1741  return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1742  case 4:
1743  return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1744  }
1745  case GLOBAL_STORE:
1746  switch (Width) {
1747  default:
1748  return 0;
1749  case 2:
1750  return AMDGPU::GLOBAL_STORE_DWORDX2;
1751  case 3:
1752  return AMDGPU::GLOBAL_STORE_DWORDX3;
1753  case 4:
1754  return AMDGPU::GLOBAL_STORE_DWORDX4;
1755  }
1756  case GLOBAL_STORE_SADDR:
1757  switch (Width) {
1758  default:
1759  return 0;
1760  case 2:
1761  return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1762  case 3:
1763  return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1764  case 4:
1765  return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1766  }
1767  case FLAT_LOAD:
1768  switch (Width) {
1769  default:
1770  return 0;
1771  case 2:
1772  return AMDGPU::FLAT_LOAD_DWORDX2;
1773  case 3:
1774  return AMDGPU::FLAT_LOAD_DWORDX3;
1775  case 4:
1776  return AMDGPU::FLAT_LOAD_DWORDX4;
1777  }
1778  case FLAT_STORE:
1779  switch (Width) {
1780  default:
1781  return 0;
1782  case 2:
1783  return AMDGPU::FLAT_STORE_DWORDX2;
1784  case 3:
1785  return AMDGPU::FLAT_STORE_DWORDX3;
1786  case 4:
1787  return AMDGPU::FLAT_STORE_DWORDX4;
1788  }
1789  case MIMG:
1790  assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1791  "No overlaps");
1792  return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1793  }
1794 }
1795 
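// Illustration only: a stand-in for the FLAT_LOAD arm of the switch above.
// Merging an x1 with an x1 gives Width == 2 (DWORDX2); x2 + x1 gives Width == 3
// (DWORDX3, which the FLAT/GLOBAL cases support but the SMEM cases do not);
// any other merged width yields 0 and the candidate pair is rejected.
static const char *sketchFlatLoadName(unsigned MergedWidth) {
  switch (MergedWidth) {
  case 2: return "FLAT_LOAD_DWORDX2";
  case 3: return "FLAT_LOAD_DWORDX3";
  case 4: return "FLAT_LOAD_DWORDX4";
  default: return nullptr; // no single instruction covers this width
  }
}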
1796 std::pair<unsigned, unsigned>
1797 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1798  const CombineInfo &Paired) {
1799  assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1800  CI.Width + Paired.Width)) &&
1801  "No overlaps");
1802 
1803  unsigned Idx0;
1804  unsigned Idx1;
1805 
1806  static const unsigned Idxs[5][4] = {
1807  {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1808  {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1809  {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1810  {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1811  {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1812  };
1813 
1814  assert(CI.Width >= 1 && CI.Width <= 4);
1815  assert(Paired.Width >= 1 && Paired.Width <= 4);
1816 
1817  if (Paired < CI) {
1818  Idx1 = Idxs[0][Paired.Width - 1];
1819  Idx0 = Idxs[Paired.Width][CI.Width - 1];
1820  } else {
1821  Idx0 = Idxs[0][CI.Width - 1];
1822  Idx1 = Idxs[CI.Width][Paired.Width - 1];
1823  }
1824 
1825  return std::make_pair(Idx0, Idx1);
1826 }
1827 
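// A self-contained rehearsal of the Idxs lookup above, using names instead of
// the AMDGPU::sub* enum values (sketch, not LLVM code). The earlier value
// occupies dwords [0, W0) and the later one dwords [W0, W0 + W1), which is
// exactly what Idxs[0][W0 - 1] and Idxs[W0][W1 - 1] encode.
#include <string>
#include <utility>

static std::pair<std::string, std::string> sketchSubRegIdxs(unsigned W0,
                                                            unsigned W1) {
  auto Range = [](unsigned First, unsigned Len) {
    std::string S = "sub" + std::to_string(First);
    for (unsigned I = 1; I < Len; ++I)
      S += "_sub" + std::to_string(First + I);
    return S;
  };
  return {Range(0, W0), Range(W0, W1)};
}
// sketchSubRegIdxs(1, 2) == {"sub0", "sub1_sub2"}; when the operands appear in
// the other program order, the roles of Idx0 and Idx1 are swapped, as in the
// (Paired < CI) branch above.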
1828 const TargetRegisterClass *
1829 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1830  const CombineInfo &Paired) {
1831  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1832  CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1833  switch (CI.Width + Paired.Width) {
1834  default:
1835  return nullptr;
1836  case 2:
1837  return &AMDGPU::SReg_64_XEXECRegClass;
1838  case 4:
1839  return &AMDGPU::SGPR_128RegClass;
1840  case 8:
1841  return &AMDGPU::SGPR_256RegClass;
1842  case 16:
1843  return &AMDGPU::SGPR_512RegClass;
1844  }
1845  }
1846 
1847  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1848  return TRI->isAGPRClass(getDataRegClass(*CI.I))
1849  ? TRI->getAGPRClassForBitWidth(BitWidth)
1850  : TRI->getVGPRClassForBitWidth(BitWidth);
1851 }
1852 
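// Illustrative arithmetic only: each merged dword contributes 32 bits, so a
// 2-dword plus 1-dword merge asks for a 96-bit VGPR (or AGPR) class via
// get{V,A}GPRClassForBitWidth.
static unsigned sketchMergedBitWidth(unsigned Width0, unsigned Width1) {
  return 32 * (Width0 + Width1);
}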
1853 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1854  CombineInfo &CI, CombineInfo &Paired,
1855  MachineBasicBlock::iterator InsertBefore) {
1856  MachineBasicBlock *MBB = CI.I->getParent();
1857  DebugLoc DL = CI.I->getDebugLoc();
1858 
1859  const unsigned Opcode = getNewOpcode(CI, Paired);
1860 
1861  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1862  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1863  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1864 
1865  // Copy to the new source register.
1866  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1867  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1868 
1869  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1870  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1871 
1872  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1873  .add(*Src0)
1874  .addImm(SubRegIdx0)
1875  .add(*Src1)
1876  .addImm(SubRegIdx1);
1877 
1878  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1879  .addReg(SrcReg, RegState::Kill);
1880 
1881  AddressRegs Regs = getRegs(Opcode, *TII);
1882 
1883  if (Regs.VAddr)
1884  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1885 
1886 
1887  // It shouldn't be possible to get this far if the two instructions
1888  // don't have a single memoperand, because MachineInstr::mayAlias()
1889  // will return true if this is the case.
1890  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1891 
1892  MachineInstr *New =
1893  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1894  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1895  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1896  .addImm(CI.CPol) // cpol
1897  .addImm(0) // tfe
1898  .addImm(0) // swz
1899  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1900 
1901  CI.I->eraseFromParent();
1902  Paired.I->eraseFromParent();
1903  return New;
1904 }
1905 
1906 MachineOperand
1907 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1908  APInt V(32, Val, true);
1909  if (TII->isInlineConstant(V))
1910  return MachineOperand::CreateImm(Val);
1911 
1912  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1913  MachineInstr *Mov =
1914  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1915  TII->get(AMDGPU::S_MOV_B32), Reg)
1916  .addImm(Val);
1917  (void)Mov;
1918  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1919  return MachineOperand::CreateReg(Reg, false);
1920 }
1921 
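// Hedged approximation of the decision above (the real test is
// SIInstrInfo::isInlineConstant, which also accepts selected floating-point
// bit patterns): small integers can be encoded directly as an inline
// immediate, everything else is first materialized into an SGPR via S_MOV_B32.
#include <cstdint>

static bool sketchIsInlineImm(int32_t Val) {
  return Val >= -16 && Val <= 64; // assumed integer inline-constant range
}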
1922 // Compute base address using Addr and return the final register.
1923 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1924  const MemAddress &Addr) const {
1925  MachineBasicBlock *MBB = MI.getParent();
1926  MachineBasicBlock::iterator MBBI = MI.getIterator();
1927  DebugLoc DL = MI.getDebugLoc();
1928 
1929  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1930  Addr.Base.LoSubReg) &&
1931  "Expected 32-bit Base-Register-Low!!");
1932 
1933  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1934  Addr.Base.HiSubReg) &&
1935  "Expected 32-bit Base-Register-Hi!!");
1936 
1937  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1938  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1939  MachineOperand OffsetHi =
1940  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1941 
1942  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1943  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1944  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1945 
1946  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1947  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1948  MachineInstr *LoHalf =
1949  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1950  .addReg(CarryReg, RegState::Define)
1951  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1952  .add(OffsetLo)
1953  .addImm(0); // clamp bit
1954  (void)LoHalf;
1955  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1956 
1957  MachineInstr *HiHalf =
1958  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1959  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1960  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1961  .add(OffsetHi)
1962  .addReg(CarryReg, RegState::Kill)
1963  .addImm(0); // clamp bit
1964  (void)HiHalf;
1965  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1966 
1967  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1968  MachineInstr *FullBase =
1969  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1970  .addReg(DestSub0)
1971  .addImm(AMDGPU::sub0)
1972  .addReg(DestSub1)
1973  .addImm(AMDGPU::sub1);
1974  (void)FullBase;
1975  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1976 
1977  return FullDestReg;
1978 }
1979 
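// The V_ADD_CO_U32 / V_ADDC_U32 pair built above is a 64-bit add performed in
// 32-bit halves. Equivalent standalone arithmetic (sketch, not LLVM code):
#include <cstdint>

static uint64_t sketchAdd64ViaHalves(uint32_t BaseLo, uint32_t BaseHi,
                                     uint64_t Offset) {
  uint32_t OffLo = static_cast<uint32_t>(Offset);
  uint32_t OffHi = static_cast<uint32_t>(Offset >> 32);
  uint32_t Lo = BaseLo + OffLo;                  // V_ADD_CO_U32 (DestSub0)
  uint32_t Carry = Lo < OffLo ? 1u : 0u;         // carry-out into CarryReg
  uint32_t Hi = BaseHi + OffHi + Carry;          // V_ADDC_U32 (DestSub1)
  return (static_cast<uint64_t>(Hi) << 32) | Lo; // REG_SEQUENCE sub0/sub1
}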
1980 // Update base and offset with the NewBase and NewOffset in MI.
1981 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1982  Register NewBase,
1983  int32_t NewOffset) const {
1984  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1985  Base->setReg(NewBase);
1986  Base->setIsKill(false);
1987  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1988 }
1989 
1990 Optional<int32_t>
1991 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1992  if (Op.isImm())
1993  return Op.getImm();
1994 
1995  if (!Op.isReg())
1996  return None;
1997 
1998  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1999  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2000  !Def->getOperand(1).isImm())
2001  return None;
2002 
2003  return Def->getOperand(1).getImm();
2004 }
2005 
2006 // Analyze Base and extract:
2007 // - 32-bit base registers and subregisters
2008 // - the 64-bit constant offset
2009 // Expecting base computation as:
2010 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
2011 // %LO:vgpr_32, %c:sreg_64_xexec =
2012 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2013 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2014 // %Base:vreg_64 =
2015 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2016 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2017  MemAddress &Addr) const {
2018  if (!Base.isReg())
2019  return;
2020 
2021  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2022  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2023  || Def->getNumOperands() != 5)
2024  return;
2025 
2026  MachineOperand BaseLo = Def->getOperand(1);
2027  MachineOperand BaseHi = Def->getOperand(3);
2028  if (!BaseLo.isReg() || !BaseHi.isReg())
2029  return;
2030 
2031  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2032  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2033 
2034  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2035  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2036  return;
2037 
2038  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2039  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2040 
2041  auto Offset0P = extractConstOffset(*Src0);
2042  if (Offset0P)
2043  BaseLo = *Src1;
2044  else {
2045  if (!(Offset0P = extractConstOffset(*Src1)))
2046  return;
2047  BaseLo = *Src0;
2048  }
2049 
2050  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2051  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2052 
2053  if (Src0->isImm())
2054  std::swap(Src0, Src1);
2055 
2056  if (!Src1->isImm())
2057  return;
2058 
2059  uint64_t Offset1 = Src1->getImm();
2060  BaseHi = *Src0;
2061 
2062  Addr.Base.LoReg = BaseLo.getReg();
2063  Addr.Base.HiReg = BaseHi.getReg();
2064  Addr.Base.LoSubReg = BaseLo.getSubReg();
2065  Addr.Base.HiSubReg = BaseHi.getSubReg();
2066  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2067 }
2068 
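// The final assignment above packs the two 32-bit constants into the 64-bit
// Addr.Offset (the mask guards against sign extension of the low part). The
// same arithmetic in isolation (sketch):
#include <cstdint>

static uint64_t sketchPackOffset(uint32_t OffsetLo, uint32_t OffsetHi) {
  return static_cast<uint64_t>(OffsetLo) |
         (static_cast<uint64_t>(OffsetHi) << 32);
}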
2069 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2070  MachineInstr &MI,
2071  MemInfoMap &Visited,
2072  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2073 
2074  if (!(MI.mayLoad() ^ MI.mayStore()))
2075  return false;
2076 
2077  // TODO: Support flat and scratch.
2078  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2079  return false;
2080 
2081  if (MI.mayLoad() &&
2082  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2083  return false;
2084 
2085  if (AnchorList.count(&MI))
2086  return false;
2087 
2088  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2089 
2090  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2091  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2092  return false;
2093  }
2094 
2095  // Step 1: Find the base registers and the 64-bit constant offset.
2096  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2097  MemAddress MAddr;
2098  if (Visited.find(&MI) == Visited.end()) {
2099  processBaseWithConstOffset(Base, MAddr);
2100  Visited[&MI] = MAddr;
2101  } else
2102  MAddr = Visited[&MI];
2103 
2104  if (MAddr.Offset == 0) {
2105  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2106  " constant offsets that can be promoted.\n";);
2107  return false;
2108  }
2109 
2110  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2111  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2112 
2113  // Step 2: Traverse MI's basic block and find an anchor (an instruction with
2114  // the same base registers) at the largest legal 13-bit distance from MI's
2115  // offset. E.g. (64-bit loads):
2116  // bb:
2117  // addr1 = &a + 4096; load1 = load(addr1, 0)
2118  // addr2 = &a + 6144; load2 = load(addr2, 0)
2119  // addr3 = &a + 8192; load3 = load(addr3, 0)
2120  // addr4 = &a + 10240; load4 = load(addr4, 0)
2121  // addr5 = &a + 12288; load5 = load(addr5, 0)
2122  //
2123  // Starting from the first load, the optimization will try to find a new base
2124  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2125  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2126  // &a + 8192 as the new base (anchor) because the larger distance can
2127  // presumably accommodate more intermediate bases.
2128  //
2129  // Step 3: Move (&a + 8192) above load1. Compute and promote offsets from
2130  // (&a + 8192) for load1, load2, load4.
2131  // addr = &a + 8192
2132  // load1 = load(addr, -4096)
2133  // load2 = load(addr, -2048)
2134  // load3 = load(addr, 0)
2135  // load4 = load(addr, 2048)
2136  // addr5 = &a + 12288; load5 = load(addr5, 0)
2137  //
2138  MachineInstr *AnchorInst = nullptr;
2139  MemAddress AnchorAddr;
2140  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2141  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2142 
2143  MachineBasicBlock *MBB = MI.getParent();
2144  MachineBasicBlock::iterator E = MBB->end();
2145  MachineBasicBlock::iterator MBBI = MI.getIterator();
2146  ++MBBI;
2147  const SITargetLowering *TLI =
2148  static_cast<const SITargetLowering *>(STM->getTargetLowering());
2149 
2150  for ( ; MBBI != E; ++MBBI) {
2151  MachineInstr &MINext = *MBBI;
2153  // TODO: Support finding an anchor (with the same base) from store addresses
2154  // or from other load addresses where the opcodes differ.
2154  if (MINext.getOpcode() != MI.getOpcode() ||
2155  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2156  continue;
2157 
2158  const MachineOperand &BaseNext =
2159  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2160  MemAddress MAddrNext;
2161  if (Visited.find(&MINext) == Visited.end()) {
2162  processBaseWithConstOffset(BaseNext, MAddrNext);
2163  Visited[&MINext] = MAddrNext;
2164  } else
2165  MAddrNext = Visited[&MINext];
2166 
2167  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2168  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2169  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2170  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2171  continue;
2172 
2173  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
2174 
2175  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2176  TargetLoweringBase::AddrMode AM;
2177  AM.HasBaseReg = true;
2178  AM.BaseOffs = Dist;
2179  if (TLI->isLegalGlobalAddressingMode(AM) &&
2180  (uint32_t)std::abs(Dist) > MaxDist) {
2181  MaxDist = std::abs(Dist);
2182 
2183  AnchorAddr = MAddrNext;
2184  AnchorInst = &MINext;
2185  }
2186  }
2187 
2188  if (AnchorInst) {
2189  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2190  AnchorInst->dump());
2191  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2192  << AnchorAddr.Offset << "\n\n");
2193 
2194  // Instead of moving up, just re-compute anchor-instruction's base address.
2195  Register Base = computeBase(MI, AnchorAddr);
2196 
2197  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2198  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2199 
2200  for (auto P : InstsWCommonBase) {
2201  TargetLoweringBase::AddrMode AM;
2202  AM.HasBaseReg = true;
2203  AM.BaseOffs = P.second - AnchorAddr.Offset;
2204 
2205  if (TLI->isLegalGlobalAddressingMode(AM)) {
2206  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2207  dbgs() << ")"; P.first->dump());
2208  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2209  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2210  }
2211  }
2212  AnchorList.insert(AnchorInst);
2213  return true;
2214  }
2215 
2216  return false;
2217 }
2218 
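// A self-contained rehearsal of the anchor heuristic documented inside
// promoteConstantOffsetToImm above. The 13-bit signed window stands in for
// TLI->isLegalGlobalAddressingMode and is an assumption of this sketch.
#include <cstdint>
#include <vector>

static int64_t sketchPickAnchor(int64_t MIOffset,
                                const std::vector<int64_t> &OtherOffsets) {
  auto Legal = [](int64_t D) { return D >= -4096 && D <= 4095; };
  uint64_t MaxDist = 0;
  int64_t Anchor = MIOffset; // i.e. no better anchor found
  for (int64_t Off : OtherOffsets) {
    int64_t Dist = MIOffset - Off;
    uint64_t AbsDist = Dist < 0 ? -(uint64_t)Dist : (uint64_t)Dist;
    if (Legal(Dist) && AbsDist > MaxDist) {
      MaxDist = AbsDist;
      Anchor = Off;
    }
  }
  return Anchor;
}
// With MIOffset = 4096 and candidates {6144, 8192, 10240, 12288}, the legal
// distances are -2048 (to 6144) and -4096 (to 8192); the larger one wins, so
// 8192 becomes the anchor, matching the Step 2 example above.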
2219 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2220  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2221  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2222  if (AddrList.front().InstClass == CI.InstClass &&
2223  AddrList.front().IsAGPR == CI.IsAGPR &&
2224  AddrList.front().hasSameBaseAddress(CI)) {
2225  AddrList.emplace_back(CI);
2226  return;
2227  }
2228  }
2229 
2230  // Base address not found, so add a new list.
2231  MergeableInsts.emplace_back(1, CI);
2232 }
2233 
2234 std::pair<MachineBasicBlock::iterator, bool>
2235 SILoadStoreOptimizer::collectMergeableInsts(
2236  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2237  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2238  std::list<std::list<CombineInfo>> &MergeableInsts) const {
2239  bool Modified = false;
2240 
2241  // Sort potentially mergeable instructions into lists, one list per base address.
2242  unsigned Order = 0;
2243  MachineBasicBlock::iterator BlockI = Begin;
2244  for (; BlockI != End; ++BlockI) {
2245  MachineInstr &MI = *BlockI;
2246 
2247  // We run this before checking if an address is mergeable, because it can produce
2248  // better code even if the instructions aren't mergeable.
2249  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2250  Modified = true;
2251 
2252  // Treat volatile accesses, ordered accesses and unmodeled side effects as
2253  // barriers. We can still look for separate merges after such a barrier.
2254  if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2255  LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2256 
2257  // Search will resume after this instruction in a separate merge list.
2258  ++BlockI;
2259  break;
2260  }
2261 
2262  const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2263  if (InstClass == UNKNOWN)
2264  continue;
2265 
2266  // Do not merge VMEM buffer instructions with "swizzled" bit set.
2267  int Swizzled =
2268  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2269  if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2270  continue;
2271 
2272  CombineInfo CI;
2273  CI.setMI(MI, *this);
2274  CI.Order = Order++;
2275 
2276  if (!CI.hasMergeableAddress(*MRI))
2277  continue;
2278 
2279  if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2280  // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2281  // operands. However we are reporting that ds_write2 shall have
2282  // only VGPR data so that machine copy propagation does not
2283  // create an illegal instruction with VGPR and AGPR sources.
2284  // Consequently, if we create such an instruction the verifier
2285  // will complain.
2286  continue;
2287  }
2288 
2289  LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2290 
2291  addInstToMergeableList(CI, MergeableInsts);
2292  }
2293 
2294  // At this point we have lists of Mergeable instructions.
2295  //
2296  // Part 2: Sort lists by offset and then for each CombineInfo object in the
2297  // list try to find an instruction that can be merged with I. If an instruction
2298  // is found, it is stored in the Paired field. If no instructions are found, then
2299  // the CombineInfo object is deleted from the list.
2300 
2301  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2302  E = MergeableInsts.end(); I != E;) {
2303 
2304  std::list<CombineInfo> &MergeList = *I;
2305  if (MergeList.size() <= 1) {
2306  // This means we have found only one instruction with a given address
2307  // that can be merged, and we need at least 2 instructions to do a merge,
2308  // so this list can be discarded.
2309  I = MergeableInsts.erase(I);
2310  continue;
2311  }
2312 
2313  // Sort the lists by offsets, this way mergeable instructions will be
2314  // adjacent to each other in the list, which will make it easier to find
2315  // matches.
2316  MergeList.sort(
2317  [] (const CombineInfo &A, const CombineInfo &B) {
2318  return A.Offset < B.Offset;
2319  });
2320  ++I;
2321  }
2322 
2323  return std::make_pair(BlockI, Modified);
2324 }
2325 
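// Minimal illustration of "Part 2" above (sketch, with plain offsets standing
// in for CombineInfo entries): singleton lists are dropped, the rest are
// sorted by offset so that merge candidates end up adjacent.
#include <list>

static void sketchPruneAndSort(std::list<std::list<int>> &OffsetLists) {
  for (auto I = OffsetLists.begin(), E = OffsetLists.end(); I != E;) {
    if (I->size() <= 1) {
      I = OffsetLists.erase(I); // nothing to pair with
      continue;
    }
    I->sort(); // mirrors MergeList.sort(... A.Offset < B.Offset ...)
    ++I;
  }
}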
2326 // Scan through looking for adjacent LDS operations with constant offsets from
2327 // the same base register. We rely on the scheduler to do the hard work of
2328 // clustering nearby loads, and assume these are all adjacent.
2329 bool SILoadStoreOptimizer::optimizeBlock(
2330  std::list<std::list<CombineInfo> > &MergeableInsts) {
2331  bool Modified = false;
2332 
2333  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2334  E = MergeableInsts.end(); I != E;) {
2335  std::list<CombineInfo> &MergeList = *I;
2336 
2337  bool OptimizeListAgain = false;
2338  if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2339  // We weren't able to make any changes, so delete the list so we don't
2340  // process the same instructions the next time we try to optimize this
2341  // block.
2342  I = MergeableInsts.erase(I);
2343  continue;
2344  }
2345 
2346  Modified = true;
2347 
2348  // We made changes, but also determined that there were no more optimization
2349  // opportunities, so we don't need to reprocess the list.
2350  if (!OptimizeListAgain) {
2351  I = MergeableInsts.erase(I);
2352  continue;
2353  }
2354  OptimizeAgain = true;
2355  }
2356  return Modified;
2357 }
2358 
2359 bool
2360 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2361  std::list<CombineInfo> &MergeList,
2362  bool &OptimizeListAgain) {
2363  if (MergeList.empty())
2364  return false;
2365 
2366  bool Modified = false;
2367 
2368  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2369  Next = std::next(I)) {
2370 
2371  auto First = I;
2372  auto Second = Next;
2373 
2374  if ((*First).Order > (*Second).Order)
2375  std::swap(First, Second);
2376  CombineInfo &CI = *First;
2377  CombineInfo &Paired = *Second;
2378 
2379  CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2380  if (!Where) {
2381  ++I;
2382  continue;
2383  }
2384 
2385  Modified = true;
2386 
2387  LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2388 
2389  MachineBasicBlock::iterator NewMI;
2390  switch (CI.InstClass) {
2391  default:
2392  llvm_unreachable("unknown InstClass");
2393  break;
2394  case DS_READ:
2395  NewMI = mergeRead2Pair(CI, Paired, Where->I);
2396  break;
2397  case DS_WRITE:
2398  NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2399  break;
2400  case S_BUFFER_LOAD_IMM:
2401  case S_BUFFER_LOAD_SGPR_IMM:
2402  case S_LOAD_IMM:
2403  NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2404  OptimizeListAgain |= CI.Width + Paired.Width < 8;
2405  break;
2406  case BUFFER_LOAD:
2407  NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2408  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2409  break;
2410  case BUFFER_STORE:
2411  NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2412  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2413  break;
2414  case MIMG:
2415  NewMI = mergeImagePair(CI, Paired, Where->I);
2416  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2417  break;
2418  case TBUFFER_LOAD:
2419  NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2420  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2421  break;
2422  case TBUFFER_STORE:
2423  NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2424  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2425  break;
2426  case FLAT_LOAD:
2427  case GLOBAL_LOAD:
2428  case GLOBAL_LOAD_SADDR:
2429  NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2430  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2431  break;
2432  case FLAT_STORE:
2433  case GLOBAL_STORE:
2434  case GLOBAL_STORE_SADDR:
2435  NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2436  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2437  break;
2438  }
2439  CI.setMI(NewMI, *this);
2440  CI.Order = Where->Order;
2441  if (I == Second)
2442  I = Next;
2443 
2444  MergeList.erase(Second);
2445  }
2446 
2447  return Modified;
2448 }
2449 
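// A stripped-down model of the pairing walk above (sketch): inspect (I, Next),
// merge when a stand-in predicate allows it, keep the merged value, and erase
// the other element; otherwise advance. The real loop additionally orders the
// pair by program order and re-queues wider results via OptimizeListAgain.
#include <iterator>
#include <list>

static bool sketchPairwiseMerge(std::list<int> &Vals) {
  if (Vals.size() < 2)
    return false;
  bool Modified = false;
  for (auto I = Vals.begin(), Next = std::next(I); Next != Vals.end();
       Next = std::next(I)) {
    if ((*I & 1) == (*Next & 1)) { // stand-in for checkAndPrepareMerge
      *I += *Next;                 // stand-in for the merge* call
      Vals.erase(Next);
      Modified = true;
    } else {
      ++I;
    }
  }
  return Modified;
}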
2450 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2451  if (skipFunction(MF.getFunction()))
2452  return false;
2453 
2454  STM = &MF.getSubtarget<GCNSubtarget>();
2455  if (!STM->loadStoreOptEnabled())
2456  return false;
2457 
2458  TII = STM->getInstrInfo();
2459  TRI = &TII->getRegisterInfo();
2460 
2461  MRI = &MF.getRegInfo();
2462  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2463 
2464  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2465 
2466  bool Modified = false;
2467 
2468  // Contains the list of instructions for which constant offsets are being
2469  // promoted to the IMM. This is tracked for an entire block at a time.
2470  SmallPtrSet<MachineInstr *, 4> AnchorList;
2471  MemInfoMap Visited;
2472 
2473  for (MachineBasicBlock &MBB : MF) {
2474  MachineBasicBlock::iterator SectionEnd;
2475  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2476  I = SectionEnd) {
2477  bool CollectModified;
2478  std::list<std::list<CombineInfo>> MergeableInsts;
2479 
2480  // First pass: Collect list of all instructions we know how to merge in a
2481  // subset of the block.
2482  std::tie(SectionEnd, CollectModified) =
2483  collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2484 
2485  Modified |= CollectModified;
2486 
2487  do {
2488  OptimizeAgain = false;
2489  Modified |= optimizeBlock(MergeableInsts);
2490  } while (OptimizeAgain);
2491  }
2492 
2493  Visited.clear();
2494  AnchorList.clear();
2495  }
2496 
2497  return Modified;
2498 }