1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
25 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads has offsets that are too large to fit in the 8-bit
55 // offset fields, but close enough together that their differences do fit,
56 // we can add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
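// Note (illustrative, not part of the upstream comment): the read2/write2
// offsets shown above are encoded in element units rather than bytes, so with
// 4-byte elements the byte offsets 16 and 32 become offset0:4 and offset1:8
// (16/4 and 32/4), and each encoded offset must fit in an 8-bit field.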
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73  UNKNOWN,
74  DS_READ,
75  DS_WRITE,
76  S_BUFFER_LOAD_IMM,
77  BUFFER_LOAD,
78  BUFFER_STORE,
79  MIMG,
80  TBUFFER_LOAD,
81  TBUFFER_STORE,
82  GLOBAL_LOAD_SADDR,
83  GLOBAL_STORE_SADDR,
84  FLAT_LOAD,
85  FLAT_STORE,
86  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
87  GLOBAL_STORE // any CombineInfo, they are only ever returned by
88  // getCommonInstClass.
89 };
90 
91 struct AddressRegs {
92  unsigned char NumVAddrs = 0;
93  bool SBase = false;
94  bool SRsrc = false;
95  bool SOffset = false;
96  bool SAddr = false;
97  bool VAddr = false;
98  bool Addr = false;
99  bool SSamp = false;
100 };
101 
102 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
103 const unsigned MaxAddressRegs = 12 + 1 + 1;
104 
105 class SILoadStoreOptimizer : public MachineFunctionPass {
106  struct CombineInfo {
107  MachineBasicBlock::iterator I;
108  unsigned EltSize;
109  unsigned Offset;
110  unsigned Width;
111  unsigned Format;
112  unsigned BaseOff;
113  unsigned DMask;
114  InstClassEnum InstClass;
115  unsigned CPol = 0;
116  bool IsAGPR;
117  bool UseST64;
118  int AddrIdx[MaxAddressRegs];
119  const MachineOperand *AddrReg[MaxAddressRegs];
120  unsigned NumAddresses;
121  unsigned Order;
122 
123  bool hasSameBaseAddress(const MachineInstr &MI) {
124  for (unsigned i = 0; i < NumAddresses; i++) {
125  const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
126 
127  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
128  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
129  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
130  return false;
131  }
132  continue;
133  }
134 
135  // Check same base pointer. Be careful of subregisters, which can occur
136  // with vectors of pointers.
137  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
138  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
139  return false;
140  }
141  }
142  return true;
143  }
144 
145  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
146  for (unsigned i = 0; i < NumAddresses; ++i) {
147  const MachineOperand *AddrOp = AddrReg[i];
148  // Immediates are always OK.
149  if (AddrOp->isImm())
150  continue;
151 
152  // Don't try to merge addresses that aren't either immediates or registers.
153  // TODO: Should be possible to merge FrameIndexes and maybe some other
154  // non-register
155  if (!AddrOp->isReg())
156  return false;
157 
158  // TODO: We should be able to merge physical reg addresses.
159  if (AddrOp->getReg().isPhysical())
160  return false;
161 
162  // If an address has only one use then there will be no other
163  // instructions with the same address, so we can't merge this one.
164  if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
165  return false;
166  }
167  return true;
168  }
169 
170  void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
171 
172  // Compare by pointer order.
173  bool operator<(const CombineInfo& Other) const {
174  return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
175  }
176  };
177 
178  struct BaseRegisters {
179  Register LoReg;
180  Register HiReg;
181 
182  unsigned LoSubReg = 0;
183  unsigned HiSubReg = 0;
184  };
185 
186  struct MemAddress {
187  BaseRegisters Base;
188  int64_t Offset = 0;
189  };
190 
191  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
192 
193 private:
194  const GCNSubtarget *STM = nullptr;
195  const SIInstrInfo *TII = nullptr;
196  const SIRegisterInfo *TRI = nullptr;
197  MachineRegisterInfo *MRI = nullptr;
198  AliasAnalysis *AA = nullptr;
199  bool OptimizeAgain;
200 
201  bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
202  const DenseSet<Register> &ARegUses,
203  const MachineInstr &A, const MachineInstr &B) const;
204  static bool dmasksCanBeCombined(const CombineInfo &CI,
205  const SIInstrInfo &TII,
206  const CombineInfo &Paired);
207  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
208  CombineInfo &Paired, bool Modify = false);
209  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
210  const CombineInfo &Paired);
211  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
212  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
213  const CombineInfo &Paired);
214  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
215  const CombineInfo &Paired);
216  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
217 
218  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
219 
220  unsigned read2Opcode(unsigned EltSize) const;
221  unsigned read2ST64Opcode(unsigned EltSize) const;
222  MachineBasicBlock::iterator
223  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
224  MachineBasicBlock::iterator InsertBefore);
225 
226  unsigned write2Opcode(unsigned EltSize) const;
227  unsigned write2ST64Opcode(unsigned EltSize) const;
228  MachineBasicBlock::iterator
229  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
230  MachineBasicBlock::iterator InsertBefore);
231  MachineBasicBlock::iterator
232  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
233  MachineBasicBlock::iterator InsertBefore);
234  MachineBasicBlock::iterator
235  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
236  MachineBasicBlock::iterator InsertBefore);
237  MachineBasicBlock::iterator
238  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
239  MachineBasicBlock::iterator InsertBefore);
240  MachineBasicBlock::iterator
241  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
242  MachineBasicBlock::iterator InsertBefore);
243  MachineBasicBlock::iterator
244  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
245  MachineBasicBlock::iterator InsertBefore);
246  MachineBasicBlock::iterator
247  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
248  MachineBasicBlock::iterator InsertBefore);
249  MachineBasicBlock::iterator
250  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
251  MachineBasicBlock::iterator InsertBefore);
252  MachineBasicBlock::iterator
253  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
254  MachineBasicBlock::iterator InsertBefore);
255 
256  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
257  int32_t NewOffset) const;
258  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
259  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
260  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
261  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
262  /// Promotes constant offset to the immediate by adjusting the base. It
263  /// tries to use a base from the nearby instructions that allows it to have
264  /// a 13-bit constant offset which gets promoted to the immediate.
265  bool promoteConstantOffsetToImm(MachineInstr &CI,
266  MemInfoMap &Visited,
267  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
268  void addInstToMergeableList(const CombineInfo &CI,
269  std::list<std::list<CombineInfo> > &MergeableInsts) const;
270 
271  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
272  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
273  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
274  std::list<std::list<CombineInfo>> &MergeableInsts) const;
275 
276  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
277  const CombineInfo &Paired);
278 
279  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
280  const CombineInfo &Paired);
281 
282 public:
283  static char ID;
284 
285  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
286  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
287  }
288 
289  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
290  bool &OptimizeListAgain);
291  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
292 
293  bool runOnMachineFunction(MachineFunction &MF) override;
294 
295  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
296 
297  void getAnalysisUsage(AnalysisUsage &AU) const override {
298  AU.setPreservesCFG();
299  AU.addRequired<AAResultsWrapperPass>();
300 
301  MachineFunctionPass::getAnalysisUsage(AU);
302  }
303 
304  MachineFunctionProperties getRequiredProperties() const override {
305  return MachineFunctionProperties()
306  .set(MachineFunctionProperties::Property::IsSSA);
307  }
308 };
309 
310 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
311  const unsigned Opc = MI.getOpcode();
312 
313  if (TII.isMUBUF(Opc)) {
314  // FIXME: Handle d16 correctly
315  return AMDGPU::getMUBUFElements(Opc);
316  }
317  if (TII.isMIMG(MI)) {
318  uint64_t DMaskImm =
319  TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
320  return countPopulation(DMaskImm);
321  }
322  if (TII.isMTBUF(Opc)) {
323  return AMDGPU::getMTBUFElements(Opc);
324  }
325 
326  switch (Opc) {
327  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
328  case AMDGPU::GLOBAL_LOAD_DWORD:
329  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
330  case AMDGPU::GLOBAL_STORE_DWORD:
331  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
332  case AMDGPU::FLAT_LOAD_DWORD:
333  case AMDGPU::FLAT_STORE_DWORD:
334  return 1;
335  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
336  case AMDGPU::GLOBAL_LOAD_DWORDX2:
337  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
338  case AMDGPU::GLOBAL_STORE_DWORDX2:
339  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
340  case AMDGPU::FLAT_LOAD_DWORDX2:
341  case AMDGPU::FLAT_STORE_DWORDX2:
342  return 2;
343  case AMDGPU::GLOBAL_LOAD_DWORDX3:
344  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
345  case AMDGPU::GLOBAL_STORE_DWORDX3:
346  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
347  case AMDGPU::FLAT_LOAD_DWORDX3:
348  case AMDGPU::FLAT_STORE_DWORDX3:
349  return 3;
350  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
351  case AMDGPU::GLOBAL_LOAD_DWORDX4:
352  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
353  case AMDGPU::GLOBAL_STORE_DWORDX4:
354  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
355  case AMDGPU::FLAT_LOAD_DWORDX4:
356  case AMDGPU::FLAT_STORE_DWORDX4:
357  return 4;
358  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
359  return 8;
360  case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
361  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
362  case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
363  case AMDGPU::DS_WRITE_B32_gfx9:
364  return 1;
365  case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
366  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
367  case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
368  case AMDGPU::DS_WRITE_B64_gfx9:
369  return 2;
370  default:
371  return 0;
372  }
373 }
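// For example (illustrative): a GLOBAL_LOAD_DWORDX2 reports a width of 2,
// while an image load with dmask 0b1011 reports a width of 3 (the popcount of
// the dmask), since each set dmask bit corresponds to one enabled channel.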
374 
375 /// Maps instruction opcode to enum InstClassEnum.
376 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
377  switch (Opc) {
378  default:
379  if (TII.isMUBUF(Opc)) {
380  switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
381  default:
382  return UNKNOWN;
383  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
384  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
385  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
386  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
387  return BUFFER_LOAD;
388  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
389  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
390  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
391  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
392  return BUFFER_STORE;
393  }
394  }
395  if (TII.isMIMG(Opc)) {
396  // Ignore instructions encoded without vaddr.
397  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
398  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
399  return UNKNOWN;
400  // Ignore BVH instructions
401  if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
402  return UNKNOWN;
403  // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
404  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
405  TII.isGather4(Opc))
406  return UNKNOWN;
407  return MIMG;
408  }
409  if (TII.isMTBUF(Opc)) {
410  switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
411  default:
412  return UNKNOWN;
413  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
414  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
415  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
416  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
417  return TBUFFER_LOAD;
418  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
419  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
420  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
421  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
422  return TBUFFER_STORE;
423  }
424  }
425  return UNKNOWN;
426  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
427  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
428  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
429  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
430  return S_BUFFER_LOAD_IMM;
431  case AMDGPU::DS_READ_B32:
432  case AMDGPU::DS_READ_B32_gfx9:
433  case AMDGPU::DS_READ_B64:
434  case AMDGPU::DS_READ_B64_gfx9:
435  return DS_READ;
436  case AMDGPU::DS_WRITE_B32:
437  case AMDGPU::DS_WRITE_B32_gfx9:
438  case AMDGPU::DS_WRITE_B64:
439  case AMDGPU::DS_WRITE_B64_gfx9:
440  return DS_WRITE;
441  case AMDGPU::GLOBAL_LOAD_DWORD:
442  case AMDGPU::GLOBAL_LOAD_DWORDX2:
443  case AMDGPU::GLOBAL_LOAD_DWORDX3:
444  case AMDGPU::GLOBAL_LOAD_DWORDX4:
445  case AMDGPU::FLAT_LOAD_DWORD:
446  case AMDGPU::FLAT_LOAD_DWORDX2:
447  case AMDGPU::FLAT_LOAD_DWORDX3:
448  case AMDGPU::FLAT_LOAD_DWORDX4:
449  return FLAT_LOAD;
450  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
451  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
452  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
453  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
454  return GLOBAL_LOAD_SADDR;
455  case AMDGPU::GLOBAL_STORE_DWORD:
456  case AMDGPU::GLOBAL_STORE_DWORDX2:
457  case AMDGPU::GLOBAL_STORE_DWORDX3:
458  case AMDGPU::GLOBAL_STORE_DWORDX4:
459  case AMDGPU::FLAT_STORE_DWORD:
460  case AMDGPU::FLAT_STORE_DWORDX2:
461  case AMDGPU::FLAT_STORE_DWORDX3:
462  case AMDGPU::FLAT_STORE_DWORDX4:
463  return FLAT_STORE;
464  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
465  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
466  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
467  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
468  return GLOBAL_STORE_SADDR;
469  }
470 }
471 
472 /// Determines instruction subclass from opcode. Only instructions
473 /// of the same subclass can be merged together. The merged instruction may have
474 /// a different subclass but must have the same class.
475 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
476  switch (Opc) {
477  default:
478  if (TII.isMUBUF(Opc))
479  return AMDGPU::getMUBUFBaseOpcode(Opc);
480  if (TII.isMIMG(Opc)) {
481  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
482  assert(Info);
483  return Info->BaseOpcode;
484  }
485  if (TII.isMTBUF(Opc))
486  return AMDGPU::getMTBUFBaseOpcode(Opc);
487  return -1;
488  case AMDGPU::DS_READ_B32:
489  case AMDGPU::DS_READ_B32_gfx9:
490  case AMDGPU::DS_READ_B64:
491  case AMDGPU::DS_READ_B64_gfx9:
492  case AMDGPU::DS_WRITE_B32:
493  case AMDGPU::DS_WRITE_B32_gfx9:
494  case AMDGPU::DS_WRITE_B64:
495  case AMDGPU::DS_WRITE_B64_gfx9:
496  return Opc;
497  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
498  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
500  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
501  return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
502  case AMDGPU::GLOBAL_LOAD_DWORD:
503  case AMDGPU::GLOBAL_LOAD_DWORDX2:
504  case AMDGPU::GLOBAL_LOAD_DWORDX3:
505  case AMDGPU::GLOBAL_LOAD_DWORDX4:
506  case AMDGPU::FLAT_LOAD_DWORD:
507  case AMDGPU::FLAT_LOAD_DWORDX2:
508  case AMDGPU::FLAT_LOAD_DWORDX3:
509  case AMDGPU::FLAT_LOAD_DWORDX4:
510  return AMDGPU::FLAT_LOAD_DWORD;
511  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
512  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
513  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
514  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
515  return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
516  case AMDGPU::GLOBAL_STORE_DWORD:
517  case AMDGPU::GLOBAL_STORE_DWORDX2:
518  case AMDGPU::GLOBAL_STORE_DWORDX3:
519  case AMDGPU::GLOBAL_STORE_DWORDX4:
520  case AMDGPU::FLAT_STORE_DWORD:
521  case AMDGPU::FLAT_STORE_DWORDX2:
522  case AMDGPU::FLAT_STORE_DWORDX3:
523  case AMDGPU::FLAT_STORE_DWORDX4:
524  return AMDGPU::FLAT_STORE_DWORD;
525  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
526  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
527  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
528  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
529  return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
530  }
531 }
532 
533 // GLOBAL loads and stores are classified as FLAT initially. If both combined
534 // instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
535 // If either or both instructions are non segment specific FLAT the resulting
536 // combined operation will be FLAT, potentially promoting one of the GLOBAL
537 // operations to FLAT.
538 // For other instructions return the original unmodified class.
539 InstClassEnum
540 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
541  const CombineInfo &Paired) {
542  assert(CI.InstClass == Paired.InstClass);
543 
544  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
545  SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
546  return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
547 
548  return CI.InstClass;
549 }
550 
551 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
552  AddressRegs Result;
553 
554  if (TII.isMUBUF(Opc)) {
555  if (AMDGPU::getMUBUFHasVAddr(Opc))
556  Result.VAddr = true;
557  if (AMDGPU::getMUBUFHasSrsrc(Opc))
558  Result.SRsrc = true;
559  if (AMDGPU::getMUBUFHasSoffset(Opc))
560  Result.SOffset = true;
561 
562  return Result;
563  }
564 
565  if (TII.isMIMG(Opc)) {
566  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
567  if (VAddr0Idx >= 0) {
568  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
569  Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
570  } else {
571  Result.VAddr = true;
572  }
573  Result.SRsrc = true;
574  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
575  if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
576  Result.SSamp = true;
577 
578  return Result;
579  }
580  if (TII.isMTBUF(Opc)) {
581  if (AMDGPU::getMTBUFHasVAddr(Opc))
582  Result.VAddr = true;
583  if (AMDGPU::getMTBUFHasSrsrc(Opc))
584  Result.SRsrc = true;
585  if (AMDGPU::getMTBUFHasSoffset(Opc))
586  Result.SOffset = true;
587 
588  return Result;
589  }
590 
591  switch (Opc) {
592  default:
593  return Result;
594  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
595  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
596  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
597  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
598  Result.SBase = true;
599  return Result;
600  case AMDGPU::DS_READ_B32:
601  case AMDGPU::DS_READ_B64:
602  case AMDGPU::DS_READ_B32_gfx9:
603  case AMDGPU::DS_READ_B64_gfx9:
604  case AMDGPU::DS_WRITE_B32:
605  case AMDGPU::DS_WRITE_B64:
606  case AMDGPU::DS_WRITE_B32_gfx9:
607  case AMDGPU::DS_WRITE_B64_gfx9:
608  Result.Addr = true;
609  return Result;
610  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
611  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
612  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
613  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
614  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
615  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
616  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
617  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
618  Result.SAddr = true;
619  LLVM_FALLTHROUGH;
620  case AMDGPU::GLOBAL_LOAD_DWORD:
621  case AMDGPU::GLOBAL_LOAD_DWORDX2:
622  case AMDGPU::GLOBAL_LOAD_DWORDX3:
623  case AMDGPU::GLOBAL_LOAD_DWORDX4:
624  case AMDGPU::GLOBAL_STORE_DWORD:
625  case AMDGPU::GLOBAL_STORE_DWORDX2:
626  case AMDGPU::GLOBAL_STORE_DWORDX3:
627  case AMDGPU::GLOBAL_STORE_DWORDX4:
628  case AMDGPU::FLAT_LOAD_DWORD:
629  case AMDGPU::FLAT_LOAD_DWORDX2:
630  case AMDGPU::FLAT_LOAD_DWORDX3:
631  case AMDGPU::FLAT_LOAD_DWORDX4:
632  case AMDGPU::FLAT_STORE_DWORD:
633  case AMDGPU::FLAT_STORE_DWORDX2:
634  case AMDGPU::FLAT_STORE_DWORDX3:
635  case AMDGPU::FLAT_STORE_DWORDX4:
636  Result.VAddr = true;
637  return Result;
638  }
639 }
640 
641 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
642  const SILoadStoreOptimizer &LSO) {
643  I = MI;
644  unsigned Opc = MI->getOpcode();
645  InstClass = getInstClass(Opc, *LSO.TII);
646 
647  if (InstClass == UNKNOWN)
648  return;
649 
650  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
651 
652  switch (InstClass) {
653  case DS_READ:
654  EltSize =
655  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
656  : 4;
657  break;
658  case DS_WRITE:
659  EltSize =
660  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
661  : 4;
662  break;
663  case S_BUFFER_LOAD_IMM:
664  EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
665  break;
666  default:
667  EltSize = 4;
668  break;
669  }
670 
671  if (InstClass == MIMG) {
672  DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
673  // Offset is not considered for MIMG instructions.
674  Offset = 0;
675  } else {
676  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
677  Offset = I->getOperand(OffsetIdx).getImm();
678  }
679 
680  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
681  Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
682 
683  Width = getOpcodeWidth(*I, *LSO.TII);
684 
685  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
686  Offset &= 0xffff;
687  } else if (InstClass != MIMG) {
688  CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
689  }
690 
691  AddressRegs Regs = getRegs(Opc, *LSO.TII);
692 
693  NumAddresses = 0;
694  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
695  AddrIdx[NumAddresses++] =
696  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
697  if (Regs.Addr)
698  AddrIdx[NumAddresses++] =
699  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
700  if (Regs.SBase)
701  AddrIdx[NumAddresses++] =
702  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
703  if (Regs.SRsrc)
704  AddrIdx[NumAddresses++] =
705  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
706  if (Regs.SOffset)
707  AddrIdx[NumAddresses++] =
708  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
709  if (Regs.SAddr)
710  AddrIdx[NumAddresses++] =
711  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
712  if (Regs.VAddr)
713  AddrIdx[NumAddresses++] =
714  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
715  if (Regs.SSamp)
716  AddrIdx[NumAddresses++] =
717  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
718  assert(NumAddresses <= MaxAddressRegs);
719 
720  for (unsigned J = 0; J < NumAddresses; J++)
721  AddrReg[J] = &I->getOperand(AddrIdx[J]);
722 }
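// For example (illustrative): for a DS_READ_B64 this records EltSize 8, the
// 16-bit DS offset, Width 2, and a single address operand (addr); for an
// S_BUFFER_LOAD_DWORDX2_IMM it records the subtarget-scaled EltSize, the cpol
// bits, Width 2, and sbase as the only address operand.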
723 
724 } // end anonymous namespace.
725 
726 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
727  "SI Load Store Optimizer", false, false)
728 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
729 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
730  false, false)
731 
732 char SILoadStoreOptimizer::ID = 0;
733 
734 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
735 
736 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
737  return new SILoadStoreOptimizer();
738 }
739 
740 static void addDefsUsesToList(const MachineInstr &MI,
741  DenseSet<Register> &RegDefs,
742  DenseSet<Register> &RegUses) {
743  for (const auto &Op : MI.operands()) {
744  if (!Op.isReg())
745  continue;
746  if (Op.isDef())
747  RegDefs.insert(Op.getReg());
748  if (Op.readsReg())
749  RegUses.insert(Op.getReg());
750  }
751 }
752 
753 bool SILoadStoreOptimizer::canSwapInstructions(
754  const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
755  const MachineInstr &A, const MachineInstr &B) const {
756  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
757  (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
758  return false;
759  for (const auto &BOp : B.operands()) {
760  if (!BOp.isReg())
761  continue;
762  if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
763  return false;
764  if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
765  return false;
766  }
767  return true;
768 }
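// Illustrative reading of the check above: if A defines a register that B
// reads or redefines, or B defines a register that A reads, the pair cannot be
// reordered; likewise two memory operations where at least one stores and the
// alias query cannot prove independence block the swap.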
769 
770 // Given that \p CI and \p Paired are adjacent memory operations, produce a new
771 // MMO for the combined operation with a new access size.
772 MachineMemOperand *
773 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
774  const CombineInfo &Paired) {
775  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
776  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
777 
778  unsigned Size = MMOa->getSize() + MMOb->getSize();
779 
780  // A base pointer for the combined operation is the same as the leading
781  // operation's pointer.
782  if (Paired < CI)
783  std::swap(MMOa, MMOb);
784 
785  MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
786  // If merging FLAT and GLOBAL set address space to FLAT.
787  if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
788  PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
789 
790  MachineFunction *MF = CI.I->getMF();
791  return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
792 }
793 
794 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
795  const SIInstrInfo &TII,
796  const CombineInfo &Paired) {
797  assert(CI.InstClass == MIMG);
798 
799  // Ignore instructions with tfe/lwe set.
800  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
801  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
802 
803  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
804  return false;
805 
806  // Check other optional immediate operands for equality.
807  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
808  AMDGPU::OpName::unorm, AMDGPU::OpName::da,
809  AMDGPU::OpName::r128, AMDGPU::OpName::a16};
810 
811  for (auto op : OperandsToMatch) {
812  int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
813  if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
814  return false;
815  if (Idx != -1 &&
816  CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
817  return false;
818  }
819 
820  // Check DMask for overlaps.
821  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
822  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
823 
824  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
825  if ((1u << AllowedBitsForMin) <= MinMask)
826  return false;
827 
828  return true;
829 }
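// Worked example (illustrative): DMask 0b1100 and 0b0010 can be combined,
// since countTrailingZeros(0b1100) == 2 and 0b0010 < (1 << 2); the lower mask
// sits entirely below the lowest bit of the higher one. DMask 0b0110 and
// 0b0011 are rejected because 0b0011 >= (1 << 1) -- their bit ranges overlap
// or interleave.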
830 
831 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
832  unsigned ComponentCount,
833  const GCNSubtarget &STI) {
834  if (ComponentCount > 4)
835  return 0;
836 
837  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
838  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
839  if (!OldFormatInfo)
840  return 0;
841 
842  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
843  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
844  ComponentCount,
845  OldFormatInfo->NumFormat, STI);
846 
847  if (!NewFormatInfo)
848  return 0;
849 
850  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
851  NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
852 
853  return NewFormatInfo->Format;
854 }
855 
856 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
857 // highest power of two. Note that the result is well defined for all inputs
858 // including corner cases like:
859 // - if Lo == Hi, return that value
860 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
861 // - if Lo > Hi, return 0 (as if the range wrapped around)
862 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
863  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
864 }
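// Worked example (illustrative): mostAlignedValueInRange(37, 90) == 64.
// (Lo - 1) ^ Hi = 36 ^ 90 = 126, which has 25 leading zeros, so the mask keeps
// the top 26 bits (clearing the low 6), and 90 & ~63 == 64 -- the value in
// [37, 90] aligned to the largest power of two (64).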
865 
866 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
867  const GCNSubtarget &STI,
868  CombineInfo &Paired,
869  bool Modify) {
870  assert(CI.InstClass != MIMG);
871 
872  // XXX - Would the same offset be OK? Is there any reason this would happen or
873  // be useful?
874  if (CI.Offset == Paired.Offset)
875  return false;
876 
877  // This won't be valid if the offset isn't aligned.
878  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
879  return false;
880 
881  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
882 
883  const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
884  llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
885  if (!Info0)
886  return false;
887  const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
888  llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
889  if (!Info1)
890  return false;
891 
892  if (Info0->BitsPerComp != Info1->BitsPerComp ||
893  Info0->NumFormat != Info1->NumFormat)
894  return false;
895 
896  // TODO: Should be possible to support more formats, but if format loads
897  // are not dword-aligned, the merged load might not be valid.
898  if (Info0->BitsPerComp != 32)
899  return false;
900 
901  if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
902  return false;
903  }
904 
905  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
906  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
907  CI.UseST64 = false;
908  CI.BaseOff = 0;
909 
910  // Handle all non-DS instructions.
911  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
912  return (EltOffset0 + CI.Width == EltOffset1 ||
913  EltOffset1 + Paired.Width == EltOffset0) &&
914  CI.CPol == Paired.CPol;
915  }
916 
917  // If the offset in elements doesn't fit in 8 bits, we might be able to use
918  // the stride 64 versions.
919  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
920  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
921  if (Modify) {
922  CI.Offset = EltOffset0 / 64;
923  Paired.Offset = EltOffset1 / 64;
924  CI.UseST64 = true;
925  }
926  return true;
927  }
928 
929  // Check if the new offsets fit in the reduced 8-bit range.
930  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
931  if (Modify) {
932  CI.Offset = EltOffset0;
933  Paired.Offset = EltOffset1;
934  }
935  return true;
936  }
937 
938  // Try to shift base address to decrease offsets.
939  uint32_t Min = std::min(EltOffset0, EltOffset1);
940  uint32_t Max = std::max(EltOffset0, EltOffset1);
941 
942  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
943  if (((Max - Min) & ~Mask) == 0) {
944  if (Modify) {
945  // From the range of values we could use for BaseOff, choose the one that
946  // is aligned to the highest power of two, to maximise the chance that
947  // the same offset can be reused for other load/store pairs.
948  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
949  // Copy the low bits of the offsets, so that when we adjust them by
950  // subtracting BaseOff they will be multiples of 64.
951  BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
952  CI.BaseOff = BaseOff * CI.EltSize;
953  CI.Offset = (EltOffset0 - BaseOff) / 64;
954  Paired.Offset = (EltOffset1 - BaseOff) / 64;
955  CI.UseST64 = true;
956  }
957  return true;
958  }
959 
960  if (isUInt<8>(Max - Min)) {
961  if (Modify) {
962  // From the range of values we could use for BaseOff, choose the one that
963  // is aligned to the highest power of two, to maximise the chance that
964  // the same offset can be reused for other load/store pairs.
965  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
966  CI.BaseOff = BaseOff * CI.EltSize;
967  CI.Offset = EltOffset0 - BaseOff;
968  Paired.Offset = EltOffset1 - BaseOff;
969  }
970  return true;
971  }
972 
973  return false;
974 }
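// Worked example (illustrative, for DS_READ_B32 with EltSize 4): byte offsets
// 4096 and 4100 give element offsets 1024 and 1025. Neither the ST64 form nor
// the plain 8-bit form fits, but the difference (1) does, so with Modify set
// BaseOff becomes 1024 * 4 = 4096 and the stored offsets become 0 and 1; the
// merged read2 then adds 4096 to the base register and uses offset0:0
// offset1:1.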
975 
976 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
977  const CombineInfo &CI,
978  const CombineInfo &Paired) {
979  const unsigned Width = (CI.Width + Paired.Width);
980  switch (CI.InstClass) {
981  default:
982  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
983  case S_BUFFER_LOAD_IMM:
984  switch (Width) {
985  default:
986  return false;
987  case 2:
988  case 4:
989  case 8:
990  return true;
991  }
992  }
993 }
994 
995 const TargetRegisterClass *
996 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
997  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
998  return TRI->getRegClassForReg(*MRI, Dst->getReg());
999  }
1000  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1001  return TRI->getRegClassForReg(*MRI, Src->getReg());
1002  }
1003  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1004  return TRI->getRegClassForReg(*MRI, Src->getReg());
1005  }
1006  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1007  return TRI->getRegClassForReg(*MRI, Dst->getReg());
1008  }
1009  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1010  return TRI->getRegClassForReg(*MRI, Src->getReg());
1011  }
1012  return nullptr;
1013 }
1014 
1015 /// This function assumes that CI comes before Paired in a basic block. Return
1016 /// an insertion point for the merged instruction or nullptr on failure.
1017 SILoadStoreOptimizer::CombineInfo *
1018 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1019  CombineInfo &Paired) {
1020  // If another instruction has already been merged into CI, it may now be a
1021  // type that we can't do any further merging into.
1022  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1023  return nullptr;
1024  assert(CI.InstClass == Paired.InstClass);
1025 
1026  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1027  getInstSubclass(Paired.I->getOpcode(), *TII))
1028  return nullptr;
1029 
1030  // Check both offsets (or masks for MIMG) can be combined and fit in the
1031  // reduced range.
1032  if (CI.InstClass == MIMG) {
1033  if (!dmasksCanBeCombined(CI, *TII, Paired))
1034  return nullptr;
1035  } else {
1036  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1037  return nullptr;
1038  }
1039 
1040  DenseSet<Register> RegDefs;
1041  DenseSet<Register> RegUses;
1042  CombineInfo *Where;
1043  if (CI.I->mayLoad()) {
1044  // Try to hoist Paired up to CI.
1045  addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1046  for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1047  if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1048  return nullptr;
1049  }
1050  Where = &CI;
1051  } else {
1052  // Try to sink CI down to Paired.
1053  addDefsUsesToList(*CI.I, RegDefs, RegUses);
1054  for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1055  if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1056  return nullptr;
1057  }
1058  Where = &Paired;
1059  }
1060 
1061  // Call offsetsCanBeCombined with modify = true so that the offsets are
1062  // correct for the new instruction. This should return true, because
1063  // this function should only be called on CombineInfo objects that
1064  // have already been confirmed to be mergeable.
1065  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1066  offsetsCanBeCombined(CI, *STM, Paired, true);
1067  return Where;
1068 }
1069 
1070 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1071  if (STM->ldsRequiresM0Init())
1072  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1073  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1074 }
1075 
1076 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1077  if (STM->ldsRequiresM0Init())
1078  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1079 
1080  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1081  : AMDGPU::DS_READ2ST64_B64_gfx9;
1082 }
1083 
1084 MachineBasicBlock::iterator
1085 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1086  MachineBasicBlock::iterator InsertBefore) {
1087  MachineBasicBlock *MBB = CI.I->getParent();
1088 
1089  // Be careful, since the addresses could be subregisters themselves in weird
1090  // cases, like vectors of pointers.
1091  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1092 
1093  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1094  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1095 
1096  unsigned NewOffset0 = CI.Offset;
1097  unsigned NewOffset1 = Paired.Offset;
1098  unsigned Opc =
1099  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1100 
1101  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1102  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1103 
1104  if (NewOffset0 > NewOffset1) {
1105  // Canonicalize the merged instruction so the smaller offset comes first.
1106  std::swap(NewOffset0, NewOffset1);
1107  std::swap(SubRegIdx0, SubRegIdx1);
1108  }
1109 
1110  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1111  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1112 
1113  const MCInstrDesc &Read2Desc = TII->get(Opc);
1114 
1115  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1116  Register DestReg = MRI->createVirtualRegister(SuperRC);
1117 
1118  DebugLoc DL = CI.I->getDebugLoc();
1119 
1120  Register BaseReg = AddrReg->getReg();
1121  unsigned BaseSubReg = AddrReg->getSubReg();
1122  unsigned BaseRegFlags = 0;
1123  if (CI.BaseOff) {
1124  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1125  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1126  .addImm(CI.BaseOff);
1127 
1128  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1129  BaseRegFlags = RegState::Kill;
1130 
1131  TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1132  .addReg(ImmReg)
1133  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1134  .addImm(0); // clamp bit
1135  BaseSubReg = 0;
1136  }
1137 
1138  MachineInstrBuilder Read2 =
1139  BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1140  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1141  .addImm(NewOffset0) // offset0
1142  .addImm(NewOffset1) // offset1
1143  .addImm(0) // gds
1144  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1145 
1146  (void)Read2;
1147 
1148  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1149 
1150  // Copy to the old destination registers.
1151  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1152  .add(*Dest0) // Copy to same destination including flags and sub reg.
1153  .addReg(DestReg, 0, SubRegIdx0);
1154  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1155  .add(*Dest1)
1156  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1157 
1158  CI.I->eraseFromParent();
1159  Paired.I->eraseFromParent();
1160 
1161  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1162  return Read2;
1163 }
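// Illustrative result (register names are placeholders): merging
// "ds_read_b32 %d0, %a offset:8" with "ds_read_b32 %d1, %a offset:12" produces
// "ds_read2_b32 %t, %a offset0:2 offset1:3" followed by COPYs of %t.sub0 into
// %d0 and %t.sub1 into %d1, after which the original instructions are erased.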
1164 
1165 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1166  if (STM->ldsRequiresM0Init())
1167  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1168  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1169  : AMDGPU::DS_WRITE2_B64_gfx9;
1170 }
1171 
1172 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1173  if (STM->ldsRequiresM0Init())
1174  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1175  : AMDGPU::DS_WRITE2ST64_B64;
1176 
1177  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1178  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1179 }
1180 
1181 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1182  CombineInfo &CI, CombineInfo &Paired,
1183  MachineBasicBlock::iterator InsertBefore) {
1184  MachineBasicBlock *MBB = CI.I->getParent();
1185 
1186  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1187  // sure we preserve the subregister index and any register flags set on them.
1188  const MachineOperand *AddrReg =
1189  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1190  const MachineOperand *Data0 =
1191  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1192  const MachineOperand *Data1 =
1193  TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1194 
1195  unsigned NewOffset0 = CI.Offset;
1196  unsigned NewOffset1 = Paired.Offset;
1197  unsigned Opc =
1198  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1199 
1200  if (NewOffset0 > NewOffset1) {
1201  // Canonicalize the merged instruction so the smaller offset comes first.
1202  std::swap(NewOffset0, NewOffset1);
1203  std::swap(Data0, Data1);
1204  }
1205 
1206  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1207  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1208 
1209  const MCInstrDesc &Write2Desc = TII->get(Opc);
1210  DebugLoc DL = CI.I->getDebugLoc();
1211 
1212  Register BaseReg = AddrReg->getReg();
1213  unsigned BaseSubReg = AddrReg->getSubReg();
1214  unsigned BaseRegFlags = 0;
1215  if (CI.BaseOff) {
1216  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1217  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1218  .addImm(CI.BaseOff);
1219 
1220  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1221  BaseRegFlags = RegState::Kill;
1222 
1223  TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1224  .addReg(ImmReg)
1225  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1226  .addImm(0); // clamp bit
1227  BaseSubReg = 0;
1228  }
1229 
1230  MachineInstrBuilder Write2 =
1231  BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1232  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1233  .add(*Data0) // data0
1234  .add(*Data1) // data1
1235  .addImm(NewOffset0) // offset0
1236  .addImm(NewOffset1) // offset1
1237  .addImm(0) // gds
1238  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1239 
1240  CI.I->eraseFromParent();
1241  Paired.I->eraseFromParent();
1242 
1243  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1244  return Write2;
1245 }
1246 
1247 MachineBasicBlock::iterator
1248 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1249  MachineBasicBlock::iterator InsertBefore) {
1250  MachineBasicBlock *MBB = CI.I->getParent();
1251  DebugLoc DL = CI.I->getDebugLoc();
1252  const unsigned Opcode = getNewOpcode(CI, Paired);
1253 
1254  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1255 
1256  Register DestReg = MRI->createVirtualRegister(SuperRC);
1257  unsigned MergedDMask = CI.DMask | Paired.DMask;
1258  unsigned DMaskIdx =
1259  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1260 
1261  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1262  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1263  if (I == DMaskIdx)
1264  MIB.addImm(MergedDMask);
1265  else
1266  MIB.add((*CI.I).getOperand(I));
1267  }
1268 
1269  // It shouldn't be possible to get this far if the two instructions
1270  // don't have a single memoperand, because MachineInstr::mayAlias()
1271  // will return true if this is the case.
1272  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1273 
1274  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1275 
1276  unsigned SubRegIdx0, SubRegIdx1;
1277  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1278 
1279  // Copy to the old destination registers.
1280  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1281  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1282  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1283 
1284  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1285  .add(*Dest0) // Copy to same destination including flags and sub reg.
1286  .addReg(DestReg, 0, SubRegIdx0);
1287  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1288  .add(*Dest1)
1289  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1290 
1291  CI.I->eraseFromParent();
1292  Paired.I->eraseFromParent();
1293  return New;
1294 }
1295 
1296 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1297  CombineInfo &CI, CombineInfo &Paired,
1298  MachineBasicBlock::iterator InsertBefore) {
1299  MachineBasicBlock *MBB = CI.I->getParent();
1300  DebugLoc DL = CI.I->getDebugLoc();
1301  const unsigned Opcode = getNewOpcode(CI, Paired);
1302 
1303  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1304 
1305  Register DestReg = MRI->createVirtualRegister(SuperRC);
1306  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1307 
1308  // It shouldn't be possible to get this far if the two instructions
1309  // don't have a single memoperand, because MachineInstr::mayAlias()
1310  // will return true if this is the case.
1311  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1312 
1313  MachineInstr *New =
1314  BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1315  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1316  .addImm(MergedOffset) // offset
1317  .addImm(CI.CPol) // cpol
1318  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1319 
1320  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1321  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1322  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1323 
1324  // Copy to the old destination registers.
1325  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1326  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1327  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1328 
1329  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1330  .add(*Dest0) // Copy to same destination including flags and sub reg.
1331  .addReg(DestReg, 0, SubRegIdx0);
1332  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1333  .add(*Dest1)
1334  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1335 
1336  CI.I->eraseFromParent();
1337  Paired.I->eraseFromParent();
1338  return New;
1339 }
1340 
1341 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1342  CombineInfo &CI, CombineInfo &Paired,
1343  MachineBasicBlock::iterator InsertBefore) {
1344  MachineBasicBlock *MBB = CI.I->getParent();
1345  DebugLoc DL = CI.I->getDebugLoc();
1346 
1347  const unsigned Opcode = getNewOpcode(CI, Paired);
1348 
1349  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1350 
1351  // Copy to the new source register.
1352  Register DestReg = MRI->createVirtualRegister(SuperRC);
1353  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1354 
1355  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1356 
1357  AddressRegs Regs = getRegs(Opcode, *TII);
1358 
1359  if (Regs.VAddr)
1360  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1361 
1362  // It shouldn't be possible to get this far if the two instructions
1363  // don't have a single memoperand, because MachineInstr::mayAlias()
1364  // will return true if this is the case.
1365  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1366 
1367  MachineInstr *New =
1368  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1369  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1370  .addImm(MergedOffset) // offset
1371  .addImm(CI.CPol) // cpol
1372  .addImm(0) // tfe
1373  .addImm(0) // swz
1374  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1375 
1376  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1377  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1378  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1379 
1380  // Copy to the old destination registers.
1381  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1382  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1383  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1384 
1385  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1386  .add(*Dest0) // Copy to same destination including flags and sub reg.
1387  .addReg(DestReg, 0, SubRegIdx0);
1388  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1389  .add(*Dest1)
1390  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1391 
1392  CI.I->eraseFromParent();
1393  Paired.I->eraseFromParent();
1394  return New;
1395 }
1396 
1397 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1398  CombineInfo &CI, CombineInfo &Paired,
1399  MachineBasicBlock::iterator InsertBefore) {
1400  MachineBasicBlock *MBB = CI.I->getParent();
1401  DebugLoc DL = CI.I->getDebugLoc();
1402 
1403  const unsigned Opcode = getNewOpcode(CI, Paired);
1404 
1405  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1406 
1407  // Copy to the new source register.
1408  Register DestReg = MRI->createVirtualRegister(SuperRC);
1409  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1410 
1411  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1412 
1413  AddressRegs Regs = getRegs(Opcode, *TII);
1414 
1415  if (Regs.VAddr)
1416  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1417 
1418  unsigned JoinedFormat =
1419  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1420 
1421  // It shouldn't be possible to get this far if the two instructions
1422  // don't have a single memoperand, because MachineInstr::mayAlias()
1423  // will return true if this is the case.
1424  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1425 
1426  MachineInstr *New =
1427  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1428  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1429  .addImm(MergedOffset) // offset
1430  .addImm(JoinedFormat) // format
1431  .addImm(CI.CPol) // cpol
1432  .addImm(0) // tfe
1433  .addImm(0) // swz
1434  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1435 
1436  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1437  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1438  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1439 
1440  // Copy to the old destination registers.
1441  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1442  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1443  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1444 
1445  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1446  .add(*Dest0) // Copy to same destination including flags and sub reg.
1447  .addReg(DestReg, 0, SubRegIdx0);
1448  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1449  .add(*Dest1)
1450  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1451 
1452  CI.I->eraseFromParent();
1453  Paired.I->eraseFromParent();
1454  return New;
1455 }
1456 
1457 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1458  CombineInfo &CI, CombineInfo &Paired,
1459  MachineBasicBlock::iterator InsertBefore) {
1460  MachineBasicBlock *MBB = CI.I->getParent();
1461  DebugLoc DL = CI.I->getDebugLoc();
1462 
1463  const unsigned Opcode = getNewOpcode(CI, Paired);
1464 
1465  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1466  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1467  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1468 
1469  // Copy to the new source register.
1470  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1471  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1472 
1473  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1474  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1475 
1476  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1477  .add(*Src0)
1478  .addImm(SubRegIdx0)
1479  .add(*Src1)
1480  .addImm(SubRegIdx1);
1481 
1482  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1483  .addReg(SrcReg, RegState::Kill);
1484 
1485  AddressRegs Regs = getRegs(Opcode, *TII);
1486 
1487  if (Regs.VAddr)
1488  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1489 
1490  unsigned JoinedFormat =
1491  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1492 
1493  // It shouldn't be possible to get this far if the two instructions
1494  // don't have a single memoperand, because MachineInstr::mayAlias()
1495  // will return true if this is the case.
1496  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1497 
1498  MachineInstr *New =
1499  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1500  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1501  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1502  .addImm(JoinedFormat) // format
1503  .addImm(CI.CPol) // cpol
1504  .addImm(0) // tfe
1505  .addImm(0) // swz
1506  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1507 
1508  CI.I->eraseFromParent();
1509  Paired.I->eraseFromParent();
1510  return New;
1511 }
1512 
1513 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1514  CombineInfo &CI, CombineInfo &Paired,
1515  MachineBasicBlock::iterator InsertBefore) {
1516  MachineBasicBlock *MBB = CI.I->getParent();
1517  DebugLoc DL = CI.I->getDebugLoc();
1518 
1519  const unsigned Opcode = getNewOpcode(CI, Paired);
1520 
1521  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1522  Register DestReg = MRI->createVirtualRegister(SuperRC);
1523 
1524  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1525 
1526  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1527  MIB.add(*SAddr);
1528 
1529  MachineInstr *New =
1530  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1531  .addImm(std::min(CI.Offset, Paired.Offset))
1532  .addImm(CI.CPol)
1533  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1534 
1535  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1536  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1537  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1538 
1539  // Copy to the old destination registers.
1540  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1541  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1542  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1543 
1544  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1545  .add(*Dest0) // Copy to same destination including flags and sub reg.
1546  .addReg(DestReg, 0, SubRegIdx0);
1547  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1548  .add(*Dest1)
1549  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1550 
1551  CI.I->eraseFromParent();
1552  Paired.I->eraseFromParent();
1553  return New;
1554 }
1555 
1556 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1557  CombineInfo &CI, CombineInfo &Paired,
1558  MachineBasicBlock::iterator InsertBefore) {
1559  MachineBasicBlock *MBB = CI.I->getParent();
1560  DebugLoc DL = CI.I->getDebugLoc();
1561 
1562  const unsigned Opcode = getNewOpcode(CI, Paired);
1563 
1564  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1565  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1566  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1567 
1568  // Copy to the new source register.
1569  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1570  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1571 
1572  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1573  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1574 
1575  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1576  .add(*Src0)
1577  .addImm(SubRegIdx0)
1578  .add(*Src1)
1579  .addImm(SubRegIdx1);
1580 
1581  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1582  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1583  .addReg(SrcReg, RegState::Kill);
1584 
1585  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1586  MIB.add(*SAddr);
1587 
1588  MachineInstr *New =
1589  MIB.addImm(std::min(CI.Offset, Paired.Offset))
1590  .addImm(CI.CPol)
1591  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1592 
1593  CI.I->eraseFromParent();
1594  Paired.I->eraseFromParent();
1595  return New;
1596 }
1597 
1598 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1599  const CombineInfo &Paired) {
1600  const unsigned Width = CI.Width + Paired.Width;
1601 
1602  switch (getCommonInstClass(CI, Paired)) {
1603  default:
1604  assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1605  // FIXME: Handle d16 correctly
1606  return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1607  Width);
1608  case TBUFFER_LOAD:
1609  case TBUFFER_STORE:
1610  return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1611  Width);
1612 
1613  case UNKNOWN:
1614  llvm_unreachable("Unknown instruction class");
1615  case S_BUFFER_LOAD_IMM:
1616  switch (Width) {
1617  default:
1618  return 0;
1619  case 2:
1620  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1621  case 4:
1622  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1623  case 8:
1624  return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1625  }
1626  case GLOBAL_LOAD:
1627  switch (Width) {
1628  default:
1629  return 0;
1630  case 2:
1631  return AMDGPU::GLOBAL_LOAD_DWORDX2;
1632  case 3:
1633  return AMDGPU::GLOBAL_LOAD_DWORDX3;
1634  case 4:
1635  return AMDGPU::GLOBAL_LOAD_DWORDX4;
1636  }
1637  case GLOBAL_LOAD_SADDR:
1638  switch (Width) {
1639  default:
1640  return 0;
1641  case 2:
1642  return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1643  case 3:
1644  return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1645  case 4:
1646  return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1647  }
1648  case GLOBAL_STORE:
1649  switch (Width) {
1650  default:
1651  return 0;
1652  case 2:
1653  return AMDGPU::GLOBAL_STORE_DWORDX2;
1654  case 3:
1655  return AMDGPU::GLOBAL_STORE_DWORDX3;
1656  case 4:
1657  return AMDGPU::GLOBAL_STORE_DWORDX4;
1658  }
1659  case GLOBAL_STORE_SADDR:
1660  switch (Width) {
1661  default:
1662  return 0;
1663  case 2:
1664  return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1665  case 3:
1666  return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1667  case 4:
1668  return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1669  }
1670  case FLAT_LOAD:
1671  switch (Width) {
1672  default:
1673  return 0;
1674  case 2:
1675  return AMDGPU::FLAT_LOAD_DWORDX2;
1676  case 3:
1677  return AMDGPU::FLAT_LOAD_DWORDX3;
1678  case 4:
1679  return AMDGPU::FLAT_LOAD_DWORDX4;
1680  }
1681  case FLAT_STORE:
1682  switch (Width) {
1683  default:
1684  return 0;
1685  case 2:
1686  return AMDGPU::FLAT_STORE_DWORDX2;
1687  case 3:
1688  return AMDGPU::FLAT_STORE_DWORDX3;
1689  case 4:
1690  return AMDGPU::FLAT_STORE_DWORDX4;
1691  }
1692  case MIMG:
1693  assert((countPopulation(CI.DMask | Paired.DMask) == Width) &&
1694  "No overlaps");
1695  return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1696  }
1697 }
1698 
1699 std::pair<unsigned, unsigned>
1700 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1701  const CombineInfo &Paired) {
1702  assert((CI.InstClass != MIMG || (countPopulation(CI.DMask | Paired.DMask) ==
1703  CI.Width + Paired.Width)) &&
1704  "No overlaps");
1705 
1706  unsigned Idx0;
1707  unsigned Idx1;
1708 
1709  static const unsigned Idxs[5][4] = {
1710  {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1711  {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1712  {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1713  {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1714  {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1715  };
1716 
1717  assert(CI.Width >= 1 && CI.Width <= 4);
1718  assert(Paired.Width >= 1 && Paired.Width <= 4);
1719 
1720  if (Paired < CI) {
1721  Idx1 = Idxs[0][Paired.Width - 1];
1722  Idx0 = Idxs[Paired.Width][CI.Width - 1];
1723  } else {
1724  Idx0 = Idxs[0][CI.Width - 1];
1725  Idx1 = Idxs[CI.Width][Paired.Width - 1];
1726  }
1727 
1728  return std::make_pair(Idx0, Idx1);
1729 }
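// --- Editorial sketch (not in the source file) ------------------------------
// Worked example for the table above: with Paired.Width == 1, CI.Width == 2
// and Paired ordered first (Paired < CI), the code selects
//   Idx1 = Idxs[0][0] == AMDGPU::sub0
//   Idx0 = Idxs[1][1] == AMDGPU::sub1_sub2
// so the lower-offset access owns lane 0 of the merged register and the wider
// access owns the following two lanes.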
1730 
1731 const TargetRegisterClass *
1732 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1733  const CombineInfo &Paired) {
1734  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1735  switch (CI.Width + Paired.Width) {
1736  default:
1737  return nullptr;
1738  case 2:
1739  return &AMDGPU::SReg_64_XEXECRegClass;
1740  case 4:
1741  return &AMDGPU::SGPR_128RegClass;
1742  case 8:
1743  return &AMDGPU::SGPR_256RegClass;
1744  case 16:
1745  return &AMDGPU::SGPR_512RegClass;
1746  }
1747  }
1748 
1749  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1750  return TRI->isAGPRClass(getDataRegClass(*CI.I))
1751  ? TRI->getAGPRClassForBitWidth(BitWidth)
1752  : TRI->getVGPRClassForBitWidth(BitWidth);
1753 }
1754 
1755 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1756  CombineInfo &CI, CombineInfo &Paired,
1757  MachineBasicBlock::iterator InsertBefore) {
1758  MachineBasicBlock *MBB = CI.I->getParent();
1759  DebugLoc DL = CI.I->getDebugLoc();
1760 
1761  const unsigned Opcode = getNewOpcode(CI, Paired);
1762 
1763  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1764  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1765  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1766 
1767  // Copy to the new source register.
1768  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1769  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1770 
1771  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1772  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1773 
1774  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1775  .add(*Src0)
1776  .addImm(SubRegIdx0)
1777  .add(*Src1)
1778  .addImm(SubRegIdx1);
1779 
1780  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1781  .addReg(SrcReg, RegState::Kill);
1782 
1783  AddressRegs Regs = getRegs(Opcode, *TII);
1784 
1785  if (Regs.VAddr)
1786  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1787 
1788 
1789  // It shouldn't be possible to get this far if the two instructions
1790  // don't have a single memoperand, because MachineInstr::mayAlias()
1791  // will return true if this is the case.
1792  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1793 
1794  MachineInstr *New =
1795  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1796  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1797  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1798  .addImm(CI.CPol) // cpol
1799  .addImm(0) // tfe
1800  .addImm(0) // swz
1801  .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1802 
1803  CI.I->eraseFromParent();
1804  Paired.I->eraseFromParent();
1805  return New;
1806 }
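// --- Editorial sketch (not in the source file) ------------------------------
// Shape of a buffer-store merge, with hypothetical registers and the operand
// lists abbreviated (the exact opcode depends on the addressing variant). The
// two data operands are glued together with REG_SEQUENCE and stored once:
//   BUFFER_STORE_DWORD %d0, ..., 0, ...
//   BUFFER_STORE_DWORD %d1, ..., 4, ...
//   ==>
//   %w:vreg_64 = REG_SEQUENCE %d0, %subreg.sub0, %d1, %subreg.sub1
//   BUFFER_STORE_DWORDX2 %w, ..., 0, ...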
1807 
1808 MachineOperand
1809 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1810  APInt V(32, Val, true);
1811  if (TII->isInlineConstant(V))
1812  return MachineOperand::CreateImm(Val);
1813 
1814  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1815  MachineInstr *Mov =
1816  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1817  TII->get(AMDGPU::S_MOV_B32), Reg)
1818  .addImm(Val);
1819  (void)Mov;
1820  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1821  return MachineOperand::CreateReg(Reg, false);
1822 }
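// --- Editorial note (not in the source file) --------------------------------
// createRegOrImm keeps values that are inline constants as immediates and
// materializes an SGPR otherwise. Assuming the usual AMDGPU integer
// inline-constant range (-16..64), an offset of 64 comes back as an immediate
// operand, while 0x1800 is first loaded with
//   %s:sreg_32 = S_MOV_B32 0x1800
// and comes back as a register operand.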
1823 
1824 // Compute base address using Addr and return the final register.
1825 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1826  const MemAddress &Addr) const {
1827  MachineBasicBlock *MBB = MI.getParent();
1828  MachineBasicBlock::iterator MBBI = MI.getIterator();
1829  DebugLoc DL = MI.getDebugLoc();
1830 
1831  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1832  Addr.Base.LoSubReg) &&
1833  "Expected 32-bit Base-Register-Low!!");
1834 
1835  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1836  Addr.Base.HiSubReg) &&
1837  "Expected 32-bit Base-Register-Hi!!");
1838 
1839  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1840  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1841  MachineOperand OffsetHi =
1842  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1843 
1844  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1845  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1846  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1847 
1848  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1849  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1850  MachineInstr *LoHalf =
1851  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1852  .addReg(CarryReg, RegState::Define)
1853  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1854  .add(OffsetLo)
1855  .addImm(0); // clamp bit
1856  (void)LoHalf;
1857  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1858 
1859  MachineInstr *HiHalf =
1860  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1861  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1862  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1863  .add(OffsetHi)
1864  .addReg(CarryReg, RegState::Kill)
1865  .addImm(0); // clamp bit
1866  (void)HiHalf;
1867  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1868 
1869  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1870  MachineInstr *FullBase =
1871  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1872  .addReg(DestSub0)
1873  .addImm(AMDGPU::sub0)
1874  .addReg(DestSub1)
1875  .addImm(AMDGPU::sub1);
1876  (void)FullBase;
1877  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1878 
1879  return FullDestReg;
1880 }
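// --- Editorial sketch (not in the source file) ------------------------------
// The sequence computeBase emits is a 64-bit add split into carry-linked
// 32-bit halves, roughly (hypothetical registers; OffsetLo/OffsetHi may be an
// immediate or an SGPR from createRegOrImm):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %base_lo, OffsetLo, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64  %base_hi, OffsetHi, %carry, 0
//   %new:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1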
1881 
1882 // Update base and offset with the NewBase and NewOffset in MI.
1883 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1884  Register NewBase,
1885  int32_t NewOffset) const {
1886  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1887  Base->setReg(NewBase);
1888  Base->setIsKill(false);
1889  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1890 }
1891 
1892 Optional<int32_t>
1893 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1894  if (Op.isImm())
1895  return Op.getImm();
1896 
1897  if (!Op.isReg())
1898  return None;
1899 
1900  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1901  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1902  !Def->getOperand(1).isImm())
1903  return None;
1904 
1905  return Def->getOperand(1).getImm();
1906 }
1907 
1908 // Analyze Base and extract:
1909 // - 32-bit base registers and subregisters
1910 // - 64-bit constant offset
1911 // Expecting base computation as:
1912 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1913 // %LO:vgpr_32, %c:sreg_64_xexec =
1914 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1915 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1916 // %Base:vreg_64 =
1917 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1918 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1919  MemAddress &Addr) const {
1920  if (!Base.isReg())
1921  return;
1922 
1923  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1924  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1925  || Def->getNumOperands() != 5)
1926  return;
1927 
1928  MachineOperand BaseLo = Def->getOperand(1);
1929  MachineOperand BaseHi = Def->getOperand(3);
1930  if (!BaseLo.isReg() || !BaseHi.isReg())
1931  return;
1932 
1933  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1934  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1935 
1936  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1937  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1938  return;
1939 
1940  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1941  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1942 
1943  auto Offset0P = extractConstOffset(*Src0);
1944  if (Offset0P)
1945  BaseLo = *Src1;
1946  else {
1947  if (!(Offset0P = extractConstOffset(*Src1)))
1948  return;
1949  BaseLo = *Src0;
1950  }
1951 
1952  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1953  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1954 
1955  if (Src0->isImm())
1956  std::swap(Src0, Src1);
1957 
1958  if (!Src1->isImm())
1959  return;
1960 
1961  uint64_t Offset1 = Src1->getImm();
1962  BaseHi = *Src0;
1963 
1964  Addr.Base.LoReg = BaseLo.getReg();
1965  Addr.Base.HiReg = BaseHi.getReg();
1966  Addr.Base.LoSubReg = BaseLo.getSubReg();
1967  Addr.Base.HiSubReg = BaseHi.getSubReg();
1968  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1969 }
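// --- Editorial note (not in the source file) --------------------------------
// Worked example of the final line above: if the low add contributes
// 0x00001800 and the high add contributes 0x00000002, the recovered 64-bit
// constant is
//   Addr.Offset = (0x00001800 & 0xffffffff) | (0x00000002 << 32)
//               = 0x0000000200001800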
1970 
1971 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1972  MachineInstr &MI,
1973  MemInfoMap &Visited,
1974  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1975 
1976  if (!(MI.mayLoad() ^ MI.mayStore()))
1977  return false;
1978 
1979  // TODO: Support flat and scratch.
1980  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1981  return false;
1982 
1983  if (MI.mayLoad() &&
1984  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
1985  return false;
1986 
1987  if (AnchorList.count(&MI))
1988  return false;
1989 
1990  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1991 
1992  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1993  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1994  return false;
1995  }
1996 
1997  // Step1: Find the base-registers and a 64bit constant offset.
1998  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1999  MemAddress MAddr;
2000  if (Visited.find(&MI) == Visited.end()) {
2001  processBaseWithConstOffset(Base, MAddr);
2002  Visited[&MI] = MAddr;
2003  } else
2004  MAddr = Visited[&MI];
2005 
2006  if (MAddr.Offset == 0) {
2007  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2008  " constant offsets that can be promoted.\n";);
2009  return false;
2010  }
2011 
2012  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2013  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2014 
2015  // Step2: Traverse through MI's basic block and find an anchor (one that has
2016  // the same base registers) with the highest 13-bit distance from MI's offset.
2017  // E.g. (64bit loads)
2018  // bb:
2019  // addr1 = &a + 4096; load1 = load(addr1, 0)
2020  // addr2 = &a + 6144; load2 = load(addr2, 0)
2021  // addr3 = &a + 8192; load3 = load(addr3, 0)
2022  // addr4 = &a + 10240; load4 = load(addr4, 0)
2023  // addr5 = &a + 12288; load5 = load(addr5, 0)
2024  //
2025  // Starting from the first load, the optimization will try to find a new base
2026  // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2027  // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2028  // &a + 8192 as the new base (anchor) because its distance is the largest,
2029  // which presumably lets it cover more of the intermediate bases.
2030  //
2031  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2032  // (&a + 8192) for load1, load2, load4.
2033  // addr = &a + 8192
2034  // load1 = load(addr, -4096)
2035  // load2 = load(addr, -2048)
2036  // load3 = load(addr, 0)
2037  // load4 = load(addr, 2048)
2038  // addr5 = &a + 12288; load5 = load(addr5, 0)
2039  //
2040  MachineInstr *AnchorInst = nullptr;
2041  MemAddress AnchorAddr;
2042  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2043  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2044 
2045  MachineBasicBlock *MBB = MI.getParent();
2046  MachineBasicBlock::iterator E = MBB->end();
2047  MachineBasicBlock::iterator MBBI = MI.getIterator();
2048  ++MBBI;
2049  const SITargetLowering *TLI =
2050  static_cast<const SITargetLowering *>(STM->getTargetLowering());
2051 
2052  for ( ; MBBI != E; ++MBBI) {
2053  MachineInstr &MINext = *MBBI;
2054  // TODO: Support finding an anchor(with same base) from store addresses or
2055  // any other load addresses where the opcodes are different.
2056  if (MINext.getOpcode() != MI.getOpcode() ||
2057  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2058  continue;
2059 
2060  const MachineOperand &BaseNext =
2061  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2062  MemAddress MAddrNext;
2063  if (Visited.find(&MINext) == Visited.end()) {
2064  processBaseWithConstOffset(BaseNext, MAddrNext);
2065  Visited[&MINext] = MAddrNext;
2066  } else
2067  MAddrNext = Visited[&MINext];
2068 
2069  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2070  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2071  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2072  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2073  continue;
2074 
2075  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
2076 
2077  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2078  TargetLoweringBase::AddrMode AM;
2079  AM.HasBaseReg = true;
2080  AM.BaseOffs = Dist;
2081  if (TLI->isLegalGlobalAddressingMode(AM) &&
2082  (uint32_t)std::abs(Dist) > MaxDist) {
2083  MaxDist = std::abs(Dist);
2084 
2085  AnchorAddr = MAddrNext;
2086  AnchorInst = &MINext;
2087  }
2088  }
2089 
2090  if (AnchorInst) {
2091  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2092  AnchorInst->dump());
2093  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2094  << AnchorAddr.Offset << "\n\n");
2095 
2096  // Instead of moving up, just re-compute anchor-instruction's base address.
2097  Register Base = computeBase(MI, AnchorAddr);
2098 
2099  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2100  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2101 
2102  for (auto P : InstsWCommonBase) {
2103  TargetLoweringBase::AddrMode AM;
2104  AM.HasBaseReg = true;
2105  AM.BaseOffs = P.second - AnchorAddr.Offset;
2106 
2107  if (TLI->isLegalGlobalAddressingMode(AM)) {
2108  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2109  dbgs() << ")"; P.first->dump());
2110  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2111  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2112  }
2113  }
2114  AnchorList.insert(AnchorInst);
2115  return true;
2116  }
2117 
2118  return false;
2119 }
2120 
2121 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2122  std::list<std::list<CombineInfo> > &MergeableInsts) const {
2123  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2124  if (AddrList.front().InstClass == CI.InstClass &&
2125  AddrList.front().IsAGPR == CI.IsAGPR &&
2126  AddrList.front().hasSameBaseAddress(*CI.I)) {
2127  AddrList.emplace_back(CI);
2128  return;
2129  }
2130  }
2131 
2132  // Base address not found, so add a new list.
2133  MergeableInsts.emplace_back(1, CI);
2134 }
2135 
2136 std::pair<MachineBasicBlock::iterator, bool>
2137 SILoadStoreOptimizer::collectMergeableInsts(
2138  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2139  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2140  std::list<std::list<CombineInfo>> &MergeableInsts) const {
2141  bool Modified = false;
2142 
2143  // Sort potential mergeable instructions into lists. One list per base address.
2144  unsigned Order = 0;
2145  MachineBasicBlock::iterator BlockI = Begin;
2146  for (; BlockI != End; ++BlockI) {
2147  MachineInstr &MI = *BlockI;
2148 
2149  // We run this before checking if an address is mergeable, because it can produce
2150  // better code even if the instructions aren't mergeable.
2151  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2152  Modified = true;
2153 
2154  // Treat volatile accesses, ordered accesses and unmodeled side effects as
2155  // barriers. We can look for separate merges after such a barrier.
2156  if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2157  LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2158 
2159  // Search will resume after this instruction in a separate merge list.
2160  ++BlockI;
2161  break;
2162  }
2163 
2164  const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2165  if (InstClass == UNKNOWN)
2166  continue;
2167 
2168  // Do not merge VMEM buffer instructions with "swizzled" bit set.
2169  int Swizzled =
2170  AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2171  if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2172  continue;
2173 
2174  CombineInfo CI;
2175  CI.setMI(MI, *this);
2176  CI.Order = Order++;
2177 
2178  if (!CI.hasMergeableAddress(*MRI))
2179  continue;
2180 
2181  if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2182  // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2183  // operands. However we are reporting that ds_write2 shall have
2184  // only VGPR data so that machine copy propagation does not
2185  // create an illegal instruction with VGPR and AGPR sources.
2186  // Consequently, if we create such an instruction, the verifier
2187  // will complain.
2188  continue;
2189  }
2190 
2191  LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2192 
2193  addInstToMergeableList(CI, MergeableInsts);
2194  }
2195 
2196  // At this point we have lists of Mergeable instructions.
2197  //
2198  // Part 2: Sort lists by offset and then for each CombineInfo object in the
2199  // list try to find an instruction that can be merged with I. If an instruction
2200  // is found, it is stored in the Paired field. If no instructions are found, then
2201  // the CombineInfo object is deleted from the list.
2202 
2203  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2204  E = MergeableInsts.end(); I != E;) {
2205 
2206  std::list<CombineInfo> &MergeList = *I;
2207  if (MergeList.size() <= 1) {
2208  // This means we have found only one instruction with a given address
2209  // that can be merged, and we need at least 2 instructions to do a merge,
2210  // so this list can be discarded.
2211  I = MergeableInsts.erase(I);
2212  continue;
2213  }
2214 
2215  // Sort the lists by offsets, this way mergeable instructions will be
2216  // adjacent to each other in the list, which will make it easier to find
2217  // matches.
2218  MergeList.sort(
2219  [] (const CombineInfo &A, const CombineInfo &B) {
2220  return A.Offset < B.Offset;
2221  });
2222  ++I;
2223  }
2224 
2225  return std::make_pair(BlockI, Modified);
2226 }
2227 
2228 // Scan through looking for adjacent LDS operations with constant offsets from
2229 // the same base register. We rely on the scheduler to do the hard work of
2230 // clustering nearby loads, and assume these are all adjacent.
2231 bool SILoadStoreOptimizer::optimizeBlock(
2232  std::list<std::list<CombineInfo> > &MergeableInsts) {
2233  bool Modified = false;
2234 
2235  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2236  E = MergeableInsts.end(); I != E;) {
2237  std::list<CombineInfo> &MergeList = *I;
2238 
2239  bool OptimizeListAgain = false;
2240  if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2241  // We weren't able to make any changes, so delete the list so we don't
2242  // process the same instructions the next time we try to optimize this
2243  // block.
2244  I = MergeableInsts.erase(I);
2245  continue;
2246  }
2247 
2248  Modified = true;
2249 
2250  // We made changes, but also determined that there were no more optimization
2251  // opportunities, so we don't need to reprocess the list.
2252  if (!OptimizeListAgain) {
2253  I = MergeableInsts.erase(I);
2254  continue;
2255  }
2256  OptimizeAgain = true;
2257  }
2258  return Modified;
2259 }
2260 
2261 bool
2262 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2263  std::list<CombineInfo> &MergeList,
2264  bool &OptimizeListAgain) {
2265  if (MergeList.empty())
2266  return false;
2267 
2268  bool Modified = false;
2269 
2270  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2271  Next = std::next(I)) {
2272 
2273  auto First = I;
2274  auto Second = Next;
2275 
2276  if ((*First).Order > (*Second).Order)
2277  std::swap(First, Second);
2278  CombineInfo &CI = *First;
2279  CombineInfo &Paired = *Second;
2280 
2281  CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2282  if (!Where) {
2283  ++I;
2284  continue;
2285  }
2286 
2287  Modified = true;
2288 
2289  LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2290 
2291  MachineBasicBlock::iterator NewMI;
2292  switch (CI.InstClass) {
2293  default:
2294  llvm_unreachable("unknown InstClass");
2295  break;
2296  case DS_READ:
2297  NewMI = mergeRead2Pair(CI, Paired, Where->I);
2298  break;
2299  case DS_WRITE:
2300  NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2301  break;
2302  case S_BUFFER_LOAD_IMM:
2303  NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
2304  OptimizeListAgain |= CI.Width + Paired.Width < 8;
2305  break;
2306  case BUFFER_LOAD:
2307  NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2308  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2309  break;
2310  case BUFFER_STORE:
2311  NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2312  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2313  break;
2314  case MIMG:
2315  NewMI = mergeImagePair(CI, Paired, Where->I);
2316  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2317  break;
2318  case TBUFFER_LOAD:
2319  NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2320  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2321  break;
2322  case TBUFFER_STORE:
2323  NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2324  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2325  break;
2326  case FLAT_LOAD:
2327  case GLOBAL_LOAD:
2328  case GLOBAL_LOAD_SADDR:
2329  NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2330  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2331  break;
2332  case FLAT_STORE:
2333  case GLOBAL_STORE:
2334  case GLOBAL_STORE_SADDR:
2335  NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2336  OptimizeListAgain |= CI.Width + Paired.Width < 4;
2337  break;
2338  }
2339  CI.setMI(NewMI, *this);
2340  CI.Order = Where->Order;
2341  if (I == Second)
2342  I = Next;
2343 
2344  MergeList.erase(Second);
2345  }
2346 
2347  return Modified;
2348 }
2349 
2350 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2351  if (skipFunction(MF.getFunction()))
2352  return false;
2353 
2354  STM = &MF.getSubtarget<GCNSubtarget>();
2355  if (!STM->loadStoreOptEnabled())
2356  return false;
2357 
2358  TII = STM->getInstrInfo();
2359  TRI = &TII->getRegisterInfo();
2360 
2361  MRI = &MF.getRegInfo();
2362  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2363 
2364  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2365 
2366  bool Modified = false;
2367 
2368  // Contains the list of instructions for which constant offsets are being
2369  // promoted to the IMM. This is tracked for an entire block at a time.
2370  SmallPtrSet<MachineInstr *, 4> AnchorList;
2371  MemInfoMap Visited;
2372 
2373  for (MachineBasicBlock &MBB : MF) {
2374  MachineBasicBlock::iterator SectionEnd;
2375  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2376  I = SectionEnd) {
2377  bool CollectModified;
2378  std::list<std::list<CombineInfo>> MergeableInsts;
2379 
2380  // First pass: Collect list of all instructions we know how to merge in a
2381  // subset of the block.
2382  std::tie(SectionEnd, CollectModified) =
2383  collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2384 
2385  Modified |= CollectModified;
2386 
2387  do {
2388  OptimizeAgain = false;
2389  Modified |= optimizeBlock(MergeableInsts);
2390  } while (OptimizeAgain);
2391  }
2392 
2393  Visited.clear();
2394  AnchorList.clear();
2395  }
2396 
2397  return Modified;
2398 }