SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes that 13-bit
25 // offset to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42 //
43 // Future improvements:
44 //
45 // - This currently relies on the scheduler to place loads and stores next to
46 // each other, and then only merges adjacent pairs of instructions. It would
47 // be good to be more flexible with interleaved instructions, and possibly run
48 // before scheduling. It currently misses stores of constants because loading
49 // the constant into the data register is placed between the stores, although
50 // this is arguably a scheduling problem.
51 //
52 // - Live interval recomputing seems inefficient. This currently only matches
53 // one pair, and recomputes live intervals and moves on to the next pair. It
54 // would be better to compute a list of all merges that need to occur.
55 //
56 // - With a list of instructions to process, we can also merge more. If a
57 // cluster of loads has offsets that are too large to fit in the 8-bit
58 // offset fields but are close enough to each other, we can add to the base
59 // pointer and use the new reduced offsets.
60 //
61 //===----------------------------------------------------------------------===//
62 
63 #include "AMDGPU.h"
64 #include "AMDGPUSubtarget.h"
65 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
72 #include "llvm/Analysis/AliasAnalysis.h"
73 #include "llvm/CodeGen/MachineBasicBlock.h"
74 #include "llvm/CodeGen/MachineFunction.h"
75 #include "llvm/CodeGen/MachineFunctionPass.h"
76 #include "llvm/CodeGen/MachineInstr.h"
77 #include "llvm/CodeGen/MachineInstrBuilder.h"
78 #include "llvm/CodeGen/MachineOperand.h"
79 #include "llvm/CodeGen/MachineRegisterInfo.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/MathExtras.h"
84 #include "llvm/Support/raw_ostream.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <cstdlib>
88 #include <iterator>
89 #include <utility>
90 
91 using namespace llvm;
92 
93 #define DEBUG_TYPE "si-load-store-opt"
94 
95 namespace {
96 enum InstClassEnum {
97  UNKNOWN,
98  DS_READ,
99  DS_WRITE,
100  S_BUFFER_LOAD_IMM,
101  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109 };
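// The BUFFER_* enumerators above intentionally alias their single-dword MUBUF
// opcodes, so an InstClass value can be passed straight to
// AMDGPU::getMUBUFOpcode() as the base opcode when getNewOpcode() picks the
// widened instruction.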
110 
111 enum RegisterEnum {
112  SBASE = 0x1,
113  SRSRC = 0x2,
114  SOFFSET = 0x4,
115  VADDR = 0x8,
116  ADDR = 0x10,
117 };
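// The values above are bit flags describing which address operands an opcode
// carries; getRegs() ORs them together and findMatchingInst() uses the
// resulting mask to collect the corresponding named operands.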
118 
119 class SILoadStoreOptimizer : public MachineFunctionPass {
120  struct CombineInfo {
121  MachineBasicBlock::iterator I;
122  MachineBasicBlock::iterator Paired;
123  unsigned EltSize;
124  unsigned Offset0;
125  unsigned Offset1;
126  unsigned Width0;
127  unsigned Width1;
128  unsigned BaseOff;
129  InstClassEnum InstClass;
130  bool GLC0;
131  bool GLC1;
132  bool SLC0;
133  bool SLC1;
134  bool UseST64;
135  SmallVector<MachineInstr *, 8> InstsToMove;
136  };
137 
138  struct BaseRegisters {
139  unsigned LoReg = 0;
140  unsigned HiReg = 0;
141 
142  unsigned LoSubReg = 0;
143  unsigned HiSubReg = 0;
144  };
145 
146  struct MemAddress {
147  BaseRegisters Base;
148  int64_t Offset = 0;
149  };
150 
151  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
152 
153 private:
154  const GCNSubtarget *STM = nullptr;
155  const SIInstrInfo *TII = nullptr;
156  const SIRegisterInfo *TRI = nullptr;
157  MachineRegisterInfo *MRI = nullptr;
158  AliasAnalysis *AA = nullptr;
159  bool OptimizeAgain;
160 
161  static bool offsetsCanBeCombined(CombineInfo &CI);
162  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
163  static unsigned getNewOpcode(const CombineInfo &CI);
164  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
165  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
166  unsigned getOpcodeWidth(const MachineInstr &MI);
167  InstClassEnum getInstClass(unsigned Opc);
168  unsigned getRegs(unsigned Opc);
169 
170  bool findMatchingInst(CombineInfo &CI);
171 
172  unsigned read2Opcode(unsigned EltSize) const;
173  unsigned read2ST64Opcode(unsigned EltSize) const;
174  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
175 
176  unsigned write2Opcode(unsigned EltSize) const;
177  unsigned write2ST64Opcode(unsigned EltSize) const;
178  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
179  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
180  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
182 
183  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
184  int32_t NewOffset);
185  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
186  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
187  Optional<int32_t> extractConstOffset(const MachineOperand &Op);
188  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
189  /// Promotes constant offset to the immediate by adjusting the base. It
190  /// tries to use a base from the nearby instructions that allows it to have
191  /// a 13-bit constant offset which gets promoted to the immediate.
192  bool promoteConstantOffsetToImm(MachineInstr &CI,
193  MemInfoMap &Visited,
194  SmallPtrSet<MachineInstr *, 4> &AnchorList);
195 
196 public:
197  static char ID;
198 
199  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
200  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
201  }
202 
203  bool optimizeBlock(MachineBasicBlock &MBB);
204 
205  bool runOnMachineFunction(MachineFunction &MF) override;
206 
207  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
208 
209  void getAnalysisUsage(AnalysisUsage &AU) const override {
210  AU.setPreservesCFG();
211  AU.addRequired<AAResultsWrapperPass>();
212 
213  MachineFunctionPass::getAnalysisUsage(AU);
214  }
215 };
216 
217 } // end anonymous namespace.
218 
219 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
220  "SI Load Store Optimizer", false, false)
221 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
222 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
223  false, false)
224 
225 char SILoadStoreOptimizer::ID = 0;
226 
227 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
228 
229 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
230  return new SILoadStoreOptimizer();
231 }
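// Usage sketch (illustrative, not part of this file): a pass pipeline can
// schedule this pass either by its ID or through the factory above, e.g.
//   addPass(&SILoadStoreOptimizerID);           // by analysis ID
//   addPass(createSILoadStoreOptimizerPass());  // or via the factory
// The exact insertion point in the AMDGPU pipeline is not shown here.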
232 
233 static void moveInstsAfter(MachineBasicBlock::iterator I,
234  ArrayRef<MachineInstr *> InstsToMove) {
235  MachineBasicBlock *MBB = I->getParent();
236  ++I;
237  for (MachineInstr *MI : InstsToMove) {
238  MI->removeFromParent();
239  MBB->insert(I, MI);
240  }
241 }
242 
243 static void addDefsUsesToList(const MachineInstr &MI,
244  DenseSet<unsigned> &RegDefs,
245  DenseSet<unsigned> &PhysRegUses) {
246  for (const MachineOperand &Op : MI.operands()) {
247  if (Op.isReg()) {
248  if (Op.isDef())
249  RegDefs.insert(Op.getReg());
250  else if (Op.readsReg() &&
251  TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
252  PhysRegUses.insert(Op.getReg());
253  }
254  }
255 }
256 
257 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
258  MachineBasicBlock::iterator B,
259  AliasAnalysis *AA) {
260  // RAW or WAR - cannot reorder
261  // WAW - cannot reorder
262  // RAR - safe to reorder
263  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
264 }
265 
266 // Add MI and its defs to the lists if MI reads one of the defs that are
267 // already in the list. Returns true in that case.
268 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
269  DenseSet<unsigned> &PhysRegUses,
270  SmallVectorImpl<MachineInstr *> &Insts) {
271  for (MachineOperand &Use : MI.operands()) {
272  // If one of the defs is read, then there is a use of Def between I and the
273  // instruction that I will potentially be merged with. We will need to move
274  // this instruction after the merged instructions.
275  //
276  // Similarly, if there is a def which is read by an instruction that is to
277  // be moved for merging, then we need to move the def-instruction as well.
278  // This can only happen for physical registers such as M0; virtual
279  // registers are in SSA form.
280  if (Use.isReg() &&
281  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
282  (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
283  PhysRegUses.count(Use.getReg())))) {
284  Insts.push_back(&MI);
285  addDefsUsesToList(MI, RegDefs, PhysRegUses);
286  return true;
287  }
288  }
289 
290  return false;
291 }
292 
293 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
294  ArrayRef<MachineInstr *> InstsToMove,
295  AliasAnalysis *AA) {
296  assert(MemOp.mayLoadOrStore());
297 
298  for (MachineInstr *InstToMove : InstsToMove) {
299  if (!InstToMove->mayLoadOrStore())
300  continue;
301  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
302  return false;
303  }
304  return true;
305 }
306 
307 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
308  // XXX - Would the same offset be OK? Is there any reason this would happen or
309  // be useful?
310  if (CI.Offset0 == CI.Offset1)
311  return false;
312 
313  // This won't be valid if the offset isn't aligned.
314  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
315  return false;
316 
317  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
318  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
319  CI.UseST64 = false;
320  CI.BaseOff = 0;
321 
322  // Handle SMEM and VMEM instructions.
323  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
324  return (EltOffset0 + CI.Width0 == EltOffset1 ||
325  EltOffset1 + CI.Width1 == EltOffset0) &&
326  CI.GLC0 == CI.GLC1 &&
327  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
328  }
329 
330  // If the offset in elements doesn't fit in 8-bits, we might be able to use
331  // the stride 64 versions.
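  // For example (illustrative): two ds_read_b32 at byte offsets 0x4000 and
  // 0x8000 have element offsets 4096 and 8192, i.e. 64*64 and 64*128, so the
  // pair can be encoded as ds_read2st64_b32 with offset0:64 offset1:128.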
332  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
333  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
334  CI.Offset0 = EltOffset0 / 64;
335  CI.Offset1 = EltOffset1 / 64;
336  CI.UseST64 = true;
337  return true;
338  }
339 
340  // Check if the new offsets fit in the reduced 8-bit range.
341  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
342  CI.Offset0 = EltOffset0;
343  CI.Offset1 = EltOffset1;
344  return true;
345  }
346 
347  // Try to shift base address to decrease offsets.
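  // For example (illustrative): byte offsets 1024 and 1032 with EltSize == 4
  // give element offsets 256 and 258; neither fits in 8 bits, but their
  // difference does, so BaseOff becomes 1024 and the pair is encoded as
  // offset0:0 offset1:2 relative to the adjusted base.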
348  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
349  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
350 
351  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
352  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
353  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
354  CI.UseST64 = true;
355  return true;
356  }
357 
358  if (isUInt<8>(OffsetDiff)) {
359  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
360  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
361  return true;
362  }
363 
364  return false;
365 }
366 
367 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
368  const CombineInfo &CI) {
369  const unsigned Width = (CI.Width0 + CI.Width1);
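  // For the VMEM/MUBUF classes this permits merged widths of 2, 4, and (only
  // when the subtarget has dwordx3 load/stores) 3 dwords; S_BUFFER_LOAD_IMM
  // only merges to x2 or x4.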
370  switch (CI.InstClass) {
371  default:
372  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
373  case S_BUFFER_LOAD_IMM:
374  switch (Width) {
375  default:
376  return false;
377  case 2:
378  case 4:
379  return true;
380  }
381  }
382 }
383 
384 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
385  const unsigned Opc = MI.getOpcode();
386 
387  if (TII->isMUBUF(MI)) {
388  return AMDGPU::getMUBUFDwords(Opc);
389  }
390 
391  switch (Opc) {
392  default:
393  return 0;
394  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
395  return 1;
396  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
397  return 2;
398  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
399  return 4;
400  }
401 }
402 
403 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
404  if (TII->isMUBUF(Opc)) {
405  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
406 
407  // If we couldn't identify the opcode, bail out.
408  if (baseOpcode == -1) {
409  return UNKNOWN;
410  }
411 
412  switch (baseOpcode) {
413  default:
414  return UNKNOWN;
415  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
416  return BUFFER_LOAD_OFFEN;
417  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
418  return BUFFER_LOAD_OFFSET;
419  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
420  return BUFFER_STORE_OFFEN;
421  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
422  return BUFFER_STORE_OFFSET;
423  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
424  return BUFFER_LOAD_OFFEN_exact;
425  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
426  return BUFFER_LOAD_OFFSET_exact;
427  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
428  return BUFFER_STORE_OFFEN_exact;
429  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
430  return BUFFER_STORE_OFFSET_exact;
431  }
432  }
433 
434  switch (Opc) {
435  default:
436  return UNKNOWN;
437  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
438  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
439  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
440  return S_BUFFER_LOAD_IMM;
441  case AMDGPU::DS_READ_B32:
442  case AMDGPU::DS_READ_B64:
443  case AMDGPU::DS_READ_B32_gfx9:
444  case AMDGPU::DS_READ_B64_gfx9:
445  return DS_READ;
446  case AMDGPU::DS_WRITE_B32:
447  case AMDGPU::DS_WRITE_B64:
448  case AMDGPU::DS_WRITE_B32_gfx9:
449  case AMDGPU::DS_WRITE_B64_gfx9:
450  return DS_WRITE;
451  }
452 }
453 
454 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
455  if (TII->isMUBUF(Opc)) {
456  unsigned result = 0;
457 
458  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
459  result |= VADDR;
460  }
461 
462  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
463  result |= SRSRC;
464  }
465 
466  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
467  result |= SOFFSET;
468  }
469 
470  return result;
471  }
472 
473  switch (Opc) {
474  default:
475  return 0;
476  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
477  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
478  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
479  return SBASE;
480  case AMDGPU::DS_READ_B32:
481  case AMDGPU::DS_READ_B64:
482  case AMDGPU::DS_READ_B32_gfx9:
483  case AMDGPU::DS_READ_B64_gfx9:
484  case AMDGPU::DS_WRITE_B32:
485  case AMDGPU::DS_WRITE_B64:
486  case AMDGPU::DS_WRITE_B32_gfx9:
487  case AMDGPU::DS_WRITE_B64_gfx9:
488  return ADDR;
489  }
490 }
491 
492 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
493  MachineBasicBlock *MBB = CI.I->getParent();
494  MachineBasicBlock::iterator E = MBB->end();
495  MachineBasicBlock::iterator MBBI = CI.I;
496 
497  const unsigned Opc = CI.I->getOpcode();
498  const InstClassEnum InstClass = getInstClass(Opc);
499 
500  if (InstClass == UNKNOWN) {
501  return false;
502  }
503 
504  const unsigned Regs = getRegs(Opc);
505 
506  unsigned AddrOpName[5] = {0};
507  int AddrIdx[5];
508  const MachineOperand *AddrReg[5];
509  unsigned NumAddresses = 0;
510 
511  if (Regs & ADDR) {
512  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
513  }
514 
515  if (Regs & SBASE) {
516  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
517  }
518 
519  if (Regs & SRSRC) {
520  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
521  }
522 
523  if (Regs & SOFFSET) {
524  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
525  }
526 
527  if (Regs & VADDR) {
528  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
529  }
530 
531  for (unsigned i = 0; i < NumAddresses; i++) {
532  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
533  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
534 
535  // We only ever merge operations with the same base address register, so
536  // don't bother scanning forward if there are no other uses.
537  if (AddrReg[i]->isReg() &&
538  (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
539  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
540  return false;
541  }
542 
543  ++MBBI;
544 
545  DenseSet<unsigned> RegDefsToMove;
546  DenseSet<unsigned> PhysRegUsesToMove;
547  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
548 
549  for (; MBBI != E; ++MBBI) {
550  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
551 
552  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
553  (IsDS && (MBBI->getOpcode() != Opc))) {
554  // This is not a matching DS instruction, but we can keep looking as
555  // long as one of these conditions are met:
556  // 1. It is safe to move I down past MBBI.
557  // 2. It is safe to move MBBI down past the instruction that I will
558  // be merged into.
559 
560  if (MBBI->hasUnmodeledSideEffects()) {
561  // We can't re-order this instruction with respect to other memory
562  // operations, so we fail both conditions mentioned above.
563  return false;
564  }
565 
566  if (MBBI->mayLoadOrStore() &&
567  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
568  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
569  // We fail condition #1, but we may still be able to satisfy condition
570  // #2. Add this instruction to the move list and then we will check
571  // if condition #2 holds once we have selected the matching instruction.
572  CI.InstsToMove.push_back(&*MBBI);
573  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
574  continue;
575  }
576 
577  // When we match I with another DS instruction we will be moving I down
578  // to the location of the matched instruction; any uses of I will need to
579  // be moved down as well.
580  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
581  CI.InstsToMove);
582  continue;
583  }
584 
585  // Don't merge volatiles.
586  if (MBBI->hasOrderedMemoryRef())
587  return false;
588 
589  // Handle a case like
590  // DS_WRITE_B32 addr, v, idx0
591  // w = DS_READ_B32 addr, idx0
592  // DS_WRITE_B32 addr, f(w), idx1
593  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
594  // merging of the two writes.
595  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
596  CI.InstsToMove))
597  continue;
598 
599  bool Match = true;
600  for (unsigned i = 0; i < NumAddresses; i++) {
601  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
602 
603  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
604  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
605  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
606  Match = false;
607  break;
608  }
609  continue;
610  }
611 
612  // Check same base pointer. Be careful of subregisters, which can occur
613  // with vectors of pointers.
614  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
615  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
616  Match = false;
617  break;
618  }
619  }
620 
621  if (Match) {
622  int OffsetIdx =
623  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
624  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
625  CI.Width0 = getOpcodeWidth(*CI.I);
626  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
627  CI.Width1 = getOpcodeWidth(*MBBI);
628  CI.Paired = MBBI;
629 
630  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
631  CI.Offset0 &= 0xffff;
632  CI.Offset1 &= 0xffff;
633  } else {
634  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
635  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
636  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
637  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
638  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
639  }
640  }
641 
642  // Check both offsets fit in the reduced range.
643  // We also need to go through the list of instructions that we plan to
644  // move and make sure they are all safe to move down past the merged
645  // instruction.
646  if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
647  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
648  return true;
649  }
650 
651  // We've found a load/store that we couldn't merge for some reason.
652  // We could potentially keep looking, but we'd need to make sure that
653  // it was safe to move I and also all the instructions in InstsToMove
654  // down past this instruction.
655  // Check if we can move I across MBBI and if we can move all I's users.
656  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
657  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
658  break;
659  }
660  return false;
661 }
662 
663 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
664  if (STM->ldsRequiresM0Init())
665  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
666  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
667 }
668 
669 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
670  if (STM->ldsRequiresM0Init())
671  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
672 
673  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
674  : AMDGPU::DS_READ2ST64_B64_gfx9;
675 }
676 
677 MachineBasicBlock::iterator
678 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
679  MachineBasicBlock *MBB = CI.I->getParent();
680 
681  // Be careful, since the addresses could be subregisters themselves in weird
682  // cases, like vectors of pointers.
683  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
684 
685  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
686  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
687 
688  unsigned NewOffset0 = CI.Offset0;
689  unsigned NewOffset1 = CI.Offset1;
690  unsigned Opc =
691  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
692 
693  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
694  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
695 
696  if (NewOffset0 > NewOffset1) {
697  // Canonicalize the merged instruction so the smaller offset comes first.
698  std::swap(NewOffset0, NewOffset1);
699  std::swap(SubRegIdx0, SubRegIdx1);
700  }
701 
702  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
703  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
704 
705  const MCInstrDesc &Read2Desc = TII->get(Opc);
706 
707  const TargetRegisterClass *SuperRC =
708  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
709  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
710 
711  DebugLoc DL = CI.I->getDebugLoc();
712 
713  unsigned BaseReg = AddrReg->getReg();
714  unsigned BaseSubReg = AddrReg->getSubReg();
715  unsigned BaseRegFlags = 0;
716  if (CI.BaseOff) {
717  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
718  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
719  .addImm(CI.BaseOff);
720 
721  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
722  BaseRegFlags = RegState::Kill;
723 
724  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
725  .addReg(ImmReg)
726  .addReg(AddrReg->getReg(), 0, BaseSubReg)
727  .addImm(0); // clamp bit
728  BaseSubReg = 0;
729  }
730 
731  MachineInstrBuilder Read2 =
732  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
733  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
734  .addImm(NewOffset0) // offset0
735  .addImm(NewOffset1) // offset1
736  .addImm(0) // gds
737  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
738 
739  (void)Read2;
740 
741  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
742 
743  // Copy to the old destination registers.
744  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
745  .add(*Dest0) // Copy to same destination including flags and sub reg.
746  .addReg(DestReg, 0, SubRegIdx0);
747  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
748  .add(*Dest1)
749  .addReg(DestReg, RegState::Kill, SubRegIdx1);
750 
751  moveInstsAfter(Copy1, CI.InstsToMove);
752 
753  MachineBasicBlock::iterator Next = std::next(CI.I);
754  CI.I->eraseFromParent();
755  CI.Paired->eraseFromParent();
756 
757  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
758  return Next;
759 }
760 
761 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
762  if (STM->ldsRequiresM0Init())
763  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
764  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
765  : AMDGPU::DS_WRITE2_B64_gfx9;
766 }
767 
768 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
769  if (STM->ldsRequiresM0Init())
770  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
771  : AMDGPU::DS_WRITE2ST64_B64;
772 
773  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
774  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
775 }
776 
777 MachineBasicBlock::iterator
778 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
779  MachineBasicBlock *MBB = CI.I->getParent();
780 
781  // Be sure to use .add(), and not .addReg(), with these. We want to be
782  // sure we preserve the subregister index and any register flags set on them.
783  const MachineOperand *AddrReg =
784  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
785  const MachineOperand *Data0 =
786  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
787  const MachineOperand *Data1 =
788  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
789 
790  unsigned NewOffset0 = CI.Offset0;
791  unsigned NewOffset1 = CI.Offset1;
792  unsigned Opc =
793  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
794 
795  if (NewOffset0 > NewOffset1) {
796  // Canonicalize the merged instruction so the smaller offset comes first.
797  std::swap(NewOffset0, NewOffset1);
798  std::swap(Data0, Data1);
799  }
800 
801  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
802  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
803 
804  const MCInstrDesc &Write2Desc = TII->get(Opc);
805  DebugLoc DL = CI.I->getDebugLoc();
806 
807  unsigned BaseReg = AddrReg->getReg();
808  unsigned BaseSubReg = AddrReg->getSubReg();
809  unsigned BaseRegFlags = 0;
810  if (CI.BaseOff) {
811  unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
812  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
813  .addImm(CI.BaseOff);
814 
815  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
816  BaseRegFlags = RegState::Kill;
817 
818  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
819  .addReg(ImmReg)
820  .addReg(AddrReg->getReg(), 0, BaseSubReg)
821  .addImm(0); // clamp bit
822  BaseSubReg = 0;
823  }
824 
825  MachineInstrBuilder Write2 =
826  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
827  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
828  .add(*Data0) // data0
829  .add(*Data1) // data1
830  .addImm(NewOffset0) // offset0
831  .addImm(NewOffset1) // offset1
832  .addImm(0) // gds
833  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
834 
835  moveInstsAfter(Write2, CI.InstsToMove);
836 
837  MachineBasicBlock::iterator Next = std::next(CI.I);
838  CI.I->eraseFromParent();
839  CI.Paired->eraseFromParent();
840 
841  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
842  return Next;
843 }
844 
845 MachineBasicBlock::iterator
846 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
847  MachineBasicBlock *MBB = CI.I->getParent();
848  DebugLoc DL = CI.I->getDebugLoc();
849  const unsigned Opcode = getNewOpcode(CI);
850 
851  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
852 
853  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
854  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
855 
856  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
857  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
858  .addImm(MergedOffset) // offset
859  .addImm(CI.GLC0) // glc
860  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
861 
862  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
863  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
864  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
865 
866  // Copy to the old destination registers.
867  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
868  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
869  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
870 
871  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
872  .add(*Dest0) // Copy to same destination including flags and sub reg.
873  .addReg(DestReg, 0, SubRegIdx0);
874  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
875  .add(*Dest1)
876  .addReg(DestReg, RegState::Kill, SubRegIdx1);
877 
878  moveInstsAfter(Copy1, CI.InstsToMove);
879 
880  MachineBasicBlock::iterator Next = std::next(CI.I);
881  CI.I->eraseFromParent();
882  CI.Paired->eraseFromParent();
883  return Next;
884 }
885 
886 MachineBasicBlock::iterator
887 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
888  MachineBasicBlock *MBB = CI.I->getParent();
889  DebugLoc DL = CI.I->getDebugLoc();
890 
891  const unsigned Opcode = getNewOpcode(CI);
892 
893  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
894 
895  // Copy to the new source register.
896  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
897  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
898 
899  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
900 
901  const unsigned Regs = getRegs(Opcode);
902 
903  if (Regs & VADDR)
904  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
905 
906  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
907  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
908  .addImm(MergedOffset) // offset
909  .addImm(CI.GLC0) // glc
910  .addImm(CI.SLC0) // slc
911  .addImm(0) // tfe
912  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
913 
914  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
915  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
916  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
917 
918  // Copy to the old destination registers.
919  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
920  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
921  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
922 
923  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
924  .add(*Dest0) // Copy to same destination including flags and sub reg.
925  .addReg(DestReg, 0, SubRegIdx0);
926  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
927  .add(*Dest1)
928  .addReg(DestReg, RegState::Kill, SubRegIdx1);
929 
930  moveInstsAfter(Copy1, CI.InstsToMove);
931 
932  MachineBasicBlock::iterator Next = std::next(CI.I);
933  CI.I->eraseFromParent();
934  CI.Paired->eraseFromParent();
935  return Next;
936 }
937 
938 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
939  const unsigned Width = CI.Width0 + CI.Width1;
940 
941  switch (CI.InstClass) {
942  default:
943  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
944  case UNKNOWN:
945  llvm_unreachable("Unknown instruction class");
946  case S_BUFFER_LOAD_IMM:
947  switch (Width) {
948  default:
949  return 0;
950  case 2:
951  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
952  case 4:
953  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
954  }
955  }
956 }
957 
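// Returns the subregister indices that the two original destinations (or store
// sources) occupy within the merged register, ordered so that the access with
// the lower offset takes the lower subregisters. For example (illustrative):
// Width0 == 2 and Width1 == 1 with Offset0 < Offset1 yields (sub0_sub1, sub2).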
958 std::pair<unsigned, unsigned>
959 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
960  if (CI.Offset0 > CI.Offset1) {
961  switch (CI.Width0) {
962  default:
963  return std::make_pair(0, 0);
964  case 1:
965  switch (CI.Width1) {
966  default:
967  return std::make_pair(0, 0);
968  case 1:
969  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
970  case 2:
971  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
972  case 3:
973  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
974  }
975  case 2:
976  switch (CI.Width1) {
977  default:
978  return std::make_pair(0, 0);
979  case 1:
980  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
981  case 2:
982  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
983  }
984  case 3:
985  switch (CI.Width1) {
986  default:
987  return std::make_pair(0, 0);
988  case 1:
989  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
990  }
991  }
992  } else {
993  switch (CI.Width0) {
994  default:
995  return std::make_pair(0, 0);
996  case 1:
997  switch (CI.Width1) {
998  default:
999  return std::make_pair(0, 0);
1000  case 1:
1001  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1002  case 2:
1003  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1004  case 3:
1005  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1006  }
1007  case 2:
1008  switch (CI.Width1) {
1009  default:
1010  return std::make_pair(0, 0);
1011  case 1:
1012  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1013  case 2:
1014  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1015  }
1016  case 3:
1017  switch (CI.Width1) {
1018  default:
1019  return std::make_pair(0, 0);
1020  case 1:
1021  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1022  }
1023  }
1024  }
1025 }
1026 
1027 const TargetRegisterClass *
1028 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1029  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1030  switch (CI.Width0 + CI.Width1) {
1031  default:
1032  return nullptr;
1033  case 2:
1034  return &AMDGPU::SReg_64_XEXECRegClass;
1035  case 4:
1036  return &AMDGPU::SReg_128RegClass;
1037  case 8:
1038  return &AMDGPU::SReg_256RegClass;
1039  case 16:
1040  return &AMDGPU::SReg_512RegClass;
1041  }
1042  } else {
1043  switch (CI.Width0 + CI.Width1) {
1044  default:
1045  return nullptr;
1046  case 2:
1047  return &AMDGPU::VReg_64RegClass;
1048  case 3:
1049  return &AMDGPU::VReg_96RegClass;
1050  case 4:
1051  return &AMDGPU::VReg_128RegClass;
1052  }
1053  }
1054 }
1055 
1056 MachineBasicBlock::iterator
1057 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1058  MachineBasicBlock *MBB = CI.I->getParent();
1059  DebugLoc DL = CI.I->getDebugLoc();
1060 
1061  const unsigned Opcode = getNewOpcode(CI);
1062 
1063  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1064  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1065  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1066 
1067  // Copy to the new source register.
1068  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1069  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
1070 
1071  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1072  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1073 
1074  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1075  .add(*Src0)
1076  .addImm(SubRegIdx0)
1077  .add(*Src1)
1078  .addImm(SubRegIdx1);
1079 
1080  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1081  .addReg(SrcReg, RegState::Kill);
1082 
1083  const unsigned Regs = getRegs(Opcode);
1084 
1085  if (Regs & VADDR)
1086  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1087 
1088  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1089  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1090  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1091  .addImm(CI.GLC0) // glc
1092  .addImm(CI.SLC0) // slc
1093  .addImm(0) // tfe
1094  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
1095 
1096  moveInstsAfter(MIB, CI.InstsToMove);
1097 
1098  MachineBasicBlock::iterator Next = std::next(CI.I);
1099  CI.I->eraseFromParent();
1100  CI.Paired->eraseFromParent();
1101  return Next;
1102 }
1103 
1104 MachineOperand
1105 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
1106  APInt V(32, Val, true);
1107  if (TII->isInlineConstant(V))
1108  return MachineOperand::CreateImm(Val);
1109 
1110  unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1111  MachineInstr *Mov =
1112  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1113  TII->get(AMDGPU::S_MOV_B32), Reg)
1114  .addImm(Val);
1115  (void)Mov;
1116  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1117  return MachineOperand::CreateReg(Reg, false);
1118 }
1119 
1120 // Compute base address using Addr and return the final register.
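// The new base is materialized immediately before MI as a 64-bit add of
// Addr.Offset: a V_ADD_I32_e64 for the low half (defining a carry), a
// V_ADDC_U32_e64 for the high half consuming that carry, and a REG_SEQUENCE
// combining both halves into a new vreg_64.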
1121 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1122  const MemAddress &Addr) {
1123  MachineBasicBlock *MBB = MI.getParent();
1124  MachineBasicBlock::iterator MBBI = MI.getIterator();
1125  DebugLoc DL = MI.getDebugLoc();
1126 
1127  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1128  Addr.Base.LoSubReg) &&
1129  "Expected 32-bit Base-Register-Low!!");
1130 
1131  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1132  Addr.Base.HiSubReg) &&
1133  "Expected 32-bit Base-Register-Hi!!");
1134 
1135  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1136  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1137  MachineOperand OffsetHi =
1138  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1139  unsigned CarryReg = MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1140  unsigned DeadCarryReg =
1141  MRI->createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
1142 
1143  unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1144  unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1145  MachineInstr *LoHalf =
1146  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1147  .addReg(CarryReg, RegState::Define)
1148  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1149  .add(OffsetLo)
1150  .addImm(0); // clamp bit
1151  (void)LoHalf;
1152  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1153 
1154  MachineInstr *HiHalf =
1155  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1156  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1157  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1158  .add(OffsetHi)
1159  .addReg(CarryReg, RegState::Kill)
1160  .addImm(0); // clamp bit
1161  (void)HiHalf;
1162  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1163 
1164  unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1165  MachineInstr *FullBase =
1166  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1167  .addReg(DestSub0)
1168  .addImm(AMDGPU::sub0)
1169  .addReg(DestSub1)
1170  .addImm(AMDGPU::sub1);
1171  (void)FullBase;
1172  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1173 
1174  return FullDestReg;
1175 }
1176 
1177 // Update base and offset with the NewBase and NewOffset in MI.
1178 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1179  unsigned NewBase,
1180  int32_t NewOffset) {
1181  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1182  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1183 }
1184 
1185 Optional<int32_t>
1186 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
1187  if (Op.isImm())
1188  return Op.getImm();
1189 
1190  if (!Op.isReg())
1191  return None;
1192 
1193  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1194  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1195  !Def->getOperand(1).isImm())
1196  return None;
1197 
1198  return Def->getOperand(1).getImm();
1199 }
1200 
1201 // Analyze Base and extract:
1202 // - 32-bit base registers and subregisters
1203 // - 64-bit constant offset
1204 // Expecting base computation as:
1205 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1206 // %LO:vgpr_32, %c:sreg_64_xexec =
1207 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1208 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1209 // %Base:vreg_64 =
1210 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1211 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1212  MemAddress &Addr) {
1213  if (!Base.isReg())
1214  return;
1215 
1216  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1217  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1218  || Def->getNumOperands() != 5)
1219  return;
1220 
1221  MachineOperand BaseLo = Def->getOperand(1);
1222  MachineOperand BaseHi = Def->getOperand(3);
1223  if (!BaseLo.isReg() || !BaseHi.isReg())
1224  return;
1225 
1226  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1227  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1228 
1229  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1230  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1231  return;
1232 
1233  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1234  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1235 
1236  auto Offset0P = extractConstOffset(*Src0);
1237  if (Offset0P)
1238  BaseLo = *Src1;
1239  else {
1240  if (!(Offset0P = extractConstOffset(*Src1)))
1241  return;
1242  BaseLo = *Src0;
1243  }
1244 
1245  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1246  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1247 
1248  if (Src0->isImm())
1249  std::swap(Src0, Src1);
1250 
1251  if (!Src1->isImm())
1252  return;
1253 
1254  uint64_t Offset1 = Src1->getImm();
1255  BaseHi = *Src0;
1256 
1257  Addr.Base.LoReg = BaseLo.getReg();
1258  Addr.Base.HiReg = BaseHi.getReg();
1259  Addr.Base.LoSubReg = BaseLo.getSubReg();
1260  Addr.Base.HiSubReg = BaseHi.getSubReg();
1261  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1262 }
1263 
1264 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1265  MachineInstr &MI,
1266  MemInfoMap &Visited,
1267  SmallPtrSet<MachineInstr *, 4> &AnchorList) {
1268 
1269  // TODO: Support flat and scratch.
1270  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
1271  TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1272  return false;
1273 
1274  // TODO: Support Store.
1275  if (!MI.mayLoad())
1276  return false;
1277 
1278  if (AnchorList.count(&MI))
1279  return false;
1280 
1281  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1282 
1283  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1284  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1285  return false;
1286  }
1287 
1288  // Step1: Find the base registers and a 64-bit constant offset.
1289  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1290  MemAddress MAddr;
1291  if (Visited.find(&MI) == Visited.end()) {
1292  processBaseWithConstOffset(Base, MAddr);
1293  Visited[&MI] = MAddr;
1294  } else
1295  MAddr = Visited[&MI];
1296 
1297  if (MAddr.Offset == 0) {
1298  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1299  " constant offsets that can be promoted.\n";);
1300  return false;
1301  }
1302 
1303  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1304  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1305 
1306  // Step2: Traverse through MI's basic block and find an anchor (that has the
1307  // same base registers) with the highest 13-bit distance from MI's offset.
1308  // E.g. (64bit loads)
1309  // bb:
1310  // addr1 = &a + 4096; load1 = load(addr1, 0)
1311  // addr2 = &a + 6144; load2 = load(addr2, 0)
1312  // addr3 = &a + 8192; load3 = load(addr3, 0)
1313  // addr4 = &a + 10240; load4 = load(addr4, 0)
1314  // addr5 = &a + 12288; load5 = load(addr5, 0)
1315  //
1316  // Starting from the first load, the optimization will try to find a new base
1317  // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1318  // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1319  // as the new base (anchor) because the maximum distance can presumably
1320  // accommodate more intermediate bases.
1321  //
1322  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1323  // (&a + 8192) for load1, load2, load4.
1324  // addr = &a + 8192
1325  // load1 = load(addr, -4096)
1326  // load2 = load(addr, -2048)
1327  // load3 = load(addr, 0)
1328  // load4 = load(addr, 2048)
1329  // addr5 = &a + 12288; load5 = load(addr5, 0)
1330  //
1331  MachineInstr *AnchorInst = nullptr;
1332  MemAddress AnchorAddr;
1333  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1334  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1335 
1336  MachineBasicBlock *MBB = MI.getParent();
1337  MachineBasicBlock::iterator E = MBB->end();
1338  MachineBasicBlock::iterator MBBI = MI.getIterator();
1339  ++MBBI;
1340  const SITargetLowering *TLI =
1341  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1342 
1343  for ( ; MBBI != E; ++MBBI) {
1344  MachineInstr &MINext = *MBBI;
1345  // TODO: Support finding an anchor(with same base) from store addresses or
1346  // any other load addresses where the opcodes are different.
1347  if (MINext.getOpcode() != MI.getOpcode() ||
1348  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1349  continue;
1350 
1351  const MachineOperand &BaseNext =
1352  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1353  MemAddress MAddrNext;
1354  if (Visited.find(&MINext) == Visited.end()) {
1355  processBaseWithConstOffset(BaseNext, MAddrNext);
1356  Visited[&MINext] = MAddrNext;
1357  } else
1358  MAddrNext = Visited[&MINext];
1359 
1360  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1361  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1362  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1363  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1364  continue;
1365 
1366  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1367 
1368  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1369  TargetLoweringBase::AddrMode AM;
1370  AM.HasBaseReg = true;
1371  AM.BaseOffs = Dist;
1372  if (TLI->isLegalGlobalAddressingMode(AM) &&
1373  (uint32_t)std::abs(Dist) > MaxDist) {
1374  MaxDist = std::abs(Dist);
1375 
1376  AnchorAddr = MAddrNext;
1377  AnchorInst = &MINext;
1378  }
1379  }
1380 
1381  if (AnchorInst) {
1382  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1383  AnchorInst->dump());
1384  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1385  << AnchorAddr.Offset << "\n\n");
1386 
1387  // Instead of moving up, just re-compute anchor-instruction's base address.
1388  unsigned Base = computeBase(MI, AnchorAddr);
1389 
1390  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1391  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1392 
1393  for (auto P : InstsWCommonBase) {
1394  TargetLoweringBase::AddrMode AM;
1395  AM.HasBaseReg = true;
1396  AM.BaseOffs = P.second - AnchorAddr.Offset;
1397 
1398  if (TLI->isLegalGlobalAddressingMode(AM)) {
1399  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1400  dbgs() << ")"; P.first->dump());
1401  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1402  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1403  }
1404  }
1405  AnchorList.insert(AnchorInst);
1406  return true;
1407  }
1408 
1409  return false;
1410 }
1411 
1412 // Scan through looking for adjacent LDS operations with constant offsets from
1413 // the same base register. We rely on the scheduler to do the hard work of
1414 // clustering nearby loads, and assume these are all adjacent.
1415 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1416  bool Modified = false;
1417 
1418  // Caches the base registers and constant offset computed for each visited instruction.
1419  MemInfoMap Visited;
1420  // Contains the list of instructions for which constant offsets are being
1421  // promoted to the IMM.
1422  SmallPtrSet<MachineInstr *, 4> AnchorList;
1423 
1424  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1425  MachineInstr &MI = *I;
1426 
1427  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1428  Modified = true;
1429 
1430  // Don't combine if volatile.
1431  if (MI.hasOrderedMemoryRef()) {
1432  ++I;
1433  continue;
1434  }
1435 
1436  const unsigned Opc = MI.getOpcode();
1437 
1438  CombineInfo CI;
1439  CI.I = I;
1440  CI.InstClass = getInstClass(Opc);
1441 
1442  switch (CI.InstClass) {
1443  default:
1444  break;
1445  case DS_READ:
1446  CI.EltSize =
1447  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1448  : 4;
1449  if (findMatchingInst(CI)) {
1450  Modified = true;
1451  I = mergeRead2Pair(CI);
1452  } else {
1453  ++I;
1454  }
1455  continue;
1456  case DS_WRITE:
1457  CI.EltSize =
1458  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1459  : 4;
1460  if (findMatchingInst(CI)) {
1461  Modified = true;
1462  I = mergeWrite2Pair(CI);
1463  } else {
1464  ++I;
1465  }
1466  continue;
1467  case S_BUFFER_LOAD_IMM:
1468  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1469  if (findMatchingInst(CI)) {
1470  Modified = true;
1471  I = mergeSBufferLoadImmPair(CI);
1472  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1473  } else {
1474  ++I;
1475  }
1476  continue;
1477  case BUFFER_LOAD_OFFEN:
1478  case BUFFER_LOAD_OFFSET:
1479  case BUFFER_LOAD_OFFEN_exact:
1480  case BUFFER_LOAD_OFFSET_exact:
1481  CI.EltSize = 4;
1482  if (findMatchingInst(CI)) {
1483  Modified = true;
1484  I = mergeBufferLoadPair(CI);
1485  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1486  } else {
1487  ++I;
1488  }
1489  continue;
1490  case BUFFER_STORE_OFFEN:
1491  case BUFFER_STORE_OFFSET:
1492  case BUFFER_STORE_OFFEN_exact:
1493  case BUFFER_STORE_OFFSET_exact:
1494  CI.EltSize = 4;
1495  if (findMatchingInst(CI)) {
1496  Modified = true;
1497  I = mergeBufferStorePair(CI);
1498  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1499  } else {
1500  ++I;
1501  }
1502  continue;
1503  }
1504 
1505  ++I;
1506  }
1507 
1508  return Modified;
1509 }
1510 
1511 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1512  if (skipFunction(MF.getFunction()))
1513  return false;
1514 
1515  STM = &MF.getSubtarget<GCNSubtarget>();
1516  if (!STM->loadStoreOptEnabled())
1517  return false;
1518 
1519  TII = STM->getInstrInfo();
1520  TRI = &TII->getRegisterInfo();
1521 
1522  MRI = &MF.getRegInfo();
1523  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1524 
1525  assert(MRI->isSSA() && "Must be run on SSA");
1526 
1527  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1528 
1529  bool Modified = false;
1530 
1531  for (MachineBasicBlock &MBB : MF) {
1532  do {
1533  OptimizeAgain = false;
1534  Modified |= optimizeBlock(MBB);
1535  } while (OptimizeAgain);
1536  }
1537 
1538  return Modified;
1539 }