SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
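// (Illustrative note: the byte offsets 16 and 32 become offset0:4 and offset1:8
// because ds_read2_b32 encodes its offsets in 4-byte elements: 16/4 = 4, 32/4 = 8.)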
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote constant offset to the immediate by
23 // adjusting the base. It tries to use a base from the nearby instructions that
24 // allows it to have a 13-bit constant offset and then promotes the 13-bit offset
25 // to the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
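// (Illustrative note: the two bases differ by 0x1800 - 0x1000 = 0x800 = 2048 bytes,
// which fits in the immediate offset field, so the v[0:1] address computation
// becomes dead and its load reuses v[5:6] with offset:2048.)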
42 //
43 // Future improvements:
44 //
45 // - This currently relies on the scheduler to place loads and stores next to
46 // each other, and then only merges adjacent pairs of instructions. It would
47 // be good to be more flexible with interleaved instructions, and possibly run
48 // before scheduling. It currently misses stores of constants because loading
49 // the constant into the data register is placed between the stores, although
50 // this is arguably a scheduling problem.
51 //
52 // - Live interval recomputing seems inefficient. This currently only matches
53 // one pair, and recomputes live intervals and moves on to the next pair. It
54 // would be better to compute a list of all merges that need to occur.
55 //
56 // - With a list of instructions to process, we can also merge more. If a
57 // cluster of loads have offsets that are too large to fit in the 8-bit
58 // offset fields, but are close enough to each other, we can add to the base
59 // pointer and use the new reduced offsets.
60 //
61 //===----------------------------------------------------------------------===//
62 
63 #include "AMDGPU.h"
64 #include "AMDGPUSubtarget.h"
66 #include "SIInstrInfo.h"
67 #include "SIRegisterInfo.h"
68 #include "Utils/AMDGPUBaseInfo.h"
69 #include "llvm/ADT/ArrayRef.h"
70 #include "llvm/ADT/SmallVector.h"
71 #include "llvm/ADT/StringRef.h"
80 #include "llvm/IR/DebugLoc.h"
81 #include "llvm/Pass.h"
82 #include "llvm/Support/Debug.h"
85 #include <algorithm>
86 #include <cassert>
87 #include <cstdlib>
88 #include <iterator>
89 #include <utility>
90 
91 using namespace llvm;
92 
93 #define DEBUG_TYPE "si-load-store-opt"
94 
95 namespace {
96 enum InstClassEnum {
97  UNKNOWN,
98  DS_READ,
99  DS_WRITE,
100  S_BUFFER_LOAD_IMM,
101  BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
102  BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
103  BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
104  BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
105  BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
106  BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
107  BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
108  BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
109 };
110 
111 enum RegisterEnum {
112  SBASE = 0x1,
113  SRSRC = 0x2,
114  SOFFSET = 0x4,
115  VADDR = 0x8,
116  ADDR = 0x10,
117 };
118 
119 class SILoadStoreOptimizer : public MachineFunctionPass {
120  struct CombineInfo {
121  MachineBasicBlock::iterator I;
122  MachineBasicBlock::iterator Paired;
123  unsigned EltSize;
124  unsigned Offset0;
125  unsigned Offset1;
126  unsigned Width0;
127  unsigned Width1;
128  unsigned BaseOff;
129  InstClassEnum InstClass;
130  bool GLC0;
131  bool GLC1;
132  bool SLC0;
133  bool SLC1;
134  bool DLC0;
135  bool DLC1;
136  bool UseST64;
137  SmallVector<MachineInstr *, 8> InstsToMove;
138  };
139 
140  struct BaseRegisters {
141  unsigned LoReg = 0;
142  unsigned HiReg = 0;
143 
144  unsigned LoSubReg = 0;
145  unsigned HiSubReg = 0;
146  };
147 
148  struct MemAddress {
149  BaseRegisters Base;
150  int64_t Offset = 0;
151  };
152 
153  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
154 
155 private:
156  const GCNSubtarget *STM = nullptr;
157  const SIInstrInfo *TII = nullptr;
158  const SIRegisterInfo *TRI = nullptr;
159  MachineRegisterInfo *MRI = nullptr;
160  AliasAnalysis *AA = nullptr;
161  bool OptimizeAgain;
162 
163  static bool offsetsCanBeCombined(CombineInfo &CI);
164  static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
165  static unsigned getNewOpcode(const CombineInfo &CI);
166  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
167  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
168  unsigned getOpcodeWidth(const MachineInstr &MI) const;
169  InstClassEnum getInstClass(unsigned Opc) const;
170  unsigned getRegs(unsigned Opc) const;
171 
172  bool findMatchingInst(CombineInfo &CI);
173 
174  unsigned read2Opcode(unsigned EltSize) const;
175  unsigned read2ST64Opcode(unsigned EltSize) const;
176  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
177 
178  unsigned write2Opcode(unsigned EltSize) const;
179  unsigned write2ST64Opcode(unsigned EltSize) const;
180  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
181  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
182  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
183  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
184 
185  void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
186  int32_t NewOffset) const;
187  unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
188  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
189  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
190  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
191  /// Promotes constant offset to the immediate by adjusting the base. It
192  /// tries to use a base from the nearby instructions that allows it to have
193 /// a 13-bit constant offset which gets promoted to the immediate.
194  bool promoteConstantOffsetToImm(MachineInstr &CI,
195  MemInfoMap &Visited,
196  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
197 
198 public:
199  static char ID;
200 
201  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
202  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
203  }
204 
205  bool optimizeBlock(MachineBasicBlock &MBB);
206 
207  bool runOnMachineFunction(MachineFunction &MF) override;
208 
209  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
210 
211  void getAnalysisUsage(AnalysisUsage &AU) const override {
212  AU.setPreservesCFG();
213  AU.addRequired<AAResultsWrapperPass>();
214 
215  MachineFunctionPass::getAnalysisUsage(AU);
216  }
217 };
218 
219 } // end anonymous namespace.
220 
221 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
222  "SI Load Store Optimizer", false, false)
225  false, false)
226 
227 char SILoadStoreOptimizer::ID = 0;
228 
229 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
230 
231 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
232  return new SILoadStoreOptimizer();
233 }
234 
235 static void moveInstsAfter(MachineBasicBlock::iterator I,
236  ArrayRef<MachineInstr *> InstsToMove) {
237  MachineBasicBlock *MBB = I->getParent();
238  ++I;
239  for (MachineInstr *MI : InstsToMove) {
240  MI->removeFromParent();
241  MBB->insert(I, MI);
242  }
243 }
244 
245 static void addDefsUsesToList(const MachineInstr &MI,
246  DenseSet<unsigned> &RegDefs,
247  DenseSet<unsigned> &PhysRegUses) {
248  for (const MachineOperand &Op : MI.operands()) {
249  if (Op.isReg()) {
250  if (Op.isDef())
251  RegDefs.insert(Op.getReg());
252  else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
253  PhysRegUses.insert(Op.getReg());
254  }
255  }
256 }
257 
258 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
259  MachineBasicBlock::iterator B,
260  AliasAnalysis *AA) {
261  // RAW or WAR - cannot reorder
262  // WAW - cannot reorder
263  // RAR - safe to reorder
264  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
265 }
266 
267 // Add MI and its defs to the lists if MI reads one of the defs that are
268 // already in the list. Returns true in that case.
269 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
270  DenseSet<unsigned> &PhysRegUses,
271  SmallVectorImpl<MachineInstr *> &Insts) {
272  for (MachineOperand &Use : MI.operands()) {
273  // If one of the defs is read, then there is a use of Def between I and the
274  // instruction that I will potentially be merged with. We will need to move
275  // this instruction after the merged instructions.
276  //
277  // Similarly, if there is a def which is read by an instruction that is to
278  // be moved for merging, then we need to move the def-instruction as well.
279  // This can only happen for physical registers such as M0; virtual
280  // registers are in SSA form.
281  if (Use.isReg() &&
282  ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
283  (Use.isDef() && RegDefs.count(Use.getReg())) ||
284  (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
285  PhysRegUses.count(Use.getReg())))) {
286  Insts.push_back(&MI);
287  addDefsUsesToList(MI, RegDefs, PhysRegUses);
288  return true;
289  }
290  }
291 
292  return false;
293 }
294 
295 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
296  ArrayRef<MachineInstr *> InstsToMove,
297  AliasAnalysis *AA) {
298  assert(MemOp.mayLoadOrStore());
299 
300  for (MachineInstr *InstToMove : InstsToMove) {
301  if (!InstToMove->mayLoadOrStore())
302  continue;
303  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
304  return false;
305  }
306  return true;
307 }
308 
309 // This function assumes that \p A and \p B are identical except for
310 // size and offset, and they reference adjacent memory.
311 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
312  const MachineMemOperand *A,
313  const MachineMemOperand *B) {
314  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
315  unsigned Size = A->getSize() + B->getSize();
316  // This function adds the offset parameter to the existing offset for A,
317  // so we pass 0 here as the offset and then manually set it to the correct
318  // value after the call.
319  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
320  MMO->setOffset(MinOffset);
321  return MMO;
322 }
323 
324 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
325  // XXX - Would the same offset be OK? Is there any reason this would happen or
326  // be useful?
327  if (CI.Offset0 == CI.Offset1)
328  return false;
329 
330  // This won't be valid if the offset isn't aligned.
331  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
332  return false;
333 
334  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
335  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
336  CI.UseST64 = false;
337  CI.BaseOff = 0;
338 
339  // Handle SMEM and VMEM instructions.
340  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
341  return (EltOffset0 + CI.Width0 == EltOffset1 ||
342  EltOffset1 + CI.Width1 == EltOffset0) &&
343  CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
344  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
345  }
346 
347  // If the offset in elements doesn't fit in 8-bits, we might be able to use
348  // the stride 64 versions.
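// (Illustrative example: two b32 accesses at byte offsets 0 and 8192 give element
// offsets 0 and 2048; both are multiples of 64 and 2048/64 = 32 fits in 8 bits, so
// the ST64 form can encode them as offset0:0, offset1:32.)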
349  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
350  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
351  CI.Offset0 = EltOffset0 / 64;
352  CI.Offset1 = EltOffset1 / 64;
353  CI.UseST64 = true;
354  return true;
355  }
356 
357  // Check if the new offsets fit in the reduced 8-bit range.
358  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
359  CI.Offset0 = EltOffset0;
360  CI.Offset1 = EltOffset1;
361  return true;
362  }
363 
364  // Try to shift base address to decrease offsets.
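// (Illustrative example: element offsets 512 and 514 (EltSize 4, byte offsets 2048
// and 2056) don't fit in 8 bits, but their difference does; with BaseOff = 2048 the
// remaining element offsets become 0 and 2, which fit.)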
365  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
366  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
367 
368  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
369  CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
370  CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
371  CI.UseST64 = true;
372  return true;
373  }
374 
375  if (isUInt<8>(OffsetDiff)) {
376  CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
377  CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
378  return true;
379  }
380 
381  return false;
382 }
383 
384 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
385  const CombineInfo &CI) {
386  const unsigned Width = (CI.Width0 + CI.Width1);
387  switch (CI.InstClass) {
388  default:
389  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
390  case S_BUFFER_LOAD_IMM:
391  switch (Width) {
392  default:
393  return false;
394  case 2:
395  case 4:
396  return true;
397  }
398  }
399 }
400 
401 unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) const {
402  const unsigned Opc = MI.getOpcode();
403 
404  if (TII->isMUBUF(MI)) {
405  // FIXME: Handle d16 correctly
406  return AMDGPU::getMUBUFElements(Opc);
407  }
408 
409  switch (Opc) {
410  default:
411  return 0;
412  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
413  return 1;
414  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
415  return 2;
416  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
417  return 4;
418  }
419 }
420 
421 InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) const {
422  if (TII->isMUBUF(Opc)) {
423  const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
424 
425  // If we couldn't identify the opcode, bail out.
426  if (baseOpcode == -1) {
427  return UNKNOWN;
428  }
429 
430  switch (baseOpcode) {
431  default:
432  return UNKNOWN;
433  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
434  return BUFFER_LOAD_OFFEN;
435  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
436  return BUFFER_LOAD_OFFSET;
437  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
438  return BUFFER_STORE_OFFEN;
439  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
440  return BUFFER_STORE_OFFSET;
441  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
442  return BUFFER_LOAD_OFFEN_exact;
443  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
444  return BUFFER_LOAD_OFFSET_exact;
445  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
446  return BUFFER_STORE_OFFEN_exact;
447  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
448  return BUFFER_STORE_OFFSET_exact;
449  }
450  }
451 
452  switch (Opc) {
453  default:
454  return UNKNOWN;
455  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
456  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
457  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
458  return S_BUFFER_LOAD_IMM;
459  case AMDGPU::DS_READ_B32:
460  case AMDGPU::DS_READ_B64:
461  case AMDGPU::DS_READ_B32_gfx9:
462  case AMDGPU::DS_READ_B64_gfx9:
463  return DS_READ;
464  case AMDGPU::DS_WRITE_B32:
465  case AMDGPU::DS_WRITE_B64:
466  case AMDGPU::DS_WRITE_B32_gfx9:
467  case AMDGPU::DS_WRITE_B64_gfx9:
468  return DS_WRITE;
469  }
470 }
471 
472 unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) const {
473  if (TII->isMUBUF(Opc)) {
474  unsigned result = 0;
475 
476  if (AMDGPU::getMUBUFHasVAddr(Opc)) {
477  result |= VADDR;
478  }
479 
480  if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
481  result |= SRSRC;
482  }
483 
484  if (AMDGPU::getMUBUFHasSoffset(Opc)) {
485  result |= SOFFSET;
486  }
487 
488  return result;
489  }
490 
491  switch (Opc) {
492  default:
493  return 0;
494  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
495  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
496  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
497  return SBASE;
498  case AMDGPU::DS_READ_B32:
499  case AMDGPU::DS_READ_B64:
500  case AMDGPU::DS_READ_B32_gfx9:
501  case AMDGPU::DS_READ_B64_gfx9:
502  case AMDGPU::DS_WRITE_B32:
503  case AMDGPU::DS_WRITE_B64:
504  case AMDGPU::DS_WRITE_B32_gfx9:
505  case AMDGPU::DS_WRITE_B64_gfx9:
506  return ADDR;
507  }
508 }
509 
510 bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
511  MachineBasicBlock *MBB = CI.I->getParent();
512  MachineBasicBlock::iterator E = MBB->end();
513  MachineBasicBlock::iterator MBBI = CI.I;
514 
515  const unsigned Opc = CI.I->getOpcode();
516  const InstClassEnum InstClass = getInstClass(Opc);
517 
518  if (InstClass == UNKNOWN) {
519  return false;
520  }
521 
522  const unsigned Regs = getRegs(Opc);
523 
524  unsigned AddrOpName[5] = {0};
525  int AddrIdx[5];
526  const MachineOperand *AddrReg[5];
527  unsigned NumAddresses = 0;
528 
529  if (Regs & ADDR) {
530  AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
531  }
532 
533  if (Regs & SBASE) {
534  AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
535  }
536 
537  if (Regs & SRSRC) {
538  AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
539  }
540 
541  if (Regs & SOFFSET) {
542  AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
543  }
544 
545  if (Regs & VADDR) {
546  AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
547  }
548 
549  for (unsigned i = 0; i < NumAddresses; i++) {
550  AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
551  AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
552 
553  // We only ever merge operations with the same base address register, so
554  // don't bother scanning forward if there are no other uses.
555  if (AddrReg[i]->isReg() &&
556  (Register::isPhysicalRegister(AddrReg[i]->getReg()) ||
557  MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
558  return false;
559  }
560 
561  ++MBBI;
562 
563  DenseSet<unsigned> RegDefsToMove;
564  DenseSet<unsigned> PhysRegUsesToMove;
565  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
566 
567  for (; MBBI != E; ++MBBI) {
568  const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
569 
570  if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
571  (IsDS && (MBBI->getOpcode() != Opc))) {
572  // This is not a matching DS instruction, but we can keep looking as
573  // long as one of these conditions are met:
574  // 1. It is safe to move I down past MBBI.
575  // 2. It is safe to move MBBI down past the instruction that I will
576  // be merged into.
577 
578  if (MBBI->hasUnmodeledSideEffects()) {
579  // We can't re-order this instruction with respect to other memory
580  // operations, so we fail both conditions mentioned above.
581  return false;
582  }
583 
584  if (MBBI->mayLoadOrStore() &&
585  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
586  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
587  // We fail condition #1, but we may still be able to satisfy condition
588  // #2. Add this instruction to the move list and then we will check
589  // if condition #2 holds once we have selected the matching instruction.
590  CI.InstsToMove.push_back(&*MBBI);
591  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
592  continue;
593  }
594 
595 // When we match I with another DS instruction we will be moving I down
596 // to the location of the matched instruction; any uses of I will need to
597 // be moved down as well.
598  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
599  CI.InstsToMove);
600  continue;
601  }
602 
603  // Don't merge volatiles.
604  if (MBBI->hasOrderedMemoryRef())
605  return false;
606 
607  // Handle a case like
608  // DS_WRITE_B32 addr, v, idx0
609  // w = DS_READ_B32 addr, idx0
610  // DS_WRITE_B32 addr, f(w), idx1
611  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
612  // merging of the two writes.
613  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
614  CI.InstsToMove))
615  continue;
616 
617  bool Match = true;
618  for (unsigned i = 0; i < NumAddresses; i++) {
619  const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
620 
621  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
622  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
623  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
624  Match = false;
625  break;
626  }
627  continue;
628  }
629 
630  // Check same base pointer. Be careful of subregisters, which can occur
631  // with vectors of pointers.
632  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
633  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
634  Match = false;
635  break;
636  }
637  }
638 
639  if (Match) {
640  int OffsetIdx =
641  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
642  CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
643  CI.Width0 = getOpcodeWidth(*CI.I);
644  CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
645  CI.Width1 = getOpcodeWidth(*MBBI);
646  CI.Paired = MBBI;
647 
648  if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
649  CI.Offset0 &= 0xffff;
650  CI.Offset1 &= 0xffff;
651  } else {
652  CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
653  CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
654  if (CI.InstClass != S_BUFFER_LOAD_IMM) {
655  CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
656  CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
657  }
658  CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
659  CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
660  }
661 
662  // Check both offsets fit in the reduced range.
663  // We also need to go through the list of instructions that we plan to
664  // move and make sure they are all safe to move down past the merged
665  // instruction.
666  if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
667  if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
668  return true;
669  }
670 
671  // We've found a load/store that we couldn't merge for some reason.
672  // We could potentially keep looking, but we'd need to make sure that
673 // it was safe to move I and also all the instructions in InstsToMove
674 // down past this instruction.
675 // Check if we can move I across MBBI and if we can move all I's users.
676  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
677  !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
678  break;
679  }
680  return false;
681 }
682 
683 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
684  if (STM->ldsRequiresM0Init())
685  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
686  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
687 }
688 
689 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
690  if (STM->ldsRequiresM0Init())
691  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
692 
693  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
694  : AMDGPU::DS_READ2ST64_B64_gfx9;
695 }
696 
697 MachineBasicBlock::iterator
698 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
699  MachineBasicBlock *MBB = CI.I->getParent();
700 
701  // Be careful, since the addresses could be subregisters themselves in weird
702  // cases, like vectors of pointers.
703  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
704 
705  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
706  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
707 
708  unsigned NewOffset0 = CI.Offset0;
709  unsigned NewOffset1 = CI.Offset1;
710  unsigned Opc =
711  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
712 
713  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
714  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
715 
716  if (NewOffset0 > NewOffset1) {
717  // Canonicalize the merged instruction so the smaller offset comes first.
718  std::swap(NewOffset0, NewOffset1);
719  std::swap(SubRegIdx0, SubRegIdx1);
720  }
721 
722  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
723  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
724 
725  const MCInstrDesc &Read2Desc = TII->get(Opc);
726 
727  const TargetRegisterClass *SuperRC =
728  (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
729  Register DestReg = MRI->createVirtualRegister(SuperRC);
730 
731  DebugLoc DL = CI.I->getDebugLoc();
732 
733  Register BaseReg = AddrReg->getReg();
734  unsigned BaseSubReg = AddrReg->getSubReg();
735  unsigned BaseRegFlags = 0;
736  if (CI.BaseOff) {
737  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
738  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
739  .addImm(CI.BaseOff);
740 
741  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
742  BaseRegFlags = RegState::Kill;
743 
744  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
745  .addReg(ImmReg)
746  .addReg(AddrReg->getReg(), 0, BaseSubReg)
747  .addImm(0); // clamp bit
748  BaseSubReg = 0;
749  }
750 
751  MachineInstrBuilder Read2 =
752  BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
753  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
754  .addImm(NewOffset0) // offset0
755  .addImm(NewOffset1) // offset1
756  .addImm(0) // gds
757  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
758 
759  (void)Read2;
760 
761  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
762 
763  // Copy to the old destination registers.
764  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
765  .add(*Dest0) // Copy to same destination including flags and sub reg.
766  .addReg(DestReg, 0, SubRegIdx0);
767  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
768  .add(*Dest1)
769  .addReg(DestReg, RegState::Kill, SubRegIdx1);
770 
771  moveInstsAfter(Copy1, CI.InstsToMove);
772 
773  MachineBasicBlock::iterator Next = std::next(CI.I);
774  CI.I->eraseFromParent();
775  CI.Paired->eraseFromParent();
776 
777  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
778  return Next;
779 }
780 
781 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
782  if (STM->ldsRequiresM0Init())
783  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
784  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
785  : AMDGPU::DS_WRITE2_B64_gfx9;
786 }
787 
788 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
789  if (STM->ldsRequiresM0Init())
790  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
791  : AMDGPU::DS_WRITE2ST64_B64;
792 
793  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
794  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
795 }
796 
797 MachineBasicBlock::iterator
798 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
799  MachineBasicBlock *MBB = CI.I->getParent();
800 
801  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
802  // sure we preserve the subregister index and any register flags set on them.
803  const MachineOperand *AddrReg =
804  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
805  const MachineOperand *Data0 =
806  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
807  const MachineOperand *Data1 =
808  TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
809 
810  unsigned NewOffset0 = CI.Offset0;
811  unsigned NewOffset1 = CI.Offset1;
812  unsigned Opc =
813  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
814 
815  if (NewOffset0 > NewOffset1) {
816  // Canonicalize the merged instruction so the smaller offset comes first.
817  std::swap(NewOffset0, NewOffset1);
818  std::swap(Data0, Data1);
819  }
820 
821  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
822  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
823 
824  const MCInstrDesc &Write2Desc = TII->get(Opc);
825  DebugLoc DL = CI.I->getDebugLoc();
826 
827  Register BaseReg = AddrReg->getReg();
828  unsigned BaseSubReg = AddrReg->getSubReg();
829  unsigned BaseRegFlags = 0;
830  if (CI.BaseOff) {
831  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
832  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
833  .addImm(CI.BaseOff);
834 
835  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
836  BaseRegFlags = RegState::Kill;
837 
838  TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
839  .addReg(ImmReg)
840  .addReg(AddrReg->getReg(), 0, BaseSubReg)
841  .addImm(0); // clamp bit
842  BaseSubReg = 0;
843  }
844 
845  MachineInstrBuilder Write2 =
846  BuildMI(*MBB, CI.Paired, DL, Write2Desc)
847  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
848  .add(*Data0) // data0
849  .add(*Data1) // data1
850  .addImm(NewOffset0) // offset0
851  .addImm(NewOffset1) // offset1
852  .addImm(0) // gds
853  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
854 
855  moveInstsAfter(Write2, CI.InstsToMove);
856 
857  MachineBasicBlock::iterator Next = std::next(CI.I);
858  CI.I->eraseFromParent();
859  CI.Paired->eraseFromParent();
860 
861  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
862  return Next;
863 }
864 
865 MachineBasicBlock::iterator
866 SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
867  MachineBasicBlock *MBB = CI.I->getParent();
868  DebugLoc DL = CI.I->getDebugLoc();
869  const unsigned Opcode = getNewOpcode(CI);
870 
871  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
872 
873  Register DestReg = MRI->createVirtualRegister(SuperRC);
874  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
875 
876  // It shouldn't be possible to get this far if the two instructions
877  // don't have a single memoperand, because MachineInstr::mayAlias()
878  // will return true if this is the case.
879  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
880 
881  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
882  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
883 
884  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
885  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
886  .addImm(MergedOffset) // offset
887  .addImm(CI.GLC0) // glc
888  .addImm(CI.DLC0) // dlc
889  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
890 
891  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
892  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
893  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
894 
895  // Copy to the old destination registers.
896  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
897  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
898  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
899 
900  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
901  .add(*Dest0) // Copy to same destination including flags and sub reg.
902  .addReg(DestReg, 0, SubRegIdx0);
903  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
904  .add(*Dest1)
905  .addReg(DestReg, RegState::Kill, SubRegIdx1);
906 
907  moveInstsAfter(Copy1, CI.InstsToMove);
908 
909  MachineBasicBlock::iterator Next = std::next(CI.I);
910  CI.I->eraseFromParent();
911  CI.Paired->eraseFromParent();
912  return Next;
913 }
914 
915 MachineBasicBlock::iterator
916 SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
917  MachineBasicBlock *MBB = CI.I->getParent();
918  DebugLoc DL = CI.I->getDebugLoc();
919 
920  const unsigned Opcode = getNewOpcode(CI);
921 
922  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
923 
924  // Copy to the new source register.
925  Register DestReg = MRI->createVirtualRegister(SuperRC);
926  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
927 
928  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
929 
930  const unsigned Regs = getRegs(Opcode);
931 
932  if (Regs & VADDR)
933  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
934 
935  // It shouldn't be possible to get this far if the two instructions
936  // don't have a single memoperand, because MachineInstr::mayAlias()
937  // will return true if this is the case.
938  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
939 
940  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
941  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
942 
943  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
944  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
945  .addImm(MergedOffset) // offset
946  .addImm(CI.GLC0) // glc
947  .addImm(CI.SLC0) // slc
948  .addImm(0) // tfe
949  .addImm(CI.DLC0) // dlc
950  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
951 
952  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
953  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
954  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
955 
956  // Copy to the old destination registers.
957  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
958  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
959  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
960 
961  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
962  .add(*Dest0) // Copy to same destination including flags and sub reg.
963  .addReg(DestReg, 0, SubRegIdx0);
964  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
965  .add(*Dest1)
966  .addReg(DestReg, RegState::Kill, SubRegIdx1);
967 
968  moveInstsAfter(Copy1, CI.InstsToMove);
969 
970  MachineBasicBlock::iterator Next = std::next(CI.I);
971  CI.I->eraseFromParent();
972  CI.Paired->eraseFromParent();
973  return Next;
974 }
975 
976 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
977  const unsigned Width = CI.Width0 + CI.Width1;
978 
979  switch (CI.InstClass) {
980  default:
981  // FIXME: Handle d16 correctly
982  return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
983  case UNKNOWN:
984  llvm_unreachable("Unknown instruction class");
985  case S_BUFFER_LOAD_IMM:
986  switch (Width) {
987  default:
988  return 0;
989  case 2:
990  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
991  case 4:
992  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
993  }
994  }
995 }
996 
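// Returns the subregister indices of the first (CI.I) and second (CI.Paired)
// result within the merged super-register; whichever instruction has the lower
// offset gets the lower subregisters. (Illustrative example: Width0 = 1,
// Width1 = 2 with Offset0 < Offset1 yields (sub0, sub1_sub2).)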
997 std::pair<unsigned, unsigned>
998 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
999  if (CI.Offset0 > CI.Offset1) {
1000  switch (CI.Width0) {
1001  default:
1002  return std::make_pair(0, 0);
1003  case 1:
1004  switch (CI.Width1) {
1005  default:
1006  return std::make_pair(0, 0);
1007  case 1:
1008  return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
1009  case 2:
1010  return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
1011  case 3:
1012  return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
1013  }
1014  case 2:
1015  switch (CI.Width1) {
1016  default:
1017  return std::make_pair(0, 0);
1018  case 1:
1019  return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
1020  case 2:
1021  return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
1022  }
1023  case 3:
1024  switch (CI.Width1) {
1025  default:
1026  return std::make_pair(0, 0);
1027  case 1:
1028  return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
1029  }
1030  }
1031  } else {
1032  switch (CI.Width0) {
1033  default:
1034  return std::make_pair(0, 0);
1035  case 1:
1036  switch (CI.Width1) {
1037  default:
1038  return std::make_pair(0, 0);
1039  case 1:
1040  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
1041  case 2:
1042  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
1043  case 3:
1044  return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
1045  }
1046  case 2:
1047  switch (CI.Width1) {
1048  default:
1049  return std::make_pair(0, 0);
1050  case 1:
1051  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
1052  case 2:
1053  return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
1054  }
1055  case 3:
1056  switch (CI.Width1) {
1057  default:
1058  return std::make_pair(0, 0);
1059  case 1:
1060  return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
1061  }
1062  }
1063  }
1064 }
1065 
1066 const TargetRegisterClass *
1067 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
1068  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1069  switch (CI.Width0 + CI.Width1) {
1070  default:
1071  return nullptr;
1072  case 2:
1073  return &AMDGPU::SReg_64_XEXECRegClass;
1074  case 4:
1075  return &AMDGPU::SReg_128RegClass;
1076  case 8:
1077  return &AMDGPU::SReg_256RegClass;
1078  case 16:
1079  return &AMDGPU::SReg_512RegClass;
1080  }
1081  } else {
1082  switch (CI.Width0 + CI.Width1) {
1083  default:
1084  return nullptr;
1085  case 2:
1086  return &AMDGPU::VReg_64RegClass;
1087  case 3:
1088  return &AMDGPU::VReg_96RegClass;
1089  case 4:
1090  return &AMDGPU::VReg_128RegClass;
1091  }
1092  }
1093 }
1094 
1095 MachineBasicBlock::iterator
1096 SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
1097  MachineBasicBlock *MBB = CI.I->getParent();
1098  DebugLoc DL = CI.I->getDebugLoc();
1099 
1100  const unsigned Opcode = getNewOpcode(CI);
1101 
1102  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
1103  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1104  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1105 
1106  // Copy to the new source register.
1107  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
1108  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1109 
1110  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1111  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
1112 
1113  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1114  .add(*Src0)
1115  .addImm(SubRegIdx0)
1116  .add(*Src1)
1117  .addImm(SubRegIdx1);
1118 
1119  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
1120  .addReg(SrcReg, RegState::Kill);
1121 
1122  const unsigned Regs = getRegs(Opcode);
1123 
1124  if (Regs & VADDR)
1125  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1126 
1127 
1128  // It shouldn't be possible to get this far if the two instructions
1129  // don't have a single memoperand, because MachineInstr::mayAlias()
1130  // will return true if this is the case.
1131  assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
1132 
1133  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1134  const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
1135 
1136  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1137  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1138  .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
1139  .addImm(CI.GLC0) // glc
1140  .addImm(CI.SLC0) // slc
1141  .addImm(0) // tfe
1142  .addImm(CI.DLC0) // dlc
1143  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1144 
1145  moveInstsAfter(MIB, CI.InstsToMove);
1146 
1147  MachineBasicBlock::iterator Next = std::next(CI.I);
1148  CI.I->eraseFromParent();
1149  CI.Paired->eraseFromParent();
1150  return Next;
1151 }
1152 
1153 MachineOperand
1154 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1155  APInt V(32, Val, true);
1156  if (TII->isInlineConstant(V))
1157  return MachineOperand::CreateImm(Val);
1158 
1159  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1160  MachineInstr *Mov =
1161  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1162  TII->get(AMDGPU::S_MOV_B32), Reg)
1163  .addImm(Val);
1164  (void)Mov;
1165  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1166  return MachineOperand::CreateReg(Reg, false);
1167 }
1168 
1169 // Compute base address using Addr and return the final register.
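// (Illustrative sketch: for a 64-bit base split into LoReg/HiReg and a 64-bit
// Offset, this emits V_ADD_I32_e64 on the low half, V_ADDC_U32_e64 consuming the
// carry on the high half, and a REG_SEQUENCE forming the new 64-bit base.)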
1170 unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1171  const MemAddress &Addr) const {
1172  MachineBasicBlock *MBB = MI.getParent();
1173  MachineBasicBlock::iterator MBBI = MI.getIterator();
1174  DebugLoc DL = MI.getDebugLoc();
1175 
1176  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1177  Addr.Base.LoSubReg) &&
1178  "Expected 32-bit Base-Register-Low!!");
1179 
1180  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1181  Addr.Base.HiSubReg) &&
1182  "Expected 32-bit Base-Register-Hi!!");
1183 
1184  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1185  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1186  MachineOperand OffsetHi =
1187  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1188 
1189  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1190  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1191  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1192 
1193  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1194  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1195  MachineInstr *LoHalf =
1196  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
1197  .addReg(CarryReg, RegState::Define)
1198  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1199  .add(OffsetLo)
1200  .addImm(0); // clamp bit
1201  (void)LoHalf;
1202  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1203 
1204  MachineInstr *HiHalf =
1205  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1206  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1207  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1208  .add(OffsetHi)
1209  .addReg(CarryReg, RegState::Kill)
1210  .addImm(0); // clamp bit
1211  (void)HiHalf;
1212  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1213 
1214  Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
1215  MachineInstr *FullBase =
1216  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1217  .addReg(DestSub0)
1218  .addImm(AMDGPU::sub0)
1219  .addReg(DestSub1)
1220  .addImm(AMDGPU::sub1);
1221  (void)FullBase;
1222  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1223 
1224  return FullDestReg;
1225 }
1226 
1227 // Update base and offset with the NewBase and NewOffset in MI.
1228 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1229  unsigned NewBase,
1230  int32_t NewOffset) const {
1231  TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
1232  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1233 }
1234 
1235 Optional<int32_t>
1236 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1237  if (Op.isImm())
1238  return Op.getImm();
1239 
1240  if (!Op.isReg())
1241  return None;
1242 
1243  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1244  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1245  !Def->getOperand(1).isImm())
1246  return None;
1247 
1248  return Def->getOperand(1).getImm();
1249 }
1250 
1251 // Analyzes Base and extracts:
1252 // - 32bit base registers, subregisters
1253 // - 64bit constant offset
1254 // Expecting base computation as:
1255 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1256 // %LO:vgpr_32, %c:sreg_64_xexec =
1257 // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1258 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1259 // %Base:vreg_64 =
1260 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1261 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1262  MemAddress &Addr) const {
1263  if (!Base.isReg())
1264  return;
1265 
1266  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1267  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1268  || Def->getNumOperands() != 5)
1269  return;
1270 
1271  MachineOperand BaseLo = Def->getOperand(1);
1272  MachineOperand BaseHi = Def->getOperand(3);
1273  if (!BaseLo.isReg() || !BaseHi.isReg())
1274  return;
1275 
1276  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1277  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1278 
1279  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
1280  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1281  return;
1282 
1283  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1284  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1285 
1286  auto Offset0P = extractConstOffset(*Src0);
1287  if (Offset0P)
1288  BaseLo = *Src1;
1289  else {
1290  if (!(Offset0P = extractConstOffset(*Src1)))
1291  return;
1292  BaseLo = *Src0;
1293  }
1294 
1295  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1296  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1297 
1298  if (Src0->isImm())
1299  std::swap(Src0, Src1);
1300 
1301  if (!Src1->isImm())
1302  return;
1303 
1304  uint64_t Offset1 = Src1->getImm();
1305  BaseHi = *Src0;
1306 
1307  Addr.Base.LoReg = BaseLo.getReg();
1308  Addr.Base.HiReg = BaseHi.getReg();
1309  Addr.Base.LoSubReg = BaseLo.getSubReg();
1310  Addr.Base.HiSubReg = BaseHi.getSubReg();
1311  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1312 }
1313 
1314 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1315  MachineInstr &MI,
1316  MemInfoMap &Visited,
1317  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1318 
1319  if (!(MI.mayLoad() ^ MI.mayStore()))
1320  return false;
1321 
1322  // TODO: Support flat and scratch.
1323  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1324  return false;
1325 
1326  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1327  return false;
1328 
1329  if (AnchorList.count(&MI))
1330  return false;
1331 
1332  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1333 
1334  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1335  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1336  return false;
1337  }
1338 
1339  // Step1: Find the base-registers and a 64bit constant offset.
1340  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1341  MemAddress MAddr;
1342  if (Visited.find(&MI) == Visited.end()) {
1343  processBaseWithConstOffset(Base, MAddr);
1344  Visited[&MI] = MAddr;
1345  } else
1346  MAddr = Visited[&MI];
1347 
1348  if (MAddr.Offset == 0) {
1349  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1350  " constant offsets that can be promoted.\n";);
1351  return false;
1352  }
1353 
1354  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1355  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1356 
1357  // Step2: Traverse through MI's basic block and find an anchor (that has the
1358  // same base registers) with the highest 13-bit distance from MI's offset.
1359  // E.g. (64bit loads)
1360  // bb:
1361  // addr1 = &a + 4096; load1 = load(addr1, 0)
1362  // addr2 = &a + 6144; load2 = load(addr2, 0)
1363  // addr3 = &a + 8192; load3 = load(addr3, 0)
1364  // addr4 = &a + 10240; load4 = load(addr4, 0)
1365  // addr5 = &a + 12288; load5 = load(addr5, 0)
1366  //
1367  // Starting from the first load, the optimization will try to find a new base
1368 // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
1369 // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
1370 // as the new base (anchor) because the maximum distance can presumably
1371 // accommodate more intermediate bases.
1372  //
1373  // Step3: move (&a + 8192) above load1. Compute and promote offsets from
1374  // (&a + 8192) for load1, load2, load4.
1375  // addr = &a + 8192
1376  // load1 = load(addr, -4096)
1377  // load2 = load(addr, -2048)
1378  // load3 = load(addr, 0)
1379  // load4 = load(addr, 2048)
1380  // addr5 = &a + 12288; load5 = load(addr5, 0)
1381  //
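// (Illustrative note: load5 keeps its own base because 12288 - 8192 = 4096 falls
// outside the signed 13-bit offset range, while -4096, -2048 and 2048 all fit.)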
1382  MachineInstr *AnchorInst = nullptr;
1383  MemAddress AnchorAddr;
1384  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1385  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1386 
1387  MachineBasicBlock *MBB = MI.getParent();
1388  MachineBasicBlock::iterator E = MBB->end();
1389  MachineBasicBlock::iterator MBBI = MI.getIterator();
1390  ++MBBI;
1391  const SITargetLowering *TLI =
1392  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1393 
1394  for ( ; MBBI != E; ++MBBI) {
1395  MachineInstr &MINext = *MBBI;
1396  // TODO: Support finding an anchor(with same base) from store addresses or
1397  // any other load addresses where the opcodes are different.
1398  if (MINext.getOpcode() != MI.getOpcode() ||
1399  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1400  continue;
1401 
1402  const MachineOperand &BaseNext =
1403  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1404  MemAddress MAddrNext;
1405  if (Visited.find(&MINext) == Visited.end()) {
1406  processBaseWithConstOffset(BaseNext, MAddrNext);
1407  Visited[&MINext] = MAddrNext;
1408  } else
1409  MAddrNext = Visited[&MINext];
1410 
1411  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1412  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1413  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1414  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1415  continue;
1416 
1417  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1418 
1419  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1420  TargetLoweringBase::AddrMode AM;
1421  AM.HasBaseReg = true;
1422  AM.BaseOffs = Dist;
1423  if (TLI->isLegalGlobalAddressingMode(AM) &&
1424  (uint32_t)std::abs(Dist) > MaxDist) {
1425  MaxDist = std::abs(Dist);
1426 
1427  AnchorAddr = MAddrNext;
1428  AnchorInst = &MINext;
1429  }
1430  }
1431 
1432  if (AnchorInst) {
1433  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1434  AnchorInst->dump());
1435  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1436  << AnchorAddr.Offset << "\n\n");
1437 
1438  // Instead of moving up, just re-compute anchor-instruction's base address.
1439  unsigned Base = computeBase(MI, AnchorAddr);
1440 
1441  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1442  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1443 
1444  for (auto P : InstsWCommonBase) {
1445  TargetLoweringBase::AddrMode AM;
1446  AM.HasBaseReg = true;
1447  AM.BaseOffs = P.second - AnchorAddr.Offset;
1448 
1449  if (TLI->isLegalGlobalAddressingMode(AM)) {
1450  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1451  dbgs() << ")"; P.first->dump());
1452  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1453  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1454  }
1455  }
1456  AnchorList.insert(AnchorInst);
1457  return true;
1458  }
1459 
1460  return false;
1461 }
1462 
1463 // Scan through looking for adjacent LDS operations with constant offsets from
1464 // the same base register. We rely on the scheduler to do the hard work of
1465 // clustering nearby loads, and assume these are all adjacent.
1466 bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
1467  bool Modified = false;
1468 
1469  // Caches the base address info computed for each visited instruction.
1470  MemInfoMap Visited;
1471  // Contains the list of instructions for which constant offsets are being
1472  // promoted to the IMM.
1473  SmallPtrSet<MachineInstr *, 4> AnchorList;
1474 
1475  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
1476  MachineInstr &MI = *I;
1477 
1478  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
1479  Modified = true;
1480 
1481  // Don't combine if volatile.
1482  if (MI.hasOrderedMemoryRef()) {
1483  ++I;
1484  continue;
1485  }
1486 
1487  const unsigned Opc = MI.getOpcode();
1488 
1489  CombineInfo CI;
1490  CI.I = I;
1491  CI.InstClass = getInstClass(Opc);
1492 
1493  switch (CI.InstClass) {
1494  default:
1495  break;
1496  case DS_READ:
1497  CI.EltSize =
1498  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
1499  : 4;
1500  if (findMatchingInst(CI)) {
1501  Modified = true;
1502  I = mergeRead2Pair(CI);
1503  } else {
1504  ++I;
1505  }
1506  continue;
1507  case DS_WRITE:
1508  CI.EltSize =
1509  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
1510  : 4;
1511  if (findMatchingInst(CI)) {
1512  Modified = true;
1513  I = mergeWrite2Pair(CI);
1514  } else {
1515  ++I;
1516  }
1517  continue;
1518  case S_BUFFER_LOAD_IMM:
1519  CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
1520  if (findMatchingInst(CI)) {
1521  Modified = true;
1522  I = mergeSBufferLoadImmPair(CI);
1523  OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
1524  } else {
1525  ++I;
1526  }
1527  continue;
1528  case BUFFER_LOAD_OFFEN:
1529  case BUFFER_LOAD_OFFSET:
1530  case BUFFER_LOAD_OFFEN_exact:
1531  case BUFFER_LOAD_OFFSET_exact:
1532  CI.EltSize = 4;
1533  if (findMatchingInst(CI)) {
1534  Modified = true;
1535  I = mergeBufferLoadPair(CI);
1536  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1537  } else {
1538  ++I;
1539  }
1540  continue;
1541  case BUFFER_STORE_OFFEN:
1542  case BUFFER_STORE_OFFSET:
1543  case BUFFER_STORE_OFFEN_exact:
1544  case BUFFER_STORE_OFFSET_exact:
1545  CI.EltSize = 4;
1546  if (findMatchingInst(CI)) {
1547  Modified = true;
1548  I = mergeBufferStorePair(CI);
1549  OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
1550  } else {
1551  ++I;
1552  }
1553  continue;
1554  }
1555 
1556  ++I;
1557  }
1558 
1559  return Modified;
1560 }
1561 
1562 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
1563  if (skipFunction(MF.getFunction()))
1564  return false;
1565 
1566  STM = &MF.getSubtarget<GCNSubtarget>();
1567  if (!STM->loadStoreOptEnabled())
1568  return false;
1569 
1570  TII = STM->getInstrInfo();
1571  TRI = &TII->getRegisterInfo();
1572 
1573  MRI = &MF.getRegInfo();
1574  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1575 
1576  assert(MRI->isSSA() && "Must be run on SSA");
1577 
1578  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
1579 
1580  bool Modified = false;
1581 
1582  for (MachineBasicBlock &MBB : MF) {
1583  do {
1584  OptimizeAgain = false;
1585  Modified |= optimizeBlock(MBB);
1586  } while (OptimizeAgain);
1587  }
1588 
1589  return Modified;
1590 }