SILoadStoreOptimizer.cpp
1 //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass tries to fuse DS instructions with nearby immediate offsets.
10 // This will fuse operations such as
11 // ds_read_b32 v0, v2 offset:16
12 // ds_read_b32 v1, v2 offset:32
13 // ==>
14 // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
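// (Note: the merged read2 offsets are expressed in units of the element size,
//  so byte offsets 16 and 32 become offset0:4 and offset1:8 for the 4-byte
//  b32 elements.)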
15 //
16 // The same is done for certain SMEM and VMEM opcodes, e.g.:
17 // s_buffer_load_dword s4, s[0:3], 4
18 // s_buffer_load_dword s5, s[0:3], 8
19 // ==>
20 // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21 //
22 // This pass also tries to promote a constant offset to the immediate form by
23 // adjusting the base. It tries to use a base from nearby instructions that
24 // allows the access to carry a 13-bit constant offset, which is then promoted
25 // into the immediate.
26 // E.g.
27 // s_movk_i32 s0, 0x1800
28 // v_add_co_u32_e32 v0, vcc, s0, v2
29 // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30 //
31 // s_movk_i32 s0, 0x1000
32 // v_add_co_u32_e32 v5, vcc, s0, v2
33 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34 // global_load_dwordx2 v[5:6], v[5:6], off
35 // global_load_dwordx2 v[0:1], v[0:1], off
36 // =>
37 // s_movk_i32 s0, 0x1000
38 // v_add_co_u32_e32 v5, vcc, s0, v2
39 // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40 // global_load_dwordx2 v[5:6], v[5:6], off
41 // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
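// (In this illustration the two bases differ by 0x1800 - 0x1000 = 0x800 =
//  2048 bytes, which fits in the promoted immediate offset field, so the
//  s_movk_i32 0x1800 add sequence is no longer needed.)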
42 //
43 // Future improvements:
44 //
45 // - This is currently missing stores of constants because loading
46 // the constant into the data register is placed between the stores, although
47 // this is arguably a scheduling problem.
48 //
49 // - Live interval recomputing seems inefficient. This currently only matches
50 // one pair, and recomputes live intervals and moves on to the next pair. It
51 // would be better to compute a list of all merges that need to occur.
52 //
53 // - With a list of instructions to process, we can also merge more. If a
54 // cluster of loads has offsets that are too large to fit in the 8-bit
55 // offset fields, but are close enough together to fit after rebasing, we can
56 // add to the base pointer and use the new, reduced offsets.
57 //
58 //===----------------------------------------------------------------------===//
59 
60 #include "AMDGPU.h"
61 #include "GCNSubtarget.h"
62 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
63 #include "llvm/Analysis/AliasAnalysis.h"
64 #include "llvm/CodeGen/MachineFunctionPass.h"
65 #include "llvm/InitializePasses.h"
66 
67 using namespace llvm;
68 
69 #define DEBUG_TYPE "si-load-store-opt"
70 
71 namespace {
72 enum InstClassEnum {
73  UNKNOWN,
74  DS_READ,
75  DS_WRITE,
76  S_BUFFER_LOAD_IMM,
77  BUFFER_LOAD,
78  BUFFER_STORE,
79  MIMG,
80  TBUFFER_LOAD,
81  TBUFFER_STORE,
82 };
83 
84 struct AddressRegs {
85  unsigned char NumVAddrs = 0;
86  bool SBase = false;
87  bool SRsrc = false;
88  bool SOffset = false;
89  bool VAddr = false;
90  bool Addr = false;
91  bool SSamp = false;
92 };
93 
94 // GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
95 const unsigned MaxAddressRegs = 12 + 1 + 1;
96 
97 class SILoadStoreOptimizer : public MachineFunctionPass {
98  struct CombineInfo {
99  MachineBasicBlock::iterator I;
100  unsigned EltSize;
101  unsigned Offset;
102  unsigned Width;
103  unsigned Format;
104  unsigned BaseOff;
105  unsigned DMask;
106  InstClassEnum InstClass;
107  unsigned CPol = 0;
108  bool UseST64;
109  int AddrIdx[MaxAddressRegs];
110  const MachineOperand *AddrReg[MaxAddressRegs];
111  unsigned NumAddresses;
112  unsigned Order;
113 
114  bool hasSameBaseAddress(const MachineInstr &MI) {
115  for (unsigned i = 0; i < NumAddresses; i++) {
116  const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
117 
118  if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
119  if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
120  AddrReg[i]->getImm() != AddrRegNext.getImm()) {
121  return false;
122  }
123  continue;
124  }
125 
126  // Check same base pointer. Be careful of subregisters, which can occur
127  // with vectors of pointers.
128  if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
129  AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
130  return false;
131  }
132  }
133  return true;
134  }
135 
136  bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
137  for (unsigned i = 0; i < NumAddresses; ++i) {
138  const MachineOperand *AddrOp = AddrReg[i];
139  // Immediates are always OK.
140  if (AddrOp->isImm())
141  continue;
142 
143  // Don't try to merge addresses that aren't either immediates or registers.
144  // TODO: Should be possible to merge FrameIndexes and maybe some other
145  // non-register operands.
146  if (!AddrOp->isReg())
147  return false;
148 
149  // TODO: We should be able to merge physical reg addresses.
150  if (AddrOp->getReg().isPhysical())
151  return false;
152 
153  // If an address has only one use then there will be no other
154  // instructions with the same address, so we can't merge this one.
155  if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
156  return false;
157  }
158  return true;
159  }
160 
161  void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
162  const GCNSubtarget &STM);
163  };
164 
165  struct BaseRegisters {
166  Register LoReg;
167  Register HiReg;
168 
169  unsigned LoSubReg = 0;
170  unsigned HiSubReg = 0;
171  };
172 
173  struct MemAddress {
174  BaseRegisters Base;
175  int64_t Offset = 0;
176  };
177 
178  using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
179 
180 private:
181  const GCNSubtarget *STM = nullptr;
182  const SIInstrInfo *TII = nullptr;
183  const SIRegisterInfo *TRI = nullptr;
184  MachineRegisterInfo *MRI = nullptr;
185  AliasAnalysis *AA = nullptr;
186  bool OptimizeAgain;
187 
188  static bool dmasksCanBeCombined(const CombineInfo &CI,
189  const SIInstrInfo &TII,
190  const CombineInfo &Paired);
191  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
192  CombineInfo &Paired, bool Modify = false);
193  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
194  const CombineInfo &Paired);
195  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
196  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
197  const CombineInfo &Paired);
198  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
199  const CombineInfo &Paired);
200  const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
201 
202  bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
203  SmallVectorImpl<MachineInstr *> &InstsToMove);
204 
205  unsigned read2Opcode(unsigned EltSize) const;
206  unsigned read2ST64Opcode(unsigned EltSize) const;
207  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
208  CombineInfo &Paired,
209  const SmallVectorImpl<MachineInstr *> &InstsToMove);
210 
211  unsigned write2Opcode(unsigned EltSize) const;
212  unsigned write2ST64Opcode(unsigned EltSize) const;
213  MachineBasicBlock::iterator
214  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
215  const SmallVectorImpl<MachineInstr *> &InstsToMove);
216  MachineBasicBlock::iterator
217  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
218  const SmallVectorImpl<MachineInstr *> &InstsToMove);
219  MachineBasicBlock::iterator
220  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
221  const SmallVectorImpl<MachineInstr *> &InstsToMove);
222  MachineBasicBlock::iterator
223  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
224  const SmallVectorImpl<MachineInstr *> &InstsToMove);
225  MachineBasicBlock::iterator
226  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
227  const SmallVectorImpl<MachineInstr *> &InstsToMove);
228  MachineBasicBlock::iterator
229  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
230  const SmallVectorImpl<MachineInstr *> &InstsToMove);
231  MachineBasicBlock::iterator
232  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
233  const SmallVectorImpl<MachineInstr *> &InstsToMove);
234 
235  void updateBaseAndOffset(MachineInstr &I, Register NewBase,
236  int32_t NewOffset) const;
237  Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
238  MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
239  Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
240  void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
241  /// Promotes constant offset to the immediate by adjusting the base. It
242  /// tries to use a base from the nearby instructions that allows it to have
243  /// a 13-bit constant offset which gets promoted to the immediate.
244  bool promoteConstantOffsetToImm(MachineInstr &CI,
245  MemInfoMap &Visited,
246  SmallPtrSet<MachineInstr *, 4> &Promoted) const;
247  void addInstToMergeableList(const CombineInfo &CI,
248  std::list<std::list<CombineInfo> > &MergeableInsts) const;
249 
250  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
251  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
252  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
253  std::list<std::list<CombineInfo>> &MergeableInsts) const;
254 
255 public:
256  static char ID;
257 
258  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
259  initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
260  }
261 
262  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
263  bool &OptimizeListAgain);
264  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
265 
266  bool runOnMachineFunction(MachineFunction &MF) override;
267 
268  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
269 
270  void getAnalysisUsage(AnalysisUsage &AU) const override {
271  AU.setPreservesCFG();
272  AU.addRequired<AAResultsWrapperPass>();
273 
274  MachineFunctionPass::getAnalysisUsage(AU);
275  }
276 
277  MachineFunctionProperties getRequiredProperties() const override {
278  return MachineFunctionProperties()
279  .set(MachineFunctionProperties::Property::IsSSA);
280  }
281 };
282 
283 static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
284  const unsigned Opc = MI.getOpcode();
285 
286  if (TII.isMUBUF(Opc)) {
287  // FIXME: Handle d16 correctly
288  return AMDGPU::getMUBUFElements(Opc);
289  }
290  if (TII.isMIMG(MI)) {
291  uint64_t DMaskImm =
292  TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
293  return countPopulation(DMaskImm);
294  }
295  if (TII.isMTBUF(Opc)) {
296  return AMDGPU::getMTBUFElements(Opc);
297  }
298 
299  switch (Opc) {
300  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
301  return 1;
302  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
303  return 2;
304  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
305  return 4;
306  case AMDGPU::DS_READ_B32: LLVM_FALLTHROUGH;
307  case AMDGPU::DS_READ_B32_gfx9: LLVM_FALLTHROUGH;
308  case AMDGPU::DS_WRITE_B32: LLVM_FALLTHROUGH;
309  case AMDGPU::DS_WRITE_B32_gfx9:
310  return 1;
311  case AMDGPU::DS_READ_B64: LLVM_FALLTHROUGH;
312  case AMDGPU::DS_READ_B64_gfx9: LLVM_FALLTHROUGH;
313  case AMDGPU::DS_WRITE_B64: LLVM_FALLTHROUGH;
314  case AMDGPU::DS_WRITE_B64_gfx9:
315  return 2;
316  default:
317  return 0;
318  }
319 }
320 
321 /// Maps instruction opcode to enum InstClassEnum.
322 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
323  switch (Opc) {
324  default:
325  if (TII.isMUBUF(Opc)) {
326  switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
327  default:
328  return UNKNOWN;
329  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
330  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
331  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
332  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
333  return BUFFER_LOAD;
334  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
335  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
336  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
337  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
338  return BUFFER_STORE;
339  }
340  }
341  if (TII.isMIMG(Opc)) {
342  // Ignore instructions encoded without vaddr.
343  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
344  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
345  return UNKNOWN;
346  // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
347  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
348  TII.isGather4(Opc))
349  return UNKNOWN;
350  return MIMG;
351  }
352  if (TII.isMTBUF(Opc)) {
353  switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
354  default:
355  return UNKNOWN;
356  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
357  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
358  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
359  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
360  return TBUFFER_LOAD;
361  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
362  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
363  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
364  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
365  return TBUFFER_STORE;
366  }
367  }
368  return UNKNOWN;
369  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
370  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
371  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
372  return S_BUFFER_LOAD_IMM;
373  case AMDGPU::DS_READ_B32:
374  case AMDGPU::DS_READ_B32_gfx9:
375  case AMDGPU::DS_READ_B64:
376  case AMDGPU::DS_READ_B64_gfx9:
377  return DS_READ;
378  case AMDGPU::DS_WRITE_B32:
379  case AMDGPU::DS_WRITE_B32_gfx9:
380  case AMDGPU::DS_WRITE_B64:
381  case AMDGPU::DS_WRITE_B64_gfx9:
382  return DS_WRITE;
383  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_sa:
384  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_sa:
385  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_sa:
386  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_sa:
387  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa:
388  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa:
389  case AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa:
390  case AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa:
391  return UNKNOWN;
392  }
393 }
394 
395 /// Determines instruction subclass from opcode. Only instructions
396 /// of the same subclass can be merged together.
397 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
398  switch (Opc) {
399  default:
400  if (TII.isMUBUF(Opc))
401  return AMDGPU::getMUBUFBaseOpcode(Opc);
402  if (TII.isMIMG(Opc)) {
403  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
404  assert(Info);
405  return Info->BaseOpcode;
406  }
407  if (TII.isMTBUF(Opc))
408  return AMDGPU::getMTBUFBaseOpcode(Opc);
409  return -1;
410  case AMDGPU::DS_READ_B32:
411  case AMDGPU::DS_READ_B32_gfx9:
412  case AMDGPU::DS_READ_B64:
413  case AMDGPU::DS_READ_B64_gfx9:
414  case AMDGPU::DS_WRITE_B32:
415  case AMDGPU::DS_WRITE_B32_gfx9:
416  case AMDGPU::DS_WRITE_B64:
417  case AMDGPU::DS_WRITE_B64_gfx9:
418  return Opc;
419  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
420  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
421  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
422  return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
423  }
424 }
425 
426 static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
427  AddressRegs Result;
428 
429  if (TII.isMUBUF(Opc)) {
430  if (AMDGPU::getMUBUFHasVAddr(Opc))
431  Result.VAddr = true;
432  if (AMDGPU::getMUBUFHasSrsrc(Opc))
433  Result.SRsrc = true;
434  if (AMDGPU::getMUBUFHasSoffset(Opc))
435  Result.SOffset = true;
436 
437  return Result;
438  }
439 
440  if (TII.isMIMG(Opc)) {
441  int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
442  if (VAddr0Idx >= 0) {
443  int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
444  Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
445  } else {
446  Result.VAddr = true;
447  }
448  Result.SRsrc = true;
449  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
450  if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
451  Result.SSamp = true;
452 
453  return Result;
454  }
455  if (TII.isMTBUF(Opc)) {
456  if (AMDGPU::getMTBUFHasVAddr(Opc))
457  Result.VAddr = true;
458  if (AMDGPU::getMTBUFHasSrsrc(Opc))
459  Result.SRsrc = true;
460  if (AMDGPU::getMTBUFHasSoffset(Opc))
461  Result.SOffset = true;
462 
463  return Result;
464  }
465 
466  switch (Opc) {
467  default:
468  return Result;
469  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
472  Result.SBase = true;
473  return Result;
474  case AMDGPU::DS_READ_B32:
475  case AMDGPU::DS_READ_B64:
476  case AMDGPU::DS_READ_B32_gfx9:
477  case AMDGPU::DS_READ_B64_gfx9:
478  case AMDGPU::DS_WRITE_B32:
479  case AMDGPU::DS_WRITE_B64:
480  case AMDGPU::DS_WRITE_B32_gfx9:
481  case AMDGPU::DS_WRITE_B64_gfx9:
482  Result.Addr = true;
483  return Result;
484  }
485 }
486 
487 void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
488  const SIInstrInfo &TII,
489  const GCNSubtarget &STM) {
490  I = MI;
491  unsigned Opc = MI->getOpcode();
492  InstClass = getInstClass(Opc, TII);
493 
494  if (InstClass == UNKNOWN)
495  return;
496 
497  switch (InstClass) {
498  case DS_READ:
499  EltSize =
500  (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
501  : 4;
502  break;
503  case DS_WRITE:
504  EltSize =
505  (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
506  : 4;
507  break;
508  case S_BUFFER_LOAD_IMM:
509  EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
510  break;
511  default:
512  EltSize = 4;
513  break;
514  }
515 
516  if (InstClass == MIMG) {
517  DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
518  // Offset is not considered for MIMG instructions.
519  Offset = 0;
520  } else {
521  int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
522  Offset = I->getOperand(OffsetIdx).getImm();
523  }
524 
525  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
526  Format = TII.getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
527 
528  Width = getOpcodeWidth(*I, TII);
529 
530  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
531  Offset &= 0xffff;
532  } else if (InstClass != MIMG) {
533  CPol = TII.getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
534  }
535 
536  AddressRegs Regs = getRegs(Opc, TII);
537 
538  NumAddresses = 0;
539  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
540  AddrIdx[NumAddresses++] =
541  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
542  if (Regs.Addr)
543  AddrIdx[NumAddresses++] =
544  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
545  if (Regs.SBase)
546  AddrIdx[NumAddresses++] =
547  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
548  if (Regs.SRsrc)
549  AddrIdx[NumAddresses++] =
550  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
551  if (Regs.SOffset)
552  AddrIdx[NumAddresses++] =
553  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
554  if (Regs.VAddr)
555  AddrIdx[NumAddresses++] =
556  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
557  if (Regs.SSamp)
558  AddrIdx[NumAddresses++] =
559  AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
560  assert(NumAddresses <= MaxAddressRegs);
561 
562  for (unsigned J = 0; J < NumAddresses; J++)
563  AddrReg[J] = &I->getOperand(AddrIdx[J]);
564 }
565 
566 } // end anonymous namespace.
567 
568 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
569  "SI Load Store Optimizer", false, false)
570 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
571 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
572  false, false)
573 
574 char SILoadStoreOptimizer::ID = 0;
575 
576 char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
577 
578 FunctionPass *llvm::createSILoadStoreOptimizerPass() {
579  return new SILoadStoreOptimizer();
580 }
581 
582 static void moveInstsAfter(MachineBasicBlock::iterator I,
583  ArrayRef<MachineInstr *> InstsToMove) {
584  MachineBasicBlock *MBB = I->getParent();
585  ++I;
586  for (MachineInstr *MI : InstsToMove) {
587  MI->removeFromParent();
588  MBB->insert(I, MI);
589  }
590 }
591 
592 static void addDefsUsesToList(const MachineInstr &MI,
593  DenseSet<Register> &RegDefs,
594  DenseSet<Register> &PhysRegUses) {
595  for (const MachineOperand &Op : MI.operands()) {
596  if (Op.isReg()) {
597  if (Op.isDef())
598  RegDefs.insert(Op.getReg());
599  else if (Op.readsReg() && Op.getReg().isPhysical())
600  PhysRegUses.insert(Op.getReg());
601  }
602  }
603 }
604 
605 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
606  MachineBasicBlock::iterator B,
607  AliasAnalysis *AA) {
608  // RAW or WAR - cannot reorder
609  // WAW - cannot reorder
610  // RAR - safe to reorder
611  return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
612 }
613 
614 // Add MI and its defs to the lists if MI reads one of the defs that are
615 // already in the list. Returns true in that case.
616 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
617  DenseSet<Register> &PhysRegUses,
618  SmallVectorImpl<MachineInstr *> &Insts) {
619  for (MachineOperand &Use : MI.operands()) {
620  // If one of the defs is read, then there is a use of Def between I and the
621  // instruction that I will potentially be merged with. We will need to move
622  // this instruction after the merged instructions.
623  //
624  // Similarly, if there is a def which is read by an instruction that is to
625  // be moved for merging, then we need to move the def-instruction as well.
626  // This can only happen for physical registers such as M0; virtual
627  // registers are in SSA form.
628  if (Use.isReg() && ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
629  (Use.isDef() && RegDefs.count(Use.getReg())) ||
630  (Use.isDef() && Use.getReg().isPhysical() &&
631  PhysRegUses.count(Use.getReg())))) {
632  Insts.push_back(&MI);
633  addDefsUsesToList(MI, RegDefs, PhysRegUses);
634  return true;
635  }
636  }
637 
638  return false;
639 }
640 
641 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
642  ArrayRef<MachineInstr *> InstsToMove,
643  AliasAnalysis *AA) {
644  assert(MemOp.mayLoadOrStore());
645 
646  for (MachineInstr *InstToMove : InstsToMove) {
647  if (!InstToMove->mayLoadOrStore())
648  continue;
649  if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
650  return false;
651  }
652  return true;
653 }
654 
655 // This function assumes that \p A and \p B are identical except for
656 // size and offset, and that they reference adjacent memory.
657 static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
658  const MachineMemOperand *A,
659  const MachineMemOperand *B) {
660  unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
661  unsigned Size = A->getSize() + B->getSize();
662  // This function adds the offset parameter to the existing offset for A,
663  // so we pass 0 here as the offset and then manually set it to the correct
664  // value after the call.
665  MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
666  MMO->setOffset(MinOffset);
667  return MMO;
668 }
669 
670 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
671  const SIInstrInfo &TII,
672  const CombineInfo &Paired) {
673  assert(CI.InstClass == MIMG);
674 
675  // Ignore instructions with tfe/lwe set.
676  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
677  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
678 
679  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
680  return false;
681 
682  // Check other optional immediate operands for equality.
683  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
684  AMDGPU::OpName::unorm, AMDGPU::OpName::da,
685  AMDGPU::OpName::r128, AMDGPU::OpName::a16};
686 
687  for (auto op : OperandsToMatch) {
688  int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
689  if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
690  return false;
691  if (Idx != -1 &&
692  CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
693  return false;
694  }
695 
696  // Check DMask for overlaps.
697  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
698  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
699 
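  // Illustrative example (dmask values assumed, not taken from the source):
  // with DMask values 0x3 and 0xC, MaxMask = 0xC has two trailing zeros, so
  // MinMask must be below 1 << 2 = 4; 0x3 passes, whereas a MinMask of 0x6
  // would overlap the wider mask and be rejected.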
700  unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
701  if ((1u << AllowedBitsForMin) <= MinMask)
702  return false;
703 
704  return true;
705 }
706 
707 static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
708  unsigned ComponentCount,
709  const GCNSubtarget &STI) {
710  if (ComponentCount > 4)
711  return 0;
712 
713  const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
714  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
715  if (!OldFormatInfo)
716  return 0;
717 
718  const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
719  llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
720  ComponentCount,
721  OldFormatInfo->NumFormat, STI);
722 
723  if (!NewFormatInfo)
724  return 0;
725 
726  assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
727  NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
728 
729  return NewFormatInfo->Format;
730 }
731 
732 // Return the value in the inclusive range [Lo,Hi] that is aligned to the
733 // highest power of two. Note that the result is well defined for all inputs
734 // including corner cases like:
735 // - if Lo == Hi, return that value
736 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
737 // - if Lo > Hi, return 0 (as if the range wrapped around)
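// A small worked example (values assumed): mostAlignedValueInRange(9, 17)
// yields 16, since 16 lies in [9, 17] and is aligned to 16, the highest
// power of two reachable inside that range.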
738 static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
739  return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
740 }
741 
742 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
743  const GCNSubtarget &STI,
744  CombineInfo &Paired,
745  bool Modify) {
746  assert(CI.InstClass != MIMG);
747 
748  // XXX - Would the same offset be OK? Is there any reason this would happen or
749  // be useful?
750  if (CI.Offset == Paired.Offset)
751  return false;
752 
753  // This won't be valid if the offset isn't aligned.
754  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
755  return false;
756 
757  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
758 
759  const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
760  llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
761  if (!Info0)
762  return false;
763  const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
764  llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
765  if (!Info1)
766  return false;
767 
768  if (Info0->BitsPerComp != Info1->BitsPerComp ||
769  Info0->NumFormat != Info1->NumFormat)
770  return false;
771 
772  // TODO: Should be possible to support more formats, but if format loads
773  // are not dword-aligned, the merged load might not be valid.
774  if (Info0->BitsPerComp != 32)
775  return false;
776 
777  if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
778  return false;
779  }
780 
781  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
782  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
783  CI.UseST64 = false;
784  CI.BaseOff = 0;
785 
786  // Handle all non-DS instructions.
787  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
788  return (EltOffset0 + CI.Width == EltOffset1 ||
789  EltOffset1 + Paired.Width == EltOffset0) &&
790  CI.CPol == Paired.CPol &&
791  (CI.InstClass == S_BUFFER_LOAD_IMM || CI.CPol == Paired.CPol);
792  }
793 
794  // If the offset in elements doesn't fit in 8-bits, we might be able to use
795  // the stride 64 versions.
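  // For instance (illustrative values): an element offset of 8192 does not
  // fit in 8 bits, but it is a multiple of 64 and 8192 / 64 = 128 does fit,
  // so a pair at element offsets 0 and 8192 can still use the ST64 forms.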
796  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
797  isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
798  if (Modify) {
799  CI.Offset = EltOffset0 / 64;
800  Paired.Offset = EltOffset1 / 64;
801  CI.UseST64 = true;
802  }
803  return true;
804  }
805 
806  // Check if the new offsets fit in the reduced 8-bit range.
807  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
808  if (Modify) {
809  CI.Offset = EltOffset0;
810  Paired.Offset = EltOffset1;
811  }
812  return true;
813  }
814 
815  // Try to shift base address to decrease offsets.
816  uint32_t Min = std::min(EltOffset0, EltOffset1);
817  uint32_t Max = std::max(EltOffset0, EltOffset1);
818 
819  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
820  if (((Max - Min) & ~Mask) == 0) {
821  if (Modify) {
822  // From the range of values we could use for BaseOff, choose the one that
823  // is aligned to the highest power of two, to maximise the chance that
824  // the same offset can be reused for other load/store pairs.
825  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
826  // Copy the low bits of the offsets, so that when we adjust them by
827  // subtracting BaseOff they will be multiples of 64.
828  BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
829  CI.BaseOff = BaseOff * CI.EltSize;
830  CI.Offset = (EltOffset0 - BaseOff) / 64;
831  Paired.Offset = (EltOffset1 - BaseOff) / 64;
832  CI.UseST64 = true;
833  }
834  return true;
835  }
836 
837  if (isUInt<8>(Max - Min)) {
838  if (Modify) {
839  // From the range of values we could use for BaseOff, choose the one that
840  // is aligned to the highest power of two, to maximise the chance that
841  // the same offset can be reused for other load/store pairs.
842  uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
843  CI.BaseOff = BaseOff * CI.EltSize;
844  CI.Offset = EltOffset0 - BaseOff;
845  Paired.Offset = EltOffset1 - BaseOff;
846  }
847  return true;
848  }
849 
850  return false;
851 }
852 
853 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
854  const CombineInfo &CI,
855  const CombineInfo &Paired) {
856  const unsigned Width = (CI.Width + Paired.Width);
857  switch (CI.InstClass) {
858  default:
859  return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
860  case S_BUFFER_LOAD_IMM:
861  switch (Width) {
862  default:
863  return false;
864  case 2:
865  case 4:
866  return true;
867  }
868  }
869 }
870 
871 const TargetRegisterClass *
872 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
873  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
874  return TRI->getRegClassForReg(*MRI, Dst->getReg());
875  }
876  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
877  return TRI->getRegClassForReg(*MRI, Src->getReg());
878  }
879  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
880  return TRI->getRegClassForReg(*MRI, Src->getReg());
881  }
882  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
883  return TRI->getRegClassForReg(*MRI, Dst->getReg());
884  }
885  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
886  return TRI->getRegClassForReg(*MRI, Src->getReg());
887  }
888  return nullptr;
889 }
890 
891 /// This function assumes that CI comes before Paired in a basic block.
892 bool SILoadStoreOptimizer::checkAndPrepareMerge(
893  CombineInfo &CI, CombineInfo &Paired,
894  SmallVectorImpl<MachineInstr *> &InstsToMove) {
895 
896  // Check both offsets (or masks for MIMG) can be combined and fit in the
897  // reduced range.
898  if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
899  return false;
900 
901  if (CI.InstClass != MIMG &&
902  (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
903  return false;
904 
905  const unsigned Opc = CI.I->getOpcode();
906  const InstClassEnum InstClass = getInstClass(Opc, *TII);
907 
908  if (InstClass == UNKNOWN) {
909  return false;
910  }
911  const unsigned InstSubclass = getInstSubclass(Opc, *TII);
912 
913  // Do not merge VMEM buffer instructions with "swizzled" bit set.
914  int Swizzled =
915  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
916  if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
917  return false;
918 
919  DenseSet<Register> RegDefsToMove;
920  DenseSet<Register> PhysRegUsesToMove;
921  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
922 
923  const TargetRegisterClass *DataRC = getDataRegClass(*CI.I);
924  bool IsAGPR = TRI->hasAGPRs(DataRC);
925 
926  MachineBasicBlock::iterator E = std::next(Paired.I);
927  MachineBasicBlock::iterator MBBI = std::next(CI.I);
928  MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
929  for (; MBBI != E; ++MBBI) {
930 
931  if (MBBI == MBBE) {
932  // CombineInfo::Order is a hint on the instruction ordering within the
933  // basic block. This hint suggests that CI precedes Paired, which is
934  // true most of the time. However, moveInstsAfter(), while processing a
935  // previous list, may have changed this order when it moved an
936  // instruction that also exists in some other merge list.
937  // In this case it must be dependent.
938  return false;
939  }
940 
941  if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
942  (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
943  // This is not a matching instruction, but we can keep looking as
944  // long as one of these conditions is met:
945  // 1. It is safe to move I down past MBBI.
946  // 2. It is safe to move MBBI down past the instruction that I will
947  // be merged into.
948 
949  if (MBBI->hasUnmodeledSideEffects()) {
950  // We can't re-order this instruction with respect to other memory
951  // operations, so we fail both conditions mentioned above.
952  return false;
953  }
954 
955  if (MBBI->mayLoadOrStore() &&
956  (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
957  !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
958  // We fail condition #1, but we may still be able to satisfy condition
959  // #2. Add this instruction to the move list and then we will check
960  // if condition #2 holds once we have selected the matching instruction.
961  InstsToMove.push_back(&*MBBI);
962  addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
963  continue;
964  }
965 
966  // When we match I with another DS instruction we will be moving I down
967  // to the location of the matched instruction, and any uses of I will need
968  // to be moved down as well.
969  addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
970  InstsToMove);
971  continue;
972  }
973 
974  // Don't merge volatiles.
975  if (MBBI->hasOrderedMemoryRef())
976  return false;
977 
978  int Swizzled =
979  AMDGPU::getNamedOperandIdx(MBBI->getOpcode(), AMDGPU::OpName::swz);
980  if (Swizzled != -1 && MBBI->getOperand(Swizzled).getImm())
981  return false;
982 
983  // Handle a case like
984  // DS_WRITE_B32 addr, v, idx0
985  // w = DS_READ_B32 addr, idx0
986  // DS_WRITE_B32 addr, f(w), idx1
987  // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
988  // merging of the two writes.
989  if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
990  InstsToMove))
991  continue;
992 
993  if (&*MBBI == &*Paired.I) {
994  if (TRI->hasAGPRs(getDataRegClass(*MBBI)) != IsAGPR)
995  return false;
996  // FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
997  // operands. However we are reporting that ds_write2 shall have
998  // only VGPR data so that machine copy propagation does not
999  // create an illegal instruction with VGPR and AGPR sources.
1000  // Consequently, if we create such an instruction, the verifier
1001  // will complain.
1002  if (IsAGPR && CI.InstClass == DS_WRITE)
1003  return false;
1004 
1005  // We need to go through the list of instructions that we plan to
1006  // move and make sure they are all safe to move down past the merged
1007  // instruction.
1008  if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
1009 
1010  // Call offsetsCanBeCombined with modify = true so that the offsets are
1011  // correct for the new instruction. This should return true, because
1012  // this function should only be called on CombineInfo objects that
1013  // have already been confirmed to be mergeable.
1014  if (CI.InstClass != MIMG)
1015  offsetsCanBeCombined(CI, *STM, Paired, true);
1016  return true;
1017  }
1018  return false;
1019  }
1020 
1021  // We've found a load/store that we couldn't merge for some reason.
1022  // We could potentially keep looking, but we'd need to make sure that
1023  // it was safe to move I and also all the instructions in InstsToMove
1024  // down past this instruction.
1025  // Check if we can move I across MBBI and if we can move all I's users.
1026  if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
1027  !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
1028  break;
1029  }
1030  return false;
1031 }
1032 
1033 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1034  if (STM->ldsRequiresM0Init())
1035  return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1036  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1037 }
1038 
1039 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1040  if (STM->ldsRequiresM0Init())
1041  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1042 
1043  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1044  : AMDGPU::DS_READ2ST64_B64_gfx9;
1045 }
1046 
1047 MachineBasicBlock::iterator
1048 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1049  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1050  MachineBasicBlock *MBB = CI.I->getParent();
1051 
1052  // Be careful, since the addresses could be subregisters themselves in weird
1053  // cases, like vectors of pointers.
1054  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1055 
1056  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1057  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1058 
1059  unsigned NewOffset0 = CI.Offset;
1060  unsigned NewOffset1 = Paired.Offset;
1061  unsigned Opc =
1062  CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1063 
1064  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1065  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1066 
1067  if (NewOffset0 > NewOffset1) {
1068  // Canonicalize the merged instruction so the smaller offset comes first.
1069  std::swap(NewOffset0, NewOffset1);
1070  std::swap(SubRegIdx0, SubRegIdx1);
1071  }
1072 
1073  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1074  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1075 
1076  const MCInstrDesc &Read2Desc = TII->get(Opc);
1077 
1078  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1079  Register DestReg = MRI->createVirtualRegister(SuperRC);
1080 
1081  DebugLoc DL = CI.I->getDebugLoc();
1082 
1083  Register BaseReg = AddrReg->getReg();
1084  unsigned BaseSubReg = AddrReg->getSubReg();
1085  unsigned BaseRegFlags = 0;
1086  if (CI.BaseOff) {
1087  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1088  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1089  .addImm(CI.BaseOff);
1090 
1091  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1092  BaseRegFlags = RegState::Kill;
1093 
1094  TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1095  .addReg(ImmReg)
1096  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1097  .addImm(0); // clamp bit
1098  BaseSubReg = 0;
1099  }
1100 
1101  MachineInstrBuilder Read2 =
1102  BuildMI(*MBB, Paired.I, DL, Read2Desc, DestReg)
1103  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1104  .addImm(NewOffset0) // offset0
1105  .addImm(NewOffset1) // offset1
1106  .addImm(0) // gds
1107  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1108 
1109  (void)Read2;
1110 
1111  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1112 
1113  // Copy to the old destination registers.
1114  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1115  .add(*Dest0) // Copy to same destination including flags and sub reg.
1116  .addReg(DestReg, 0, SubRegIdx0);
1117  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1118  .add(*Dest1)
1119  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1120 
1121  moveInstsAfter(Copy1, InstsToMove);
1122 
1123  CI.I->eraseFromParent();
1124  Paired.I->eraseFromParent();
1125 
1126  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1127  return Read2;
1128 }
1129 
1130 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1131  if (STM->ldsRequiresM0Init())
1132  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1133  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1134  : AMDGPU::DS_WRITE2_B64_gfx9;
1135 }
1136 
1137 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1138  if (STM->ldsRequiresM0Init())
1139  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1140  : AMDGPU::DS_WRITE2ST64_B64;
1141 
1142  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1143  : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1144 }
1145 
1146 MachineBasicBlock::iterator
1147 SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
1148  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1149  MachineBasicBlock *MBB = CI.I->getParent();
1150 
1151  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1152  // sure we preserve the subregister index and any register flags set on them.
1153  const MachineOperand *AddrReg =
1154  TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1155  const MachineOperand *Data0 =
1156  TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1157  const MachineOperand *Data1 =
1158  TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1159 
1160  unsigned NewOffset0 = CI.Offset;
1161  unsigned NewOffset1 = Paired.Offset;
1162  unsigned Opc =
1163  CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1164 
1165  if (NewOffset0 > NewOffset1) {
1166  // Canonicalize the merged instruction so the smaller offset comes first.
1167  std::swap(NewOffset0, NewOffset1);
1168  std::swap(Data0, Data1);
1169  }
1170 
1171  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1172  (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1173 
1174  const MCInstrDesc &Write2Desc = TII->get(Opc);
1175  DebugLoc DL = CI.I->getDebugLoc();
1176 
1177  Register BaseReg = AddrReg->getReg();
1178  unsigned BaseSubReg = AddrReg->getSubReg();
1179  unsigned BaseRegFlags = 0;
1180  if (CI.BaseOff) {
1181  Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1182  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1183  .addImm(CI.BaseOff);
1184 
1185  BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1186  BaseRegFlags = RegState::Kill;
1187 
1188  TII->getAddNoCarry(*MBB, Paired.I, DL, BaseReg)
1189  .addReg(ImmReg)
1190  .addReg(AddrReg->getReg(), 0, BaseSubReg)
1191  .addImm(0); // clamp bit
1192  BaseSubReg = 0;
1193  }
1194 
1195  MachineInstrBuilder Write2 =
1196  BuildMI(*MBB, Paired.I, DL, Write2Desc)
1197  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1198  .add(*Data0) // data0
1199  .add(*Data1) // data1
1200  .addImm(NewOffset0) // offset0
1201  .addImm(NewOffset1) // offset1
1202  .addImm(0) // gds
1203  .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1204 
1205  moveInstsAfter(Write2, InstsToMove);
1206 
1207  CI.I->eraseFromParent();
1208  Paired.I->eraseFromParent();
1209 
1210  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1211  return Write2;
1212 }
1213 
1214 MachineBasicBlock::iterator
1215 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1216  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1217  MachineBasicBlock *MBB = CI.I->getParent();
1218  DebugLoc DL = CI.I->getDebugLoc();
1219  const unsigned Opcode = getNewOpcode(CI, Paired);
1220 
1221  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1222 
1223  Register DestReg = MRI->createVirtualRegister(SuperRC);
1224  unsigned MergedDMask = CI.DMask | Paired.DMask;
1225  unsigned DMaskIdx =
1226  AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1227 
1228  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1229  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1230  if (I == DMaskIdx)
1231  MIB.addImm(MergedDMask);
1232  else
1233  MIB.add((*CI.I).getOperand(I));
1234  }
1235 
1236  // It shouldn't be possible to get this far if the two instructions
1237  // don't have a single memoperand, because MachineInstr::mayAlias()
1238  // will return true if this is the case.
1239  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1240 
1241  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1242  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1243 
1244  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1245 
1246  unsigned SubRegIdx0, SubRegIdx1;
1247  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1248 
1249  // Copy to the old destination registers.
1250  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1251  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1252  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1253 
1254  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1255  .add(*Dest0) // Copy to same destination including flags and sub reg.
1256  .addReg(DestReg, 0, SubRegIdx0);
1257  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1258  .add(*Dest1)
1259  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1260 
1261  moveInstsAfter(Copy1, InstsToMove);
1262 
1263  CI.I->eraseFromParent();
1264  Paired.I->eraseFromParent();
1265  return New;
1266 }
1267 
1268 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
1269  CombineInfo &CI, CombineInfo &Paired,
1270  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1271  MachineBasicBlock *MBB = CI.I->getParent();
1272  DebugLoc DL = CI.I->getDebugLoc();
1273  const unsigned Opcode = getNewOpcode(CI, Paired);
1274 
1275  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1276 
1277  Register DestReg = MRI->createVirtualRegister(SuperRC);
1278  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1279 
1280  // It shouldn't be possible to get this far if the two instructions
1281  // don't have a single memoperand, because MachineInstr::mayAlias()
1282  // will return true if this is the case.
1283  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1284 
1285  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1286  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1287 
1288  MachineInstr *New =
1289  BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg)
1290  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
1291  .addImm(MergedOffset) // offset
1292  .addImm(CI.CPol) // cpol
1293  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1294 
1295  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1296  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1297  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1298 
1299  // Copy to the old destination registers.
1300  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1301  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1302  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1303 
1304  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1305  .add(*Dest0) // Copy to same destination including flags and sub reg.
1306  .addReg(DestReg, 0, SubRegIdx0);
1307  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1308  .add(*Dest1)
1309  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1310 
1311  moveInstsAfter(Copy1, InstsToMove);
1312 
1313  CI.I->eraseFromParent();
1314  Paired.I->eraseFromParent();
1315  return New;
1316 }
1317 
1318 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1319  CombineInfo &CI, CombineInfo &Paired,
1320  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1321  MachineBasicBlock *MBB = CI.I->getParent();
1322  DebugLoc DL = CI.I->getDebugLoc();
1323 
1324  const unsigned Opcode = getNewOpcode(CI, Paired);
1325 
1326  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1327 
1328  // Copy to the new source register.
1329  Register DestReg = MRI->createVirtualRegister(SuperRC);
1330  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1331 
1332  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1333 
1334  AddressRegs Regs = getRegs(Opcode, *TII);
1335 
1336  if (Regs.VAddr)
1337  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1338 
1339  // It shouldn't be possible to get this far if the two instructions
1340  // don't have a single memoperand, because MachineInstr::mayAlias()
1341  // will return true if this is the case.
1342  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1343 
1344  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1345  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1346 
1347  MachineInstr *New =
1348  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1349  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1350  .addImm(MergedOffset) // offset
1351  .addImm(CI.CPol) // cpol
1352  .addImm(0) // tfe
1353  .addImm(0) // swz
1354  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1355 
1356  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1357  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1358  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1359 
1360  // Copy to the old destination registers.
1361  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1362  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1363  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1364 
1365  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1366  .add(*Dest0) // Copy to same destination including flags and sub reg.
1367  .addReg(DestReg, 0, SubRegIdx0);
1368  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1369  .add(*Dest1)
1370  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1371 
1372  moveInstsAfter(Copy1, InstsToMove);
1373 
1374  CI.I->eraseFromParent();
1375  Paired.I->eraseFromParent();
1376  return New;
1377 }
1378 
1379 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1380  CombineInfo &CI, CombineInfo &Paired,
1381  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1382  MachineBasicBlock *MBB = CI.I->getParent();
1383  DebugLoc DL = CI.I->getDebugLoc();
1384 
1385  const unsigned Opcode = getNewOpcode(CI, Paired);
1386 
1387  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1388 
1389  // Copy to the new source register.
1390  Register DestReg = MRI->createVirtualRegister(SuperRC);
1391  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1392 
1393  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
1394 
1395  AddressRegs Regs = getRegs(Opcode, *TII);
1396 
1397  if (Regs.VAddr)
1398  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1399 
1400  unsigned JoinedFormat =
1401  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1402 
1403  // It shouldn't be possible to get this far if the two instructions
1404  // don't have a single memoperand, because MachineInstr::mayAlias()
1405  // will return true if this is the case.
1406  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1407 
1408  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1409  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1410 
1411  MachineInstr *New =
1412  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1413  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1414  .addImm(MergedOffset) // offset
1415  .addImm(JoinedFormat) // format
1416  .addImm(CI.CPol) // cpol
1417  .addImm(0) // tfe
1418  .addImm(0) // swz
1419  .addMemOperand(
1420  combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1421 
1422  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1423  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1424  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1425 
1426  // Copy to the old destination registers.
1427  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1428  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1429  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1430 
1431  BuildMI(*MBB, Paired.I, DL, CopyDesc)
1432  .add(*Dest0) // Copy to same destination including flags and sub reg.
1433  .addReg(DestReg, 0, SubRegIdx0);
1434  MachineInstr *Copy1 = BuildMI(*MBB, Paired.I, DL, CopyDesc)
1435  .add(*Dest1)
1436  .addReg(DestReg, RegState::Kill, SubRegIdx1);
1437 
1438  moveInstsAfter(Copy1, InstsToMove);
1439 
1440  CI.I->eraseFromParent();
1441  Paired.I->eraseFromParent();
1442  return New;
1443 }
1444 
1445 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1446  CombineInfo &CI, CombineInfo &Paired,
1447  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1448  MachineBasicBlock *MBB = CI.I->getParent();
1449  DebugLoc DL = CI.I->getDebugLoc();
1450 
1451  const unsigned Opcode = getNewOpcode(CI, Paired);
1452 
1453  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1454  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1455  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1456 
1457  // Copy to the new source register.
1458  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1460 
1461  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1462  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1463 
1464  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1465  .add(*Src0)
1466  .addImm(SubRegIdx0)
1467  .add(*Src1)
1468  .addImm(SubRegIdx1);
1469 
1470  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1471  .addReg(SrcReg, RegState::Kill);
1472 
1473  AddressRegs Regs = getRegs(Opcode, *TII);
1474 
1475  if (Regs.VAddr)
1476  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1477 
1478  unsigned JoinedFormat =
1479  getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1480 
1481  // It shouldn't be possible to get this far if the two instructions
1482  // don't have a single memoperand, because MachineInstr::mayAlias()
1483  // will return true if this is the case.
1484  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1485 
1486  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1487  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1488 
1489  MachineInstr *New =
1490  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1491  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1492  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1493  .addImm(JoinedFormat) // format
1494  .addImm(CI.CPol) // cpol
1495  .addImm(0) // tfe
1496  .addImm(0) // swz
1497  .addMemOperand(
1498  combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1499 
1500  moveInstsAfter(MIB, InstsToMove);
1501 
1502  CI.I->eraseFromParent();
1503  Paired.I->eraseFromParent();
1504  return New;
1505 }
1506 
1507 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1508  const CombineInfo &Paired) {
1509  const unsigned Width = CI.Width + Paired.Width;
1510 
1511  switch (CI.InstClass) {
1512  default:
1513  assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1514  // FIXME: Handle d16 correctly
1515  return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1516  Width);
1517  case TBUFFER_LOAD:
1518  case TBUFFER_STORE:
1519  return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1520  Width);
1521 
1522  case UNKNOWN:
1523  llvm_unreachable("Unknown instruction class");
1524  case S_BUFFER_LOAD_IMM:
1525  switch (Width) {
1526  default:
1527  return 0;
1528  case 2:
1529  return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1530  case 4:
1531  return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1532  }
1533  case MIMG:
1534  assert("No overlaps" && (countPopulation(CI.DMask | Paired.DMask) == Width));
1535  return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1536  }
1537 }
1538 
1539 std::pair<unsigned, unsigned>
1540 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI, const CombineInfo &Paired) {
1541 
1542  if (CI.Width == 0 || Paired.Width == 0 || CI.Width + Paired.Width > 4)
1543  return std::make_pair(0, 0);
1544 
1545  bool ReverseOrder;
1546  if (CI.InstClass == MIMG) {
1547  assert((countPopulation(CI.DMask | Paired.DMask) == CI.Width + Paired.Width) &&
1548  "No overlaps");
1549  ReverseOrder = CI.DMask > Paired.DMask;
1550  } else
1551  ReverseOrder = CI.Offset > Paired.Offset;
1552 
1553  static const unsigned Idxs[4][4] = {
1554  {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1555  {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
1556  {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
1557  {AMDGPU::sub3, 0, 0, 0},
1558  };
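  // Example (illustrative): a one-dword CI followed in order by a two-dword
  // Paired selects Idx0 = sub0 and Idx1 = sub1_sub2 from the table above.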
1559  unsigned Idx0;
1560  unsigned Idx1;
1561 
1562  assert(CI.Width >= 1 && CI.Width <= 3);
1563  assert(Paired.Width >= 1 && Paired.Width <= 3);
1564 
1565  if (ReverseOrder) {
1566  Idx1 = Idxs[0][Paired.Width - 1];
1567  Idx0 = Idxs[Paired.Width][CI.Width - 1];
1568  } else {
1569  Idx0 = Idxs[0][CI.Width - 1];
1570  Idx1 = Idxs[CI.Width][Paired.Width - 1];
1571  }
1572 
1573  return std::make_pair(Idx0, Idx1);
1574 }
1575 
1576 const TargetRegisterClass *
1577 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1578  const CombineInfo &Paired) {
1579  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
1580  switch (CI.Width + Paired.Width) {
1581  default:
1582  return nullptr;
1583  case 2:
1584  return &AMDGPU::SReg_64_XEXECRegClass;
1585  case 4:
1586  return &AMDGPU::SGPR_128RegClass;
1587  case 8:
1588  return &AMDGPU::SGPR_256RegClass;
1589  case 16:
1590  return &AMDGPU::SGPR_512RegClass;
1591  }
1592  }
1593 
1594  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1595  return TRI->hasAGPRs(getDataRegClass(*CI.I))
1596  ? TRI->getAGPRClassForBitWidth(BitWidth)
1597  : TRI->getVGPRClassForBitWidth(BitWidth);
1598 }
1599 
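// Merge two adjacent MUBUF stores into one wider store. Roughly:
//   buffer_store_dword v0, ..., offset:4
//   buffer_store_dword v1, ..., offset:8
//  ==>
//   buffer_store_dwordx2 v[0:1], ..., offset:4
// The two vdata operands are first combined into a super-register with
// REG_SEQUENCE, and the smaller of the two offsets becomes the new offset.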
1600 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1601  CombineInfo &CI, CombineInfo &Paired,
1602  const SmallVectorImpl<MachineInstr *> &InstsToMove) {
1603  MachineBasicBlock *MBB = CI.I->getParent();
1604  DebugLoc DL = CI.I->getDebugLoc();
1605 
1606  const unsigned Opcode = getNewOpcode(CI, Paired);
1607 
1608  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1609  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1610  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1611 
1612  // Copy to the new source register.
1613  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1614  Register SrcReg = MRI->createVirtualRegister(SuperRC);
1615 
1616  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1617  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1618 
1619  BuildMI(*MBB, Paired.I, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1620  .add(*Src0)
1621  .addImm(SubRegIdx0)
1622  .add(*Src1)
1623  .addImm(SubRegIdx1);
1624 
1625  auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
1626  .addReg(SrcReg, RegState::Kill);
1627 
1628  AddressRegs Regs = getRegs(Opcode, *TII);
1629 
1630  if (Regs.VAddr)
1631  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1632 
1633 
1634  // It shouldn't be possible to get this far if the two instructions
1635  // don't have a single memoperand, because MachineInstr::mayAlias()
1636  // will return true if this is the case.
1637  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1638 
1639  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
1640  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
1641 
1642  MachineInstr *New =
1643  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1644  .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1645  .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1646  .addImm(CI.CPol) // cpol
1647  .addImm(0) // tfe
1648  .addImm(0) // swz
1649  .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
1650 
1651  moveInstsAfter(MIB, InstsToMove);
1652 
1653  CI.I->eraseFromParent();
1654  Paired.I->eraseFromParent();
1655  return New;
1656 }
1657 
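// Return Val as an immediate operand if it is an inline constant; otherwise
// materialize it into a fresh SGPR with S_MOV_B32 and return that register.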
1658 MachineOperand
1659 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1660  APInt V(32, Val, true);
1661  if (TII->isInlineConstant(V))
1662  return MachineOperand::CreateImm(Val);
1663 
1664  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1665  MachineInstr *Mov =
1666  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1667  TII->get(AMDGPU::S_MOV_B32), Reg)
1668  .addImm(Val);
1669  (void)Mov;
1670  LLVM_DEBUG(dbgs() << " "; Mov->dump());
1671  return MachineOperand::CreateReg(Reg, false);
1672 }
1673 
1674 // Compute base address using Addr and return the final register.
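1674 // The 64-bit base is materialized as V_ADD_CO_U32 on the low half and
1674 // V_ADDC_U32 on the high half (consuming the carry), with the two 32-bit
1674 // results combined by a REG_SEQUENCE into a 64-bit VGPR pair.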
1675 Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1676  const MemAddress &Addr) const {
1677  MachineBasicBlock *MBB = MI.getParent();
1678  MachineBasicBlock::iterator MBBI = MI.getIterator();
1679  DebugLoc DL = MI.getDebugLoc();
1680 
1681  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1682  Addr.Base.LoSubReg) &&
1683  "Expected 32-bit Base-Register-Low!!");
1684 
1685  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1686  Addr.Base.HiSubReg) &&
1687  "Expected 32-bit Base-Register-Hi!!");
1688 
1689  LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1690  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1691  MachineOperand OffsetHi =
1692  createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1693 
1694  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1695  Register CarryReg = MRI->createVirtualRegister(CarryRC);
1696  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1697 
1698  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1699  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1700  MachineInstr *LoHalf =
1701  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1702  .addReg(CarryReg, RegState::Define)
1703  .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1704  .add(OffsetLo)
1705  .addImm(0); // clamp bit
1706  (void)LoHalf;
1707  LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1708 
1709  MachineInstr *HiHalf =
1710  BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1711  .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1712  .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
1713  .add(OffsetHi)
1714  .addReg(CarryReg, RegState::Kill)
1715  .addImm(0); // clamp bit
1716  (void)HiHalf;
1717  LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
1718 
1719  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1720  MachineInstr *FullBase =
1721  BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
1722  .addReg(DestSub0)
1723  .addImm(AMDGPU::sub0)
1724  .addReg(DestSub1)
1725  .addImm(AMDGPU::sub1);
1726  (void)FullBase;
1727  LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
1728 
1729  return FullDestReg;
1730 }
1731 
1732 // Update base and offset with the NewBase and NewOffset in MI.
1733 void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
1734  Register NewBase,
1735  int32_t NewOffset) const {
1736  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1737  Base->setReg(NewBase);
1738  Base->setIsKill(false);
1739  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
1740 }
1741 
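// Return the constant offset represented by Op: either the immediate itself,
// or, for a register operand, the immediate of the unique S_MOV_B32 that
// defines it. Returns None otherwise.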
1742 Optional<int32_t>
1743 SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
1744  if (Op.isImm())
1745  return Op.getImm();
1746 
1747  if (!Op.isReg())
1748  return None;
1749 
1750  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
1751  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
1752  !Def->getOperand(1).isImm())
1753  return None;
1754 
1755  return Def->getOperand(1).getImm();
1756 }
1757 
1758 // Analyze Base and extract:
1759 // - 32-bit base registers and subregisters
1760 // - a 64-bit constant offset
1761 // Expecting the base computation as:
1762 // %OFFSET0:sgpr_32 = S_MOV_B32 8000
1763 // %LO:vgpr_32, %c:sreg_64_xexec =
1764 // V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
1765 // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
1766 // %Base:vreg_64 =
1767 // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
1768 void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
1769  MemAddress &Addr) const {
1770  if (!Base.isReg())
1771  return;
1772 
1773  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
1774  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
1775  || Def->getNumOperands() != 5)
1776  return;
1777 
1778  MachineOperand BaseLo = Def->getOperand(1);
1779  MachineOperand BaseHi = Def->getOperand(3);
1780  if (!BaseLo.isReg() || !BaseHi.isReg())
1781  return;
1782 
1783  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
1784  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
1785 
1786  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
1787  !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
1788  return;
1789 
1790  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
1791  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
1792 
1793  auto Offset0P = extractConstOffset(*Src0);
1794  if (Offset0P)
1795  BaseLo = *Src1;
1796  else {
1797  if (!(Offset0P = extractConstOffset(*Src1)))
1798  return;
1799  BaseLo = *Src0;
1800  }
1801 
1802  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
1803  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
1804 
1805  if (Src0->isImm())
1806  std::swap(Src0, Src1);
1807 
1808  if (!Src1->isImm())
1809  return;
1810 
1811  uint64_t Offset1 = Src1->getImm();
1812  BaseHi = *Src0;
1813 
1814  Addr.Base.LoReg = BaseLo.getReg();
1815  Addr.Base.HiReg = BaseHi.getReg();
1816  Addr.Base.LoSubReg = BaseLo.getSubReg();
1817  Addr.Base.HiSubReg = BaseHi.getSubReg();
1818  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
1819 }
1820 
1821 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
1822  MachineInstr &MI,
1823  MemInfoMap &Visited,
1824  SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
1825 
1826  if (!(MI.mayLoad() ^ MI.mayStore()))
1827  return false;
1828 
1829  // TODO: Support flat and scratch.
1830  if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
1831  return false;
1832 
1833  if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
1834  return false;
1835 
1836  if (AnchorList.count(&MI))
1837  return false;
1838 
1839  LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
1840 
1841  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
1842  LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
1843  return false;
1844  }
1845 
1846  // Step 1: Find the base registers and a 64-bit constant offset.
1847  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1848  MemAddress MAddr;
1849  if (Visited.find(&MI) == Visited.end()) {
1850  processBaseWithConstOffset(Base, MAddr);
1851  Visited[&MI] = MAddr;
1852  } else
1853  MAddr = Visited[&MI];
1854 
1855  if (MAddr.Offset == 0) {
1856  LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
1857  " constant offsets that can be promoted.\n";);
1858  return false;
1859  }
1860 
1861  LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
1862  << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
1863 
1864  // Step 2: Traverse MI's basic block and find an anchor (an instruction with the
1865  // same base registers) at the largest offset distance from MI that still fits in 13 bits.
1866  // E.g. (64bit loads)
1867  // bb:
1868  // addr1 = &a + 4096; load1 = load(addr1, 0)
1869  // addr2 = &a + 6144; load2 = load(addr2, 0)
1870  // addr3 = &a + 8192; load3 = load(addr3, 0)
1871  // addr4 = &a + 10240; load4 = load(addr4, 0)
1872  // addr5 = &a + 12288; load5 = load(addr5, 0)
1873  //
1874  // Starting from the first load, the optimization tries to find a new base
1875  // from which (&a + 4096) is within a 13-bit distance. Both &a + 6144 and &a + 8192
1876  // are within a 13-bit distance of &a + 4096. The heuristic picks &a + 8192
1877  // as the new base (anchor) because the maximum distance can presumably
1878  // accommodate more intermediate bases.
1879  //
1880  // Step 3: Move (&a + 8192) above load1, then compute and promote offsets from
1881  // (&a + 8192) for load1, load2, load4.
1882  // addr = &a + 8192
1883  // load1 = load(addr, -4096)
1884  // load2 = load(addr, -2048)
1885  // load3 = load(addr, 0)
1886  // load4 = load(addr, 2048)
1887  // addr5 = &a + 12288; load5 = load(addr5, 0)
1888  //
1889  MachineInstr *AnchorInst = nullptr;
1890  MemAddress AnchorAddr;
1891  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
1892  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
1893 
1894  MachineBasicBlock *MBB = MI.getParent();
1895  MachineBasicBlock::iterator E = MBB->end();
1896  MachineBasicBlock::iterator MBBI = MI.getIterator();
1897  ++MBBI;
1898  const SITargetLowering *TLI =
1899  static_cast<const SITargetLowering *>(STM->getTargetLowering());
1900 
1901  for ( ; MBBI != E; ++MBBI) {
1902  MachineInstr &MINext = *MBBI;
1903  // TODO: Support finding an anchor (with the same base) from store addresses or
1904  // any other load addresses where the opcodes are different.
1905  if (MINext.getOpcode() != MI.getOpcode() ||
1906  TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
1907  continue;
1908 
1909  const MachineOperand &BaseNext =
1910  *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
1911  MemAddress MAddrNext;
1912  if (Visited.find(&MINext) == Visited.end()) {
1913  processBaseWithConstOffset(BaseNext, MAddrNext);
1914  Visited[&MINext] = MAddrNext;
1915  } else
1916  MAddrNext = Visited[&MINext];
1917 
1918  if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
1919  MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
1920  MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
1921  MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
1922  continue;
1923 
1924  InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
1925 
1926  int64_t Dist = MAddr.Offset - MAddrNext.Offset;
1927  TargetLoweringBase::AddrMode AM;
1928  AM.HasBaseReg = true;
1929  AM.BaseOffs = Dist;
1930  if (TLI->isLegalGlobalAddressingMode(AM) &&
1931  (uint32_t)std::abs(Dist) > MaxDist) {
1932  MaxDist = std::abs(Dist);
1933 
1934  AnchorAddr = MAddrNext;
1935  AnchorInst = &MINext;
1936  }
1937  }
1938 
1939  if (AnchorInst) {
1940  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
1941  AnchorInst->dump());
1942  LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
1943  << AnchorAddr.Offset << "\n\n");
1944 
1945  // Instead of moving up, just re-compute the anchor instruction's base address.
1946  Register Base = computeBase(MI, AnchorAddr);
1947 
1948  updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
1949  LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
1950 
1951  for (auto P : InstsWCommonBase) {
1952  TargetLoweringBase::AddrMode AM;
1953  AM.HasBaseReg = true;
1954  AM.BaseOffs = P.second - AnchorAddr.Offset;
1955 
1956  if (TLI->isLegalGlobalAddressingMode(AM)) {
1957  LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
1958  dbgs() << ")"; P.first->dump());
1959  updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
1960  LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
1961  }
1962  }
1963  AnchorList.insert(AnchorInst);
1964  return true;
1965  }
1966 
1967  return false;
1968 }
1969 
1970 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
1971  std::list<std::list<CombineInfo> > &MergeableInsts) const {
1972  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
1973  if (AddrList.front().InstClass == CI.InstClass &&
1974  AddrList.front().hasSameBaseAddress(*CI.I)) {
1975  AddrList.emplace_back(CI);
1976  return;
1977  }
1978  }
1979 
1980  // Base address not found, so add a new list.
1981  MergeableInsts.emplace_back(1, CI);
1982 }
1983 
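// Scan [Begin, End), promoting constant offsets where possible and grouping
// mergeable instructions into per-base-address lists. Scanning stops at the
// first ordered memory reference; the returned iterator is where the caller
// should resume, together with whether any instruction was modified.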
1984 std::pair<MachineBasicBlock::iterator, bool>
1985 SILoadStoreOptimizer::collectMergeableInsts(
1986  MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
1987  MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
1988  std::list<std::list<CombineInfo>> &MergeableInsts) const {
1989  bool Modified = false;
1990 
1991  // Sort potentially mergeable instructions into lists, one list per base address.
1992  unsigned Order = 0;
1993  MachineBasicBlock::iterator BlockI = Begin;
1994  for (; BlockI != End; ++BlockI) {
1995  MachineInstr &MI = *BlockI;
1996 
1997  // We run this before checking if an address is mergeable, because it can produce
1998  // better code even if the instructions aren't mergeable.
1999  if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2000  Modified = true;
2001 
2002  // Don't combine if volatile. We also won't be able to merge across this, so
2003  // break the search. We can look after this barrier for separate merges.
2004  if (MI.hasOrderedMemoryRef()) {
2005  LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
2006 
2007  // Search will resume after this instruction in a separate merge list.
2008  ++BlockI;
2009  break;
2010  }
2011 
2012  const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2013  if (InstClass == UNKNOWN)
2014  continue;
2015 
2016  CombineInfo CI;
2017  CI.setMI(MI, *TII, *STM);
2018  CI.Order = Order++;
2019 
2020  if (!CI.hasMergeableAddress(*MRI))
2021  continue;
2022 
2023  LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2024 
2025  addInstToMergeableList(CI, MergeableInsts);
2026  }
2027 
2028  // At this point we have lists of Mergeable instructions.
2029  //
2030  // Part 2: Sort lists by offset and then for each CombineInfo object in the
2031  // list try to find an instruction that can be merged with I. If an instruction
2032  // is found, it is stored in the Paired field. If no instructions are found, then
2033  // the CombineInfo object is deleted from the list.
2034 
2035  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2036  E = MergeableInsts.end(); I != E;) {
2037 
2038  std::list<CombineInfo> &MergeList = *I;
2039  if (MergeList.size() <= 1) {
2040  // This means we have found only one instruction with a given address
2041  // that can be merged, and we need at least 2 instructions to do a merge,
2042  // so this list can be discarded.
2043  I = MergeableInsts.erase(I);
2044  continue;
2045  }
2046 
2047  // Sort the lists by offset; this way mergeable instructions will be
2048  // adjacent to each other in the list, which will make it easier to find
2049  // matches.
2050  MergeList.sort(
2051  [] (const CombineInfo &A, CombineInfo &B) {
2052  return A.Offset < B.Offset;
2053  });
2054  ++I;
2055  }
2056 
2057  return std::make_pair(BlockI, Modified);
2058 }
2059 
2060 // Scan through looking for adjacent LDS operations with constant offsets from
2061 // the same base register. We rely on the scheduler to do the hard work of
2062 // clustering nearby loads, and assume these are all adjacent.
2063 bool SILoadStoreOptimizer::optimizeBlock(
2064  std::list<std::list<CombineInfo> > &MergeableInsts) {
2065  bool Modified = false;
2066 
2067  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2068  E = MergeableInsts.end(); I != E;) {
2069  std::list<CombineInfo> &MergeList = *I;
2070 
2071  bool OptimizeListAgain = false;
2072  if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2073  // We weren't able to make any changes, so delete the list so we don't
2074  // process the same instructions the next time we try to optimize this
2075  // block.
2076  I = MergeableInsts.erase(I);
2077  continue;
2078  }
2079 
2080  Modified = true;
2081 
2082  // We made changes, but also determined that there were no more optimization
2083  // opportunities, so we don't need to reprocess the list
2084  if (!OptimizeListAgain) {
2085  I = MergeableInsts.erase(I);
2086  continue;
2087  }
2088  OptimizeAgain = true;
2089  }
2090  return Modified;
2091 }
2092 
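// Walk a single same-base-address list, merging adjacent pairs. OptimizeListAgain
// is set when a merged result is still narrow enough that another pass over the
// list could widen it further (e.g. two dword buffer loads merged into an x2 that
// could still become an x4).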
2093 bool
2094 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2095  std::list<CombineInfo> &MergeList,
2096  bool &OptimizeListAgain) {
2097  if (MergeList.empty())
2098  return false;
2099 
2100  bool Modified = false;
2101 
2102  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2103  Next = std::next(I)) {
2104 
2105  auto First = I;
2106  auto Second = Next;
2107 
2108  if ((*First).Order > (*Second).Order)
2109  std::swap(First, Second);
2110  CombineInfo &CI = *First;
2111  CombineInfo &Paired = *Second;
2112 
2113  SmallVector<MachineInstr *, 8> InstsToMove;
2114  if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
2115  ++I;
2116  continue;
2117  }
2118 
2119  Modified = true;
2120 
2121  LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2122 
2123  switch (CI.InstClass) {
2124  default:
2125  llvm_unreachable("unknown InstClass");
2126  break;
2127  case DS_READ: {
2128  MachineBasicBlock::iterator NewMI =
2129  mergeRead2Pair(CI, Paired, InstsToMove);
2130  CI.setMI(NewMI, *TII, *STM);
2131  break;
2132  }
2133  case DS_WRITE: {
2134  MachineBasicBlock::iterator NewMI =
2135  mergeWrite2Pair(CI, Paired, InstsToMove);
2136  CI.setMI(NewMI, *TII, *STM);
2137  break;
2138  }
2139  case S_BUFFER_LOAD_IMM: {
2140  MachineBasicBlock::iterator NewMI =
2141  mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
2142  CI.setMI(NewMI, *TII, *STM);
2143  OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
2144  break;
2145  }
2146  case BUFFER_LOAD: {
2147  MachineBasicBlock::iterator NewMI =
2148  mergeBufferLoadPair(CI, Paired, InstsToMove);
2149  CI.setMI(NewMI, *TII, *STM);
2150  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2151  break;
2152  }
2153  case BUFFER_STORE: {
2154  MachineBasicBlock::iterator NewMI =
2155  mergeBufferStorePair(CI, Paired, InstsToMove);
2156  CI.setMI(NewMI, *TII, *STM);
2157  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2158  break;
2159  }
2160  case MIMG: {
2161  MachineBasicBlock::iterator NewMI =
2162  mergeImagePair(CI, Paired, InstsToMove);
2163  CI.setMI(NewMI, *TII, *STM);
2164  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2165  break;
2166  }
2167  case TBUFFER_LOAD: {
2168  MachineBasicBlock::iterator NewMI =
2169  mergeTBufferLoadPair(CI, Paired, InstsToMove);
2170  CI.setMI(NewMI, *TII, *STM);
2171  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2172  break;
2173  }
2174  case TBUFFER_STORE: {
2175  MachineBasicBlock::iterator NewMI =
2176  mergeTBufferStorePair(CI, Paired, InstsToMove);
2177  CI.setMI(NewMI, *TII, *STM);
2178  OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
2179  break;
2180  }
2181  }
2182  CI.Order = Paired.Order;
2183  if (I == Second)
2184  I = Next;
2185 
2186  MergeList.erase(Second);
2187  }
2188 
2189  return Modified;
2190 }
2191 
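// Driver: for each basic block, collect mergeable instructions up to the next
// memory fence, repeatedly optimize the collected lists until no further merge
// opportunities remain, then continue scanning after the fence.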
2192 bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2193  if (skipFunction(MF.getFunction()))
2194  return false;
2195 
2196  STM = &MF.getSubtarget<GCNSubtarget>();
2197  if (!STM->loadStoreOptEnabled())
2198  return false;
2199 
2200  TII = STM->getInstrInfo();
2201  TRI = &TII->getRegisterInfo();
2202 
2203  MRI = &MF.getRegInfo();
2204  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2205 
2206  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2207 
2208  bool Modified = false;
2209 
2210  // Contains the list of instructions for which constant offsets are being
2211  // promoted to the immediate. This is tracked for one basic block at a time.
2212  SmallPtrSet<MachineInstr *, 4> AnchorList;
2213  MemInfoMap Visited;
2214 
2215  for (MachineBasicBlock &MBB : MF) {
2216  MachineBasicBlock::iterator SectionEnd;
2217  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2218  I = SectionEnd) {
2219  bool CollectModified;
2220  std::list<std::list<CombineInfo>> MergeableInsts;
2221 
2222  // First pass: Collect list of all instructions we know how to merge in a
2223  // subset of the block.
2224  std::tie(SectionEnd, CollectModified) =
2225  collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2226 
2227  Modified |= CollectModified;
2228 
2229  do {
2230  OptimizeAgain = false;
2231  Modified |= optimizeBlock(MergeableInsts);
2232  } while (OptimizeAgain);
2233  }
2234 
2235  Visited.clear();
2236  AnchorList.clear();
2237  }
2238 
2239  return Modified;
2240 }