LLVM 8.0.0svn
SILoadStoreOptimizer.cpp
//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
//
// Future improvements:
//
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offsets, but are close enough together that their differences fit in
//   8 bits, we can add to the base pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <iterator>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "si-load-store-opt"

namespace {

class SILoadStoreOptimizer : public MachineFunctionPass {
  enum InstClassEnum {
    DS_READ_WRITE,
    S_BUFFER_LOAD_IMM,
    BUFFER_LOAD_OFFEN,
    BUFFER_LOAD_OFFSET,
    BUFFER_STORE_OFFEN,
    BUFFER_STORE_OFFSET,
  };

  struct CombineInfo {
    MachineBasicBlock::iterator I;
    MachineBasicBlock::iterator Paired;
    unsigned EltSize;
    unsigned Offset0;
    unsigned Offset1;
    unsigned BaseOff;
    InstClassEnum InstClass;
    bool GLC0;
    bool GLC1;
    bool SLC0;
    bool SLC1;
    bool UseST64;
    bool IsX2;
    SmallVector<MachineInstr*, 8> InstsToMove;
  };

private:
  const GCNSubtarget *STM = nullptr;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  AliasAnalysis *AA = nullptr;
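  // Number of x2 merges created while optimizing the current block; a nonzero
  // count triggers a second optimizeBlock() pass so the new x2 instructions
  // can be merged again into x4 forms (see runOnMachineFunction).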
  unsigned CreatedX2;

  static bool offsetsCanBeCombined(CombineInfo &CI);

  bool findMatchingInst(CombineInfo &CI);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  unsigned promoteBufferStoreOpcode(const MachineInstr &I, bool &IsX2,
                                    bool &IsOffen) const;
  MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);

public:
  static char ID;

  SILoadStoreOptimizer() : MachineFunctionPass(ID) {
    initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  }

  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<AAResultsWrapperPass>();

    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace.

INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}

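// Re-insert every instruction in InstsToMove immediately after I, preserving
// their relative order. This is how instructions that cannot stay above the
// merged access get sunk below it.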
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *MBB = I->getParent();
  ++I;
  for (MachineInstr *MI : InstsToMove) {
    MI->removeFromParent();
    MBB->insert(I, MI);
  }
}

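// Record the registers defined by MI, and the physical registers it reads, so
// that later instructions can be checked for dependences on the set of
// instructions already selected to be moved.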
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<unsigned> &RegDefs,
                              DenseSet<unsigned> &PhysRegUses) {
  for (const MachineOperand &Op : MI.operands()) {
    if (Op.isReg()) {
      if (Op.isDef())
        RegDefs.insert(Op.getReg());
      else if (Op.readsReg() &&
               TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
        PhysRegUses.insert(Op.getReg());
    }
  }
}

static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
                                      MachineBasicBlock::iterator B,
                                      const SIInstrInfo *TII,
                                      AliasAnalysis *AA) {
  // RAW or WAR - cannot reorder
  // WAW - cannot reorder
  // RAR - safe to reorder
  return !(A->mayStore() || B->mayStore()) ||
         TII->areMemAccessesTriviallyDisjoint(*A, *B, AA);
}

// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
static bool
addToListsIfDependent(MachineInstr &MI,
                      DenseSet<unsigned> &RegDefs,
                      DenseSet<unsigned> &PhysRegUses,
                      SmallVectorImpl<MachineInstr*> &Insts) {
  for (MachineOperand &Use : MI.operands()) {
    // If one of the defs is read, then there is a use of Def between I and the
    // instruction that I will potentially be merged with. We will need to move
    // this instruction after the merged instructions.
    //
    // Similarly, if there is a def which is read by an instruction that is to
    // be moved for merging, then we need to move the def-instruction as well.
    // This can only happen for physical registers such as M0; virtual
    // registers are in SSA form.
    if (Use.isReg() &&
        ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
         (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
          PhysRegUses.count(Use.getReg())))) {
      Insts.push_back(&MI);
      addDefsUsesToList(MI, RegDefs, PhysRegUses);
      return true;
    }
  }

  return false;
}

static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
                        ArrayRef<MachineInstr*> InstsToMove,
                        const SIInstrInfo *TII,
                        AliasAnalysis *AA) {
  assert(MemOp.mayLoadOrStore());

  for (MachineInstr *InstToMove : InstsToMove) {
    if (!InstToMove->mayLoadOrStore())
      continue;
    if (!memAccessesCanBeReordered(MemOp, *InstToMove, TII, AA))
      return false;
  }
  return true;
}

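// Decide whether the two offsets recorded in CI can be encoded in one merged
// instruction. For DS instructions this may rewrite CI.Offset0/CI.Offset1 into
// the 8-bit (or stride-64) element encoding and set CI.BaseOff and CI.UseST64;
// for SMEM/VMEM the offsets only need to be adjacent and the cache policy bits
// must match.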
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  // XXX - Would the same offset be OK? Is there any reason this would happen or
  // be useful?
  if (CI.Offset0 == CI.Offset1)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
    return false;

  unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  CI.UseST64 = false;
  CI.BaseOff = 0;

  // Handle SMEM and VMEM instructions.
  if (CI.InstClass != DS_READ_WRITE) {
    unsigned Diff = CI.IsX2 ? 2 : 1;
    return (EltOffset0 + Diff == EltOffset1 ||
            EltOffset1 + Diff == EltOffset0) &&
           CI.GLC0 == CI.GLC1 &&
           (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  }

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset0 = EltOffset0 / 64;
    CI.Offset1 = EltOffset1 / 64;
    CI.UseST64 = true;
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset0 = EltOffset0;
    CI.Offset1 = EltOffset1;
    return true;
  }

  // Try to shift base address to decrease offsets.
  unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  CI.BaseOff = std::min(CI.Offset0, CI.Offset1);

  if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
    CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
    CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
    CI.UseST64 = true;
    return true;
  }

  if (isUInt<8>(OffsetDiff)) {
    CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
    CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
    return true;
  }

  return false;
}

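// Scan forward from CI.I for an instruction with the same opcode and the same
// base address whose offset can be combined with CI.I's offset. On success,
// CI.Paired, the offsets, and the cache policy bits are filled in, and
// CI.InstsToMove holds the intervening instructions that must be sunk past the
// merge point.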
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = CI.I;

  unsigned AddrOpName[3] = {0};
  int AddrIdx[3];
  const MachineOperand *AddrReg[3];
  unsigned NumAddresses = 0;

  switch (CI.InstClass) {
  case DS_READ_WRITE:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
    break;
  case S_BUFFER_LOAD_IMM:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
    break;
  case BUFFER_LOAD_OFFEN:
  case BUFFER_STORE_OFFEN:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  case BUFFER_LOAD_OFFSET:
  case BUFFER_STORE_OFFSET:
    AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
    AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
    break;
  }

  for (unsigned i = 0; i < NumAddresses; i++) {
    AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
    AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);

    // We only ever merge operations with the same base address register, so
    // don't bother scanning forward if there are no other uses.
    if (AddrReg[i]->isReg() &&
        (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
         MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
      return false;
  }

  ++MBBI;

  DenseSet<unsigned> RegDefsToMove;
  DenseSet<unsigned> PhysRegUsesToMove;
  addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);

  for ( ; MBBI != E; ++MBBI) {
    if (MBBI->getOpcode() != CI.I->getOpcode()) {
      // This is not a matching DS instruction, but we can keep looking as
      // long as one of these conditions is met:
      // 1. It is safe to move I down past MBBI.
      // 2. It is safe to move MBBI down past the instruction that I will
      //    be merged into.

      if (MBBI->hasUnmodeledSideEffects()) {
        // We can't re-order this instruction with respect to other memory
        // operations, so we fail both conditions mentioned above.
        return false;
      }

      if (MBBI->mayLoadOrStore() &&
          (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
           !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
        // We fail condition #1, but we may still be able to satisfy condition
        // #2. Add this instruction to the move list and then we will check
        // if condition #2 holds once we have selected the matching instruction.
        CI.InstsToMove.push_back(&*MBBI);
        addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
        continue;
      }

      // When we match I with another DS instruction we will be moving I down
      // to the location of the matched instruction; any uses of I will need to
      // be moved down as well.
      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                            CI.InstsToMove);
      continue;
    }

    // Don't merge volatiles.
    if (MBBI->hasOrderedMemoryRef())
      return false;

    // Handle a case like
    //   DS_WRITE_B32 addr, v, idx0
    //   w = DS_READ_B32 addr, idx0
    //   DS_WRITE_B32 addr, f(w), idx1
    // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
    // merging of the two writes.
    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
                              CI.InstsToMove))
      continue;

    bool Match = true;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm()) {
          Match = false;
          break;
        }
        continue;
      }

      // Check same base pointer. Be careful of subregisters, which can occur
      // with vectors of pointers.
      if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
          AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
        Match = false;
        break;
      }
    }

    if (Match) {
      int OffsetIdx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),
                                                 AMDGPU::OpName::offset);
      CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
      CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
      CI.Paired = MBBI;

      if (CI.InstClass == DS_READ_WRITE) {
        CI.Offset0 &= 0xffff;
        CI.Offset1 &= 0xffff;
      } else {
        CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
        CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
        if (CI.InstClass != S_BUFFER_LOAD_IMM) {
          CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
          CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
        }
      }

      // Check that both offsets fit in the reduced range.
      // We also need to go through the list of instructions that we plan to
      // move and make sure they are all safe to move down past the merged
      // instruction.
      if (offsetsCanBeCombined(CI))
        if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
          return true;
    }

    // We've found a load/store that we couldn't merge for some reason.
    // We could potentially keep looking, but we'd need to make sure that
    // it was safe to move I and also all the instructions in InstsToMove
    // down past this instruction.
    // Check if we can move I across MBBI and if we can move all I's users.
    if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
        !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
      break;
  }
  return false;
}

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_READ2ST64_B32_gfx9 : AMDGPU::DS_READ2ST64_B64_gfx9;
}

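// Replace CI.I and CI.Paired with a single ds_read2 / ds_read2st64 and copy
// each half of the wide result back into the original destination registers.
// Returns an iterator to the instruction following the original CI.I.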
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC
    = (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);

  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 = BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
                                  .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
                                  .addImm(NewOffset0)                        // offset0
                                  .addImm(NewOffset1)                        // offset1
                                  .addImm(0)                                 // gds
                                  .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  (void)Read2;

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);

  // Copy to the old destination registers.
  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  return Next;
}

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9 : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ?
    AMDGPU::DS_WRITE2ST64_B32_gfx9 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

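// Replace CI.I and CI.Paired with a single ds_write2 / ds_write2st64 taking
// both data operands, materializing an adjusted base address first if a
// nonzero CI.BaseOff was chosen by offsetsCanBeCombined().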
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();

  // Be sure to use .add() and not .addReg() with these. We want to be sure we
  // preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1
    = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset0;
  unsigned NewOffset1 = CI.Offset1;
  unsigned Opc = CI.UseST64 ?
    write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) &&
         "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);
  DebugLoc DL = CI.I->getDebugLoc();

  unsigned BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
      .addImm(CI.BaseOff);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
      .addReg(ImmReg)
      .addReg(AddrReg->getReg(), 0, BaseSubReg);
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 = BuildMI(*MBB, CI.Paired, DL, Write2Desc)
                                   .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
                                   .add(*Data0)                               // data0
                                   .add(*Data1)                               // data1
                                   .addImm(NewOffset0)                        // offset0
                                   .addImm(NewOffset1)                        // offset1
                                   .addImm(0)                                 // gds
                                   .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(Write2, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Next;
}

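// Merge a pair of s_buffer_load_dword(x2) instructions into the next wider
// form and copy the subregisters of the wide result back to the original
// destinations.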
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode = CI.IsX2 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM :
                              AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::SReg_128RegClass : &AMDGPU::SReg_64_XEXECRegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

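// Same idea for buffer_load_dword(x2), with the opcode chosen according to the
// offen/offset addressing mode of the pair.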
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  unsigned Opcode;

  if (CI.InstClass == BUFFER_LOAD_OFFEN) {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  } else {
    Opcode = CI.IsX2 ? AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET :
                       AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  }

  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);

  if (CI.InstClass == BUFFER_LOAD_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.GLC0)      // glc
      .addImm(CI.SLC0)      // slc
      .addImm(0)            // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the old destination registers.
  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
                            .add(*Dest1)
                            .addReg(DestReg, RegState::Kill, SubRegIdx1);

  moveInstsAfter(Copy1, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

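// Map a buffer store opcode to its next wider form, reporting whether the
// input is already an x2 store and whether it uses the offen addressing mode.
// Returns 0 for opcodes this pass does not handle.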
unsigned SILoadStoreOptimizer::promoteBufferStoreOpcode(
  const MachineInstr &I, bool &IsX2, bool &IsOffen) const {
  IsX2 = false;
  IsOffen = false;

  switch (I.getOpcode()) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN_exact:
    IsX2 = true;
    IsOffen = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN_exact;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET_exact:
    IsX2 = true;
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET_exact;
  }
  return 0;
}

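// Merge a pair of buffer stores: build a REG_SEQUENCE from the two data
// operands and emit the wider store returned by promoteBufferStoreOpcode().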
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
  CombineInfo &CI) {
  MachineBasicBlock *MBB = CI.I->getParent();
  DebugLoc DL = CI.I->getDebugLoc();
  bool Unused1, Unused2;
  unsigned Opcode = promoteBufferStoreOpcode(*CI.I, Unused1, Unused2);

  unsigned SubRegIdx0 = CI.IsX2 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
  unsigned SubRegIdx1 = CI.IsX2 ? AMDGPU::sub2_sub3 : AMDGPU::sub1;

  // Handle descending offsets
  if (CI.Offset0 > CI.Offset1)
    std::swap(SubRegIdx0, SubRegIdx1);

  // Copy to the new source register.
  const TargetRegisterClass *SuperRC =
    CI.IsX2 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass;
  unsigned SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);

  BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  if (CI.InstClass == BUFFER_STORE_OFFEN)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
      .addImm(CI.GLC0)                          // glc
      .addImm(CI.SLC0)                          // slc
      .addImm(0)                                // tfe
      .cloneMergedMemRefs({&*CI.I, &*CI.Paired});

  moveInstsAfter(MIB, CI.InstsToMove);

  MachineBasicBlock::iterator Next = std::next(CI.I);
  CI.I->eraseFromParent();
  CI.Paired->eraseFromParent();
  return Next;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;

  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
    MachineInstr &MI = *I;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef()) {
      ++I;
      continue;
    }

    CombineInfo CI;
    CI.I = I;
    unsigned Opc = MI.getOpcode();
    if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64 ||
        Opc == AMDGPU::DS_READ_B32_gfx9 || Opc == AMDGPU::DS_READ_B64_gfx9) {

      CI.InstClass = DS_READ_WRITE;
      CI.EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeRead2Pair(CI);
      } else {
        ++I;
      }

      continue;
    } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64 ||
               Opc == AMDGPU::DS_WRITE_B32_gfx9 ||
               Opc == AMDGPU::DS_WRITE_B64_gfx9) {
      CI.InstClass = DS_READ_WRITE;
      CI.EltSize
        = (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8 : 4;

      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeWrite2Pair(CI);
      } else {
        ++I;
      }

      continue;
    }
    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
      // EltSize is in units of the offset encoding.
      CI.InstClass = S_BUFFER_LOAD_IMM;
      CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
      CI.IsX2 = Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeSBufferLoadImmPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }
    if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
        Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFSET ||
        Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET) {
      if (Opc == AMDGPU::BUFFER_LOAD_DWORD_OFFEN ||
          Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN)
        CI.InstClass = BUFFER_LOAD_OFFEN;
      else
        CI.InstClass = BUFFER_LOAD_OFFSET;

      CI.EltSize = 4;
      CI.IsX2 = Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN ||
                Opc == AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferLoadPair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    bool StoreIsX2, IsOffen;
    if (promoteBufferStoreOpcode(*I, StoreIsX2, IsOffen)) {
      CI.InstClass = IsOffen ? BUFFER_STORE_OFFEN : BUFFER_STORE_OFFSET;
      CI.EltSize = 4;
      CI.IsX2 = StoreIsX2;
      if (findMatchingInst(CI)) {
        Modified = true;
        I = mergeBufferStorePair(CI);
        if (!CI.IsX2)
          CreatedX2++;
      } else {
        ++I;
      }
      continue;
    }

    ++I;
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    CreatedX2 = 0;
    Modified |= optimizeBlock(MBB);

    // Run again to convert x2 to x4.
    if (CreatedX2 >= 1)
      Modified |= optimizeBlock(MBB);
  }

  return Modified;
}