1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
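// (The two offsets of ds_read2_b32 are encoded in units of the 4-byte element
// size, so the byte offsets 16 and 32 above become offset0:4 and offset1:8.)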
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset into the immediate by
23// adjusting the base. It tries to reuse a base from a nearby instruction that
24// leaves a 13-bit constant offset, and then promotes that 13-bit offset into
25// the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
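// (In this example the two bases 0x1800 and 0x1000 differ by 0x800 = 2048
// bytes, which fits in the 13-bit immediate offset field, so the first base
// computation is dropped and the load reuses v[5:6] with offset:2048.)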
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputing seems inefficient. This currently only matches
50// one pair, and recomputes live intervals and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields, but are close enough together that their differences do fit,
56// we can add to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
60#include "AMDGPU.h"
61#include "GCNSubtarget.h"
66
67using namespace llvm;
68
69#define DEBUG_TYPE "si-load-store-opt"
70
71namespace {
72enum InstClassEnum {
73 UNKNOWN,
74 DS_READ,
75 DS_WRITE,
76 S_BUFFER_LOAD_IMM,
77 S_BUFFER_LOAD_SGPR_IMM,
78 S_LOAD_IMM,
79 BUFFER_LOAD,
80 BUFFER_STORE,
81 MIMG,
82 TBUFFER_LOAD,
83 TBUFFER_STORE,
84 GLOBAL_LOAD_SADDR,
85 GLOBAL_STORE_SADDR,
86 FLAT_LOAD,
87 FLAT_STORE,
88 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
89 GLOBAL_STORE // any CombineInfo, they are only ever returned by
90 // getCommonInstClass.
91};
92
93struct AddressRegs {
94 unsigned char NumVAddrs = 0;
95 bool SBase = false;
96 bool SRsrc = false;
97 bool SOffset = false;
98 bool SAddr = false;
99 bool VAddr = false;
100 bool Addr = false;
101 bool SSamp = false;
102};
103
104// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
105const unsigned MaxAddressRegs = 12 + 1 + 1;
106
107class SILoadStoreOptimizer : public MachineFunctionPass {
108 struct CombineInfo {
109 MachineBasicBlock::iterator I;
110 unsigned EltSize;
111 unsigned Offset;
112 unsigned Width;
113 unsigned Format;
114 unsigned BaseOff;
115 unsigned DMask;
116 InstClassEnum InstClass;
117 unsigned CPol = 0;
118 bool IsAGPR;
119 bool UseST64;
120 int AddrIdx[MaxAddressRegs];
121 const MachineOperand *AddrReg[MaxAddressRegs];
122 unsigned NumAddresses;
123 unsigned Order;
124
125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
127 return false;
128
129 const MachineInstr &MI = *CI.I;
130 for (unsigned i = 0; i < NumAddresses; i++) {
131 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
132
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
135 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
136 return false;
137 }
138 continue;
139 }
140
141 // Check same base pointer. Be careful of subregisters, which can occur
142 // with vectors of pointers.
143 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
144 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
145 return false;
146 }
147 }
148 return true;
149 }
150
151 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
152 for (unsigned i = 0; i < NumAddresses; ++i) {
153 const MachineOperand *AddrOp = AddrReg[i];
154 // Immediates are always OK.
155 if (AddrOp->isImm())
156 continue;
157
158 // Don't try to merge addresses that aren't either immediates or registers.
159 // TODO: Should be possible to merge FrameIndexes and maybe some other
160 // non-register operands.
161 if (!AddrOp->isReg())
162 return false;
163
164 // TODO: We should be able to merge instructions with other physical reg
165 // addresses too.
166 if (AddrOp->getReg().isPhysical() &&
167 AddrOp->getReg() != AMDGPU::SGPR_NULL)
168 return false;
169
170 // If an address has only one use then there will be no other
171 // instructions with the same address, so we can't merge this one.
172 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
173 return false;
174 }
175 return true;
176 }
177
178 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
179
180 // Compare by pointer order.
181 bool operator<(const CombineInfo& Other) const {
182 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
183 }
184 };
185
186 struct BaseRegisters {
187 Register LoReg;
188 Register HiReg;
189
190 unsigned LoSubReg = 0;
191 unsigned HiSubReg = 0;
192 };
193
194 struct MemAddress {
195 BaseRegisters Base;
196 int64_t Offset = 0;
197 };
198
199 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
200
201private:
202 const GCNSubtarget *STM = nullptr;
203 const SIInstrInfo *TII = nullptr;
204 const SIRegisterInfo *TRI = nullptr;
205 MachineRegisterInfo *MRI = nullptr;
206 AliasAnalysis *AA = nullptr;
207 bool OptimizeAgain;
208
209 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
210 const DenseSet<Register> &ARegUses,
211 const MachineInstr &A, const MachineInstr &B) const;
212 static bool dmasksCanBeCombined(const CombineInfo &CI,
213 const SIInstrInfo &TII,
214 const CombineInfo &Paired);
215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216 CombineInfo &Paired, bool Modify = false);
217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218 const CombineInfo &Paired);
219 static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221 const CombineInfo &Paired);
222 const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
223 const CombineInfo &Paired);
224 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
225
226 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
227
228 unsigned read2Opcode(unsigned EltSize) const;
229 unsigned read2ST64Opcode(unsigned EltSize) const;
230 MachineBasicBlock::iterator
231 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
232 MachineBasicBlock::iterator InsertBefore);
233
234 unsigned write2Opcode(unsigned EltSize) const;
235 unsigned write2ST64Opcode(unsigned EltSize) const;
236 MachineBasicBlock::iterator
237 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
238 MachineBasicBlock::iterator InsertBefore);
239 MachineBasicBlock::iterator
240 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore);
242 MachineBasicBlock::iterator
243 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
244 MachineBasicBlock::iterator InsertBefore);
245 MachineBasicBlock::iterator
246 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
247 MachineBasicBlock::iterator InsertBefore);
248 MachineBasicBlock::iterator
249 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
250 MachineBasicBlock::iterator InsertBefore);
251 MachineBasicBlock::iterator
252 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
253 MachineBasicBlock::iterator InsertBefore);
254 MachineBasicBlock::iterator
255 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263
264 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
265 int32_t NewOffset) const;
266 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
267 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
268 std::optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
269 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
270 /// Promotes constant offset to the immediate by adjusting the base. It
271 /// tries to use a base from the nearby instructions that allows it to have
272 /// a 13bit constant offset which gets promoted to the immediate.
273 bool promoteConstantOffsetToImm(MachineInstr &CI,
274 MemInfoMap &Visited,
275 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
276 void addInstToMergeableList(const CombineInfo &CI,
277 std::list<std::list<CombineInfo> > &MergeableInsts) const;
278
279 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
280 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
281 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
282 std::list<std::list<CombineInfo>> &MergeableInsts) const;
283
284 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
285 const CombineInfo &Paired);
286
287 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
288 const CombineInfo &Paired);
289
290public:
291 static char ID;
292
293 SILoadStoreOptimizer() : MachineFunctionPass(ID) {
294 initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
295 }
296
297 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
298 bool &OptimizeListAgain);
299 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
300
301 bool runOnMachineFunction(MachineFunction &MF) override;
302
303 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
304
305 void getAnalysisUsage(AnalysisUsage &AU) const override {
306 AU.setPreservesCFG();
307 AU.addRequired<AAResultsWrapperPass>();
308
309 MachineFunctionPass::getAnalysisUsage(AU);
310 }
311
312 MachineFunctionProperties getRequiredProperties() const override {
313 return MachineFunctionProperties()
314 .set(MachineFunctionProperties::Property::IsSSA);
315 }
316};
317
318static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
319 const unsigned Opc = MI.getOpcode();
320
321 if (TII.isMUBUF(Opc)) {
322 // FIXME: Handle d16 correctly
323 return AMDGPU::getMUBUFElements(Opc);
324 }
325 if (TII.isImage(MI)) {
326 uint64_t DMaskImm =
327 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
328 return llvm::popcount(DMaskImm);
329 }
330 if (TII.isMTBUF(Opc)) {
331 return AMDGPU::getMTBUFElements(Opc);
332 }
333
334 switch (Opc) {
335 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
336 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
337 case AMDGPU::S_LOAD_DWORD_IMM:
338 case AMDGPU::GLOBAL_LOAD_DWORD:
339 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
340 case AMDGPU::GLOBAL_STORE_DWORD:
341 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
342 case AMDGPU::FLAT_LOAD_DWORD:
343 case AMDGPU::FLAT_STORE_DWORD:
344 return 1;
345 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
346 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
347 case AMDGPU::S_LOAD_DWORDX2_IMM:
348 case AMDGPU::GLOBAL_LOAD_DWORDX2:
349 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
350 case AMDGPU::GLOBAL_STORE_DWORDX2:
351 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
352 case AMDGPU::FLAT_LOAD_DWORDX2:
353 case AMDGPU::FLAT_STORE_DWORDX2:
354 return 2;
355 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
356 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
357 case AMDGPU::S_LOAD_DWORDX3_IMM:
358 case AMDGPU::GLOBAL_LOAD_DWORDX3:
359 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
360 case AMDGPU::GLOBAL_STORE_DWORDX3:
361 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
362 case AMDGPU::FLAT_LOAD_DWORDX3:
363 case AMDGPU::FLAT_STORE_DWORDX3:
364 return 3;
365 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
366 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
367 case AMDGPU::S_LOAD_DWORDX4_IMM:
368 case AMDGPU::GLOBAL_LOAD_DWORDX4:
369 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
370 case AMDGPU::GLOBAL_STORE_DWORDX4:
371 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
372 case AMDGPU::FLAT_LOAD_DWORDX4:
373 case AMDGPU::FLAT_STORE_DWORDX4:
374 return 4;
375 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
376 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
377 case AMDGPU::S_LOAD_DWORDX8_IMM:
378 return 8;
379 case AMDGPU::DS_READ_B32: [[fallthrough]];
380 case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
381 case AMDGPU::DS_WRITE_B32: [[fallthrough]];
382 case AMDGPU::DS_WRITE_B32_gfx9:
383 return 1;
384 case AMDGPU::DS_READ_B64: [[fallthrough]];
385 case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
386 case AMDGPU::DS_WRITE_B64: [[fallthrough]];
387 case AMDGPU::DS_WRITE_B64_gfx9:
388 return 2;
389 default:
390 return 0;
391 }
392}
393
394/// Maps instruction opcode to enum InstClassEnum.
395static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
396 switch (Opc) {
397 default:
398 if (TII.isMUBUF(Opc)) {
399 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
400 default:
401 return UNKNOWN;
402 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
403 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
404 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
405 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
406 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
407 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
408 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
409 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
410 return BUFFER_LOAD;
411 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
412 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
413 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
414 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
415 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
416 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
417 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
418 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
419 return BUFFER_STORE;
420 }
421 }
422 if (TII.isImage(Opc)) {
423 // Ignore instructions encoded without vaddr.
424 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
425 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
426 return UNKNOWN;
427 // Ignore BVH instructions
428 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
429 return UNKNOWN;
430 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
431 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
432 TII.isGather4(Opc))
433 return UNKNOWN;
434 return MIMG;
435 }
436 if (TII.isMTBUF(Opc)) {
437 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
438 default:
439 return UNKNOWN;
440 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
441 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
442 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
443 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
444 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
445 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
446 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
447 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
448 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
449 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
450 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
451 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
452 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
453 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
454 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
455 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
456 return TBUFFER_LOAD;
457 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
458 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
459 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
460 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
461 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
462 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
463 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
464 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
465 return TBUFFER_STORE;
466 }
467 }
468 return UNKNOWN;
469 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
470 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
471 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
472 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
473 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
474 return S_BUFFER_LOAD_IMM;
475 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
476 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
477 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
478 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
479 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
480 return S_BUFFER_LOAD_SGPR_IMM;
481 case AMDGPU::S_LOAD_DWORD_IMM:
482 case AMDGPU::S_LOAD_DWORDX2_IMM:
483 case AMDGPU::S_LOAD_DWORDX3_IMM:
484 case AMDGPU::S_LOAD_DWORDX4_IMM:
485 case AMDGPU::S_LOAD_DWORDX8_IMM:
486 return S_LOAD_IMM;
487 case AMDGPU::DS_READ_B32:
488 case AMDGPU::DS_READ_B32_gfx9:
489 case AMDGPU::DS_READ_B64:
490 case AMDGPU::DS_READ_B64_gfx9:
491 return DS_READ;
492 case AMDGPU::DS_WRITE_B32:
493 case AMDGPU::DS_WRITE_B32_gfx9:
494 case AMDGPU::DS_WRITE_B64:
495 case AMDGPU::DS_WRITE_B64_gfx9:
496 return DS_WRITE;
497 case AMDGPU::GLOBAL_LOAD_DWORD:
498 case AMDGPU::GLOBAL_LOAD_DWORDX2:
499 case AMDGPU::GLOBAL_LOAD_DWORDX3:
500 case AMDGPU::GLOBAL_LOAD_DWORDX4:
501 case AMDGPU::FLAT_LOAD_DWORD:
502 case AMDGPU::FLAT_LOAD_DWORDX2:
503 case AMDGPU::FLAT_LOAD_DWORDX3:
504 case AMDGPU::FLAT_LOAD_DWORDX4:
505 return FLAT_LOAD;
506 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
507 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
508 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
509 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
510 return GLOBAL_LOAD_SADDR;
511 case AMDGPU::GLOBAL_STORE_DWORD:
512 case AMDGPU::GLOBAL_STORE_DWORDX2:
513 case AMDGPU::GLOBAL_STORE_DWORDX3:
514 case AMDGPU::GLOBAL_STORE_DWORDX4:
515 case AMDGPU::FLAT_STORE_DWORD:
516 case AMDGPU::FLAT_STORE_DWORDX2:
517 case AMDGPU::FLAT_STORE_DWORDX3:
518 case AMDGPU::FLAT_STORE_DWORDX4:
519 return FLAT_STORE;
520 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
521 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
522 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
523 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
524 return GLOBAL_STORE_SADDR;
525 }
526}
527
528/// Determines instruction subclass from opcode. Only instructions
529/// of the same subclass can be merged together. The merged instruction may have
530/// a different subclass but must have the same class.
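/// For example, S_LOAD_DWORDX2_IMM and S_LOAD_DWORDX4_IMM both map to the
/// S_LOAD_DWORD_IMM subclass and so may be paired, while a buffer load and a
/// tbuffer load never share a subclass.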
531static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
532 switch (Opc) {
533 default:
534 if (TII.isMUBUF(Opc))
535 return AMDGPU::getMUBUFBaseOpcode(Opc);
536 if (TII.isImage(Opc)) {
537 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
538 assert(Info);
539 return Info->BaseOpcode;
540 }
541 if (TII.isMTBUF(Opc))
542 return AMDGPU::getMTBUFBaseOpcode(Opc);
543 return -1;
544 case AMDGPU::DS_READ_B32:
545 case AMDGPU::DS_READ_B32_gfx9:
546 case AMDGPU::DS_READ_B64:
547 case AMDGPU::DS_READ_B64_gfx9:
548 case AMDGPU::DS_WRITE_B32:
549 case AMDGPU::DS_WRITE_B32_gfx9:
550 case AMDGPU::DS_WRITE_B64:
551 case AMDGPU::DS_WRITE_B64_gfx9:
552 return Opc;
553 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
554 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
555 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
556 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
557 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
558 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
559 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
560 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
561 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
562 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
563 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
564 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
565 case AMDGPU::S_LOAD_DWORD_IMM:
566 case AMDGPU::S_LOAD_DWORDX2_IMM:
567 case AMDGPU::S_LOAD_DWORDX3_IMM:
568 case AMDGPU::S_LOAD_DWORDX4_IMM:
569 case AMDGPU::S_LOAD_DWORDX8_IMM:
570 return AMDGPU::S_LOAD_DWORD_IMM;
571 case AMDGPU::GLOBAL_LOAD_DWORD:
572 case AMDGPU::GLOBAL_LOAD_DWORDX2:
573 case AMDGPU::GLOBAL_LOAD_DWORDX3:
574 case AMDGPU::GLOBAL_LOAD_DWORDX4:
575 case AMDGPU::FLAT_LOAD_DWORD:
576 case AMDGPU::FLAT_LOAD_DWORDX2:
577 case AMDGPU::FLAT_LOAD_DWORDX3:
578 case AMDGPU::FLAT_LOAD_DWORDX4:
579 return AMDGPU::FLAT_LOAD_DWORD;
580 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
581 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
582 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
583 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
584 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
585 case AMDGPU::GLOBAL_STORE_DWORD:
586 case AMDGPU::GLOBAL_STORE_DWORDX2:
587 case AMDGPU::GLOBAL_STORE_DWORDX3:
588 case AMDGPU::GLOBAL_STORE_DWORDX4:
589 case AMDGPU::FLAT_STORE_DWORD:
590 case AMDGPU::FLAT_STORE_DWORDX2:
591 case AMDGPU::FLAT_STORE_DWORDX3:
592 case AMDGPU::FLAT_STORE_DWORDX4:
593 return AMDGPU::FLAT_STORE_DWORD;
594 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
595 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
596 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
597 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
598 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
599 }
600}
601
602// GLOBAL loads and stores are classified as FLAT initially. If both combined
603// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
604// If either or both instructions are non-segment-specific FLAT, the resulting
605// combined operation will be FLAT, potentially promoting one of the GLOBAL
606// operations to FLAT.
607// For other instructions, return the original class unmodified.
608InstClassEnum
609SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
610 const CombineInfo &Paired) {
611 assert(CI.InstClass == Paired.InstClass);
612
613 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
615 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
616
617 return CI.InstClass;
618}
619
620static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
621 AddressRegs Result;
622
623 if (TII.isMUBUF(Opc)) {
624 if (AMDGPU::getMUBUFHasVAddr(Opc))
625 Result.VAddr = true;
626 if (AMDGPU::getMUBUFHasSrsrc(Opc))
627 Result.SRsrc = true;
628 if (AMDGPU::getMUBUFHasSoffset(Opc))
629 Result.SOffset = true;
630
631 return Result;
632 }
633
634 if (TII.isImage(Opc)) {
635 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
636 if (VAddr0Idx >= 0) {
637 int RsrcName =
638 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
639 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
640 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
641 } else {
642 Result.VAddr = true;
643 }
644 Result.SRsrc = true;
645 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
646 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
647 Result.SSamp = true;
648
649 return Result;
650 }
651 if (TII.isMTBUF(Opc)) {
652 if (AMDGPU::getMTBUFHasVAddr(Opc))
653 Result.VAddr = true;
654 if (AMDGPU::getMTBUFHasSrsrc(Opc))
655 Result.SRsrc = true;
656 if (AMDGPU::getMTBUFHasSoffset(Opc))
657 Result.SOffset = true;
658
659 return Result;
660 }
661
662 switch (Opc) {
663 default:
664 return Result;
665 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
666 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
667 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
668 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
669 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
670 Result.SOffset = true;
671 [[fallthrough]];
672 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
673 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
674 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
675 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
676 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
677 case AMDGPU::S_LOAD_DWORD_IMM:
678 case AMDGPU::S_LOAD_DWORDX2_IMM:
679 case AMDGPU::S_LOAD_DWORDX3_IMM:
680 case AMDGPU::S_LOAD_DWORDX4_IMM:
681 case AMDGPU::S_LOAD_DWORDX8_IMM:
682 Result.SBase = true;
683 return Result;
684 case AMDGPU::DS_READ_B32:
685 case AMDGPU::DS_READ_B64:
686 case AMDGPU::DS_READ_B32_gfx9:
687 case AMDGPU::DS_READ_B64_gfx9:
688 case AMDGPU::DS_WRITE_B32:
689 case AMDGPU::DS_WRITE_B64:
690 case AMDGPU::DS_WRITE_B32_gfx9:
691 case AMDGPU::DS_WRITE_B64_gfx9:
692 Result.Addr = true;
693 return Result;
694 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
695 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
696 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
697 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
698 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
699 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
700 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
701 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
702 Result.SAddr = true;
703 [[fallthrough]];
704 case AMDGPU::GLOBAL_LOAD_DWORD:
705 case AMDGPU::GLOBAL_LOAD_DWORDX2:
706 case AMDGPU::GLOBAL_LOAD_DWORDX3:
707 case AMDGPU::GLOBAL_LOAD_DWORDX4:
708 case AMDGPU::GLOBAL_STORE_DWORD:
709 case AMDGPU::GLOBAL_STORE_DWORDX2:
710 case AMDGPU::GLOBAL_STORE_DWORDX3:
711 case AMDGPU::GLOBAL_STORE_DWORDX4:
712 case AMDGPU::FLAT_LOAD_DWORD:
713 case AMDGPU::FLAT_LOAD_DWORDX2:
714 case AMDGPU::FLAT_LOAD_DWORDX3:
715 case AMDGPU::FLAT_LOAD_DWORDX4:
716 case AMDGPU::FLAT_STORE_DWORD:
717 case AMDGPU::FLAT_STORE_DWORDX2:
718 case AMDGPU::FLAT_STORE_DWORDX3:
719 case AMDGPU::FLAT_STORE_DWORDX4:
720 Result.VAddr = true;
721 return Result;
722 }
723}
724
725void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
726 const SILoadStoreOptimizer &LSO) {
727 I = MI;
728 unsigned Opc = MI->getOpcode();
729 InstClass = getInstClass(Opc, *LSO.TII);
730
731 if (InstClass == UNKNOWN)
732 return;
733
734 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
735
736 switch (InstClass) {
737 case DS_READ:
738 EltSize =
739 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
740 : 4;
741 break;
742 case DS_WRITE:
743 EltSize =
744 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
745 : 4;
746 break;
747 case S_BUFFER_LOAD_IMM:
748 case S_BUFFER_LOAD_SGPR_IMM:
749 case S_LOAD_IMM:
750 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
751 break;
752 default:
753 EltSize = 4;
754 break;
755 }
756
757 if (InstClass == MIMG) {
758 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
759 // Offset is not considered for MIMG instructions.
760 Offset = 0;
761 } else {
762 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
763 Offset = I->getOperand(OffsetIdx).getImm();
764 }
765
766 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
767 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
768
769 Width = getOpcodeWidth(*I, *LSO.TII);
770
771 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
772 Offset &= 0xffff;
773 } else if (InstClass != MIMG) {
774 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
775 }
776
777 AddressRegs Regs = getRegs(Opc, *LSO.TII);
778 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
779
780 NumAddresses = 0;
781 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
782 AddrIdx[NumAddresses++] =
783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
784 if (Regs.Addr)
785 AddrIdx[NumAddresses++] =
786 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
787 if (Regs.SBase)
788 AddrIdx[NumAddresses++] =
789 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
790 if (Regs.SRsrc)
791 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
792 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
793 if (Regs.SOffset)
794 AddrIdx[NumAddresses++] =
795 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
796 if (Regs.SAddr)
797 AddrIdx[NumAddresses++] =
798 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
799 if (Regs.VAddr)
800 AddrIdx[NumAddresses++] =
801 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
802 if (Regs.SSamp)
803 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
804 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
805 assert(NumAddresses <= MaxAddressRegs);
806
807 for (unsigned J = 0; J < NumAddresses; J++)
808 AddrReg[J] = &I->getOperand(AddrIdx[J]);
809}
810
811} // end anonymous namespace.
812
813INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
814 "SI Load Store Optimizer", false, false)
815INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
816INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
817 false, false)
818
819char SILoadStoreOptimizer::ID = 0;
820
821char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
822
823FunctionPass *llvm::createSILoadStoreOptimizerPass() {
824 return new SILoadStoreOptimizer();
825}
826
827static void addDefsUsesToList(const MachineInstr &MI,
828 DenseSet<Register> &RegDefs,
829 DenseSet<Register> &RegUses) {
830 for (const auto &Op : MI.operands()) {
831 if (!Op.isReg())
832 continue;
833 if (Op.isDef())
834 RegDefs.insert(Op.getReg());
835 if (Op.readsReg())
836 RegUses.insert(Op.getReg());
837 }
838}
839
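// Return true if instruction A (whose defs and uses were collected into
// ARegDefs / ARegUses) can be moved across instruction B: the two must not
// alias when either one writes memory, B must not read or write any register
// that A defines, and B must not define any register that A reads.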
840bool SILoadStoreOptimizer::canSwapInstructions(
841 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
842 const MachineInstr &A, const MachineInstr &B) const {
843 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
844 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
845 return false;
846 for (const auto &BOp : B.operands()) {
847 if (!BOp.isReg())
848 continue;
849 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
850 return false;
851 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
852 return false;
853 }
854 return true;
855}
856
857// Given that \p CI and \p Paired are adjacent memory operations, produce a new
858// MMO for the combined operation with a new access size.
859MachineMemOperand *
860SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
861 const CombineInfo &Paired) {
862 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
863 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
864
865 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
866
867 // A base pointer for the combined operation is the same as the leading
868 // operation's pointer.
869 if (Paired < CI)
870 std::swap(MMOa, MMOb);
871
872 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
873 // If merging FLAT and GLOBAL set address space to FLAT.
874 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
875 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
876
877 MachineFunction *MF = CI.I->getMF();
878 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
879}
880
881bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
882 const SIInstrInfo &TII,
883 const CombineInfo &Paired) {
884 assert(CI.InstClass == MIMG);
885
886 // Ignore instructions with tfe/lwe set.
887 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
888 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
889
890 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
891 return false;
892
893 // Check other optional immediate operands for equality.
894 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
895 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
896 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
897
898 for (auto op : OperandsToMatch) {
899 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
900 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
901 return false;
902 if (Idx != -1 &&
903 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
904 return false;
905 }
906
907 // Check DMask for overlaps.
908 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
909 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
910
911 if (!MaxMask)
912 return false;
913
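// The smaller dmask must use only bits strictly below the lowest set bit of
// the larger dmask, so the merged channels do not interleave. For example,
// dmasks 0x3 and 0xc can be combined (into 0xf), while 0x5 and 0xa cannot.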
914 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
915 if ((1u << AllowedBitsForMin) <= MinMask)
916 return false;
917
918 return true;
919}
920
921static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
922 unsigned ComponentCount,
923 const GCNSubtarget &STI) {
924 if (ComponentCount > 4)
925 return 0;
926
927 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
928 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
929 if (!OldFormatInfo)
930 return 0;
931
932 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
933 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
934 ComponentCount,
935 OldFormatInfo->NumFormat, STI);
936
937 if (!NewFormatInfo)
938 return 0;
939
940 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
941 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
942
943 return NewFormatInfo->Format;
944}
945
946// Return the value in the inclusive range [Lo,Hi] that is aligned to the
947// highest power of two. Note that the result is well defined for all inputs
948// including corner cases like:
949// - if Lo == Hi, return that value
950// - if Lo == 0, return 0 (even though the "- 1" below underflows)
951// - if Lo > Hi, return 0 (as if the range wrapped around)
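// For example, mostAlignedValueInRange(5, 10) == 8 and
// mostAlignedValueInRange(9, 19) == 16.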
952static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
953 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
954}
955
956bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
957 const GCNSubtarget &STI,
958 CombineInfo &Paired,
959 bool Modify) {
960 assert(CI.InstClass != MIMG);
961
962 // XXX - Would the same offset be OK? Is there any reason this would happen or
963 // be useful?
964 if (CI.Offset == Paired.Offset)
965 return false;
966
967 // This won't be valid if the offset isn't aligned.
968 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
969 return false;
970
971 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
972
973 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
974 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
975 if (!Info0)
976 return false;
977 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
978 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
979 if (!Info1)
980 return false;
981
982 if (Info0->BitsPerComp != Info1->BitsPerComp ||
983 Info0->NumFormat != Info1->NumFormat)
984 return false;
985
986 // TODO: Should be possible to support more formats, but if format loads
987 // are not dword-aligned, the merged load might not be valid.
988 if (Info0->BitsPerComp != 32)
989 return false;
990
991 if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
992 return false;
993 }
994
995 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
996 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
997 CI.UseST64 = false;
998 CI.BaseOff = 0;
999
1000 // Handle all non-DS instructions.
1001 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1002 if (EltOffset0 + CI.Width != EltOffset1 &&
1003 EltOffset1 + Paired.Width != EltOffset0)
1004 return false;
1005 if (CI.CPol != Paired.CPol)
1006 return false;
1007 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1008 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1009 // Reject cases like:
1010 // dword + dwordx2 -> dwordx3
1011 // dword + dwordx3 -> dwordx4
1012 // If we tried to combine these cases, we would fail to extract a subreg
1013 // for the result of the second load due to SGPR alignment requirements.
1014 if (CI.Width != Paired.Width &&
1015 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1016 return false;
1017 }
1018 return true;
1019 }
1020
1021 // If the offset in elements doesn't fit in 8 bits, we might be able to use
1022 // the stride 64 versions.
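// For example, element offsets 0 and 4096 are both multiples of 64, so they
// can be encoded as 0 and 64 in a ds_read2st64/ds_write2st64 instruction.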
1023 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1024 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1025 if (Modify) {
1026 CI.Offset = EltOffset0 / 64;
1027 Paired.Offset = EltOffset1 / 64;
1028 CI.UseST64 = true;
1029 }
1030 return true;
1031 }
1032
1033 // Check if the new offsets fit in the reduced 8-bit range.
1034 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1035 if (Modify) {
1036 CI.Offset = EltOffset0;
1037 Paired.Offset = EltOffset1;
1038 }
1039 return true;
1040 }
1041
1042 // Try to shift base address to decrease offsets.
1043 uint32_t Min = std::min(EltOffset0, EltOffset1);
1044 uint32_t Max = std::max(EltOffset0, EltOffset1);
1045
1046 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1047 if (((Max - Min) & ~Mask) == 0) {
1048 if (Modify) {
1049 // From the range of values we could use for BaseOff, choose the one that
1050 // is aligned to the highest power of two, to maximise the chance that
1051 // the same offset can be reused for other load/store pairs.
1052 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1053 // Copy the low bits of the offsets, so that when we adjust them by
1054 // subtracting BaseOff they will be multiples of 64.
1055 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1056 CI.BaseOff = BaseOff * CI.EltSize;
1057 CI.Offset = (EltOffset0 - BaseOff) / 64;
1058 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1059 CI.UseST64 = true;
1060 }
1061 return true;
1062 }
1063
1064 if (isUInt<8>(Max - Min)) {
1065 if (Modify) {
1066 // From the range of values we could use for BaseOff, choose the one that
1067 // is aligned to the highest power of two, to maximise the chance that
1068 // the same offset can be reused for other load/store pairs.
1069 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1070 CI.BaseOff = BaseOff * CI.EltSize;
1071 CI.Offset = EltOffset0 - BaseOff;
1072 Paired.Offset = EltOffset1 - BaseOff;
1073 }
1074 return true;
1075 }
1076
1077 return false;
1078}
1079
1080bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1081 const CombineInfo &CI,
1082 const CombineInfo &Paired) {
1083 const unsigned Width = (CI.Width + Paired.Width);
1084 switch (CI.InstClass) {
1085 default:
1086 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1087 case S_BUFFER_LOAD_IMM:
1088 case S_BUFFER_LOAD_SGPR_IMM:
1089 case S_LOAD_IMM:
1090 switch (Width) {
1091 default:
1092 return false;
1093 case 2:
1094 case 4:
1095 case 8:
1096 return true;
1097 case 3:
1098 return STM.hasScalarDwordx3Loads();
1099 }
1100 }
1101}
1102
1103const TargetRegisterClass *
1104SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1105 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1106 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1107 }
1108 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1109 return TRI->getRegClassForReg(*MRI, Src->getReg());
1110 }
1111 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1112 return TRI->getRegClassForReg(*MRI, Src->getReg());
1113 }
1114 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1115 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1116 }
1117 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1118 return TRI->getRegClassForReg(*MRI, Src->getReg());
1119 }
1120 return nullptr;
1121}
1122
1123/// This function assumes that CI comes before Paired in a basic block. Return
1124/// an insertion point for the merged instruction or nullptr on failure.
1125SILoadStoreOptimizer::CombineInfo *
1126SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1127 CombineInfo &Paired) {
1128 // If another instruction has already been merged into CI, it may now be a
1129 // type that we can't do any further merging into.
1130 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1131 return nullptr;
1132 assert(CI.InstClass == Paired.InstClass);
1133
1134 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1135 getInstSubclass(Paired.I->getOpcode(), *TII))
1136 return nullptr;
1137
1138 // Check both offsets (or masks for MIMG) can be combined and fit in the
1139 // reduced range.
1140 if (CI.InstClass == MIMG) {
1141 if (!dmasksCanBeCombined(CI, *TII, Paired))
1142 return nullptr;
1143 } else {
1144 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1145 return nullptr;
1146 }
1147
1148 DenseSet<Register> RegDefs;
1149 DenseSet<Register> RegUses;
1150 CombineInfo *Where;
1151 if (CI.I->mayLoad()) {
1152 // Try to hoist Paired up to CI.
1153 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1154 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1155 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1156 return nullptr;
1157 }
1158 Where = &CI;
1159 } else {
1160 // Try to sink CI down to Paired.
1161 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1162 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1163 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1164 return nullptr;
1165 }
1166 Where = &Paired;
1167 }
1168
1169 // Call offsetsCanBeCombined with modify = true so that the offsets are
1170 // correct for the new instruction. This should return true, because
1171 // this function should only be called on CombineInfo objects that
1172 // have already been confirmed to be mergeable.
1173 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1174 offsetsCanBeCombined(CI, *STM, Paired, true);
1175 return Where;
1176}
1177
1178unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1179 if (STM->ldsRequiresM0Init())
1180 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1181 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1182}
1183
1184unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1185 if (STM->ldsRequiresM0Init())
1186 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1187
1188 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1189 : AMDGPU::DS_READ2ST64_B64_gfx9;
1190}
1191
1192MachineBasicBlock::iterator
1193SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1194 MachineBasicBlock::iterator InsertBefore) {
1195 MachineBasicBlock *MBB = CI.I->getParent();
1196
1197 // Be careful, since the addresses could be subregisters themselves in weird
1198 // cases, like vectors of pointers.
1199 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1200
1201 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1202 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1203
1204 unsigned NewOffset0 = CI.Offset;
1205 unsigned NewOffset1 = Paired.Offset;
1206 unsigned Opc =
1207 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1208
1209 unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
1210 unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
1211
1212 if (NewOffset0 > NewOffset1) {
1213 // Canonicalize the merged instruction so the smaller offset comes first.
1214 std::swap(NewOffset0, NewOffset1);
1215 std::swap(SubRegIdx0, SubRegIdx1);
1216 }
1217
1218 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1219 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1220
1221 const MCInstrDesc &Read2Desc = TII->get(Opc);
1222
1223 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1224 Register DestReg = MRI->createVirtualRegister(SuperRC);
1225
1226 DebugLoc DL = CI.I->getDebugLoc();
1227
1228 Register BaseReg = AddrReg->getReg();
1229 unsigned BaseSubReg = AddrReg->getSubReg();
1230 unsigned BaseRegFlags = 0;
1231 if (CI.BaseOff) {
1232 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1233 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1234 .addImm(CI.BaseOff);
1235
1236 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1237 BaseRegFlags = RegState::Kill;
1238
1239 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1240 .addReg(ImmReg)
1241 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1242 .addImm(0); // clamp bit
1243 BaseSubReg = 0;
1244 }
1245
1246 MachineInstrBuilder Read2 =
1247 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1248 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1249 .addImm(NewOffset0) // offset0
1250 .addImm(NewOffset1) // offset1
1251 .addImm(0) // gds
1252 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1253
1254 (void)Read2;
1255
1256 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1257
1258 // Copy to the old destination registers.
1259 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1260 .add(*Dest0) // Copy to same destination including flags and sub reg.
1261 .addReg(DestReg, 0, SubRegIdx0);
1262 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1263 .add(*Dest1)
1264 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1265
1266 CI.I->eraseFromParent();
1267 Paired.I->eraseFromParent();
1268
1269 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1270 return Read2;
1271}
1272
1273unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1274 if (STM->ldsRequiresM0Init())
1275 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1276 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1277 : AMDGPU::DS_WRITE2_B64_gfx9;
1278}
1279
1280unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1281 if (STM->ldsRequiresM0Init())
1282 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1283 : AMDGPU::DS_WRITE2ST64_B64;
1284
1285 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1286 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1287}
1288
1289MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1290 CombineInfo &CI, CombineInfo &Paired,
1291 MachineBasicBlock::iterator InsertBefore) {
1292 MachineBasicBlock *MBB = CI.I->getParent();
1293
1294 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1295 // sure we preserve the subregister index and any register flags set on them.
1296 const MachineOperand *AddrReg =
1297 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1298 const MachineOperand *Data0 =
1299 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1300 const MachineOperand *Data1 =
1301 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1302
1303 unsigned NewOffset0 = CI.Offset;
1304 unsigned NewOffset1 = Paired.Offset;
1305 unsigned Opc =
1306 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1307
1308 if (NewOffset0 > NewOffset1) {
1309 // Canonicalize the merged instruction so the smaller offset comes first.
1310 std::swap(NewOffset0, NewOffset1);
1311 std::swap(Data0, Data1);
1312 }
1313
1314 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1315 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1316
1317 const MCInstrDesc &Write2Desc = TII->get(Opc);
1318 DebugLoc DL = CI.I->getDebugLoc();
1319
1320 Register BaseReg = AddrReg->getReg();
1321 unsigned BaseSubReg = AddrReg->getSubReg();
1322 unsigned BaseRegFlags = 0;
1323 if (CI.BaseOff) {
1324 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1325 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1326 .addImm(CI.BaseOff);
1327
1328 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1329 BaseRegFlags = RegState::Kill;
1330
1331 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1332 .addReg(ImmReg)
1333 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1334 .addImm(0); // clamp bit
1335 BaseSubReg = 0;
1336 }
1337
1338 MachineInstrBuilder Write2 =
1339 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1340 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1341 .add(*Data0) // data0
1342 .add(*Data1) // data1
1343 .addImm(NewOffset0) // offset0
1344 .addImm(NewOffset1) // offset1
1345 .addImm(0) // gds
1346 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1347
1348 CI.I->eraseFromParent();
1349 Paired.I->eraseFromParent();
1350
1351 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1352 return Write2;
1353}
1354
1355MachineBasicBlock::iterator
1356SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1357 MachineBasicBlock::iterator InsertBefore) {
1358 MachineBasicBlock *MBB = CI.I->getParent();
1359 DebugLoc DL = CI.I->getDebugLoc();
1360 const unsigned Opcode = getNewOpcode(CI, Paired);
1361
1362 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1363
1364 Register DestReg = MRI->createVirtualRegister(SuperRC);
1365 unsigned MergedDMask = CI.DMask | Paired.DMask;
1366 unsigned DMaskIdx =
1367 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1368
1369 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1370 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1371 if (I == DMaskIdx)
1372 MIB.addImm(MergedDMask);
1373 else
1374 MIB.add((*CI.I).getOperand(I));
1375 }
1376
1377 // It shouldn't be possible to get this far if the two instructions
1378 // don't have a single memoperand, because MachineInstr::mayAlias()
1379 // will return true if this is the case.
1380 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1381
1382 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1383
1384 unsigned SubRegIdx0, SubRegIdx1;
1385 std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);
1386
1387 // Copy to the old destination registers.
1388 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1389 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1390 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1391
1392 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1393 .add(*Dest0) // Copy to same destination including flags and sub reg.
1394 .addReg(DestReg, 0, SubRegIdx0);
1395 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1396 .add(*Dest1)
1397 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1398
1399 CI.I->eraseFromParent();
1400 Paired.I->eraseFromParent();
1401 return New;
1402}
1403
1404MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1405 CombineInfo &CI, CombineInfo &Paired,
1406 MachineBasicBlock::iterator InsertBefore) {
1407 MachineBasicBlock *MBB = CI.I->getParent();
1408 DebugLoc DL = CI.I->getDebugLoc();
1409 const unsigned Opcode = getNewOpcode(CI, Paired);
1410
1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1412
1413 Register DestReg = MRI->createVirtualRegister(SuperRC);
1414 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1415
1416 // It shouldn't be possible to get this far if the two instructions
1417 // don't have a single memoperand, because MachineInstr::mayAlias()
1418 // will return true if this is the case.
1419 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1420
1420
1421 MachineInstrBuilder New =
1422 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1423 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1424 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1425 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1426 New.addImm(MergedOffset);
1427 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1428
1429 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1430 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1431 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1432
1433 // Copy to the old destination registers.
1434 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1435 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
1436 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);
1437
1438 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1439 .add(*Dest0) // Copy to same destination including flags and sub reg.
1440 .addReg(DestReg, 0, SubRegIdx0);
1441 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1442 .add(*Dest1)
1443 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1444
1445 CI.I->eraseFromParent();
1446 Paired.I->eraseFromParent();
1447 return New;
1448}
1449
1450MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1451 CombineInfo &CI, CombineInfo &Paired,
1452 MachineBasicBlock::iterator InsertBefore) {
1453 MachineBasicBlock *MBB = CI.I->getParent();
1454 DebugLoc DL = CI.I->getDebugLoc();
1455
1456 const unsigned Opcode = getNewOpcode(CI, Paired);
1457
1458 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1459
1460 // Copy to the new source register.
1461 Register DestReg = MRI->createVirtualRegister(SuperRC);
1462 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1463
1464 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1465
1466 AddressRegs Regs = getRegs(Opcode, *TII);
1467
1468 if (Regs.VAddr)
1469 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1470
1471 // It shouldn't be possible to get this far if the two instructions
1472 // don't have a single memoperand, because MachineInstr::mayAlias()
1473 // will return true if this is the case.
1474 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1475
1476 MachineInstr *New =
1477 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1478 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1479 .addImm(MergedOffset) // offset
1480 .addImm(CI.CPol) // cpol
1481 .addImm(0) // swz
1482 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1483
1484 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1485 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1486 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1487
1488 // Copy to the old destination registers.
1489 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1490 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1491 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1492
1493 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1494 .add(*Dest0) // Copy to same destination including flags and sub reg.
1495 .addReg(DestReg, 0, SubRegIdx0);
1496 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1497 .add(*Dest1)
1498 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1499
1500 CI.I->eraseFromParent();
1501 Paired.I->eraseFromParent();
1502 return New;
1503}
1504
1505MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1506 CombineInfo &CI, CombineInfo &Paired,
1507 MachineBasicBlock::iterator InsertBefore) {
1508 MachineBasicBlock *MBB = CI.I->getParent();
1509 DebugLoc DL = CI.I->getDebugLoc();
1510
1511 const unsigned Opcode = getNewOpcode(CI, Paired);
1512
1513 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1514
1515 // Copy to the new source register.
1516 Register DestReg = MRI->createVirtualRegister(SuperRC);
1517 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1518
1519 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1520
1521 AddressRegs Regs = getRegs(Opcode, *TII);
1522
1523 if (Regs.VAddr)
1524 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1525
1526 unsigned JoinedFormat =
1527 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1528
1529 // It shouldn't be possible to get this far if the two instructions
1530 // don't have a single memoperand, because MachineInstr::mayAlias()
1531 // will return true if this is the case.
1532 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1533
1534 MachineInstr *New =
1535 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1536 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1537 .addImm(MergedOffset) // offset
1538 .addImm(JoinedFormat) // format
1539 .addImm(CI.CPol) // cpol
1540 .addImm(0) // swz
1541 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1542
1543 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1544 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1545 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1546
1547 // Copy to the old destination registers.
1548 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1549 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1550 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1551
1552 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1553 .add(*Dest0) // Copy to same destination including flags and sub reg.
1554 .addReg(DestReg, 0, SubRegIdx0);
1555 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1556 .add(*Dest1)
1557 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1558
1559 CI.I->eraseFromParent();
1560 Paired.I->eraseFromParent();
1561 return New;
1562}
1563
1564MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1565 CombineInfo &CI, CombineInfo &Paired,
1566 MachineBasicBlock::iterator InsertBefore) {
1567 MachineBasicBlock *MBB = CI.I->getParent();
1568 DebugLoc DL = CI.I->getDebugLoc();
1569
1570 const unsigned Opcode = getNewOpcode(CI, Paired);
1571
1572 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1573 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1574 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1575
1576 // Copy to the new source register.
1577 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1578 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1579
1580 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1581 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1582
1583 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1584 .add(*Src0)
1585 .addImm(SubRegIdx0)
1586 .add(*Src1)
1587 .addImm(SubRegIdx1);
1588
1589 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1590 .addReg(SrcReg, RegState::Kill);
1591
1592 AddressRegs Regs = getRegs(Opcode, *TII);
1593
1594 if (Regs.VAddr)
1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1596
1597 unsigned JoinedFormat =
1598 getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1599
1600 // It shouldn't be possible to get this far if the two instructions
1601 // don't have a single memoperand, because MachineInstr::mayAlias()
1602 // will return true if this is the case.
1603 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1604
1605 MachineInstr *New =
1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1607 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1608 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1609 .addImm(JoinedFormat) // format
1610 .addImm(CI.CPol) // cpol
1611 .addImm(0) // swz
1612 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1613
1614 CI.I->eraseFromParent();
1615 Paired.I->eraseFromParent();
1616 return New;
1617}
1618
1619MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1620 CombineInfo &CI, CombineInfo &Paired,
1621 MachineBasicBlock::iterator InsertBefore) {
1622 MachineBasicBlock *MBB = CI.I->getParent();
1623 DebugLoc DL = CI.I->getDebugLoc();
1624
1625 const unsigned Opcode = getNewOpcode(CI, Paired);
1626
1627 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1628 Register DestReg = MRI->createVirtualRegister(SuperRC);
1629
1630 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1631
1632 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1633 MIB.add(*SAddr);
1634
1635 MachineInstr *New =
1636 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1637 .addImm(std::min(CI.Offset, Paired.Offset))
1638 .addImm(CI.CPol)
1639 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1640
1641 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1642 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1643 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1644
1645 // Copy to the old destination registers.
1646 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1647 const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
1648 const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);
1649
1650 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1651 .add(*Dest0) // Copy to same destination including flags and sub reg.
1652 .addReg(DestReg, 0, SubRegIdx0);
1653 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1654 .add(*Dest1)
1655 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1656
1657 CI.I->eraseFromParent();
1658 Paired.I->eraseFromParent();
1659 return New;
1660}
1661
1662MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1663 CombineInfo &CI, CombineInfo &Paired,
1664 MachineBasicBlock::iterator InsertBefore) {
1665 MachineBasicBlock *MBB = CI.I->getParent();
1666 DebugLoc DL = CI.I->getDebugLoc();
1667
1668 const unsigned Opcode = getNewOpcode(CI, Paired);
1669
1670 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1671 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1672 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1673
1674 // Copy to the new source register.
1675 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1676 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1677
1678 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1679 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1680
1681 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1682 .add(*Src0)
1683 .addImm(SubRegIdx0)
1684 .add(*Src1)
1685 .addImm(SubRegIdx1);
1686
1687 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1688 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1689 .addReg(SrcReg, RegState::Kill);
1690
1691 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1692 MIB.add(*SAddr);
1693
1694 MachineInstr *New =
1695 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1696 .addImm(CI.CPol)
1697 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1698
1699 CI.I->eraseFromParent();
1700 Paired.I->eraseFromParent();
1701 return New;
1702}
1703
1704unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1705 const CombineInfo &Paired) {
1706 const unsigned Width = CI.Width + Paired.Width;
1707
1708 switch (getCommonInstClass(CI, Paired)) {
1709 default:
1710 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1711 // FIXME: Handle d16 correctly
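 // Illustrative example: merging two buffer_load_dword accesses (Width 1 + 1)
 // selects the dwordx2 variant of their common MUBUF base opcode.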
1712 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1713 Width);
1714 case TBUFFER_LOAD:
1715 case TBUFFER_STORE:
1716 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1717 Width);
1718
1719 case UNKNOWN:
1720 llvm_unreachable("Unknown instruction class");
1721 case S_BUFFER_LOAD_IMM:
1722 switch (Width) {
1723 default:
1724 return 0;
1725 case 2:
1726 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1727 case 3:
1728 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1729 case 4:
1730 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1731 case 8:
1732 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1733 }
1734 case S_BUFFER_LOAD_SGPR_IMM:
1735 switch (Width) {
1736 default:
1737 return 0;
1738 case 2:
1739 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1740 case 3:
1741 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1742 case 4:
1743 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1744 case 8:
1745 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1746 }
1747 case S_LOAD_IMM:
1748 switch (Width) {
1749 default:
1750 return 0;
1751 case 2:
1752 return AMDGPU::S_LOAD_DWORDX2_IMM;
1753 case 3:
1754 return AMDGPU::S_LOAD_DWORDX3_IMM;
1755 case 4:
1756 return AMDGPU::S_LOAD_DWORDX4_IMM;
1757 case 8:
1758 return AMDGPU::S_LOAD_DWORDX8_IMM;
1759 }
1760 case GLOBAL_LOAD:
1761 switch (Width) {
1762 default:
1763 return 0;
1764 case 2:
1765 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1766 case 3:
1767 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1768 case 4:
1769 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1770 }
1771 case GLOBAL_LOAD_SADDR:
1772 switch (Width) {
1773 default:
1774 return 0;
1775 case 2:
1776 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1777 case 3:
1778 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1779 case 4:
1780 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1781 }
1782 case GLOBAL_STORE:
1783 switch (Width) {
1784 default:
1785 return 0;
1786 case 2:
1787 return AMDGPU::GLOBAL_STORE_DWORDX2;
1788 case 3:
1789 return AMDGPU::GLOBAL_STORE_DWORDX3;
1790 case 4:
1791 return AMDGPU::GLOBAL_STORE_DWORDX4;
1792 }
1793 case GLOBAL_STORE_SADDR:
1794 switch (Width) {
1795 default:
1796 return 0;
1797 case 2:
1798 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1799 case 3:
1800 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1801 case 4:
1802 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1803 }
1804 case FLAT_LOAD:
1805 switch (Width) {
1806 default:
1807 return 0;
1808 case 2:
1809 return AMDGPU::FLAT_LOAD_DWORDX2;
1810 case 3:
1811 return AMDGPU::FLAT_LOAD_DWORDX3;
1812 case 4:
1813 return AMDGPU::FLAT_LOAD_DWORDX4;
1814 }
1815 case FLAT_STORE:
1816 switch (Width) {
1817 default:
1818 return 0;
1819 case 2:
1820 return AMDGPU::FLAT_STORE_DWORDX2;
1821 case 3:
1822 return AMDGPU::FLAT_STORE_DWORDX3;
1823 case 4:
1824 return AMDGPU::FLAT_STORE_DWORDX4;
1825 }
1826 case MIMG:
1827 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1828 "No overlaps");
1829 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1830 }
1831}
1832
1833std::pair<unsigned, unsigned>
1834SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1835 const CombineInfo &Paired) {
1836 assert((CI.InstClass != MIMG ||
1837 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1838 CI.Width + Paired.Width)) &&
1839 "No overlaps");
1840
1841 unsigned Idx0;
1842 unsigned Idx1;
1843
1844 static const unsigned Idxs[5][4] = {
1845 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1846 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1847 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1848 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1849 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1850 };
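 // Illustrative example: a width-2 access at the lower offset paired with a
 // width-1 access maps to Idx0 = Idxs[0][1] = sub0_sub1 and
 // Idx1 = Idxs[2][0] = sub2, i.e. adjacent slices of the merged super-register.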
1851
1852 assert(CI.Width >= 1 && CI.Width <= 4);
1853 assert(Paired.Width >= 1 && Paired.Width <= 4);
1854
1855 if (Paired < CI) {
1856 Idx1 = Idxs[0][Paired.Width - 1];
1857 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1858 } else {
1859 Idx0 = Idxs[0][CI.Width - 1];
1860 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1861 }
1862
1863 return std::pair(Idx0, Idx1);
1864}
1865
1866const TargetRegisterClass *
1867SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1868 const CombineInfo &Paired) {
1869 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1870 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1871 switch (CI.Width + Paired.Width) {
1872 default:
1873 return nullptr;
1874 case 2:
1875 return &AMDGPU::SReg_64_XEXECRegClass;
1876 case 3:
1877 return &AMDGPU::SGPR_96RegClass;
1878 case 4:
1879 return &AMDGPU::SGPR_128RegClass;
1880 case 8:
1881 return &AMDGPU::SGPR_256RegClass;
1882 case 16:
1883 return &AMDGPU::SGPR_512RegClass;
1884 }
1885 }
1886
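 // For VMEM data the class is a VGPR or AGPR class sized to the combined
 // width, e.g. a merged width of 3 dwords selects a 96-bit register class.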
1887 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1888 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1889 ? TRI->getAGPRClassForBitWidth(BitWidth)
1890 : TRI->getVGPRClassForBitWidth(BitWidth);
1891}
1892
1893MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1894 CombineInfo &CI, CombineInfo &Paired,
1895 MachineBasicBlock::iterator InsertBefore) {
1896 MachineBasicBlock *MBB = CI.I->getParent();
1897 DebugLoc DL = CI.I->getDebugLoc();
1898
1899 const unsigned Opcode = getNewOpcode(CI, Paired);
1900
1901 std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
1902 const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
1903 const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
1904
1905 // Copy to the new source register.
1906 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1907 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1908
1909 const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
1910 const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);
1911
1912 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1913 .add(*Src0)
1914 .addImm(SubRegIdx0)
1915 .add(*Src1)
1916 .addImm(SubRegIdx1);
1917
1918 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1919 .addReg(SrcReg, RegState::Kill);
1920
1921 AddressRegs Regs = getRegs(Opcode, *TII);
1922
1923 if (Regs.VAddr)
1924 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1925
1926
1927 // It shouldn't be possible to get this far if the two instructions
1928 // don't have a single memoperand, because MachineInstr::mayAlias()
1929 // will return true if this is the case.
1930 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1931
1932 MachineInstr *New =
1933 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1934 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1935 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1936 .addImm(CI.CPol) // cpol
1937 .addImm(0) // swz
1938 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1939
1940 CI.I->eraseFromParent();
1941 Paired.I->eraseFromParent();
1942 return New;
1943}
1944
1945MachineOperand
1946 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1947 APInt V(32, Val, true);
1948 if (TII->isInlineConstant(V))
1949 return MachineOperand::CreateImm(Val);
1950
1951 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1952 MachineInstr *Mov =
1953 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1954 TII->get(AMDGPU::S_MOV_B32), Reg)
1955 .addImm(Val);
1956 (void)Mov;
1957 LLVM_DEBUG(dbgs() << " "; Mov->dump());
1958 return MachineOperand::CreateReg(Reg, false);
1959}
1960
1961// Compute base address using Addr and return the final register.
1962Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1963 const MemAddress &Addr) const {
1964 MachineBasicBlock *MBB = MI.getParent();
1965 MachineBasicBlock::iterator MBBI = MI.getIterator();
1966 DebugLoc DL = MI.getDebugLoc();
1967
1968 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
1969 Addr.Base.LoSubReg) &&
1970 "Expected 32-bit Base-Register-Low!!");
1971
1972 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
1973 Addr.Base.HiSubReg) &&
1974 "Expected 32-bit Base-Register-Hi!!");
1975
1976 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
1977 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
1978 MachineOperand OffsetHi =
1979 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1980
1981 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1982 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1983 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1984
1985 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1986 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1987 MachineInstr *LoHalf =
1988 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
1989 .addReg(CarryReg, RegState::Define)
1990 .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
1991 .add(OffsetLo)
1992 .addImm(0); // clamp bit
1993 (void)LoHalf;
1994 LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
1995
1996 MachineInstr *HiHalf =
1997 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
1998 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
1999 .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
2000 .add(OffsetHi)
2001 .addReg(CarryReg, RegState::Kill)
2002 .addImm(0); // clamp bit
2003 (void)HiHalf;
2004 LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
2005
2006 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2007 MachineInstr *FullBase =
2008 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2009 .addReg(DestSub0)
2010 .addImm(AMDGPU::sub0)
2011 .addReg(DestSub1)
2012 .addImm(AMDGPU::sub1);
2013 (void)FullBase;
2014 LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
2015
2016 return FullDestReg;
2017}
2018
2019// Update base and offset with the NewBase and NewOffset in MI.
2020void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2021 Register NewBase,
2022 int32_t NewOffset) const {
2023 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2024 Base->setReg(NewBase);
2025 Base->setIsKill(false);
2026 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2027}
2028
2029std::optional<int32_t>
2030SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
2031 if (Op.isImm())
2032 return Op.getImm();
2033
2034 if (!Op.isReg())
2035 return std::nullopt;
2036
2037 MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
2038 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2039 !Def->getOperand(1).isImm())
2040 return std::nullopt;
2041
2042 return Def->getOperand(1).getImm();
2043}
2044
2045// Analyze Base and extract:
2046// - 32bit base registers, subregisters
2047// - 64bit constant offset
2048// Expecting base computation as:
2049// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2050// %LO:vgpr_32, %c:sreg_64_xexec =
2051// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2052// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2053// %Base:vreg_64 =
2054// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
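 // The extracted Addr.Offset combines the two 32-bit add constants: the low
 // add supplies bits [31:0] and the high add supplies bits [63:32] (in the
 // pattern above, 8000 and 0, so Offset == 8000).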
2055void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2056 MemAddress &Addr) const {
2057 if (!Base.isReg())
2058 return;
2059
2060 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2061 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2062 || Def->getNumOperands() != 5)
2063 return;
2064
2065 MachineOperand BaseLo = Def->getOperand(1);
2066 MachineOperand BaseHi = Def->getOperand(3);
2067 if (!BaseLo.isReg() || !BaseHi.isReg())
2068 return;
2069
2070 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2071 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2072
2073 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2074 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2075 return;
2076
2077 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2078 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2079
2080 auto Offset0P = extractConstOffset(*Src0);
2081 if (Offset0P)
2082 BaseLo = *Src1;
2083 else {
2084 if (!(Offset0P = extractConstOffset(*Src1)))
2085 return;
2086 BaseLo = *Src0;
2087 }
2088
2089 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2090 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2091
2092 if (Src0->isImm())
2093 std::swap(Src0, Src1);
2094
2095 if (!Src1->isImm())
2096 return;
2097
2098 uint64_t Offset1 = Src1->getImm();
2099 BaseHi = *Src0;
2100
2101 Addr.Base.LoReg = BaseLo.getReg();
2102 Addr.Base.HiReg = BaseHi.getReg();
2103 Addr.Base.LoSubReg = BaseLo.getSubReg();
2104 Addr.Base.HiSubReg = BaseHi.getSubReg();
2105 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2106}
2107
2108bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2109 MachineInstr &MI,
2110 MemInfoMap &Visited,
2111 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2112
2113 if (!(MI.mayLoad() ^ MI.mayStore()))
2114 return false;
2115
2116 // TODO: Support flat and scratch.
2117 if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
2118 return false;
2119
2120 if (MI.mayLoad() &&
2121 TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
2122 return false;
2123
2124 if (AnchorList.count(&MI))
2125 return false;
2126
2127 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2128
2129 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2130 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2131 return false;
2132 }
2133
2134 // Step1: Find the base-registers and a 64bit constant offset.
2135 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2136 MemAddress MAddr;
2137 if (!Visited.contains(&MI)) {
2138 processBaseWithConstOffset(Base, MAddr);
2139 Visited[&MI] = MAddr;
2140 } else
2141 MAddr = Visited[&MI];
2142
2143 if (MAddr.Offset == 0) {
2144 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2145 " constant offsets that can be promoted.\n";);
2146 return false;
2147 }
2148
2149 LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
2150 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2151
2152 // Step2: Traverse through MI's basic block and find an anchor (that has the
2153 // same base registers) with the highest 13-bit distance from MI's offset.
2154 // E.g. (64bit loads)
2155 // bb:
2156 // addr1 = &a + 4096; load1 = load(addr1, 0)
2157 // addr2 = &a + 6144; load2 = load(addr2, 0)
2158 // addr3 = &a + 8192; load3 = load(addr3, 0)
2159 // addr4 = &a + 10240; load4 = load(addr4, 0)
2160 // addr5 = &a + 12288; load5 = load(addr5, 0)
2161 //
2162 // Starting from the first load, the optimization will try to find a new base
2163 // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2164 // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2165 // &a + 8192 as the new base (anchor) because the maximum distance can
2166 // presumably accommodate more intermediate bases.
2167 //
2168 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2169 // (&a + 8192) for load1, load2, load4.
2170 // addr = &a + 8192
2171 // load1 = load(addr, -4096)
2172 // load2 = load(addr, -2048)
2173 // load3 = load(addr, 0)
2174 // load4 = load(addr, 2048)
2175 // addr5 = &a + 12288; load5 = load(addr5, 0)
2176 //
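 // Whether a recomputed offset is actually legal is checked below with
 // TLI->isLegalGlobalAddressingMode(), so the 13-bit figure above is the
 // motivating case rather than a hard-coded limit.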
2177 MachineInstr *AnchorInst = nullptr;
2178 MemAddress AnchorAddr;
2179 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2180 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2181
2182 MachineBasicBlock *MBB = MI.getParent();
2183 MachineBasicBlock::iterator E = MBB->end();
2184 MachineBasicBlock::iterator MBBI = MI.getIterator();
2185 ++MBBI;
2186 const SITargetLowering *TLI =
2187 static_cast<const SITargetLowering *>(STM->getTargetLowering());
2188
2189 for ( ; MBBI != E; ++MBBI) {
2190 MachineInstr &MINext = *MBBI;
2191 // TODO: Support finding an anchor (with the same base) from store addresses
2192 // or any other load addresses where the opcodes are different.
2193 if (MINext.getOpcode() != MI.getOpcode() ||
2194 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2195 continue;
2196
2197 const MachineOperand &BaseNext =
2198 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2199 MemAddress MAddrNext;
2200 if (!Visited.contains(&MINext)) {
2201 processBaseWithConstOffset(BaseNext, MAddrNext);
2202 Visited[&MINext] = MAddrNext;
2203 } else
2204 MAddrNext = Visited[&MINext];
2205
2206 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2207 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2208 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2209 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2210 continue;
2211
2212 InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));
2213
2214 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2215 TargetLoweringBase::AddrMode AM;
2216 AM.HasBaseReg = true;
2217 AM.BaseOffs = Dist;
2218 if (TLI->isLegalGlobalAddressingMode(AM) &&
2219 (uint32_t)std::abs(Dist) > MaxDist) {
2220 MaxDist = std::abs(Dist);
2221
2222 AnchorAddr = MAddrNext;
2223 AnchorInst = &MINext;
2224 }
2225 }
2226
2227 if (AnchorInst) {
2228 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2229 AnchorInst->dump());
2230 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2231 << AnchorAddr.Offset << "\n\n");
2232
2233 // Instead of moving up, just re-compute anchor-instruction's base address.
2234 Register Base = computeBase(MI, AnchorAddr);
2235
2236 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2237 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2238
2239 for (auto P : InstsWCommonBase) {
2240 TargetLoweringBase::AddrMode AM;
2241 AM.HasBaseReg = true;
2242 AM.BaseOffs = P.second - AnchorAddr.Offset;
2243
2244 if (TLI->isLegalGlobalAddressingMode(AM)) {
2245 LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
2246 dbgs() << ")"; P.first->dump());
2247 updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
2248 LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
2249 }
2250 }
2251 AnchorList.insert(AnchorInst);
2252 return true;
2253 }
2254
2255 return false;
2256}
2257
2258void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2259 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2260 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2261 if (AddrList.front().InstClass == CI.InstClass &&
2262 AddrList.front().IsAGPR == CI.IsAGPR &&
2263 AddrList.front().hasSameBaseAddress(CI)) {
2264 AddrList.emplace_back(CI);
2265 return;
2266 }
2267 }
2268
2269 // Base address not found, so add a new list.
2270 MergeableInsts.emplace_back(1, CI);
2271}
2272
2273std::pair<MachineBasicBlock::iterator, bool>
2274SILoadStoreOptimizer::collectMergeableInsts(
2275 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2276 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2277 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2278 bool Modified = false;
2279
2280 // Sort potentially mergeable instructions into lists, one list per base address.
2281 unsigned Order = 0;
2282 MachineBasicBlock::iterator BlockI = Begin;
2283 for (; BlockI != End; ++BlockI) {
2284 MachineInstr &MI = *BlockI;
2285
2286 // We run this before checking if an address is mergeable, because it can produce
2287 // better code even if the instructions aren't mergeable.
2288 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2289 Modified = true;
2290
2291 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2292 // barriers. We can resume looking for separate merges after such a barrier.
2293 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2294 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2295
2296 // Search will resume after this instruction in a separate merge list.
2297 ++BlockI;
2298 break;
2299 }
2300
2301 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2302 if (InstClass == UNKNOWN)
2303 continue;
2304
2305 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2306 int Swizzled =
2307 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2308 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2309 continue;
2310
2311 CombineInfo CI;
2312 CI.setMI(MI, *this);
2313 CI.Order = Order++;
2314
2315 if (!CI.hasMergeableAddress(*MRI))
2316 continue;
2317
2318 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2319 // FIXME: Nothing is illegal in a ds_write2 opcode with two AGPR data
2320 // operands. However, we report that ds_write2 shall have
2321 // only VGPR data so that machine copy propagation does not
2322 // create an illegal instruction mixing VGPR and AGPR sources.
2323 // Consequently, if we created such an instruction, the verifier
2324 // would complain.
2325 continue;
2326 }
2327
2328 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2329
2330 addInstToMergeableList(CI, MergeableInsts);
2331 }
2332
2333 // At this point we have lists of mergeable instructions.
2334 //
2335 // Part 2: Sort each list by offset, and then for each CombineInfo object in
2336 // the list try to find an instruction that can be merged with it. If an
2337 // instruction is found, it is stored in the Paired field. If none is found,
2338 // the CombineInfo object is deleted from the list.
2339
2340 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2341 E = MergeableInsts.end(); I != E;) {
2342
2343 std::list<CombineInfo> &MergeList = *I;
2344 if (MergeList.size() <= 1) {
2345 // This means we have found only one instruction with a given address
2346 // that can be merged, and we need at least 2 instructions to do a merge,
2347 // so this list can be discarded.
2348 I = MergeableInsts.erase(I);
2349 continue;
2350 }
2351
2352 // Sort the lists by offsets, this way mergeable instructions will be
2353 // adjacent to each other in the list, which will make it easier to find
2354 // matches.
2355 MergeList.sort(
2356 [] (const CombineInfo &A, const CombineInfo &B) {
2357 return A.Offset < B.Offset;
2358 });
2359 ++I;
2360 }
2361
2362 return std::pair(BlockI, Modified);
2363}
2364
2365// Scan through looking for adjacent LDS operations with constant offsets from
2366// the same base register. We rely on the scheduler to do the hard work of
2367// clustering nearby loads, and assume these are all adjacent.
2368bool SILoadStoreOptimizer::optimizeBlock(
2369 std::list<std::list<CombineInfo> > &MergeableInsts) {
2370 bool Modified = false;
2371
2372 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2373 E = MergeableInsts.end(); I != E;) {
2374 std::list<CombineInfo> &MergeList = *I;
2375
2376 bool OptimizeListAgain = false;
2377 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2378 // We weren't able to make any changes, so delete the list so we don't
2379 // process the same instructions the next time we try to optimize this
2380 // block.
2381 I = MergeableInsts.erase(I);
2382 continue;
2383 }
2384
2385 Modified = true;
2386
2387 // We made changes, but also determined that there were no more optimization
2388 // opportunities, so we don't need to reprocess the list
2389 if (!OptimizeListAgain) {
2390 I = MergeableInsts.erase(I);
2391 continue;
2392 }
2393 OptimizeAgain = true;
2394 }
2395 return Modified;
2396}
2397
2398bool
2399SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2400 std::list<CombineInfo> &MergeList,
2401 bool &OptimizeListAgain) {
2402 if (MergeList.empty())
2403 return false;
2404
2405 bool Modified = false;
2406
2407 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2408 Next = std::next(I)) {
2409
2410 auto First = I;
2411 auto Second = Next;
2412
2413 if ((*First).Order > (*Second).Order)
2414 std::swap(First, Second);
2415 CombineInfo &CI = *First;
2416 CombineInfo &Paired = *Second;
2417
2418 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2419 if (!Where) {
2420 ++I;
2421 continue;
2422 }
2423
2424 Modified = true;
2425
2426 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2427
2428 MachineBasicBlock::iterator NewMI;
2429 switch (CI.InstClass) {
2430 default:
2431 llvm_unreachable("unknown InstClass");
2432 break;
2433 case DS_READ:
2434 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2435 break;
2436 case DS_WRITE:
2437 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2438 break;
2439 case S_BUFFER_LOAD_IMM:
2440 case S_BUFFER_LOAD_SGPR_IMM:
2441 case S_LOAD_IMM:
2442 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2443 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2444 break;
2445 case BUFFER_LOAD:
2446 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2447 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2448 break;
2449 case BUFFER_STORE:
2450 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2451 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2452 break;
2453 case MIMG:
2454 NewMI = mergeImagePair(CI, Paired, Where->I);
2455 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2456 break;
2457 case TBUFFER_LOAD:
2458 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2459 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2460 break;
2461 case TBUFFER_STORE:
2462 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2463 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2464 break;
2465 case FLAT_LOAD:
2466 case GLOBAL_LOAD:
2467 case GLOBAL_LOAD_SADDR:
2468 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2469 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2470 break;
2471 case FLAT_STORE:
2472 case GLOBAL_STORE:
2473 case GLOBAL_STORE_SADDR:
2474 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2475 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2476 break;
2477 }
2478 CI.setMI(NewMI, *this);
2479 CI.Order = Where->Order;
2480 if (I == Second)
2481 I = Next;
2482
2483 MergeList.erase(Second);
2484 }
2485
2486 return Modified;
2487}
2488
2489bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
2490 if (skipFunction(MF.getFunction()))
2491 return false;
2492
2493 STM = &MF.getSubtarget<GCNSubtarget>();
2494 if (!STM->loadStoreOptEnabled())
2495 return false;
2496
2497 TII = STM->getInstrInfo();
2498 TRI = &TII->getRegisterInfo();
2499
2500 MRI = &MF.getRegInfo();
2501 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2502
2503 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2504
2505 bool Modified = false;
2506
2507 // Contains the list of instructions for which constant offsets are being
2508 // promoted to the IMM. This is tracked for an entire block at a time.
2509 SmallPtrSet<MachineInstr *, 4> AnchorList;
2510 MemInfoMap Visited;
2511
2512 for (MachineBasicBlock &MBB : MF) {
2513 MachineBasicBlock::iterator SectionEnd;
2514 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2515 I = SectionEnd) {
2516 bool CollectModified;
2517 std::list<std::list<CombineInfo>> MergeableInsts;
2518
2519 // First pass: Collect a list of all instructions we know how to merge in a
2520 // subset of the block.
2521 std::tie(SectionEnd, CollectModified) =
2522 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2523
2524 Modified |= CollectModified;
2525
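 // Merged results can themselves be merged again (e.g. two dwordx2 loads
 // into a dwordx4), so keep iterating until no list requests another pass.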
2526 do {
2527 OptimizeAgain = false;
2528 Modified |= optimizeBlock(MergeableInsts);
2529 } while (OptimizeAgain);
2530 }
2531
2532 Visited.clear();
2533 AnchorList.clear();
2534 }
2535
2536 return Modified;
2537}