SILoadStoreOptimizer.cpp
1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
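// Illustrative note (added for exposition, not part of the upstream source):
// the read2/write2 offsets are encoded in units of the element size, so the
// byte offsets 16 and 32 above become offset0 = 16/4 = 4 and
// offset1 = 32/4 = 8 for the 4-byte ds_read2_b32 form.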
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset to the immediate by
23// adjusting the base. It tries to use a base from nearby instructions that
24// allows it to have a 13-bit constant offset, and then promotes that 13-bit
25// offset to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This is currently missing stores of constants, because the load of the
46// constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputation seems inefficient. This currently matches only
50// one pair, recomputes live intervals, and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offset fields, but are close enough together to fit within the 8-bit range,
56// we can add to the base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
59
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
64#include "SIDefines.h"
68
69using namespace llvm;
70
71#define DEBUG_TYPE "si-load-store-opt"
72
73namespace {
74enum InstClassEnum {
75 UNKNOWN,
76 DS_READ,
77 DS_WRITE,
78 S_BUFFER_LOAD_IMM,
79 S_BUFFER_LOAD_SGPR_IMM,
80 S_LOAD_IMM,
81 BUFFER_LOAD,
82 BUFFER_STORE,
83 MIMG,
84 TBUFFER_LOAD,
85 TBUFFER_STORE,
86 GLOBAL_LOAD_SADDR,
87 GLOBAL_STORE_SADDR,
88 FLAT_LOAD,
89 FLAT_STORE,
90 FLAT_LOAD_SADDR,
91 FLAT_STORE_SADDR,
92 GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
93 GLOBAL_STORE // any CombineInfo, they are only ever returned by
94 // getCommonInstClass.
95};
96
97struct AddressRegs {
98 unsigned char NumVAddrs = 0;
99 bool SBase = false;
100 bool SRsrc = false;
101 bool SOffset = false;
102 bool SAddr = false;
103 bool VAddr = false;
104 bool Addr = false;
105 bool SSamp = false;
106};
107
108// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
109const unsigned MaxAddressRegs = 12 + 1 + 1;
110
111class SILoadStoreOptimizer {
112 struct CombineInfo {
113 MachineBasicBlock::iterator I;
114 unsigned EltSize;
115 unsigned Offset;
116 unsigned Width;
117 unsigned Format;
118 unsigned BaseOff;
119 unsigned DMask;
120 InstClassEnum InstClass;
121 unsigned CPol = 0;
122 const TargetRegisterClass *DataRC;
123 bool UseST64;
124 int AddrIdx[MaxAddressRegs];
125 const MachineOperand *AddrReg[MaxAddressRegs];
126 unsigned NumAddresses;
127 unsigned Order;
128
129 bool hasSameBaseAddress(const CombineInfo &CI) {
130 if (NumAddresses != CI.NumAddresses)
131 return false;
132
133 const MachineInstr &MI = *CI.I;
134 for (unsigned i = 0; i < NumAddresses; i++) {
135 const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
136
137 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
138 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
139 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
140 return false;
141 }
142 continue;
143 }
144
145 // Check same base pointer. Be careful of subregisters, which can occur
146 // with vectors of pointers.
147 if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
148 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
149 return false;
150 }
151 }
152 return true;
153 }
154
155 bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
156 for (unsigned i = 0; i < NumAddresses; ++i) {
157 const MachineOperand *AddrOp = AddrReg[i];
158 // Immediates are always OK.
159 if (AddrOp->isImm())
160 continue;
161
162 // Don't try to merge addresses that aren't either immediates or registers.
163 // TODO: Should be possible to merge FrameIndexes and maybe some other
164 // non-register operands.
165 if (!AddrOp->isReg())
166 return false;
167
168 // TODO: We should be able to merge instructions with other physical reg
169 // addresses too.
170 if (AddrOp->getReg().isPhysical() &&
171 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 return false;
173
174 // If an address has only one use then there will be no other
175 // instructions with the same address, so we can't merge this one.
176 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
177 return false;
178 }
179 return true;
180 }
181
182 void setMI(MachineBasicBlock::iterator MI, const SILoadStoreOptimizer &LSO);
183
184 // Compare by pointer order.
185 bool operator<(const CombineInfo& Other) const {
186 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
187 }
188 };
189
190 struct BaseRegisters {
191 Register LoReg;
192 Register HiReg;
193
194 unsigned LoSubReg = 0;
195 unsigned HiSubReg = 0;
196 // True when using V_ADD_U64_e64 pattern
197 bool UseV64Pattern = false;
198 };
199
200 struct MemAddress {
201 BaseRegisters Base;
202 int64_t Offset = 0;
203 };
204
205 using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
206
207private:
208 MachineFunction *MF = nullptr;
209 const GCNSubtarget *STM = nullptr;
210 const SIInstrInfo *TII = nullptr;
211 const SIRegisterInfo *TRI = nullptr;
212 MachineRegisterInfo *MRI = nullptr;
213 AliasAnalysis *AA = nullptr;
214 bool OptimizeAgain;
215
216 bool canSwapInstructions(const DenseSet<Register> &ARegDefs,
217 const DenseSet<Register> &ARegUses,
218 const MachineInstr &A, const MachineInstr &B) const;
219 static bool dmasksCanBeCombined(const CombineInfo &CI,
220 const SIInstrInfo &TII,
221 const CombineInfo &Paired);
222 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
223 CombineInfo &Paired, bool Modify = false);
224 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
225 const CombineInfo &Paired);
226 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
227 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
228 const CombineInfo &Paired);
229 const TargetRegisterClass *
230 getTargetRegisterClass(const CombineInfo &CI,
231 const CombineInfo &Paired) const;
232 const TargetRegisterClass *getDataRegClass(const MachineInstr &MI) const;
233
234 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
235
236 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
237 MachineBasicBlock::iterator InsertBefore,
238 const DebugLoc &DL, AMDGPU::OpName OpName,
239 Register DestReg) const;
240 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
241 MachineBasicBlock::iterator InsertBefore,
242 const DebugLoc &DL, AMDGPU::OpName OpName) const;
243
244 unsigned read2Opcode(unsigned EltSize) const;
245 unsigned read2ST64Opcode(unsigned EltSize) const;
246 MachineBasicBlock::iterator
247 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
248 MachineBasicBlock::iterator InsertBefore);
249
250 unsigned write2Opcode(unsigned EltSize) const;
251 unsigned write2ST64Opcode(unsigned EltSize) const;
252 unsigned getWrite2Opcode(const CombineInfo &CI) const;
253
254 MachineBasicBlock::iterator
255 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
256 MachineBasicBlock::iterator InsertBefore);
257 MachineBasicBlock::iterator
258 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
259 MachineBasicBlock::iterator InsertBefore);
260 MachineBasicBlock::iterator
261 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
262 MachineBasicBlock::iterator InsertBefore);
263 MachineBasicBlock::iterator
264 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
265 MachineBasicBlock::iterator InsertBefore);
266 MachineBasicBlock::iterator
267 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
268 MachineBasicBlock::iterator InsertBefore);
269 MachineBasicBlock::iterator
270 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
271 MachineBasicBlock::iterator InsertBefore);
272 MachineBasicBlock::iterator
273 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
274 MachineBasicBlock::iterator InsertBefore);
275 MachineBasicBlock::iterator
276 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
277 MachineBasicBlock::iterator InsertBefore);
278 MachineBasicBlock::iterator
279 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
280 MachineBasicBlock::iterator InsertBefore);
281
282 void updateBaseAndOffset(MachineInstr &I, Register NewBase,
283 int32_t NewOffset) const;
284 void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const;
285 Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
286 MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
287 bool processBaseWithConstOffset64(MachineInstr *AddDef,
288 const MachineOperand &Base,
289 MemAddress &Addr) const;
290 void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
291 /// Promotes constant offset to the immediate by adjusting the base. It
292 /// tries to use a base from the nearby instructions that allows it to have
293 /// a 13bit constant offset which gets promoted to the immediate.
294 bool promoteConstantOffsetToImm(MachineInstr &CI,
295 MemInfoMap &Visited,
296 SmallPtrSet<MachineInstr *, 4> &Promoted) const;
297 void addInstToMergeableList(const CombineInfo &CI,
298 std::list<std::list<CombineInfo> > &MergeableInsts) const;
299
300 std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
302 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
303 std::list<std::list<CombineInfo>> &MergeableInsts) const;
304
305 static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
306 const CombineInfo &Paired);
307
308 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
309 const CombineInfo &Paired);
310
311 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
312 bool &OptimizeListAgain);
313 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
314
315public:
316 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
317 bool run(MachineFunction &MF);
318};
319
320class SILoadStoreOptimizerLegacy : public MachineFunctionPass {
321public:
322 static char ID;
323
324 SILoadStoreOptimizerLegacy() : MachineFunctionPass(ID) {}
325
326 bool runOnMachineFunction(MachineFunction &MF) override;
327
328 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
329
330 void getAnalysisUsage(AnalysisUsage &AU) const override {
331 AU.setPreservesCFG();
332 AU.addRequired<AAResultsWrapperPass>();
333
334 MachineFunctionPass::getAnalysisUsage(AU);
335 }
336
337 MachineFunctionProperties getRequiredProperties() const override {
338 return MachineFunctionProperties().setIsSSA();
339 }
340};
341
342static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
343 const unsigned Opc = MI.getOpcode();
344
345 if (TII.isMUBUF(Opc)) {
346 // FIXME: Handle d16 correctly
347 return AMDGPU::getMUBUFElements(Opc);
348 }
349 if (TII.isImage(MI)) {
350 uint64_t DMaskImm =
351 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
352 return llvm::popcount(DMaskImm);
353 }
354 if (TII.isMTBUF(Opc)) {
355 return AMDGPU::getMTBUFElements(Opc);
356 }
357
358 switch (Opc) {
359 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
360 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
361 case AMDGPU::S_LOAD_DWORD_IMM:
362 case AMDGPU::GLOBAL_LOAD_DWORD:
363 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
364 case AMDGPU::GLOBAL_STORE_DWORD:
365 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
366 case AMDGPU::FLAT_LOAD_DWORD:
367 case AMDGPU::FLAT_STORE_DWORD:
368 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
369 case AMDGPU::FLAT_STORE_DWORD_SADDR:
370 return 1;
371 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
372 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
373 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
374 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
375 case AMDGPU::S_LOAD_DWORDX2_IMM:
376 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
377 case AMDGPU::GLOBAL_LOAD_DWORDX2:
378 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
379 case AMDGPU::GLOBAL_STORE_DWORDX2:
380 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
381 case AMDGPU::FLAT_LOAD_DWORDX2:
382 case AMDGPU::FLAT_STORE_DWORDX2:
383 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
384 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
385 return 2;
386 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
388 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
389 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
390 case AMDGPU::S_LOAD_DWORDX3_IMM:
391 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
392 case AMDGPU::GLOBAL_LOAD_DWORDX3:
393 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
394 case AMDGPU::GLOBAL_STORE_DWORDX3:
395 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
396 case AMDGPU::FLAT_LOAD_DWORDX3:
397 case AMDGPU::FLAT_STORE_DWORDX3:
398 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
399 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
400 return 3;
401 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
402 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
403 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
404 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
405 case AMDGPU::S_LOAD_DWORDX4_IMM:
406 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
407 case AMDGPU::GLOBAL_LOAD_DWORDX4:
408 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
409 case AMDGPU::GLOBAL_STORE_DWORDX4:
410 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
411 case AMDGPU::FLAT_LOAD_DWORDX4:
412 case AMDGPU::FLAT_STORE_DWORDX4:
413 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
414 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
415 return 4;
416 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
417 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
418 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
419 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
420 case AMDGPU::S_LOAD_DWORDX8_IMM:
421 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
422 return 8;
423 case AMDGPU::DS_READ_B32:
424 case AMDGPU::DS_READ_B32_gfx9:
425 case AMDGPU::DS_WRITE_B32:
426 case AMDGPU::DS_WRITE_B32_gfx9:
427 return 1;
428 case AMDGPU::DS_READ_B64:
429 case AMDGPU::DS_READ_B64_gfx9:
430 case AMDGPU::DS_WRITE_B64:
431 case AMDGPU::DS_WRITE_B64_gfx9:
432 return 2;
433 default:
434 return 0;
435 }
436}
437
438/// Maps instruction opcode to enum InstClassEnum.
439static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
440 switch (Opc) {
441 default:
442 if (TII.isMUBUF(Opc)) {
443 switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
444 default:
445 return UNKNOWN;
446 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
447 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
448 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
449 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
450 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
451 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
452 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
453 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
454 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
455 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
456 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
457 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
458 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
459 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
460 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
461 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
462 return BUFFER_LOAD;
463 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
464 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
465 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
466 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
467 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
468 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
469 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
470 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
471 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
472 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
473 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
474 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
475 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
476 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
477 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
478 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
479 return BUFFER_STORE;
480 }
481 }
482 if (TII.isImage(Opc)) {
483 // Ignore instructions encoded without vaddr.
484 if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
485 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
486 return UNKNOWN;
487 // Ignore BVH instructions
488 if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
489 return UNKNOWN;
490 // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
491 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
492 TII.isGather4(Opc))
493 return UNKNOWN;
494 return MIMG;
495 }
496 if (TII.isMTBUF(Opc)) {
497 switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
498 default:
499 return UNKNOWN;
500 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
501 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
502 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
503 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
504 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
505 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
506 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
507 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
508 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
509 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
510 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
511 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
512 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
513 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
514 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
515 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
516 return TBUFFER_LOAD;
517 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
518 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
519 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
520 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
521 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
522 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
523 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
524 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
525 return TBUFFER_STORE;
526 }
527 }
528 return UNKNOWN;
529 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
530 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
531 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
532 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
533 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
534 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
535 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
536 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
537 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
538 return S_BUFFER_LOAD_IMM;
539 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
540 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
541 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
542 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
543 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
544 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
545 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
546 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
547 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
548 return S_BUFFER_LOAD_SGPR_IMM;
549 case AMDGPU::S_LOAD_DWORD_IMM:
550 case AMDGPU::S_LOAD_DWORDX2_IMM:
551 case AMDGPU::S_LOAD_DWORDX3_IMM:
552 case AMDGPU::S_LOAD_DWORDX4_IMM:
553 case AMDGPU::S_LOAD_DWORDX8_IMM:
554 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
555 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
556 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
557 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
558 return S_LOAD_IMM;
559 case AMDGPU::DS_READ_B32:
560 case AMDGPU::DS_READ_B32_gfx9:
561 case AMDGPU::DS_READ_B64:
562 case AMDGPU::DS_READ_B64_gfx9:
563 return DS_READ;
564 case AMDGPU::DS_WRITE_B32:
565 case AMDGPU::DS_WRITE_B32_gfx9:
566 case AMDGPU::DS_WRITE_B64:
567 case AMDGPU::DS_WRITE_B64_gfx9:
568 return DS_WRITE;
569 case AMDGPU::GLOBAL_LOAD_DWORD:
570 case AMDGPU::GLOBAL_LOAD_DWORDX2:
571 case AMDGPU::GLOBAL_LOAD_DWORDX3:
572 case AMDGPU::GLOBAL_LOAD_DWORDX4:
573 case AMDGPU::FLAT_LOAD_DWORD:
574 case AMDGPU::FLAT_LOAD_DWORDX2:
575 case AMDGPU::FLAT_LOAD_DWORDX3:
576 case AMDGPU::FLAT_LOAD_DWORDX4:
577 return FLAT_LOAD;
578 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
579 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
580 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
581 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
582 return GLOBAL_LOAD_SADDR;
583 case AMDGPU::GLOBAL_STORE_DWORD:
584 case AMDGPU::GLOBAL_STORE_DWORDX2:
585 case AMDGPU::GLOBAL_STORE_DWORDX3:
586 case AMDGPU::GLOBAL_STORE_DWORDX4:
587 case AMDGPU::FLAT_STORE_DWORD:
588 case AMDGPU::FLAT_STORE_DWORDX2:
589 case AMDGPU::FLAT_STORE_DWORDX3:
590 case AMDGPU::FLAT_STORE_DWORDX4:
591 return FLAT_STORE;
592 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
593 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
594 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
595 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
596 return GLOBAL_STORE_SADDR;
597 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
598 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
599 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
600 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
601 return FLAT_LOAD_SADDR;
602 case AMDGPU::FLAT_STORE_DWORD_SADDR:
603 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
604 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
605 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
606 return FLAT_STORE_SADDR;
607 }
608}
609
610/// Determines instruction subclass from opcode. Only instructions
611/// of the same subclass can be merged together. The merged instruction may have
612/// a different subclass but must have the same class.
613static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
614 switch (Opc) {
615 default:
616 if (TII.isMUBUF(Opc))
617 return AMDGPU::getMUBUFBaseOpcode(Opc);
618 if (TII.isImage(Opc)) {
619 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
620 assert(Info);
621 return Info->BaseOpcode;
622 }
623 if (TII.isMTBUF(Opc))
624 return AMDGPU::getMTBUFBaseOpcode(Opc);
625 return -1;
626 case AMDGPU::DS_READ_B32:
627 case AMDGPU::DS_READ_B32_gfx9:
628 case AMDGPU::DS_READ_B64:
629 case AMDGPU::DS_READ_B64_gfx9:
630 case AMDGPU::DS_WRITE_B32:
631 case AMDGPU::DS_WRITE_B32_gfx9:
632 case AMDGPU::DS_WRITE_B64:
633 case AMDGPU::DS_WRITE_B64_gfx9:
634 return Opc;
635 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
636 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
637 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
638 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
639 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
640 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
641 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
642 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
643 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
644 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
645 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
646 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
647 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
648 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
649 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
650 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
651 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
652 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
653 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
654 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
655 case AMDGPU::S_LOAD_DWORD_IMM:
656 case AMDGPU::S_LOAD_DWORDX2_IMM:
657 case AMDGPU::S_LOAD_DWORDX3_IMM:
658 case AMDGPU::S_LOAD_DWORDX4_IMM:
659 case AMDGPU::S_LOAD_DWORDX8_IMM:
660 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
661 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
662 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
663 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
664 return AMDGPU::S_LOAD_DWORD_IMM;
665 case AMDGPU::GLOBAL_LOAD_DWORD:
666 case AMDGPU::GLOBAL_LOAD_DWORDX2:
667 case AMDGPU::GLOBAL_LOAD_DWORDX3:
668 case AMDGPU::GLOBAL_LOAD_DWORDX4:
669 case AMDGPU::FLAT_LOAD_DWORD:
670 case AMDGPU::FLAT_LOAD_DWORDX2:
671 case AMDGPU::FLAT_LOAD_DWORDX3:
672 case AMDGPU::FLAT_LOAD_DWORDX4:
673 return AMDGPU::FLAT_LOAD_DWORD;
674 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
675 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
676 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
677 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
678 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
679 case AMDGPU::GLOBAL_STORE_DWORD:
680 case AMDGPU::GLOBAL_STORE_DWORDX2:
681 case AMDGPU::GLOBAL_STORE_DWORDX3:
682 case AMDGPU::GLOBAL_STORE_DWORDX4:
683 case AMDGPU::FLAT_STORE_DWORD:
684 case AMDGPU::FLAT_STORE_DWORDX2:
685 case AMDGPU::FLAT_STORE_DWORDX3:
686 case AMDGPU::FLAT_STORE_DWORDX4:
687 return AMDGPU::FLAT_STORE_DWORD;
688 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
689 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
690 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
691 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
692 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
693 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
694 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
695 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
696 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
697 return AMDGPU::FLAT_LOAD_DWORD_SADDR;
698 case AMDGPU::FLAT_STORE_DWORD_SADDR:
699 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
700 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
701 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
702 return AMDGPU::FLAT_STORE_DWORD_SADDR;
703 }
704}
705
706// GLOBAL loads and stores are classified as FLAT initially. If both combined
707// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
708// If either or both instructions are non-segment-specific FLAT, the resulting
709// combined operation will be FLAT, potentially promoting one of the GLOBAL
710// operations to FLAT.
711// For other instructions, return the original class unmodified.
712InstClassEnum
713SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
714 const CombineInfo &Paired) {
715 assert(CI.InstClass == Paired.InstClass);
716
717 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
718 SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
719 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
720
721 return CI.InstClass;
722}
723
724static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
725 AddressRegs Result;
726
727 if (TII.isMUBUF(Opc)) {
728 if (AMDGPU::getMUBUFHasVAddr(Opc))
729 Result.VAddr = true;
730 if (AMDGPU::getMUBUFHasSrsrc(Opc))
731 Result.SRsrc = true;
732 if (AMDGPU::getMUBUFHasSoffset(Opc))
733 Result.SOffset = true;
734
735 return Result;
736 }
737
738 if (TII.isImage(Opc)) {
739 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
740 if (VAddr0Idx >= 0) {
741 AMDGPU::OpName RsrcName =
742 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
743 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
744 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
745 } else {
746 Result.VAddr = true;
747 }
748 Result.SRsrc = true;
749 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
750 if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
751 Result.SSamp = true;
752
753 return Result;
754 }
755 if (TII.isMTBUF(Opc)) {
756 if (AMDGPU::getMTBUFHasVAddr(Opc))
757 Result.VAddr = true;
758 if (AMDGPU::getMTBUFHasSrsrc(Opc))
759 Result.SRsrc = true;
760 if (AMDGPU::getMTBUFHasSoffset(Opc))
761 Result.SOffset = true;
762
763 return Result;
764 }
765
766 switch (Opc) {
767 default:
768 return Result;
769 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
770 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
771 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
772 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
773 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
774 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
775 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
776 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
777 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
778 Result.SOffset = true;
779 [[fallthrough]];
780 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
781 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
782 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
783 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
784 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
785 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
786 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
787 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
788 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
789 case AMDGPU::S_LOAD_DWORD_IMM:
790 case AMDGPU::S_LOAD_DWORDX2_IMM:
791 case AMDGPU::S_LOAD_DWORDX3_IMM:
792 case AMDGPU::S_LOAD_DWORDX4_IMM:
793 case AMDGPU::S_LOAD_DWORDX8_IMM:
794 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
795 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
796 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
797 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
798 Result.SBase = true;
799 return Result;
800 case AMDGPU::DS_READ_B32:
801 case AMDGPU::DS_READ_B64:
802 case AMDGPU::DS_READ_B32_gfx9:
803 case AMDGPU::DS_READ_B64_gfx9:
804 case AMDGPU::DS_WRITE_B32:
805 case AMDGPU::DS_WRITE_B64:
806 case AMDGPU::DS_WRITE_B32_gfx9:
807 case AMDGPU::DS_WRITE_B64_gfx9:
808 Result.Addr = true;
809 return Result;
810 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
811 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
812 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
813 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
814 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
815 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
816 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
817 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
818 case AMDGPU::FLAT_LOAD_DWORD_SADDR:
819 case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
820 case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
821 case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
822 case AMDGPU::FLAT_STORE_DWORD_SADDR:
823 case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
824 case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
825 case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
826 Result.SAddr = true;
827 [[fallthrough]];
828 case AMDGPU::GLOBAL_LOAD_DWORD:
829 case AMDGPU::GLOBAL_LOAD_DWORDX2:
830 case AMDGPU::GLOBAL_LOAD_DWORDX3:
831 case AMDGPU::GLOBAL_LOAD_DWORDX4:
832 case AMDGPU::GLOBAL_STORE_DWORD:
833 case AMDGPU::GLOBAL_STORE_DWORDX2:
834 case AMDGPU::GLOBAL_STORE_DWORDX3:
835 case AMDGPU::GLOBAL_STORE_DWORDX4:
836 case AMDGPU::FLAT_LOAD_DWORD:
837 case AMDGPU::FLAT_LOAD_DWORDX2:
838 case AMDGPU::FLAT_LOAD_DWORDX3:
839 case AMDGPU::FLAT_LOAD_DWORDX4:
840 case AMDGPU::FLAT_STORE_DWORD:
841 case AMDGPU::FLAT_STORE_DWORDX2:
842 case AMDGPU::FLAT_STORE_DWORDX3:
843 case AMDGPU::FLAT_STORE_DWORDX4:
844 Result.VAddr = true;
845 return Result;
846 }
847}
848
849void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
850 const SILoadStoreOptimizer &LSO) {
851 I = MI;
852 unsigned Opc = MI->getOpcode();
853 InstClass = getInstClass(Opc, *LSO.TII);
854
855 if (InstClass == UNKNOWN)
856 return;
857
858 DataRC = LSO.getDataRegClass(*MI);
859
860 switch (InstClass) {
861 case DS_READ:
862 EltSize =
863 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
864 : 4;
865 break;
866 case DS_WRITE:
867 EltSize =
868 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
869 : 4;
870 break;
871 case S_BUFFER_LOAD_IMM:
872 case S_BUFFER_LOAD_SGPR_IMM:
873 case S_LOAD_IMM:
874 EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
875 break;
876 default:
877 EltSize = 4;
878 break;
879 }
880
881 if (InstClass == MIMG) {
882 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
883 // Offset is not considered for MIMG instructions.
884 Offset = 0;
885 } else {
886 int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
887 Offset = I->getOperand(OffsetIdx).getImm();
888 }
889
890 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
891 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
892 const AMDGPU::GcnBufferFormatInfo *Info =
893 AMDGPU::getGcnBufferFormatInfo(Format, *LSO.STM);
894 EltSize = Info->BitsPerComp / 8;
895 }
896
897 Width = getOpcodeWidth(*I, *LSO.TII);
898
899 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
900 Offset &= 0xffff;
901 } else if (InstClass != MIMG) {
902 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
903 }
904
905 AddressRegs Regs = getRegs(Opc, *LSO.TII);
906 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
907
908 NumAddresses = 0;
909 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
910 AddrIdx[NumAddresses++] =
911 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
912 if (Regs.Addr)
913 AddrIdx[NumAddresses++] =
914 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
915 if (Regs.SBase)
916 AddrIdx[NumAddresses++] =
917 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
918 if (Regs.SRsrc)
919 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
920 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
921 if (Regs.SOffset)
922 AddrIdx[NumAddresses++] =
923 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
924 if (Regs.SAddr)
925 AddrIdx[NumAddresses++] =
926 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
927 if (Regs.VAddr)
928 AddrIdx[NumAddresses++] =
929 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
930 if (Regs.SSamp)
931 AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
932 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
933 assert(NumAddresses <= MaxAddressRegs);
934
935 for (unsigned J = 0; J < NumAddresses; J++)
936 AddrReg[J] = &I->getOperand(AddrIdx[J]);
937}
938
939} // end anonymous namespace.
940
941INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
942 "SI Load Store Optimizer", false, false)
944INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy, DEBUG_TYPE,
945 "SI Load Store Optimizer", false, false)
946
947char SILoadStoreOptimizerLegacy::ID = 0;
948
949char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
950
952 return new SILoadStoreOptimizerLegacy();
953}
954
955static void addDefsUsesToList(const MachineInstr &MI,
956 DenseSet<Register> &RegDefs,
957 DenseSet<Register> &RegUses) {
958 for (const auto &Op : MI.operands()) {
959 if (!Op.isReg())
960 continue;
961 if (Op.isDef())
962 RegDefs.insert(Op.getReg());
963 if (Op.readsReg())
964 RegUses.insert(Op.getReg());
965 }
966}
967
968bool SILoadStoreOptimizer::canSwapInstructions(
969 const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
970 const MachineInstr &A, const MachineInstr &B) const {
971 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
972 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
973 return false;
974 for (const auto &BOp : B.operands()) {
975 if (!BOp.isReg())
976 continue;
977 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
978 return false;
979 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
980 return false;
981 }
982 return true;
983}
984
985// Given that \p CI and \p Paired are adjacent memory operations, produce a new
986// MMO for the combined operation with a new access size.
987MachineMemOperand *
988SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
989 const CombineInfo &Paired) {
990 const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
991 const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();
992
993 unsigned Size = MMOa->getSize().getValue() + MMOb->getSize().getValue();
994
995 // A base pointer for the combined operation is the same as the leading
996 // operation's pointer.
997 if (Paired < CI)
998 std::swap(MMOa, MMOb);
999
1000 MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
1001 // If merging FLAT and GLOBAL set address space to FLAT.
1002 if (MMOb->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
1003 PtrInfo.AddrSpace = AMDGPUAS::FLAT_ADDRESS;
1004
1005 MachineFunction *MF = CI.I->getMF();
1006 return MF->getMachineMemOperand(MMOa, PtrInfo, Size);
1007}
1008
1009bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
1010 const SIInstrInfo &TII,
1011 const CombineInfo &Paired) {
1012 assert(CI.InstClass == MIMG);
1013
1014 // Ignore instructions with tfe/lwe set.
1015 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
1016 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
1017
1018 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
1019 return false;
1020
1021 // Check other optional immediate operands for equality.
1022 AMDGPU::OpName OperandsToMatch[] = {
1023 AMDGPU::OpName::cpol, AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
1024 AMDGPU::OpName::da, AMDGPU::OpName::r128, AMDGPU::OpName::a16};
1025
1026 for (AMDGPU::OpName op : OperandsToMatch) {
1027 int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
1028 if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
1029 return false;
1030 if (Idx != -1 &&
1031 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
1032 return false;
1033 }
1034
1035 // Check DMask for overlaps.
1036 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
1037 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
1038
1039 if (!MaxMask)
1040 return false;
1041
1042 unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
1043 if ((1u << AllowedBitsForMin) <= MinMask)
1044 return false;
1045
1046 return true;
1047}
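// Worked example (added for illustration, not in the upstream source): for
// DMask values 0b0011 and 0b1100, MaxMask = 0b1100 and MinMask = 0b0011.
// countr_zero(MaxMask) == 2, so MinMask must be < (1u << 2) == 4; 0b0011
// qualifies, meaning the two dmasks cover disjoint, contiguous channel ranges
// and can be merged. DMask values 0b0110 and 0b0011 would be rejected because
// they overlap.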
1048
1049static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
1050 unsigned ComponentCount,
1051 const GCNSubtarget &STI) {
1052 if (ComponentCount > 4)
1053 return 0;
1054
1055 const llvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1056 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1057 if (!OldFormatInfo)
1058 return 0;
1059
1060 const llvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1061 llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1062 ComponentCount,
1063 OldFormatInfo->NumFormat, STI);
1064
1065 if (!NewFormatInfo)
1066 return 0;
1067
1068 assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1069 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1070
1071 return NewFormatInfo->Format;
1072}
1073
1074// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1075// highest power of two. Note that the result is well defined for all inputs
1076// including corner cases like:
1077// - if Lo == Hi, return that value
1078 // - if Lo == 0, return 0 (even though the "- 1" below underflows)
1079 // - if Lo > Hi, return 0 (as if the range wrapped around)
1080static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
1081 return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
1082}
1083
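// Worked example (added for illustration, not in the upstream source): in the
// inclusive range [5, 9] the value aligned to the highest power of two is 8
// (divisible by 2^3), so mostAlignedValueInRange(5, 9) == 8. For [6, 7] the
// result is 6, since 6 is 2-aligned while 7 is only 1-aligned.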
1084bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1085 const GCNSubtarget &STI,
1086 CombineInfo &Paired,
1087 bool Modify) {
1088 assert(CI.InstClass != MIMG);
1089
1090 // XXX - Would the same offset be OK? Is there any reason this would happen or
1091 // be useful?
1092 if (CI.Offset == Paired.Offset)
1093 return false;
1094
1095 // This won't be valid if the offset isn't aligned.
1096 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1097 return false;
1098
1099 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1100
1101 const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1102 llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1103 const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1104 llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1105
1106 if (Info0->BitsPerComp != Info1->BitsPerComp ||
1107 Info0->NumFormat != Info1->NumFormat)
1108 return false;
1109
1110 // For 8-bit or 16-bit formats there is no 3-component variant.
1111 // If NumCombinedComponents is 3, try the 4-component format and use XYZ.
1112 // Example:
1113 // tbuffer_load_format_x + tbuffer_load_format_x + tbuffer_load_format_x
1114 // ==> tbuffer_load_format_xyz with format:[BUF_FMT_16_16_16_16_SNORM]
1115 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1116 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1117 NumCombinedComponents = 4;
1118
1119 if (getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, STI) ==
1120 0)
1121 return false;
1122
1123 // Merge only when the two access ranges are strictly back-to-back;
1124 // any gap or overlap can overwrite data or leave holes.
1125 unsigned ElemIndex0 = CI.Offset / CI.EltSize;
1126 unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
1127 if (ElemIndex0 + CI.Width != ElemIndex1 &&
1128 ElemIndex1 + Paired.Width != ElemIndex0)
1129 return false;
1130
1131 // 1-byte formats require 1-byte alignment.
1132 // 2-byte formats require 2-byte alignment.
1133 // 4-byte and larger formats require 4-byte alignment.
1134 unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
1135 unsigned RequiredAlign = std::min(MergedBytes, 4u);
1136 unsigned MinOff = std::min(CI.Offset, Paired.Offset);
1137 if (MinOff % RequiredAlign != 0)
1138 return false;
1139
1140 return true;
1141 }
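// Worked example (added for illustration, not in the upstream source,
// assuming a matching two-component format exists for the target): two
// tbuffer_load_format_x accesses with a 32-bit component format at byte
// offsets 16 and 20 give ElemIndex0 = 4 and ElemIndex1 = 5, which are
// back-to-back; the merged 8-byte access starts at offset 16, satisfying the
// 4-byte alignment requirement, so the pair can be combined into an xy form
// (NumCombinedComponents == 2).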
1142
1143 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1144 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1145 CI.UseST64 = false;
1146 CI.BaseOff = 0;
1147
1148 // Handle all non-DS instructions.
1149 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1150 if (EltOffset0 + CI.Width != EltOffset1 &&
1151 EltOffset1 + Paired.Width != EltOffset0)
1152 return false;
1153 // Instructions with the scale_offset modifier cannot be combined unless we
1154 // also generate code to scale the offset and reset that bit.
1155 if (CI.CPol != Paired.CPol || (CI.CPol & AMDGPU::CPol::SCAL))
1156 return false;
1157 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1158 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1159 // Reject cases like:
1160 // dword + dwordx2 -> dwordx3
1161 // dword + dwordx3 -> dwordx4
1162 // If we tried to combine these cases, we would fail to extract a subreg
1163 // for the result of the second load due to SGPR alignment requirements.
1164 if (CI.Width != Paired.Width &&
1165 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1166 return false;
1167 }
1168 return true;
1169 }
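// Worked example (added for illustration, not in the upstream source): two
// buffer_load_dword accesses with EltSize 4 at byte offsets 8 and 12 have
// EltOffset0 = 2 and EltOffset1 = 3; since EltOffset0 + CI.Width (2 + 1)
// equals EltOffset1, the accesses are contiguous and, assuming matching
// cache-policy bits, can be merged into a single dwordx2 load.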
1170
1171 // If the offset in elements doesn't fit in 8 bits, we might be able to use
1172 // the stride-64 versions.
1173 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1174 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1175 if (Modify) {
1176 CI.Offset = EltOffset0 / 64;
1177 Paired.Offset = EltOffset1 / 64;
1178 CI.UseST64 = true;
1179 }
1180 return true;
1181 }
1182
1183 // Check if the new offsets fit in the reduced 8-bit range.
1184 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1185 if (Modify) {
1186 CI.Offset = EltOffset0;
1187 Paired.Offset = EltOffset1;
1188 }
1189 return true;
1190 }
1191
1192 // Try to shift base address to decrease offsets.
1193 uint32_t Min = std::min(EltOffset0, EltOffset1);
1194 uint32_t Max = std::max(EltOffset0, EltOffset1);
1195
1196 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1197 if (((Max - Min) & ~Mask) == 0) {
1198 if (Modify) {
1199 // From the range of values we could use for BaseOff, choose the one that
1200 // is aligned to the highest power of two, to maximise the chance that
1201 // the same offset can be reused for other load/store pairs.
1202 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
1203 // Copy the low bits of the offsets, so that when we adjust them by
1204 // subtracting BaseOff they will be multiples of 64.
1205 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1206 CI.BaseOff = BaseOff * CI.EltSize;
1207 CI.Offset = (EltOffset0 - BaseOff) / 64;
1208 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1209 CI.UseST64 = true;
1210 }
1211 return true;
1212 }
1213
1214 if (isUInt<8>(Max - Min)) {
1215 if (Modify) {
1216 // From the range of values we could use for BaseOff, choose the one that
1217 // is aligned to the highest power of two, to maximise the chance that
1218 // the same offset can be reused for other load/store pairs.
1219 uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
1220 CI.BaseOff = BaseOff * CI.EltSize;
1221 CI.Offset = EltOffset0 - BaseOff;
1222 Paired.Offset = EltOffset1 - BaseOff;
1223 }
1224 return true;
1225 }
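// Worked example (added for illustration, not in the upstream source): with
// EltOffset0 = 1000 and EltOffset1 = 1008, neither value fits in 8 bits, but
// Max - Min = 8 does. mostAlignedValueInRange(1008 - 255, 1000) picks the
// base 768 (256-aligned), giving per-instruction offsets 232 and 240, both of
// which fit in the 8-bit fields once 768 * CI.EltSize bytes are added to the
// base register.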
1226
1227 return false;
1228}
1229
1230bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1231 const CombineInfo &CI,
1232 const CombineInfo &Paired) {
1233 const unsigned Width = (CI.Width + Paired.Width);
1234 switch (CI.InstClass) {
1235 default:
1236 return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1237 case S_BUFFER_LOAD_IMM:
1238 case S_BUFFER_LOAD_SGPR_IMM:
1239 case S_LOAD_IMM:
1240 switch (Width) {
1241 default:
1242 return false;
1243 case 2:
1244 case 4:
1245 case 8:
1246 return true;
1247 case 3:
1248 return STM.hasScalarDwordx3Loads();
1249 }
1250 }
1251}
1252
1253const TargetRegisterClass *
1254SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1255 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1256 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1257 }
1258 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1259 return TRI->getRegClassForReg(*MRI, Src->getReg());
1260 }
1261 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1262 return TRI->getRegClassForReg(*MRI, Src->getReg());
1263 }
1264 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1265 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1266 }
1267 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1268 return TRI->getRegClassForReg(*MRI, Src->getReg());
1269 }
1270 return nullptr;
1271}
1272
1273/// This function assumes that CI comes before Paired in a basic block. Return
1274/// an insertion point for the merged instruction or nullptr on failure.
1275SILoadStoreOptimizer::CombineInfo *
1276SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1277 CombineInfo &Paired) {
1278 // If another instruction has already been merged into CI, it may now be a
1279 // type that we can't do any further merging into.
1280 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1281 return nullptr;
1282 assert(CI.InstClass == Paired.InstClass);
1283
1284 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1285 getInstSubclass(Paired.I->getOpcode(), *TII))
1286 return nullptr;
1287
1288 // Check both offsets (or masks for MIMG) can be combined and fit in the
1289 // reduced range.
1290 if (CI.InstClass == MIMG) {
1291 if (!dmasksCanBeCombined(CI, *TII, Paired))
1292 return nullptr;
1293 } else {
1294 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1295 return nullptr;
1296 }
1297
1298 DenseSet<Register> RegDefs;
1299 DenseSet<Register> RegUses;
1300 CombineInfo *Where;
1301 if (CI.I->mayLoad()) {
1302 // Try to hoist Paired up to CI.
1303 addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1304 for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {
1305 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1306 return nullptr;
1307 }
1308 Where = &CI;
1309 } else {
1310 // Try to sink CI down to Paired.
1311 addDefsUsesToList(*CI.I, RegDefs, RegUses);
1312 for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {
1313 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1314 return nullptr;
1315 }
1316 Where = &Paired;
1317 }
1318
1319 // Call offsetsCanBeCombined with modify = true so that the offsets are
1320 // correct for the new instruction. This should return true, because
1321 // this function should only be called on CombineInfo objects that
1322 // have already been confirmed to be mergeable.
1323 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1324 offsetsCanBeCombined(CI, *STM, Paired, true);
1325
1326 if (CI.InstClass == DS_WRITE) {
1327 // Both data operands must be AGPR or VGPR, so the data registers need to
1328 // be constrained to one or the other. We expect to only emit the VGPR form
1329 // here for now.
1330 //
1331 // FIXME: There is currently a hack in getRegClass to report that the write2
1332 // operands are VGPRs. In the future we should have separate agpr
1333 // instruction definitions.
1334 const MachineOperand *Data0 =
1335 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1336 const MachineOperand *Data1 =
1337 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1338
1339 const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
1340 int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1341 AMDGPU::OpName::data0);
1342 int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
1343 AMDGPU::OpName::data1);
1344
1345 const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
1346
1347 const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);
1348
1349 if (unsigned SubReg = Data0->getSubReg()) {
1350 DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
1351 DataRC0, SubReg);
1352 }
1353
1354 if (unsigned SubReg = Data1->getSubReg()) {
1355 DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),
1356 DataRC1, SubReg);
1357 }
1358
1359 if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
1360 !MRI->constrainRegClass(Data1->getReg(), DataRC1))
1361 return nullptr;
1362
1363 // TODO: If one register can be constrained, and not the other, insert a
1364 // copy.
1365 }
1366
1367 return Where;
1368}
1369
1370// Copy the merged load result from DestReg to the original dest regs of CI and
1371// Paired.
1372void SILoadStoreOptimizer::copyToDestRegs(
1373 CombineInfo &CI, CombineInfo &Paired,
1374 MachineBasicBlock::iterator InsertBefore, const DebugLoc &DL,
1375 AMDGPU::OpName OpName, Register DestReg) const {
1376 MachineBasicBlock *MBB = CI.I->getParent();
1377
1378 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1379
1380 // Copy to the old destination registers.
1381 const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
1382 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1383 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1384
1385 // The constrained sload instructions in the S_LOAD_IMM class will have the
1386 // `early-clobber` flag on the dst operand. Remove the flag before using the
1387 // MOs in copies.
1388 Dest0->setIsEarlyClobber(false);
1389 Dest1->setIsEarlyClobber(false);
1390
1391 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1392 .add(*Dest0) // Copy to same destination including flags and sub reg.
1393 .addReg(DestReg, {}, SubRegIdx0);
1394 BuildMI(*MBB, InsertBefore, DL, CopyDesc)
1395 .add(*Dest1)
1396 .addReg(DestReg, RegState::Kill, SubRegIdx1);
1397}
1398
1399// Return a register for the source of the merged store after copying the
1400// original source regs of CI and Paired into it.
1401Register
1402SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1403 MachineBasicBlock::iterator InsertBefore,
1404 const DebugLoc &DL,
1405 AMDGPU::OpName OpName) const {
1406 MachineBasicBlock *MBB = CI.I->getParent();
1407
1408 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1409
1410 // Copy to the new source register.
1411 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1412 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1413
1414 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1415 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1416
1417 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1418 .add(*Src0)
1419 .addImm(SubRegIdx0)
1420 .add(*Src1)
1421 .addImm(SubRegIdx1);
1422
1423 return SrcReg;
1424}
1425
1426unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1427 if (STM->ldsRequiresM0Init())
1428 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1429 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1430}
1431
1432unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1433 if (STM->ldsRequiresM0Init())
1434 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1435
1436 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1437 : AMDGPU::DS_READ2ST64_B64_gfx9;
1438}
1439
1440MachineBasicBlock::iterator
1441SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1442 MachineBasicBlock::iterator InsertBefore) {
1443 MachineBasicBlock *MBB = CI.I->getParent();
1444
1445 // Be careful, since the addresses could be subregisters themselves in weird
1446 // cases, like vectors of pointers.
1447 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1448
1449 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1450 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1451 unsigned Opc =
1452 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1453
1454 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1455 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1456
1457 const MCInstrDesc &Read2Desc = TII->get(Opc);
1458
1459 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1460 Register DestReg = MRI->createVirtualRegister(SuperRC);
1461
1462 DebugLoc DL =
1463 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1464
1465 Register BaseReg = AddrReg->getReg();
1466 unsigned BaseSubReg = AddrReg->getSubReg();
1467 RegState BaseRegFlags = {};
1468 if (CI.BaseOff) {
1469 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1470 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1471 .addImm(CI.BaseOff);
1472
1473 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1474 BaseRegFlags = RegState::Kill;
1475
1476 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1477 .addReg(ImmReg)
1478 .addReg(AddrReg->getReg(), {}, BaseSubReg)
1479 .addImm(0); // clamp bit
1480 BaseSubReg = 0;
1481 }
1482
1483 MachineInstrBuilder Read2 =
1484 BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
1485 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1486 .addImm(NewOffset0) // offset0
1487 .addImm(NewOffset1) // offset1
1488 .addImm(0) // gds
1489 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1490
1491 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
1492
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1495
1496 LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
1497 return Read2;
1498}
1499
1500unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1501 if (STM->ldsRequiresM0Init())
1502 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1503 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1504 : AMDGPU::DS_WRITE2_B64_gfx9;
1505}
1506
1507unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1508 if (STM->ldsRequiresM0Init())
1509 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1510 : AMDGPU::DS_WRITE2ST64_B64;
1511
1512 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1513 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1514}
1515
1516unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
1517 return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1518}
1519
1520MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1521 CombineInfo &CI, CombineInfo &Paired,
1522 MachineBasicBlock::iterator InsertBefore) {
1523 MachineBasicBlock *MBB = CI.I->getParent();
1524
1525 // Be sure to use .addOperand(), and not .addReg() with these. We want to be
1526 // sure we preserve the subregister index and any register flags set on them.
1527 const MachineOperand *AddrReg =
1528 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1529 const MachineOperand *Data0 =
1530 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1531 const MachineOperand *Data1 =
1532 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1533
1534 unsigned NewOffset0 = CI.Offset;
1535 unsigned NewOffset1 = Paired.Offset;
1536 unsigned Opc = getWrite2Opcode(CI);
1537
1538 if (NewOffset0 > NewOffset1) {
1539 // Canonicalize the merged instruction so the smaller offset comes first.
1540 std::swap(NewOffset0, NewOffset1);
1541 std::swap(Data0, Data1);
1542 }
1543
1544 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1545 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1546
1547 const MCInstrDesc &Write2Desc = TII->get(Opc);
1548 DebugLoc DL =
1549 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1550
1551 Register BaseReg = AddrReg->getReg();
1552 unsigned BaseSubReg = AddrReg->getSubReg();
1553 RegState BaseRegFlags = {};
1554 if (CI.BaseOff) {
1555 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1556 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
1557 .addImm(CI.BaseOff);
1558
1559 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1560 BaseRegFlags = RegState::Kill;
1561
1562 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1563 .addReg(ImmReg)
1564 .addReg(AddrReg->getReg(), {}, BaseSubReg)
1565 .addImm(0); // clamp bit
1566 BaseSubReg = 0;
1567 }
1568
1569 MachineInstrBuilder Write2 =
1570 BuildMI(*MBB, InsertBefore, DL, Write2Desc)
1571 .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
1572 .add(*Data0) // data0
1573 .add(*Data1) // data1
1574 .addImm(NewOffset0) // offset0
1575 .addImm(NewOffset1) // offset1
1576 .addImm(0) // gds
1577 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1578
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1581
1582 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1583 return Write2;
1584}
1585
1586MachineBasicBlock::iterator
1587SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1588 MachineBasicBlock::iterator InsertBefore) {
1589 MachineBasicBlock *MBB = CI.I->getParent();
1590 DebugLoc DL =
1591 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1592
1593 const unsigned Opcode = getNewOpcode(CI, Paired);
1594
1595 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1596
1597 Register DestReg = MRI->createVirtualRegister(SuperRC);
1598 unsigned MergedDMask = CI.DMask | Paired.DMask;
1599 unsigned DMaskIdx =
1600 AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1601
1602 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1603 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1604 if (I == DMaskIdx)
1605 MIB.addImm(MergedDMask);
1606 else
1607 MIB.add((*CI.I).getOperand(I));
1608 }
1609
1610 // It shouldn't be possible to get this far if the two instructions
1611 // don't have a single memoperand, because MachineInstr::mayAlias()
1612 // will return true if this is the case.
1613 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1614
1615 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1616
1617 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1618
1619 CI.I->eraseFromParent();
1620 Paired.I->eraseFromParent();
1621 return New;
1622}
1623
1624MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1625 CombineInfo &CI, CombineInfo &Paired,
1626 MachineBasicBlock::iterator InsertBefore) {
1627 MachineBasicBlock *MBB = CI.I->getParent();
1628 DebugLoc DL =
1629 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1630
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1632
1633 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1634
1635 Register DestReg = MRI->createVirtualRegister(SuperRC);
1636 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1637
1638 // It shouldn't be possible to get this far if the two instructions
1639 // don't have a single memoperand, because MachineInstr::mayAlias()
1640 // will return true if this is the case.
1641 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1642
1643 MachineInstrBuilder New =
1644 BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
1645 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1646 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1647 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1648 New.addImm(MergedOffset);
1649 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1650
1651 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);
1652
1653 CI.I->eraseFromParent();
1654 Paired.I->eraseFromParent();
1655 return New;
1656}
1657
1658MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1659 CombineInfo &CI, CombineInfo &Paired,
1660 MachineBasicBlock::iterator InsertBefore) {
1661 MachineBasicBlock *MBB = CI.I->getParent();
1662
1663 DebugLoc DL =
1664 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1665
1666 const unsigned Opcode = getNewOpcode(CI, Paired);
1667
1668 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1669
1670 // Copy to the new source register.
1671 Register DestReg = MRI->createVirtualRegister(SuperRC);
1672 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1673
1674 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1675
1676 AddressRegs Regs = getRegs(Opcode, *TII);
1677
1678 if (Regs.VAddr)
1679 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1680
1681 // It shouldn't be possible to get this far if the two instructions
1682 // don't have a single memoperand, because MachineInstr::mayAlias()
1683 // will return true if this is the case.
1684 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1685
1686 MachineInstr *New =
1687 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1688 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1689 .addImm(MergedOffset) // offset
1690 .addImm(CI.CPol) // cpol
1691 .addImm(0) // swz
1692 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1693
1694 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1695
1696 CI.I->eraseFromParent();
1697 Paired.I->eraseFromParent();
1698 return New;
1699}
1700
1701MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1702 CombineInfo &CI, CombineInfo &Paired,
1703 MachineBasicBlock::iterator InsertBefore) {
1704 MachineBasicBlock *MBB = CI.I->getParent();
1705
1706 DebugLoc DL =
1707 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1708
1709 const unsigned Opcode = getNewOpcode(CI, Paired);
1710
1711 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1712
1713 // Copy to the new source register.
1714 Register DestReg = MRI->createVirtualRegister(SuperRC);
1715 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1716
1717 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1718
1719 AddressRegs Regs = getRegs(Opcode, *TII);
1720
1721 if (Regs.VAddr)
1722 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1723
1724 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1725 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1726 // and use XYZ of XYZW to enable the merge.
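 // For instance, an XY load (Width 2) paired with an X load (Width 1) of a
 // 16-bit format is emitted as a 4-component (XYZW) op with only XYZ consumed.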
1727 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1728 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1729 NumCombinedComponents = 4;
1730 unsigned JoinedFormat =
1731 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1732
1733 // It shouldn't be possible to get this far if the two instructions
1734 // don't have a single memoperand, because MachineInstr::mayAlias()
1735 // will return true if this is the case.
1736 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1737
1738 MachineInstr *New =
1739 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1740 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1741 .addImm(MergedOffset) // offset
1742 .addImm(JoinedFormat) // format
1743 .addImm(CI.CPol) // cpol
1744 .addImm(0) // swz
1745 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1746
1747 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);
1748
1749 CI.I->eraseFromParent();
1750 Paired.I->eraseFromParent();
1751 return New;
1752}
1753
1754MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1755 CombineInfo &CI, CombineInfo &Paired,
1756 MachineBasicBlock::iterator InsertBefore) {
1757 MachineBasicBlock *MBB = CI.I->getParent();
1758 DebugLoc DL =
1759 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1760
1761 const unsigned Opcode = getNewOpcode(CI, Paired);
1762
1763 Register SrcReg =
1764 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
1765
1766 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1767 .addReg(SrcReg, RegState::Kill);
1768
1769 AddressRegs Regs = getRegs(Opcode, *TII);
1770
1771 if (Regs.VAddr)
1772 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1773
1774 // For 8-bit or 16-bit tbuffer formats there is no 3-component encoding.
1775 // If the combined count is 3 (e.g. X+X+X or XY+X), promote to 4 components
1776 // and use XYZ of XYZW to enable the merge.
1777 unsigned NumCombinedComponents = CI.Width + Paired.Width;
1778 if (NumCombinedComponents == 3 && CI.EltSize <= 2)
1779 NumCombinedComponents = 4;
1780 unsigned JoinedFormat =
1781 getBufferFormatWithCompCount(CI.Format, NumCombinedComponents, *STM);
1782
1783 // It shouldn't be possible to get this far if the two instructions
1784 // don't have a single memoperand, because MachineInstr::mayAlias()
1785 // will return true if this is the case.
1786 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1787
1788 MachineInstr *New =
1789 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1790 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1791 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
1792 .addImm(JoinedFormat) // format
1793 .addImm(CI.CPol) // cpol
1794 .addImm(0) // swz
1795 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1796
1797 CI.I->eraseFromParent();
1798 Paired.I->eraseFromParent();
1799 return New;
1800}
1801
1802MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1803 CombineInfo &CI, CombineInfo &Paired,
1804 MachineBasicBlock::iterator InsertBefore) {
1805 MachineBasicBlock *MBB = CI.I->getParent();
1806
1807 DebugLoc DL =
1808 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1809
1810 const unsigned Opcode = getNewOpcode(CI, Paired);
1811
1812 const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1813 Register DestReg = MRI->createVirtualRegister(SuperRC);
1814
1815 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1816
1817 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1818 MIB.add(*SAddr);
1819
1820 MachineInstr *New =
1821 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1822 .addImm(std::min(CI.Offset, Paired.Offset))
1823 .addImm(CI.CPol)
1824 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1825
1826 copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);
1827
1828 CI.I->eraseFromParent();
1829 Paired.I->eraseFromParent();
1830 return New;
1831}
1832
1833MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1834 CombineInfo &CI, CombineInfo &Paired,
1835 MachineBasicBlock::iterator InsertBefore) {
1836 MachineBasicBlock *MBB = CI.I->getParent();
1837
1838 DebugLoc DL =
1839 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
1840
1841 const unsigned Opcode = getNewOpcode(CI, Paired);
1842
1843 Register SrcReg =
1844 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
1845
1846 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
1847 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1848 .addReg(SrcReg, RegState::Kill);
1849
1850 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1851 MIB.add(*SAddr);
1852
1853 MachineInstr *New =
1854 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1855 .addImm(CI.CPol)
1856 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1857
1858 CI.I->eraseFromParent();
1859 Paired.I->eraseFromParent();
1860 return New;
1861}
1862
1863static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1864                                   ArrayRef<MachineMemOperand *> MMOs,
1865                                   unsigned Width) {
1866 // Conservatively returns true if the single MMO is not found.
1867 return STM.isXNACKEnabled() &&
1868 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1869}
1870
1871unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1872 const CombineInfo &Paired) {
1873 const unsigned Width = CI.Width + Paired.Width;
1874
1875 switch (getCommonInstClass(CI, Paired)) {
1876 default:
1877 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1878 // FIXME: Handle d16 correctly
1879 return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1880 Width);
1881 case TBUFFER_LOAD:
1882 case TBUFFER_STORE:
1883 return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1884 Width);
1885
1886 case UNKNOWN:
1887 llvm_unreachable("Unknown instruction class");
1888 case S_BUFFER_LOAD_IMM: {
1889 // If XNACK is enabled, use the constrained opcodes when the first load is
1890 // under-aligned.
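 // For example, merging into a DWORDX2 (Width == 2) requires the first load to
 // be at least 8-byte (Width * 4) aligned; otherwise the _ec form is chosen.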
1891 bool NeedsConstrainedOpc =
1892 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1893 switch (Width) {
1894 default:
1895 return 0;
1896 case 2:
1897 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1898 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1899 case 3:
1900 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1901 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1902 case 4:
1903 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1904 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1905 case 8:
1906 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1907 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1908 }
1909 }
1910 case S_BUFFER_LOAD_SGPR_IMM: {
1911 // If XNACK is enabled, use the constrained opcodes when the first load is
1912 // under-aligned.
1913 bool NeedsConstrainedOpc =
1914 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1915 switch (Width) {
1916 default:
1917 return 0;
1918 case 2:
1919 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1920 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1921 case 3:
1922 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1923 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1924 case 4:
1925 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1926 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1927 case 8:
1928 return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1929 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1930 }
1931 }
1932 case S_LOAD_IMM: {
1933 // If XNACK is enabled, use the constrained opcodes when the first load is
1934 // under-aligned.
1935 bool NeedsConstrainedOpc =
1936 needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1937 switch (Width) {
1938 default:
1939 return 0;
1940 case 2:
1941 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1942 : AMDGPU::S_LOAD_DWORDX2_IMM;
1943 case 3:
1944 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1945 : AMDGPU::S_LOAD_DWORDX3_IMM;
1946 case 4:
1947 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1948 : AMDGPU::S_LOAD_DWORDX4_IMM;
1949 case 8:
1950 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1951 : AMDGPU::S_LOAD_DWORDX8_IMM;
1952 }
1953 }
1954 case GLOBAL_LOAD:
1955 switch (Width) {
1956 default:
1957 return 0;
1958 case 2:
1959 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1960 case 3:
1961 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1962 case 4:
1963 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1964 }
1965 case GLOBAL_LOAD_SADDR:
1966 switch (Width) {
1967 default:
1968 return 0;
1969 case 2:
1970 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1971 case 3:
1972 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1973 case 4:
1974 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1975 }
1976 case GLOBAL_STORE:
1977 switch (Width) {
1978 default:
1979 return 0;
1980 case 2:
1981 return AMDGPU::GLOBAL_STORE_DWORDX2;
1982 case 3:
1983 return AMDGPU::GLOBAL_STORE_DWORDX3;
1984 case 4:
1985 return AMDGPU::GLOBAL_STORE_DWORDX4;
1986 }
1987 case GLOBAL_STORE_SADDR:
1988 switch (Width) {
1989 default:
1990 return 0;
1991 case 2:
1992 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1993 case 3:
1994 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1995 case 4:
1996 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1997 }
1998 case FLAT_LOAD:
1999 switch (Width) {
2000 default:
2001 return 0;
2002 case 2:
2003 return AMDGPU::FLAT_LOAD_DWORDX2;
2004 case 3:
2005 return AMDGPU::FLAT_LOAD_DWORDX3;
2006 case 4:
2007 return AMDGPU::FLAT_LOAD_DWORDX4;
2008 }
2009 case FLAT_STORE:
2010 switch (Width) {
2011 default:
2012 return 0;
2013 case 2:
2014 return AMDGPU::FLAT_STORE_DWORDX2;
2015 case 3:
2016 return AMDGPU::FLAT_STORE_DWORDX3;
2017 case 4:
2018 return AMDGPU::FLAT_STORE_DWORDX4;
2019 }
2020 case FLAT_LOAD_SADDR:
2021 switch (Width) {
2022 default:
2023 return 0;
2024 case 2:
2025 return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
2026 case 3:
2027 return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
2028 case 4:
2029 return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
2030 }
2031 case FLAT_STORE_SADDR:
2032 switch (Width) {
2033 default:
2034 return 0;
2035 case 2:
2036 return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
2037 case 3:
2038 return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
2039 case 4:
2040 return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
2041 }
2042 case MIMG:
2043 assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
2044 "No overlaps");
2045 return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
2046 }
2047}
2048
2049std::pair<unsigned, unsigned>
2050SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
2051 const CombineInfo &Paired) {
2052 assert((CI.InstClass != MIMG ||
2053 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
2054 CI.Width + Paired.Width)) &&
2055 "No overlaps");
2056
2057 unsigned Idx0;
2058 unsigned Idx1;
2059
2060 static const unsigned Idxs[5][4] = {
2061 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
2062 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
2063 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
2064 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
2065 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
2066 };
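 // For example, CI.Width == 2 and Paired.Width == 1 with CI ordered first
 // yields {sub0_sub1, sub2}; with Paired ordered first it yields
 // {sub1_sub2, sub0}.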
2067
2068 assert(CI.Width >= 1 && CI.Width <= 4);
2069 assert(Paired.Width >= 1 && Paired.Width <= 4);
2070
2071 if (Paired < CI) {
2072 Idx1 = Idxs[0][Paired.Width - 1];
2073 Idx0 = Idxs[Paired.Width][CI.Width - 1];
2074 } else {
2075 Idx0 = Idxs[0][CI.Width - 1];
2076 Idx1 = Idxs[CI.Width][Paired.Width - 1];
2077 }
2078
2079 return {Idx0, Idx1};
2080}
2081
2082const TargetRegisterClass *
2083SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
2084 const CombineInfo &Paired) const {
2085 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
2086 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
2087 switch (CI.Width + Paired.Width) {
2088 default:
2089 return nullptr;
2090 case 2:
2091 return &AMDGPU::SReg_64_XEXECRegClass;
2092 case 3:
2093 return &AMDGPU::SGPR_96RegClass;
2094 case 4:
2095 return &AMDGPU::SGPR_128RegClass;
2096 case 8:
2097 return &AMDGPU::SGPR_256RegClass;
2098 case 16:
2099 return &AMDGPU::SGPR_512RegClass;
2100 }
2101 }
2102
2103 // FIXME: This should compute the instruction to use, and then use the result
2104 // of TII->getRegClass.
2105 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
2106 return TRI->isAGPRClass(getDataRegClass(*CI.I))
2107 ? TRI->getAGPRClassForBitWidth(BitWidth)
2108 : TRI->getVGPRClassForBitWidth(BitWidth);
2109}
2110
2111MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
2112 CombineInfo &CI, CombineInfo &Paired,
2113 MachineBasicBlock::iterator InsertBefore) {
2114 MachineBasicBlock *MBB = CI.I->getParent();
2115 DebugLoc DL =
2116 DebugLoc::getMergedLocation(CI.I->getDebugLoc(), Paired.I->getDebugLoc());
2117
2118 const unsigned Opcode = getNewOpcode(CI, Paired);
2119
2120 Register SrcReg =
2121 copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
2122
2123 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
2124 .addReg(SrcReg, RegState::Kill);
2125
2126 AddressRegs Regs = getRegs(Opcode, *TII);
2127
2128 if (Regs.VAddr)
2129 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
2130
2131
2132 // It shouldn't be possible to get this far if the two instructions
2133 // don't have a single memoperand, because MachineInstr::mayAlias()
2134 // will return true if this is the case.
2135 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
2136
2137 MachineInstr *New =
2138 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
2139 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
2140 .addImm(std::min(CI.Offset, Paired.Offset)) // offset
2141 .addImm(CI.CPol) // cpol
2142 .addImm(0) // swz
2143 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
2144
2145 CI.I->eraseFromParent();
2146 Paired.I->eraseFromParent();
2147 return New;
2148}
2149
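// Create an operand for a 32-bit constant: keep it as an immediate if it is an
// inline constant, otherwise materialize it into an SGPR via S_MOV_B32
// (e.g. 64 remains an immediate, while 0x12345 requires an s_mov_b32).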
2150MachineOperand
2151SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
2152 APInt V(32, Val, true);
2153 if (TII->isInlineConstant(V))
2154 return MachineOperand::CreateImm(Val);
2155
2156 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2157 MachineInstr *Mov =
2158 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
2159 TII->get(AMDGPU::S_MOV_B32), Reg)
2160 .addImm(Val);
2161 (void)Mov;
2162 LLVM_DEBUG(dbgs() << " "; Mov->dump());
2163 return MachineOperand::CreateReg(Reg, false);
2164}
2165
2166// Compute base address using Addr and return the final register.
2167Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
2168 const MemAddress &Addr) const {
2169 MachineBasicBlock *MBB = MI.getParent();
2170 MachineBasicBlock::iterator MBBI = MI.getIterator();
2171 const DebugLoc &DL = MI.getDebugLoc();
2172
2173 LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
2174
2175 // Use V_ADD_U64_e64 when the original pattern used it (gfx1250+)
2176 if (Addr.Base.UseV64Pattern) {
2177 Register FullDestReg = MRI->createVirtualRegister(
2178 TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0));
2179
2180 // Load the 64-bit offset into an SGPR pair if needed
2181 Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2182 MachineInstr *MovOffset =
2183 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64_IMM_PSEUDO),
2184 OffsetReg)
2185 .addImm(Addr.Offset);
2186 MachineInstr *Add64 =
2187 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_U64_e64), FullDestReg)
2188 .addReg(Addr.Base.LoReg)
2189 .addReg(OffsetReg, RegState::Kill)
2190 .addImm(0);
2191 (void)MovOffset;
2192 (void)Add64;
2193 LLVM_DEBUG(dbgs() << " " << *MovOffset << "\n";
2194 dbgs() << " " << *Add64 << "\n\n";);
2195
2196 return FullDestReg;
2197 }
2198
2199 // Original carry-chain pattern (V_ADD_CO_U32 + V_ADDC_U32)
2200 assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2201 Addr.Base.LoSubReg) &&
2202 "Expected 32-bit Base-Register-Low!!");
2203
2204 assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2205 Addr.Base.HiSubReg) &&
2206 "Expected 32-bit Base-Register-Hi!!");
2207
2208 MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
2209 MachineOperand OffsetHi =
2210 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
2211
2212 const auto *CarryRC = TRI->getWaveMaskRegClass();
2213 Register CarryReg = MRI->createVirtualRegister(CarryRC);
2214 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
2215
2216 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2217 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2218 MachineInstr *LoHalf =
2219 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2220 .addReg(CarryReg, RegState::Define)
2221 .addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)
2222 .add(OffsetLo)
2223 .addImm(0); // clamp bit
2224
2225 MachineInstr *HiHalf =
2226 BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2227 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
2228 .addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg)
2229 .add(OffsetHi)
2230 .addReg(CarryReg, RegState::Kill)
2231 .addImm(0); // clamp bit
2232
2233 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
2234 MachineInstr *FullBase =
2235 BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2236 .addReg(DestSub0)
2237 .addImm(AMDGPU::sub0)
2238 .addReg(DestSub1)
2239 .addImm(AMDGPU::sub1);
2240
2241 (void)LoHalf;
2242 (void)HiHalf;
2243 (void)FullBase;
2244 LLVM_DEBUG(dbgs() << " " << *LoHalf << "\n";
2245 dbgs() << " " << *HiHalf << "\n";
2246 dbgs() << " " << *FullBase << "\n\n";);
2247
2248 return FullDestReg;
2249}
2250
2251// Update base and offset with the NewBase and NewOffset in MI.
2252void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2253 Register NewBase,
2254 int32_t NewOffset) const {
2255 auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2256 Base->setReg(NewBase);
2257 Base->setIsKill(false);
2258 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2259}
2260
2261// Helper to extract a 64-bit constant offset from a V_ADD_U64_e64 instruction.
2262// Returns true if successful, populating Addr with base register info and
2263// offset.
2264bool SILoadStoreOptimizer::processBaseWithConstOffset64(
2265 MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const {
2266 if (!Base.isReg())
2267 return false;
2268
2269 MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
2270 MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);
2271
2272 const MachineOperand *BaseOp = nullptr;
2273
2274 auto Offset = TII->getImmOrMaterializedImm(*Src1);
2275
2276 if (Offset) {
2277 BaseOp = Src0;
2278 Addr.Offset = *Offset;
2279 } else {
2280 // src1 does not provide a usable constant offset - can't handle this pattern
2281 return false;
2282 }
2283
2284 // Now extract the base register (which should be a 64-bit VGPR).
2285 Addr.Base.LoReg = BaseOp->getReg();
2286 Addr.Base.UseV64Pattern = true;
2287 return true;
2288}
2289
2290// Analyze Base and extract:
2291// - 32-bit base registers and subregisters
2292// - a 64-bit constant offset
2293// Expecting the base computation as:
2294// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2295// %LO:vgpr_32, %c:sreg_64_xexec =
2296// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %OFFSET0:sgpr_32,
2297// %HI:vgpr_32 = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2298// %Base:vreg_64 =
2299// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2300//
2301// Also handles V_ADD_U64_e64 pattern (gfx1250+):
2302// %OFFSET:sreg_64 = S_MOV_B64_IMM_PSEUDO 256
2303// %Base:vreg_64 = V_ADD_U64_e64 %BASE:vreg_64, %OFFSET:sreg_64, 0
2304void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
2305 MemAddress &Addr) const {
2306 if (!Base.isReg())
2307 return;
2308
2309 MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
2310 if (!Def)
2311 return;
2312
2313 // Try V_ADD_U64_e64 pattern first (simpler, used on gfx1250+)
2314 if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
2315 if (processBaseWithConstOffset64(Def, Base, Addr))
2316 return;
2317 }
2318
2319 // Fall through to REG_SEQUENCE + V_ADD_CO_U32 + V_ADDC_U32 pattern
2320 if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5)
2321 return;
2322
2323 MachineOperand BaseLo = Def->getOperand(1);
2324 MachineOperand BaseHi = Def->getOperand(3);
2325 if (!BaseLo.isReg() || !BaseHi.isReg())
2326 return;
2327
2328 MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
2329 MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
2330
2331 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2332 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2333 return;
2334
2335 MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2336 MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2337
2338 auto Offset0P = TII->getImmOrMaterializedImm(*Src0);
2339 if (Offset0P)
2340 BaseLo = *Src1;
2341 else {
2342 if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1)))
2343 return;
2344 BaseLo = *Src0;
2345 }
2346
2347 if (!BaseLo.isReg())
2348 return;
2349
2350 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2351 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2352
2353 if (Src0->isImm())
2354 std::swap(Src0, Src1);
2355
2356 if (!Src1->isImm() || Src0->isImm())
2357 return;
2358
2359 uint64_t Offset1 = Src1->getImm();
2360 BaseHi = *Src0;
2361
2362 if (!BaseHi.isReg())
2363 return;
2364
2365 Addr.Base.LoReg = BaseLo.getReg();
2366 Addr.Base.HiReg = BaseHi.getReg();
2367 Addr.Base.LoSubReg = BaseLo.getSubReg();
2368 Addr.Base.HiSubReg = BaseHi.getSubReg();
2369 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2370}
2371
2372// Maintain the correct LDS address for async loads.
2373// It becomes incorrect when promoteConstantOffsetToImm
2374// adds an offset only meant for the src operand.
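 // For instance, if promotion folded +2048 into the immediate offset, a
 // v_add_u32 of -2048 is applied to the LDS address operand so that the
 // effective LDS destination address stays unchanged.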
2375void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
2376 int32_t OffsetDiff) const {
2377 if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)
2378 return;
2379
2380 Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
2381 Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
2382 MachineBasicBlock &MBB = *MI.getParent();
2383 const DebugLoc &DL = MI.getDebugLoc();
2384 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), NewVDst)
2385 .addReg(OldVDst)
2386 .addImm(-OffsetDiff)
2387 .addImm(0);
2388
2389 MI.getOperand(0).setReg(NewVDst);
2390}
2391
2392bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2393 MachineInstr &MI,
2394 MemInfoMap &Visited,
2395 SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
2396
2397 if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2398 return false;
2399
2400 // TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2401 if (SIInstrInfo::isFLATScratch(MI))
2402 return false;
2403
2404 unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS
2405 : AMDGPUAS::FLAT_ADDRESS;
2406
2407 if (AnchorList.count(&MI))
2408 return false;
2409
2410 LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
2411
2412 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2413 LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
2414 return false;
2415 }
2416
2417 // Step1: Find the base-registers and a 64bit constant offset.
2418 MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2419 auto [It, Inserted] = Visited.try_emplace(&MI);
2420 MemAddress MAddr;
2421 if (Inserted) {
2422 processBaseWithConstOffset(Base, MAddr);
2423 It->second = MAddr;
2424 } else
2425 MAddr = It->second;
2426
2427 if (MAddr.Offset == 0) {
2428 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2429 " constant offsets that can be promoted.\n";);
2430 return false;
2431 }
2432
2433 LLVM_DEBUG(dbgs() << " BASE: {" << printReg(MAddr.Base.HiReg, TRI) << ", "
2434 << printReg(MAddr.Base.LoReg, TRI)
2435 << "} Offset: " << MAddr.Offset << "\n\n";);
2436
2437 // Step2: Traverse MI's basic block and find an anchor (an instruction with
2438 // the same base registers) at the highest 13-bit distance from MI's offset.
2439 // E.g. (64bit loads)
2440 // bb:
2441 // addr1 = &a + 4096; load1 = load(addr1, 0)
2442 // addr2 = &a + 6144; load2 = load(addr2, 0)
2443 // addr3 = &a + 8192; load3 = load(addr3, 0)
2444 // addr4 = &a + 10240; load4 = load(addr4, 0)
2445 // addr5 = &a + 12288; load5 = load(addr5, 0)
2446 //
2447 // Starting from the first load, the optimization tries to find a new base
2448 // from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2449 // &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2450 // &a + 8192 as the new base (anchor) because its larger distance can
2451 // presumably accommodate more intermediate addresses.
2452 //
2453 // Step3: move (&a + 8192) above load1. Compute and promote offsets from
2454 // (&a + 8192) for load1, load2, load4.
2455 // addr = &a + 8192
2456 // load1 = load(addr, -4096)
2457 // load2 = load(addr, -2048)
2458 // load3 = load(addr, 0)
2459 // load4 = load(addr, 2048)
2460 // addr5 = &a + 12288; load5 = load(addr5, 0)
2461 //
2462 MachineInstr *AnchorInst = nullptr;
2463 MemAddress AnchorAddr;
2464 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2465 SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2466
2467 MachineBasicBlock *MBB = MI.getParent();
2468 MachineBasicBlock::iterator E = MBB->end();
2469 MachineBasicBlock::iterator MBBI = MI.getIterator();
2470 ++MBBI;
2471 const SITargetLowering *TLI = STM->getTargetLowering();
2472
2473 for ( ; MBBI != E; ++MBBI) {
2474 MachineInstr &MINext = *MBBI;
2475 // TODO: Support finding an anchor(with same base) from store addresses or
2476 // any other load addresses where the opcodes are different.
2477 if (MINext.getOpcode() != MI.getOpcode() ||
2478 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2479 continue;
2480
2481 const MachineOperand &BaseNext =
2482 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2483 MemAddress MAddrNext;
2484 auto [It, Inserted] = Visited.try_emplace(&MINext);
2485 if (Inserted) {
2486 processBaseWithConstOffset(BaseNext, MAddrNext);
2487 It->second = MAddrNext;
2488 } else
2489 MAddrNext = It->second;
2490
2491 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2492 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2493 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2494 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2495 continue;
2496
2497 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2498
2499 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2500 TargetLoweringBase::AddrMode AM;
2501 AM.HasBaseReg = true;
2502 AM.BaseOffs = Dist;
2503 if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2504 (uint32_t)std::abs(Dist) > MaxDist) {
2505 MaxDist = std::abs(Dist);
2506
2507 AnchorAddr = MAddrNext;
2508 AnchorInst = &MINext;
2509 }
2510 }
2511
2512 if (AnchorInst) {
2513 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2514 AnchorInst->dump());
2515 LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
2516 << AnchorAddr.Offset << "\n\n");
2517
2518 // Instead of moving up, just re-compute anchor-instruction's base address.
2519 Register Base = computeBase(MI, AnchorAddr);
2520
2521 int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
2522 updateBaseAndOffset(MI, Base, OffsetDiff);
2523 updateAsyncLDSAddress(MI, OffsetDiff);
2524 LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
2525
2526 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2527 TargetLoweringBase::AddrMode AM;
2528 AM.HasBaseReg = true;
2529 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2530
2531 if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2532 LLVM_DEBUG(dbgs() << " Promote Offset(" << OtherOffset; dbgs() << ")";
2533 OtherMI->dump());
2534 int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
2535 updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff);
2536 updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
2537 LLVM_DEBUG(dbgs() << " After promotion: "; OtherMI->dump());
2538 }
2539 }
2540 AnchorList.insert(AnchorInst);
2541 return true;
2542 }
2543
2544 return false;
2545}
2546
2547void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2548 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2549 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2550 if (AddrList.front().InstClass == CI.InstClass &&
2551 AddrList.front().hasSameBaseAddress(CI)) {
2552 AddrList.emplace_back(CI);
2553 return;
2554 }
2555 }
2556
2557 // Base address not found, so add a new list.
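 // emplace_back(1, CI) uses std::list's fill constructor to create a
 // single-element list holding a copy of CI.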
2558 MergeableInsts.emplace_back(1, CI);
2559}
2560
2561std::pair<MachineBasicBlock::iterator, bool>
2562SILoadStoreOptimizer::collectMergeableInsts(
2563 MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
2564 MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
2565 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2566 bool Modified = false;
2567
2568 // Sort potentially mergeable instructions into lists, one list per base address.
2569 unsigned Order = 0;
2570 MachineBasicBlock::iterator BlockI = Begin;
2571 for (; BlockI != End; ++BlockI) {
2572 MachineInstr &MI = *BlockI;
2573
2574 // We run this before checking if an address is mergeable, because it can produce
2575 // better code even if the instructions aren't mergeable.
2576 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2577 Modified = true;
2578
2579 // Treat volatile accesses, ordered accesses and unmodeled side effects as
2580 // barriers. We can look after this barrier for separate merges.
2581 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2582 LLVM_DEBUG(dbgs() << "Breaking search on barrier: " << MI);
2583
2584 // Search will resume after this instruction in a separate merge list.
2585 ++BlockI;
2586 break;
2587 }
2588
2589 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2590 if (InstClass == UNKNOWN)
2591 continue;
2592
2593 // Do not merge VMEM buffer instructions with "swizzled" bit set.
2594 int Swizzled =
2595 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2596 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2597 continue;
2598
2599 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
2600 const MachineOperand *Fmt =
2601 TII->getNamedOperand(MI, AMDGPU::OpName::format);
2602 if (!AMDGPU::getGcnBufferFormatInfo(Fmt->getImm(), *STM)) {
2603 LLVM_DEBUG(dbgs() << "Skip tbuffer with unknown format: " << MI);
2604 continue;
2605 }
2606 }
2607
2608 CombineInfo CI;
2609 CI.setMI(MI, *this);
2610 CI.Order = Order++;
2611
2612 if (!CI.hasMergeableAddress(*MRI))
2613 continue;
2614
2615 LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
2616
2617 addInstToMergeableList(CI, MergeableInsts);
2618 }
2619
2620 // At this point we have lists of Mergeable instructions.
2621 //
2622 // Part 2: Sort lists by offset and then for each CombineInfo object in the
2623 // list try to find an instruction that can be merged with I. If an instruction
2624 // is found, it is stored in the Paired field. If no instructions are found, then
2625 // the CombineInfo object is deleted from the list.
2626
2627 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2628 E = MergeableInsts.end(); I != E;) {
2629
2630 std::list<CombineInfo> &MergeList = *I;
2631 if (MergeList.size() <= 1) {
2632 // This means we have found only one instruction with a given address
2633 // that can be merged, and we need at least 2 instructions to do a merge,
2634 // so this list can be discarded.
2635 I = MergeableInsts.erase(I);
2636 continue;
2637 }
2638
2639 // Sort the lists by offsets, this way mergeable instructions will be
2640 // adjacent to each other in the list, which will make it easier to find
2641 // matches.
2642 MergeList.sort(
2643 [] (const CombineInfo &A, const CombineInfo &B) {
2644 return A.Offset < B.Offset;
2645 });
2646 ++I;
2647 }
2648
2649 return {BlockI, Modified};
2650}
2651
2652// Scan through looking for adjacent LDS operations with constant offsets from
2653// the same base register. We rely on the scheduler to do the hard work of
2654// clustering nearby loads, and assume these are all adjacent.
2655bool SILoadStoreOptimizer::optimizeBlock(
2656 std::list<std::list<CombineInfo> > &MergeableInsts) {
2657 bool Modified = false;
2658
2659 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2660 E = MergeableInsts.end(); I != E;) {
2661 std::list<CombineInfo> &MergeList = *I;
2662
2663 bool OptimizeListAgain = false;
2664 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2665 // We weren't able to make any changes, so delete the list so we don't
2666 // process the same instructions the next time we try to optimize this
2667 // block.
2668 I = MergeableInsts.erase(I);
2669 continue;
2670 }
2671
2672 Modified = true;
2673
2674 // We made changes, but also determined that there were no more optimization
2675 // opportunities, so we don't need to reprocess the list
2676 if (!OptimizeListAgain) {
2677 I = MergeableInsts.erase(I);
2678 continue;
2679 }
2680 OptimizeAgain = true;
2681 }
2682 return Modified;
2683}
2684
2685bool
2686SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2687 std::list<CombineInfo> &MergeList,
2688 bool &OptimizeListAgain) {
2689 if (MergeList.empty())
2690 return false;
2691
2692 bool Modified = false;
2693
2694 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2695 Next = std::next(I)) {
2696
2697 auto First = I;
2698 auto Second = Next;
2699
2700 if ((*First).Order > (*Second).Order)
2701 std::swap(First, Second);
2702 CombineInfo &CI = *First;
2703 CombineInfo &Paired = *Second;
2704
2705 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2706 if (!Where) {
2707 ++I;
2708 continue;
2709 }
2710
2711 Modified = true;
2712
2713 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2714
2715 MachineBasicBlock::iterator NewMI;
2716 switch (CI.InstClass) {
2717 default:
2718 llvm_unreachable("unknown InstClass");
2719 break;
2720 case DS_READ:
2721 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2722 break;
2723 case DS_WRITE:
2724 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2725 break;
2726 case S_BUFFER_LOAD_IMM:
2727 case S_BUFFER_LOAD_SGPR_IMM:
2728 case S_LOAD_IMM:
2729 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2730 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2731 break;
2732 case BUFFER_LOAD:
2733 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2734 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2735 break;
2736 case BUFFER_STORE:
2737 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2738 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2739 break;
2740 case MIMG:
2741 NewMI = mergeImagePair(CI, Paired, Where->I);
2742 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2743 break;
2744 case TBUFFER_LOAD:
2745 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2746 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2747 break;
2748 case TBUFFER_STORE:
2749 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2750 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2751 break;
2752 case FLAT_LOAD:
2753 case FLAT_LOAD_SADDR:
2754 case GLOBAL_LOAD:
2755 case GLOBAL_LOAD_SADDR:
2756 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2757 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2758 break;
2759 case FLAT_STORE:
2760 case FLAT_STORE_SADDR:
2761 case GLOBAL_STORE:
2762 case GLOBAL_STORE_SADDR:
2763 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2764 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2765 break;
2766 }
2767 CI.setMI(NewMI, *this);
2768 CI.Order = Where->Order;
2769 if (I == Second)
2770 I = Next;
2771
2772 MergeList.erase(Second);
2773 }
2774
2775 return Modified;
2776}
2777
2778bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2779 if (skipFunction(MF.getFunction()))
2780 return false;
2781 return SILoadStoreOptimizer(
2782 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2783 .run(MF);
2784}
2785
2786bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2787 this->MF = &MF;
2788 STM = &MF.getSubtarget<GCNSubtarget>();
2789 if (!STM->loadStoreOptEnabled())
2790 return false;
2791
2792 TII = STM->getInstrInfo();
2793 TRI = &TII->getRegisterInfo();
2794
2795 MRI = &MF.getRegInfo();
2796
2797 LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
2798
2799 bool Modified = false;
2800
2801 // Contains the list of instructions for which constant offsets are being
2802 // promoted to the IMM. This is tracked for an entire block at a time.
2803 SmallPtrSet<MachineInstr *, 4> AnchorList;
2804 MemInfoMap Visited;
2805
2806 for (MachineBasicBlock &MBB : MF) {
2807 MachineBasicBlock::iterator SectionEnd;
2808 for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
2809 I = SectionEnd) {
2810 bool CollectModified;
2811 std::list<std::list<CombineInfo>> MergeableInsts;
2812
2813 // First pass: Collect list of all instructions we know how to merge in a
2814 // subset of the block.
2815 std::tie(SectionEnd, CollectModified) =
2816 collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
2817
2818 Modified |= CollectModified;
2819
2820 do {
2821 OptimizeAgain = false;
2822 Modified |= optimizeBlock(MergeableInsts);
2823 } while (OptimizeAgain);
2824 }
2825
2826 Visited.clear();
2827 AnchorList.clear();
2828 }
2829
2830 return Modified;
2831}
2832
2833PreservedAnalyses
2834SILoadStoreOptimizerPass::run(MachineFunction &MF,
2835 MachineFunctionAnalysisManager &MFAM) {
2836 MFPropsModifier _(*this, MF);
2837
2838 if (MF.getFunction().hasOptNone())
2839 return PreservedAnalyses::all();
2840
2841 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2842 .getManager();
2843 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2844
2845 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2846 if (!Changed)
2847 return PreservedAnalyses::all();
2848
2849 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2850 PA.preserveSet<CFGAnalyses>();
2851 return PA;
2852}