#define DEBUG_TYPE "si-load-store-opt"

  unsigned char NumVAddrs = 0;

  const unsigned MaxAddressRegs = 12 + 1 + 1;
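  // Presumably 12 MIMG vaddr operands plus two further address operands
  // (hence 12 + 1 + 1); this bounds the AddrIdx[] array declared below.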
  InstClassEnum InstClass;

  int AddrIdx[MaxAddressRegs];

  unsigned NumAddresses;

    for (unsigned i = 0; i < NumAddresses; i++) {

    for (unsigned i = 0; i < NumAddresses; ++i) {

      if (!AddrOp->isReg())

    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;

  struct BaseRegisters {

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

                                             const CombineInfo &Paired);

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                          MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

                           int32_t NewOffset) const;

  void addInstToMergeableList(const CombineInfo &CI,
                              std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

                                           const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {

  if (TII.isMIMG(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:

  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_WRITE_B64_gfx9:
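// These case groups bucket opcodes by access width: the DWORD, DWORDX2,
// DWORDX3, DWORDX4 and DWORDX8 variants access 1, 2, 3, 4 and 8 dwords
// respectively, presumably the value getOpcodeWidth() reports into
// CombineInfo::Width below.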
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {

  if (TII.isMUBUF(Opc)) {

    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:

    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

  if (TII.isMIMG(Opc)) {

    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {

    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:

    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
      return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {

  if (TII.isMUBUF(Opc))

  if (TII.isMIMG(Opc)) {

    return Info->BaseOpcode;

  if (TII.isMTBUF(Opc))

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
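// getInstSubclass() collapses the width variants of one operation onto a
// single representative opcode; checkAndPrepareMerge() below requires CI and
// Paired to share this subclass before attempting to pair them.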
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&

    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
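// When the common class resolves to GLOBAL_LOAD/GLOBAL_STORE rather than
// FLAT_LOAD/FLAT_STORE, getNewOpcode() below picks a GLOBAL_* merged opcode
// instead of a FLAT_* one.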
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {

  if (TII.isMUBUF(Opc)) {

    Result.SOffset = true;

  if (TII.isMIMG(Opc)) {

    if (VAddr0Idx >= 0) {

      Result.NumVAddrs = SRsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {

    Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
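// getRegs() records which address operands (vaddr, srsrc, soffset, ...) a
// given opcode carries; CombineInfo::setMI() below turns those flags into the
// AddrIdx[] operand indices used when comparing base addresses.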
                                        const SILoadStoreOptimizer &LSO) {

  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8

        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

  case S_BUFFER_LOAD_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

    Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {

  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

    AddrIdx[NumAddresses++] =

  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
727 "SI Load Store Optimizer",
false,
false)
732 char SILoadStoreOptimizer::
ID = 0;
737 return new SILoadStoreOptimizer();
743 for (
const auto &
Op :
MI.operands()) {
bool SILoadStoreOptimizer::canSwapInstructions(

  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))

  for (const auto &BOp : B.operands()) {

    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))

    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
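// canSwapInstructions() refuses to reorder A past B when the two may alias as
// memory accesses (both access memory and at least one of them stores), when
// B reads or defines a register that A defines, or when B defines a register
// that A uses.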
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {

        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)
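// The dmask check (via AllowedBitsForMin) only accepts pairs whose smaller
// dmask fits entirely below the lowest set bit of the larger one, i.e. the
// two instructions cover disjoint, non-interleaved component ranges: for
// example dmasks 0b0011 and 0b1100 can be combined, while 0b0101 and 0b1010
// cannot.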
                                       unsigned ComponentCount,

  if (ComponentCount > 4)

  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    return (EltOffset0 + CI.Width == EltOffset1 ||
            EltOffset1 + Paired.Width == EltOffset0) &&
           CI.CPol == Paired.CPol;

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&

      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {

      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

    CI.BaseOff = BaseOff * CI.EltSize;
    CI.Offset = EltOffset0 - BaseOff;
    Paired.Offset = EltOffset1 - BaseOff;
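// DS_READ2/DS_WRITE2 encode two 8-bit per-slot offsets in units of EltSize,
// and the ST64 forms encode them in units of 64 * EltSize (element offsets 0
// and 64 become ST64 offsets 0 and 1, for instance). When the raw offsets do
// not fit, a common BaseOff is split out here and later added into the
// address register by the merge, so the remaining per-slot offsets fit in
// 8 bits.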
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {

  case S_BUFFER_LOAD_IMM:
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {

  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))

  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {

      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
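// EltSize 4 selects the B32 read2 forms and EltSize 8 the B64 forms; the
// _gfx9 variants are the ones that do not require M0 to be initialized for
// LDS access.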
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {

         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {

    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
    CombineInfo &CI, CombineInfo &Paired,

      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);

      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {

         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)

          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  unsigned MergedDMask = CI.DMask | Paired.DMask;

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {

      MIB.addImm(MergedDMask);

      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

          .addReg(DestReg, 0, SubRegIdx0);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {

  case S_BUFFER_LOAD_IMM:
      return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;

      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;

      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;

      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;

      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {

                    CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];

    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return std::make_pair(Idx0, Idx1);
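// For example, when CI's data ends up first in the merged register, two
// single-dword accesses yield (sub0, sub1), and a two-dword CI followed by a
// two-dword Paired yields (sub0_sub1, sub2_sub3).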
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {

      return &AMDGPU::SReg_64_XEXECRegClass;

      return &AMDGPU::SGPR_128RegClass;

      return &AMDGPU::SGPR_256RegClass;

      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
    CombineInfo &CI, CombineInfo &Paired,

  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))

      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))

  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)

                                        const MemAddress &Addr) const {

                                 Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

                                 Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())

  return Def->getOperand(1).getImm();
                                                      MemAddress &Addr) const {

  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);

    if (!(Offset0P = extractConstOffset(*Src1)))

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(

    MemInfoMap &Visited,

  if (!(MI.mayLoad() ^ MI.mayStore()))

      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  if (Visited.find(&MI) == Visited.end()) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;

    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);

  MemAddress AnchorAddr;

        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;

      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

  LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
             AnchorInst->dump());

                    << AnchorAddr.Offset << "\n\n");

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto P : InstsWCommonBase) {

      AM.BaseOffs = P.second - AnchorAddr.Offset;

                 dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);

    AnchorList.insert(AnchorInst);
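// promoteConstantOffsetToImm() looks for nearby loads/stores whose 64-bit
// address is built from the same base registers with a different constant
// offset, takes the farthest one whose distance still fits the immediate
// offset field as the anchor, and rewrites this instruction (plus any others
// within reach) to reuse the anchor's base with the difference folded into
// the immediate offset.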
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(*CI.I)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(

    std::list<std::list<CombineInfo>> &MergeableInsts) const {

  for (; BlockI != End; ++BlockI) {

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

    addInstToMergeableList(CI, MergeableInsts);

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {

      I = MergeableInsts.erase(I);

        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;

  return std::make_pair(BlockI, Modified);
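// Lists that end up with fewer than two candidates are dropped, and each
// surviving list is sorted by offset so that instructions with adjacent
// offsets sit next to each other before pairing.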
    std::list<std::list<CombineInfo>> &MergeableInsts) {

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {

    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {

      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);

    OptimizeAgain = true;
bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)

    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    switch (CI.InstClass) {

      NewMI = mergeRead2Pair(CI, Paired, Where->I);

      NewMI = mergeWrite2Pair(CI, Paired, Where->I);

    case S_BUFFER_LOAD_IMM:
      NewMI = mergeSBufferLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;

      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;

    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
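// After a successful merge the list is processed again: a merged
// S_BUFFER_LOAD_IMM can keep growing until it covers 8 dwords, while the
// vector load/store classes stop at 4 dwords.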
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {

  TRI = &TII->getRegisterInfo();

  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      OptimizeAgain = false;

    } while (OptimizeAgain);