#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;
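// GFX10 image_sample instructions can have up to 12 vaddr operands plus the
// resource (srsrc) and sampler (ssamp) descriptors, hence the 12 + 1 + 1 below.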
const unsigned MaxAddressRegs = 12 + 1 + 1;

  InstClassEnum InstClass;
  int AddrIdx[MaxAddressRegs];
  unsigned NumAddresses;
  bool hasSameBaseAddress(const CombineInfo &CI) {
    if (NumAddresses != CI.NumAddresses)
      return false;

    const MachineInstr &MI = *CI.I;
    for (unsigned i = 0; i < NumAddresses; i++) {
      const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);

      if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
        if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
            AddrReg[i]->getImm() != AddrRegNext.getImm())
          return false;
    for (unsigned i = 0; i < NumAddresses; ++i) {
      const MachineOperand *AddrOp = AddrReg[i];
      // Don't try to merge addresses that aren't either immediates or
      // registers.
      if (!AddrOp->isReg())
        return false;

      if (AddrOp->getReg().isPhysical() &&
          AddrOp->getReg() != AMDGPU::SGPR_NULL)
        return false;

      // If an address has only one use then there will be no other
      // instructions with the same address, so we can't merge this one.
      if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
        return false;
    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
  struct BaseRegisters {
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI,
                               const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
                                                    const CombineInfo &Paired);

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &MI, Register NewBase,
                           int32_t NewOffset) const;
  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  default:
    return 0;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  }
}
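// Note: the width computed above is the number of 32-bit components an
// instruction accesses, e.g. DS_READ_B64 counts as 2 and S_LOAD_DWORDX8_IMM
// as 8, so a candidate pair's combined width is simply the sum of the two.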
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_LOAD;
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_STORE;
    }
  }
  if (TII.isImage(Opc)) {
    // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
        TII.isGather4(Opc))
      return UNKNOWN;
    return MIMG;
  }
  if (TII.isMTBUF(Opc)) {
    switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_LOAD;
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;
    }
  }
  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
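// FLAT instructions may address either the flat or the global aperture, so
// they are initially classified as FLAT_LOAD/FLAT_STORE; a pair that provably
// only touches global memory (isFLATGlobal on both) can be merged as if it
// were GLOBAL_LOAD/GLOBAL_STORE.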
  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      unsigned RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8 : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }
  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizer::ID = 0;

FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  return new SILoadStoreOptimizer();
}
  for (const auto &Op : MI.operands()) {
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}
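// In other words, B may be moved across A only when B neither reads or
// redefines a register A defines, nor defines a register A reads, nor
// may-aliases A's memory access when at least one of the two writes memory.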
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;
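// Illustration of the overlap check: DMasks 0b0011 and 0b1100 can be
// combined, because the smaller mask fits entirely below the lowest set bit
// of the larger one. DMasks 0b0011 and 0b0110 cannot, since they overlap in
// bit 1 and (1u << countr_zero(0b0110)) == 2 <= 0b0011.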
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;

  return NewFormatInfo->Format;
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);
  // XXX - Would the same offset be OK?
  if (CI.Offset == Paired.Offset)
    return false;
  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride-64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }
  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }
  // Try to shift the base address to decrease the offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);
  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets so that, after subtracting BaseOff,
      // they remain multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }
  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }
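// Example of the stride-64 encoding above: element offsets 0 and 64 (byte
// offsets 0 and 256 with EltSize == 4) are both multiples of 64, so they
// become offset0 = 0 and offset1 = 1 in the 8-bit offset fields of a
// DS_READ2ST64/DS_WRITE2ST64 instruction.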
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check that the offsets (or the dmasks for MIMG) can be combined and fit
  // in the reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
      return nullptr;
  } else {
    // Try to sink CI down to Paired.
    if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
      return nullptr;
  }

  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // rewritten into the form the merged instruction needs.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  // Copy to the old destination registers.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
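// Illustrative effect of mergeRead2Pair (register numbers hypothetical):
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:32
// ==>
//   ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
// followed by copies from the merged result to the original destinations.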
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
    BaseSubReg = 0;
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .add(*Data0)                               // data0
          .add(*Data1)                               // data1
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1
          .addImm(0)                                 // gds
          .cloneMergedMemRefs({&*CI.I, &*Paired.I});

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
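// Illustrative effect of mergeWrite2Pair (register numbers hypothetical):
//   ds_write_b32 v2, v0 offset:16
//   ds_write_b32 v2, v1 offset:32
// ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:8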
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.CPol)      // cpol
      .addImm(0)            // swz
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(JoinedFormat) // format
      .addImm(CI.CPol)      // cpol
      .addImm(0)            // swz
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset)) // offset
      .addImm(JoinedFormat)                       // format
      .addImm(CI.CPol)                            // cpol
      .addImm(0)                                  // swz
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest1)
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(CI.CPol)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3: return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3: return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3: return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4: return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8: return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3: return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4: return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3: return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4: return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3: return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4: return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3: return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4: return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3: return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4: return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default: return 0;
    case 2: return AMDGPU::FLAT_STORE_DWORDX2;
    case 3: return AMDGPU::FLAT_STORE_DWORDX3;
    case 4: return AMDGPU::FLAT_STORE_DWORDX4;
    }
  }
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          (AMDGPU::getMIMGBaseOpcode(CI.I->getOpcode())->VDataDwords ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  unsigned Idx0;
  unsigned Idx1;
  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::pair(Idx0, Idx1);
}
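// Example: CI.Width == 1, Paired.Width == 2, with CI at the lower offset,
// yields (sub0, sub1_sub2): CI's value occupies dword 0 of the merged
// register and Paired's value occupies dwords 1-2.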
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the new source register.
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset)) // offset
      .addImm(CI.CPol)                            // cpol
      .addImm(0)                                  // swz
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), Reg)
      .addImm(Val);
  return MachineOperand::CreateReg(Reg, false);
}
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
      || Def->getNumOperands() != 5)
    return;

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (Src0->isImm())
    std::swap(Src0, Src1);

  uint64_t Offset1 = Src1->getImm();
  BaseHi = *Src0;

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
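// Rough sketch of the promotion performed below: scan the block for other
// loads/stores built from the same 64-bit base registers, pick the one whose
// constant offset is farthest from this instruction's (the "anchor"), then
// rewrite this instruction (and any compatible siblings) to address off the
// anchor's base with the residual distance folded into the immediate offset
// field, so only one base-address computation survives.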
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  if (MI.mayLoad() &&
      TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  if (AnchorList.count(&MI))
    return false;

  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    return false;
  }

  // Step 1: Find the base registers and the 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  for (auto &MINext : MBB) {
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }
  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Rebase MI on the anchor's address and fold the remaining distance into
    // the immediate offset field.
    Register Base = computeBase(MI, AnchorAddr);
    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second; dbgs() << ")";
                   P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potential merges into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat ordered memory references and unmodeled side effects as barriers
    // to merging.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Don't combine if volatile. Also, no swizzled loads or stores.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

  // At this point, lists with a single instruction cannot be merged; drop
  // them.
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);
      continue;
    }

    // Sort the lists by offset; this way mergeable instructions end up
    // adjacent to each other in the list.
    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}
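// Because each list is sorted by offset, any two instructions that can merge
// are adjacent in their list, so the pairing loop in
// optimizeInstsWithSameBaseAddr only ever has to consider neighboring entries.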
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so drop the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
    ++I;
  }
  return Modified;
}
bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList,
    bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {
    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
  TRI = &TII->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
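// Merging runs to a fixpoint: e.g. four adjacent dword loads first combine
// into two x2 loads, and the next optimizeBlock() iteration (requested via
// OptimizeAgain) can combine those into a single x4 load.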