#define DEBUG_TYPE "si-load-store-opt"

enum InstClassEnum {
  // ...
  S_BUFFER_LOAD_SGPR_IMM,
  // ...
};

struct AddressRegs {
  unsigned char NumVAddrs = 0;
  // ... (SBase, SRsrc, SOffset, SAddr, VAddr, Addr, SSamp flags)
};
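// Presumably sized for the worst case of an image sample instruction:
// up to 12 VADDR operands plus one resource (SRsrc) and one sampler (SSamp).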
const unsigned MaxAddressRegs = 12 + 1 + 1;
  struct CombineInfo {
    MachineBasicBlock::iterator I;
    // ...
    InstClassEnum InstClass;
    // ...
    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;
    unsigned Order;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      const MachineInstr &MI = *CI.I;
      for (unsigned i = 0; i < NumAddresses; i++) {
        const MachineOperand &AddrRegNext = MI.getOperand(CI.AddrIdx[i]);

        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm())
            return false;
          continue;
        }
        // ... (compare base registers and subregisters)
      }
      return true;
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        const MachineOperand *AddrOp = AddrReg[i];
        // Immediates are always OK.
        if (AddrOp->isImm())
          continue;
        // Don't try to merge addresses that aren't either immediates or
        // registers.
        if (!AddrOp->isReg())
          return false;
        // Only the SGPR null register is mergeable among physical registers.
        if (AddrOp->getReg().isPhysical() &&
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;
        // If an address has only one use then there will be no other
        // instructions with the same address, so there is nothing to merge.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
      return true;
    }

    bool operator<(const CombineInfo &Other) const {
      return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
    }
  };
  struct BaseRegisters {
    Register LoReg;
    Register HiReg;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  };
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  static unsigned getNewOpcode(const CombineInfo &CI,
                               const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired);

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

  void updateBaseAndOffset(MachineInstr &MI, Register NewBase,
                           int32_t NewOffset) const;

  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
      MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
      MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }

  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties()
        .set(MachineFunctionProperties::Property::IsSSA);
  }
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    return AMDGPU::getMUBUFElements(Opc);
  }
  if (TII.isImage(MI)) {
    uint64_t DMaskImm =
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
    return llvm::popcount(DMaskImm);
  }
  if (TII.isMTBUF(Opc)) {
    return AMDGPU::getMTBUFElements(Opc);
  }

  switch (Opc) {
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
    return 1;
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
    return 2;
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
    return 3;
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return 4;
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return 8;
  case AMDGPU::DS_READ_B32:      [[fallthrough]];
  case AMDGPU::DS_READ_B32_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B32:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B32_gfx9:
    return 1;
  case AMDGPU::DS_READ_B64:      [[fallthrough]];
  case AMDGPU::DS_READ_B64_gfx9: [[fallthrough]];
  case AMDGPU::DS_WRITE_B64:     [[fallthrough]];
  case AMDGPU::DS_WRITE_B64_gfx9:
    return 2;
  default:
    return 0;
  }
}
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_LOAD;
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
    case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
      return BUFFER_STORE;
    }
  }
  if (TII.isImage(Opc)) {
    // ... (instructions without vaddr and BVH instructions are rejected)
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
        /* ... */)
      return UNKNOWN;
    return MIMG;
  }
  if (TII.isMTBUF(Opc)) {
    switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
    default:
      return UNKNOWN;
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_LOAD;
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
    case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
      return TBUFFER_STORE;
    }
  }
  switch (Opc) {
  default:
    return UNKNOWN;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return S_LOAD_IMM;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
    return DS_READ;
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return DS_WRITE;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return FLAT_LOAD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return FLAT_STORE;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  }
}
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  switch (Opc) {
  default:
    if (TII.isMUBUF(Opc))
      return AMDGPU::getMUBUFBaseOpcode(Opc);
    if (TII.isImage(Opc)) {
      const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
      assert(Info);
      return Info->BaseOpcode;
    }
    if (TII.isMTBUF(Opc))
      return AMDGPU::getMTBUFBaseOpcode(Opc);
    return -1;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
    return Opc;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  }
}
SILoadStoreOptimizer::InstClassEnum
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
      SIInstrInfo::isFLATGlobal(*CI.I) && SIInstrInfo::isFLATGlobal(*Paired.I))
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

  return CI.InstClass;
}
static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  AddressRegs Result;

  if (TII.isMUBUF(Opc)) {
    if (AMDGPU::getMUBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMUBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMUBUFHasSoffset(Opc))
      Result.SOffset = true;
    return Result;
  }

  if (TII.isImage(Opc)) {
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      int RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;
    } else {
      Result.VAddr = true;
    }
    Result.SRsrc = true;
    // ... (SSamp for sample instructions)
    return Result;
  }

  if (TII.isMTBUF(Opc)) {
    if (AMDGPU::getMTBUFHasVAddr(Opc))
      Result.VAddr = true;
    if (AMDGPU::getMTBUFHasSrsrc(Opc))
      Result.SRsrc = true;
    if (AMDGPU::getMTBUFHasSoffset(Opc))
      Result.SOffset = true;
    return Result;
  }

  switch (Opc) {
  default:
    return Result;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
    Result.SOffset = true;
    [[fallthrough]];
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
    Result.SBase = true;
    return Result;
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:
    Result.Addr = true;
    return Result;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    Result.SAddr = true;
    [[fallthrough]];
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    Result.VAddr = true;
    return Result;
  }
}
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  I = MI;
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)
    return;

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

  switch (InstClass) {
  case DS_READ:
    EltSize =
        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
                                                                        : 4;
    break;
  case DS_WRITE:
    EltSize =
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
                                                                          : 4;
    break;
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    EltSize = AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
    break;
  default:
    EltSize = 4;
    break;
  }

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
    Offset = 0;
  } else {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
    Offset &= 0xffff;
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
  }

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  NumAddresses = 0;
  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
  if (Regs.Addr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
  if (Regs.SBase)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
  if (Regs.SRsrc)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  if (Regs.SOffset)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
  if (Regs.SAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (Regs.VAddr)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
  if (Regs.SSamp)
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
830 "SI Load Store Optimizer",
false,
false)
835char SILoadStoreOptimizer::
ID = 0;
840 return new SILoadStoreOptimizer();
static void addDefsUsesToList(const MachineInstr &MI,
                              DenseSet<Register> &RegDefs,
                              DenseSet<Register> &RegUses) {
  for (const auto &Op : MI.operands()) {
    if (!Op.isReg())
      continue;
    if (Op.isDef())
      RegDefs.insert(Op.getReg());
    if (Op.readsReg())
      RegUses.insert(Op.getReg());
  }
}

bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
    return false;
  for (const auto &BOp : B.operands()) {
    if (!BOp.isReg())
      continue;
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
      return false;
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
      return false;
  }
  return true;
}
// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
    return false;

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
      return false;
    if (Idx != -1 &&
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
      return false;
  }

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);
  // ...
  if ((1u << AllowedBitsForMin) <= MinMask)
    return false;

  return true;
}
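// [Illustrative sketch, not part of the pass.] The dmask-overlap test above
// reduces to plain bit arithmetic. A minimal standalone version, assuming a
// MIMG-style dmask in which each bit enables one result component; the helper
// name is hypothetical. Two masks are combinable only when the smaller mask
// fits entirely below the lowest set bit of the larger one.
static bool dmasksCanMergeSketch(unsigned DMask0, unsigned DMask1) {
  unsigned MaxMask = DMask0 > DMask1 ? DMask0 : DMask1;
  unsigned MinMask = DMask0 > DMask1 ? DMask1 : DMask0;
  if (!MaxMask)
    return false;
  // Count trailing zeros of MaxMask: the bit-width left over for MinMask.
  unsigned AllowedBitsForMin = 0;
  while (!((MaxMask >> AllowedBitsForMin) & 1u))
    ++AllowedBitsForMin;
  return (1u << AllowedBitsForMin) > MinMask;
}
// e.g. dmasksCanMergeSketch(0b0001, 0b0010) == true  (x, then y component)
//      dmasksCanMergeSketch(0b0011, 0b0010) == false (y component overlaps)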
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
                                             unsigned ComponentCount,
                                             const GCNSubtarget &STI) {
  if (ComponentCount > 4)
    return 0;
  // ... (look up a buffer format with the requested component count)
  return NewFormatInfo->Format;
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // Would merging with the same offset ever be useful? Reject it for now.
  if (CI.Offset == Paired.Offset)
    return false;

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
    return false;

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    // ... (the merged buffer format must exist for the combined width)
  }

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)
      return false;
    if (CI.CPol != Paired.CPol)
      return false;
    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject merges where the narrower access comes first in memory.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
        return false;
    }
    return true;
  }

  // If the offset in elements doesn't fit in 8 bits, we might be able to use
  // the stride-64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    if (Modify) {
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    if (Modify) {
      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;
    }
    return true;
  }

  // Try to shift the base address to decrease the offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
  if (((Max - Min) & ~Mask) == 0) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
      // Copy the low bits of the offsets, so that after subtracting BaseOff
      // they are multiples of 64.
      BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;
      CI.UseST64 = true;
    }
    return true;
  }

  if (isUInt<8>(Max - Min)) {
    if (Modify) {
      uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;
    }
    return true;
  }

  return false;
}
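// [Illustrative sketch, not part of the pass.] The DS encoding rules above as
// standalone arithmetic, assuming the 8-bit offset fields of ds_read2 /
// ds_write2, which are scaled by the element size, and the *st64 variants,
// which additionally scale by 64. Names are hypothetical.
static bool canEncodeDSPairSketch(unsigned EltOffset0, unsigned EltOffset1,
                                  bool &UseST64) {
  // Stride-64 form: both element offsets are multiples of 64 and the
  // quotients fit in the 8-bit fields.
  if (EltOffset0 % 64 == 0 && EltOffset1 % 64 == 0 &&
      EltOffset0 / 64 <= 0xff && EltOffset1 / 64 <= 0xff) {
    UseST64 = true;
    return true;
  }
  // Plain form: both element offsets fit in 8 bits directly.
  if (EltOffset0 <= 0xff && EltOffset1 <= 0xff) {
    UseST64 = false;
    return true;
  }
  // Otherwise the pass tries to subtract a common base (CI.BaseOff above)
  // before giving up.
  return false;
}
// e.g. element offsets (0, 64)    -> st64 form, encoded fields (0, 1)
//      element offsets (4, 5)     -> plain form, encoded fields (4, 5)
//      element offsets (300, 364) -> needs a re-based address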
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  default:
    return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
  case S_LOAD_IMM:
    // ... (2/4/8 dwords are always OK; 3 only with scalar dwordx3 loads)
  }
}
const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  return nullptr;
}
// This function assumes that CI comes before Paired in a basic block. Return
// an insertion point for the merged instruction, or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
    return nullptr;
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))
    return nullptr;

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // reduced range.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
      return nullptr;
  } else {
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
      return nullptr;
  }

  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    // ... (scan the instructions in between)
    if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
      return nullptr;
    // ...
  } else {
    // Try to sink CI down to Paired.
    // ... (scan the instructions in between)
    if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
      return nullptr;
    // ...
  }

  // Call offsetsCanBeCombined with Modify = true so that the offsets are
  // correct for the new instruction.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
  return Where;
}
unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.
    std::swap(NewOffset0, NewOffset1);
    std::swap(SubRegIdx0, SubRegIdx1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ... (s_mov_b32 of CI.BaseOff into ImmReg)
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
  }

  MachineInstrBuilder Read2 =
      BuildMI(*MBB, InsertBefore, DL, Read2Desc, DestReg)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)
          // ... (offset0, offset1, gds, merged memoperands)

  // Copy the low half of the merged result back to the first destination.
  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (and the high half to the second destination)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return Read2;
}
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  if (STM->ldsRequiresM0Init())
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;

  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc =
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    std::swap(NewOffset0, NewOffset1);
    std::swap(Data0, Data1);
  }

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
  if (CI.BaseOff) {
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    // ... (s_mov_b32 of CI.BaseOff into ImmReg)
    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(ImmReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit
  }

  MachineInstrBuilder Write2 =
      BuildMI(*MBB, InsertBefore, DL, Write2Desc)
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)
          // ... (data0, data1, offset0, offset1, gds, merged memoperands)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  return Write2;
}
MachineBasicBlock::iterator
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                                     MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
  unsigned DMaskIdx =
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
    if (I == DMaskIdx)
      MIB.addImm(MergedDMask);
    else
      MIB.add((*CI.I).getOperand(I));
  }

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  unsigned SubRegIdx0, SubRegIdx1;
  std::tie(SubRegIdx0, SubRegIdx1) = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (second copy with SubRegIdx1)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
      BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg)
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::sdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (second copy with SubRegIdx1)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (second copy with SubRegIdx1)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(MergedOffset) // offset
          .addImm(JoinedFormat) // format
          .addImm(CI.CPol)      // cpol
          .addImm(0)            // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (second copy with SubRegIdx1)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy the merged data into a new super register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =
      getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(JoinedFormat)                       // format
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
          .addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy to the old destination registers.
  const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  const auto *Dest1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdst);

  BuildMI(*MBB, InsertBefore, DL, CopyDesc)
      .add(*Dest0)
      .addReg(DestReg, 0, SubRegIdx0);
  // ... (second copy with SubRegIdx1)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy the merged data into a new super register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
                 .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
    MIB.add(*SAddr);

  MachineInstr *New =
      MIB.addImm(std::min(CI.Offset, Paired.Offset))
          .addImm(CI.CPol)
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
  default:
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case TBUFFER_LOAD:
  case TBUFFER_STORE:
    return AMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
                                  Width);
  case UNKNOWN:
    llvm_unreachable("Unknown instruction class");
  case S_BUFFER_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
    case 3: return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
    case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
    case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
    }
  case S_BUFFER_LOAD_SGPR_IMM:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
    case 3: return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
    case 4: return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
    case 8: return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    }
  case S_LOAD_IMM:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::S_LOAD_DWORDX2_IMM;
    case 3: return AMDGPU::S_LOAD_DWORDX3_IMM;
    case 4: return AMDGPU::S_LOAD_DWORDX4_IMM;
    case 8: return AMDGPU::S_LOAD_DWORDX8_IMM;
    }
  case GLOBAL_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::GLOBAL_LOAD_DWORDX2;
    case 3: return AMDGPU::GLOBAL_LOAD_DWORDX3;
    case 4: return AMDGPU::GLOBAL_LOAD_DWORDX4;
    }
  case GLOBAL_LOAD_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
    case 3: return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
    case 4: return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
    }
  case GLOBAL_STORE:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::GLOBAL_STORE_DWORDX2;
    case 3: return AMDGPU::GLOBAL_STORE_DWORDX3;
    case 4: return AMDGPU::GLOBAL_STORE_DWORDX4;
    }
  case GLOBAL_STORE_SADDR:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
    case 3: return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
    case 4: return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
    }
  case FLAT_LOAD:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::FLAT_LOAD_DWORDX2;
    case 3: return AMDGPU::FLAT_LOAD_DWORDX3;
    case 4: return AMDGPU::FLAT_LOAD_DWORDX4;
    }
  case FLAT_STORE:
    switch (Width) {
    default:
      return 0;
    case 2: return AMDGPU::FLAT_STORE_DWORDX2;
    case 3: return AMDGPU::FLAT_STORE_DWORDX3;
    case 4: return AMDGPU::FLAT_STORE_DWORDX4;
    }
  case MIMG:
    return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
  }
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
          ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
           CI.Width + Paired.Width)) &&
         "No overlaps");

  unsigned Idx0;
  unsigned Idx1;

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

  if (Paired < CI) {
    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
  } else {
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
  }

  return std::pair(Idx0, Idx1);
}
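// [Illustrative sketch, not part of the pass.] The Idxs table encodes a
// simple (first dword, dword count) pairing: row = first dword of the slice,
// column = slice length minus one. Modeling sub-register indices as such
// pairs, the selection above is equivalent to the following; all names here
// are hypothetical.
struct SubRegSliceSketch {
  unsigned First; // first dword within the merged register
  unsigned Count; // number of dwords
};

static void pickSlicesSketch(unsigned WidthCI, unsigned WidthPaired,
                             bool PairedComesFirst,
                             SubRegSliceSketch &IdxCI,
                             SubRegSliceSketch &IdxPaired) {
  if (PairedComesFirst) {
    // Paired is lower in memory: it takes the low dwords.
    IdxPaired = {0, WidthPaired};
    IdxCI = {WidthPaired, WidthCI};
  } else {
    IdxCI = {0, WidthCI};
    IdxPaired = {WidthCI, WidthPaired};
  }
}
// Merging a 1-dword load at the lower offset with a 2-dword load above it
// yields IdxCI = {0,1} (sub0) and IdxPaired = {1,2} (sub1_sub2), matching
// Idxs[0][0] and Idxs[1][1] in the table above.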
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
    default:
      return nullptr;
    case 2:
      return &AMDGPU::SReg_64_XEXECRegClass;
    case 3:
      return &AMDGPU::SGPR_96RegClass;
    case 4:
      return &AMDGPU::SGPR_128RegClass;
    case 8:
      return &AMDGPU::SGPR_256RegClass;
    case 16:
      return &AMDGPU::SGPR_512RegClass;
    }
  }

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
             ? TRI->getAGPRClassForBitWidth(BitWidth)
             : TRI->getVGPRClassForBitWidth(BitWidth);
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
    CombineInfo &CI, CombineInfo &Paired,
    MachineBasicBlock::iterator InsertBefore) {
  const unsigned Opcode = getNewOpcode(CI, Paired);

  std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
  const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);

  // Copy the merged data into a new super register.
  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, AMDGPU::OpName::vdata);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
      .add(*Src0)
      .addImm(SubRegIdx0)
      .add(*Src1)
      .addImm(SubRegIdx1);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode))
                 .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);

  if (Regs.VAddr)
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New =
      MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
          .addImm(std::min(CI.Offset, Paired.Offset)) // offset
          .addImm(CI.CPol)                            // cpol
          .addImm(0)                                  // swz
          .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
  return New;
}
MachineOperand
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))
    return MachineOperand::CreateImm(Val);

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MachineInstr *Mov =
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
          .addImm(Val);
  (void)Mov;
  LLVM_DEBUG(dbgs() << "    "; Mov->dump());
  return MachineOperand::CreateReg(Reg, false);
}
// Compute base address using Addr and return the final register.
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  LLVM_DEBUG(dbgs() << "  Re-Computing Base:\n");
  MachineOperand OffsetLo =
      createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  // ... (v_add_co_u32 of the low halves, v_addc_u32 of the high halves)

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  // ... (REG_SEQUENCE of DestSub0 and DestSub1)

  return FullDestReg;
}
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
std::optional<int32_t>
SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  if (Op.isImm())
    return Op.getImm();

  if (!Op.isReg())
    return std::nullopt;

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())
    return std::nullopt;

  return Def->getOperand(1).getImm();
}
// Analyze Base and extract the 32-bit base registers/subregisters and the
// 64-bit constant offset folded into them.
void SILoadStoreOptimizer::processBaseWithConstOffset(
    const MachineOperand &Base, MemAddress &Addr) const {
  // ...
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)
    return;

  // ... (find the defs of the two halves of the REG_SEQUENCE)
  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
    return;

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
  if (Offset0P)
    BaseLo = *Src1;
  else {
    if (!(Offset0P = extractConstOffset(*Src1)))
      return;
    BaseLo = *Src0;
  }

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
  // ... (extract Offset1 and BaseHi the same way, then record the base)

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
}
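// [Illustrative sketch, not part of the pass.] The line above reassembles one
// 64-bit constant offset from the two 32-bit immediates that fed the
// v_add_co_u32 / v_addc_u32 pair; a standalone equivalent (hypothetical name):
static unsigned long long
combineOffsetHalvesSketch(unsigned OffsetLo, unsigned OffsetHi) {
  return ((unsigned long long)OffsetLo & 0x00000000ffffffffULL) |
         ((unsigned long long)OffsetHi << 32);
}
// e.g. combineOffsetHalvesSketch(0x1000, 0x2) == 0x0000000200001000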
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  // ...
  if (!(MI.mayLoad() ^ MI.mayStore()))
    return false;

  // ... (only pure loads are handled here; stores carry vdata)
  if (TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
    return false;

  // If the immediate offset is already in use, nothing to promote into it.
  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
    return false;
  }

  // Step 1: find the base registers and the 64-bit constant offset.
  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  MemAddress MAddr;
  if (!Visited.contains(&MI)) {
    processBaseWithConstOffset(Base, MAddr);
    Visited[&MI] = MAddr;
  } else
    MAddr = Visited[&MI];

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);
    return false;
  }

  LLVM_DEBUG(dbgs() << "  BASE: {" << MAddr.Base.HiReg << ", "
                    << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset
                    << "\n\n";);

  // Step 2: traverse the basic block looking for an anchor instruction with
  // the same base but the maximum legal distance from MI's offset, collecting
  // all other instructions with the same base along the way.
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  for (MachineBasicBlock::iterator MBBI = std::next(MI.getIterator()),
                                   E = MI.getParent()->end();
       MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // ... (must be the same opcode with a zero immediate offset)
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (!Visited.contains(&MINext)) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);

      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }

  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << "  Anchor-Offset from BASE: " << AnchorAddr.Offset
                      << "\n\n");

    // Instead of moving up, re-compute the anchor instruction's base address.
    Register Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << "  Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().IsAGPR == CI.IsAGPR &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}
std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  bool Modified = false;

  // Sort potentially mergeable instructions into lists, one per base address.
  unsigned Order = 0;
  MachineBasicBlock::iterator BlockI = Begin;
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    // Run this before checking whether an address is mergeable, because it
    // can produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    // Treat volatile accesses and side effects as a barrier: end the section.
    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
      ++BlockI;
      break;
    }

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Don't combine buffer instructions with the "swizzled" bit set.
    int Swizzled =
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *this);
    CI.Order = Order++;

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
      // ... (ds_write2 with AGPR data would need a copy through VGPRs)
      continue;
    }

    addInstToMergeableList(CI, MergeableInsts);
  }

  // At this point we have lists of mergeable instructions. Drop trivial lists
  // and sort the rest by offset so mergeable instructions become adjacent.
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      // This means we have found only one instruction with a given address
      // that can be merged, so we need at least two instructions.
      I = MergeableInsts.erase(I);
      continue;
    }

    MergeList.sort(
        [](const CombineInfo &A, const CombineInfo &B) {
          return A.Offset < B.Offset;
        });
    ++I;
  }

  return std::pair(BlockI, Modified);
}
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  bool Modified = false;

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // If no instructions were merged, remove the list and move on.
      I = MergeableInsts.erase(I);
      continue;
    }

    Modified = true;

    // We weren't able to make any further progress on this list, so drop it.
    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
      continue;
    }
    OptimizeAgain = true;
  }
  return Modified;
}
bool SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I);
       Next != MergeList.end(); Next = std::next(I)) {
    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
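// [Illustrative sketch, not part of the pass.] The OptimizeListAgain flags
// above drive an outer fixed-point loop: after a merge whose combined width
// is still below the class limit, the list is rescanned so the merged result
// can pair up again (e.g. four 1-dword loads -> one 4-dword load). A
// standalone model of that control flow, with hypothetical names and a
// 4-dword limit:
static unsigned mergePassesSketch(unsigned Widths[], unsigned &N) {
  unsigned Passes = 0;
  bool OptimizeAgain;
  do {
    OptimizeAgain = false;
    for (unsigned I = 0; I + 1 < N;) {
      if (Widths[I] + Widths[I + 1] <= 4) {
        Widths[I] += Widths[I + 1];     // merge the adjacent pair
        for (unsigned J = I + 1; J + 1 < N; ++J)
          Widths[J] = Widths[J + 1];    // erase the second element
        --N;
        OptimizeAgain |= Widths[I] < 4; // may still widen further
      } else {
        ++I;
      }
    }
    ++Passes;
  } while (OptimizeAgain);
  return Passes;
}
// Starting from widths {1, 1, 1, 1}: the first pass merges down to {4}, and
// a second pass confirms nothing further can be merged. Note the real pass
// additionally gates 3-dword results on hasDwordx3LoadStores().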
bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  // ...
  TRI = &TII->getRegisterInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  // ... (for each section of each basic block:)
  bool CollectModified;
  std::list<std::list<CombineInfo>> MergeableInsts;

  // First pass: collect all the instructions we know how to merge in a
  // subset of the block.
  std::tie(SectionEnd, CollectModified) =
      collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
  Modified |= CollectModified;

  do {
    OptimizeAgain = false;
    Modified |= optimizeBlock(MergeableInsts);
  } while (OptimizeAgain);
  // ...
}