#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,

  unsigned char NumVAddrs = 0;
  bool SOffset = false;

const unsigned MaxAddressRegs = 12 + 1 + 1;
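// SILoadStoreOptimizer merges pairs of adjacent DS, SMEM, buffer, flat and
// global loads/stores that access consecutive dwords from the same base into
// a single wider instruction (for example two s_load_dword become one
// s_load_dwordx2), and promotes large constant address offsets into the
// instruction's immediate offset field where the addressing mode allows it.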
class SILoadStoreOptimizer {
    InstClassEnum InstClass;
    int AddrIdx[MaxAddressRegs];
    unsigned NumAddresses;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
      for (unsigned i = 0; i < NumAddresses; i++) {
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

      for (unsigned i = 0; i < NumAddresses; ++i) {
        if (!AddrOp->isReg())
            AddrOp->getReg() != AMDGPU::SGPR_NULL)
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

  struct BaseRegisters {
    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
    bool UseV64Pattern = false;
  static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  getTargetRegisterClass(const CombineInfo &CI, const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  unsigned getWrite2Opcode(const CombineInfo &CI) const;
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                           int32_t NewOffset) const;
  void updateAsyncLDSAddress(MachineInstr &MI, int32_t OffsetDiff) const;
                   MemAddress &Addr) const;
                        std::list<std::list<CombineInfo>> &MergeableInsts) const;
                     std::list<std::list<CombineInfo>> &MergeableInsts) const;
                                       const CombineInfo &Paired);
  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);

  StringRef getPassName() const override { return "SI Load Store Optimizer"; }
  const unsigned Opc = MI.getOpcode();

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:

  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:
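// getInstClass(Opc, TII): buckets an opcode into the InstClassEnum used for
// pairing (DS_READ/DS_WRITE, S_LOAD/S_BUFFER_LOAD, BUFFER, TBUFFER, MIMG and
// the FLAT/GLOBAL classes, with separate *_SADDR variants); opcodes that map
// to UNKNOWN are never considered for merging.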
  case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
  case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
  case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
  case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

  case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
  case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
  case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
  case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

  if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
  case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
  case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
    return TBUFFER_STORE;

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return S_BUFFER_LOAD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return S_BUFFER_LOAD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:

  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return GLOBAL_LOAD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:

  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return GLOBAL_STORE_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return FLAT_LOAD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return FLAT_STORE_SADDR;

  return Info->BaseOpcode;
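// getInstSubclass(Opc, TII): collapses each opcode family to its single-dword
// form (e.g. every S_LOAD_DWORDX{2,3,4,8}_IMM variant maps back to
// S_LOAD_DWORD_IMM); two instructions are only paired when their subclasses
// match, which keeps e.g. SADDR and non-SADDR forms apart.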
  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
    return AMDGPU::S_LOAD_DWORD_IMM;
  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
    return AMDGPU::FLAT_LOAD_DWORD;
  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
    return AMDGPU::FLAT_STORE_DWORD;
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
    return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
    return AMDGPU::FLAT_LOAD_DWORD_SADDR;
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:
    return AMDGPU::FLAT_STORE_DWORD_SADDR;
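// getCommonInstClass(CI, Paired): two FLAT accesses that are both known to
// address global memory are merged as GLOBAL_LOAD/GLOBAL_STORE; otherwise the
// shared instruction class is used unchanged.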
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

    Result.SOffset = true;

    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      AMDGPU::OpName RsrcName =
          TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
      int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
      Result.NumVAddrs = RsrcIdx - VAddr0Idx;

    Result.SOffset = true;

  case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
  case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
  case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
  case AMDGPU::S_LOAD_DWORD_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM:
  case AMDGPU::S_LOAD_DWORDX3_IMM:
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX8_IMM:
  case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
  case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

  case AMDGPU::DS_READ_B32:
  case AMDGPU::DS_READ_B64:
  case AMDGPU::DS_READ_B32_gfx9:
  case AMDGPU::DS_READ_B64_gfx9:
  case AMDGPU::DS_WRITE_B32:
  case AMDGPU::DS_WRITE_B64:
  case AMDGPU::DS_WRITE_B32_gfx9:
  case AMDGPU::DS_WRITE_B64_gfx9:

  case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
  case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
  case AMDGPU::FLAT_LOAD_DWORD_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX2_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX3_SADDR:
  case AMDGPU::FLAT_LOAD_DWORDX4_SADDR:
  case AMDGPU::FLAT_STORE_DWORD_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX2_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX3_SADDR:
  case AMDGPU::FLAT_STORE_DWORDX4_SADDR:

  case AMDGPU::GLOBAL_LOAD_DWORD:
  case AMDGPU::GLOBAL_LOAD_DWORDX2:
  case AMDGPU::GLOBAL_LOAD_DWORDX3:
  case AMDGPU::GLOBAL_LOAD_DWORDX4:
  case AMDGPU::GLOBAL_STORE_DWORD:
  case AMDGPU::GLOBAL_STORE_DWORDX2:
  case AMDGPU::GLOBAL_STORE_DWORDX3:
  case AMDGPU::GLOBAL_STORE_DWORDX4:
  case AMDGPU::FLAT_LOAD_DWORD:
  case AMDGPU::FLAT_LOAD_DWORDX2:
  case AMDGPU::FLAT_LOAD_DWORDX3:
  case AMDGPU::FLAT_LOAD_DWORDX4:
  case AMDGPU::FLAT_STORE_DWORD:
  case AMDGPU::FLAT_STORE_DWORDX2:
  case AMDGPU::FLAT_STORE_DWORDX3:
  case AMDGPU::FLAT_STORE_DWORDX4:
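// CombineInfo::setMI(MI, LSO): caches per-instruction facts -- class, element
// size, width, immediate offset and the indices of all address operands -- so
// that the later mergeability checks never have to re-parse the MachineInstr.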
                                       const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  DataRC = LSO.getDataRegClass(*MI);

        (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
        (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
    Offset = I->getOperand(OffsetIdx).getImm();

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
    EltSize = Info->BitsPerComp / 8;

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {

  AddressRegs Regs = getRegs(Opc, *LSO.TII);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
    AddrIdx[NumAddresses++] =
        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
    AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
        Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);

                    "SI Load Store Optimizer", false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

  return new SILoadStoreOptimizerLegacy();

  for (const auto &Op : MI.operands()) {
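// canSwapInstructions(ARegDefs, ARegUses, A, B): returns false when moving B
// across A could change behaviour, i.e. when the two memory operations may
// alias (and one of them stores) or when B touches a register that A defines
// or uses.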
bool SILoadStoreOptimizer::canSwapInstructions(
    const DenseSet<Register> &ARegDefs, const DenseSet<Register> &ARegUses,
    const MachineInstr &A, const MachineInstr &B) const {
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))

SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  const MachineMemOperand *MMOb = *Paired.I->memoperands_begin();

  MachineFunction *MF = CI.I->getMF();

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  AMDGPU::OpName OperandsToMatch[] = {
      AMDGPU::OpName::cpol, AMDGPU::OpName::d16,  AMDGPU::OpName::unorm,
      AMDGPU::OpName::da,   AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (AMDGPU::OpName op : OperandsToMatch) {
    int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
    if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(), op) != Idx)
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)

                                                unsigned ComponentCount,
  if (ComponentCount > 4)
  return NewFormatInfo->Format;
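// offsetsCanBeCombined(CI, STI, Paired, Modify): verifies that the two
// offsets describe adjacent elements (or a stride-64 pair for DS) that the
// merged encoding can express; with Modify set it also rewrites the offsets
// into the offset0/offset1 form used by DS_READ2/DS_WRITE2, optionally
// factoring a common base into CI.BaseOff or switching to the ST64 variants.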
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
  assert(CI.InstClass != MIMG);

  if (CI.Offset == Paired.Offset)

  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    const llvm::AMDGPU::GcnBufferFormatInfo *Info0 =
    const llvm::AMDGPU::GcnBufferFormatInfo *Info1 =

    unsigned NumCombinedComponents = CI.Width + Paired.Width;
    if (NumCombinedComponents == 3 && CI.EltSize <= 2)
      NumCombinedComponents = 4;

    unsigned ElemIndex0 = CI.Offset / CI.EltSize;
    unsigned ElemIndex1 = Paired.Offset / Paired.EltSize;
    if (ElemIndex0 + CI.Width != ElemIndex1 &&
        ElemIndex1 + Paired.Width != ElemIndex0)

    unsigned MergedBytes = CI.EltSize * NumCombinedComponents;
    unsigned RequiredAlign = std::min(MergedBytes, 4u);
    unsigned MinOff = std::min(CI.Offset, Paired.Offset);
    if (MinOff % RequiredAlign != 0)

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)

    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      CI.Offset = EltOffset0 / 64;
      Paired.Offset = EltOffset1 / 64;

      CI.Offset = EltOffset0;
      Paired.Offset = EltOffset1;

  uint32_t Min = std::min(EltOffset0, EltOffset1);
  uint32_t Max = std::max(EltOffset0, EltOffset1);

  if (((Max - Min) & ~Mask) == 0) {
      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = (EltOffset0 - BaseOff) / 64;
      Paired.Offset = (EltOffset1 - BaseOff) / 64;

      CI.BaseOff = BaseOff * CI.EltSize;
      CI.Offset = EltOffset0 - BaseOff;
      Paired.Offset = EltOffset1 - BaseOff;

bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:
      return STM.hasScalarDwordx3Loads();

const TargetRegisterClass *
SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
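// checkAndPrepareMerge(CI, Paired): the final gate before a merge. It
// re-checks subclass, dmasks or offsets and widths, verifies that every
// instruction between the pair can be safely stepped over, and for DS_WRITE2
// constrains the data registers to the classes the write2 pseudo requires.
// It returns the CombineInfo at whose position the merged instruction should
// be inserted, or nullptr if the pair cannot be combined.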
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))
    if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  DenseSet<Register> RegDefs;
  DenseSet<Register> RegUses;
  if (CI.I->mayLoad()) {
      if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
      if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);

  if (CI.InstClass == DS_WRITE) {
    const MachineOperand *Data0 =
        TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
    const MachineOperand *Data1 =
        TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

    const MCInstrDesc &Write2Opc = TII->get(getWrite2Opcode(CI));
    int Data0Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
                                              AMDGPU::OpName::data0);
    int Data1Idx = AMDGPU::getNamedOperandIdx(Write2Opc.getOpcode(),
                                              AMDGPU::OpName::data1);

    const TargetRegisterClass *DataRC0 = TII->getRegClass(Write2Opc, Data0Idx);
    const TargetRegisterClass *DataRC1 = TII->getRegClass(Write2Opc, Data1Idx);

      DataRC0 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data0->getReg()),
      DataRC1 = TRI->getMatchingSuperRegClass(MRI->getRegClass(Data1->getReg()),

    if (!MRI->constrainRegClass(Data0->getReg(), DataRC0) ||
        !MRI->constrainRegClass(Data1->getReg(), DataRC1))
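// copyToDestRegs / copyFromSrcRegs: after a merge the two original values
// occupy sub-registers of one wide register. copyToDestRegs emits COPYs from
// the merged destination into the old dest operands; copyFromSrcRegs builds a
// REG_SEQUENCE that packs the two old source operands into the wide register
// a merged store consumes. getSubRegIdxs() supplies the sub-register indices.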
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
    AMDGPU::OpName OpName, Register DestReg) const {
  MachineBasicBlock *MBB = CI.I->getParent();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .addReg(DestReg, {}, SubRegIdx0);
      .addReg(DestReg, RegState::Kill, SubRegIdx1);

SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
                                      AMDGPU::OpName OpName) const {
  MachineBasicBlock *MBB = CI.I->getParent();

  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

  BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
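// mergeRead2Pair: replaces two DS_READ_B32/B64 from the same base with one
// DS_READ2 (or DS_READ2ST64 for stride-64 pairs). If the chosen offsets need
// a base adjustment, a new base register is materialized with an add, and the
// original destinations are reconnected through copyToDestRegs.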
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Read2Desc = TII->get(Opc);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  unsigned BaseSubReg = AddrReg->getSubReg();
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), {}, BaseSubReg)

  MachineInstrBuilder Read2 =
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;

unsigned SILoadStoreOptimizer::getWrite2Opcode(const CombineInfo &CI) const {
  return CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const MachineOperand *AddrReg =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  const MachineOperand *Data0 =
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  const MachineOperand *Data1 =
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
  unsigned Opc = getWrite2Opcode(CI);

  if (NewOffset0 > NewOffset1) {
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  const MCInstrDesc &Write2Desc = TII->get(Opc);

  unsigned BaseSubReg = AddrReg->getSubReg();
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BaseRegFlags = RegState::Kill;

    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), {}, BaseSubReg)

  MachineInstrBuilder Write2 =
          .addReg(BaseReg, BaseRegFlags, BaseSubReg)

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

  LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;
      AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstrBuilder New =
          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
          .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned NumCombinedComponents = CI.Width + Paired.Width;
  if (NumCombinedComponents == 3 && CI.EltSize <= 2)
    NumCombinedComponents = 4;
  unsigned JoinedFormat =

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addImm(JoinedFormat)
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

  const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
  Register DestReg = MRI->createVirtualRegister(SuperRC);

  auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);

      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addReg(SrcReg, RegState::Kill);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
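// getNewOpcode(CI, Paired): selects the wide opcode for the combined width.
// For the scalar load classes the "_ec" (early-clobber) variants are chosen
// when needsConstrainedOpcode() reports that the merged access is not known
// to be sufficiently aligned (with XNACK replay in mind), so the wide result
// register is not allowed to overlap the address operands.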
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
  case S_BUFFER_LOAD_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
  case S_BUFFER_LOAD_SGPR_IMM: {
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
                                 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
    bool NeedsConstrainedOpc =
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX2_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX3_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX4_IMM;
      return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
                                 : AMDGPU::S_LOAD_DWORDX8_IMM;

      return AMDGPU::GLOBAL_LOAD_DWORDX2;
      return AMDGPU::GLOBAL_LOAD_DWORDX3;
      return AMDGPU::GLOBAL_LOAD_DWORDX4;
  case GLOBAL_LOAD_SADDR:
      return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX2;
      return AMDGPU::GLOBAL_STORE_DWORDX3;
      return AMDGPU::GLOBAL_STORE_DWORDX4;
  case GLOBAL_STORE_SADDR:
      return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
      return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX2;
      return AMDGPU::FLAT_LOAD_DWORDX3;
      return AMDGPU::FLAT_LOAD_DWORDX4;
      return AMDGPU::FLAT_STORE_DWORDX2;
      return AMDGPU::FLAT_STORE_DWORDX3;
      return AMDGPU::FLAT_STORE_DWORDX4;
  case FLAT_LOAD_SADDR:
      return AMDGPU::FLAT_LOAD_DWORDX2_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX3_SADDR;
      return AMDGPU::FLAT_LOAD_DWORDX4_SADDR;
  case FLAT_STORE_SADDR:
      return AMDGPU::FLAT_STORE_DWORDX2_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX3_SADDR;
      return AMDGPU::FLAT_STORE_DWORDX4_SADDR;
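// getSubRegIdxs(CI, Paired): sub-register indices at which the two original
// values live inside the merged register; Idxs is indexed first by the
// starting dword of each half and then by its width minus one.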
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
                                         CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];

  return {Idx0, Idx1};

const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))

    CombineInfo &CI, CombineInfo &Paired,
  MachineBasicBlock *MBB = CI.I->getParent();

  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, DL, AMDGPU::OpName::vdata);
          .addReg(SrcReg, RegState::Kill);

  AddressRegs Regs = getRegs(Opcode, *TII);
    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  APInt V(32, Val, true);
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
              TII->get(AMDGPU::S_MOV_B32), Reg)
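// computeBase(MI, Addr): materializes Addr.Base + Addr.Offset into a fresh
// 64-bit register, either with a single V_ADD_U64_e64 (the UseV64Pattern
// case) or with a V_ADD_CO_U32 / V_ADDC_U32 pair glued together by a
// REG_SEQUENCE, and returns the new base register for the anchored access.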
Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
                                           const MemAddress &Addr) const {
  MachineBasicBlock *MBB = MI.getParent();

  if (Addr.Base.UseV64Pattern) {
    Register FullDestReg = MRI->createVirtualRegister(
        TII->getRegClass(TII->get(AMDGPU::V_ADD_U64_e64), 0));

    Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstr *MovOffset =

    MachineInstr *Add64 =
            .addReg(OffsetReg, RegState::Kill)

  assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
          Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

  assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
          Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

  MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  MachineOperand OffsetHi =
      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineInstr *LoHalf =
          .addReg(CarryReg, RegState::Define)
          .addReg(Addr.Base.LoReg, {}, Addr.Base.LoSubReg)

  MachineInstr *HiHalf =
          .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
          .addReg(Addr.Base.HiReg, {}, Addr.Base.HiSubReg)
          .addReg(CarryReg, RegState::Kill)

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
  MachineInstr *FullBase =

             dbgs() << "  " << *HiHalf << "\n";
             dbgs() << "  " << *FullBase << "\n\n";);

void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
                                               Register NewBase,
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
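// processBaseWithConstOffset / processBaseWithConstOffset64: pattern-match the
// address computation feeding vaddr (either a V_ADD_U64_e64 or a REG_SEQUENCE
// of V_ADD_CO_U32 / V_ADDC_U32 halves) and split it into a base register pair
// plus a 64-bit constant offset recorded in the MemAddress.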
bool SILoadStoreOptimizer::processBaseWithConstOffset64(
    MachineInstr *AddDef, const MachineOperand &Base, MemAddress &Addr) const {
  MachineOperand *Src0 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(*AddDef, AMDGPU::OpName::src1);

  const MachineOperand *BaseOp = nullptr;

  auto Offset = TII->getImmOrMaterializedImm(*Src1);

  Addr.Base.LoReg = BaseOp->getReg();
  Addr.Base.UseV64Pattern = true;

void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
                                                      MemAddress &Addr) const {
  MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());

  if (Def->getOpcode() == AMDGPU::V_ADD_U64_e64) {
    if (processBaseWithConstOffset64(Def, Base, Addr))

  if (Def->getOpcode() != AMDGPU::REG_SEQUENCE || Def->getNumOperands() != 5)

  MachineOperand BaseLo = Def->getOperand(1);
  MachineOperand BaseHi = Def->getOperand(3);

  MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  MachineOperand *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  MachineOperand *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = TII->getImmOrMaterializedImm(*Src0);
    if (!(Offset0P = TII->getImmOrMaterializedImm(*Src1)))

  if (!BaseLo.isReg())

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  uint64_t Offset1 = Src1->getImm();

  if (!BaseHi.isReg())

  Addr.Base.LoReg = BaseLo.getReg();
  Addr.Base.HiReg = BaseHi.getReg();
  Addr.Base.LoSubReg = BaseLo.getSubReg();
  Addr.Base.HiSubReg = BaseHi.getSubReg();
  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);

void SILoadStoreOptimizer::updateAsyncLDSAddress(MachineInstr &MI,
                                                 int32_t OffsetDiff) const {
  if (!TII->usesASYNC_CNT(MI) || OffsetDiff == 0)

  Register OldVDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
  Register NewVDst = MRI->createVirtualRegister(MRI->getRegClass(OldVDst));
  MachineBasicBlock &MBB = *MI.getParent();

  MI.getOperand(0).setReg(NewVDst);
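// promoteConstantOffsetToImm(MI, Visited, AnchorList): when several global
// accesses share a base of the form (reg64 + large constant), the access
// farthest away is chosen as an anchor and keeps the full base computation;
// the others are rewritten to address relative to the anchor through their
// signed immediate offset field, provided the target reports the resulting
// addressing mode as legal, which lets later passes delete the redundant adds.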
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
    MachineInstr &MI, MemInfoMap &Visited,
    SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

  MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  auto [It, Inserted] = Visited.try_emplace(&MI);
    processBaseWithConstOffset(Base, MAddr);

  if (MAddr.Offset == 0) {
    LLVM_DEBUG(dbgs() << "  Failed to extract constant-offset or there are no"
                         " constant offsets that can be promoted.\n";);

                    << "} Offset: " << MAddr.Offset << "\n\n";);

  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

  MachineBasicBlock *MBB = MI.getParent();
    MachineInstr &MINext = *MBBI;
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    auto [It, Inserted] = Visited.try_emplace(&MINext);
      processBaseWithConstOffset(BaseNext, MAddrNext);
      It->second = MAddrNext;
      MAddrNext = It->second;

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

    InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;

    LLVM_DEBUG(dbgs() << "  Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
                      << AnchorAddr.Offset << "\n\n");

    int32_t OffsetDiff = MAddr.Offset - AnchorAddr.Offset;
    updateBaseAndOffset(MI, Base, OffsetDiff);
    updateAsyncLDSAddress(MI, OffsetDiff);

    for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

        int32_t OtherOffsetDiff = OtherOffset - AnchorAddr.Offset;
        updateBaseAndOffset(*OtherMI, Base, OtherOffsetDiff);
        updateAsyncLDSAddress(*OtherMI, OtherOffsetDiff);
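// collectMergeableInsts: scans a straight-line run of instructions, first
// letting promoteConstantOffsetToImm rewrite addresses, then grouping the
// remaining candidates into per-base-address lists via addInstToMergeableList;
// lists with fewer than two entries are dropped and the rest are sorted by
// offset so adjacent elements are natural merge candidates.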
void SILoadStoreOptimizer::addInstToMergeableList(
    const CombineInfo &CI,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().InstClass == CI.InstClass &&
        AddrList.front().hasSameBaseAddress(CI)) {
      AddrList.emplace_back(CI);

  MergeableInsts.emplace_back(1, CI);

std::pair<MachineBasicBlock::iterator, bool>
SILoadStoreOptimizer::collectMergeableInsts(
    MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
    std::list<std::list<CombineInfo>> &MergeableInsts) const {
  for (; BlockI != End; ++BlockI) {
    MachineInstr &MI = *BlockI;

    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))

    if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)

        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
    if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())

    if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE) {
      const MachineOperand *Fmt =
          TII->getNamedOperand(MI, AMDGPU::OpName::format);

    CI.setMI(MI, *this);

    if (!CI.hasMergeableAddress(*MRI))

  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;
    if (MergeList.size() <= 1) {
      I = MergeableInsts.erase(I);

              [](const CombineInfo &A, const CombineInfo &B) {
                return A.Offset < B.Offset;
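// optimizeBlock / optimizeInstsWithSameBaseAddr: walk each per-base list and
// merge the first legal pair found; a successful merge can enable a further
// merge of the new, wider instruction, so OptimizeListAgain/OptimizeAgain
// drive repeated passes until nothing more combines.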
bool SILoadStoreOptimizer::optimizeBlock(
    std::list<std::list<CombineInfo>> &MergeableInsts) {
  for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
                                                   E = MergeableInsts.end();
       I != E;) {
    std::list<CombineInfo> &MergeList = *I;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      I = MergeableInsts.erase(I);

    if (!OptimizeListAgain) {
      I = MergeableInsts.erase(I);
    OptimizeAgain = true;

SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  if (MergeList.empty())

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    if ((*First).Order > (*Second).Order)
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << "   with: " << *Paired.I);

    switch (CI.InstClass) {
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_LOAD_SADDR:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    case FLAT_STORE_SADDR:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;

    MergeList.erase(Second);
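// Entry points: the legacy wrapper fetches AliasAnalysis and forwards to
// SILoadStoreOptimizer::run(), which walks every basic block, collecting
// mergeable instructions section by section and re-optimizing until no more
// pairs combine; the new pass manager path constructs the optimizer with the
// same AliasAnalysis result.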
bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())

bool SILoadStoreOptimizer::run(MachineFunction &MF) {
  for (MachineBasicBlock &MBB : MF) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      std::tie(SectionEnd, CollectModified) =

        OptimizeAgain = false;
      } while (OptimizeAgain);

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void dump() const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
const MachinePointerInfo & getPointerInfo() const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Wrapper class representing virtual and physical registers.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
static bool isFLATScratch(const MachineInstr &MI)
static bool isVIMAGE(const MachineInstr &MI)
static bool isFLATGlobal(const MachineInstr &MI)
static bool isVSAMPLE(const MachineInstr &MI)
static bool isFLAT(const MachineInstr &MI)
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, AMDGPU::OpName OperandName) const
Returns the operand named Op.
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
StringRef - Represent a constant reference to a string, i.e.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)
Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.
bool getMTBUFHasSrsrc(unsigned Opc)
int getMTBUFElements(unsigned Opc)
bool getMTBUFHasSoffset(unsigned Opc)
int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)
int getMUBUFBaseOpcode(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
int getMTBUFBaseOpcode(unsigned Opc)
bool getMUBUFHasVAddr(unsigned Opc)
int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)
bool getMUBUFHasSoffset(unsigned Opc)
const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
bool getMTBUFHasVAddr(unsigned Opc)
int getMUBUFElements(unsigned Opc)
const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)
bool getMUBUFHasSrsrc(unsigned Opc)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Add64
64 bits label addition
NodeAddr< DefNode * > Def
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
This is an optimization pass for GlobalISel generic memory operations.
bool operator<(int64_t V1, const APSInt &V2)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
RegState
Flags to represent properties of register accesses.
constexpr T maskLeadingOnes(unsigned N)
Create a bitmask with the N left-most bits set to 1, and all other bits set to 0.
FunctionPass * createSILoadStoreOptimizerLegacyPass()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
char & SILoadStoreOptimizerLegacyID
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
std::vector< std::pair< LineLocation, FunctionId > > AnchorList
constexpr unsigned BitWidth
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.