69 #define DEBUG_TYPE "si-load-store-opt"
77 S_BUFFER_LOAD_SGPR_IMM,
94 unsigned char NumVAddrs = 0;
105 const unsigned MaxAddressRegs = 12 + 1 + 1;
116 InstClassEnum InstClass;
120 int AddrIdx[MaxAddressRegs];
122 unsigned NumAddresses;
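// hasSameBaseAddress compares this entry's address operands against CI's:
// for every tracked address operand the two instructions must agree in kind
// (register vs. immediate) and in value before they can be paired.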
125 bool hasSameBaseAddress(const CombineInfo &CI) {
126 if (NumAddresses != CI.NumAddresses)
130 for (unsigned i = 0; i < NumAddresses; i++) {
133 if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
134 if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
152 for (unsigned i = 0; i < NumAddresses; ++i) {
161 if (!AddrOp->isReg())
167 AddrOp->getReg() != AMDGPU::SGPR_NULL)
172 if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
182 return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;
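// Ordering of CombineInfo entries: MIMG instructions compare by dmask, all
// other classes compare by byte offset.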
186 struct BaseRegisters {
190 unsigned LoSubReg = 0;
191 unsigned HiSubReg = 0;
212 static bool dmasksCanBeCombined(const CombineInfo &CI,
214 const CombineInfo &Paired);
215 static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
216 CombineInfo &Paired, bool Modify = false);
217 static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
218 const CombineInfo &Paired);
219 unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
220 static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
221 const CombineInfo &Paired);
223 getTargetRegisterClass(const CombineInfo &CI,
224 const CombineInfo &Paired) const;
227 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
229 void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
232 Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
236 unsigned read2Opcode(unsigned EltSize) const;
237 unsigned read2ST64Opcode(unsigned EltSize) const;
239 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
242 unsigned write2Opcode(unsigned EltSize) const;
243 unsigned write2ST64Opcode(unsigned EltSize) const;
245 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
248 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
251 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
254 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
257 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
260 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
263 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
266 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
269 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
273 int32_t NewOffset) const;
285 std::list<std::list<CombineInfo> > &MergeableInsts) const;
290 std::list<std::list<CombineInfo>> &MergeableInsts) const;
293 const CombineInfo &Paired);
295 static InstClassEnum getCommonInstClass(const CombineInfo &CI,
296 const CombineInfo &Paired);
305 bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
306 bool &OptimizeListAgain);
307 bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
311 StringRef getPassName() const override { return "SI Load Store Optimizer"; }
322 .set(MachineFunctionProperties::Property::IsSSA);
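// The opcode switch below groups instructions by access width: the DWORD,
// DWORDX2, DWORDX3, DWORDX4 and DWORDX8 variants of the SMEM, buffer, global
// and flat opcodes, plus the 32-bit and 64-bit DS forms. The resulting width
// in dwords is what widthsFit and getNewOpcode later operate on.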
327 const unsigned Opc = MI.getOpcode();
329 if (TII.isMUBUF(Opc)) {
333 if (TII.isImage(MI)) {
335 TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
338 if (TII.isMTBUF(Opc)) {
343 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
344 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
345 case AMDGPU::S_LOAD_DWORD_IMM:
346 case AMDGPU::GLOBAL_LOAD_DWORD:
347 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
348 case AMDGPU::GLOBAL_STORE_DWORD:
349 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
350 case AMDGPU::FLAT_LOAD_DWORD:
351 case AMDGPU::FLAT_STORE_DWORD:
353 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
354 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
355 case AMDGPU::S_LOAD_DWORDX2_IMM:
356 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
357 case AMDGPU::GLOBAL_LOAD_DWORDX2:
358 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
359 case AMDGPU::GLOBAL_STORE_DWORDX2:
360 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
361 case AMDGPU::FLAT_LOAD_DWORDX2:
362 case AMDGPU::FLAT_STORE_DWORDX2:
364 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
365 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
366 case AMDGPU::S_LOAD_DWORDX3_IMM:
367 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
368 case AMDGPU::GLOBAL_LOAD_DWORDX3:
369 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
370 case AMDGPU::GLOBAL_STORE_DWORDX3:
371 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
372 case AMDGPU::FLAT_LOAD_DWORDX3:
373 case AMDGPU::FLAT_STORE_DWORDX3:
375 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
376 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
377 case AMDGPU::S_LOAD_DWORDX4_IMM:
378 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
379 case AMDGPU::GLOBAL_LOAD_DWORDX4:
380 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
381 case AMDGPU::GLOBAL_STORE_DWORDX4:
382 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
383 case AMDGPU::FLAT_LOAD_DWORDX4:
384 case AMDGPU::FLAT_STORE_DWORDX4:
386 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
387 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
388 case AMDGPU::S_LOAD_DWORDX8_IMM:
389 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
391 case AMDGPU::DS_READ_B32:
392 case AMDGPU::DS_READ_B32_gfx9:
393 case AMDGPU::DS_WRITE_B32:
394 case AMDGPU::DS_WRITE_B32_gfx9:
396 case AMDGPU::DS_READ_B64:
397 case AMDGPU::DS_READ_B64_gfx9:
398 case AMDGPU::DS_WRITE_B64:
399 case AMDGPU::DS_WRITE_B64_gfx9:
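// getInstClass buckets an opcode into the InstClassEnum used for pairing
// (DS read/write, S_BUFFER/S_LOAD, MUBUF, MTBUF, MIMG, and flat/global
// load/store with or without SADDR); only instructions in the same class are
// considered for merging.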
407 static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
410 if (TII.isMUBUF(Opc)) {
414 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
415 case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
416 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
417 case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
418 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
419 case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
420 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
421 case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
422 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
423 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
424 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
425 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
426 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
427 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
428 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
429 case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
431 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
432 case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
433 case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
434 case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
435 case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
436 case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
437 case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
438 case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
439 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
440 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
441 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
442 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
443 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
444 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
445 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
446 case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
450 if (TII.isImage(Opc)) {
459 if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
464 if (TII.isMTBUF(Opc)) {
468 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
469 case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
470 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
471 case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
472 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
473 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
474 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
475 case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
476 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
477 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
478 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
479 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
480 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
481 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
482 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
483 case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
485 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
486 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
487 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
488 case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
489 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
490 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
491 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
492 case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
493 return TBUFFER_STORE;
497 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
498 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
499 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
500 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
501 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
502 return S_BUFFER_LOAD_IMM;
503 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
504 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
505 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
506 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
507 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
508 return S_BUFFER_LOAD_SGPR_IMM;
509 case AMDGPU::S_LOAD_DWORD_IMM:
510 case AMDGPU::S_LOAD_DWORDX2_IMM:
511 case AMDGPU::S_LOAD_DWORDX3_IMM:
512 case AMDGPU::S_LOAD_DWORDX4_IMM:
513 case AMDGPU::S_LOAD_DWORDX8_IMM:
514 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
515 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
516 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
517 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
519 case AMDGPU::DS_READ_B32:
520 case AMDGPU::DS_READ_B32_gfx9:
521 case AMDGPU::DS_READ_B64:
522 case AMDGPU::DS_READ_B64_gfx9:
524 case AMDGPU::DS_WRITE_B32:
525 case AMDGPU::DS_WRITE_B32_gfx9:
526 case AMDGPU::DS_WRITE_B64:
527 case AMDGPU::DS_WRITE_B64_gfx9:
529 case AMDGPU::GLOBAL_LOAD_DWORD:
530 case AMDGPU::GLOBAL_LOAD_DWORDX2:
531 case AMDGPU::GLOBAL_LOAD_DWORDX3:
532 case AMDGPU::GLOBAL_LOAD_DWORDX4:
533 case AMDGPU::FLAT_LOAD_DWORD:
534 case AMDGPU::FLAT_LOAD_DWORDX2:
535 case AMDGPU::FLAT_LOAD_DWORDX3:
536 case AMDGPU::FLAT_LOAD_DWORDX4:
538 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
539 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
540 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
541 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
542 return GLOBAL_LOAD_SADDR;
543 case AMDGPU::GLOBAL_STORE_DWORD:
544 case AMDGPU::GLOBAL_STORE_DWORDX2:
545 case AMDGPU::GLOBAL_STORE_DWORDX3:
546 case AMDGPU::GLOBAL_STORE_DWORDX4:
547 case AMDGPU::FLAT_STORE_DWORD:
548 case AMDGPU::FLAT_STORE_DWORDX2:
549 case AMDGPU::FLAT_STORE_DWORDX3:
550 case AMDGPU::FLAT_STORE_DWORDX4:
552 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
553 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
554 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
555 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
556 return GLOBAL_STORE_SADDR;
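// getInstSubclass collapses every width variant of an operation onto a single
// representative opcode (e.g. all S_LOAD_DWORDX*_IMM forms map to
// S_LOAD_DWORD_IMM), so checkAndPrepareMerge can require two candidates to be
// the same instruction apart from width before pairing them.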
563 static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
566 if (TII.isMUBUF(Opc))
568 if (TII.isImage(Opc)) {
571 return Info->BaseOpcode;
573 if (TII.isMTBUF(Opc))
576 case AMDGPU::DS_READ_B32:
577 case AMDGPU::DS_READ_B32_gfx9:
578 case AMDGPU::DS_READ_B64:
579 case AMDGPU::DS_READ_B64_gfx9:
580 case AMDGPU::DS_WRITE_B32:
581 case AMDGPU::DS_WRITE_B32_gfx9:
582 case AMDGPU::DS_WRITE_B64:
583 case AMDGPU::DS_WRITE_B64_gfx9:
585 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
586 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
587 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
588 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
589 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
590 return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
591 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
592 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
593 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
594 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
595 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
596 return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
597 case AMDGPU::S_LOAD_DWORD_IMM:
598 case AMDGPU::S_LOAD_DWORDX2_IMM:
599 case AMDGPU::S_LOAD_DWORDX3_IMM:
600 case AMDGPU::S_LOAD_DWORDX4_IMM:
601 case AMDGPU::S_LOAD_DWORDX8_IMM:
602 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
603 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
604 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
605 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
606 return AMDGPU::S_LOAD_DWORD_IMM;
607 case AMDGPU::GLOBAL_LOAD_DWORD:
608 case AMDGPU::GLOBAL_LOAD_DWORDX2:
609 case AMDGPU::GLOBAL_LOAD_DWORDX3:
610 case AMDGPU::GLOBAL_LOAD_DWORDX4:
611 case AMDGPU::FLAT_LOAD_DWORD:
612 case AMDGPU::FLAT_LOAD_DWORDX2:
613 case AMDGPU::FLAT_LOAD_DWORDX3:
614 case AMDGPU::FLAT_LOAD_DWORDX4:
615 return AMDGPU::FLAT_LOAD_DWORD;
616 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
617 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
618 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
619 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
620 return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
621 case AMDGPU::GLOBAL_STORE_DWORD:
622 case AMDGPU::GLOBAL_STORE_DWORDX2:
623 case AMDGPU::GLOBAL_STORE_DWORDX3:
624 case AMDGPU::GLOBAL_STORE_DWORDX4:
625 case AMDGPU::FLAT_STORE_DWORD:
626 case AMDGPU::FLAT_STORE_DWORDX2:
627 case AMDGPU::FLAT_STORE_DWORDX3:
628 case AMDGPU::FLAT_STORE_DWORDX4:
629 return AMDGPU::FLAT_STORE_DWORD;
630 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
631 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
632 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
633 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
634 return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
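// getCommonInstClass can refine the class of a matched pair: two FLAT
// loads/stores that are known to access global memory are reclassified as
// GLOBAL_LOAD/GLOBAL_STORE so the pair can be merged into global instructions.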
645 SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
646 const CombineInfo &Paired) {
647 assert(CI.InstClass == Paired.InstClass);
649 if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
651 return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
659 if (TII.isMUBUF(Opc)) {
665 Result.SOffset = true;
670 if (TII.isImage(Opc)) {
672 if (VAddr0Idx >= 0) {
674 TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
676 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
687 if (TII.isMTBUF(Opc)) {
693 Result.SOffset = true;
701 case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
702 case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
703 case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
704 case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
705 case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
706 Result.SOffset = true;
708 case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
709 case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
710 case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
711 case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
712 case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
713 case AMDGPU::S_LOAD_DWORD_IMM:
714 case AMDGPU::S_LOAD_DWORDX2_IMM:
715 case AMDGPU::S_LOAD_DWORDX3_IMM:
716 case AMDGPU::S_LOAD_DWORDX4_IMM:
717 case AMDGPU::S_LOAD_DWORDX8_IMM:
718 case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
719 case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
720 case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
721 case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
724 case AMDGPU::DS_READ_B32:
725 case AMDGPU::DS_READ_B64:
726 case AMDGPU::DS_READ_B32_gfx9:
727 case AMDGPU::DS_READ_B64_gfx9:
728 case AMDGPU::DS_WRITE_B32:
729 case AMDGPU::DS_WRITE_B64:
730 case AMDGPU::DS_WRITE_B32_gfx9:
731 case AMDGPU::DS_WRITE_B64_gfx9:
734 case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
735 case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
736 case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
737 case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
738 case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
739 case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
740 case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
741 case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
744 case AMDGPU::GLOBAL_LOAD_DWORD:
745 case AMDGPU::GLOBAL_LOAD_DWORDX2:
746 case AMDGPU::GLOBAL_LOAD_DWORDX3:
747 case AMDGPU::GLOBAL_LOAD_DWORDX4:
748 case AMDGPU::GLOBAL_STORE_DWORD:
749 case AMDGPU::GLOBAL_STORE_DWORDX2:
750 case AMDGPU::GLOBAL_STORE_DWORDX3:
751 case AMDGPU::GLOBAL_STORE_DWORDX4:
752 case AMDGPU::FLAT_LOAD_DWORD:
753 case AMDGPU::FLAT_LOAD_DWORDX2:
754 case AMDGPU::FLAT_LOAD_DWORDX3:
755 case AMDGPU::FLAT_LOAD_DWORDX4:
756 case AMDGPU::FLAT_STORE_DWORD:
757 case AMDGPU::FLAT_STORE_DWORDX2:
758 case AMDGPU::FLAT_STORE_DWORDX3:
759 case AMDGPU::FLAT_STORE_DWORDX4:
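// CombineInfo::setMI records everything needed to compare and merge two
// instructions: the instruction class, element size, dmask/format/cpol where
// applicable, the access width in dwords, and the operand indices of every
// address component returned by getRegs.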
766 const SILoadStoreOptimizer &LSO) {
768 unsigned Opc = MI->getOpcode();
769 InstClass = getInstClass(Opc, *LSO.TII);
771 if (InstClass == UNKNOWN)
774 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
779 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
784 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
787 case S_BUFFER_LOAD_IMM:
788 case S_BUFFER_LOAD_SGPR_IMM:
797 if (InstClass == MIMG) {
798 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
803 Offset = I->getOperand(OffsetIdx).getImm();
806 if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
807 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
809 Width = getOpcodeWidth(*I, *LSO.TII);
811 if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
813 } else if (InstClass != MIMG) {
814 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
817 AddressRegs Regs = getRegs(Opc, *LSO.TII);
818 bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
821 for (unsigned J = 0; J < Regs.NumVAddrs; J++)
822 AddrIdx[NumAddresses++] =
825 AddrIdx[NumAddresses++] =
828 AddrIdx[NumAddresses++] =
832 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
834 AddrIdx[NumAddresses++] =
837 AddrIdx[NumAddresses++] =
840 AddrIdx[NumAddresses++] =
844 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
845 assert(NumAddresses <= MaxAddressRegs);
847 for (unsigned J = 0; J < NumAddresses; J++)
848 AddrReg[J] = &I->getOperand(AddrIdx[J]);
854 "SI Load Store Optimizer", false, false)
859 char SILoadStoreOptimizer::ID = 0;
864 return new SILoadStoreOptimizer();
870 for (const auto &Op : MI.operands()) {
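// canSwapInstructions: instruction B may be moved across A only if the two
// cannot alias (whenever at least one of them stores) and B neither reads nor
// defines a register that A defines, nor defines a register that A uses.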
880 bool SILoadStoreOptimizer::canSwapInstructions(
883 if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
884 (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
886 for (const auto &BOp : B.operands()) {
889 if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
891 if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
900 SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
901 const CombineInfo &Paired) {
921 bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
923 const CombineInfo &Paired) {
924 assert(CI.InstClass == MIMG);
927 const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
928 const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
930 if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
934 unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
935 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
936 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
938 for (auto op : OperandsToMatch) {
943 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
948 unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
949 unsigned MinMask = std::min(CI.DMask, Paired.DMask);
955 if ((1u << AllowedBitsForMin) <= MinMask)
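// Two image dmasks are only combinable when they do not overlap: the smaller
// mask must fit entirely within the AllowedBitsForMin low bits left free by
// the larger one, otherwise the pair is rejected.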
962 unsigned ComponentCount,
964 if (ComponentCount > 4)
983 return NewFormatInfo->Format;
996 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1000 assert(CI.InstClass != MIMG);
1004 if (CI.Offset == Paired.Offset)
1008 if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1011 if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1035 uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1036 uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1041 if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1042 if (EltOffset0 + CI.Width != EltOffset1 &&
1043 EltOffset1 + Paired.Width != EltOffset0)
1045 if (CI.CPol != Paired.CPol)
1047 if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1048 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1054 if (CI.Width != Paired.Width &&
1055 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1063 if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1064 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1066 CI.Offset = EltOffset0 / 64;
1067 Paired.Offset = EltOffset1 / 64;
1074 if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1076 CI.Offset = EltOffset0;
1077 Paired.Offset = EltOffset1;
1083 uint32_t Min = std::min(EltOffset0, EltOffset1);
1086 const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
1087 if (((Max - Min) & ~Mask) == 0) {
1095 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1096 CI.BaseOff = BaseOff * CI.EltSize;
1097 CI.Offset = (EltOffset0 - BaseOff) / 64;
1098 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1104 if (isUInt<8>(Max - Min)) {
1110 CI.BaseOff = BaseOff * CI.EltSize;
1111 CI.Offset = EltOffset0 - BaseOff;
1112 Paired.Offset = EltOffset1 - BaseOff;
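// Summary of the DS handling above: both offsets must be multiples of
// EltSize. The pair is first tried as a plain read2/write2 (element offsets
// must fit in 8 bits), then as the ST64 form (both element offsets divisible
// by 64, with the quotients fitting in 8 bits), and finally with a common
// BaseOff subtracted so that the remaining deltas fit one of those encodings.
// For example, with EltSize = 4 and byte offsets 0 and 256, the element
// offsets are 0 and 64, both multiples of 64, so the ST64 form is used with
// encoded offsets 0 and 1.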
1120 bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
1121 const CombineInfo &CI,
1122 const CombineInfo &Paired) {
1123 const unsigned Width = (CI.Width + Paired.Width);
1124 switch (CI.InstClass) {
1127 case S_BUFFER_LOAD_IMM:
1128 case S_BUFFER_LOAD_SGPR_IMM:
1144 SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
1145 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1146 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1148 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1149 return TRI->getRegClassForReg(*MRI, Src->getReg());
1151 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1152 return TRI->getRegClassForReg(*MRI, Src->getReg());
1154 if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1155 return TRI->getRegClassForReg(*MRI, Dst->getReg());
1157 if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1158 return TRI->getRegClassForReg(*MRI, Src->getReg());
1165 SILoadStoreOptimizer::CombineInfo *
1166 SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1167 CombineInfo &Paired) {
1170 if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1172 assert(CI.InstClass == Paired.InstClass);
1174 if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1175 getInstSubclass(Paired.I->getOpcode(), *TII))
1180 if (CI.InstClass == MIMG) {
1181 if (!dmasksCanBeCombined(CI, *TII, Paired))
1184 if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1191 if (CI.I->mayLoad()) {
1195 if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1203 if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1213 if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1214 offsetsCanBeCombined(CI, *STM, Paired, true);
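// checkAndPrepareMerge gates the actual merge: the two instructions must
// share an instruction subclass, their dmasks (MIMG) or widths and offsets
// must be combinable, and every instruction between them must be safe to move
// across (canSwapInstructions). For DS reads and writes offsetsCanBeCombined
// is re-run with Modify = true so the chosen BaseOff/ST64 encoding is written
// back into CI and Paired before the merge helpers run.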
1220 void SILoadStoreOptimizer::copyToDestRegs(
1221 CombineInfo &CI, CombineInfo &Paired,
1227 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1231 auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
1232 auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);
1237 Dest0->setIsEarlyClobber(false);
1238 Dest1->setIsEarlyClobber(false);
1242 .addReg(DestReg, 0, SubRegIdx0);
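// copyToDestRegs rewires the users of the two original loads: each original
// destination register becomes a copy from the matching sub-register
// (SubRegIdx0 / SubRegIdx1) of the wide DestReg produced by the merged
// instruction.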
1251 SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1257 auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1261 Register SrcReg = MRI->createVirtualRegister(SuperRC);
1263 const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
1264 const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);
1266 BuildMI(*MBB, InsertBefore, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1275 unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
1277 return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1278 return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1281 unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
1283 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1285 return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1286 : AMDGPU::DS_READ2ST64_B64_gfx9;
1290 SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1296 const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1298 unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1299 unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1301 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1303 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1304 (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
1309 Register DestReg = MRI->createVirtualRegister(SuperRC);
1313 Register BaseReg = AddrReg->getReg();
1314 unsigned BaseSubReg = AddrReg->getSubReg();
1315 unsigned BaseRegFlags = 0;
1317 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1321 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1324 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1326 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1333 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
1339 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1341 CI.I->eraseFromParent();
1342 Paired.I->eraseFromParent();
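// The merge*Pair helpers all follow the same pattern: pick the merged opcode
// and super register class, build the wide instruction at InsertBefore with a
// combined memory operand, hook up the original destinations (loads) or
// gather the original sources into one source register (stores), and erase
// the two original instructions.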
1348 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
1350 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1351 return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1352 : AMDGPU::DS_WRITE2_B64_gfx9;
1355 unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
1357 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1358 : AMDGPU::DS_WRITE2ST64_B64;
1360 return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1361 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1365 CombineInfo &CI, CombineInfo &Paired,
1372 TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1374 TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1376 TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1378 unsigned NewOffset0 = CI.Offset;
1379 unsigned NewOffset1 = Paired.Offset;
1381 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1383 if (NewOffset0 > NewOffset1) {
1389 assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1390 (NewOffset0 != NewOffset1) &&
"Computed offset doesn't fit");
1396 unsigned BaseSubReg = AddrReg->getSubReg();
1397 unsigned BaseRegFlags = 0;
1399 Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1403 BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1406 TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
1408 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1415 .addReg(BaseReg, BaseRegFlags, BaseSubReg)
1424 Paired.I->eraseFromParent();
1426 LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
1431 SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1435 const unsigned Opcode = getNewOpcode(CI, Paired);
1439 Register DestReg = MRI->createVirtualRegister(SuperRC);
1440 unsigned MergedDMask = CI.DMask | Paired.DMask;
1444 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1445 for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
1447 MIB.addImm(MergedDMask);
1449 MIB.add((*CI.I).getOperand(I));
1455 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1457 MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1459 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1461 CI.I->eraseFromParent();
1462 Paired.I->eraseFromParent();
1467 CombineInfo &CI, CombineInfo &Paired,
1471 const unsigned Opcode = getNewOpcode(CI, Paired);
1475 Register DestReg = MRI->createVirtualRegister(SuperRC);
1476 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1481 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1485 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1486 if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1487 New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1488 New.addImm(MergedOffset);
1489 New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1491 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1493 CI.I->eraseFromParent();
1494 Paired.I->eraseFromParent();
1499 CombineInfo &CI, CombineInfo &Paired,
1504 const unsigned Opcode = getNewOpcode(CI, Paired);
1509 Register DestReg = MRI->createVirtualRegister(SuperRC);
1510 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1512 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1514 AddressRegs Regs = getRegs(Opcode, *TII);
1517 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1522 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1525 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1526 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1527 .addImm(MergedOffset)
1530 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1532 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1534 CI.I->eraseFromParent();
1535 Paired.I->eraseFromParent();
1540 CombineInfo &CI, CombineInfo &Paired,
1545 const unsigned Opcode = getNewOpcode(CI, Paired);
1550 Register DestReg = MRI->createVirtualRegister(SuperRC);
1551 unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1553 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1555 AddressRegs Regs = getRegs(Opcode, *TII);
1558 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1560 unsigned JoinedFormat =
1566 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1569 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1570 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1571 .addImm(MergedOffset)
1572 .addImm(JoinedFormat)
1575 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1577 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1579 CI.I->eraseFromParent();
1580 Paired.I->eraseFromParent();
1585 CombineInfo &CI, CombineInfo &Paired,
1590 const unsigned Opcode = getNewOpcode(CI, Paired);
1593 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1598 AddressRegs Regs = getRegs(Opcode, *TII);
1601 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1603 unsigned JoinedFormat =
1609 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1612 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1613 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1614 .addImm(std::min(CI.Offset, Paired.Offset))
1615 .addImm(JoinedFormat)
1618 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1620 CI.I->eraseFromParent();
1621 Paired.I->eraseFromParent();
1626 CombineInfo &CI, CombineInfo &Paired,
1631 const unsigned Opcode = getNewOpcode(CI, Paired);
1634 Register DestReg = MRI->createVirtualRegister(SuperRC);
1636 auto MIB = BuildMI(*MBB, InsertBefore, DL, TII->get(Opcode), DestReg);
1638 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1642 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1643 .addImm(std::min(CI.Offset, Paired.Offset))
1645 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1647 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1649 CI.I->eraseFromParent();
1650 Paired.I->eraseFromParent();
1655 CombineInfo &CI, CombineInfo &Paired,
1660 const unsigned Opcode = getNewOpcode(CI, Paired);
1663 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1666 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1669 if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1673 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1675 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1677 CI.I->eraseFromParent();
1678 Paired.I->eraseFromParent();
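// getNewOpcode selects the merged opcode from the combined width in dwords
// (CI.Width + Paired.Width); for the scalar loads the "_ec" (constrained)
// variants are chosen when NeedsConstrainedOpc is set.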
1682 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1683 const CombineInfo &Paired) {
1684 const unsigned Width = CI.Width + Paired.Width;
1686 switch (getCommonInstClass(CI, Paired)) {
1688 assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1699 case S_BUFFER_LOAD_IMM:
1704 return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1706 return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1708 return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1710 return AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1712 case S_BUFFER_LOAD_SGPR_IMM:
1717 return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1719 return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1721 return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1723 return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1729 bool NeedsConstrainedOpc =
1735 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1736 : AMDGPU::S_LOAD_DWORDX2_IMM;
1738 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1739 : AMDGPU::S_LOAD_DWORDX3_IMM;
1741 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1742 : AMDGPU::S_LOAD_DWORDX4_IMM;
1744 return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1745 : AMDGPU::S_LOAD_DWORDX8_IMM;
1753 return AMDGPU::GLOBAL_LOAD_DWORDX2;
1755 return AMDGPU::GLOBAL_LOAD_DWORDX3;
1757 return AMDGPU::GLOBAL_LOAD_DWORDX4;
1759 case GLOBAL_LOAD_SADDR:
1764 return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1766 return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1768 return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1775 return AMDGPU::GLOBAL_STORE_DWORDX2;
1777 return AMDGPU::GLOBAL_STORE_DWORDX3;
1779 return AMDGPU::GLOBAL_STORE_DWORDX4;
1781 case GLOBAL_STORE_SADDR:
1786 return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1788 return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1790 return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1797 return AMDGPU::FLAT_LOAD_DWORDX2;
1799 return AMDGPU::FLAT_LOAD_DWORDX3;
1801 return AMDGPU::FLAT_LOAD_DWORDX4;
1808 return AMDGPU::FLAT_STORE_DWORDX2;
1810 return AMDGPU::FLAT_STORE_DWORDX3;
1812 return AMDGPU::FLAT_STORE_DWORDX4;
1821 std::pair<unsigned, unsigned>
1822 SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1823 const CombineInfo &Paired) {
1824 assert((CI.InstClass != MIMG ||
1826 CI.Width + Paired.Width)) &&
1832 static const unsigned Idxs[5][4] = {
1833 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1834 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1835 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1836 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1837 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1840 assert(CI.Width >= 1 && CI.Width <= 4);
1841 assert(Paired.Width >= 1 && Paired.Width <= 4);
1844 Idx1 = Idxs[0][Paired.Width - 1];
1845 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1847 Idx0 = Idxs[0][CI.Width - 1];
1848 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1851 return {Idx0, Idx1};
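// Idxs[Start][Width - 1] is the sub-register covering Width dwords starting
// at dword Start of the merged result; whichever instruction compares lower
// (by offset, or by dmask for MIMG) takes the leading piece. For example,
// pairing a one-dword access at the lower offset with a two-dword access
// yields sub0 for the first and sub1_sub2 for the second.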
1855 SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1856 const CombineInfo &Paired) const {
1857 if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1858 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1859 switch (CI.Width + Paired.Width) {
1863 return &AMDGPU::SReg_64_XEXECRegClass;
1865 return &AMDGPU::SGPR_96RegClass;
1867 return &AMDGPU::SGPR_128RegClass;
1869 return &AMDGPU::SGPR_256RegClass;
1871 return &AMDGPU::SGPR_512RegClass;
1875 unsigned BitWidth = 32 * (CI.Width + Paired.Width);
1876 return TRI->isAGPRClass(getDataRegClass(*CI.I))
1882 CombineInfo &CI, CombineInfo &Paired,
1887 const unsigned Opcode = getNewOpcode(CI, Paired);
1890 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1895 AddressRegs Regs = getRegs(Opcode, *TII);
1898 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1904 assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1907 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1908 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1909 .addImm(std::min(CI.Offset, Paired.Offset))
1912 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1914 CI.I->eraseFromParent();
1915 Paired.I->eraseFromParent();
1920 SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
1922 if (TII->isInlineConstant(V))
1925 Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1927 BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1928 TII->get(AMDGPU::S_MOV_B32), Reg)
1937 const MemAddress &Addr) const {
1943 Addr.Base.LoSubReg) &&
1944 "Expected 32-bit Base-Register-Low!!");
1947 Addr.Base.HiSubReg) &&
1948 "Expected 32-bit Base-Register-Hi!!");
1953 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
1955 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1956 Register CarryReg = MRI->createVirtualRegister(CarryRC);
1957 Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
1959 Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1960 Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1980 Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
1996 int32_t NewOffset) const {
1997 auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
1998 Base->setReg(NewBase);
1999 Base->setIsKill(false);
2000 TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2003 std::optional<int32_t>
2009 return std::nullopt;
2012 if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2013 !Def->getOperand(1).isImm())
2014 return std::nullopt;
2016 return Def->getOperand(1).getImm();
2030 MemAddress &Addr) const {
2035 if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
2036 || Def->getNumOperands() != 5)
2047 if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2048 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2051 const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2052 const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2054 auto Offset0P = extractConstOffset(*Src0);
2058 if (!(Offset0P = extractConstOffset(*Src1)))
2063 Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2064 Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2069 if (!Src1->isImm() || Src0->isImm())
2079 Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
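// processBaseWithConstOffset recognizes a 64-bit address built as
// REG_SEQUENCE(V_ADD_CO_U32_e64(base_lo, c_lo), V_ADDC_U32_e64(base_hi,
// c_hi)) and splits it into the register Base plus the 64-bit constant
// Addr.Offset = (c_hi << 32) | c_lo, so the constant part can later be folded
// into an immediate offset.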
2082 bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2084 MemInfoMap &Visited,
2102 if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2110 if (!Visited.contains(&MI)) {
2111 processBaseWithConstOffset(Base, MAddr);
2112 Visited[&MI] = MAddr;
2114 MAddr = Visited[&MI];
2116 if (MAddr.Offset == 0) {
2117 LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
2118 " constant offsets that can be promoted.\n";);
2123 << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
2151 MemAddress AnchorAddr;
2152 uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2167 TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2171 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2172 MemAddress MAddrNext;
2173 if (!Visited.contains(&MINext)) {
2174 processBaseWithConstOffset(BaseNext, MAddrNext);
2175 Visited[&MINext] = MAddrNext;
2177 MAddrNext = Visited[&MINext];
2179 if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2180 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2181 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2182 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2185 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2187 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2192 (uint32_t)std::abs(Dist) > MaxDist) {
2193 MaxDist = std::abs(Dist);
2195 AnchorAddr = MAddrNext;
2196 AnchorInst = &MINext;
2201 LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
2202 AnchorInst->dump());
2204 << AnchorAddr.Offset << "\n\n");
2209 updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
2212 for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2215 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2220 updateBaseAndOffset(*OtherMI, Base, OtherOffset - AnchorAddr.Offset);
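// The promotion picks an "anchor": among the instructions sharing the same
// 64-bit base registers, the one whose constant offset is farthest from MI's
// (MaxDist) is chosen. MI is then rebased onto the anchor's address, and
// every other instruction with the common base whose delta to the anchor is a
// legal flat-addressing immediate (checked via AM.BaseOffs) is rewritten the
// same way. Roughly:
//
//   before:                          after:
//     addr1 = base64 + 8192            anchor = base64 + 6144
//     load1 = load(addr1, offset 0)    load1  = load(anchor, offset 2048)
//     addr2 = base64 + 6144
//     load2 = load(addr2, offset 0)    load2  = load(anchor, offset 0)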
2231 void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2232 std::list<std::list<CombineInfo> > &MergeableInsts) const {
2233 for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2234 if (AddrList.front().InstClass == CI.InstClass &&
2235 AddrList.front().IsAGPR == CI.IsAGPR &&
2236 AddrList.front().hasSameBaseAddress(CI)) {
2237 AddrList.emplace_back(CI);
2243 MergeableInsts.emplace_back(1, CI);
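// collectMergeableInsts walks a stretch of the block, stopping at
// instructions with ordered memory references or unmodeled side effects:
// each candidate load/store becomes a CombineInfo appended to the list whose
// head has the same instruction class, AGPR-ness and base address. Lists with
// fewer than two entries are dropped, and the survivors are sorted by offset
// before pairing.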
2246 std::pair<MachineBasicBlock::iterator, bool>
2247 SILoadStoreOptimizer::collectMergeableInsts(
2250 std::list<std::list<CombineInfo>> &MergeableInsts) const {
2256 for (; BlockI != End; ++BlockI) {
2261 if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
2266 if (MI.hasOrderedMemoryRef() || MI.hasUnmodeledSideEffects()) {
2274 const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2275 if (InstClass == UNKNOWN)
2281 if (Swizzled != -1 && MI.getOperand(Swizzled).getImm())
2285 CI.setMI(MI, *this);
2288 if (!CI.hasMergeableAddress(*MRI))
2291 if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2313 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2314 E = MergeableInsts.end(); I != E;) {
2316 std::list<CombineInfo> &MergeList = *I;
2317 if (MergeList.size() <= 1) {
2321 I = MergeableInsts.erase(I);
2329 [] (const CombineInfo &A, const CombineInfo &B) {
2330 return A.Offset < B.Offset;
2341 bool SILoadStoreOptimizer::optimizeBlock(
2342 std::list<std::list<CombineInfo> > &MergeableInsts) {
2345 for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
2346 E = MergeableInsts.end(); I != E;) {
2347 std::list<CombineInfo> &MergeList = *I;
2349 bool OptimizeListAgain = false;
2350 if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2354 I = MergeableInsts.erase(I);
2362 if (!OptimizeListAgain) {
2363 I = MergeableInsts.erase(I);
2366 OptimizeAgain = true;
2372 SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2373 std::list<CombineInfo> &MergeList,
2374 bool &OptimizeListAgain) {
2375 if (MergeList.empty())
2380 for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2381 Next = std::next(I)) {
2386 if ((*First).Order > (*Second).Order)
2388 CombineInfo &CI = *First;
2389 CombineInfo &Paired = *Second;
2391 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2399 LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
2402 switch (CI.InstClass) {
2407 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2410 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2412 case S_BUFFER_LOAD_IMM:
2413 case S_BUFFER_LOAD_SGPR_IMM:
2415 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2416 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2419 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2420 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2423 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2424 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2427 NewMI = mergeImagePair(CI, Paired, Where->I);
2428 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2431 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2432 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2435 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2436 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2440 case GLOBAL_LOAD_SADDR:
2441 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2442 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2446 case GLOBAL_STORE_SADDR:
2447 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2448 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2451 CI.setMI(NewMI, *this);
2452 CI.Order = Where->Order;
2456 MergeList.erase(Second);
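// After a successful merge the new instruction replaces CI in the list and
// Paired is erased. OptimizeListAgain is set whenever the merged width is
// still below the class maximum (8 dwords for the scalar-load classes, 4 for
// the others), so the list is processed again and the result can be widened
// further.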
2471 TRI = &TII->getRegisterInfo();
2474 AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2489 bool CollectModified;
2490 std::list<std::list<CombineInfo>> MergeableInsts;
2494 std::tie(SectionEnd, CollectModified) =
2500 OptimizeAgain = false;
2502 } while (OptimizeAgain);
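// runOnMachineFunction drives the pass: for each basic block it repeatedly
// collects mergeable instructions section by section and runs optimizeBlock
// until a pass over the block no longer sets OptimizeAgain.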