#define DEBUG_TYPE "si-insert-waitcnts"

DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE "-forceexp",
              "Force emit s_waitcnt expcnt(0) instrs");
DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE "-forcelgkm",
              "Force emit s_waitcnt lgkmcnt(0) instrs");
DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE "-forcevm",
              "Force emit s_waitcnt vmcnt(0) instrs");

static cl::opt<bool> ForceEmitZeroFlag(
    "amdgpu-waitcnt-forcezero",
    cl::desc("Force all waitcnt instrs to be emitted as "
             "s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
    cl::init(false), cl::Hidden);

static cl::opt<bool> ForceEmitZeroLoadFlag(
    "amdgpu-waitcnt-load-forcezero",
    cl::desc("Force all waitcnt load counters to wait until 0"),
    cl::init(false), cl::Hidden);
 
enum InstCounterType {
  LOAD_CNT = 0, // VMcnt prior to gfx12.
  DS_CNT,       // LGKMcnt prior to gfx12.
  EXP_CNT,
  STORE_CNT,    // VScnt in gfx10/gfx11.
  NUM_NORMAL_INST_CNTS,
  SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
  BVH_CNT,                           // gfx12+ only.
  KM_CNT,                            // gfx12+ only.
  X_CNT,                             // gfx1250+ only.
  NUM_EXTENDED_INST_CNTS,
  NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
};

// Iterate over the counter kinds in use; pre-gfx12 targets pass
// NUM_NORMAL_INST_CNTS so only the four legacy counters are visited.
auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
  return enum_seq(LOAD_CNT, MaxCounter);
}

using RegInterval = std::pair<int, int>;
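// A RegInterval is a half-open range [first, second) of slots in the score
// tables below: VGPR slots first, then the LDS-DMA pseudo VGPRs, then SGPR
// slots, then SCC. A single 32-bit register therefore maps to
// {RegNo, RegNo + 1}, which is exactly what determineWait(T, RegNo, Wait)
// passes through.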
 
struct HardwareLimits {
  unsigned LoadcntMax;    // Corresponds to VMcnt prior to gfx12.
  unsigned ExpcntMax;
  unsigned DscntMax;      // Corresponds to LGKMcnt prior to gfx12.
  unsigned StorecntMax;   // Corresponds to VScnt in gfx10/gfx11.
  unsigned SamplecntMax;  // gfx12+ only.
  unsigned BvhcntMax;     // gfx12+ only.
  unsigned KmcntMax;      // gfx12+ only.
  unsigned XcntMax;       // gfx1250+ only.
};
 
#define AMDGPU_DECLARE_WAIT_EVENTS(DECL)                                       \
  DECL(VMEM_ACCESS)              /* vmem read & write */                       \
  DECL(VMEM_READ_ACCESS)         /* vmem read */                               \
  DECL(VMEM_SAMPLER_READ_ACCESS) /* vmem SAMPLER read (gfx12+ only) */         \
  DECL(VMEM_BVH_READ_ACCESS)     /* vmem BVH read (gfx12+ only) */             \
  DECL(VMEM_WRITE_ACCESS)        /* vmem write that is not scratch */          \
  DECL(SCRATCH_WRITE_ACCESS)     /* vmem write that may be scratch */          \
  /* ... LDS/GDS/SMEM/message/export-lock events elided ... */                 \
  DECL(EXP_POS_ACCESS)           /* write to export position */               \
  DECL(EXP_PARAM_ACCESS)         /* write to export parameter */              \
  /* ... remaining export and SCC events elided ... */

// The DECL list expands once into the WaitEventType enum and once into the
// parallel table of printable names.
enum WaitEventType {
#define AMDGPU_EVENT_ENUM(Name) Name,
  AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_ENUM)
#undef AMDGPU_EVENT_ENUM
  NUM_WAIT_EVENTS,
};

static constexpr StringLiteral WaitEventTypeName[] = {
#define AMDGPU_EVENT_NAME(Name) #Name,
    AMDGPU_DECLARE_WAIT_EVENTS(AMDGPU_EVENT_NAME)
#undef AMDGPU_EVENT_NAME
};
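// X-macro pattern: a new wait event only needs to be added to
// AMDGPU_DECLARE_WAIT_EVENTS and it automatically gets an enumerator and a
// matching name string, e.g.
// WaitEventTypeName[VMEM_READ_ACCESS] == "VMEM_READ_ACCESS".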
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 2048, // Maximum programmable VGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 128,  // Maximum programmable SGPRs across all targets.
  // Artificial register slots to track LDS writes into specific LDS locations
  // if a location is known. When slots are exhausted or a location is unknown
  // use the first slot. The first slot is also always updated in addition to
  // a known location's slot so dependent instructions with unknown locations
  // still get a correct wait.
  FIRST_LDS_VGPR = SQ_MAX_PGM_VGPRS, // Extra slots for LDS stores.
  NUM_LDS_VGPRS = 9,                 // One more than the stores we track.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_LDS_VGPRS, // Where SGPRs start.
  NUM_ALL_ALLOCATABLE = NUM_ALL_VGPRS + SQ_MAX_PGM_SGPRS,
  // Remaining non-allocatable registers that are tracked individually.
  SCC = NUM_ALL_ALLOCATABLE
};
 
static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
    AMDGPU::S_WAIT_LOADCNT,  AMDGPU::S_WAIT_DSCNT,     AMDGPU::S_WAIT_EXPCNT,
    AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
    AMDGPU::S_WAIT_KMCNT,    AMDGPU::S_WAIT_XCNT};
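// The table is indexed by InstCounterType, so the enum order above and this
// initializer order must stay in sync; e.g. instrsForExtendedCounterTypes[KM_CNT]
// is AMDGPU::S_WAIT_KMCNT. The gfx12+ paths rely on this whenever they emit one
// discrete S_WAIT_* instruction per still-required counter.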
 
static bool isNormalMode(InstCounterType MaxCounter) {
  return MaxCounter == NUM_NORMAL_INST_CNTS;
}

// Classify a VMEM instruction for gfx12's split counters: plain loads count
// against loadcnt, sampler loads against samplecnt, BVH loads against bvhcnt.
VmemType getVmemType(const MachineInstr &Inst) {
  assert(updateVMCntOnly(Inst));
  if (!SIInstrInfo::isImage(Inst))
    return VMEM_NOSAMPLER;
  // ... MIMG base-opcode lookup distinguishes sampler and BVH accesses ...
  return VMEM_NOSAMPLER;
}
 
static unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  switch (T) {
  // ...
  case STORE_CNT:
    return Wait.StoreCnt;
  case SAMPLE_CNT:
    return Wait.SampleCnt;
  // ... remaining counters map to Wait.LoadCnt, Wait.ExpCnt, Wait.DsCnt,
  // Wait.BvhCnt, Wait.KmCnt and Wait.XCnt ...
  }
}

void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
  unsigned &WC = getCounterRef(Wait, T);
  WC = std::min(WC, Count);
}

void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  getCounterRef(Wait, T) = ~0u;
}

unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
  return getCounterRef(Wait, T);
}

// Map a wait event onto the counter that tracks it, using the per-subtarget
// event mask table.
InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
  for (auto T : inst_counter_types()) {
    if (masks[T] & (1 << E))
      return T;
  }
  llvm_unreachable("event type has no associated counter");
}
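// Which counter an event maps to is subtarget dependent. For example, with the
// mask tables defined further down, eventCounter(masks, SMEM_ACCESS) yields
// DS_CNT (lgkmcnt) on pre-gfx12 targets but KM_CNT on gfx12+, which is why
// SIInsertWaitcnts caches the result as SmemAccessCounter.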
 
class WaitcntBrackets;

// Per-subtarget strategy object: it knows which wait events feed which
// hardware counter and how to emit or rewrite the corresponding wait
// instructions.
class WaitcntGenerator {
protected:
  const GCNSubtarget *ST = nullptr;
  const SIInstrInfo *TII = nullptr;
  AMDGPU::IsaVersion IV;
  InstCounterType MaxCounter;
  bool OptNone;

public:
  WaitcntGenerator() = default;
  WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter)
      : ST(&MF.getSubtarget<GCNSubtarget>()), TII(ST->getInstrInfo()),
        // ...
        MaxCounter(MaxCounter) {}

  // Return true if the current function was compiled without optimization.
  bool isOptNone() const { return OptNone; }

  // Edit an existing sequence of wait count instructions based on the
  // required wait described by Wait.
  virtual bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) = 0;

  // Transform a soft waitcnt into a normal one.
  bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;

  // Generate new wait count instructions before It that satisfy Wait.
  virtual bool createNewWaitcnt(MachineBasicBlock &Block,
                                MachineBasicBlock::instr_iterator It,
                                AMDGPU::Waitcnt Wait) = 0;

  // Returns the per-counter table of wait-event bit masks.
  virtual const unsigned *getWaitEventMask() const = 0;

  // Returns a Waitcnt with all relevant counters set to 0.
  virtual AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const = 0;

  virtual ~WaitcntGenerator() = default;

  // Create a bit mask from an initializer list of wait event types.
  static constexpr unsigned
  eventMask(std::initializer_list<WaitEventType> Events) {
    unsigned Mask = 0;
    for (auto &E : Events)
      Mask |= 1 << E;
    return Mask;
  }
};
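// eventMask simply ORs the event bits together, so for instance
//   eventMask({VMEM_ACCESS, VMEM_READ_ACCESS})
//     == (1u << VMEM_ACCESS) | (1u << VMEM_READ_ACCESS)
// and the per-counter tables below are compile-time constants built from
// these masks.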
 
class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
public:
  WaitcntGeneratorPreGFX12() = default;
  WaitcntGeneratorPreGFX12(const MachineFunction &MF)
      : WaitcntGenerator(MF, NUM_NORMAL_INST_CNTS) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    assert(ST);

    static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS,
                   VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, LDS_ACCESS, GDS_ACCESS, SQ_MESSAGE}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        0,
        0,
        0,
        0};

    return WaitEventMaskForInstPreGFX12;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
 
class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
public:
  WaitcntGeneratorGFX12Plus() = default;
  WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
                            InstCounterType MaxCounter)
      : WaitcntGenerator(MF, MaxCounter) {}

  bool
  applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                          MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
                          MachineBasicBlock::instr_iterator It) override;

  bool createNewWaitcnt(MachineBasicBlock &Block,
                        MachineBasicBlock::instr_iterator It,
                        AMDGPU::Waitcnt Wait) override;

  const unsigned *getWaitEventMask() const override {
    assert(ST);

    static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
        eventMask({VMEM_ACCESS, VMEM_READ_ACCESS}),
        eventMask({LDS_ACCESS, GDS_ACCESS}),
        eventMask({EXP_GPR_LOCK, GDS_GPR_LOCK, VMW_GPR_LOCK, EXP_PARAM_ACCESS,
                   EXP_POS_ACCESS, EXP_LDS_ACCESS}),
        eventMask({VMEM_WRITE_ACCESS, SCRATCH_WRITE_ACCESS}),
        eventMask({VMEM_SAMPLER_READ_ACCESS}),
        eventMask({VMEM_BVH_READ_ACCESS}),
        eventMask({SMEM_ACCESS, SQ_MESSAGE, SCC_WRITE}),
        eventMask({VMEM_GROUP, SMEM_GROUP})};

    return WaitEventMaskForInstGFX12Plus;
  }

  AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override;
};
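// The split between the two generators mirrors the ISA change: before gfx12 a
// single S_WAITCNT immediate packs vmcnt/expcnt/lgkmcnt (plus a separate
// S_WAITCNT_VSCNT for stores), while gfx12+ exposes discrete counters
// (loadcnt, dscnt, storecnt, samplecnt, bvhcnt, kmcnt, xcnt) with their own
// S_WAIT_* instructions and combined forms such as S_WAIT_LOADCNT_DSCNT.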
 
class SIInsertWaitcnts {
public:
  const GCNSubtarget *ST;
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;
  const MachineRegisterInfo *MRI = nullptr;

  InstCounterType SmemAccessCounter;
  InstCounterType MaxCounter;
  const unsigned *WaitEventMaskForInst;

  DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
  MachineLoopInfo *MLI;
  MachinePostDominatorTree *PDT;
  AliasAnalysis *AA = nullptr;

  struct BlockInfo {
    std::unique_ptr<WaitcntBrackets> Incoming;
    bool Dirty = true;
  };

  MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;

  bool ForceEmitWaitcnt[NUM_INST_CNTS];

  // In any given run of this pass, WCG will point to one of these two
  // generator objects, which must have been re-initialised before use
  // from a value made using a subtarget constructor.
  WaitcntGeneratorPreGFX12 WCGPreGFX12;
  WaitcntGeneratorGFX12Plus WCGGFX12Plus;

  WaitcntGenerator *WCG = nullptr;

  // S_ENDPGM instructions before which we should release VGPRs.
  DenseSet<MachineInstr *> ReleaseVGPRInsts;

  HardwareLimits Limits;

  SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT,
                   AliasAnalysis *AA)
      : MLI(MLI), PDT(PDT), AA(AA) {
    (void)ForceExpCounter;
    (void)ForceLgkmCounter;
    (void)ForceVMCounter;
  }

  unsigned getWaitCountMax(InstCounterType T) const {
    switch (T) {
    case LOAD_CNT:
      return Limits.LoadcntMax;
    case DS_CNT:
      return Limits.DscntMax;
    case EXP_CNT:
      return Limits.ExpcntMax;
    case STORE_CNT:
      return Limits.StorecntMax;
    case SAMPLE_CNT:
      return Limits.SamplecntMax;
    case BVH_CNT:
      return Limits.BvhcntMax;
    case KM_CNT:
      return Limits.KmcntMax;
    case X_CNT:
      return Limits.XcntMax;
    default:
      break;
    }
    return 0;
  }

  bool shouldFlushVmCnt(MachineLoop *ML, const WaitcntBrackets &Brackets);
  bool isPreheaderToFlush(MachineBasicBlock &MBB,
                          const WaitcntBrackets &ScoreBrackets);
  bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
  bool run(MachineFunction &MF);
 
  void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// for debug builds, read the debug counters and adjust as needed.
#ifndef NDEBUG
    if (DebugCounter::isCounterSet(ForceExpCounter) &&
        DebugCounter::shouldExecute(ForceExpCounter)) {
      ForceEmitWaitcnt[EXP_CNT] = true;
    } else {
      ForceEmitWaitcnt[EXP_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
        DebugCounter::shouldExecute(ForceLgkmCounter)) {
      ForceEmitWaitcnt[DS_CNT] = true;
      ForceEmitWaitcnt[KM_CNT] = true;
    } else {
      ForceEmitWaitcnt[DS_CNT] = false;
      ForceEmitWaitcnt[KM_CNT] = false;
    }

    if (DebugCounter::isCounterSet(ForceVMCounter) &&
        DebugCounter::shouldExecute(ForceVMCounter)) {
      ForceEmitWaitcnt[LOAD_CNT] = true;
      ForceEmitWaitcnt[SAMPLE_CNT] = true;
      ForceEmitWaitcnt[BVH_CNT] = true;
    } else {
      ForceEmitWaitcnt[LOAD_CNT] = false;
      ForceEmitWaitcnt[SAMPLE_CNT] = false;
      ForceEmitWaitcnt[BVH_CNT] = false;
    }
#endif // NDEBUG
  }
 
  // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM
  // or FLAT instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
    switch (Inst.getOpcode()) {
    case AMDGPU::GLOBAL_INV:
      return VMEM_READ_ACCESS; // tracked using loadcnt
    case AMDGPU::GLOBAL_WB:
    case AMDGPU::GLOBAL_WBINV:
      return VMEM_WRITE_ACCESS; // tracked using storecnt
    default:
      break;
    }

    // Maps VMEM access types to their corresponding WaitEventType.
    static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
        VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};

    // ...
    if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
      if (TII->mayAccessScratchThroughFlat(Inst))
        return SCRATCH_WRITE_ACCESS;
      return VMEM_WRITE_ACCESS;
    }
    if (!ST->hasExtendedWaitCounts())
      return VMEM_READ_ACCESS;
    return VmemReadMapping[getVmemType(Inst)];
  }
 
  bool isVmemAccess(const MachineInstr &MI) const;
  bool generateWaitcntInstBefore(MachineInstr &MI,
                                 WaitcntBrackets &ScoreBrackets,
                                 MachineInstr *OldWaitcntInstr,
                                 bool FlushVmCnt);
  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
                       MachineBasicBlock::instr_iterator It,
                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
                       MachineInstr *OldWaitcntInstr);
  void updateEventWaitcntAfter(MachineInstr &Inst,
                               WaitcntBrackets *ScoreBrackets);
  bool isNextENDPGM(MachineBasicBlock::instr_iterator It,
                    MachineBasicBlock *Block) const;
  bool insertForcedWaitAfter(MachineInstr &Inst, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
  bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                            WaitcntBrackets &ScoreBrackets);
};
 
// This class keeps track, for each hardware counter, of the "score bracket"
// [ScoreLB, ScoreUB] of outstanding events and of the score assigned to every
// register the pass models, so the minimal wait needed before an instruction
// can be derived per register.
class WaitcntBrackets {
public:
  WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {}

  bool isSmemCounter(InstCounterType T) const {
    return T == Context->SmemAccessCounter || T == X_CNT;
  }

  unsigned getSgprScoresIdx(InstCounterType T) const {
    assert(isSmemCounter(T) && "Invalid SMEM counter");
    return T == X_CNT ? 1 : 0;
  }

  unsigned getScoreLB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreLBs[T];
  }

  unsigned getScoreUB(InstCounterType T) const {
    assert(T < NUM_INST_CNTS);
    return ScoreUBs[T];
  }

  unsigned getScoreRange(InstCounterType T) const {
    return getScoreUB(T) - getScoreLB(T);
  }

  unsigned getRegScore(int GprNo, InstCounterType T) const {
    if (GprNo < NUM_ALL_VGPRS)
      return VgprScores[T][GprNo];
    if (GprNo < NUM_ALL_ALLOCATABLE)
      return SgprScores[getSgprScoresIdx(T)][GprNo - NUM_ALL_VGPRS];
    return SCCScore;
  }

  RegInterval getRegInterval(const MachineInstr *MI,
                             const MachineOperand &Op) const;

  bool counterOutOfOrder(InstCounterType T) const;
  void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
  void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;

  void determineWait(InstCounterType T, RegInterval Interval,
                     AMDGPU::Waitcnt &Wait) const;
  void determineWait(InstCounterType T, int RegNo,
                     AMDGPU::Waitcnt &Wait) const {
    determineWait(T, {RegNo, RegNo + 1}, Wait);
  }
  void tryClearSCCWriteEvent(MachineInstr *Inst);

  void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
  void applyWaitcnt(InstCounterType T, unsigned Count);
  void applyXcnt(const AMDGPU::Waitcnt &Wait);
  void updateByEvent(WaitEventType E, MachineInstr &MI);
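  // Bracket semantics in brief: ScoreUB advances by one for every event that
  // increments the hardware counter, ScoreLB records how far the counter is
  // already known to have drained, and each register remembers the UB value
  // current when it was last written. A register is safe once its score is
  // <= ScoreLB, so the wait needed for it is derived from UB - score.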
 
  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  unsigned hasPendingEvent(InstCounterType T) const {
    unsigned HasPending = PendingEvents & Context->WaitEventMaskForInst[T];
    assert((HasPending != 0) == (getScoreRange(T) != 0));
    return HasPending;
  }

  bool hasMixedPendingEvents(InstCounterType T) const {
    unsigned Events = hasPendingEvent(T);
    // Return true if more than one bit is set in Events.
    return Events & (Events - 1);
  }

  bool hasPendingFlat() const {
    return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
             LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
            (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
             LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
  }

  void setPendingFlat() {
    LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
    LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
  }

  bool hasPendingGDS() const {
    return LastGDS > ScoreLBs[DS_CNT] && LastGDS <= ScoreUBs[DS_CNT];
  }

  unsigned getPendingGDSWait() const {
    return std::min(getScoreUB(DS_CNT) - LastGDS,
                    Context->getWaitCountMax(DS_CNT) - 1);
  }

  void setPendingGDS() { LastGDS = ScoreUBs[DS_CNT]; }

  // Return true if there might be pending writes to the vgpr-interval by VMEM
  // instructions with types different from V.
  bool hasOtherPendingVmemTypes(RegInterval Interval, VmemType V) const {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      assert(RegNo < NUM_ALL_VGPRS);
      if (VgprVmemTypes[RegNo] & ~(1 << V))
        return true;
    }
    return false;
  }

  void clearVgprVmemTypes(RegInterval Interval) {
    for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
      assert(RegNo < NUM_ALL_VGPRS);
      VgprVmemTypes[RegNo] = 0;
    }
  }

  void setStateOnFunctionEntryOrReturn() {
    setScoreUB(STORE_CNT,
               getScoreUB(STORE_CNT) + Context->getWaitCountMax(STORE_CNT));
    PendingEvents |= Context->WaitEventMaskForInst[STORE_CNT];
  }

  ArrayRef<const MachineInstr *> getLDSDMAStores() const {
    return LDSDMAStores;
  }

  bool hasPointSampleAccel(const MachineInstr &MI) const;
  bool hasPointSamplePendingVmemTypes(const MachineInstr &MI,
                                      RegInterval Interval) const;

  void print(raw_ostream &) const;
  void dump() const { print(dbgs()); }
 
private:
  struct MergeInfo {
    unsigned OldLB;
    unsigned OtherLB;
    unsigned MyShift;
    unsigned OtherShift;
  };
  static bool mergeScore(const MergeInfo &M, unsigned &Score,
                         unsigned OtherScore);

  void setScoreLB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreLBs[T] = Val;
  }

  void setScoreUB(InstCounterType T, unsigned Val) {
    assert(T < NUM_INST_CNTS);
    ScoreUBs[T] = Val;

    if (T != EXP_CNT)
      return;

    if (getScoreRange(EXP_CNT) > Context->getWaitCountMax(EXP_CNT))
      ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - Context->getWaitCountMax(EXP_CNT);
  }

  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
    setScoreByInterval({GprNo, GprNo + 1}, T, Val);
  }

  void setScoreByInterval(RegInterval Interval, InstCounterType CntTy,
                          unsigned Score);

  void setScoreByOperand(const MachineInstr *MI, const MachineOperand &Op,
                         InstCounterType CntTy, unsigned Val);

  const SIInsertWaitcnts *Context;

  unsigned ScoreLBs[NUM_INST_CNTS] = {0};
  unsigned ScoreUBs[NUM_INST_CNTS] = {0};
  unsigned PendingEvents = 0;
  // Remember the last flat memory operation.
  unsigned LastFlat[NUM_INST_CNTS] = {0};
  // Remember the last GDS operation.
  unsigned LastGDS = 0;
  // Upper bounds of the in-use parts of the score tables, kept to make merges
  // at join points cheap.
  int VgprUB = -1;
  int SgprUB = -1;
  // wait_cnt scores for every vgpr (plus the LDS-DMA pseudo vgprs).
  unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
  // Wait cnt scores for every sgpr: row 0 tracks the SMEM access counter
  // (DS_CNT before gfx12, KM_CNT on gfx12+), row 1 tracks X_CNT.
  unsigned SgprScores[2][SQ_MAX_PGM_SGPRS] = {{0}};
  // Reg score for SCC.
  unsigned SCCScore = 0;
  // The instruction which produced the last tracked SCC write.
  const MachineInstr *PendingSCCWrite = nullptr;
  // Bitmask of the VmemTypes of VMEM instructions that might have a pending
  // write to each vgpr.
  unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
  // Representative LDS DMA stores, kept so AA can disambiguate later loads.
  SmallVector<const MachineInstr *, NUM_LDS_VGPRS - 1> LDSDMAStores;
};
 
class SIInsertWaitcntsLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert wait instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineLoopInfoWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    AU.addUsedIfAvailable<AAResultsWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
 
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
                                            const MachineOperand &Op) const {
  if (Op.getReg() == AMDGPU::SCC)
    return {SCC, SCC + 1};

  const SIRegisterInfo *TRI = Context->TRI;
  const MachineRegisterInfo *MRI = Context->MRI;

  if (!TRI->isInAllocatableClass(Op.getReg()))
    return {-1, -1};

  // ...
  unsigned RegIdx = TRI->getHWRegIndex(MCReg);

  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
  unsigned Size = TRI->getRegSizeInBits(*RC);

  RegInterval Result;
  if (TRI->isVectorRegister(*MRI, Op.getReg())) {
    // ...
      Result.first += AGPR_OFFSET;
    // ...
    if (Size == 16 && Context->ST->hasD16Writes32BitVgpr()) {
      // ...
    }
    // ...
  } else if (TRI->isSGPRReg(*MRI, Op.getReg()) && RegIdx < SQ_MAX_PGM_SGPRS) {
    // ...
    Result.first = RegIdx + NUM_ALL_VGPRS;
    // ...
  }
  // ...
  return Result;
}
 
void WaitcntBrackets::setScoreByInterval(RegInterval Interval,
                                         InstCounterType CntTy,
                                         unsigned Score) {
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    if (RegNo < NUM_ALL_VGPRS) {
      VgprUB = std::max(VgprUB, RegNo);
      VgprScores[CntTy][RegNo] = Score;
    } else if (RegNo < NUM_ALL_ALLOCATABLE) {
      SgprUB = std::max(SgprUB, RegNo - NUM_ALL_VGPRS);
      SgprScores[getSgprScoresIdx(CntTy)][RegNo - NUM_ALL_VGPRS] = Score;
    } else {
      assert(RegNo == SCC);
      SCCScore = Score;
    }
  }
}

void WaitcntBrackets::setScoreByOperand(const MachineInstr *MI,
                                        const MachineOperand &Op,
                                        InstCounterType CntTy, unsigned Score) {
  RegInterval Interval = getRegInterval(MI, Op);
  setScoreByInterval(Interval, CntTy, Score);
}
 
// Return true if the subtarget supports Point Sample Acceleration and MI is an
// image instruction it may apply to.
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
  // ...
  const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
      AMDGPU::getMIMGBaseOpcodeInfo(
          AMDGPU::getMIMGInfo(MI.getOpcode())->BaseOpcode);
  return BaseInfo->PointSampleAccel;
}

// Return true if MI may have Point Sample Acceleration applied and the
// interval still has a pending VMEM_NOSAMPLER write.
bool WaitcntBrackets::hasPointSamplePendingVmemTypes(
    const MachineInstr &MI, RegInterval Interval) const {
  if (!hasPointSampleAccel(MI))
    return false;

  return hasOtherPendingVmemTypes(Interval, VMEM_NOSAMPLER);
}
 
void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
  InstCounterType T = eventCounter(Context->WaitEventMaskForInst, E);

  unsigned UB = getScoreUB(T);
  unsigned CurrScore = UB + 1;
  // ...
  PendingEvents |= 1 << E;
  setScoreUB(T, CurrScore);

  const SIRegisterInfo *TRI = Context->TRI;
  const MachineRegisterInfo *MRI = Context->MRI;
  const SIInstrInfo *TII = Context->TII;
  if (T == EXP_CNT) {
    // Put the score on the source vgprs of the exporting instruction.
    if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
      // All GDS operations must protect their address register (same as
      // export).
      if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
        setScoreByOperand(&Inst, *AddrOp, EXP_CNT, CurrScore);

      if (Inst.mayStore()) {
        if (const auto *Data0 =
                TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
          setScoreByOperand(&Inst, *Data0, EXP_CNT, CurrScore);
        if (const auto *Data1 =
                TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
          setScoreByOperand(&Inst, *Data1, EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst) &&
                 // ...
                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
        for (const MachineOperand &Op : Inst.all_uses()) {
          if (TRI->isVectorRegister(*MRI, Op.getReg()))
            setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
        }
      }
    } else if (TII->isFLAT(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(&Inst,
                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(&Inst,
                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isMIMG(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(&Inst,
                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isMTBUF(Inst)) {
      if (Inst.mayStore())
        setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
    } else if (TII->isMUBUF(Inst)) {
      if (Inst.mayStore()) {
        setScoreByOperand(&Inst, Inst.getOperand(0), EXP_CNT, CurrScore);
      } else if (SIInstrInfo::isAtomicRet(Inst)) {
        setScoreByOperand(&Inst,
                          *TII->getNamedOperand(Inst, AMDGPU::OpName::data),
                          EXP_CNT, CurrScore);
      }
    } else if (TII->isLDSDIR(Inst)) {
      // LDSDIR instructions attach the score to the destination.
      setScoreByOperand(&Inst,
                        *TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
                        EXP_CNT, CurrScore);
    } else {
      if (TII->isEXP(Inst)) {
        // For export the destination registers are really temps that can be
        // used as the actual source after export patching, so treat them like
        // sources and set the EXP_CNT score.
        for (MachineOperand &DefMO : Inst.all_defs()) {
          if (TRI->isVGPR(*MRI, DefMO.getReg())) {
            setScoreByOperand(&Inst, DefMO, EXP_CNT, CurrScore);
          }
        }
      }
      for (const MachineOperand &Op : Inst.all_uses()) {
        if (TRI->isVectorRegister(*MRI, Op.getReg()))
          setScoreByOperand(&Inst, Op, EXP_CNT, CurrScore);
      }
    }
  } else if (T == X_CNT) {
    WaitEventType OtherEvent = E == SMEM_GROUP ? VMEM_GROUP : SMEM_GROUP;
    if (PendingEvents & (1 << OtherEvent)) {
      // Hardware inserts an implicit xcnt between interleaved SMEM and VMEM
      // operations, so there are never outstanding address translations for
      // both kinds at the same time.
      setScoreLB(T, getScoreUB(T) - 1);
      PendingEvents &= ~(1 << OtherEvent);
    }
    for (const MachineOperand &Op : Inst.all_uses())
      setScoreByOperand(&Inst, Op, T, CurrScore);
  } else {
    // Match the score to the destination registers.
    for (const MachineOperand &Op : Inst.defs()) {
      RegInterval Interval = getRegInterval(&Inst, Op);
      if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
        if (Interval.first >= NUM_ALL_VGPRS)
          continue;
        if (updateVMCntOnly(Inst)) {
          // ...
          VmemType V = getVmemType(Inst);
          unsigned char TypesMask = 1 << V;
          // If the instruction can have Point Sample Accel applied, flag it as
          // another potential dependency.
          if (hasPointSampleAccel(Inst))
            TypesMask |= 1 << VMEM_NOSAMPLER;
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
            VgprVmemTypes[RegNo] |= TypesMask;
        }
      }
      setScoreByInterval(Interval, T, CurrScore);
    }
    if (Inst.mayStore() &&
        (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
      // LDS DMA stores get a pseudo-VGPR slot so that later LDS reads can wait
      // on exactly the stores they may alias.
      unsigned Slot = 0;
      for (const auto *MemOp : Inst.memoperands()) {
        if (!MemOp->isStore() ||
            MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
          continue;
        auto AAI = MemOp->getAAInfo();
        // Alias scope information is the only practical way to identify the
        // original LDS object here; without it, aliasing cannot be
        // disambiguated after module LDS lowering.
        if (!AAI || !AAI.Scope)
          break;
        for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
          for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
            if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
              Slot = I + 1;
              break;
            }
          }
        }
        if (Slot || LDSDMAStores.size() == NUM_LDS_VGPRS - 1)
          break;
        LDSDMAStores.push_back(&Inst);
        Slot = LDSDMAStores.size();
        break;
      }
      setRegScore(FIRST_LDS_VGPR + Slot, T, CurrScore);
      if (Slot)
        setRegScore(FIRST_LDS_VGPR, T, CurrScore);
    }
    if (E == SCC_WRITE) {
      setRegScore(SCC, T, CurrScore);
      PendingSCCWrite = &Inst;
    }
  }
}
 
void WaitcntBrackets::print(raw_ostream &OS) const {
  const GCNSubtarget *ST = Context->ST;

  OS << '\n';
  for (auto T : inst_counter_types(Context->MaxCounter)) {
    unsigned SR = getScoreRange(T);

    switch (T) {
    case LOAD_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
         << SR << "): ";
      break;
    case DS_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
         << SR << "): ";
      break;
    case EXP_CNT:
      OS << "    EXP_CNT(" << SR << "): ";
      break;
    case STORE_CNT:
      OS << "    " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
         << SR << "): ";
      break;
    case SAMPLE_CNT:
      OS << "    SAMPLE_CNT(" << SR << "): ";
      break;
    case BVH_CNT:
      OS << "    BVH_CNT(" << SR << "): ";
      break;
    case KM_CNT:
      OS << "    KM_CNT(" << SR << "): ";
      break;
    case X_CNT:
      OS << "    X_CNT(" << SR << "): ";
      break;
    default:
      OS << "    UNKNOWN(" << SR << "): ";
      break;
    }

    if (SR != 0) {
      // Print vgpr scores.
      unsigned LB = getScoreLB(T);

      for (int J = 0; J <= VgprUB; J++) {
        unsigned RegScore = getRegScore(J, T);
        if (RegScore <= LB)
          continue;
        unsigned RelScore = RegScore - LB - 1;
        if (J < FIRST_LDS_VGPR) {
          OS << RelScore << ":v" << J << " ";
        } else {
          OS << RelScore << ":ds ";
        }
      }
      // Also need to print sgpr scores for the SMEM access counter or xcnt.
      if (isSmemCounter(T)) {
        for (int J = 0; J <= SgprUB; J++) {
          unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
          if (RegScore <= LB)
            continue;
          unsigned RelScore = RegScore - LB - 1;
          OS << RelScore << ":s" << J << " ";
        }
      }
      if (T == KM_CNT && SCCScore > 0)
        OS << SCCScore << ":scc ";
    }
    OS << '\n';
  }

  OS << "Pending Events: ";
  if (hasPendingEvent()) {
    ListSeparator LS;
    for (unsigned I = 0; I != NUM_WAIT_EVENTS; ++I) {
      if (hasPendingEvent((WaitEventType)I)) {
        OS << LS << WaitEventTypeName[I];
      }
    }
  } else {
    OS << "none";
  }
  OS << '\n';
  // ...
}
 
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
  simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
  simplifyWaitcnt(DS_CNT, Wait.DsCnt);
  simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
  simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
  simplifyWaitcnt(KM_CNT, Wait.KmCnt);
  simplifyWaitcnt(X_CNT, Wait.XCnt);
}
 
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
                                      unsigned &Count) const {
  // The number of outstanding events of this type is at most getScoreRange(T),
  // so any larger requested count can never actually stall and is dropped.
  if (Count >= getScoreRange(T))
    Count = ~0u;
}
 
void WaitcntBrackets::determineWait(InstCounterType T, RegInterval Interval,
                                    AMDGPU::Waitcnt &Wait) const {
  const unsigned LB = getScoreLB(T);
  const unsigned UB = getScoreUB(T);
  for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
    unsigned ScoreToWait = getRegScore(RegNo, T);

    // If the score of the operand falls within the bracket, an s_waitcnt
    // instruction is required.
    if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
      if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
          !Context->ST->hasFlatLgkmVMemCountInOrder()) {
        // If there is a pending FLAT operation, and this is a VMem or LGKM
        // waitcnt and the target can report early completion, then we need
        // to force a waitcnt 0.
        addWait(Wait, T, 0);
      } else if (counterOutOfOrder(T)) {
        // Counter can get decremented out-of-order when there are multiple
        // event types in the bracket, so emit a conservative wait of 0.
        addWait(Wait, T, 0);
      } else {
        // If a counter has been maxed out avoid overflow by waiting for
        // MAX(CounterType) - 1 instead.
        unsigned NeededWait =
            std::min(UB - ScoreToWait, Context->getWaitCountMax(T) - 1);
        addWait(Wait, T, NeededWait);
      }
    }
  }
}
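// Worked example of the in-order case above: with UB = 12, LB = 4 and a source
// register whose score is 9, three more events must retire before that value
// is ready, so NeededWait = UB - ScoreToWait = 3 (clamped to
// getWaitCountMax(T) - 1 if the bracket ever exceeds what the immediate can
// encode), and addWait() keeps the minimum across all operands.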
 
void WaitcntBrackets::tryClearSCCWriteEvent(MachineInstr *Inst) {
  // ...
  if (PendingSCCWrite &&
      PendingSCCWrite->getOpcode() == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM
      /* ... */) {
    unsigned SCC_WRITE_PendingEvent = 1 << SCC_WRITE;
    // If SCC_WRITE is the only pending KM_CNT event, the whole bracket can be
    // collapsed; otherwise just forget the SCC_WRITE event itself.
    if ((PendingEvents & Context->WaitEventMaskForInst[KM_CNT]) ==
        SCC_WRITE_PendingEvent) {
      setScoreLB(KM_CNT, getScoreUB(KM_CNT));
    }
    PendingEvents &= ~SCC_WRITE_PendingEvent;
    PendingSCCWrite = nullptr;
  }
}
 
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
  applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
  applyWaitcnt(EXP_CNT, Wait.ExpCnt);
  applyWaitcnt(DS_CNT, Wait.DsCnt);
  applyWaitcnt(STORE_CNT, Wait.StoreCnt);
  applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
  applyWaitcnt(BVH_CNT, Wait.BvhCnt);
  applyWaitcnt(KM_CNT, Wait.KmCnt);
  applyXcnt(Wait);
}

void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
  const unsigned UB = getScoreUB(T);
  if (Count >= UB)
    return;
  if (Count != 0) {
    if (counterOutOfOrder(T))
      return;
    setScoreLB(T, std::max(getScoreLB(T), UB - Count));
  } else {
    setScoreLB(T, UB);
    PendingEvents &= ~Context->WaitEventMaskForInst[T];
  }
}
 
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
  // On entry to a block with multiple predecessors, there may be pending SMEM
  // and VMEM events active at the same time. In such cases, only clear one
  // active event at a time.
  auto applyPendingXcntGroup = [this](unsigned E) {
    unsigned LowerBound = getScoreLB(X_CNT);
    applyWaitcnt(X_CNT, 0);
    PendingEvents |= (1 << E);
    setScoreLB(X_CNT, LowerBound);
  };

  // Wait on XCNT is redundant if we are already waiting for a load to
  // complete. SMEM can return out of order, so only omit the XCNT wait if we
  // are waiting till zero.
  if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
    if (hasPendingEvent(VMEM_GROUP))
      applyPendingXcntGroup(VMEM_GROUP);
    else
      applyWaitcnt(X_CNT, 0);
    return;
  }

  // If there is a pending store we cannot optimize XCnt because we do not wait
  // for stores. VMEM loads retire in order, so a load wait also covers the
  // corresponding address translations.
  if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
      !hasPendingEvent(STORE_CNT)) {
    if (hasPendingEvent(SMEM_GROUP))
      applyPendingXcntGroup(SMEM_GROUP);
    else
      applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
    return;
  }

  applyWaitcnt(X_CNT, Wait.XCnt);
}
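// In short: a kmcnt==0 wait already covers pending SMEM address translations
// and a loadcnt wait covers pending VMEM ones, so applyXcnt only falls through
// to a real xcnt wait when neither of those cheaper guarantees applies. The
// applyPendingXcntGroup helper clears one group while leaving the other
// group's event (and the old lower bound) pending.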
 
// Where there are multiple types of event in the bracket of a counter, the
// decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always return out of order.
  if ((T == Context->SmemAccessCounter && hasPendingEvent(SMEM_ACCESS)) ||
      (T == X_CNT && hasPendingEvent(SMEM_GROUP)))
    return true;
  return hasMixedPendingEvents(T);
}
 
char SIInsertWaitcntsLegacy::ID = 0;

char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID;

FunctionPass *llvm::createSIInsertWaitcntsPass() {
  return new SIInsertWaitcntsLegacy();
}

// Rewrite the immediate operand OpName of MI to NewEnc, returning true if it
// actually changed.
static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName,
                                     unsigned NewEnc) {
  int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
  assert(OpIdx >= 0);

  MachineOperand &MO = MI.getOperand(OpIdx);

  if (NewEnc == MO.getImm())
    return false;

  MO.setImm(NewEnc);
  return true;
}

/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction, and
/// if so, which counter it is waiting on.
static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_WAIT_LOADCNT:
    return LOAD_CNT;
  case AMDGPU::S_WAIT_EXPCNT:
    return EXP_CNT;
  case AMDGPU::S_WAIT_STORECNT:
    return STORE_CNT;
  case AMDGPU::S_WAIT_SAMPLECNT:
    return SAMPLE_CNT;
  case AMDGPU::S_WAIT_BVHCNT:
    return BVH_CNT;
  case AMDGPU::S_WAIT_DSCNT:
    return DS_CNT;
  case AMDGPU::S_WAIT_KMCNT:
    return KM_CNT;
  case AMDGPU::S_WAIT_XCNT:
    return X_CNT;
  default:
    return {};
  }
}
 
 
bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
  // Switch a "soft" waitcnt (inserted by the memory legalizer) to its normal
  // opcode so it is no longer a candidate for removal.
  // ...
}
 
bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
  assert(ST);
  assert(isNormalMode(MaxCounter));

  bool Modified = false;
  MachineInstr *WaitcntInstr = nullptr;
  MachineInstr *WaitcntVsCntInstr = nullptr;

  LLVM_DEBUG({
    dbgs() << "PreGFX12::applyPreexistingWaitcnt at: ";
    if (It == OldWaitcntInstr.getParent()->instr_end())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction()) {
      // ...
      continue;
    }

    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Update the required wait count. If this is a soft waitcnt (= it was
    // added by an earlier pass), it may be entirely removed.
    if (Opcode == AMDGPU::S_WAITCNT) {
      unsigned IEnc = II.getOperand(0).getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);

      // Merge consecutive waitcnts of the same type by erasing multiples.
      if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntInstr = &II;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      LLVM_DEBUG(dbgs() << "Processing S_WAITCNT_lds_direct: " << II
                        << "Before: " << Wait.LoadCnt << '\n';);
      ScoreBrackets.determineWait(LOAD_CNT, FIRST_LDS_VGPR, Wait);
      LLVM_DEBUG(dbgs() << "After: " << Wait.LoadCnt << '\n';);
      II.eraseFromParent();
      Modified = true;
    } else {
      assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);

      unsigned OldVSCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
      Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);

      if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && TrySimplify)) {
        II.eraseFromParent();
        Modified = true;
      } else
        WaitcntVsCntInstr = &II;
    }
  }

  if (WaitcntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
                                         AMDGPU::encodeWaitcnt(IV, Wait));
    Modified |= promoteSoftWaitCnt(WaitcntInstr);

    ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
    ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
    ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
    Wait.LoadCnt = ~0u;
    Wait.ExpCnt = ~0u;
    Wait.DsCnt = ~0u;

    LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntInstr << '\n');
  }

  if (WaitcntVsCntInstr) {
    Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
                                         AMDGPU::OpName::simm16, Wait.StoreCnt);
    Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);

    ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
    Wait.StoreCnt = ~0u;

    LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
                   ? dbgs() << "applied pre-existing waitcnt\n"
                            << "New Instr at block end: " << *WaitcntVsCntInstr
                            << '\n'
                   : dbgs() << "applied pre-existing waitcnt\n"
                            << "Old Instr: " << *It
                            << "New Instr: " << *WaitcntVsCntInstr << '\n');
  }

  return Modified;
}
 
bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  DebugLoc DL = Block.findDebugLoc(It);
  bool Modified = false;
  assert(ST);
  assert(isNormalMode(MaxCounter));

  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a single
  // instruction, while VScnt has its own instruction.
  if (Wait.hasWaitExceptStoreCnt()) {
    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  if (Wait.hasWaitStoreCnt()) {
    assert(ST->hasVscnt());

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
            .addImm(Wait.StoreCnt);
    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
 
AMDGPU::Waitcnt
WaitcntGeneratorPreGFX12::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt && ST->hasVscnt() ? 0 : ~0u);
}

AMDGPU::Waitcnt
WaitcntGeneratorGFX12Plus::getAllZeroWaitcnt(bool IncludeVSCnt) const {
  return AMDGPU::Waitcnt(0, 0, 0, IncludeVSCnt ? 0 : ~0u, 0, 0, 0,
                         ~0u /*XCnt*/);
}
 
bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  bool Modified = false;
  MachineInstr *CombinedLoadDsCntInstr = nullptr;
  MachineInstr *CombinedStoreDsCntInstr = nullptr;
  MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};

  LLVM_DEBUG({
    dbgs() << "GFX12Plus::applyPreexistingWaitcnt at: ";
    if (It == OldWaitcntInstr.getParent()->instr_end())
      dbgs() << "end of block\n";
    else
      dbgs() << *It;
  });

  for (auto &II :
       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
    if (II.isMetaInstruction()) {
      // ...
      continue;
    }

    MachineInstr **UpdatableInstr;

    // Update the required wait count. If this is a soft waitcnt (= it was
    // added by an earlier pass), it may be entirely removed.
    unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
    bool TrySimplify = Opcode != II.getOpcode() && !OptNone;

    // Legacy S_WAITCNT intrinsics are left untouched on gfx12+.
    if (Opcode == AMDGPU::S_WAITCNT)
      continue;

    if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedLoadDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
      unsigned OldEnc =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(OldWait);
      Wait = Wait.combined(OldWait);
      UpdatableInstr = &CombinedStoreDsCntInstr;
    } else if (Opcode == AMDGPU::S_WAITCNT_lds_direct) {
      // This pseudo should not appear on these targets; just drop it.
      II.eraseFromParent();
      Modified = true;
      continue;
    } else {
      std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
      assert(CT.has_value());
      unsigned OldCnt =
          TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
      if (TrySimplify)
        ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
      addWait(Wait, CT.value(), OldCnt);
      UpdatableInstr = &WaitInstrs[CT.value()];
    }

    // Merge consecutive waitcnts of the same type by erasing multiples.
    if (!*UpdatableInstr) {
      *UpdatableInstr = &II;
    } else {
      II.eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedLoadDsCntInstr) {
    // Only keep an S_WAIT_LOADCNT_DSCNT if both loadcnt and dscnt are actually
    // needed; the combined form waits for both counters.
    if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
      ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == CombinedLoadDsCntInstr->getParent()->end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedLoadDsCntInstr << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedLoadDsCntInstr << '\n');
    } else {
      CombinedLoadDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  if (CombinedStoreDsCntInstr) {
    if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
      unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
      Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
                                           AMDGPU::OpName::simm16, NewEnc);
      Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
      ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
      ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;

      LLVM_DEBUG(It == CombinedStoreDsCntInstr->getParent()->end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: "
                              << *CombinedStoreDsCntInstr << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It << "New Instr: "
                              << *CombinedStoreDsCntInstr << '\n');
    } else {
      CombinedStoreDsCntInstr->eraseFromParent();
      Modified = true;
    }
  }

  // Look for opportunities to fold separate waits into a combined
  // S_WAIT_LOADCNT_DSCNT or S_WAIT_STORECNT_DSCNT.
  if (Wait.DsCnt != ~0u) {
    SmallVector<MachineInstr **, 2> WaitsToErase;

    if (Wait.LoadCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    } else if (Wait.StoreCnt != ~0u) {
      WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
      WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
    }

    for (MachineInstr **WI : WaitsToErase) {
      if (!*WI)
        continue;
      (*WI)->eraseFromParent();
      *WI = nullptr;
      Modified = true;
    }
  }

  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    if (!WaitInstrs[CT])
      continue;

    unsigned NewCnt = getWait(Wait, CT);
    if (NewCnt != ~0u) {
      Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
                                           AMDGPU::OpName::simm16, NewCnt);
      Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);

      ScoreBrackets.applyWaitcnt(CT, NewCnt);
      setNoWait(Wait, CT);

      LLVM_DEBUG(It == WaitInstrs[CT]->getParent()->end()
                     ? dbgs() << "applied pre-existing waitcnt\n"
                              << "New Instr at block end: " << *WaitInstrs[CT]
                              << '\n'
                     : dbgs() << "applied pre-existing waitcnt\n"
                              << "Old Instr: " << *It
                              << "New Instr: " << *WaitInstrs[CT] << '\n');
    } else {
      WaitInstrs[CT]->eraseFromParent();
      Modified = true;
    }
  }

  return Modified;
}
 
/// Generate S_WAIT_*CNT instructions for any counters still required in Wait.
bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
    MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
    AMDGPU::Waitcnt Wait) {
  DebugLoc DL = Block.findDebugLoc(It);
  bool Modified = false;
  assert(ST);
  assert(!isNormalMode(MaxCounter));

  // Check for opportunities to use combined wait instructions.
  if (Wait.DsCnt != ~0u) {
    MachineInstr *SWaitInst = nullptr;

    if (Wait.LoadCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
              .addImm(Enc);

      Wait.LoadCnt = ~0u;
      Wait.DsCnt = ~0u;
    } else if (Wait.StoreCnt != ~0u) {
      unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);

      SWaitInst =
          BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
              .addImm(Enc);

      Wait.StoreCnt = ~0u;
      Wait.DsCnt = ~0u;
    }

    if (SWaitInst) {
      Modified = true;

      LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
                 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
                 dbgs() << "New Instr: " << *SWaitInst << '\n');
    }
  }

  // Generate one discrete instruction for each remaining counter.
  for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
    unsigned Count = getWait(Wait, CT);
    if (Count == ~0u)
      continue;

    [[maybe_unused]] auto SWaitInst =
        BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(Count);

    Modified = true;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
  }

  return Modified;
}
 
/// Generate the s_waitcnt that must be placed before MI, based on the current
/// score brackets. Instructions of a given type complete in order, so a score
/// per register is enough to determine the minimal wait for each counter.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
                                                 WaitcntBrackets &ScoreBrackets,
                                                 MachineInstr *OldWaitcntInstr,
                                                 bool FlushVmCnt) {
  setForceEmitWaitcnt();

  assert(!MI.isMetaInstruction());

  AMDGPU::Waitcnt Wait;
  const unsigned Opc = MI.getOpcode();

  // FIXME: This should have already been handled by the memory legalizer.
  if (Opc == AMDGPU::BUFFER_WBINVL1 || Opc == AMDGPU::BUFFER_WBINVL1_SC ||
      Opc == AMDGPU::BUFFER_WBINVL1_VOL || Opc == AMDGPU::BUFFER_GL0_INV ||
      Opc == AMDGPU::BUFFER_GL1_INV) {
    Wait.LoadCnt = 0;
  }

  // All waits must be resolved at call return.
  if (Opc == AMDGPU::SI_RETURN_TO_EPILOG || Opc == AMDGPU::SI_RETURN ||
      Opc == AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN ||
      Opc == AMDGPU::S_SETPC_B64_return ||
      (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(false));
  }
  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
  // stores; in that case it can be useful to release all VGPRs before the
  // stores complete, but only if there are no outstanding scratch stores.
  else if (Opc == AMDGPU::S_ENDPGM || Opc == AMDGPU::S_ENDPGM_SAVED) {
    if (!WCG->isOptNone() &&
        (MI.getMF()->getInfo<SIMachineFunctionInfo>()->isDynamicVGPREnabled() ||
         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
  else if ((Opc == AMDGPU::S_SENDMSG || Opc == AMDGPU::S_SENDMSGHALT) &&
           ST->hasLegacyGeometry() &&
           // ...
           true) {
    Wait.LoadCnt = 0;
  } else {
    // Export and GDS instructions do not read the EXEC mask until after the
    // export is granted, so all EXP operations must drain before EXEC is
    // overwritten.
    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
      if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
          ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
          ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
          ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
        Wait.ExpCnt = 0;
      }
    }

    // Wait for any pending GDS instruction to complete before any "Always GDS"
    // instruction.
    if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
      addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());

    if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
      // The callee inserts a wait on everything in its prolog, so only the
      // call address (and the return-address def) need to be protected here.
      Wait = AMDGPU::Waitcnt();

      const auto &CallAddrOp = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      if (CallAddrOp.isReg()) {
        RegInterval CallAddrOpInterval =
            ScoreBrackets.getRegInterval(&MI, CallAddrOp);

        ScoreBrackets.determineWait(SmemAccessCounter, CallAddrOpInterval,
                                    Wait);

        if (const auto *RtnAddrOp =
                TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
          RegInterval RtnAddrOpInterval =
              ScoreBrackets.getRegInterval(&MI, *RtnAddrOp);

          ScoreBrackets.determineWait(SmemAccessCounter, RtnAddrOpInterval,
                                      Wait);
        }
      }
    } else if (Opc == AMDGPU::S_BARRIER_WAIT) {
      ScoreBrackets.tryClearSCCWriteEvent(&MI);
    } else {
      // Look at every memory operand and every register operand of the
      // instruction and determine the wait each one requires.
      for (const MachineMemOperand *Memop : MI.memoperands()) {
        const Value *Ptr = Memop->getValue();
        if (Memop->isStore()) {
          if (auto It = SLoadAddresses.find(Ptr); It != SLoadAddresses.end()) {
            addWait(Wait, SmemAccessCounter, 0);
            if (PDT->dominates(MI.getParent(), It->second))
              SLoadAddresses.erase(It);
          }
        }
        unsigned AS = Memop->getAddrSpace();
        if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
          continue;
        // No need to wait before a load from VMEM to LDS.
        if (TII->mayWriteLDSThroughDMA(MI))
          continue;

        // LOAD_CNT is only relevant to vgpr or LDS.
        unsigned RegNo = FIRST_LDS_VGPR;
        if (Ptr && Memop->getAAInfo()) {
          const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
          for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
            if (MI.mayAlias(AA, *LDSDMAStores[I], true))
              ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
          }
        } else {
          ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
        }
        if (Memop->isStore()) {
          ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
        }
      }

      // Loop over use and def operands.
      for (const MachineOperand &Op : MI.operands()) {
        if (!Op.isReg())
          continue;

        // If the instruction does not read the tied source, skip the operand.
        if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
          continue;

        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, Op);

        const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
        if (IsVGPR) {
          // Implicit VGPR defs and uses are only there for liveness accounting
          // and never create a real dependency for memory instructions.
          if (Op.isImplicit() && MI.mayLoadOrStore())
            continue;

          // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
          // previous write and this write are the same type of VMEM
          // instruction, in which case they are guaranteed to write their
          // results in order anyway. Also check instructions where Point
          // Sample Acceleration might be applied.
          if (Op.isUse() || !updateVMCntOnly(MI) ||
              ScoreBrackets.hasOtherPendingVmemTypes(Interval,
                                                     getVmemType(MI)) ||
              ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Interval) ||
              !ST->hasVmemWriteVgprInOrder()) {
            ScoreBrackets.determineWait(LOAD_CNT, Interval, Wait);
            ScoreBrackets.determineWait(SAMPLE_CNT, Interval, Wait);
            ScoreBrackets.determineWait(BVH_CNT, Interval, Wait);
            ScoreBrackets.clearVgprVmemTypes(Interval);
          }

          if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
            ScoreBrackets.determineWait(EXP_CNT, Interval, Wait);
          }
          ScoreBrackets.determineWait(DS_CNT, Interval, Wait);
        } else if (Op.getReg() == AMDGPU::SCC) {
          ScoreBrackets.determineWait(KM_CNT, Interval, Wait);
        } else {
          ScoreBrackets.determineWait(SmemAccessCounter, Interval, Wait);
        }

        if (ST->hasWaitXCnt() && Op.isDef())
          ScoreBrackets.determineWait(X_CNT, Interval, Wait);
      }
    }
  }

  // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
  // not, we need the subtarget to be able to back off barriers while memory
  // operations are outstanding; otherwise insert an explicit full wait here.
  if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
      !ST->supportsBackOffBarrier()) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(true));
  }

  // Work around the hardware vccz bug when an outstanding SMEM read is
  // pending.
  if (readsVCCZ(MI) && ST->hasReadVCCZBug() &&
      ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
    Wait.DsCnt = 0;
  }

  // Verify that the wait is actually needed.
  ScoreBrackets.simplifyWaitcnt(Wait);

  // Apply the force-zero options and debug-counter overrides.
  if (ForceEmitZeroFlag)
    Wait = WCG->getAllZeroWaitcnt(false);

  if (ForceEmitWaitcnt[LOAD_CNT])
    Wait.LoadCnt = 0;
  if (ForceEmitWaitcnt[EXP_CNT])
    Wait.ExpCnt = 0;
  if (ForceEmitWaitcnt[DS_CNT])
    Wait.DsCnt = 0;
  if (ForceEmitWaitcnt[SAMPLE_CNT])
    Wait.SampleCnt = 0;
  if (ForceEmitWaitcnt[BVH_CNT])
    Wait.BvhCnt = 0;
  if (ForceEmitWaitcnt[KM_CNT])
    Wait.KmCnt = 0;
  if (ForceEmitWaitcnt[X_CNT])
    Wait.XCnt = 0;

  if (FlushVmCnt) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
    Wait.LoadCnt = 0;

  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
                         OldWaitcntInstr);
}
 
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
                                       MachineBasicBlock::instr_iterator It,
                                       MachineBasicBlock &Block,
                                       WaitcntBrackets &ScoreBrackets,
                                       MachineInstr *OldWaitcntInstr) {
  bool Modified = false;

  if (OldWaitcntInstr)
    // Try to merge the required wait with preexisting waitcnt instructions.
    // Also erase redundant waitcnt.
    Modified =
        WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);

  // Any counts that could have been applied to any existing waitcnt
  // instructions will have been done so; now deal with any remaining.
  ScoreBrackets.applyWaitcnt(Wait);

  // ExpCnt can be merged into VINTERP.
  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
      SIInstrInfo::isVINTERP(*It)) {
    MachineOperand *WaitExp =
        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
    if (Wait.ExpCnt < WaitExp->getImm()) {
      WaitExp->setImm(Wait.ExpCnt);
      Modified = true;
    }
    Wait.ExpCnt = ~0u;

    LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
                      << "Update Instr: " << *It);
  }

  if (Wait.XCnt != ~0u) {
    // A kmcnt(0) with no pending SMEM group, or a loadcnt(0) with no pending
    // VMEM group, already guarantees the address translations are done.
    if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP))
      Wait.XCnt = ~0u;

    if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP))
      Wait.XCnt = ~0u;

    // VMEM address translation occurs in order, so the XCnt can be skipped if
    // the current instruction is itself a VMEM access.
    if (isVmemAccess(*It))
      Wait.XCnt = ~0u;
  }

  if (WCG->createNewWaitcnt(Block, It, Wait))
    Modified = true;

  return Modified;
}
 
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
  return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
         SIInstrInfo::isVMEM(MI);
}
 
// Return true if the next instruction executed after It is an S_ENDPGM,
// looking through empty blocks and meta instructions.
bool SIInsertWaitcnts::isNextENDPGM(MachineBasicBlock::instr_iterator It,
                                    MachineBasicBlock *Block) const {
  auto BlockEnd = Block->getParent()->end();
  auto BlockIter = Block->getIterator();

  while (true) {
    if (It.isEnd()) {
      if (++BlockIter != BlockEnd) {
        It = BlockIter->instr_begin();
        continue;
      }

      return false;
    }

    if (!It->isMetaInstruction())
      break;

    ++It;
  }

  return It->getOpcode() == AMDGPU::S_ENDPGM;
}
 
// Insert any waits that are forced to appear after Inst (precise memory mode,
// always-GDS instructions), rather than before the next instruction.
bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
                                             MachineBasicBlock &Block,
                                             WaitcntBrackets &ScoreBrackets) {
  AMDGPU::Waitcnt Wait;
  bool NeedsEndPGMCheck = false;

  if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
    Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
                                  !SIInstrInfo::isAtomicRet(Inst));

  if (TII->isAlwaysGDS(Inst.getOpcode())) {
    Wait.DsCnt = 0;
    NeedsEndPGMCheck = true;
  }

  ScoreBrackets.simplifyWaitcnt(Wait);

  auto SuccessorIt = std::next(Inst.getIterator());
  bool Result = generateWaitcnt(Wait, SuccessorIt, Block, ScoreBrackets,
                                /*OldWaitcntInstr=*/nullptr);

  if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
    // ...
  }

  return Result;
}
 
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
                                               WaitcntBrackets *ScoreBrackets) {
  // Now look at the instruction opcode. If it is a memory access instruction,
  // update the upper-bound of the appropriate counter's bracket and the
  // destination operand scores.

  bool IsVMEMAccess = false;
  bool IsSMEMAccess = false;
  if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
    if (TII->isAlwaysGDS(Inst.getOpcode()) ||
        TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
      ScoreBrackets->updateByEvent(GDS_ACCESS, Inst);
      ScoreBrackets->updateByEvent(GDS_GPR_LOCK, Inst);
      ScoreBrackets->setPendingGDS();
    } else {
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
    }
  } else if (TII->isFLAT(Inst)) {
    if (!Inst.mayLoadOrStore()) {
      // Cache invalidate/writeback pseudos (GLOBAL_INV/WB/WBINV).
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
      return;
    }

    int FlatASCount = 0;

    if (TII->mayAccessVMEMThroughFlat(Inst)) {
      ++FlatASCount;
      IsVMEMAccess = true;
      ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);
    }

    if (TII->mayAccessLDSThroughFlat(Inst)) {
      ++FlatASCount;
      ScoreBrackets->updateByEvent(LDS_ACCESS, Inst);
    }

    // This is a flat memory operation that accesses both LDS and memory, so it
    // may need to set the score for both counters.
    if (FlatASCount > 1)
      ScoreBrackets->setPendingFlat();
  } else if (SIInstrInfo::isVMEM(Inst)) {
    IsVMEMAccess = true;
    ScoreBrackets->updateByEvent(getVmemWaitEventType(Inst), Inst);

    if (ST->vmemWriteNeedsExpWaitcnt() &&
        (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
      ScoreBrackets->updateByEvent(VMW_GPR_LOCK, Inst);
    }
  } else if (TII->isSMRD(Inst)) {
    IsSMEMAccess = true;
    ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
  } else if (Inst.isCall()) {
    if (callWaitsOnFunctionReturn(Inst)) {
      // Act as a wait on everything.
      ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(false));
      ScoreBrackets->setStateOnFunctionEntryOrReturn();
    } else {
      // May need to wait for anything.
      ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
    }
  } else if (SIInstrInfo::isLDSDIR(Inst)) {
    ScoreBrackets->updateByEvent(EXP_LDS_ACCESS, Inst);
  } else if (TII->isVINTERP(Inst)) {
    int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
    ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
  } else if (SIInstrInfo::isEXP(Inst)) {
    unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
    if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
      ScoreBrackets->updateByEvent(EXP_PARAM_ACCESS, Inst);
    else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
      ScoreBrackets->updateByEvent(EXP_POS_ACCESS, Inst);
    else
      ScoreBrackets->updateByEvent(EXP_GPR_LOCK, Inst);
  } else if (Inst.definesRegister(AMDGPU::SCC, /*TRI=*/nullptr) &&
             /* barrier SCC write tracked by kmcnt */ true) {
    ScoreBrackets->updateByEvent(SCC_WRITE, Inst);
  } else {
    switch (Inst.getOpcode()) {
    case AMDGPU::S_SENDMSG:
    case AMDGPU::S_SENDMSG_RTN_B32:
    case AMDGPU::S_SENDMSG_RTN_B64:
    case AMDGPU::S_SENDMSGHALT:
      ScoreBrackets->updateByEvent(SQ_MESSAGE, Inst);
      break;
    case AMDGPU::S_MEMTIME:
    case AMDGPU::S_MEMREALTIME:
    case AMDGPU::S_GET_BARRIER_STATE_M0:
    case AMDGPU::S_GET_BARRIER_STATE_IMM:
      ScoreBrackets->updateByEvent(SMEM_ACCESS, Inst);
      break;
    }
  }

  if (!ST->hasWaitXCnt())
    return;

  if (IsVMEMAccess)
    ScoreBrackets->updateByEvent(VMEM_GROUP, Inst);

  if (IsSMEMAccess)
    ScoreBrackets->updateByEvent(SMEM_GROUP, Inst);
}
 
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
                                 unsigned OtherScore) {
  unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
  unsigned OtherShifted =
      OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
  Score = std::max(MyShifted, OtherShifted);
  return OtherShifted > MyShifted;
}
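// A small worked example of the shift logic: suppose this block's bracket for
// a counter is [OldLB = 4, UB = 9] and the other predecessor's is
// [OtherLB = 2, UB = 5]. merge() below picks NewUB = 4 + max(5, 3) = 9, so
// MyShift = 0 and OtherShift = 4; a register with score 3 in the other bracket
// (one outstanding event from its UB) becomes 3 + 4 = 7 here, preserving its
// distance from the merged upper bound, while scores at or below the old lower
// bounds collapse to 0 ("already complete").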
 
/// Merge the pending events and associated scores from \p Other into this
/// bracket's state.
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
  bool StrictDom = false;

  VgprUB = std::max(VgprUB, Other.VgprUB);
  SgprUB = std::max(SgprUB, Other.SgprUB);

  for (auto T : inst_counter_types(Context->MaxCounter)) {
    // Merge event flags for this counter.
    const unsigned *WaitEventMaskForInst = Context->WaitEventMaskForInst;
    const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
    const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
    if (OtherEvents & ~OldEvents)
      StrictDom = true;
    PendingEvents |= OtherEvents;

    // Merge scores for this counter.
    const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
    const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
    const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
    if (NewUB < ScoreLBs[T])
      report_fatal_error("waitcnt score overflow");

    MergeInfo M;
    M.OldLB = ScoreLBs[T];
    M.OtherLB = Other.ScoreLBs[T];
    M.MyShift = NewUB - ScoreUBs[T];
    M.OtherShift = NewUB - Other.ScoreUBs[T];

    ScoreUBs[T] = NewUB;

    StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);

    if (T == DS_CNT)
      StrictDom |= mergeScore(M, LastGDS, Other.LastGDS);

    if (T == KM_CNT) {
      StrictDom |= mergeScore(M, SCCScore, Other.SCCScore);
      if (Other.hasPendingEvent(SCC_WRITE)) {
        unsigned OldEventsHasSCCWrite = OldEvents & (1 << SCC_WRITE);
        if (!OldEventsHasSCCWrite) {
          PendingSCCWrite = Other.PendingSCCWrite;
        } else if (PendingSCCWrite != Other.PendingSCCWrite) {
          PendingSCCWrite = nullptr;
        }
      }
    }

    for (int J = 0; J <= VgprUB; J++)
      StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);

    if (isSmemCounter(T)) {
      unsigned Idx = getSgprScoresIdx(T);
      for (int J = 0; J <= SgprUB; J++)
        StrictDom |=
            mergeScore(M, SgprScores[Idx][J], Other.SgprScores[Idx][J]);
    }
  }

  for (int J = 0; J <= VgprUB; J++) {
    unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
    StrictDom |= NewVmemTypes != VgprVmemTypes[J];
    VgprVmemTypes[J] = NewVmemTypes;
  }

  return StrictDom;
}
 
static bool isWaitInstr(MachineInstr &Inst) {
  unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
  return Opcode == AMDGPU::S_WAITCNT ||
         (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
          Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
         Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
         Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
         Opcode == AMDGPU::S_WAITCNT_lds_direct ||
         counterTypeForInstr(Opcode).has_value();
}
 
 
// Generate s_waitcnt instructions where needed.
bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                            MachineBasicBlock &Block,
                                            WaitcntBrackets &ScoreBrackets) {
  bool Modified = false;

  LLVM_DEBUG({
    dbgs() << "*** Begin Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  // Track the correctness of vccz through this basic block. There are two
  // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
  // ST->partialVCCWritesUpdateVCCZ().
  bool VCCZCorrect = true;
  if (ST->hasReadVCCZBug()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc and then issued an smem load.
    VCCZCorrect = false;
  } else if (!ST->partialVCCWritesUpdateVCCZ()) {
    // vccz could be incorrect at a basic block boundary if a predecessor wrote
    // to vcc_lo or vcc_hi.
    VCCZCorrect = false;
  }

  // Walk over the instructions.
  MachineInstr *OldWaitcntInstr = nullptr;

  for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
                                         E = Block.instr_end();
       Iter != E;) {
    MachineInstr &Inst = *Iter;

    // Track pre-existing waitcnts that were added in earlier iterations or by
    // the memory legalizer.
    if (isWaitInstr(Inst)) {
      if (!OldWaitcntInstr)
        OldWaitcntInstr = &Inst;
      ++Iter;
      continue;
    }

    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
                      isPreheaderToFlush(Block, ScoreBrackets);

    // Generate an s_waitcnt instruction to be placed before Inst, if needed.
    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
                                          FlushVmCnt);
    OldWaitcntInstr = nullptr;

    // Restore vccz if it's not known to be correct already.
    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);

    // Don't examine operands unless we need to track vccz correctness.
    if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
      if (Inst.definesRegister(AMDGPU::VCC_LO, /*TRI=*/nullptr) ||
          Inst.definesRegister(AMDGPU::VCC_HI, /*TRI=*/nullptr)) {
        // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
        if (!ST->partialVCCWritesUpdateVCCZ())
          VCCZCorrect = false;
      } else if (Inst.definesRegister(AMDGPU::VCC, /*TRI=*/nullptr)) {
        // On affected targets an SMRD instruction may corrupt the vccz bit, so
        // a write to vcc while an smem read is outstanding may be clobbered as
        // soon as any read completes.
        if (ST->hasReadVCCZBug() &&
            ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
          VCCZCorrect = false;
        } else {
          // Writes to vcc will fix any incorrect value in vccz.
          VCCZCorrect = true;
        }
      }
    }

    if (TII->isSMRD(Inst)) {
      for (const MachineMemOperand *Memop : Inst.memoperands()) {
        // No need to handle invariant loads when avoiding WAR conflicts, as
        // there cannot be a vector store to the same memory location.
        if (!Memop->isInvariant()) {
          const Value *Ptr = Memop->getValue();
          SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
        }
      }
      if (ST->hasReadVCCZBug()) {
        // This smem read could complete and clobber vccz at any time.
        VCCZCorrect = false;
      }
    }

    updateEventWaitcntAfter(Inst, &ScoreBrackets);

    Modified |= insertForcedWaitAfter(Inst, Block, ScoreBrackets);

    LLVM_DEBUG({
      Inst.print(dbgs());
      ScoreBrackets.dump();
    });

    if (RestoreVCCZ) {
      // Restore the vccz bit: any write to vcc updates vccz, so read the
      // current value of vcc and write it back.
      BuildMI(Block, Inst, Inst.getDebugLoc(),
              TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
              TRI->getVCC())
          .addReg(TRI->getVCC());
      VCCZCorrect = true;
      Modified = true;
    }

    ++Iter;
  }

  // Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block
  // if needed.
  AMDGPU::Waitcnt Wait;
  if (Block.getFirstTerminator() == Block.end() &&
      isPreheaderToFlush(Block, ScoreBrackets)) {
    if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
      Wait.LoadCnt = 0;
    if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
      Wait.SampleCnt = 0;
    if (ScoreBrackets.hasPendingEvent(BVH_CNT))
      Wait.BvhCnt = 0;
  }

  // Combine or remove any redundant waitcnts at the end of the block.
  Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
                              OldWaitcntInstr);

  LLVM_DEBUG({
    dbgs() << "*** End Block: ";
    Block.printName(dbgs());
    ScoreBrackets.dump();
  });

  return Modified;
}
 
// Return true if the given machine basic block is a preheader of a loop in
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(
    MachineBasicBlock &MBB, const WaitcntBrackets &ScoreBrackets) {
  auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
  if (!IsInserted)
    return Iterator->second;

  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
  if (!Succ)
    return false;

  MachineLoop *Loop = MLI->getLoopFor(Succ);
  if (!Loop)
    return false;

  if (Loop->getLoopPreheader() == &MBB &&
      shouldFlushVmCnt(Loop, ScoreBrackets)) {
    Iterator->second = true;
    return true;
  }

  return false;
}
 
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
  if (SIInstrInfo::isFLAT(MI))
    return TII->mayAccessVMEMThroughFlat(MI);
  return SIInstrInfo::isVMEM(MI);
}
 
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s) but no vmem load, and at least one use of
//    a vgpr containing a value loaded outside of the loop (only on targets
//    with no vscnt counter).
// 2. The loop contains vmem load(s), but the loaded values are not used in the
//    loop, and at least one use of a vgpr containing a value loaded outside of
//    the loop.
bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
                                        const WaitcntBrackets &Brackets) {
  bool HasVMemLoad = false;
  bool HasVMemStore = false;
  bool UsesVgprLoadedOutside = false;
  DenseSet<Register> VgprUse;
  DenseSet<Register> VgprDef;

  for (MachineBasicBlock *MBB : ML->blocks()) {
    for (MachineInstr &MI : *MBB) {
      if (isVMEMOrFlatVMEM(MI)) {
        HasVMemLoad |= MI.mayLoad();
        HasVMemStore |= MI.mayStore();
      }
      for (const MachineOperand &Op : MI.all_uses()) {
        if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
          continue;
        RegInterval Interval = Brackets.getRegInterval(&MI, Op);
        // Vgpr use.
        for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
          // If we find a register that is loaded inside the loop, 1. and 2.
          // are invalidated and we can exit.
          if (VgprDef.contains(RegNo))
            return false;
          VgprUse.insert(RegNo);
          // If at least one of Op's registers is in the score brackets, the
          // value is likely loaded outside of the loop.
          if (Brackets.getRegScore(RegNo, LOAD_CNT) >
                  Brackets.getScoreLB(LOAD_CNT) ||
              Brackets.getRegScore(RegNo, SAMPLE_CNT) >
                  Brackets.getScoreLB(SAMPLE_CNT) ||
              Brackets.getRegScore(RegNo, BVH_CNT) >
                  Brackets.getScoreLB(BVH_CNT)) {
            UsesVgprLoadedOutside = true;
            break;
          }
        }
      }

      // VMem load vgpr def.
      if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
        for (const MachineOperand &Op : MI.all_defs()) {
          RegInterval Interval = Brackets.getRegInterval(&MI, Op);
          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
            // If we find a register that is loaded inside the loop, 1. and 2.
            // are invalidated and we can exit.
            if (VgprUse.contains(RegNo))
              return false;
            VgprDef.insert(RegNo);
          }
        }
      }
    }
  }
  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
    return true;
  return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder();
}
 
bool SIInsertWaitcntsLegacy::runOnMachineFunction(MachineFunction &MF) {
  auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
  auto *PDT =
      &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  AliasAnalysis *AA = nullptr;
  if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
    AA = &AAR->getAAResults();

  return SIInsertWaitcnts(MLI, PDT, AA).run(MF);
}

PreservedAnalyses
SIInsertWaitcntsPass::run(MachineFunction &MF,
                          MachineFunctionAnalysisManager &MFAM) {
  // ...
  if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF))
    return PreservedAnalyses::all();

  return getMachineFunctionPassPreservedAnalyses()
      .preserveSet<CFGAnalyses>()
      .preserve<AAManager>();
}
 
 
bool SIInsertWaitcnts::run(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // ...

  if (ST->hasExtendedWaitCounts()) {
    MaxCounter = NUM_EXTENDED_INST_CNTS;
    WCGGFX12Plus = WaitcntGeneratorGFX12Plus(MF, MaxCounter);
    WCG = &WCGGFX12Plus;
  } else {
    MaxCounter = NUM_NORMAL_INST_CNTS;
    WCGPreGFX12 = WaitcntGeneratorPreGFX12(MF);
    WCG = &WCGPreGFX12;
  }

  for (auto T : inst_counter_types())
    ForceEmitWaitcnt[T] = false;

  WaitEventMaskForInst = WCG->getWaitEventMask();

  SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);

  // Hardware counter limits depend on whether the target uses the extended
  // (gfx12+) or the legacy counter encoding.
  if (ST->hasExtendedWaitCounts()) {
    // ...
  }

  [[maybe_unused]] unsigned NumVGPRsMax =
      ST->getAddressableNumVGPRs(MFI->getDynamicVGPRBlockSize());
  [[maybe_unused]] unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
  assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
  assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
 
  BlockInfos.clear();
  bool Modified = false;

  MachineBasicBlock &EntryBB = MF.front();
  MachineBasicBlock::iterator I = EntryBB.begin();

  if (!MFI->isEntryFunction()) {
    // Non-kernel functions conservatively wait on entry for any counters
    // their incoming argument registers may still depend on. Skip over PHIs
    // and meta instructions before inserting the waits.
    for (MachineBasicBlock::iterator E = EntryBB.end();
         I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
      ;

    if (ST->hasExtendedWaitCounts()) {
      // ...
      for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
        if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
          continue;

        if (!ST->hasImageInsts() &&
            (CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
          continue;

        BuildMI(EntryBB, I, DebugLoc(),
                TII->get(instrsForExtendedCounterTypes[CT]))
            .addImm(0);
      }
    } else {
      BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
    }

    auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
    NonKernelInitialState->setStateOnFunctionEntryOrReturn();
    BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

    Modified = true;
  }

  // Seed the per-block info in reverse post order.
  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
    BlockInfos.try_emplace(MBB);
 
  std::unique_ptr<WaitcntBrackets> Brackets;

  // Keep iterating over the blocks in reverse post order, inserting and
  // updating s_waitcnt where needed, until a fixed point is reached.
  bool Repeat;
  do {
    Repeat = false;

    for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
         ++BII) {
      MachineBasicBlock *MBB = BII->first;
      BlockInfo &BI = BII->second;
      if (!BI.Dirty)
        continue;

      if (BI.Incoming) {
        if (!Brackets)
          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
        else
          *Brackets = *BI.Incoming;
      } else {
        if (!Brackets) {
          Brackets = std::make_unique<WaitcntBrackets>(this);
        } else {
          // Reinitialize in place rather than assigning from a temporary:
          // WaitcntBrackets is large, and a temporary would burn stack space.
          Brackets->~WaitcntBrackets();
          new (Brackets.get()) WaitcntBrackets(this);
        }
      }

      BI.Dirty = false;

      Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);

      if (Brackets->hasPendingEvent()) {
        BlockInfo *MoveBracketsToSucc = nullptr;
        for (MachineBasicBlock *Succ : MBB->successors()) {
          auto *SuccBII = BlockInfos.find(Succ);
          BlockInfo &SuccBI = SuccBII->second;
          if (!SuccBI.Incoming) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
            if (!MoveBracketsToSucc) {
              MoveBracketsToSucc = &SuccBI;
            } else {
              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
            }
          } else if (SuccBI.Incoming->merge(*Brackets)) {
            SuccBI.Dirty = true;
            if (SuccBII <= BII)
              Repeat = true;
          }
        }
        if (MoveBracketsToSucc)
          MoveBracketsToSucc->Incoming = std::move(Brackets);
      }
    }
  } while (Repeat);
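
// NOTE (illustrative only, not part of the pass): the explicit destructor
// call plus placement new above resets the WaitcntBrackets object inside its
// existing heap allocation instead of assigning from a freshly constructed
// temporary, which would put a second large object on the stack. A minimal
// standalone analogue, with an invented Big type, might look like:
//
//   #include <memory>
//   #include <new>
//   struct Big { char State[4096] = {}; };
//   void resetInPlace(std::unique_ptr<Big> &P) {
//     P->~Big();            // end the old object's lifetime
//     new (P.get()) Big();  // construct a new one in the same storage
//   }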
 
  if (ST->hasScalarStores()) {
    SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
    bool HaveScalarStores = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : MBB) {
        if (!HaveScalarStores && TII->isScalarStore(MI))
          HaveScalarStores = true;

        if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
          EndPgmBlocks.push_back(&MBB);
      }
    }

    if (HaveScalarStores) {
      // If scalar writes are used, the cache must be flushed before the wave
      // terminates, or the next wave reusing the same scratch memory could
      // observe stale data. Insert s_dcache_wb at program-end points unless
      // a flush is already pending there.
      for (MachineBasicBlock *MBB : EndPgmBlocks) {
        bool SeenDCacheWB = false;

        for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
             I != E; ++I) {
          if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
            SeenDCacheWB = true;
          else if (TII->isScalarStore(*I))
            SeenDCacheWB = false;

          if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
               I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
              !SeenDCacheWB) {
            Modified = true;
            BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
          }
        }
      }
    }
  }
 
  // Deallocate VGPRs before the previously identified program-end points.
  if (MFI->isDynamicVGPREnabled()) {
    for (MachineInstr *MI : ReleaseVGPRInsts) {
      // ...
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::S_ALLOC_VGPR))
          .addImm(0);
      Modified = true;
    }
  } else {
    // Send the DEALLOC_VGPRS message only when it is likely to help: skip it
    // if the function makes calls or if occupancy is not limited by VGPRs.
    if (!ReleaseVGPRInsts.empty() &&
        (MF.getFrameInfo().hasCalls() ||
         ST->getOccupancyWithNumVGPRs(
             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
             MFI->getDynamicVGPRBlockSize()) <
             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
      for (MachineInstr *MI : ReleaseVGPRInsts) {
        if (ST->requiresNopBeforeDeallocVGPRs()) {
          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                  TII->get(AMDGPU::S_NOP))
              .addImm(0);
        }
        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                TII->get(AMDGPU::S_SENDMSG))
            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
        Modified = true;
      }
    }
  }
  ReleaseVGPRInsts.clear();
  PreheadersToFlush.clear();
  SLoadAddresses.clear();

  return Modified;
}
 