doxygen/AMDGPUBarrierLatency_8cpp_source.html

//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//

//

// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

// See https://llvm.org/LICENSE.txt for license information.

// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

//

//===----------------------------------------------------------------------===//

//

/// \file
/// This file contains a DAG scheduling mutation to add latency to:

///       1. Barrier edges between ATOMIC_FENCE instructions and preceding

///          memory accesses potentially affected by the fence.

///          This encourages the scheduling of more instructions before

///          ATOMIC_FENCE instructions.  ATOMIC_FENCE instructions may

///          introduce wait counting or indicate an impending S_BARRIER

///          wait.  Having more instructions in-flight across these

///          constructs improves latency hiding.

///       2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.

///          This encourages independent work to be scheduled between

///          signal and wait, hiding barrier synchronization latency.

//

//===----------------------------------------------------------------------===//


#include "AMDGPUBarrierLatency.h"

#include "GCNSubtarget.h"

#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "SIInstrInfo.h"

#include "llvm/CodeGen/ScheduleDAGInstrs.h"

#include "llvm/Support/CommandLine.h"


using namespace llvm;


// Hidden command-line knob (default 16) holding the synthetic latency that
// BarrierLatency::apply adds to edges from barrier-start instructions to
// S_BARRIER_WAIT.
static cl::opt<unsigned> BarrierSignalWaitLatencyOpt(

    "amdgpu-barrier-signal-wait-latency",

    cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "

             "to encourage scheduling independent work between them"),

    cl::init(16), cl::Hidden);


namespace {


/// Scheduling DAG mutation that adds synthetic latency to selected barrier
/// edges (see apply()).
class BarrierLatency : public ScheduleDAGMutation {
private:
  /// Synchronization scopes whose ATOMIC_FENCE instructions are skipped by
  /// apply(), i.e. scopes not expected to have any latency.
  SmallSet<SyncScope::ID, 4> IgnoredScopes;

public:
  /// Builds the set of ignored synchronization scopes for \p MF.
  ///
  /// The single-thread and wavefront-level scopes are always ignored. The
  /// "workgroup" scope is ignored as well when the subtarget reports that it
  /// does not require a wait on a workgroup release fence.
  ///
  /// The constructor is explicit so that a MachineFunction pointer is never
  /// converted to a mutation implicitly.
  explicit BarrierLatency(MachineFunction *MF) {
    LLVMContext &Context = MF->getFunction().getContext();
    IgnoredScopes.insert(SyncScope::SingleThread);
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
    IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));

    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
      // Prior to GFX10 workgroup scope does not normally require waitcnts
      IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
    }
  }

  /// Applies the latency adjustments to every scheduling unit of \p DAG.
  void apply(ScheduleDAGInstrs *DAG) override;
};


/// Adds \p Latency to the dependency edge \p PredDep leading into \p SU.
///
/// \param PredDep Edge taken from the predecessor list of \p SU (this is how
///                the callers in this file use it).
/// \param SU      The scheduling unit at the successor end of the edge.
/// \param Latency Number of cycles to add to the edge's current latency.
///
/// The edge is updated in both places it is stored: the matching entry in the
/// predecessor's successor list and \p PredDep itself. Cached depth/height
/// values affected by the change are then marked dirty.
void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
  SUnit *PredSU = PredDep.getSUnit();

  // Build the successor-side view of the same edge so that the mirrored entry
  // in PredSU->Succs can be located by equality.
  SDep ForwardD = PredDep;
  ForwardD.setSUnit(&SU);

  // Update the mirrored entry first, while PredDep still carries the original
  // latency that ForwardD was copied from.
  for (SDep &SuccDep : PredSU->Succs) {
    if (SuccDep == ForwardD) {
      SuccDep.setLatency(SuccDep.getLatency() + Latency);
      break;
    }
  }
  PredDep.setLatency(PredDep.getLatency() + Latency);

  // Lengthening the PredSU -> SU edge can lengthen the paths that leave
  // PredSU, so its cached height must be invalidated in addition to the
  // depths below. Invalidating is always safe: the value is recomputed on the
  // next query.
  PredSU->setHeightDirty();
  PredSU->setDepthDirty();
  SU.setDepthDirty();
}


/// Adds synthetic latency to barrier-related edges of \p DAG.
///
/// Two kinds of scheduling units are adjusted:
///  - ATOMIC_FENCE whose synchronization scope is not in IgnoredScopes: every
///    barrier edge coming from a predecessor that may load and may not store
///    gets FenceLatency extra cycles.
///  - S_BARRIER_WAIT: every edge coming from a predecessor whose opcode
///    satisfies SIInstrInfo::isBarrierStart gets the latency configured via
///    BarrierSignalWaitLatencyOpt.
void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
  constexpr unsigned FenceLatency = 2000;
  const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;

  for (SUnit &SU : DAG->SUnits) {
    const MachineInstr *MI = SU.getInstr();
    unsigned Op = MI->getOpcode();

    if (Op == AMDGPU::ATOMIC_FENCE) {
      // Update latency on barrier edges of ATOMIC_FENCE.
      // Ignore scopes not expected to have any latency.
      SyncScope::ID SSID =
          static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
      if (IgnoredScopes.contains(SSID))
        continue;

      for (SDep &PredDep : SU.Preds) {
        if (!PredDep.isBarrier())
          continue;
        // Named PredMI (not MI) so the fence instruction above is not
        // shadowed.
        const MachineInstr *PredMI = PredDep.getSUnit()->getInstr();
        // Only consider memory loads: skip anything that does not load or
        // that may also store.
        if (!PredMI->mayLoad() || PredMI->mayStore())
          continue;
        addLatencyToEdge(PredDep, SU, FenceLatency);
      }
    } else if (Op == AMDGPU::S_BARRIER_WAIT) {
      // Stretch edges from barrier-start instructions to this wait.
      for (SDep &PredDep : SU.Preds) {
        const MachineInstr *PredMI = PredDep.getSUnit()->getInstr();
        if (TII->isBarrierStart(PredMI->getOpcode()))
          addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
      }
    }
  }
}


} // end namespace


/// Factory for the barrier latency mutation: constructs a BarrierLatency for
/// \p MF and hands ownership to the caller through the ScheduleDAGMutation
/// interface.
std::unique_ptr<ScheduleDAGMutation>
llvm::createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF) {
  std::unique_ptr<ScheduleDAGMutation> Mutation =
      std::make_unique<BarrierLatency>(MF);
  return Mutation;
}


BarrierSignalWaitLatencyOpt
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)

AMDGPUBarrierLatency.h

AMDGPUMCTargetDesc.h
Provides AMDGPU specific target descriptions.

CommandLine.h

GCNSubtarget.h
AMD GCN specific subclass of TargetSubtarget.

TII
const HexagonInstrInfo * TII
Definition HexagonCopyToCombine.cpp:118

MI
IRTranslator LLVM IR MI
Definition IRTranslator.cpp:110

SIInstrInfo.h
Interface definition for SIInstrInfo.

ScheduleDAGInstrs.h

llvm::Function::getContext
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358

llvm::GCNSubtarget
Definition GCNSubtarget.h:34

llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68

llvm::MachineFunction
Definition MachineFunction.h:294

llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition MachineFunction.h:776

llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition MachineFunction.h:747

llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition MachineInstr.h:599

llvm::SDep
Scheduling dependency.
Definition ScheduleDAG.h:51

llvm::SDep::getSUnit
SUnit * getSUnit() const
Definition ScheduleDAG.h:507

llvm::SDep::setLatency
void setLatency(unsigned Lat)
Sets the latency for this edge.
Definition ScheduleDAG.h:147

llvm::SDep::getLatency
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Definition ScheduleDAG.h:142

llvm::SDep::setSUnit
void setSUnit(SUnit *SU)
Definition ScheduleDAG.h:510

llvm::SDep::isBarrier
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Definition ScheduleDAG.h:174

llvm::SUnit
Scheduling unit. This is a node in the scheduling DAG.
Definition ScheduleDAG.h:249

llvm::SUnit::Succs
SmallVector< SDep, 4 > Succs
All sunit successors.
Definition ScheduleDAG.h:270

llvm::SUnit::setDepthDirty
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
Definition ScheduleDAG.cpp:217

llvm::SUnit::Preds
SmallVector< SDep, 4 > Preds
All sunit predecessors.
Definition ScheduleDAG.h:269

llvm::SUnit::getInstr
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition ScheduleDAG.h:399

llvm::ScheduleDAGInstrs
A ScheduleDAG for scheduling lists of MachineInstr.
Definition ScheduleDAGInstrs.h:118

llvm::ScheduleDAGMutation
Mutate the DAG as a postpass after normal DAG building.
Definition ScheduleDAGMutation.h:24

llvm::ScheduleDAG::TII
const TargetInstrInfo * TII
Target instruction information.
Definition ScheduleDAG.h:584

llvm::ScheduleDAG::SUnits
std::vector< SUnit > SUnits
The scheduling units.
Definition ScheduleDAG.h:588

llvm::SmallSet
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134

llvm::SmallSet::contains
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229

llvm::SmallSet::insert
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184

llvm::cl::opt
Definition CommandLine.h:1454

llvm::SyncScope::SingleThread
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55

llvm::SyncScope::ID
uint8_t ID
Definition LLVMContext.h:47

llvm::cl::Hidden
@ Hidden
Definition CommandLine.h:138

llvm::cl::apply
void apply(Opt *O, const Mod &M, const Mods &... Ms)
Definition CommandLine.h:1340

llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition CommandLine.h:444

llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26

llvm::createAMDGPUBarrierLatencyDAGMutation
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
Definition AMDGPUBarrierLatency.cpp:118

llvm::Latency
@ Latency
Definition SIMachineScheduler.h:34

llvm::getImm
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
Definition SPIRVUtils.cpp:1082

llvm::Op
DWARFExpression::Operation Op
Definition DWARFExpressionPrinter.cpp:22

llvm::cl::desc
Definition CommandLine.h:410