33 "amdgpu-barrier-signal-wait-latency",
34 cl::desc(
"Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
35 "to encourage scheduling independent work between them"),
48 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"wavefront"));
49 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"wavefront-one-as"));
50 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"singlethread-one-as"));
53 if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
55 IgnoredScopes.
insert(Context.getOrInsertSyncScopeID(
"workgroup"));
58 void apply(ScheduleDAGInstrs *DAG)
override;
63 SDep ForwardD = PredDep;
66 if (SuccDep == ForwardD) {
78 SDep ForwardD = PredDep;
81 if (SuccDep == ForwardD) {
92 const SIInstrInfo *
TII =
static_cast<const SIInstrInfo *
>(DAG->
TII);
93 constexpr unsigned FenceLatency = 2000;
99 for (SUnit &SU : DAG->
SUnits) {
101 unsigned Op =
MI->getOpcode();
103 if (
Op == AMDGPU::ATOMIC_FENCE) {
111 for (SDep &PredDep : SU.
Preds) {
117 if (!
MI->mayLoad() ||
MI->mayStore())
120 addLatencyToEdge(PredDep, SU,
121 SchedModel ? SchedModel->computeInstrLatency(
MI,
false)
124 }
else if (
Op == AMDGPU::S_BARRIER_WAIT) {
125 for (SDep &PredDep : SU.
Preds) {
127 const MachineInstr *PredMI = PredSU->
getInstr();
129 addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
132 }
else if (
TII->isLDSDMA(*
MI)) {
137 }
else if (
Op == AMDGPU::S_WAIT_TENSORCNT ||
138 Op == AMDGPU::S_WAIT_ASYNCCNT) {
139 auto needWaitFor = [&](SmallVectorImpl<SUnit *> &RegionLDSDMA, SUnit *SU,
141 if (RegionLDSDMA.
size() <=
static_cast<uint64_t
>(
Count)) {
146 auto I = RegionLDSDMA.
rbegin(),
E = RegionLDSDMA.
rend();
147 for (;
I !=
E;
I++) {
148 if (Counter >=
Count)
151 if (SU->NodeNum == (*I)->NodeNum)
159 int64_t WaitVal =
MI->getOperand(0).getImm();
160 for (SDep &PredDep : SU.Preds) {
161 if (PredDep.
getKind() != SDep::Kind::Data)
165 Register LDSDMACnt = AMDGPU::TENSORcnt;
167 if (
Op == AMDGPU::S_WAIT_ASYNCCNT) {
168 LDSDMACnt = AMDGPU::ASYNCcnt;
172 if (DepReg != LDSDMACnt)
183 if (!needWaitFor(
Op == AMDGPU::S_WAIT_ASYNCCNT ? RegionAsync
186 setLatencyForEdge(PredDep, SU, 1);
195std::unique_ptr<ScheduleDAGMutation>
197 return std::make_unique<BarrierLatency>(MF);
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Promote Memory to Register
Interface definition for SIInstrInfo.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
This is an important class for using LLVM in a threaded context.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Kind getKind() const
Returns an enum value representing the kind of the dependence.
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must elapse between the predecessor and the successor, given that they have this edge between them.
Register getReg() const
Returns the register associated with this edge.
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Scheduling unit. This is a node in the scheduling DAG.
SmallVector< SDep, 4 > Succs
All sunit successors.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next time getDepth() is called.
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
Mutate the DAG as a postpass after normal DAG building.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
void push_back(const T &Elt)
reverse_iterator rbegin()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
FunctionAddr VTableAddr Count
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
DWARFExpression::Operation Op