LLVM 23.0.0git
AMDGPUBarrierLatency.cpp
Go to the documentation of this file.
1//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file contains a DAG scheduling mutation to add latency to:
10/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding
11/// memory accesses potentially affected by the fence.
12/// This encourages the scheduling of more instructions before
13/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
14/// introduce wait counting or indicate an impending S_BARRIER
15/// wait. Having more instructions in-flight across these
16/// constructs improves latency hiding.
17/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
18/// This encourages independent work to be scheduled between
19/// signal and wait, hiding barrier synchronization latency.
20//
21//===----------------------------------------------------------------------===//
22
24#include "GCNSubtarget.h"
26#include "SIInstrInfo.h"
29
30using namespace llvm;
31
33 "amdgpu-barrier-signal-wait-latency",
34 cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
35 "to encourage scheduling independent work between them"),
36 cl::init(16), cl::Hidden);
37
38namespace {
39
40class BarrierLatency : public ScheduleDAGMutation {
41private:
42 SmallSet<SyncScope::ID, 4> IgnoredScopes;
43
44public:
45 BarrierLatency(MachineFunction *MF) {
46 LLVMContext &Context = MF->getFunction().getContext();
47 IgnoredScopes.insert(SyncScope::SingleThread);
48 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
49 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
50 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
51
52 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
53 if (!ST.requiresWaitOnWorkgroupReleaseFence()) {
54 // Prior to GFX10 workgroup scope does not normally require waitcnts
55 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("workgroup"));
56 }
57 }
58 void apply(ScheduleDAGInstrs *DAG) override;
59};
60
61void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
62 SUnit *PredSU = PredDep.getSUnit();
63 SDep ForwardD = PredDep;
64 ForwardD.setSUnit(&SU);
65 for (SDep &SuccDep : PredSU->Succs) {
66 if (SuccDep == ForwardD) {
67 SuccDep.setLatency(SuccDep.getLatency() + Latency);
68 break;
69 }
70 }
71 PredDep.setLatency(PredDep.getLatency() + Latency);
72 PredSU->setDepthDirty();
73 SU.setDepthDirty();
74}
75
76void setLatencyForEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
77 SUnit *PredSU = PredDep.getSUnit();
78 SDep ForwardD = PredDep;
79 ForwardD.setSUnit(&SU);
80 for (SDep &SuccDep : PredSU->Succs) {
81 if (SuccDep == ForwardD) {
82 SuccDep.setLatency(Latency);
83 break;
84 }
85 }
86 PredDep.setLatency(Latency);
87 PredSU->setDepthDirty();
88 SU.setDepthDirty();
89}
90
91void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
92 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
93 constexpr unsigned FenceLatency = 2000;
94 const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
96 SmallVector<SUnit *, 8> RegionAsync;
97 const TargetSchedModel *SchedModel = DAG->getSchedModel();
98
99 for (SUnit &SU : DAG->SUnits) {
100 const MachineInstr *MI = SU.getInstr();
101 unsigned Op = MI->getOpcode();
102
103 if (Op == AMDGPU::ATOMIC_FENCE) {
104 // Update latency on barrier edges of ATOMIC_FENCE.
105 // Ignore scopes not expected to have any latency.
106 SyncScope::ID SSID =
107 static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
108 if (IgnoredScopes.contains(SSID))
109 continue;
110
111 for (SDep &PredDep : SU.Preds) {
112 if (!PredDep.isBarrier())
113 continue;
114 SUnit *PredSU = PredDep.getSUnit();
115 MachineInstr *MI = PredSU->getInstr();
116 // Only consider memory loads
117 if (!MI->mayLoad() || MI->mayStore())
118 continue;
119
120 addLatencyToEdge(PredDep, SU,
121 SchedModel ? SchedModel->computeInstrLatency(MI, false)
122 : FenceLatency);
123 }
124 } else if (Op == AMDGPU::S_BARRIER_WAIT) {
125 for (SDep &PredDep : SU.Preds) {
126 SUnit *PredSU = PredDep.getSUnit();
127 const MachineInstr *PredMI = PredSU->getInstr();
128 if (TII->isBarrierStart(PredMI->getOpcode())) {
129 addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
130 }
131 }
132 } else if (TII->isLDSDMA(*MI)) {
133 if (MI->getDesc().TSFlags & SIInstrFlags::TENSOR_CNT)
134 RegionTDM.push_back(&SU);
135 else if (MI->getDesc().TSFlags & SIInstrFlags::ASYNC_CNT)
136 RegionAsync.push_back(&SU);
137 } else if (Op == AMDGPU::S_WAIT_TENSORCNT ||
138 Op == AMDGPU::S_WAIT_ASYNCCNT) {
139 auto needWaitFor = [&](SmallVectorImpl<SUnit *> &RegionLDSDMA, SUnit *SU,
140 int64_t Count) {
141 if (RegionLDSDMA.size() <= static_cast<uint64_t>(Count)) {
142 return false;
143 }
144
145 int64_t Counter = 0;
146 auto I = RegionLDSDMA.rbegin(), E = RegionLDSDMA.rend();
147 for (; I != E; I++) {
148 if (Counter >= Count)
149 return true;
150
151 if (SU->NodeNum == (*I)->NodeNum)
152 return false;
153
154 ++Counter;
155 }
156 llvm_unreachable("Malformed RegionLDSDMA");
157 };
158
159 int64_t WaitVal = MI->getOperand(0).getImm();
160 for (SDep &PredDep : SU.Preds) {
161 if (PredDep.getKind() != SDep::Kind::Data)
162 continue;
163
164 Register DepReg = PredDep.getReg();
165 Register LDSDMACnt = AMDGPU::TENSORcnt;
166 uint64_t LDSDMAFlags = SIInstrFlags::TENSOR_CNT;
167 if (Op == AMDGPU::S_WAIT_ASYNCCNT) {
168 LDSDMACnt = AMDGPU::ASYNCcnt;
169 LDSDMAFlags = SIInstrFlags::ASYNC_CNT;
170 }
171
172 if (DepReg != LDSDMACnt)
173 continue;
174
175 SUnit *PredSU = PredDep.getSUnit();
176
177 // The data dep can be carried by a non-LDSDMA SU
178 // (e.g. an intervening COPY or pseudo). Such predecessors are not
179 // tracked, so needWaitFor cannot reason about them.
180 if (!(PredSU->getInstr()->getDesc().TSFlags & LDSDMAFlags))
181 continue;
182
183 if (!needWaitFor(Op == AMDGPU::S_WAIT_ASYNCCNT ? RegionAsync
184 : RegionTDM,
185 PredSU, WaitVal)) {
186 setLatencyForEdge(PredDep, SU, 1);
187 }
188 }
189 }
190 }
191}
192
193} // end namespace
194
195std::unique_ptr<ScheduleDAGMutation>
197 return std::make_unique<BarrierLatency>(MF);
198}
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Promote Memory to Register
Definition Mem2Reg.cpp:110
Interface definition for SIInstrInfo.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MCInstrDesc & getDesc() const
Returns the target instruction descriptor of this MachineInstr.
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
Kind getKind() const
Returns an enum value representing the kind of the dependence.
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
void setSUnit(SUnit *SU)
Register getReg() const
Returns the register associated with this edge.
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Scheduling unit. This is a node in the scheduling DAG.
SmallVector< SDep, 4 > Succs
All sunit successors.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
const TargetSchedModel * getSchedModel() const
Gets the machine model for instruction scheduling.
Mutate the DAG as a postpass after normal DAG building.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:229
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
void push_back(const T &Elt)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
DWARFExpression::Operation Op