LLVM 22.0.0git
AMDGPUCustomBehaviour.cpp
Go to the documentation of this file.
//===------------------ AMDGPUCustomBehaviour.cpp -----------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements methods from the AMDGPUCustomBehaviour class.
11///
12//===----------------------------------------------------------------------===//
13
21
22namespace llvm::mca {
23
25 std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
26 switch (MCI.getOpcode()) {
27 case AMDGPU::S_WAITCNT:
28 case AMDGPU::S_WAITCNT_soft:
29 case AMDGPU::S_WAITCNT_EXPCNT:
30 case AMDGPU::S_WAITCNT_LGKMCNT:
31 case AMDGPU::S_WAITCNT_VMCNT:
32 case AMDGPU::S_WAITCNT_VSCNT:
33 case AMDGPU::S_WAITCNT_VSCNT_soft:
34 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
35 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
36 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
37 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
38 case AMDGPU::S_WAITCNT_gfx10:
39 case AMDGPU::S_WAITCNT_gfx6_gfx7:
40 case AMDGPU::S_WAITCNT_vi:
41 return processWaitCnt(Inst, MCI);
42 }
43}
44
45// s_waitcnt instructions encode important information as immediate operands
46// which are lost during the MCInst -> mca::Instruction lowering.
47void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
48 const MCInst &MCI) {
49 for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
51 const MCOperand &MCOp = MCI.getOperand(Idx);
52 if (MCOp.isReg()) {
54 } else if (MCOp.isImm()) {
56 }
57 Op.setIndex(Idx);
58 Inst->addOperand(Op);
59 }
60}
61
68
70 const InstRef &IR) {
71 const Instruction &Inst = *IR.getInstruction();
72 unsigned Opcode = Inst.getOpcode();
73
74 // llvm-mca is generally run on fully compiled assembly so we wouldn't see any
75 // pseudo instructions here. However, there are plans for the future to make
76 // it possible to use mca within backend passes. As such, I have left the
77 // pseudo version of s_waitcnt within this switch statement.
78 switch (Opcode) {
79 default:
80 return 0;
81 case AMDGPU::S_WAITCNT: // This instruction
82 case AMDGPU::S_WAITCNT_soft:
83 case AMDGPU::S_WAITCNT_EXPCNT:
84 case AMDGPU::S_WAITCNT_LGKMCNT:
85 case AMDGPU::S_WAITCNT_VMCNT:
86 case AMDGPU::S_WAITCNT_VSCNT:
87 case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
88 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
89 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
90 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
91 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
92 case AMDGPU::S_WAITCNT_gfx10:
93 case AMDGPU::S_WAITCNT_gfx6_gfx7:
94 case AMDGPU::S_WAITCNT_vi:
95 // s_endpgm also behaves as if there is an implicit
96 // s_waitcnt 0, but I'm not sure if it would be appropriate
97 // to model this in llvm-mca based on how the iterations work
98 // while simulating the pipeline over and over.
99 return handleWaitCnt(IssuedInst, IR);
100 }
101
102 return 0;
103}
104
105unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
106 const InstRef &IR) {
107 // Currently, all s_waitcnt instructions are handled except s_waitcnt_depctr.
108 // I do not know how that instruction works so I did not attempt to model it.
109 // set the max values to begin
110 unsigned Vmcnt = 63;
111 unsigned Expcnt = 7;
112 unsigned Lgkmcnt = 31;
113 unsigned Vscnt = 63;
114 unsigned CurrVmcnt = 0;
115 unsigned CurrExpcnt = 0;
116 unsigned CurrLgkmcnt = 0;
117 unsigned CurrVscnt = 0;
118 unsigned CyclesToWaitVm = ~0U;
119 unsigned CyclesToWaitExp = ~0U;
120 unsigned CyclesToWaitLgkm = ~0U;
121 unsigned CyclesToWaitVs = ~0U;
122
123 computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);
124
125 // We will now look at each of the currently executing instructions
126 // to find out if this wait instruction still needs to wait.
127 for (const InstRef &PrevIR : IssuedInst) {
128 const Instruction &PrevInst = *PrevIR.getInstruction();
129 const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
130 const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
131 const int CyclesLeft = PrevInst.getCyclesLeft();
132 assert(CyclesLeft != UNKNOWN_CYCLES &&
133 "We should know how many cycles are left for this instruction");
134 if (PrevInstWaitInfo.VmCnt) {
135 CurrVmcnt++;
136 if ((unsigned)CyclesLeft < CyclesToWaitVm)
137 CyclesToWaitVm = CyclesLeft;
138 }
139 if (PrevInstWaitInfo.ExpCnt) {
140 CurrExpcnt++;
141 if ((unsigned)CyclesLeft < CyclesToWaitExp)
142 CyclesToWaitExp = CyclesLeft;
143 }
144 if (PrevInstWaitInfo.LgkmCnt) {
145 CurrLgkmcnt++;
146 if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
147 CyclesToWaitLgkm = CyclesLeft;
148 }
149 if (PrevInstWaitInfo.VsCnt) {
150 CurrVscnt++;
151 if ((unsigned)CyclesLeft < CyclesToWaitVs)
152 CyclesToWaitVs = CyclesLeft;
153 }
154 }
155
156 unsigned CyclesToWait = ~0U;
157 if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
158 CyclesToWait = CyclesToWaitVm;
159 if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
160 CyclesToWait = CyclesToWaitExp;
161 if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
162 CyclesToWait = CyclesToWaitLgkm;
163 if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
164 CyclesToWait = CyclesToWaitVs;
165
166 // We may underestimate how many cycles we need to wait, but this
167 // isn't a big deal. Our return value is just how many cycles until
168 // this function gets run again. So as long as we don't overestimate
169 // the wait time, we'll still end up stalling at this instruction
170 // for the correct number of cycles.
171
172 if (CyclesToWait == ~0U)
173 return 0;
174 return CyclesToWait;
175}
176
177void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
178 unsigned &Expcnt, unsigned &Lgkmcnt,
179 unsigned &Vscnt) {
180 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
181 const Instruction &Inst = *IR.getInstruction();
182 unsigned Opcode = Inst.getOpcode();
183
184 switch (Opcode) {
185 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
186 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
187 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
188 case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
189 // Should probably be checking for nullptr
190 // here, but I'm not sure how I should handle the case
191 // where we see a nullptr.
192 const MCAOperand *OpReg = Inst.getOperand(0);
193 const MCAOperand *OpImm = Inst.getOperand(1);
194 assert(OpReg && OpReg->isReg() && "First operand should be a register.");
195 assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
196 if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
197 // Instruction is using a real register.
198 // Since we can't know what value this register will have,
199 // we can't compute what the value of this wait should be.
200 WithColor::warning() << "The register component of "
201 << MCII.getName(Opcode) << " will be completely "
202 << "ignored. So the wait may not be accurate.\n";
203 }
204 switch (Opcode) {
205 // Redundant switch so I don't have to repeat the code above
206 // for each case. There are more clever ways to avoid this
207 // extra switch and anyone can feel free to implement one of them.
208 case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
209 Expcnt = OpImm->getImm();
210 break;
211 case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
212 Lgkmcnt = OpImm->getImm();
213 break;
214 case AMDGPU::S_WAITCNT_VMCNT_gfx10:
215 Vmcnt = OpImm->getImm();
216 break;
217 case AMDGPU::S_WAITCNT_VSCNT_gfx10:
218 Vscnt = OpImm->getImm();
219 break;
220 }
221 return;
222 }
223 case AMDGPU::S_WAITCNT_gfx10:
224 case AMDGPU::S_WAITCNT_gfx6_gfx7:
225 case AMDGPU::S_WAITCNT_vi:
226 unsigned WaitCnt = Inst.getOperand(0)->getImm();
227 AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
228 return;
229 }
230}
231
232void AMDGPUCustomBehaviour::generateWaitCntInfo() {
233 // The core logic from this function is taken from
234 // SIInsertWaitcnts::updateEventWaitcntAfter() In that pass, the instructions
235 // that are being looked at are in the MachineInstr format, whereas we have
236 // access to the MCInst format. The side effects of this are that we can't use
237 // the mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst)
238 // functions. Therefore, we conservatively assume that these functions will
239 // return true. This may cause a few instructions to be incorrectly tagged
240 // with an extra CNT. However, these are instructions that do interact with at
241 // least one CNT so giving them an extra CNT shouldn't cause issues in most
242 // scenarios.
243 AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
244 InstrWaitCntInfo.resize(SrcMgr.size());
245
246 for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
247 const std::unique_ptr<Instruction> &Inst = EN.value();
248 unsigned Index = EN.index();
249 unsigned Opcode = Inst->getOpcode();
250 const MCInstrDesc &MCID = MCII.get(Opcode);
251 if ((MCID.TSFlags & SIInstrFlags::DS) &&
253 InstrWaitCntInfo[Index].LgkmCnt = true;
254 if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
255 InstrWaitCntInfo[Index].ExpCnt = true;
256 } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
257 // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
258 // and mayAccessLDSThroughFlat(Inst) would both return true for this
259 // instruction. We have to do this because those functions use
260 // information about the memory operands that we don't have access to.
261 InstrWaitCntInfo[Index].LgkmCnt = true;
262 if (!STI.hasFeature(AMDGPU::FeatureVscnt))
263 InstrWaitCntInfo[Index].VmCnt = true;
264 else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
265 InstrWaitCntInfo[Index].VmCnt = true;
266 else
267 InstrWaitCntInfo[Index].VsCnt = true;
268 } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
269 if (!STI.hasFeature(AMDGPU::FeatureVscnt))
270 InstrWaitCntInfo[Index].VmCnt = true;
271 else if ((MCID.mayLoad() &&
273 ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
274 !MCID.mayStore()))
275 InstrWaitCntInfo[Index].VmCnt = true;
276 else if (MCID.mayStore())
277 InstrWaitCntInfo[Index].VsCnt = true;
278
279 // (IV.Major < 7) is meant to represent
280 // GCNTarget.vmemWriteNeedsExpWaitcnt()
281 // which is defined as
282 // { return getGeneration() < SEA_ISLANDS; }
283 if (IV.Major < 7 &&
284 (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
285 InstrWaitCntInfo[Index].ExpCnt = true;
286 } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
287 InstrWaitCntInfo[Index].LgkmCnt = true;
288 } else if (MCID.TSFlags & SIInstrFlags::EXP) {
289 InstrWaitCntInfo[Index].ExpCnt = true;
290 } else {
291 switch (Opcode) {
292 case AMDGPU::S_SENDMSG:
293 case AMDGPU::S_SENDMSGHALT:
294 case AMDGPU::S_MEMTIME:
295 case AMDGPU::S_MEMREALTIME:
296 InstrWaitCntInfo[Index].LgkmCnt = true;
297 break;
298 }
299 }
300 }
301}
302
303// taken from SIInstrInfo::isVMEM()
304bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
305 return MCID.TSFlags & SIInstrFlags::MUBUF ||
308}
309
310// taken from SIInstrInfo::hasModifiersSet()
311bool AMDGPUCustomBehaviour::hasModifiersSet(
312 const std::unique_ptr<Instruction> &Inst, AMDGPU::OpName OpName) const {
313 int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
314 if (Idx == -1)
315 return false;
316
317 const MCAOperand *Op = Inst->getOperand(Idx);
318 if (Op == nullptr || !Op->isImm() || !Op->getImm())
319 return false;
320
321 return true;
322}
323
324// taken from SIInstrInfo::isGWS()
325bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
326 const MCInstrDesc &MCID = MCII.get(Opcode);
327 return MCID.TSFlags & SIInstrFlags::GWS;
328}
329
330// taken from SIInstrInfo::isAlwaysGDS()
331bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
332 return Opcode == AMDGPU::DS_ORDERED_COUNT ||
333 Opcode == AMDGPU::DS_ADD_GS_REG_RTN ||
334 Opcode == AMDGPU::DS_SUB_GS_REG_RTN || isGWS(Opcode);
335}
336
337} // namespace llvm::mca
338
339using namespace llvm;
340using namespace mca;
341
342static CustomBehaviour *
344 const mca::SourceMgr &SrcMgr,
345 const MCInstrInfo &MCII) {
346 return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
347}
348
349static InstrPostProcess *
351 const MCInstrInfo &MCII) {
352 return new AMDGPUInstrPostProcess(STI, MCII);
353}
354
355/// Extern function to initialize the targets for the AMDGPU backend
356
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static CustomBehaviour * createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII)
LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA()
Extern function to initialize the targets for the AMDGPU backend.
static InstrPostProcess * createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
This file defines the AMDGPUCustomBehaviour class which inherits from CustomBehaviour.
Provides AMDGPU specific target descriptions.
#define LLVM_ABI
Definition Compiler.h:213
#define LLVM_EXTERNAL_VISIBILITY
Definition Compiler.h:132
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static const uint32_t IV[8]
Definition blake3_impl.h:83
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Instances of this class represent a single low-level machine instruction.
Definition MCInst.h:188
unsigned getOpcode() const
Definition MCInst.h:202
size_t size() const
Definition MCInst.h:226
const MCOperand & getOperand(unsigned i) const
Definition MCInst.h:210
bool mayStore() const
Return true if this instruction could possibly modify memory.
bool mayLoad() const
Return true if this instruction could possibly read memory.
Interface to description of machine instruction set.
Definition MCInstrInfo.h:27
Instances of this class represent operands of the MCInst class.
Definition MCInst.h:40
int64_t getImm() const
Definition MCInst.h:84
bool isImm() const
Definition MCInst.h:66
bool isReg() const
Definition MCInst.h:65
MCRegister getReg() const
Returns the register number.
Definition MCInst.h:73
Generic base class for all target subtargets.
Value * getOperand(unsigned i) const
Definition User.h:232
static LLVM_ABI raw_ostream & warning()
Convenience method for printing "warning: " to stderr.
Definition WithColor.cpp:85
unsigned checkCustomHazard(ArrayRef< InstRef > IssuedInst, const InstRef &IR) override
This method is used to determine if an instruction should be allowed to be dispatched.
AMDGPUCustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII)
void postProcessInstruction(std::unique_ptr< Instruction > &Inst, const MCInst &MCI) override
This method can be overriden by targets to modify the mca::Instruction object after it has been lower...
const MCInstrInfo & MCII
const mca::SourceMgr & SrcMgr
const MCSubtargetInfo & STI
CustomBehaviour(const MCSubtargetInfo &STI, const mca::SourceMgr &SrcMgr, const MCInstrInfo &MCII)
An InstRef contains both a SourceMgr index and Instruction pair.
unsigned getOpcode() const
An instruction propagated through the simulated instruction pipeline.
int getCyclesLeft() const
A representation of an mca::Instruction operand for use in mca::CustomBehaviour.
Definition Instruction.h:39
unsigned getReg() const
Returns the register number.
Definition Instruction.h:75
static MCAOperand createImm(int64_t Val)
static MCAOperand createReg(unsigned Reg)
Definition Instruction.h:99
int64_t getImm() const
Definition Instruction.h:80
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt, unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt)
Decodes Vmcnt, Expcnt and Lgkmcnt from given Waitcnt for given isa Version, and writes decoded values...
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
bool getMUBUFIsBufferInv(unsigned Opc)
constexpr int UNKNOWN_CYCLES
Definition Instruction.h:35
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2452
Target & getTheR600Target()
The target for R600 GPUs.
SourceMgr SrcMgr
Definition Error.cpp:24
Target & getTheGCNTarget()
The target for GCN GPUs.
DWARFExpression::Operation Op
#define N
static void RegisterInstrPostProcess(Target &T, Target::InstrPostProcessCtorTy Fn)
RegisterInstrPostProcess - Register an InstrPostProcess implementation for the given target.
static void RegisterCustomBehaviour(Target &T, Target::CustomBehaviourCtorTy Fn)
RegisterCustomBehaviour - Register a CustomBehaviour implementation for the given target.
Abstracting the input code sequence (a sequence of MCInst) and assigning unique identifiers to every ...
Definition SourceMgr.h:29