LLVM 19.0.0git
R600MachineScheduler.cpp
Go to the documentation of this file.
1//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// R600 Machine Scheduler interface
11//
12//===----------------------------------------------------------------------===//
13
16#include "R600Subtarget.h"
17
18using namespace llvm;
19
20#define DEBUG_TYPE "machine-scheduler"
21
23 assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
24 DAG = static_cast<ScheduleDAGMILive*>(dag);
25 const R600Subtarget &ST = DAG->MF.getSubtarget<R600Subtarget>();
26 TII = static_cast<const R600InstrInfo*>(DAG->TII);
27 TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
28 VLIW5 = !ST.hasCaymanISA();
29 MRI = &DAG->MRI;
30 CurInstKind = IDOther;
31 CurEmitted = 0;
32 OccupiedSlotsMask = 31;
33 InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
34 InstKindLimit[IDOther] = 32;
35 InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
36 AluInstCount = 0;
37 FetchInstCount = 0;
38}
39
40void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
41 std::vector<SUnit *> &QDst)
42{
43 llvm::append_range(QDst, QSrc);
44 QSrc.clear();
45}
46
47static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
48 assert (GPRCount && "GPRCount cannot be 0");
49 return 248 / GPRCount;
50}
51
53 SUnit *SU = nullptr;
54 NextInstKind = IDOther;
55
56 IsTopNode = false;
57
58 // check if we might want to switch current clause type
59 bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
60 (Available[CurInstKind].empty());
61 bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
62 (!Available[IDFetch].empty() || !Available[IDOther].empty());
63
64 if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
65 // We use the heuristic provided by AMD Accelerated Parallel Processing
66 // OpenCL Programming Guide :
67 // The approx. number of WF that allows TEX inst to hide ALU inst is :
68 // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
69 float ALUFetchRationEstimate =
70 (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
71 (FetchInstCount + Available[IDFetch].size());
72 if (ALUFetchRationEstimate == 0) {
73 AllowSwitchFromAlu = true;
74 } else {
75 unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
76 LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
77 // We assume the local GPR requirements to be "dominated" by the requirement
78 // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
79 // after TEX are indeed likely to consume or generate values from/for the
80 // TEX clause.
81 // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
82 // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
83 // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
84 // (TODO : use RegisterPressure)
85 // If we are going too use too many GPR, we flush Fetch instruction to lower
86 // register pressure on 128 bits regs.
87 unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
88 if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
89 AllowSwitchFromAlu = true;
90 }
91 }
92
93 if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
94 (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
95 // try to pick ALU
96 SU = pickAlu();
97 if (!SU && !PhysicalRegCopy.empty()) {
98 SU = PhysicalRegCopy.front();
99 PhysicalRegCopy.erase(PhysicalRegCopy.begin());
100 }
101 if (SU) {
102 if (CurEmitted >= InstKindLimit[IDAlu])
103 CurEmitted = 0;
104 NextInstKind = IDAlu;
105 }
106 }
107
108 if (!SU) {
109 // try to pick FETCH
110 SU = pickOther(IDFetch);
111 if (SU)
112 NextInstKind = IDFetch;
113 }
114
115 // try to pick other
116 if (!SU) {
117 SU = pickOther(IDOther);
118 if (SU)
119 NextInstKind = IDOther;
120 }
121
122 LLVM_DEBUG(if (SU) {
123 dbgs() << " ** Pick node **\n";
124 DAG->dumpNode(*SU);
125 } else {
126 dbgs() << "NO NODE \n";
127 for (const SUnit &S : DAG->SUnits)
128 if (!S.isScheduled)
129 DAG->dumpNode(S);
130 });
131
132 return SU;
133}
134
135void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
136 if (NextInstKind != CurInstKind) {
137 LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
138 if (NextInstKind != IDAlu)
139 OccupiedSlotsMask |= 31;
140 CurEmitted = 0;
141 CurInstKind = NextInstKind;
142 }
143
144 if (CurInstKind == IDAlu) {
145 AluInstCount ++;
146 switch (getAluKind(SU)) {
147 case AluT_XYZW:
148 CurEmitted += 4;
149 break;
150 case AluDiscarded:
151 break;
152 default: {
153 ++CurEmitted;
155 E = SU->getInstr()->operands_end(); It != E; ++It) {
156 MachineOperand &MO = *It;
157 if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
158 ++CurEmitted;
159 }
160 }
161 }
162 } else {
163 ++CurEmitted;
164 }
165
166 LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");
167
168 if (CurInstKind != IDFetch) {
169 MoveUnits(Pending[IDFetch], Available[IDFetch]);
170 } else
171 FetchInstCount++;
172}
173
174static bool
176 if (MI->getOpcode() != R600::COPY)
177 return false;
178
179 return !MI->getOperand(1).getReg().isVirtual();
180}
181
183 LLVM_DEBUG(dbgs() << "Top Releasing "; DAG->dumpNode(*SU));
184}
185
187 LLVM_DEBUG(dbgs() << "Bottom Releasing "; DAG->dumpNode(*SU));
188 if (isPhysicalRegCopy(SU->getInstr())) {
189 PhysicalRegCopy.push_back(SU);
190 return;
191 }
192
193 int IK = getInstKind(SU);
194
195 // There is no export clause, we can schedule one as soon as its ready
196 if (IK == IDOther)
197 Available[IDOther].push_back(SU);
198 else
199 Pending[IK].push_back(SU);
200
201}
202
203bool R600SchedStrategy::regBelongsToClass(Register Reg,
204 const TargetRegisterClass *RC) const {
205 if (!Reg.isVirtual()) {
206 return RC->contains(Reg);
207 } else {
208 return MRI->getRegClass(Reg) == RC;
209 }
210}
211
212R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
213 MachineInstr *MI = SU->getInstr();
214
215 if (TII->isTransOnly(*MI))
216 return AluTrans;
217
218 switch (MI->getOpcode()) {
219 case R600::PRED_X:
220 return AluPredX;
221 case R600::INTERP_PAIR_XY:
222 case R600::INTERP_PAIR_ZW:
223 case R600::INTERP_VEC_LOAD:
224 case R600::DOT_4:
225 return AluT_XYZW;
226 case R600::COPY:
227 if (MI->getOperand(1).isUndef()) {
228 // MI will become a KILL, don't considers it in scheduling
229 return AluDiscarded;
230 }
231 break;
232 default:
233 break;
234 }
235
236 // Does the instruction take a whole IG ?
237 // XXX: Is it possible to add a helper function in R600InstrInfo that can
238 // be used here and in R600PacketizerList::isSoloInstruction() ?
239 if(TII->isVector(*MI) ||
240 TII->isCubeOp(MI->getOpcode()) ||
241 TII->isReductionOp(MI->getOpcode()) ||
242 MI->getOpcode() == R600::GROUP_BARRIER) {
243 return AluT_XYZW;
244 }
245
246 if (TII->isLDSInstr(MI->getOpcode())) {
247 return AluT_X;
248 }
249
250 // Is the result already assigned to a channel ?
251 unsigned DestSubReg = MI->getOperand(0).getSubReg();
252 switch (DestSubReg) {
253 case R600::sub0:
254 return AluT_X;
255 case R600::sub1:
256 return AluT_Y;
257 case R600::sub2:
258 return AluT_Z;
259 case R600::sub3:
260 return AluT_W;
261 default:
262 break;
263 }
264
265 // Is the result already member of a X/Y/Z/W class ?
266 Register DestReg = MI->getOperand(0).getReg();
267 if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
268 regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
269 return AluT_X;
270 if (regBelongsToClass(DestReg, &R600::R600_TReg32_YRegClass))
271 return AluT_Y;
272 if (regBelongsToClass(DestReg, &R600::R600_TReg32_ZRegClass))
273 return AluT_Z;
274 if (regBelongsToClass(DestReg, &R600::R600_TReg32_WRegClass))
275 return AluT_W;
276 if (regBelongsToClass(DestReg, &R600::R600_Reg128RegClass))
277 return AluT_XYZW;
278
279 // LDS src registers cannot be used in the Trans slot.
280 if (TII->readsLDSSrcReg(*MI))
281 return AluT_XYZW;
282
283 return AluAny;
284}
285
286int R600SchedStrategy::getInstKind(SUnit* SU) {
287 int Opcode = SU->getInstr()->getOpcode();
288
289 if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
290 return IDFetch;
291
292 if (TII->isALUInstr(Opcode)) {
293 return IDAlu;
294 }
295
296 switch (Opcode) {
297 case R600::PRED_X:
298 case R600::COPY:
299 case R600::CONST_COPY:
300 case R600::INTERP_PAIR_XY:
301 case R600::INTERP_PAIR_ZW:
302 case R600::INTERP_VEC_LOAD:
303 case R600::DOT_4:
304 return IDAlu;
305 default:
306 return IDOther;
307 }
308}
309
310SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
311 if (Q.empty())
312 return nullptr;
313 for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
314 It != E; ++It) {
315 SUnit *SU = *It;
316 InstructionsGroupCandidate.push_back(SU->getInstr());
317 if (TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
318 (!AnyALU || !TII->isVectorOnly(*SU->getInstr()))) {
319 InstructionsGroupCandidate.pop_back();
320 Q.erase((It + 1).base());
321 return SU;
322 } else {
323 InstructionsGroupCandidate.pop_back();
324 }
325 }
326 return nullptr;
327}
328
329void R600SchedStrategy::LoadAlu() {
330 std::vector<SUnit *> &QSrc = Pending[IDAlu];
331 for (SUnit *SU : QSrc) {
332 AluKind AK = getAluKind(SU);
333 AvailableAlus[AK].push_back(SU);
334 }
335 QSrc.clear();
336}
337
338void R600SchedStrategy::PrepareNextSlot() {
339 LLVM_DEBUG(dbgs() << "New Slot\n");
340 assert(OccupiedSlotsMask && "Slot wasn't filled");
341 OccupiedSlotsMask = 0;
342 // if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
343 // OccupiedSlotsMask |= 16;
344 InstructionsGroupCandidate.clear();
345 LoadAlu();
346}
347
348void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
349 int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
350 if (DstIndex == -1) {
351 return;
352 }
353 Register DestReg = MI->getOperand(DstIndex).getReg();
354 // PressureRegister crashes if an operand is def and used in the same inst
355 // and we try to constraint its regclass
356 for (MachineInstr::mop_iterator It = MI->operands_begin(),
357 E = MI->operands_end(); It != E; ++It) {
358 MachineOperand &MO = *It;
359 if (MO.isReg() && !MO.isDef() &&
360 MO.getReg() == DestReg)
361 return;
362 }
363 // Constrains the regclass of DestReg to assign it to Slot
364 switch (Slot) {
365 case 0:
366 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_XRegClass);
367 break;
368 case 1:
369 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_YRegClass);
370 break;
371 case 2:
372 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_ZRegClass);
373 break;
374 case 3:
375 MRI->constrainRegClass(DestReg, &R600::R600_TReg32_WRegClass);
376 break;
377 }
378}
379
380SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
381 static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
382 SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
383 if (SlotedSU)
384 return SlotedSU;
385 SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
386 if (UnslotedSU)
387 AssignSlot(UnslotedSU->getInstr(), Slot);
388 return UnslotedSU;
389}
390
391unsigned R600SchedStrategy::AvailablesAluCount() const {
392 return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
393 AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
394 AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
395 AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
396 AvailableAlus[AluPredX].size();
397}
398
399SUnit* R600SchedStrategy::pickAlu() {
400 while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
401 if (!OccupiedSlotsMask) {
402 // Bottom up scheduling : predX must comes first
403 if (!AvailableAlus[AluPredX].empty()) {
404 OccupiedSlotsMask |= 31;
405 return PopInst(AvailableAlus[AluPredX], false);
406 }
407 // Flush physical reg copies (RA will discard them)
408 if (!AvailableAlus[AluDiscarded].empty()) {
409 OccupiedSlotsMask |= 31;
410 return PopInst(AvailableAlus[AluDiscarded], false);
411 }
412 // If there is a T_XYZW alu available, use it
413 if (!AvailableAlus[AluT_XYZW].empty()) {
414 OccupiedSlotsMask |= 15;
415 return PopInst(AvailableAlus[AluT_XYZW], false);
416 }
417 }
418 bool TransSlotOccupied = OccupiedSlotsMask & 16;
419 if (!TransSlotOccupied && VLIW5) {
420 if (!AvailableAlus[AluTrans].empty()) {
421 OccupiedSlotsMask |= 16;
422 return PopInst(AvailableAlus[AluTrans], false);
423 }
424 SUnit *SU = AttemptFillSlot(3, true);
425 if (SU) {
426 OccupiedSlotsMask |= 16;
427 return SU;
428 }
429 }
430 for (int Chan = 3; Chan > -1; --Chan) {
431 bool isOccupied = OccupiedSlotsMask & (1 << Chan);
432 if (!isOccupied) {
433 SUnit *SU = AttemptFillSlot(Chan, false);
434 if (SU) {
435 OccupiedSlotsMask |= (1 << Chan);
436 InstructionsGroupCandidate.push_back(SU->getInstr());
437 return SU;
438 }
439 }
440 }
441 PrepareNextSlot();
442 }
443 return nullptr;
444}
445
446SUnit* R600SchedStrategy::pickOther(int QID) {
447 SUnit *SU = nullptr;
448 std::vector<SUnit *> &AQ = Available[QID];
449
450 if (AQ.empty()) {
451 MoveUnits(Pending[QID], AQ);
452 }
453 if (!AQ.empty()) {
454 SU = AQ.back();
455 AQ.pop_back();
456 }
457 return SU;
458}
#define LLVM_DEBUG(X)
Definition: Debug.h:101
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Provides R600 specific target descriptions.
static unsigned getWFCountLimitedByGPR(unsigned GPRCount)
static bool isPhysicalRegCopy(MachineInstr *MI)
R600 Machine Scheduler interface.
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Representation of each machine instruction.
Definition: MachineInstr.h:69
mop_iterator operands_begin()
Definition: MachineInstr.h:674
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:564
mop_iterator operands_end()
Definition: MachineInstr.h:675
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
const TargetRegisterClass * constrainRegClass(Register Reg, const TargetRegisterClass *RC, unsigned MinNumRegs=0)
constrainRegClass - Constrain the register class of the specified virtual register to be a common sub...
bool usesVertexCache(unsigned Opcode) const
bool usesTextureCache(unsigned Opcode) const
bool fitsConstReadLimitations(const std::vector< MachineInstr * > &) const
An instruction group can only access 2 channel pair (either [XY] or [ZW]) from KCache bank on R700+.
bool isVector(const MachineInstr &MI) const
Vector instructions are instructions that must fill all instruction slots within an instruction group...
bool isTransOnly(unsigned Opcode) const
bool isReductionOp(unsigned opcode) const
bool isCubeOp(unsigned opcode) const
bool isLDSInstr(unsigned Opcode) const
bool readsLDSSrcReg(const MachineInstr &MI) const
bool isALUInstr(unsigned Opcode) const
bool isVectorOnly(unsigned Opcode) const
int getOperandIdx(const MachineInstr &MI, unsigned Op) const
Get the index of Op in the MachineInstr.
void initialize(ScheduleDAGMI *dag) override
Initialize the strategy after building the DAG for a new region.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void releaseBottomNode(SUnit *SU) override
When all successor dependencies have been resolved, free this node for bottom-up scheduling.
void releaseTopNode(SUnit *SU) override
When all predecessor dependencies have been resolved, free this node for top-down scheduling.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
Scheduling unit. This is a node in the scheduling DAG.
Definition: ScheduleDAG.h:242
bool isScheduled
True once scheduled.
Definition: ScheduleDAG.h:296
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
Definition: ScheduleDAG.h:390
void dumpNode(const SUnit &SU) const override
ScheduleDAGMILive is an implementation of ScheduleDAGInstrs that schedules machine instructions while...
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineRegisterInfo & MRI
Virtual/real register map.
Definition: ScheduleDAG.h:578
const TargetInstrInfo * TII
Target instruction information.
Definition: ScheduleDAG.h:575
std::vector< SUnit > SUnits
The scheduling units.
Definition: ScheduleDAG.h:579
const TargetRegisterInfo * TRI
Target processor register info.
Definition: ScheduleDAG.h:576
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2067
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163