File: | llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp |
Warning: | line 1649, column 3 Called C++ object pointer is null |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===-- SIMachineScheduler.cpp - SI Scheduler Interface -------------------===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file | |||
10 | /// SI Machine Scheduler interface | |||
11 | // | |||
12 | //===----------------------------------------------------------------------===// | |||
13 | ||||
14 | #include "SIMachineScheduler.h" | |||
15 | #include "SIInstrInfo.h" | |||
16 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
17 | #include "llvm/CodeGen/LiveIntervals.h" | |||
18 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |||
19 | ||||
20 | using namespace llvm; | |||
21 | ||||
22 | #define DEBUG_TYPE"machine-scheduler" "machine-scheduler" | |||
23 | ||||
24 | // This scheduler implements a different scheduling algorithm than | |||
25 | // GenericScheduler. | |||
26 | // | |||
27 | // There are several specific architecture behaviours that can't be modelled | |||
28 | // for GenericScheduler: | |||
29 | // . When accessing the result of an SGPR load instruction, you have to wait | |||
30 | // for all the SGPR load instructions before your current instruction to | |||
31 | // have finished. | |||
32 | // . When accessing the result of an VGPR load instruction, you have to wait | |||
33 | // for all the VGPR load instructions previous to the VGPR load instruction | |||
34 | // you are interested in to finish. | |||
35 | // . The less the register pressure, the best load latencies are hidden | |||
36 | // | |||
37 | // Moreover some specifities (like the fact a lot of instructions in the shader | |||
38 | // have few dependencies) makes the generic scheduler have some unpredictable | |||
39 | // behaviours. For example when register pressure becomes high, it can either | |||
40 | // manage to prevent register pressure from going too high, or it can | |||
41 | // increase register pressure even more than if it hadn't taken register | |||
42 | // pressure into account. | |||
43 | // | |||
44 | // Also some other bad behaviours are generated, like loading at the beginning | |||
45 | // of the shader a constant in VGPR you won't need until the end of the shader. | |||
46 | // | |||
47 | // The scheduling problem for SI can distinguish three main parts: | |||
48 | // . Hiding high latencies (texture sampling, etc) | |||
49 | // . Hiding low latencies (SGPR constant loading, etc) | |||
50 | // . Keeping register usage low for better latency hiding and general | |||
51 | // performance | |||
52 | // | |||
53 | // Some other things can also affect performance, but are hard to predict | |||
54 | // (cache usage, the fact the HW can issue several instructions from different | |||
55 | // wavefronts if different types, etc) | |||
56 | // | |||
57 | // This scheduler tries to solve the scheduling problem by dividing it into | |||
58 | // simpler sub-problems. It divides the instructions into blocks, schedules | |||
59 | // locally inside the blocks where it takes care of low latencies, and then | |||
60 | // chooses the order of the blocks by taking care of high latencies. | |||
61 | // Dividing the instructions into blocks helps control keeping register | |||
62 | // usage low. | |||
63 | // | |||
64 | // First the instructions are put into blocks. | |||
65 | // We want the blocks help control register usage and hide high latencies | |||
66 | // later. To help control register usage, we typically want all local | |||
67 | // computations, when for example you create a result that can be comsummed | |||
68 | // right away, to be contained in a block. Block inputs and outputs would | |||
69 | // typically be important results that are needed in several locations of | |||
70 | // the shader. Since we do want blocks to help hide high latencies, we want | |||
71 | // the instructions inside the block to have a minimal set of dependencies | |||
72 | // on high latencies. It will make it easy to pick blocks to hide specific | |||
73 | // high latencies. | |||
74 | // The block creation algorithm is divided into several steps, and several | |||
75 | // variants can be tried during the scheduling process. | |||
76 | // | |||
77 | // Second the order of the instructions inside the blocks is chosen. | |||
78 | // At that step we do take into account only register usage and hiding | |||
79 | // low latency instructions | |||
80 | // | |||
81 | // Third the block order is chosen, there we try to hide high latencies | |||
82 | // and keep register usage low. | |||
83 | // | |||
84 | // After the third step, a pass is done to improve the hiding of low | |||
85 | // latencies. | |||
86 | // | |||
87 | // Actually when talking about 'low latency' or 'high latency' it includes | |||
88 | // both the latency to get the cache (or global mem) data go to the register, | |||
89 | // and the bandwidth limitations. | |||
90 | // Increasing the number of active wavefronts helps hide the former, but it | |||
91 | // doesn't solve the latter, thus why even if wavefront count is high, we have | |||
92 | // to try have as many instructions hiding high latencies as possible. | |||
93 | // The OpenCL doc says for example latency of 400 cycles for a global mem access, | |||
94 | // which is hidden by 10 instructions if the wavefront count is 10. | |||
95 | ||||
96 | // Some figures taken from AMD docs: | |||
97 | // Both texture and constant L1 caches are 4-way associative with 64 bytes | |||
98 | // lines. | |||
99 | // Constant cache is shared with 4 CUs. | |||
100 | // For texture sampling, the address generation unit receives 4 texture | |||
101 | // addresses per cycle, thus we could expect texture sampling latency to be | |||
102 | // equivalent to 4 instructions in the very best case (a VGPR is 64 work items, | |||
103 | // instructions in a wavefront group are executed every 4 cycles), | |||
104 | // or 16 instructions if the other wavefronts associated to the 3 other VALUs | |||
105 | // of the CU do texture sampling too. (Don't take these figures too seriously, | |||
106 | // as I'm not 100% sure of the computation) | |||
107 | // Data exports should get similar latency. | |||
108 | // For constant loading, the cache is shader with 4 CUs. | |||
109 | // The doc says "a throughput of 16B/cycle for each of the 4 Compute Unit" | |||
110 | // I guess if the other CU don't read the cache, it can go up to 64B/cycle. | |||
111 | // It means a simple s_buffer_load should take one instruction to hide, as | |||
112 | // well as a s_buffer_loadx2 and potentially a s_buffer_loadx8 if on the same | |||
113 | // cache line. | |||
114 | // | |||
115 | // As of today the driver doesn't preload the constants in cache, thus the | |||
116 | // first loads get extra latency. The doc says global memory access can be | |||
117 | // 300-600 cycles. We do not specially take that into account when scheduling | |||
118 | // As we expect the driver to be able to preload the constants soon. | |||
119 | ||||
120 | // common code // | |||
121 | ||||
122 | #ifndef NDEBUG | |||
123 | ||||
124 | static const char *getReasonStr(SIScheduleCandReason Reason) { | |||
125 | switch (Reason) { | |||
126 | case NoCand: return "NOCAND"; | |||
127 | case RegUsage: return "REGUSAGE"; | |||
128 | case Latency: return "LATENCY"; | |||
129 | case Successor: return "SUCCESSOR"; | |||
130 | case Depth: return "DEPTH"; | |||
131 | case NodeOrder: return "ORDER"; | |||
132 | } | |||
133 | llvm_unreachable("Unknown reason!")::llvm::llvm_unreachable_internal("Unknown reason!", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 133); | |||
134 | } | |||
135 | ||||
136 | #endif | |||
137 | ||||
138 | namespace llvm { | |||
139 | namespace SISched { | |||
140 | static bool tryLess(int TryVal, int CandVal, | |||
141 | SISchedulerCandidate &TryCand, | |||
142 | SISchedulerCandidate &Cand, | |||
143 | SIScheduleCandReason Reason) { | |||
144 | if (TryVal < CandVal) { | |||
145 | TryCand.Reason = Reason; | |||
146 | return true; | |||
147 | } | |||
148 | if (TryVal > CandVal) { | |||
149 | if (Cand.Reason > Reason) | |||
150 | Cand.Reason = Reason; | |||
151 | return true; | |||
152 | } | |||
153 | Cand.setRepeat(Reason); | |||
154 | return false; | |||
155 | } | |||
156 | ||||
157 | static bool tryGreater(int TryVal, int CandVal, | |||
158 | SISchedulerCandidate &TryCand, | |||
159 | SISchedulerCandidate &Cand, | |||
160 | SIScheduleCandReason Reason) { | |||
161 | if (TryVal > CandVal) { | |||
162 | TryCand.Reason = Reason; | |||
163 | return true; | |||
164 | } | |||
165 | if (TryVal < CandVal) { | |||
166 | if (Cand.Reason > Reason) | |||
167 | Cand.Reason = Reason; | |||
168 | return true; | |||
169 | } | |||
170 | Cand.setRepeat(Reason); | |||
171 | return false; | |||
172 | } | |||
173 | } // end namespace SISched | |||
174 | } // end namespace llvm | |||
175 | ||||
176 | // SIScheduleBlock // | |||
177 | ||||
178 | void SIScheduleBlock::addUnit(SUnit *SU) { | |||
179 | NodeNum2Index[SU->NodeNum] = SUnits.size(); | |||
180 | SUnits.push_back(SU); | |||
181 | } | |||
182 | ||||
183 | #ifndef NDEBUG | |||
184 | void SIScheduleBlock::traceCandidate(const SISchedCandidate &Cand) { | |||
185 | ||||
186 | dbgs() << " SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason); | |||
187 | dbgs() << '\n'; | |||
188 | } | |||
189 | #endif | |||
190 | ||||
191 | void SIScheduleBlock::tryCandidateTopDown(SISchedCandidate &Cand, | |||
192 | SISchedCandidate &TryCand) { | |||
193 | // Initialize the candidate if needed. | |||
194 | if (!Cand.isValid()) { | |||
195 | TryCand.Reason = NodeOrder; | |||
196 | return; | |||
197 | } | |||
198 | ||||
199 | if (Cand.SGPRUsage > 60 && | |||
200 | SISched::tryLess(TryCand.SGPRUsage, Cand.SGPRUsage, | |||
201 | TryCand, Cand, RegUsage)) | |||
202 | return; | |||
203 | ||||
204 | // Schedule low latency instructions as top as possible. | |||
205 | // Order of priority is: | |||
206 | // . Low latency instructions which do not depend on other low latency | |||
207 | // instructions we haven't waited for | |||
208 | // . Other instructions which do not depend on low latency instructions | |||
209 | // we haven't waited for | |||
210 | // . Low latencies | |||
211 | // . All other instructions | |||
212 | // Goal is to get: low latency instructions - independent instructions | |||
213 | // - (eventually some more low latency instructions) | |||
214 | // - instructions that depend on the first low latency instructions. | |||
215 | // If in the block there is a lot of constant loads, the SGPR usage | |||
216 | // could go quite high, thus above the arbitrary limit of 60 will encourage | |||
217 | // use the already loaded constants (in order to release some SGPRs) before | |||
218 | // loading more. | |||
219 | if (SISched::tryLess(TryCand.HasLowLatencyNonWaitedParent, | |||
220 | Cand.HasLowLatencyNonWaitedParent, | |||
221 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||
222 | return; | |||
223 | ||||
224 | if (SISched::tryGreater(TryCand.IsLowLatency, Cand.IsLowLatency, | |||
225 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||
226 | return; | |||
227 | ||||
228 | if (TryCand.IsLowLatency && | |||
229 | SISched::tryLess(TryCand.LowLatencyOffset, Cand.LowLatencyOffset, | |||
230 | TryCand, Cand, SIScheduleCandReason::Depth)) | |||
231 | return; | |||
232 | ||||
233 | if (SISched::tryLess(TryCand.VGPRUsage, Cand.VGPRUsage, | |||
234 | TryCand, Cand, RegUsage)) | |||
235 | return; | |||
236 | ||||
237 | // Fall through to original instruction order. | |||
238 | if (TryCand.SU->NodeNum < Cand.SU->NodeNum) { | |||
239 | TryCand.Reason = NodeOrder; | |||
240 | } | |||
241 | } | |||
242 | ||||
243 | SUnit* SIScheduleBlock::pickNode() { | |||
244 | SISchedCandidate TopCand; | |||
245 | ||||
246 | for (SUnit* SU : TopReadySUs) { | |||
247 | SISchedCandidate TryCand; | |||
248 | std::vector<unsigned> pressure; | |||
249 | std::vector<unsigned> MaxPressure; | |||
250 | // Predict register usage after this instruction. | |||
251 | TryCand.SU = SU; | |||
252 | TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure); | |||
253 | TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32]; | |||
254 | TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32]; | |||
255 | TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum]; | |||
256 | TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum]; | |||
257 | TryCand.HasLowLatencyNonWaitedParent = | |||
258 | HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]; | |||
259 | tryCandidateTopDown(TopCand, TryCand); | |||
260 | if (TryCand.Reason != NoCand) | |||
261 | TopCand.setBest(TryCand); | |||
262 | } | |||
263 | ||||
264 | return TopCand.SU; | |||
265 | } | |||
266 | ||||
267 | ||||
268 | // Schedule something valid. | |||
269 | void SIScheduleBlock::fastSchedule() { | |||
270 | TopReadySUs.clear(); | |||
271 | if (Scheduled) | |||
272 | undoSchedule(); | |||
273 | ||||
274 | for (SUnit* SU : SUnits) { | |||
275 | if (!SU->NumPredsLeft) | |||
276 | TopReadySUs.push_back(SU); | |||
277 | } | |||
278 | ||||
279 | while (!TopReadySUs.empty()) { | |||
280 | SUnit *SU = TopReadySUs[0]; | |||
281 | ScheduledSUnits.push_back(SU); | |||
282 | nodeScheduled(SU); | |||
283 | } | |||
284 | ||||
285 | Scheduled = true; | |||
286 | } | |||
287 | ||||
288 | // Returns if the register was set between first and last. | |||
289 | static bool isDefBetween(unsigned Reg, | |||
290 | SlotIndex First, SlotIndex Last, | |||
291 | const MachineRegisterInfo *MRI, | |||
292 | const LiveIntervals *LIS) { | |||
293 | for (MachineRegisterInfo::def_instr_iterator | |||
294 | UI = MRI->def_instr_begin(Reg), | |||
295 | UE = MRI->def_instr_end(); UI != UE; ++UI) { | |||
296 | const MachineInstr* MI = &*UI; | |||
297 | if (MI->isDebugValue()) | |||
298 | continue; | |||
299 | SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot(); | |||
300 | if (InstSlot >= First && InstSlot <= Last) | |||
301 | return true; | |||
302 | } | |||
303 | return false; | |||
304 | } | |||
305 | ||||
306 | void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock, | |||
307 | MachineBasicBlock::iterator EndBlock) { | |||
308 | IntervalPressure Pressure, BotPressure; | |||
309 | RegPressureTracker RPTracker(Pressure), BotRPTracker(BotPressure); | |||
310 | LiveIntervals *LIS = DAG->getLIS(); | |||
311 | MachineRegisterInfo *MRI = DAG->getMRI(); | |||
312 | DAG->initRPTracker(TopRPTracker); | |||
313 | DAG->initRPTracker(BotRPTracker); | |||
314 | DAG->initRPTracker(RPTracker); | |||
315 | ||||
316 | // Goes though all SU. RPTracker captures what had to be alive for the SUs | |||
317 | // to execute, and what is still alive at the end. | |||
318 | for (SUnit* SU : ScheduledSUnits) { | |||
319 | RPTracker.setPos(SU->getInstr()); | |||
320 | RPTracker.advance(); | |||
321 | } | |||
322 | ||||
323 | // Close the RPTracker to finalize live ins/outs. | |||
324 | RPTracker.closeRegion(); | |||
325 | ||||
326 | // Initialize the live ins and live outs. | |||
327 | TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs); | |||
328 | BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs); | |||
329 | ||||
330 | // Do not Track Physical Registers, because it messes up. | |||
331 | for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { | |||
332 | if (Register::isVirtualRegister(RegMaskPair.RegUnit)) | |||
333 | LiveInRegs.insert(RegMaskPair.RegUnit); | |||
334 | } | |||
335 | LiveOutRegs.clear(); | |||
336 | // There is several possibilities to distinguish: | |||
337 | // 1) Reg is not input to any instruction in the block, but is output of one | |||
338 | // 2) 1) + read in the block and not needed after it | |||
339 | // 3) 1) + read in the block but needed in another block | |||
340 | // 4) Reg is input of an instruction but another block will read it too | |||
341 | // 5) Reg is input of an instruction and then rewritten in the block. | |||
342 | // result is not read in the block (implies used in another block) | |||
343 | // 6) Reg is input of an instruction and then rewritten in the block. | |||
344 | // result is read in the block and not needed in another block | |||
345 | // 7) Reg is input of an instruction and then rewritten in the block. | |||
346 | // result is read in the block but also needed in another block | |||
347 | // LiveInRegs will contains all the regs in situation 4, 5, 6, 7 | |||
348 | // We want LiveOutRegs to contain only Regs whose content will be read after | |||
349 | // in another block, and whose content was written in the current block, | |||
350 | // that is we want it to get 1, 3, 5, 7 | |||
351 | // Since we made the MIs of a block to be packed all together before | |||
352 | // scheduling, then the LiveIntervals were correct, and the RPTracker was | |||
353 | // able to correctly handle 5 vs 6, 2 vs 3. | |||
354 | // (Note: This is not sufficient for RPTracker to not do mistakes for case 4) | |||
355 | // The RPTracker's LiveOutRegs has 1, 3, (some correct or incorrect)4, 5, 7 | |||
356 | // Comparing to LiveInRegs is not sufficient to differenciate 4 vs 5, 7 | |||
357 | // The use of findDefBetween removes the case 4. | |||
358 | for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) { | |||
359 | Register Reg = RegMaskPair.RegUnit; | |||
360 | if (Reg.isVirtual() && | |||
361 | isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(), | |||
362 | LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI, | |||
363 | LIS)) { | |||
364 | LiveOutRegs.insert(Reg); | |||
365 | } | |||
366 | } | |||
367 | ||||
368 | // Pressure = sum_alive_registers register size | |||
369 | // Internally llvm will represent some registers as big 128 bits registers | |||
370 | // for example, but they actually correspond to 4 actual 32 bits registers. | |||
371 | // Thus Pressure is not equal to num_alive_registers * constant. | |||
372 | LiveInPressure = TopPressure.MaxSetPressure; | |||
373 | LiveOutPressure = BotPressure.MaxSetPressure; | |||
374 | ||||
375 | // Prepares TopRPTracker for top down scheduling. | |||
376 | TopRPTracker.closeTop(); | |||
377 | } | |||
378 | ||||
379 | void SIScheduleBlock::schedule(MachineBasicBlock::iterator BeginBlock, | |||
380 | MachineBasicBlock::iterator EndBlock) { | |||
381 | if (!Scheduled) | |||
382 | fastSchedule(); | |||
383 | ||||
384 | // PreScheduling phase to set LiveIn and LiveOut. | |||
385 | initRegPressure(BeginBlock, EndBlock); | |||
386 | undoSchedule(); | |||
387 | ||||
388 | // Schedule for real now. | |||
389 | ||||
390 | TopReadySUs.clear(); | |||
391 | ||||
392 | for (SUnit* SU : SUnits) { | |||
393 | if (!SU->NumPredsLeft) | |||
394 | TopReadySUs.push_back(SU); | |||
395 | } | |||
396 | ||||
397 | while (!TopReadySUs.empty()) { | |||
398 | SUnit *SU = pickNode(); | |||
399 | ScheduledSUnits.push_back(SU); | |||
400 | TopRPTracker.setPos(SU->getInstr()); | |||
401 | TopRPTracker.advance(); | |||
402 | nodeScheduled(SU); | |||
403 | } | |||
404 | ||||
405 | // TODO: compute InternalAdditionnalPressure. | |||
406 | InternalAdditionnalPressure.resize(TopPressure.MaxSetPressure.size()); | |||
407 | ||||
408 | // Check everything is right. | |||
409 | #ifndef NDEBUG | |||
410 | assert(SUnits.size() == ScheduledSUnits.size() &&(static_cast <bool> (SUnits.size() == ScheduledSUnits.size () && TopReadySUs.empty()) ? void (0) : __assert_fail ("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 411, __extension__ __PRETTY_FUNCTION__)) | |||
411 | TopReadySUs.empty())(static_cast <bool> (SUnits.size() == ScheduledSUnits.size () && TopReadySUs.empty()) ? void (0) : __assert_fail ("SUnits.size() == ScheduledSUnits.size() && TopReadySUs.empty()" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 411, __extension__ __PRETTY_FUNCTION__)); | |||
412 | for (SUnit* SU : SUnits) { | |||
413 | assert(SU->isScheduled &&(static_cast <bool> (SU->isScheduled && SU-> NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 414, __extension__ __PRETTY_FUNCTION__)) | |||
414 | SU->NumPredsLeft == 0)(static_cast <bool> (SU->isScheduled && SU-> NumPredsLeft == 0) ? void (0) : __assert_fail ("SU->isScheduled && SU->NumPredsLeft == 0" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 414, __extension__ __PRETTY_FUNCTION__)); | |||
415 | } | |||
416 | #endif | |||
417 | ||||
418 | Scheduled = true; | |||
419 | } | |||
420 | ||||
421 | void SIScheduleBlock::undoSchedule() { | |||
422 | for (SUnit* SU : SUnits) { | |||
423 | SU->isScheduled = false; | |||
424 | for (SDep& Succ : SU->Succs) { | |||
425 | if (BC->isSUInBlock(Succ.getSUnit(), ID)) | |||
426 | undoReleaseSucc(SU, &Succ); | |||
427 | } | |||
428 | } | |||
429 | HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); | |||
430 | ScheduledSUnits.clear(); | |||
431 | Scheduled = false; | |||
432 | } | |||
433 | ||||
434 | void SIScheduleBlock::undoReleaseSucc(SUnit *SU, SDep *SuccEdge) { | |||
435 | SUnit *SuccSU = SuccEdge->getSUnit(); | |||
436 | ||||
437 | if (SuccEdge->isWeak()) { | |||
438 | ++SuccSU->WeakPredsLeft; | |||
439 | return; | |||
440 | } | |||
441 | ++SuccSU->NumPredsLeft; | |||
442 | } | |||
443 | ||||
444 | void SIScheduleBlock::releaseSucc(SUnit *SU, SDep *SuccEdge) { | |||
445 | SUnit *SuccSU = SuccEdge->getSUnit(); | |||
446 | ||||
447 | if (SuccEdge->isWeak()) { | |||
448 | --SuccSU->WeakPredsLeft; | |||
449 | return; | |||
450 | } | |||
451 | #ifndef NDEBUG | |||
452 | if (SuccSU->NumPredsLeft == 0) { | |||
453 | dbgs() << "*** Scheduling failed! ***\n"; | |||
454 | DAG->dumpNode(*SuccSU); | |||
455 | dbgs() << " has been released too many times!\n"; | |||
456 | llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 456); | |||
457 | } | |||
458 | #endif | |||
459 | ||||
460 | --SuccSU->NumPredsLeft; | |||
461 | } | |||
462 | ||||
463 | /// Release Successors of the SU that are in the block or not. | |||
464 | void SIScheduleBlock::releaseSuccessors(SUnit *SU, bool InOrOutBlock) { | |||
465 | for (SDep& Succ : SU->Succs) { | |||
466 | SUnit *SuccSU = Succ.getSUnit(); | |||
467 | ||||
468 | if (SuccSU->NodeNum >= DAG->SUnits.size()) | |||
469 | continue; | |||
470 | ||||
471 | if (BC->isSUInBlock(SuccSU, ID) != InOrOutBlock) | |||
472 | continue; | |||
473 | ||||
474 | releaseSucc(SU, &Succ); | |||
475 | if (SuccSU->NumPredsLeft == 0 && InOrOutBlock) | |||
476 | TopReadySUs.push_back(SuccSU); | |||
477 | } | |||
478 | } | |||
479 | ||||
480 | void SIScheduleBlock::nodeScheduled(SUnit *SU) { | |||
481 | // Is in TopReadySUs | |||
482 | assert (!SU->NumPredsLeft)(static_cast <bool> (!SU->NumPredsLeft) ? void (0) : __assert_fail ("!SU->NumPredsLeft", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 482, __extension__ __PRETTY_FUNCTION__)); | |||
483 | std::vector<SUnit *>::iterator I = llvm::find(TopReadySUs, SU); | |||
484 | if (I == TopReadySUs.end()) { | |||
485 | dbgs() << "Data Structure Bug in SI Scheduler\n"; | |||
486 | llvm_unreachable(nullptr)::llvm::llvm_unreachable_internal(nullptr, "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 486); | |||
487 | } | |||
488 | TopReadySUs.erase(I); | |||
489 | ||||
490 | releaseSuccessors(SU, true); | |||
491 | // Scheduling this node will trigger a wait, | |||
492 | // thus propagate to other instructions that they do not need to wait either. | |||
493 | if (HasLowLatencyNonWaitedParent[NodeNum2Index[SU->NodeNum]]) | |||
494 | HasLowLatencyNonWaitedParent.assign(SUnits.size(), 0); | |||
495 | ||||
496 | if (DAG->IsLowLatencySU[SU->NodeNum]) { | |||
497 | for (SDep& Succ : SU->Succs) { | |||
498 | std::map<unsigned, unsigned>::iterator I = | |||
499 | NodeNum2Index.find(Succ.getSUnit()->NodeNum); | |||
500 | if (I != NodeNum2Index.end()) | |||
501 | HasLowLatencyNonWaitedParent[I->second] = 1; | |||
502 | } | |||
503 | } | |||
504 | SU->isScheduled = true; | |||
505 | } | |||
506 | ||||
507 | void SIScheduleBlock::finalizeUnits() { | |||
508 | // We remove links from outside blocks to enable scheduling inside the block. | |||
509 | for (SUnit* SU : SUnits) { | |||
510 | releaseSuccessors(SU, false); | |||
511 | if (DAG->IsHighLatencySU[SU->NodeNum]) | |||
512 | HighLatencyBlock = true; | |||
513 | } | |||
514 | HasLowLatencyNonWaitedParent.resize(SUnits.size(), 0); | |||
515 | } | |||
516 | ||||
517 | // we maintain ascending order of IDs | |||
518 | void SIScheduleBlock::addPred(SIScheduleBlock *Pred) { | |||
519 | unsigned PredID = Pred->getID(); | |||
520 | ||||
521 | // Check if not already predecessor. | |||
522 | for (SIScheduleBlock* P : Preds) { | |||
523 | if (PredID == P->getID()) | |||
524 | return; | |||
525 | } | |||
526 | Preds.push_back(Pred); | |||
527 | ||||
528 | assert(none_of(Succs,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)) | |||
529 | [=](std::pair<SIScheduleBlock*,(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)) | |||
530 | SIScheduleBlockLinkKind> S) {(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)) | |||
531 | return PredID == S.first->getID();(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)) | |||
532 | }) &&(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)) | |||
533 | "Loop in the Block Graph!")(static_cast <bool> (none_of(Succs, [=](std::pair<SIScheduleBlock *, SIScheduleBlockLinkKind> S) { return PredID == S.first-> getID(); }) && "Loop in the Block Graph!") ? void (0) : __assert_fail ("none_of(Succs, [=](std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S) { return PredID == S.first->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 533, __extension__ __PRETTY_FUNCTION__)); | |||
534 | } | |||
535 | ||||
536 | void SIScheduleBlock::addSucc(SIScheduleBlock *Succ, | |||
537 | SIScheduleBlockLinkKind Kind) { | |||
538 | unsigned SuccID = Succ->getID(); | |||
539 | ||||
540 | // Check if not already predecessor. | |||
541 | for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) { | |||
542 | if (SuccID == S.first->getID()) { | |||
543 | if (S.second == SIScheduleBlockLinkKind::NoData && | |||
544 | Kind == SIScheduleBlockLinkKind::Data) | |||
545 | S.second = Kind; | |||
546 | return; | |||
547 | } | |||
548 | } | |||
549 | if (Succ->isHighLatencyBlock()) | |||
550 | ++NumHighLatencySuccessors; | |||
551 | Succs.push_back(std::make_pair(Succ, Kind)); | |||
552 | ||||
553 | assert(none_of(Preds,(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 555, __extension__ __PRETTY_FUNCTION__)) | |||
554 | [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 555, __extension__ __PRETTY_FUNCTION__)) | |||
555 | "Loop in the Block Graph!")(static_cast <bool> (none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && "Loop in the Block Graph!" ) ? void (0) : __assert_fail ("none_of(Preds, [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) && \"Loop in the Block Graph!\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 555, __extension__ __PRETTY_FUNCTION__)); | |||
556 | } | |||
557 | ||||
558 | #ifndef NDEBUG | |||
559 | void SIScheduleBlock::printDebug(bool full) { | |||
560 | dbgs() << "Block (" << ID << ")\n"; | |||
561 | if (!full) | |||
562 | return; | |||
563 | ||||
564 | dbgs() << "\nContains High Latency Instruction: " | |||
565 | << HighLatencyBlock << '\n'; | |||
566 | dbgs() << "\nDepends On:\n"; | |||
567 | for (SIScheduleBlock* P : Preds) { | |||
568 | P->printDebug(false); | |||
569 | } | |||
570 | ||||
571 | dbgs() << "\nSuccessors:\n"; | |||
572 | for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) { | |||
573 | if (S.second == SIScheduleBlockLinkKind::Data) | |||
574 | dbgs() << "(Data Dep) "; | |||
575 | S.first->printDebug(false); | |||
576 | } | |||
577 | ||||
578 | if (Scheduled) { | |||
579 | dbgs() << "LiveInPressure " | |||
580 | << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' | |||
581 | << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n'; | |||
582 | dbgs() << "LiveOutPressure " | |||
583 | << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' | |||
584 | << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; | |||
585 | dbgs() << "LiveIns:\n"; | |||
586 | for (unsigned Reg : LiveInRegs) | |||
587 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; | |||
588 | ||||
589 | dbgs() << "\nLiveOuts:\n"; | |||
590 | for (unsigned Reg : LiveOutRegs) | |||
591 | dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; | |||
592 | } | |||
593 | ||||
594 | dbgs() << "\nInstructions:\n"; | |||
595 | for (const SUnit* SU : SUnits) | |||
596 | DAG->dumpNode(*SU); | |||
597 | ||||
598 | dbgs() << "///////////////////////\n"; | |||
599 | } | |||
600 | #endif | |||
601 | ||||
602 | // SIScheduleBlockCreator // | |||
603 | ||||
604 | SIScheduleBlockCreator::SIScheduleBlockCreator(SIScheduleDAGMI *DAG) | |||
605 | : DAG(DAG) {} | |||
606 | ||||
607 | SIScheduleBlocks | |||
608 | SIScheduleBlockCreator::getBlocks(SISchedulerBlockCreatorVariant BlockVariant) { | |||
609 | std::map<SISchedulerBlockCreatorVariant, SIScheduleBlocks>::iterator B = | |||
610 | Blocks.find(BlockVariant); | |||
611 | if (B == Blocks.end()) { | |||
612 | SIScheduleBlocks Res; | |||
613 | createBlocksForVariant(BlockVariant); | |||
614 | topologicalSort(); | |||
615 | scheduleInsideBlocks(); | |||
616 | fillStats(); | |||
617 | Res.Blocks = CurrentBlocks; | |||
618 | Res.TopDownIndex2Block = TopDownIndex2Block; | |||
619 | Res.TopDownBlock2Index = TopDownBlock2Index; | |||
620 | Blocks[BlockVariant] = Res; | |||
621 | return Res; | |||
622 | } else { | |||
623 | return B->second; | |||
624 | } | |||
625 | } | |||
626 | ||||
627 | bool SIScheduleBlockCreator::isSUInBlock(SUnit *SU, unsigned ID) { | |||
628 | if (SU->NodeNum >= DAG->SUnits.size()) | |||
629 | return false; | |||
630 | return CurrentBlocks[Node2CurrentBlock[SU->NodeNum]]->getID() == ID; | |||
631 | } | |||
632 | ||||
633 | void SIScheduleBlockCreator::colorHighLatenciesAlone() { | |||
634 | unsigned DAGSize = DAG->SUnits.size(); | |||
635 | ||||
636 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
637 | SUnit *SU = &DAG->SUnits[i]; | |||
638 | if (DAG->IsHighLatencySU[SU->NodeNum]) { | |||
639 | CurrentColoring[SU->NodeNum] = NextReservedID++; | |||
640 | } | |||
641 | } | |||
642 | } | |||
643 | ||||
644 | static bool | |||
645 | hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) { | |||
646 | for (const auto &PredDep : SU.Preds) { | |||
647 | if (PredDep.getSUnit() == &FromSU && | |||
648 | PredDep.getKind() == llvm::SDep::Data) | |||
649 | return true; | |||
650 | } | |||
651 | return false; | |||
652 | } | |||
653 | ||||
654 | void SIScheduleBlockCreator::colorHighLatenciesGroups() { | |||
655 | unsigned DAGSize = DAG->SUnits.size(); | |||
656 | unsigned NumHighLatencies = 0; | |||
657 | unsigned GroupSize; | |||
658 | int Color = NextReservedID; | |||
659 | unsigned Count = 0; | |||
660 | std::set<unsigned> FormingGroup; | |||
661 | ||||
662 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
663 | SUnit *SU = &DAG->SUnits[i]; | |||
664 | if (DAG->IsHighLatencySU[SU->NodeNum]) | |||
665 | ++NumHighLatencies; | |||
666 | } | |||
667 | ||||
668 | if (NumHighLatencies == 0) | |||
669 | return; | |||
670 | ||||
671 | if (NumHighLatencies <= 6) | |||
672 | GroupSize = 2; | |||
673 | else if (NumHighLatencies <= 12) | |||
674 | GroupSize = 3; | |||
675 | else | |||
676 | GroupSize = 4; | |||
677 | ||||
678 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||
679 | const SUnit &SU = DAG->SUnits[SUNum]; | |||
680 | if (DAG->IsHighLatencySU[SU.NodeNum]) { | |||
681 | unsigned CompatibleGroup = true; | |||
682 | int ProposedColor = Color; | |||
683 | std::vector<int> AdditionalElements; | |||
684 | ||||
685 | // We don't want to put in the same block | |||
686 | // two high latency instructions that depend | |||
687 | // on each other. | |||
688 | // One way would be to check canAddEdge | |||
689 | // in both directions, but that currently is not | |||
690 | // enough because there the high latency order is | |||
691 | // enforced (via links). | |||
692 | // Instead, look at the dependencies between the | |||
693 | // high latency instructions and deduce if it is | |||
694 | // a data dependency or not. | |||
695 | for (unsigned j : FormingGroup) { | |||
696 | bool HasSubGraph; | |||
697 | std::vector<int> SubGraph; | |||
698 | // By construction (topological order), if SU and | |||
699 | // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary | |||
700 | // in the parent graph of SU. | |||
701 | #ifndef NDEBUG | |||
702 | SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], | |||
703 | HasSubGraph); | |||
704 | assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail ("!HasSubGraph", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 704, __extension__ __PRETTY_FUNCTION__)); | |||
705 | #endif | |||
706 | SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, | |||
707 | HasSubGraph); | |||
708 | if (!HasSubGraph) | |||
709 | continue; // No dependencies between each other | |||
710 | else if (SubGraph.size() > 5) { | |||
711 | // Too many elements would be required to be added to the block. | |||
712 | CompatibleGroup = false; | |||
713 | break; | |||
714 | } | |||
715 | else { | |||
716 | // Check the type of dependency | |||
717 | for (unsigned k : SubGraph) { | |||
718 | // If in the path to join the two instructions, | |||
719 | // there is another high latency instruction, | |||
720 | // or instructions colored for another block | |||
721 | // abort the merge. | |||
722 | if (DAG->IsHighLatencySU[k] || | |||
723 | (CurrentColoring[k] != ProposedColor && | |||
724 | CurrentColoring[k] != 0)) { | |||
725 | CompatibleGroup = false; | |||
726 | break; | |||
727 | } | |||
728 | // If one of the SU in the subgraph depends on the result of SU j, | |||
729 | // there'll be a data dependency. | |||
730 | if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) { | |||
731 | CompatibleGroup = false; | |||
732 | break; | |||
733 | } | |||
734 | } | |||
735 | if (!CompatibleGroup) | |||
736 | break; | |||
737 | // Same check for the SU | |||
738 | if (hasDataDependencyPred(SU, DAG->SUnits[j])) { | |||
739 | CompatibleGroup = false; | |||
740 | break; | |||
741 | } | |||
742 | // Add all the required instructions to the block | |||
743 | // These cannot live in another block (because they | |||
744 | // depend (order dependency) on one of the | |||
745 | // instruction in the block, and are required for the | |||
746 | // high latency instruction we add. | |||
747 | llvm::append_range(AdditionalElements, SubGraph); | |||
748 | } | |||
749 | } | |||
750 | if (CompatibleGroup) { | |||
751 | FormingGroup.insert(SU.NodeNum); | |||
752 | for (unsigned j : AdditionalElements) | |||
753 | CurrentColoring[j] = ProposedColor; | |||
754 | CurrentColoring[SU.NodeNum] = ProposedColor; | |||
755 | ++Count; | |||
756 | } | |||
757 | // Found one incompatible instruction, | |||
758 | // or has filled a big enough group. | |||
759 | // -> start a new one. | |||
760 | if (!CompatibleGroup) { | |||
761 | FormingGroup.clear(); | |||
762 | Color = ++NextReservedID; | |||
763 | ProposedColor = Color; | |||
764 | FormingGroup.insert(SU.NodeNum); | |||
765 | CurrentColoring[SU.NodeNum] = ProposedColor; | |||
766 | Count = 0; | |||
767 | } else if (Count == GroupSize) { | |||
768 | FormingGroup.clear(); | |||
769 | Color = ++NextReservedID; | |||
770 | ProposedColor = Color; | |||
771 | Count = 0; | |||
772 | } | |||
773 | } | |||
774 | } | |||
775 | } | |||
776 | ||||
777 | void SIScheduleBlockCreator::colorComputeReservedDependencies() { | |||
778 | unsigned DAGSize = DAG->SUnits.size(); | |||
779 | std::map<std::set<unsigned>, unsigned> ColorCombinations; | |||
780 | ||||
781 | CurrentTopDownReservedDependencyColoring.clear(); | |||
782 | CurrentBottomUpReservedDependencyColoring.clear(); | |||
783 | ||||
784 | CurrentTopDownReservedDependencyColoring.resize(DAGSize, 0); | |||
785 | CurrentBottomUpReservedDependencyColoring.resize(DAGSize, 0); | |||
786 | ||||
787 | // Traverse TopDown, and give different colors to SUs depending | |||
788 | // on which combination of High Latencies they depend on. | |||
789 | ||||
790 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||
791 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
792 | std::set<unsigned> SUColors; | |||
793 | ||||
794 | // Already given. | |||
795 | if (CurrentColoring[SU->NodeNum]) { | |||
796 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||
797 | CurrentColoring[SU->NodeNum]; | |||
798 | continue; | |||
799 | } | |||
800 | ||||
801 | for (SDep& PredDep : SU->Preds) { | |||
802 | SUnit *Pred = PredDep.getSUnit(); | |||
803 | if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) | |||
804 | continue; | |||
805 | if (CurrentTopDownReservedDependencyColoring[Pred->NodeNum] > 0) | |||
806 | SUColors.insert(CurrentTopDownReservedDependencyColoring[Pred->NodeNum]); | |||
807 | } | |||
808 | // Color 0 by default. | |||
809 | if (SUColors.empty()) | |||
810 | continue; | |||
811 | // Same color than parents. | |||
812 | if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) | |||
813 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||
814 | *SUColors.begin(); | |||
815 | else { | |||
816 | std::map<std::set<unsigned>, unsigned>::iterator Pos = | |||
817 | ColorCombinations.find(SUColors); | |||
818 | if (Pos != ColorCombinations.end()) { | |||
819 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = Pos->second; | |||
820 | } else { | |||
821 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] = | |||
822 | NextNonReservedID; | |||
823 | ColorCombinations[SUColors] = NextNonReservedID++; | |||
824 | } | |||
825 | } | |||
826 | } | |||
827 | ||||
828 | ColorCombinations.clear(); | |||
829 | ||||
830 | // Same as before, but BottomUp. | |||
831 | ||||
832 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
833 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
834 | std::set<unsigned> SUColors; | |||
835 | ||||
836 | // Already given. | |||
837 | if (CurrentColoring[SU->NodeNum]) { | |||
838 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||
839 | CurrentColoring[SU->NodeNum]; | |||
840 | continue; | |||
841 | } | |||
842 | ||||
843 | for (SDep& SuccDep : SU->Succs) { | |||
844 | SUnit *Succ = SuccDep.getSUnit(); | |||
845 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
846 | continue; | |||
847 | if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0) | |||
848 | SUColors.insert(CurrentBottomUpReservedDependencyColoring[Succ->NodeNum]); | |||
849 | } | |||
850 | // Keep color 0. | |||
851 | if (SUColors.empty()) | |||
852 | continue; | |||
853 | // Same color than parents. | |||
854 | if (SUColors.size() == 1 && *SUColors.begin() > DAGSize) | |||
855 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||
856 | *SUColors.begin(); | |||
857 | else { | |||
858 | std::map<std::set<unsigned>, unsigned>::iterator Pos = | |||
859 | ColorCombinations.find(SUColors); | |||
860 | if (Pos != ColorCombinations.end()) { | |||
861 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = Pos->second; | |||
862 | } else { | |||
863 | CurrentBottomUpReservedDependencyColoring[SU->NodeNum] = | |||
864 | NextNonReservedID; | |||
865 | ColorCombinations[SUColors] = NextNonReservedID++; | |||
866 | } | |||
867 | } | |||
868 | } | |||
869 | } | |||
870 | ||||
871 | void SIScheduleBlockCreator::colorAccordingToReservedDependencies() { | |||
872 | unsigned DAGSize = DAG->SUnits.size(); | |||
873 | std::map<std::pair<unsigned, unsigned>, unsigned> ColorCombinations; | |||
874 | ||||
875 | // Every combination of colors given by the top down | |||
876 | // and bottom up Reserved node dependency | |||
877 | ||||
878 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
879 | SUnit *SU = &DAG->SUnits[i]; | |||
880 | std::pair<unsigned, unsigned> SUColors; | |||
881 | ||||
882 | // High latency instructions: already given. | |||
883 | if (CurrentColoring[SU->NodeNum]) | |||
884 | continue; | |||
885 | ||||
886 | SUColors.first = CurrentTopDownReservedDependencyColoring[SU->NodeNum]; | |||
887 | SUColors.second = CurrentBottomUpReservedDependencyColoring[SU->NodeNum]; | |||
888 | ||||
889 | std::map<std::pair<unsigned, unsigned>, unsigned>::iterator Pos = | |||
890 | ColorCombinations.find(SUColors); | |||
891 | if (Pos != ColorCombinations.end()) { | |||
892 | CurrentColoring[SU->NodeNum] = Pos->second; | |||
893 | } else { | |||
894 | CurrentColoring[SU->NodeNum] = NextNonReservedID; | |||
895 | ColorCombinations[SUColors] = NextNonReservedID++; | |||
896 | } | |||
897 | } | |||
898 | } | |||
899 | ||||
900 | void SIScheduleBlockCreator::colorEndsAccordingToDependencies() { | |||
901 | unsigned DAGSize = DAG->SUnits.size(); | |||
902 | std::vector<int> PendingColoring = CurrentColoring; | |||
903 | ||||
904 | assert(DAGSize >= 1 &&(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 906, __extension__ __PRETTY_FUNCTION__)) | |||
905 | CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 906, __extension__ __PRETTY_FUNCTION__)) | |||
906 | CurrentTopDownReservedDependencyColoring.size() == DAGSize)(static_cast <bool> (DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring .size() == DAGSize && CurrentTopDownReservedDependencyColoring .size() == DAGSize) ? void (0) : __assert_fail ("DAGSize >= 1 && CurrentBottomUpReservedDependencyColoring.size() == DAGSize && CurrentTopDownReservedDependencyColoring.size() == DAGSize" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 906, __extension__ __PRETTY_FUNCTION__)); | |||
907 | // If there is no reserved block at all, do nothing. We don't want | |||
908 | // everything in one block. | |||
909 | if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(), | |||
910 | CurrentBottomUpReservedDependencyColoring.end()) == 0 && | |||
911 | *std::max_element(CurrentTopDownReservedDependencyColoring.begin(), | |||
912 | CurrentTopDownReservedDependencyColoring.end()) == 0) | |||
913 | return; | |||
914 | ||||
915 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
916 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
917 | std::set<unsigned> SUColors; | |||
918 | std::set<unsigned> SUColorsPending; | |||
919 | ||||
920 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
921 | continue; | |||
922 | ||||
923 | if (CurrentBottomUpReservedDependencyColoring[SU->NodeNum] > 0 || | |||
924 | CurrentTopDownReservedDependencyColoring[SU->NodeNum] > 0) | |||
925 | continue; | |||
926 | ||||
927 | for (SDep& SuccDep : SU->Succs) { | |||
928 | SUnit *Succ = SuccDep.getSUnit(); | |||
929 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
930 | continue; | |||
931 | if (CurrentBottomUpReservedDependencyColoring[Succ->NodeNum] > 0 || | |||
932 | CurrentTopDownReservedDependencyColoring[Succ->NodeNum] > 0) | |||
933 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||
934 | SUColorsPending.insert(PendingColoring[Succ->NodeNum]); | |||
935 | } | |||
936 | // If there is only one child/parent block, and that block | |||
937 | // is not among the ones we are removing in this path, then | |||
938 | // merge the instruction to that block | |||
939 | if (SUColors.size() == 1 && SUColorsPending.size() == 1) | |||
940 | PendingColoring[SU->NodeNum] = *SUColors.begin(); | |||
941 | else // TODO: Attribute new colors depending on color | |||
942 | // combination of children. | |||
943 | PendingColoring[SU->NodeNum] = NextNonReservedID++; | |||
944 | } | |||
945 | CurrentColoring = PendingColoring; | |||
946 | } | |||
947 | ||||
948 | ||||
949 | void SIScheduleBlockCreator::colorForceConsecutiveOrderInGroup() { | |||
950 | unsigned DAGSize = DAG->SUnits.size(); | |||
951 | unsigned PreviousColor; | |||
952 | std::set<unsigned> SeenColors; | |||
953 | ||||
954 | if (DAGSize <= 1) | |||
955 | return; | |||
956 | ||||
957 | PreviousColor = CurrentColoring[0]; | |||
958 | ||||
959 | for (unsigned i = 1, e = DAGSize; i != e; ++i) { | |||
960 | SUnit *SU = &DAG->SUnits[i]; | |||
961 | unsigned CurrentColor = CurrentColoring[i]; | |||
962 | unsigned PreviousColorSave = PreviousColor; | |||
963 | assert(i == SU->NodeNum)(static_cast <bool> (i == SU->NodeNum) ? void (0) : __assert_fail ("i == SU->NodeNum", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 963, __extension__ __PRETTY_FUNCTION__)); | |||
964 | ||||
965 | if (CurrentColor != PreviousColor) | |||
966 | SeenColors.insert(PreviousColor); | |||
967 | PreviousColor = CurrentColor; | |||
968 | ||||
969 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
970 | continue; | |||
971 | ||||
972 | if (SeenColors.find(CurrentColor) == SeenColors.end()) | |||
973 | continue; | |||
974 | ||||
975 | if (PreviousColorSave != CurrentColor) | |||
976 | CurrentColoring[i] = NextNonReservedID++; | |||
977 | else | |||
978 | CurrentColoring[i] = CurrentColoring[i-1]; | |||
979 | } | |||
980 | } | |||
981 | ||||
982 | void SIScheduleBlockCreator::colorMergeConstantLoadsNextGroup() { | |||
983 | unsigned DAGSize = DAG->SUnits.size(); | |||
984 | ||||
985 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
986 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
987 | std::set<unsigned> SUColors; | |||
988 | ||||
989 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
990 | continue; | |||
991 | ||||
992 | // No predecessor: Vgpr constant loading. | |||
993 | // Low latency instructions usually have a predecessor (the address) | |||
994 | if (SU->Preds.size() > 0 && !DAG->IsLowLatencySU[SU->NodeNum]) | |||
995 | continue; | |||
996 | ||||
997 | for (SDep& SuccDep : SU->Succs) { | |||
998 | SUnit *Succ = SuccDep.getSUnit(); | |||
999 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1000 | continue; | |||
1001 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||
1002 | } | |||
1003 | if (SUColors.size() == 1) | |||
1004 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||
1005 | } | |||
1006 | } | |||
1007 | ||||
1008 | void SIScheduleBlockCreator::colorMergeIfPossibleNextGroup() { | |||
1009 | unsigned DAGSize = DAG->SUnits.size(); | |||
1010 | ||||
1011 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
1012 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
1013 | std::set<unsigned> SUColors; | |||
1014 | ||||
1015 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
1016 | continue; | |||
1017 | ||||
1018 | for (SDep& SuccDep : SU->Succs) { | |||
1019 | SUnit *Succ = SuccDep.getSUnit(); | |||
1020 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1021 | continue; | |||
1022 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||
1023 | } | |||
1024 | if (SUColors.size() == 1) | |||
1025 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||
1026 | } | |||
1027 | } | |||
1028 | ||||
1029 | void SIScheduleBlockCreator::colorMergeIfPossibleNextGroupOnlyForReserved() { | |||
1030 | unsigned DAGSize = DAG->SUnits.size(); | |||
1031 | ||||
1032 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
1033 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
1034 | std::set<unsigned> SUColors; | |||
1035 | ||||
1036 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
1037 | continue; | |||
1038 | ||||
1039 | for (SDep& SuccDep : SU->Succs) { | |||
1040 | SUnit *Succ = SuccDep.getSUnit(); | |||
1041 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1042 | continue; | |||
1043 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||
1044 | } | |||
1045 | if (SUColors.size() == 1 && *SUColors.begin() <= DAGSize) | |||
1046 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||
1047 | } | |||
1048 | } | |||
1049 | ||||
1050 | void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() { | |||
1051 | unsigned DAGSize = DAG->SUnits.size(); | |||
1052 | std::map<unsigned, unsigned> ColorCount; | |||
1053 | ||||
1054 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
1055 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
1056 | unsigned color = CurrentColoring[SU->NodeNum]; | |||
1057 | ++ColorCount[color]; | |||
1058 | } | |||
1059 | ||||
1060 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
1061 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
1062 | unsigned color = CurrentColoring[SU->NodeNum]; | |||
1063 | std::set<unsigned> SUColors; | |||
1064 | ||||
1065 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
1066 | continue; | |||
1067 | ||||
1068 | if (ColorCount[color] > 1) | |||
1069 | continue; | |||
1070 | ||||
1071 | for (SDep& SuccDep : SU->Succs) { | |||
1072 | SUnit *Succ = SuccDep.getSUnit(); | |||
1073 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1074 | continue; | |||
1075 | SUColors.insert(CurrentColoring[Succ->NodeNum]); | |||
1076 | } | |||
1077 | if (SUColors.size() == 1 && *SUColors.begin() != color) { | |||
1078 | --ColorCount[color]; | |||
1079 | CurrentColoring[SU->NodeNum] = *SUColors.begin(); | |||
1080 | ++ColorCount[*SUColors.begin()]; | |||
1081 | } | |||
1082 | } | |||
1083 | } | |||
1084 | ||||
1085 | void SIScheduleBlockCreator::cutHugeBlocks() { | |||
1086 | // TODO | |||
1087 | } | |||
1088 | ||||
1089 | void SIScheduleBlockCreator::regroupNoUserInstructions() { | |||
1090 | unsigned DAGSize = DAG->SUnits.size(); | |||
1091 | int GroupID = NextNonReservedID++; | |||
1092 | ||||
1093 | for (unsigned SUNum : DAG->BottomUpIndex2SU) { | |||
1094 | SUnit *SU = &DAG->SUnits[SUNum]; | |||
1095 | bool hasSuccessor = false; | |||
1096 | ||||
1097 | if (CurrentColoring[SU->NodeNum] <= (int)DAGSize) | |||
1098 | continue; | |||
1099 | ||||
1100 | for (SDep& SuccDep : SU->Succs) { | |||
1101 | SUnit *Succ = SuccDep.getSUnit(); | |||
1102 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1103 | continue; | |||
1104 | hasSuccessor = true; | |||
1105 | } | |||
1106 | if (!hasSuccessor) | |||
1107 | CurrentColoring[SU->NodeNum] = GroupID; | |||
1108 | } | |||
1109 | } | |||
1110 | ||||
1111 | void SIScheduleBlockCreator::colorExports() { | |||
1112 | unsigned ExportColor = NextNonReservedID++; | |||
1113 | SmallVector<unsigned, 8> ExpGroup; | |||
1114 | ||||
1115 | // Put all exports together in a block. | |||
1116 | // The block will naturally end up being scheduled last, | |||
1117 | // thus putting exports at the end of the schedule, which | |||
1118 | // is better for performance. | |||
1119 | // However we must ensure, for safety, the exports can be put | |||
1120 | // together in the same block without any other instruction. | |||
1121 | // This could happen, for example, when scheduling after regalloc | |||
1122 | // if reloading a spilled register from memory using the same | |||
1123 | // register than used in a previous export. | |||
1124 | // If that happens, do not regroup the exports. | |||
1125 | for (unsigned SUNum : DAG->TopDownIndex2SU) { | |||
1126 | const SUnit &SU = DAG->SUnits[SUNum]; | |||
1127 | if (SIInstrInfo::isEXP(*SU.getInstr())) { | |||
1128 | // Check the EXP can be added to the group safely, | |||
1129 | // ie without needing any other instruction. | |||
1130 | // The EXP is allowed to depend on other EXP | |||
1131 | // (they will be in the same group). | |||
1132 | for (unsigned j : ExpGroup) { | |||
1133 | bool HasSubGraph; | |||
1134 | std::vector<int> SubGraph; | |||
1135 | // By construction (topological order), if SU and | |||
1136 | // DAG->SUnits[j] are linked, DAG->SUnits[j] is neccessary | |||
1137 | // in the parent graph of SU. | |||
1138 | #ifndef NDEBUG | |||
1139 | SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j], | |||
1140 | HasSubGraph); | |||
1141 | assert(!HasSubGraph)(static_cast <bool> (!HasSubGraph) ? void (0) : __assert_fail ("!HasSubGraph", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1141, __extension__ __PRETTY_FUNCTION__)); | |||
1142 | #endif | |||
1143 | SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU, | |||
1144 | HasSubGraph); | |||
1145 | if (!HasSubGraph) | |||
1146 | continue; // No dependencies between each other | |||
1147 | ||||
1148 | // SubGraph contains all the instructions required | |||
1149 | // between EXP SUnits[j] and EXP SU. | |||
1150 | for (unsigned k : SubGraph) { | |||
1151 | if (!SIInstrInfo::isEXP(*DAG->SUnits[k].getInstr())) | |||
1152 | // Other instructions than EXP would be required in the group. | |||
1153 | // Abort the groupping. | |||
1154 | return; | |||
1155 | } | |||
1156 | } | |||
1157 | ||||
1158 | ExpGroup.push_back(SUNum); | |||
1159 | } | |||
1160 | } | |||
1161 | ||||
1162 | // The group can be formed. Give the color. | |||
1163 | for (unsigned j : ExpGroup) | |||
1164 | CurrentColoring[j] = ExportColor; | |||
1165 | } | |||
1166 | ||||
1167 | void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant) { | |||
1168 | unsigned DAGSize = DAG->SUnits.size(); | |||
1169 | std::map<unsigned,unsigned> RealID; | |||
1170 | ||||
1171 | CurrentBlocks.clear(); | |||
1172 | CurrentColoring.clear(); | |||
1173 | CurrentColoring.resize(DAGSize, 0); | |||
1174 | Node2CurrentBlock.clear(); | |||
1175 | ||||
1176 | // Restore links previous scheduling variant has overridden. | |||
1177 | DAG->restoreSULinksLeft(); | |||
1178 | ||||
1179 | NextReservedID = 1; | |||
1180 | NextNonReservedID = DAGSize + 1; | |||
1181 | ||||
1182 | LLVM_DEBUG(dbgs() << "Coloring the graph\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Coloring the graph\n" ; } } while (false); | |||
1183 | ||||
1184 | if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesGrouped) | |||
1185 | colorHighLatenciesGroups(); | |||
1186 | else | |||
1187 | colorHighLatenciesAlone(); | |||
1188 | colorComputeReservedDependencies(); | |||
1189 | colorAccordingToReservedDependencies(); | |||
1190 | colorEndsAccordingToDependencies(); | |||
1191 | if (BlockVariant == SISchedulerBlockCreatorVariant::LatenciesAlonePlusConsecutive) | |||
1192 | colorForceConsecutiveOrderInGroup(); | |||
1193 | regroupNoUserInstructions(); | |||
1194 | colorMergeConstantLoadsNextGroup(); | |||
1195 | colorMergeIfPossibleNextGroupOnlyForReserved(); | |||
1196 | colorExports(); | |||
1197 | ||||
1198 | // Put SUs of same color into same block | |||
1199 | Node2CurrentBlock.resize(DAGSize, -1); | |||
1200 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1201 | SUnit *SU = &DAG->SUnits[i]; | |||
1202 | unsigned Color = CurrentColoring[SU->NodeNum]; | |||
1203 | if (RealID.find(Color) == RealID.end()) { | |||
1204 | int ID = CurrentBlocks.size(); | |||
1205 | BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID)); | |||
1206 | CurrentBlocks.push_back(BlockPtrs.rbegin()->get()); | |||
1207 | RealID[Color] = ID; | |||
1208 | } | |||
1209 | CurrentBlocks[RealID[Color]]->addUnit(SU); | |||
1210 | Node2CurrentBlock[SU->NodeNum] = RealID[Color]; | |||
1211 | } | |||
1212 | ||||
1213 | // Build dependencies between blocks. | |||
1214 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1215 | SUnit *SU = &DAG->SUnits[i]; | |||
1216 | int SUID = Node2CurrentBlock[i]; | |||
1217 | for (SDep& SuccDep : SU->Succs) { | |||
1218 | SUnit *Succ = SuccDep.getSUnit(); | |||
1219 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1220 | continue; | |||
1221 | if (Node2CurrentBlock[Succ->NodeNum] != SUID) | |||
1222 | CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]], | |||
1223 | SuccDep.isCtrl() ? NoData : Data); | |||
1224 | } | |||
1225 | for (SDep& PredDep : SU->Preds) { | |||
1226 | SUnit *Pred = PredDep.getSUnit(); | |||
1227 | if (PredDep.isWeak() || Pred->NodeNum >= DAGSize) | |||
1228 | continue; | |||
1229 | if (Node2CurrentBlock[Pred->NodeNum] != SUID) | |||
1230 | CurrentBlocks[SUID]->addPred(CurrentBlocks[Node2CurrentBlock[Pred->NodeNum]]); | |||
1231 | } | |||
1232 | } | |||
1233 | ||||
1234 | // Free root and leafs of all blocks to enable scheduling inside them. | |||
1235 | for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { | |||
1236 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1237 | Block->finalizeUnits(); | |||
1238 | } | |||
1239 | LLVM_DEBUG(dbgs() << "Blocks created:\n\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||
1240 | for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||
1241 | SIScheduleBlock *Block = CurrentBlocks[i];do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||
1242 | Block->printDebug(true);do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false) | |||
1243 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Blocks created:\n\n" ; for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks[i]; Block->printDebug (true); }; } } while (false); | |||
1244 | } | |||
1245 | ||||
1246 | // Two functions taken from Codegen/MachineScheduler.cpp | |||
1247 | ||||
1248 | /// Non-const version. | |||
1249 | static MachineBasicBlock::iterator | |||
1250 | nextIfDebug(MachineBasicBlock::iterator I, | |||
1251 | MachineBasicBlock::const_iterator End) { | |||
1252 | for (; I != End; ++I) { | |||
1253 | if (!I->isDebugInstr()) | |||
1254 | break; | |||
1255 | } | |||
1256 | return I; | |||
1257 | } | |||
1258 | ||||
1259 | void SIScheduleBlockCreator::topologicalSort() { | |||
1260 | unsigned DAGSize = CurrentBlocks.size(); | |||
1261 | std::vector<int> WorkList; | |||
1262 | ||||
1263 | LLVM_DEBUG(dbgs() << "Topological Sort\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Topological Sort\n" ; } } while (false); | |||
1264 | ||||
1265 | WorkList.reserve(DAGSize); | |||
1266 | TopDownIndex2Block.resize(DAGSize); | |||
1267 | TopDownBlock2Index.resize(DAGSize); | |||
1268 | BottomUpIndex2Block.resize(DAGSize); | |||
1269 | ||||
1270 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1271 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1272 | unsigned Degree = Block->getSuccs().size(); | |||
1273 | TopDownBlock2Index[i] = Degree; | |||
1274 | if (Degree == 0) { | |||
1275 | WorkList.push_back(i); | |||
1276 | } | |||
1277 | } | |||
1278 | ||||
1279 | int Id = DAGSize; | |||
1280 | while (!WorkList.empty()) { | |||
1281 | int i = WorkList.back(); | |||
1282 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1283 | WorkList.pop_back(); | |||
1284 | TopDownBlock2Index[i] = --Id; | |||
1285 | TopDownIndex2Block[Id] = i; | |||
1286 | for (SIScheduleBlock* Pred : Block->getPreds()) { | |||
1287 | if (!--TopDownBlock2Index[Pred->getID()]) | |||
1288 | WorkList.push_back(Pred->getID()); | |||
1289 | } | |||
1290 | } | |||
1291 | ||||
1292 | #ifndef NDEBUG | |||
1293 | // Check correctness of the ordering. | |||
1294 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1295 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1296 | for (SIScheduleBlock* Pred : Block->getPreds()) { | |||
1297 | assert(TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] &&(static_cast <bool> (TopDownBlock2Index[i] > TopDownBlock2Index [Pred->getID()] && "Wrong Top Down topological sorting" ) ? void (0) : __assert_fail ("TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && \"Wrong Top Down topological sorting\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1298, __extension__ __PRETTY_FUNCTION__)) | |||
1298 | "Wrong Top Down topological sorting")(static_cast <bool> (TopDownBlock2Index[i] > TopDownBlock2Index [Pred->getID()] && "Wrong Top Down topological sorting" ) ? void (0) : __assert_fail ("TopDownBlock2Index[i] > TopDownBlock2Index[Pred->getID()] && \"Wrong Top Down topological sorting\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1298, __extension__ __PRETTY_FUNCTION__)); | |||
1299 | } | |||
1300 | } | |||
1301 | #endif | |||
1302 | ||||
1303 | BottomUpIndex2Block = std::vector<int>(TopDownIndex2Block.rbegin(), | |||
1304 | TopDownIndex2Block.rend()); | |||
1305 | } | |||
1306 | ||||
1307 | void SIScheduleBlockCreator::scheduleInsideBlocks() { | |||
1308 | unsigned DAGSize = CurrentBlocks.size(); | |||
1309 | ||||
1310 | LLVM_DEBUG(dbgs() << "\nScheduling Blocks\n\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "\nScheduling Blocks\n\n" ; } } while (false); | |||
1311 | ||||
1312 | // We do schedule a valid scheduling such that a Block corresponds | |||
1313 | // to a range of instructions. | |||
1314 | LLVM_DEBUG(dbgs() << "First phase: Fast scheduling for Reg Liveness\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "First phase: Fast scheduling for Reg Liveness\n" ; } } while (false); | |||
1315 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1316 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1317 | Block->fastSchedule(); | |||
1318 | } | |||
1319 | ||||
1320 | // Note: the following code, and the part restoring previous position | |||
1321 | // is by far the most expensive operation of the Scheduler. | |||
1322 | ||||
1323 | // Do not update CurrentTop. | |||
1324 | MachineBasicBlock::iterator CurrentTopFastSched = DAG->getCurrentTop(); | |||
1325 | std::vector<MachineBasicBlock::iterator> PosOld; | |||
1326 | std::vector<MachineBasicBlock::iterator> PosNew; | |||
1327 | PosOld.reserve(DAG->SUnits.size()); | |||
1328 | PosNew.reserve(DAG->SUnits.size()); | |||
1329 | ||||
1330 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1331 | int BlockIndice = TopDownIndex2Block[i]; | |||
1332 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||
1333 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||
1334 | ||||
1335 | for (SUnit* SU : SUs) { | |||
1336 | MachineInstr *MI = SU->getInstr(); | |||
1337 | MachineBasicBlock::iterator Pos = MI; | |||
1338 | PosOld.push_back(Pos); | |||
1339 | if (&*CurrentTopFastSched == MI) { | |||
1340 | PosNew.push_back(Pos); | |||
1341 | CurrentTopFastSched = nextIfDebug(++CurrentTopFastSched, | |||
1342 | DAG->getCurrentBottom()); | |||
1343 | } else { | |||
1344 | // Update the instruction stream. | |||
1345 | DAG->getBB()->splice(CurrentTopFastSched, DAG->getBB(), MI); | |||
1346 | ||||
1347 | // Update LiveIntervals. | |||
1348 | // Note: Moving all instructions and calling handleMove every time | |||
1349 | // is the most cpu intensive operation of the scheduler. | |||
1350 | // It would gain a lot if there was a way to recompute the | |||
1351 | // LiveIntervals for the entire scheduling region. | |||
1352 | DAG->getLIS()->handleMove(*MI, /*UpdateFlags=*/true); | |||
1353 | PosNew.push_back(CurrentTopFastSched); | |||
1354 | } | |||
1355 | } | |||
1356 | } | |||
1357 | ||||
1358 | // Now we have Block of SUs == Block of MI. | |||
1359 | // We do the final schedule for the instructions inside the block. | |||
1360 | // The property that all the SUs of the Block are grouped together as MI | |||
1361 | // is used for correct reg usage tracking. | |||
1362 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1363 | SIScheduleBlock *Block = CurrentBlocks[i]; | |||
1364 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||
1365 | Block->schedule((*SUs.begin())->getInstr(), (*SUs.rbegin())->getInstr()); | |||
1366 | } | |||
1367 | ||||
1368 | LLVM_DEBUG(dbgs() << "Restoring MI Pos\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Restoring MI Pos\n" ; } } while (false); | |||
1369 | // Restore old ordering (which prevents a LIS->handleMove bug). | |||
1370 | for (unsigned i = PosOld.size(), e = 0; i != e; --i) { | |||
1371 | MachineBasicBlock::iterator POld = PosOld[i-1]; | |||
1372 | MachineBasicBlock::iterator PNew = PosNew[i-1]; | |||
1373 | if (PNew != POld) { | |||
1374 | // Update the instruction stream. | |||
1375 | DAG->getBB()->splice(POld, DAG->getBB(), PNew); | |||
1376 | ||||
1377 | // Update LiveIntervals. | |||
1378 | DAG->getLIS()->handleMove(*POld, /*UpdateFlags=*/true); | |||
1379 | } | |||
1380 | } | |||
1381 | ||||
1382 | LLVM_DEBUG(for (unsigned i = 0, e = CurrentBlocks.size(); i != e; ++i) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||
1383 | SIScheduleBlock *Block = CurrentBlocks[i];do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||
1384 | Block->printDebug(true);do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false) | |||
1385 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { for (unsigned i = 0, e = CurrentBlocks .size(); i != e; ++i) { SIScheduleBlock *Block = CurrentBlocks [i]; Block->printDebug(true); }; } } while (false); | |||
1386 | } | |||
1387 | ||||
1388 | void SIScheduleBlockCreator::fillStats() { | |||
1389 | unsigned DAGSize = CurrentBlocks.size(); | |||
1390 | ||||
1391 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1392 | int BlockIndice = TopDownIndex2Block[i]; | |||
1393 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||
1394 | if (Block->getPreds().empty()) | |||
1395 | Block->Depth = 0; | |||
1396 | else { | |||
1397 | unsigned Depth = 0; | |||
1398 | for (SIScheduleBlock *Pred : Block->getPreds()) { | |||
1399 | if (Depth < Pred->Depth + Pred->getCost()) | |||
1400 | Depth = Pred->Depth + Pred->getCost(); | |||
1401 | } | |||
1402 | Block->Depth = Depth; | |||
1403 | } | |||
1404 | } | |||
1405 | ||||
1406 | for (unsigned i = 0, e = DAGSize; i != e; ++i) { | |||
1407 | int BlockIndice = BottomUpIndex2Block[i]; | |||
1408 | SIScheduleBlock *Block = CurrentBlocks[BlockIndice]; | |||
1409 | if (Block->getSuccs().empty()) | |||
1410 | Block->Height = 0; | |||
1411 | else { | |||
1412 | unsigned Height = 0; | |||
1413 | for (const auto &Succ : Block->getSuccs()) | |||
1414 | Height = std::max(Height, Succ.first->Height + Succ.first->getCost()); | |||
1415 | Block->Height = Height; | |||
1416 | } | |||
1417 | } | |||
1418 | } | |||
1419 | ||||
1420 | // SIScheduleBlockScheduler // | |||
1421 | ||||
1422 | SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, | |||
1423 | SISchedulerBlockSchedulerVariant Variant, | |||
1424 | SIScheduleBlocks BlocksStruct) : | |||
1425 | DAG(DAG), Variant(Variant), Blocks(BlocksStruct.Blocks), | |||
1426 | LastPosWaitedHighLatency(0), NumBlockScheduled(0), VregCurrentUsage(0), | |||
1427 | SregCurrentUsage(0), maxVregUsage(0), maxSregUsage(0) { | |||
1428 | ||||
1429 | // Fill the usage of every output | |||
1430 | // Warning: while by construction we always have a link between two blocks | |||
1431 | // when one needs a result from the other, the number of users of an output | |||
1432 | // is not the sum of child blocks having as input the same virtual register. | |||
1433 | // Here is an example. A produces x and y. B eats x and produces x'. | |||
1434 | // C eats x' and y. The register coalescer may have attributed the same | |||
1435 | // virtual register to x and x'. | |||
1436 | // To count accurately, we do a topological sort. In case the register is | |||
1437 | // found for several parents, we increment the usage of the one with the | |||
1438 | // highest topological index. | |||
1439 | LiveOutRegsNumUsages.resize(Blocks.size()); | |||
1440 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1441 | SIScheduleBlock *Block = Blocks[i]; | |||
1442 | for (unsigned Reg : Block->getInRegs()) { | |||
1443 | bool Found = false; | |||
1444 | int topoInd = -1; | |||
1445 | for (SIScheduleBlock* Pred: Block->getPreds()) { | |||
1446 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); | |||
1447 | std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); | |||
1448 | ||||
1449 | if (RegPos != PredOutRegs.end()) { | |||
1450 | Found = true; | |||
1451 | if (topoInd < BlocksStruct.TopDownBlock2Index[Pred->getID()]) { | |||
1452 | topoInd = BlocksStruct.TopDownBlock2Index[Pred->getID()]; | |||
1453 | } | |||
1454 | } | |||
1455 | } | |||
1456 | ||||
1457 | if (!Found) | |||
1458 | continue; | |||
1459 | ||||
1460 | int PredID = BlocksStruct.TopDownIndex2Block[topoInd]; | |||
1461 | ++LiveOutRegsNumUsages[PredID][Reg]; | |||
1462 | } | |||
1463 | } | |||
1464 | ||||
1465 | LastPosHighLatencyParentScheduled.resize(Blocks.size(), 0); | |||
1466 | BlockNumPredsLeft.resize(Blocks.size()); | |||
1467 | BlockNumSuccsLeft.resize(Blocks.size()); | |||
1468 | ||||
1469 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1470 | SIScheduleBlock *Block = Blocks[i]; | |||
1471 | BlockNumPredsLeft[i] = Block->getPreds().size(); | |||
1472 | BlockNumSuccsLeft[i] = Block->getSuccs().size(); | |||
1473 | } | |||
1474 | ||||
1475 | #ifndef NDEBUG | |||
1476 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1477 | SIScheduleBlock *Block = Blocks[i]; | |||
1478 | assert(Block->getID() == i)(static_cast <bool> (Block->getID() == i) ? void (0) : __assert_fail ("Block->getID() == i", "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1478, __extension__ __PRETTY_FUNCTION__)); | |||
1479 | } | |||
1480 | #endif | |||
1481 | ||||
1482 | std::set<unsigned> InRegs = DAG->getInRegs(); | |||
1483 | addLiveRegs(InRegs); | |||
1484 | ||||
1485 | // Increase LiveOutRegsNumUsages for blocks | |||
1486 | // producing registers consumed in another | |||
1487 | // scheduling region. | |||
1488 | for (unsigned Reg : DAG->getOutRegs()) { | |||
1489 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1490 | // Do reverse traversal | |||
1491 | int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; | |||
1492 | SIScheduleBlock *Block = Blocks[ID]; | |||
1493 | const std::set<unsigned> &OutRegs = Block->getOutRegs(); | |||
1494 | ||||
1495 | if (OutRegs.find(Reg) == OutRegs.end()) | |||
1496 | continue; | |||
1497 | ||||
1498 | ++LiveOutRegsNumUsages[ID][Reg]; | |||
1499 | break; | |||
1500 | } | |||
1501 | } | |||
1502 | ||||
1503 | // Fill LiveRegsConsumers for regs that were already | |||
1504 | // defined before scheduling. | |||
1505 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1506 | SIScheduleBlock *Block = Blocks[i]; | |||
1507 | for (unsigned Reg : Block->getInRegs()) { | |||
1508 | bool Found = false; | |||
1509 | for (SIScheduleBlock* Pred: Block->getPreds()) { | |||
1510 | std::set<unsigned> PredOutRegs = Pred->getOutRegs(); | |||
1511 | std::set<unsigned>::iterator RegPos = PredOutRegs.find(Reg); | |||
1512 | ||||
1513 | if (RegPos != PredOutRegs.end()) { | |||
1514 | Found = true; | |||
1515 | break; | |||
1516 | } | |||
1517 | } | |||
1518 | ||||
1519 | if (!Found) | |||
1520 | ++LiveRegsConsumers[Reg]; | |||
1521 | } | |||
1522 | } | |||
1523 | ||||
1524 | for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { | |||
1525 | SIScheduleBlock *Block = Blocks[i]; | |||
1526 | if (BlockNumPredsLeft[i] == 0) { | |||
1527 | ReadyBlocks.push_back(Block); | |||
1528 | } | |||
1529 | } | |||
1530 | ||||
1531 | while (SIScheduleBlock *Block = pickBlock()) { | |||
1532 | BlocksScheduled.push_back(Block); | |||
1533 | blockScheduled(Block); | |||
1534 | } | |||
1535 | ||||
1536 | LLVM_DEBUG(dbgs() << "Block Order:"; for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||
1537 | : BlocksScheduled) {do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||
1538 | dbgs() << ' ' << Block->getID();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false) | |||
1539 | } dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Block Order:"; for ( SIScheduleBlock *Block : BlocksScheduled) { dbgs() << ' ' << Block->getID(); } dbgs() << '\n';; } } while (false); | |||
1540 | } | |||
1541 | ||||
1542 | bool SIScheduleBlockScheduler::tryCandidateLatency(SIBlockSchedCandidate &Cand, | |||
1543 | SIBlockSchedCandidate &TryCand) { | |||
1544 | if (!Cand.isValid()) { | |||
1545 | TryCand.Reason = NodeOrder; | |||
1546 | return true; | |||
1547 | } | |||
1548 | ||||
1549 | // Try to hide high latencies. | |||
1550 | if (SISched::tryLess(TryCand.LastPosHighLatParentScheduled, | |||
1551 | Cand.LastPosHighLatParentScheduled, TryCand, Cand, Latency)) | |||
1552 | return true; | |||
1553 | // Schedule high latencies early so you can hide them better. | |||
1554 | if (SISched::tryGreater(TryCand.IsHighLatency, Cand.IsHighLatency, | |||
1555 | TryCand, Cand, Latency)) | |||
1556 | return true; | |||
1557 | if (TryCand.IsHighLatency && SISched::tryGreater(TryCand.Height, Cand.Height, | |||
1558 | TryCand, Cand, Depth)) | |||
1559 | return true; | |||
1560 | if (SISched::tryGreater(TryCand.NumHighLatencySuccessors, | |||
1561 | Cand.NumHighLatencySuccessors, | |||
1562 | TryCand, Cand, Successor)) | |||
1563 | return true; | |||
1564 | return false; | |||
1565 | } | |||
1566 | ||||
1567 | bool SIScheduleBlockScheduler::tryCandidateRegUsage(SIBlockSchedCandidate &Cand, | |||
1568 | SIBlockSchedCandidate &TryCand) { | |||
1569 | if (!Cand.isValid()) { | |||
1570 | TryCand.Reason = NodeOrder; | |||
1571 | return true; | |||
1572 | } | |||
1573 | ||||
1574 | if (SISched::tryLess(TryCand.VGPRUsageDiff > 0, Cand.VGPRUsageDiff > 0, | |||
1575 | TryCand, Cand, RegUsage)) | |||
1576 | return true; | |||
1577 | if (SISched::tryGreater(TryCand.NumSuccessors > 0, | |||
1578 | Cand.NumSuccessors > 0, | |||
1579 | TryCand, Cand, Successor)) | |||
1580 | return true; | |||
1581 | if (SISched::tryGreater(TryCand.Height, Cand.Height, TryCand, Cand, Depth)) | |||
1582 | return true; | |||
1583 | if (SISched::tryLess(TryCand.VGPRUsageDiff, Cand.VGPRUsageDiff, | |||
1584 | TryCand, Cand, RegUsage)) | |||
1585 | return true; | |||
1586 | return false; | |||
1587 | } | |||
1588 | ||||
1589 | SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { | |||
1590 | SIBlockSchedCandidate Cand; | |||
1591 | std::vector<SIScheduleBlock*>::iterator Best; | |||
1592 | SIScheduleBlock *Block; | |||
1593 | if (ReadyBlocks.empty()) | |||
1594 | return nullptr; | |||
1595 | ||||
1596 | DAG->fillVgprSgprCost(LiveRegs.begin(), LiveRegs.end(), | |||
1597 | VregCurrentUsage, SregCurrentUsage); | |||
1598 | if (VregCurrentUsage > maxVregUsage) | |||
1599 | maxVregUsage = VregCurrentUsage; | |||
1600 | if (SregCurrentUsage > maxSregUsage) | |||
1601 | maxSregUsage = SregCurrentUsage; | |||
1602 | LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: ";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1603 | for (SIScheduleBlock *Blockdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1604 | : ReadyBlocks) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1605 | << Block->getID() << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1606 | dbgs() << "\nCurrent Live:\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1607 | for (unsigned Regdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1608 | : LiveRegs) dbgs()do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1609 | << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1610 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1611 | dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false) | |||
1612 | dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking New Blocks\n" ; dbgs() << "Available: "; for (SIScheduleBlock *Block : ReadyBlocks) dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; for (unsigned Reg : LiveRegs ) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';; } } while (false); | |||
1613 | ||||
1614 | Cand.Block = nullptr; | |||
1615 | for (std::vector<SIScheduleBlock*>::iterator I = ReadyBlocks.begin(), | |||
1616 | E = ReadyBlocks.end(); I != E; ++I) { | |||
1617 | SIBlockSchedCandidate TryCand; | |||
1618 | TryCand.Block = *I; | |||
1619 | TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock(); | |||
1620 | TryCand.VGPRUsageDiff = | |||
1621 | checkRegUsageImpact(TryCand.Block->getInRegs(), | |||
1622 | TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32]; | |||
1623 | TryCand.NumSuccessors = TryCand.Block->getSuccs().size(); | |||
1624 | TryCand.NumHighLatencySuccessors = | |||
1625 | TryCand.Block->getNumHighLatencySuccessors(); | |||
1626 | TryCand.LastPosHighLatParentScheduled = | |||
1627 | (unsigned int) std::max<int> (0, | |||
1628 | LastPosHighLatencyParentScheduled[TryCand.Block->getID()] - | |||
1629 | LastPosWaitedHighLatency); | |||
1630 | TryCand.Height = TryCand.Block->Height; | |||
1631 | // Try not to increase VGPR usage too much, else we may spill. | |||
1632 | if (VregCurrentUsage > 120 || | |||
1633 | Variant != SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage) { | |||
1634 | if (!tryCandidateRegUsage(Cand, TryCand) && | |||
1635 | Variant != SISchedulerBlockSchedulerVariant::BlockRegUsage) | |||
1636 | tryCandidateLatency(Cand, TryCand); | |||
1637 | } else { | |||
1638 | if (!tryCandidateLatency(Cand, TryCand)) | |||
1639 | tryCandidateRegUsage(Cand, TryCand); | |||
1640 | } | |||
1641 | if (TryCand.Reason != NoCand) { | |||
1642 | Cand.setBest(TryCand); | |||
1643 | Best = I; | |||
1644 | LLVM_DEBUG(dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' 'do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' << getReasonStr (Cand.Reason) << '\n'; } } while (false) | |||
1645 | << getReasonStr(Cand.Reason) << '\n')do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Best Current Choice: " << Cand.Block->getID() << ' ' << getReasonStr (Cand.Reason) << '\n'; } } while (false); | |||
1646 | } | |||
1647 | } | |||
1648 | ||||
1649 | LLVM_DEBUG(dbgs() << "Picking: " << Cand.Block->getID() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
| ||||
1650 | dbgs() << "Is a block with high latency instruction: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
1651 | << (Cand.IsHighLatency ? "yes\n" : "no\n");do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
1652 | dbgs() << "Position of last high latency dependency: "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
1653 | << Cand.LastPosHighLatParentScheduled << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
1654 | dbgs() << "VGPRUsageDiff: " << Cand.VGPRUsageDiff << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false) | |||
1655 | dbgs() << '\n';)do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Picking: " << Cand.Block->getID() << '\n'; dbgs() << "Is a block with high latency instruction: " << (Cand.IsHighLatency ? "yes\n" : "no\n"); dbgs() << "Position of last high latency dependency: " << Cand.LastPosHighLatParentScheduled << '\n'; dbgs() << "VGPRUsageDiff: " << Cand .VGPRUsageDiff << '\n'; dbgs() << '\n';; } } while (false); | |||
1656 | ||||
1657 | Block = Cand.Block; | |||
1658 | ReadyBlocks.erase(Best); | |||
1659 | return Block; | |||
1660 | } | |||
1661 | ||||
1662 | // Tracking of currently alive registers to determine VGPR Usage. | |||
1663 | ||||
1664 | void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) { | |||
1665 | for (Register Reg : Regs) { | |||
1666 | // For now only track virtual registers. | |||
1667 | if (!Reg.isVirtual()) | |||
1668 | continue; | |||
1669 | // If not already in the live set, then add it. | |||
1670 | (void) LiveRegs.insert(Reg); | |||
1671 | } | |||
1672 | } | |||
1673 | ||||
1674 | void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, | |||
1675 | std::set<unsigned> &Regs) { | |||
1676 | for (unsigned Reg : Regs) { | |||
1677 | // For now only track virtual registers. | |||
1678 | std::set<unsigned>::iterator Pos = LiveRegs.find(Reg); | |||
1679 | assert (Pos != LiveRegs.end() && // Reg must be live.(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1681, __extension__ __PRETTY_FUNCTION__)) | |||
1680 | LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() &&(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1681, __extension__ __PRETTY_FUNCTION__)) | |||
1681 | LiveRegsConsumers[Reg] >= 1)(static_cast <bool> (Pos != LiveRegs.end() && LiveRegsConsumers .find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers [Reg] >= 1) ? void (0) : __assert_fail ("Pos != LiveRegs.end() && LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1681, __extension__ __PRETTY_FUNCTION__)); | |||
1682 | --LiveRegsConsumers[Reg]; | |||
1683 | if (LiveRegsConsumers[Reg] == 0) | |||
1684 | LiveRegs.erase(Pos); | |||
1685 | } | |||
1686 | } | |||
1687 | ||||
1688 | void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) { | |||
1689 | for (const auto &Block : Parent->getSuccs()) { | |||
1690 | if (--BlockNumPredsLeft[Block.first->getID()] == 0) | |||
1691 | ReadyBlocks.push_back(Block.first); | |||
1692 | ||||
1693 | if (Parent->isHighLatencyBlock() && | |||
1694 | Block.second == SIScheduleBlockLinkKind::Data) | |||
1695 | LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled; | |||
1696 | } | |||
1697 | } | |||
1698 | ||||
1699 | void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { | |||
1700 | decreaseLiveRegs(Block, Block->getInRegs()); | |||
1701 | addLiveRegs(Block->getOutRegs()); | |||
1702 | releaseBlockSuccs(Block); | |||
1703 | for (std::map<unsigned, unsigned>::iterator RegI = | |||
1704 | LiveOutRegsNumUsages[Block->getID()].begin(), | |||
1705 | E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) { | |||
1706 | std::pair<unsigned, unsigned> RegP = *RegI; | |||
1707 | // We produce this register, thus it must not be previously alive. | |||
1708 | assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||(static_cast <bool> (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1709, __extension__ __PRETTY_FUNCTION__)) | |||
1709 | LiveRegsConsumers[RegP.first] == 0)(static_cast <bool> (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0) ? void (0) : __assert_fail ("LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() || LiveRegsConsumers[RegP.first] == 0" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 1709, __extension__ __PRETTY_FUNCTION__)); | |||
1710 | LiveRegsConsumers[RegP.first] += RegP.second; | |||
1711 | } | |||
1712 | if (LastPosHighLatencyParentScheduled[Block->getID()] > | |||
1713 | (unsigned)LastPosWaitedHighLatency) | |||
1714 | LastPosWaitedHighLatency = | |||
1715 | LastPosHighLatencyParentScheduled[Block->getID()]; | |||
1716 | ++NumBlockScheduled; | |||
1717 | } | |||
1718 | ||||
1719 | std::vector<int> | |||
1720 | SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs, | |||
1721 | std::set<unsigned> &OutRegs) { | |||
1722 | std::vector<int> DiffSetPressure; | |||
1723 | DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); | |||
1724 | ||||
1725 | for (Register Reg : InRegs) { | |||
1726 | // For now only track virtual registers. | |||
1727 | if (!Reg.isVirtual()) | |||
1728 | continue; | |||
1729 | if (LiveRegsConsumers[Reg] > 1) | |||
1730 | continue; | |||
1731 | PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); | |||
1732 | for (; PSetI.isValid(); ++PSetI) { | |||
1733 | DiffSetPressure[*PSetI] -= PSetI.getWeight(); | |||
1734 | } | |||
1735 | } | |||
1736 | ||||
1737 | for (Register Reg : OutRegs) { | |||
1738 | // For now only track virtual registers. | |||
1739 | if (!Reg.isVirtual()) | |||
1740 | continue; | |||
1741 | PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg); | |||
1742 | for (; PSetI.isValid(); ++PSetI) { | |||
1743 | DiffSetPressure[*PSetI] += PSetI.getWeight(); | |||
1744 | } | |||
1745 | } | |||
1746 | ||||
1747 | return DiffSetPressure; | |||
1748 | } | |||
1749 | ||||
1750 | // SIScheduler // | |||
1751 | ||||
1752 | struct SIScheduleBlockResult | |||
1753 | SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant, | |||
1754 | SISchedulerBlockSchedulerVariant ScheduleVariant) { | |||
1755 | SIScheduleBlocks Blocks = BlockCreator.getBlocks(BlockVariant); | |||
1756 | SIScheduleBlockScheduler Scheduler(DAG, ScheduleVariant, Blocks); | |||
1757 | std::vector<SIScheduleBlock*> ScheduledBlocks; | |||
1758 | struct SIScheduleBlockResult Res; | |||
1759 | ||||
1760 | ScheduledBlocks = Scheduler.getBlocks(); | |||
1761 | ||||
1762 | for (unsigned b = 0; b < ScheduledBlocks.size(); ++b) { | |||
1763 | SIScheduleBlock *Block = ScheduledBlocks[b]; | |||
1764 | std::vector<SUnit*> SUs = Block->getScheduledUnits(); | |||
1765 | ||||
1766 | for (SUnit* SU : SUs) | |||
1767 | Res.SUs.push_back(SU->NodeNum); | |||
1768 | } | |||
1769 | ||||
1770 | Res.MaxSGPRUsage = Scheduler.getSGPRUsage(); | |||
1771 | Res.MaxVGPRUsage = Scheduler.getVGPRUsage(); | |||
1772 | return Res; | |||
1773 | } | |||
1774 | ||||
1775 | // SIScheduleDAGMI // | |||
1776 | ||||
1777 | SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) : | |||
1778 | ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) { | |||
1779 | SITII = static_cast<const SIInstrInfo*>(TII); | |||
1780 | SITRI = static_cast<const SIRegisterInfo*>(TRI); | |||
1781 | } | |||
1782 | ||||
1783 | SIScheduleDAGMI::~SIScheduleDAGMI() = default; | |||
1784 | ||||
1785 | // Code adapted from scheduleDAG.cpp | |||
1786 | // Does a topological sort over the SUs. | |||
1787 | // Both TopDown and BottomUp | |||
1788 | void SIScheduleDAGMI::topologicalSort() { | |||
1789 | Topo.InitDAGTopologicalSorting(); | |||
1790 | ||||
1791 | TopDownIndex2SU = std::vector<int>(Topo.begin(), Topo.end()); | |||
1792 | BottomUpIndex2SU = std::vector<int>(Topo.rbegin(), Topo.rend()); | |||
1793 | } | |||
1794 | ||||
1795 | // Move low latencies further from their user without | |||
1796 | // increasing SGPR usage (in general) | |||
1797 | // This is to be replaced by a better pass that would | |||
1798 | // take into account SGPR usage (based on VGPR Usage | |||
1799 | // and the corresponding wavefront count), that would | |||
1800 | // try to merge groups of loads if it make sense, etc | |||
1801 | void SIScheduleDAGMI::moveLowLatencies() { | |||
1802 | unsigned DAGSize = SUnits.size(); | |||
1803 | int LastLowLatencyUser = -1; | |||
1804 | int LastLowLatencyPos = -1; | |||
1805 | ||||
1806 | for (unsigned i = 0, e = ScheduledSUnits.size(); i != e; ++i) { | |||
1807 | SUnit *SU = &SUnits[ScheduledSUnits[i]]; | |||
1808 | bool IsLowLatencyUser = false; | |||
1809 | unsigned MinPos = 0; | |||
1810 | ||||
1811 | for (SDep& PredDep : SU->Preds) { | |||
1812 | SUnit *Pred = PredDep.getSUnit(); | |||
1813 | if (SITII->isLowLatencyInstruction(*Pred->getInstr())) { | |||
1814 | IsLowLatencyUser = true; | |||
1815 | } | |||
1816 | if (Pred->NodeNum >= DAGSize) | |||
1817 | continue; | |||
1818 | unsigned PredPos = ScheduledSUnitsInv[Pred->NodeNum]; | |||
1819 | if (PredPos >= MinPos) | |||
1820 | MinPos = PredPos + 1; | |||
1821 | } | |||
1822 | ||||
1823 | if (SITII->isLowLatencyInstruction(*SU->getInstr())) { | |||
1824 | unsigned BestPos = LastLowLatencyUser + 1; | |||
1825 | if ((int)BestPos <= LastLowLatencyPos) | |||
1826 | BestPos = LastLowLatencyPos + 1; | |||
1827 | if (BestPos < MinPos) | |||
1828 | BestPos = MinPos; | |||
1829 | if (BestPos < i) { | |||
1830 | for (unsigned u = i; u > BestPos; --u) { | |||
1831 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; | |||
1832 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; | |||
1833 | } | |||
1834 | ScheduledSUnits[BestPos] = SU->NodeNum; | |||
1835 | ScheduledSUnitsInv[SU->NodeNum] = BestPos; | |||
1836 | } | |||
1837 | LastLowLatencyPos = BestPos; | |||
1838 | if (IsLowLatencyUser) | |||
1839 | LastLowLatencyUser = BestPos; | |||
1840 | } else if (IsLowLatencyUser) { | |||
1841 | LastLowLatencyUser = i; | |||
1842 | // Moves COPY instructions on which depends | |||
1843 | // the low latency instructions too. | |||
1844 | } else if (SU->getInstr()->getOpcode() == AMDGPU::COPY) { | |||
1845 | bool CopyForLowLat = false; | |||
1846 | for (SDep& SuccDep : SU->Succs) { | |||
1847 | SUnit *Succ = SuccDep.getSUnit(); | |||
1848 | if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize) | |||
1849 | continue; | |||
1850 | if (SITII->isLowLatencyInstruction(*Succ->getInstr())) { | |||
1851 | CopyForLowLat = true; | |||
1852 | } | |||
1853 | } | |||
1854 | if (!CopyForLowLat) | |||
1855 | continue; | |||
1856 | if (MinPos < i) { | |||
1857 | for (unsigned u = i; u > MinPos; --u) { | |||
1858 | ++ScheduledSUnitsInv[ScheduledSUnits[u-1]]; | |||
1859 | ScheduledSUnits[u] = ScheduledSUnits[u-1]; | |||
1860 | } | |||
1861 | ScheduledSUnits[MinPos] = SU->NodeNum; | |||
1862 | ScheduledSUnitsInv[SU->NodeNum] = MinPos; | |||
1863 | } | |||
1864 | } | |||
1865 | } | |||
1866 | } | |||
1867 | ||||
1868 | void SIScheduleDAGMI::restoreSULinksLeft() { | |||
1869 | for (unsigned i = 0, e = SUnits.size(); i != e; ++i) { | |||
1870 | SUnits[i].isScheduled = false; | |||
1871 | SUnits[i].WeakPredsLeft = SUnitsLinksBackup[i].WeakPredsLeft; | |||
1872 | SUnits[i].NumPredsLeft = SUnitsLinksBackup[i].NumPredsLeft; | |||
1873 | SUnits[i].WeakSuccsLeft = SUnitsLinksBackup[i].WeakSuccsLeft; | |||
1874 | SUnits[i].NumSuccsLeft = SUnitsLinksBackup[i].NumSuccsLeft; | |||
1875 | } | |||
1876 | } | |||
1877 | ||||
1878 | // Return the Vgpr and Sgpr usage corresponding to some virtual registers. | |||
1879 | template<typename _Iterator> void | |||
1880 | SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End, | |||
1881 | unsigned &VgprUsage, unsigned &SgprUsage) { | |||
1882 | VgprUsage = 0; | |||
1883 | SgprUsage = 0; | |||
1884 | for (_Iterator RegI = First; RegI != End; ++RegI) { | |||
1885 | Register Reg = *RegI; | |||
1886 | // For now only track virtual registers | |||
1887 | if (!Reg.isVirtual()) | |||
1888 | continue; | |||
1889 | PSetIterator PSetI = MRI.getPressureSets(Reg); | |||
1890 | for (; PSetI.isValid(); ++PSetI) { | |||
1891 | if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32) | |||
1892 | VgprUsage += PSetI.getWeight(); | |||
1893 | else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32) | |||
1894 | SgprUsage += PSetI.getWeight(); | |||
1895 | } | |||
1896 | } | |||
1897 | } | |||
1898 | ||||
1899 | void SIScheduleDAGMI::schedule() | |||
1900 | { | |||
1901 | SmallVector<SUnit*, 8> TopRoots, BotRoots; | |||
1902 | SIScheduleBlockResult Best, Temp; | |||
1903 | LLVM_DEBUG(dbgs() << "Preparing Scheduling\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Preparing Scheduling\n" ; } } while (false); | |||
| ||||
1904 | ||||
1905 | buildDAGWithRegPressure(); | |||
1906 | LLVM_DEBUG(dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dump(); } } while (false); | |||
1907 | ||||
1908 | topologicalSort(); | |||
1909 | findRootsAndBiasEdges(TopRoots, BotRoots); | |||
1910 | // We reuse several ScheduleDAGMI and ScheduleDAGMILive | |||
1911 | // functions, but to make them happy we must initialize | |||
1912 | // the default Scheduler implementation (even if we do not | |||
1913 | // run it) | |||
1914 | SchedImpl->initialize(this); | |||
1915 | initQueues(TopRoots, BotRoots); | |||
1916 | ||||
1917 | // Fill some stats to help scheduling. | |||
1918 | ||||
1919 | SUnitsLinksBackup = SUnits; | |||
1920 | IsLowLatencySU.clear(); | |||
1921 | LowLatencyOffset.clear(); | |||
1922 | IsHighLatencySU.clear(); | |||
1923 | ||||
1924 | IsLowLatencySU.resize(SUnits.size(), 0); | |||
1925 | LowLatencyOffset.resize(SUnits.size(), 0); | |||
1926 | IsHighLatencySU.resize(SUnits.size(), 0); | |||
1927 | ||||
1928 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { | |||
1929 | SUnit *SU = &SUnits[i]; | |||
1930 | const MachineOperand *BaseLatOp; | |||
1931 | int64_t OffLatReg; | |||
1932 | if (SITII->isLowLatencyInstruction(*SU->getInstr())) { | |||
1933 | IsLowLatencySU[i] = 1; | |||
1934 | bool OffsetIsScalable; | |||
1935 | if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg, | |||
1936 | OffsetIsScalable, TRI)) | |||
1937 | LowLatencyOffset[i] = OffLatReg; | |||
1938 | } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode())) | |||
1939 | IsHighLatencySU[i] = 1; | |||
1940 | } | |||
1941 | ||||
1942 | SIScheduler Scheduler(this); | |||
1943 | Best = Scheduler.scheduleVariant(SISchedulerBlockCreatorVariant::LatenciesAlone, | |||
1944 | SISchedulerBlockSchedulerVariant::BlockLatencyRegUsage); | |||
1945 | ||||
1946 | // if VGPR usage is extremely high, try other good performing variants | |||
1947 | // which could lead to lower VGPR usage | |||
1948 | if (Best.MaxVGPRUsage > 180) { | |||
1949 | static const std::pair<SISchedulerBlockCreatorVariant, | |||
1950 | SISchedulerBlockSchedulerVariant> | |||
1951 | Variants[] = { | |||
1952 | { LatenciesAlone, BlockRegUsageLatency }, | |||
1953 | // { LatenciesAlone, BlockRegUsage }, | |||
1954 | { LatenciesGrouped, BlockLatencyRegUsage }, | |||
1955 | // { LatenciesGrouped, BlockRegUsageLatency }, | |||
1956 | // { LatenciesGrouped, BlockRegUsage }, | |||
1957 | { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, | |||
1958 | // { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, | |||
1959 | // { LatenciesAlonePlusConsecutive, BlockRegUsage } | |||
1960 | }; | |||
1961 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { | |||
1962 | Temp = Scheduler.scheduleVariant(v.first, v.second); | |||
1963 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) | |||
1964 | Best = Temp; | |||
1965 | } | |||
1966 | } | |||
1967 | // if VGPR usage is still extremely high, we may spill. Try other variants | |||
1968 | // which are less performing, but that could lead to lower VGPR usage. | |||
1969 | if (Best.MaxVGPRUsage > 200) { | |||
1970 | static const std::pair<SISchedulerBlockCreatorVariant, | |||
1971 | SISchedulerBlockSchedulerVariant> | |||
1972 | Variants[] = { | |||
1973 | // { LatenciesAlone, BlockRegUsageLatency }, | |||
1974 | { LatenciesAlone, BlockRegUsage }, | |||
1975 | // { LatenciesGrouped, BlockLatencyRegUsage }, | |||
1976 | { LatenciesGrouped, BlockRegUsageLatency }, | |||
1977 | { LatenciesGrouped, BlockRegUsage }, | |||
1978 | // { LatenciesAlonePlusConsecutive, BlockLatencyRegUsage }, | |||
1979 | { LatenciesAlonePlusConsecutive, BlockRegUsageLatency }, | |||
1980 | { LatenciesAlonePlusConsecutive, BlockRegUsage } | |||
1981 | }; | |||
1982 | for (std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant> v : Variants) { | |||
1983 | Temp = Scheduler.scheduleVariant(v.first, v.second); | |||
1984 | if (Temp.MaxVGPRUsage < Best.MaxVGPRUsage) | |||
1985 | Best = Temp; | |||
1986 | } | |||
1987 | } | |||
1988 | ||||
1989 | ScheduledSUnits = Best.SUs; | |||
1990 | ScheduledSUnitsInv.resize(SUnits.size()); | |||
1991 | ||||
1992 | for (unsigned i = 0, e = (unsigned)SUnits.size(); i != e; ++i) { | |||
1993 | ScheduledSUnitsInv[ScheduledSUnits[i]] = i; | |||
1994 | } | |||
1995 | ||||
1996 | moveLowLatencies(); | |||
1997 | ||||
1998 | // Tell the outside world about the result of the scheduling. | |||
1999 | ||||
2000 | assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker")(static_cast <bool> (TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker") ? void (0) : __assert_fail ("TopRPTracker.getPos() == RegionBegin && \"bad initial Top tracker\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 2000, __extension__ __PRETTY_FUNCTION__)); | |||
2001 | TopRPTracker.setPos(CurrentTop); | |||
2002 | ||||
2003 | for (std::vector<unsigned>::iterator I = ScheduledSUnits.begin(), | |||
2004 | E = ScheduledSUnits.end(); I != E; ++I) { | |||
2005 | SUnit *SU = &SUnits[*I]; | |||
2006 | ||||
2007 | scheduleMI(SU, true); | |||
2008 | ||||
2009 | LLVM_DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr(); } } while (false) | |||
2010 | << *SU->getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr(); } } while (false); | |||
2011 | } | |||
2012 | ||||
2013 | assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.")(static_cast <bool> (CurrentTop == CurrentBottom && "Nonempty unscheduled zone.") ? void (0) : __assert_fail ("CurrentTop == CurrentBottom && \"Nonempty unscheduled zone.\"" , "/build/llvm-toolchain-snapshot-14~++20210828111110+16086d47c0d0/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp" , 2013, __extension__ __PRETTY_FUNCTION__)); | |||
2014 | ||||
2015 | placeDebugValues(); | |||
2016 | ||||
2017 | LLVM_DEBUG({do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||
2018 | dbgs() << "*** Final schedule for "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||
2019 | << printMBBReference(*begin()->getParent()) << " ***\n";do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||
2020 | dumpSchedule();do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||
2021 | dbgs() << '\n';do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false) | |||
2022 | })do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("machine-scheduler")) { { dbgs() << "*** Final schedule for " << printMBBReference(*begin()->getParent()) << " ***\n"; dumpSchedule(); dbgs() << '\n'; }; } } while (false); | |||
2023 | } |