1//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass adds instructions to enable whole quad mode (strict or non-strict)
11/// for pixel shaders, and strict whole wavefront mode for all programs.
12///
13/// The "strict" prefix indicates that inactive lanes do not take part in
14/// control flow; specifically, an inactive lane enabled by a strict WQM/WWM will
15/// always be enabled irrespective of control flow decisions. Conversely, in
16/// non-strict WQM inactive lanes may take part in control flow decisions.
17///
18/// Whole quad mode is required for derivative computations, but it interferes
19/// with shader side effects (stores and atomics). This pass ensures that WQM is
20/// enabled when necessary, but disabled around stores and atomics.
21///
22/// When necessary, this pass creates a function prolog
23///
24/// S_MOV_B64 LiveMask, EXEC
25/// S_WQM_B64 EXEC, EXEC
26///
27/// to enter WQM at the top of the function and surrounds blocks of Exact
28/// instructions by
29///
30/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31/// ...
32/// S_MOV_B64 EXEC, Tmp
33///
34/// We also compute when a sequence of instructions requires strict whole
35/// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36///
37/// S_OR_SAVEEXEC_B64 Tmp, -1
38/// ...
39/// S_MOV_B64 EXEC, Tmp
40///
41/// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42/// we use a similar save and restore mechanism and force whole quad mode for
43/// those instructions:
44///
45/// S_MOV_B64 Tmp, EXEC
46/// S_WQM_B64 EXEC, EXEC
47/// ...
48/// S_MOV_B64 EXEC, Tmp
49///
50/// In order to avoid excessive switching during sequences of Exact
51/// instructions, the pass first analyzes which instructions must be run in WQM
52/// (aka which instructions produce values that lead to derivative
53/// computations).
54///
55/// Basic blocks are always exited in WQM as long as some successor needs WQM.
56///
57/// There is room for improvement given better control flow analysis:
58///
59/// (1) at the top level (outside of control flow statements, and as long as
60/// kill hasn't been used), one SGPR can be saved by recovering WQM from
61/// the LiveMask (this is implemented for the entry block).
62///
63/// (2) when entire regions (e.g. if-else blocks or entire loops) only
64/// consist of exact and don't-care instructions, the switch only has to
65/// be done at the entry and exit points rather than potentially in each
66/// block of the region.
67///
68//===----------------------------------------------------------------------===//
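// Illustrative sketch (not part of this pass): what the S_WQM instructions in
// the examples above compute on an EXEC mask. Whole quad mode enables every
// lane of any 2x2 quad that has at least one active lane. The helper name is
// made up for this example and uses only built-in types.
inline unsigned long long sketchWholeQuadMask(unsigned long long Exec) {
  unsigned long long WQM = 0;
  for (unsigned Quad = 0; Quad < 64; Quad += 4) {
    if ((Exec >> Quad) & 0xFull) // any lane of this quad active?
      WQM |= 0xFull << Quad;     // then enable the whole quad
  }
  return WQM;
}
// For example, sketchWholeQuadMask(0x0000000000000001) == 0xF.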
69
70#include "AMDGPU.h"
71#include "GCNSubtarget.h"
72#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73#include "llvm/ADT/MapVector.h"
74#include "llvm/ADT/PostOrderIterator.h"
75#include "llvm/CodeGen/LiveIntervals.h"
76#include "llvm/CodeGen/MachineBasicBlock.h"
77#include "llvm/CodeGen/MachineDominators.h"
78#include "llvm/CodeGen/MachineFunctionPass.h"
79#include "llvm/CodeGen/MachineInstr.h"
80#include "llvm/CodeGen/MachinePostDominators.h"
81#include "llvm/IR/CallingConv.h"
82#include "llvm/IR/Function.h"
83#include "llvm/Support/raw_ostream.h"
84
85using namespace llvm;
86
87#define DEBUG_TYPE "si-wqm"
88
89namespace {
90
91enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
97};
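// Illustrative note (not part of the original file): the states form a small
// bitmask, so sets of allowed states can be combined and tested with bit
// operations. StateStrict groups both strict flavours, which is why checks
// such as (Needs & StateStrict) cover StrictWWM and StrictWQM at once.
static_assert((StateStrictWWM | StateStrictWQM) == StateStrict,
              "StateStrict is the union of both strict modes");
static_assert((StateWQM & StateStrict) == 0,
              "non-strict WQM is distinct from the strict modes");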
98
99struct PrintState {
100public:
101 int State;
102
103 explicit PrintState(int State) : State(State) {}
104};
105
106#ifndef NDEBUG
107static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109 static const std::pair<char, const char *> Mapping[] = {
110 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 char State = PS.State;
113 for (auto M : Mapping) {
114 if (State & M.first) {
115 OS << M.second;
116 State &= ~M.first;
117
118 if (State)
119 OS << '|';
120 }
121 }
122 assert(State == 0);
123 return OS;
124}
125#endif
126
127struct InstrInfo {
128 char Needs = 0;
129 char Disabled = 0;
130 char OutNeeds = 0;
131};
132
133struct BlockInfo {
134 char Needs = 0;
135 char InNeeds = 0;
136 char OutNeeds = 0;
137 char InitialState = 0;
138 bool NeedsLowering = false;
139};
140
141struct WorkItem {
142 MachineBasicBlock *MBB = nullptr;
143 MachineInstr *MI = nullptr;
144
145 WorkItem() = default;
146 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147 WorkItem(MachineInstr *MI) : MI(MI) {}
148};
149
150class SIWholeQuadMode : public MachineFunctionPass {
151private:
152 const SIInstrInfo *TII;
153 const SIRegisterInfo *TRI;
154 const GCNSubtarget *ST;
155 MachineRegisterInfo *MRI;
156 LiveIntervals *LIS;
157 MachineDominatorTree *MDT;
158 MachinePostDominatorTree *PDT;
159
160 unsigned AndOpc;
161 unsigned AndTermOpc;
162 unsigned AndN2Opc;
163 unsigned XorOpc;
164 unsigned AndSaveExecOpc;
165 unsigned AndSaveExecTermOpc;
166 unsigned WQMOpc;
167 Register Exec;
168 Register LiveMaskReg;
169
170 DenseMap<const MachineInstr *, InstrInfo> Instructions;
171 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172
173 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174 DenseMap<const MachineInstr *, char> StateTransition;
175
176 SmallVector<MachineInstr *, 2> LiveMaskQueries;
177 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179 SmallVector<MachineInstr *, 4> KillInstrs;
180
181 void printInfo();
182
183 void markInstruction(MachineInstr &MI, char Flag,
184 std::vector<WorkItem> &Worklist);
185 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
186 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
187 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
188 std::vector<WorkItem> &Worklist);
189 void markInstructionUses(const MachineInstr &MI, char Flag,
190 std::vector<WorkItem> &Worklist);
191 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
192 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
193 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
194 char analyzeFunction(MachineFunction &MF);
195
196 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
197 MachineBasicBlock::iterator Before);
198 MachineBasicBlock::iterator
199 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
200 MachineBasicBlock::iterator Last, bool PreferLast,
201 bool SaveSCC);
202 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
203 Register SaveWQM);
204 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
205 Register SavedWQM);
206 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
207 Register SaveOrig, char StrictStateNeeded);
208 void fromStrictMode(MachineBasicBlock &MBB,
209 MachineBasicBlock::iterator Before, Register SavedOrig,
210 char NonStrictState, char CurrentStrictState);
211
212 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
213
214 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
215 bool IsWQM);
216 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
217 void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
218 MachineInstr *Exit);
219
220 void lowerBlock(MachineBasicBlock &MBB);
221 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
222
223 void lowerLiveMaskQueries();
224 void lowerCopyInstrs();
225 void lowerKillInstrs(bool IsWQM);
226
227public:
228 static char ID;
229
230 SIWholeQuadMode() :
231 MachineFunctionPass(ID) { }
232
233 bool runOnMachineFunction(MachineFunction &MF) override;
234
235 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236
237 void getAnalysisUsage(AnalysisUsage &AU) const override {
238 AU.addRequired<LiveIntervals>();
239 AU.addPreserved<SlotIndexes>();
240 AU.addPreserved<LiveIntervals>();
241 AU.addRequired<MachineDominatorTree>();
242 AU.addPreserved<MachineDominatorTree>();
243 AU.addRequired<MachinePostDominatorTree>();
244 AU.addPreserved<MachinePostDominatorTree>();
245 MachineFunctionPass::getAnalysisUsage(AU);
246 }
247
248 MachineFunctionProperties getClearedProperties() const override {
249 return MachineFunctionProperties().set(
250 MachineFunctionProperties::Property::IsSSA);
251 }
252};
253
254} // end anonymous namespace
255
256char SIWholeQuadMode::ID = 0;
257
258INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
259 false)
260INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
261INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
262INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
263INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
264 false)
265
266char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
267
268FunctionPass *llvm::createSIWholeQuadModePass() {
269 return new SIWholeQuadMode;
270}
271
272#ifndef NDEBUG
273LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
274 for (const auto &BII : Blocks) {
275 dbgs() << "\n"
276 << printMBBReference(*BII.first) << ":\n"
277 << " InNeeds = " << PrintState(BII.second.InNeeds)
278 << ", Needs = " << PrintState(BII.second.Needs)
279 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
280
281 for (const MachineInstr &MI : *BII.first) {
282 auto III = Instructions.find(&MI);
283 if (III == Instructions.end())
284 continue;
285
286 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
287 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
288 }
289 }
290}
291#endif
292
293void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
294 std::vector<WorkItem> &Worklist) {
295 InstrInfo &II = Instructions[&MI];
296
297 assert(!(Flag & StateExact) && Flag != 0);
298
299 // Remove any disabled states from the flag. The user that required it gets
300 // an undefined value in the helper lanes. For example, this can happen if
301 // the result of an atomic is used by an instruction that requires WQM, where
302 // ignoring the request for WQM is correct as per the relevant specs.
303 Flag &= ~II.Disabled;
304
305 // Ignore if the flag is already encompassed by the existing needs, or we
306 // just disabled everything.
307 if ((II.Needs & Flag) == Flag)
308 return;
309
310 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
311 II.Needs |= Flag;
312 Worklist.push_back(&MI);
313}
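// Illustrative sketch (not part of this pass): why the worklist driven by
// markInstruction terminates. Needs only ever grows, and an instruction is
// re-queued only when a new state bit is actually added, so each instruction
// can be pushed at most once per bit. The helper below is a made-up reduction
// of that monotone update.
inline bool sketchMarkNeeds(char &Needs, char Disabled, char Flag) {
  Flag &= ~Disabled;          // drop states this instruction cannot run in
  if ((Needs & Flag) == Flag)
    return false;             // nothing new: caller does not re-queue
  Needs |= Flag;              // monotone growth bounds the total work
  return true;                // caller would push the instruction again
}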
314
315/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
316void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
317 Register Reg, unsigned SubReg, char Flag,
318 std::vector<WorkItem> &Worklist) {
319 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
320
321 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
322 const VNInfo *Value = UseLRQ.valueIn();
323 if (!Value)
324 return;
325
326 // Note: this code assumes that lane masks on AMDGPU completely
327 // cover registers.
328 const LaneBitmask UseLanes =
329 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
330 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
331 : LaneBitmask::getNone());
332
333 // Perform a depth-first iteration of the LiveRange graph marking defs.
334 // Stop processing of a given branch when all use lanes have been defined.
335 // The first definition stops processing for a physical register.
336 struct PhiEntry {
337 const VNInfo *Phi;
338 unsigned PredIdx;
339 LaneBitmask DefinedLanes;
340
341 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
342 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
343 };
344 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
345 SmallVector<PhiEntry, 2> PhiStack;
346 SmallSet<VisitKey, 4> Visited;
347 LaneBitmask DefinedLanes;
348 unsigned NextPredIdx = 0; // Only used for processing phi nodes
349 do {
350 const VNInfo *NextValue = nullptr;
351 const VisitKey Key(Value, DefinedLanes);
352
353 if (Visited.insert(Key).second) {
354 // On first visit to a phi then start processing first predecessor
355 NextPredIdx = 0;
356 }
357
358 if (Value->isPHIDef()) {
359 // Each predecessor node in the phi must be processed as a subgraph
360 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
361 assert(MBB && "Phi-def has no defining MBB");
362
363 // Find next predecessor to process
364 unsigned Idx = NextPredIdx;
365 auto PI = MBB->pred_begin() + Idx;
366 auto PE = MBB->pred_end();
367 for (; PI != PE && !NextValue; ++PI, ++Idx) {
368 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
369 if (!Visited.count(VisitKey(VN, DefinedLanes)))
370 NextValue = VN;
371 }
372 }
373
374 // If there are more predecessors to process, add phi to stack
375 if (PI != PE)
376 PhiStack.emplace_back(Value, Idx, DefinedLanes);
377 } else {
378 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
379 assert(MI && "Def has no defining instruction");
380
381 if (Reg.isVirtual()) {
382 // Iterate over all operands to find relevant definitions
383 bool HasDef = false;
384 for (const MachineOperand &Op : MI->all_defs()) {
385 if (Op.getReg() != Reg)
386 continue;
387
388 // Compute lanes defined and overlap with use
389 LaneBitmask OpLanes =
390 Op.isUndef() ? LaneBitmask::getAll()
391 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
392 LaneBitmask Overlap = (UseLanes & OpLanes);
393
394 // Record if this instruction defined any lanes of the use
395 HasDef |= Overlap.any();
396
397 // Mark any lanes defined
398 DefinedLanes |= OpLanes;
399 }
400
401 // Check if all lanes of use have been defined
402 if ((DefinedLanes & UseLanes) != UseLanes) {
403 // Definition not complete; need to process input value
404 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
405 if (const VNInfo *VN = LRQ.valueIn()) {
406 if (!Visited.count(VisitKey(VN, DefinedLanes)))
407 NextValue = VN;
408 }
409 }
410
411 // Only mark the instruction if it defines some part of the use
412 if (HasDef)
413 markInstruction(*MI, Flag, Worklist);
414 } else {
415 // For physical registers simply mark the defining instruction
416 markInstruction(*MI, Flag, Worklist);
417 }
418 }
419
420 if (!NextValue && !PhiStack.empty()) {
421 // Reach end of chain; revert to processing last phi
422 PhiEntry &Entry = PhiStack.back();
423 NextValue = Entry.Phi;
424 NextPredIdx = Entry.PredIdx;
425 DefinedLanes = Entry.DefinedLanes;
426 PhiStack.pop_back();
427 }
428
429 Value = NextValue;
430 } while (Value);
431}
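// Illustrative sketch (not part of this pass): the shape of the walk above,
// reduced to plain data. Each definition covers some lanes of the use; a
// branch of the search stops once every lane required by the use has been
// covered, and in the real code a (value, covered-lanes) visited key plus an
// explicit phi stack keep the traversal from looping. The tiny def chain and
// names below are made up for the example.
struct SketchDef {
  unsigned Lanes; // lanes this def writes
  int Prev;       // earlier def feeding the remaining lanes, -1 if none
};
inline unsigned sketchCollectDefs(const SketchDef *Defs, int Start,
                                  unsigned UseLanes) {
  unsigned Covered = 0;
  for (int I = Start; I != -1 && (Covered & UseLanes) != UseLanes;
       I = Defs[I].Prev)
    Covered |= Defs[I].Lanes; // the real code calls markInstruction here
  return Covered & UseLanes;
}
// With Defs = {{0b01, 1}, {0b10, -1}} and UseLanes = 0b11, starting at def 0
// visits both defs and returns 0b11: all lanes of the use are covered.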
432
433void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434 const MachineOperand &Op, char Flag,
435 std::vector<WorkItem> &Worklist) {
436 assert(Op.isReg());
437 Register Reg = Op.getReg();
438
439 // Ignore some hardware registers
440 switch (Reg) {
441 case AMDGPU::EXEC:
442 case AMDGPU::EXEC_LO:
443 return;
444 default:
445 break;
446 }
447
448 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449 << " for " << MI);
450 if (Reg.isVirtual()) {
451 LiveRange &LR = LIS->getInterval(Reg);
452 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453 } else {
454 // Handle physical registers that we need to track; this is mostly relevant
455 // for VCC, which can appear as the (implicit) input of a uniform branch,
456 // e.g. when a loop counter is stored in a VGPR.
457 for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
458 LiveRange &LR = LIS->getRegUnit(Unit);
459 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
460 if (!Value)
461 continue;
462
463 markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
464 }
465 }
466}
467
468/// Mark all instructions defining the uses in \p MI with \p Flag.
469void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
470 std::vector<WorkItem> &Worklist) {
471 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
472 << MI);
473
474 for (const MachineOperand &Use : MI.all_uses())
475 markOperand(MI, Use, Flag, Worklist);
476}
477
478// Scan instructions to determine which ones require an Exact execmask and
479// which ones seed WQM requirements.
480char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
481 std::vector<WorkItem> &Worklist) {
482 char GlobalFlags = 0;
483 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
484 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
485 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
486 bool HasImplicitDerivatives =
487 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
488
489 // We need to visit the basic blocks in reverse post-order so that we visit
490 // defs before uses, in particular so that we don't accidentally mark an
491 // instruction as needing e.g. WQM before visiting it and realizing it needs
492 // WQM disabled.
493 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
494 for (MachineBasicBlock *MBB : RPOT) {
495 BlockInfo &BBI = Blocks[MBB];
496
497 for (MachineInstr &MI : *MBB) {
498 InstrInfo &III = Instructions[&MI];
499 unsigned Opcode = MI.getOpcode();
500 char Flags = 0;
501
502 if (TII->isWQM(Opcode)) {
503 // If LOD is not supported WQM is not needed.
504 if (!ST->hasExtendedImageInsts())
505 continue;
506 // Only generate implicit WQM if implicit derivatives are required.
507 // This avoids inserting unintended WQM if a shader type without
508 // implicit derivatives uses an image sampling instruction.
509 if (!HasImplicitDerivatives)
510 continue;
511 // Sampling instructions don't need to produce results for all pixels
512 // in a quad, they just require all inputs of a quad to have been
513 // computed for derivatives.
514 markInstructionUses(MI, StateWQM, Worklist);
515 GlobalFlags |= StateWQM;
516 continue;
517 } else if (Opcode == AMDGPU::WQM) {
518 // The WQM intrinsic requires its output to have all the helper lanes
519 // correct, so we need it to be in WQM.
520 Flags = StateWQM;
521 LowerToCopyInstrs.push_back(&MI);
522 } else if (Opcode == AMDGPU::SOFT_WQM) {
523 LowerToCopyInstrs.push_back(&MI);
524 SoftWQMInstrs.push_back(&MI);
525 continue;
526 } else if (Opcode == AMDGPU::STRICT_WWM) {
527 // The STRICT_WWM intrinsic doesn't make the same guarantee; additionally,
528 // it needs to be executed in WQM or Exact so that its copy doesn't
529 // clobber inactive lanes.
530 markInstructionUses(MI, StateStrictWWM, Worklist);
531 GlobalFlags |= StateStrictWWM;
532 LowerToMovInstrs.push_back(&MI);
533 continue;
534 } else if (Opcode == AMDGPU::STRICT_WQM ||
535 TII->isDualSourceBlendEXP(MI)) {
536 // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
537 // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
538 // quads that have at least one active thread.
539 markInstructionUses(MI, StateStrictWQM, Worklist);
540 GlobalFlags |= StateStrictWQM;
541
542 if (Opcode == AMDGPU::STRICT_WQM) {
543 LowerToMovInstrs.push_back(&MI);
544 } else {
545 // Dual source blend export acts as implicit strict-wqm, its sources
546 // need to be shuffled in strict wqm, but the export itself needs to
547 // run in exact mode.
548 BBI.Needs |= StateExact;
549 if (!(BBI.InNeeds & StateExact)) {
550 BBI.InNeeds |= StateExact;
551 Worklist.push_back(MBB);
552 }
553 GlobalFlags |= StateExact;
554 III.Disabled = StateWQM | StateStrict;
555 }
556 continue;
557 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
558 Opcode == AMDGPU::LDS_DIRECT_LOAD) {
559 // Mark these STRICT_WQM, but only for the instruction, not its operands.
560 // This avoids unnecessarily marking M0 as requiring WQM.
561 InstrInfo &II = Instructions[&MI];
562 II.Needs |= StateStrictWQM;
563 GlobalFlags |= StateStrictWQM;
564 continue;
565 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
566 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
567 III.Disabled = StateStrict;
568 MachineOperand &Inactive = MI.getOperand(2);
569 if (Inactive.isReg()) {
570 if (Inactive.isUndef()) {
571 LowerToCopyInstrs.push_back(&MI);
572 } else {
573 markOperand(MI, Inactive, StateStrictWWM, Worklist);
574 }
575 }
576 SetInactiveInstrs.push_back(&MI);
577 continue;
578 } else if (TII->isDisableWQM(MI)) {
579 BBI.Needs |= StateExact;
580 if (!(BBI.InNeeds & StateExact)) {
581 BBI.InNeeds |= StateExact;
582 Worklist.push_back(MBB);
583 }
584 GlobalFlags |= StateExact;
585 III.Disabled = StateWQM | StateStrict;
586 continue;
587 } else {
588 if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
589 LiveMaskQueries.push_back(&MI);
590 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
591 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
592 Opcode == AMDGPU::SI_DEMOTE_I1) {
593 KillInstrs.push_back(&MI);
594 BBI.NeedsLowering = true;
595 } else if (WQMOutputs) {
596 // The function is in machine SSA form, which means that physical
597 // VGPRs correspond to shader inputs and outputs. Inputs are
598 // only used, outputs are only defined.
599 // FIXME: is this still valid?
600 for (const MachineOperand &MO : MI.defs()) {
601 if (!MO.isReg())
602 continue;
603
604 Register Reg = MO.getReg();
605
606 if (!Reg.isVirtual() &&
607 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
608 Flags = StateWQM;
609 break;
610 }
611 }
612 }
613
614 if (!Flags)
615 continue;
616 }
617
618 markInstruction(MI, Flags, Worklist);
619 GlobalFlags |= Flags;
620 }
621 }
622
623 // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
624 // ever used anywhere in the function. This implements the corresponding
625 // semantics of @llvm.amdgcn.set.inactive.
626 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
627 if (GlobalFlags & StateWQM) {
628 for (MachineInstr *MI : SetInactiveInstrs)
629 markInstruction(*MI, StateWQM, Worklist);
630 for (MachineInstr *MI : SoftWQMInstrs)
631 markInstruction(*MI, StateWQM, Worklist);
632 }
633
634 return GlobalFlags;
635}
636
637void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
638 std::vector<WorkItem>& Worklist) {
639 MachineBasicBlock *MBB = MI.getParent();
640 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
641 BlockInfo &BI = Blocks[MBB];
642
643 // Control flow-type instructions and stores to temporary memory that are
644 // followed by WQM computations must themselves be in WQM.
645 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
646 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
647 Instructions[&MI].Needs = StateWQM;
648 II.Needs = StateWQM;
649 }
650
651 // Propagate to block level
652 if (II.Needs & StateWQM) {
653 BI.Needs |= StateWQM;
654 if (!(BI.InNeeds & StateWQM)) {
655 BI.InNeeds |= StateWQM;
656 Worklist.push_back(MBB);
657 }
658 }
659
660 // Propagate backwards within block
661 if (MachineInstr *PrevMI = MI.getPrevNode()) {
662 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
663 if (!PrevMI->isPHI()) {
664 InstrInfo &PrevII = Instructions[PrevMI];
665 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
666 PrevII.OutNeeds |= InNeeds;
667 Worklist.push_back(PrevMI);
668 }
669 }
670 }
671
672 // Propagate WQM flag to instruction inputs
673 assert(!(II.Needs & StateExact));
674
675 if (II.Needs != 0)
676 markInstructionUses(MI, II.Needs, Worklist);
677
678 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
679 // not require any WQM transitions.
680 if (II.Needs & StateStrictWWM)
681 BI.Needs |= StateStrictWWM;
682 if (II.Needs & StateStrictWQM)
683 BI.Needs |= StateStrictWQM;
684}
685
686void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
687 std::vector<WorkItem>& Worklist) {
688 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
689
690 // Propagate through instructions
691 if (!MBB.empty()) {
692 MachineInstr *LastMI = &*MBB.rbegin();
693 InstrInfo &LastII = Instructions[LastMI];
694 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
695 LastII.OutNeeds |= BI.OutNeeds;
696 Worklist.push_back(LastMI);
697 }
698 }
699
700 // Predecessor blocks must provide for our WQM/Exact needs.
701 for (MachineBasicBlock *Pred : MBB.predecessors()) {
702 BlockInfo &PredBI = Blocks[Pred];
703 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
704 continue;
705
706 PredBI.OutNeeds |= BI.InNeeds;
707 PredBI.InNeeds |= BI.InNeeds;
708 Worklist.push_back(Pred);
709 }
710
711 // All successors must be prepared to accept the same set of WQM/Exact data.
712 for (MachineBasicBlock *Succ : MBB.successors()) {
713 BlockInfo &SuccBI = Blocks[Succ];
714 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
715 continue;
716
717 SuccBI.InNeeds |= BI.OutNeeds;
718 Worklist.push_back(Succ);
719 }
720}
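// Illustrative sketch (not part of this pass): the block-level propagation is
// a worklist fixed point. A block's InNeeds flows into every predecessor's
// OutNeeds/InNeeds until nothing changes, which is what propagateBlock does
// one block at a time. The straight-line three-block "CFG" and names below
// are made up for the example.
inline unsigned sketchPropagateInNeeds(unsigned InNeeds[3], int SeedBlock,
                                       unsigned SeedFlags) {
  InNeeds[SeedBlock] |= SeedFlags;
  unsigned NumChanged = 0;
  // Block I's only predecessor is block I-1; push the requirement backwards
  // until it has reached every block that can flow into the seeded block.
  for (int I = SeedBlock; I > 0; --I) {
    if ((InNeeds[I - 1] | InNeeds[I]) != InNeeds[I - 1]) {
      InNeeds[I - 1] |= InNeeds[I];
      ++NumChanged;
    }
  }
  return NumChanged;
}
// Seeding StateWQM in block 2 of {0, 0, 0} updates blocks 1 and 0 (returns 2);
// a second run returns 0 because the fixed point has been reached.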
721
722char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
723 std::vector<WorkItem> Worklist;
724 char GlobalFlags = scanInstructions(MF, Worklist);
725
726 while (!Worklist.empty()) {
727 WorkItem WI = Worklist.back();
728 Worklist.pop_back();
729
730 if (WI.MI)
731 propagateInstruction(*WI.MI, Worklist);
732 else
733 propagateBlock(*WI.MBB, Worklist);
734 }
735
736 return GlobalFlags;
737}
738
739MachineBasicBlock::iterator
740SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
741 MachineBasicBlock::iterator Before) {
742 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
743
744 MachineInstr *Save =
745 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
746 .addReg(AMDGPU::SCC);
747 MachineInstr *Restore =
748 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
749 .addReg(SaveReg);
750
751 LIS->InsertMachineInstrInMaps(*Save);
752 LIS->InsertMachineInstrInMaps(*Restore);
753 LIS->createAndComputeVirtRegInterval(SaveReg);
754
755 return Restore;
756}
757
758MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
759 MachineInstr *TermMI) {
760 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
761 << *TermMI << "\n");
762
763 MachineBasicBlock *SplitBB =
764 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
765
766 // Convert last instruction in block to a terminator.
767 // Note: this only covers the expected patterns
768 unsigned NewOpcode = 0;
769 switch (TermMI->getOpcode()) {
770 case AMDGPU::S_AND_B32:
771 NewOpcode = AMDGPU::S_AND_B32_term;
772 break;
773 case AMDGPU::S_AND_B64:
774 NewOpcode = AMDGPU::S_AND_B64_term;
775 break;
776 case AMDGPU::S_MOV_B32:
777 NewOpcode = AMDGPU::S_MOV_B32_term;
778 break;
779 case AMDGPU::S_MOV_B64:
780 NewOpcode = AMDGPU::S_MOV_B64_term;
781 break;
782 default:
783 break;
784 }
785 if (NewOpcode)
786 TermMI->setDesc(TII->get(NewOpcode));
787
788 if (SplitBB != BB) {
789 // Update dominator trees
790 using DomTreeT = DomTreeBase<MachineBasicBlock>;
791 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
792 for (MachineBasicBlock *Succ : SplitBB->successors()) {
793 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
794 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
795 }
796 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
797 if (MDT)
798 MDT->getBase().applyUpdates(DTUpdates);
799 if (PDT)
800 PDT->getBase().applyUpdates(DTUpdates);
801
802 // Link blocks
803 MachineInstr *MI =
804 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
805 .addMBB(SplitBB);
806 LIS->InsertMachineInstrInMaps(*MI);
807 }
808
809 return SplitBB;
810}
811
812MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
813 MachineInstr &MI) {
814 const DebugLoc &DL = MI.getDebugLoc();
815 unsigned Opcode = 0;
816
817 assert(MI.getOperand(0).isReg());
818
819 // Comparison is for live lanes; however here we compute the inverse
820 // (killed lanes). This is because VCMP will always generate 0 bits
821 // for inactive lanes so a mask of live lanes would not be correct
822 // inside control flow.
823 // Invert the comparison by swapping the operands and adjusting
824 // the comparison codes.
825
826 switch (MI.getOperand(2).getImm()) {
827 case ISD::SETUEQ:
828 Opcode = AMDGPU::V_CMP_LG_F32_e64;
829 break;
830 case ISD::SETUGT:
831 Opcode = AMDGPU::V_CMP_GE_F32_e64;
832 break;
833 case ISD::SETUGE:
834 Opcode = AMDGPU::V_CMP_GT_F32_e64;
835 break;
836 case ISD::SETULT:
837 Opcode = AMDGPU::V_CMP_LE_F32_e64;
838 break;
839 case ISD::SETULE:
840 Opcode = AMDGPU::V_CMP_LT_F32_e64;
841 break;
842 case ISD::SETUNE:
843 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
844 break;
845 case ISD::SETO:
846 Opcode = AMDGPU::V_CMP_O_F32_e64;
847 break;
848 case ISD::SETUO:
849 Opcode = AMDGPU::V_CMP_U_F32_e64;
850 break;
851 case ISD::SETOEQ:
852 case ISD::SETEQ:
853 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
854 break;
855 case ISD::SETOGT:
856 case ISD::SETGT:
857 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
858 break;
859 case ISD::SETOGE:
860 case ISD::SETGE:
861 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
862 break;
863 case ISD::SETOLT:
864 case ISD::SETLT:
865 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
866 break;
867 case ISD::SETOLE:
868 case ISD::SETLE:
869 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
870 break;
871 case ISD::SETONE:
872 case ISD::SETNE:
873 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
874 break;
875 default:
876 llvm_unreachable("invalid ISD:SET cond code");
877 }
878
879 // Pick opcode based on comparison type.
880 MachineInstr *VcmpMI;
881 const MachineOperand &Op0 = MI.getOperand(0);
882 const MachineOperand &Op1 = MI.getOperand(1);
883
884 // VCC represents lanes killed.
885 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
886
887 if (TRI->isVGPR(*MRI, Op0.getReg())) {
888 Opcode = AMDGPU::getVOPe32(Opcode);
889 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
890 } else {
891 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
892 .addReg(VCC, RegState::Define)
893 .addImm(0) // src0 modifiers
894 .add(Op1)
895 .addImm(0) // src1 modifiers
896 .add(Op0)
897 .addImm(0); // omod
898 }
899
900 MachineInstr *MaskUpdateMI =
901 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
902 .addReg(LiveMaskReg)
903 .addReg(VCC);
904
905 // State of SCC represents whether any lanes are live in the mask;
906 // if SCC is 0 then no lanes will be alive anymore.
907 MachineInstr *EarlyTermMI =
908 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
909
910 MachineInstr *ExecMaskMI =
911 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
912
913 assert(MBB.succ_size() == 1);
914 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
915 .addMBB(*MBB.succ_begin());
916
917 // Update live intervals
918 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
919 MBB.remove(&MI);
920
921 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
922 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
923 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
924 LIS->InsertMachineInstrInMaps(*NewTerm);
925
926 return NewTerm;
927}
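// Illustrative sketch (not part of this pass): why the cond-code table above
// swaps operands. The kill condition describes the lanes that stay live, but
// the pass wants a VCMP that produces the *killed* lanes, and inactive lanes
// must read as 0 ("not killed"). For example live = (Op0 < Op1, unordered
// true) inverts to killed = (Op1 <= Op0, ordered only), i.e. V_CMP_LE with
// the operands swapped. A plain C++ check of that identity (made-up helper):
inline bool sketchKilledForSETULT(float A, float B) {
  bool Live = !(A == A) || !(B == B) || A < B; // SETULT: unordered or less
  bool Killed = B <= A;                        // swapped LE, ordered only
  return Killed == !Live; // holds for all inputs, including NaNs
}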
928
929MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
930 MachineInstr &MI, bool IsWQM) {
931 const DebugLoc &DL = MI.getDebugLoc();
932 MachineInstr *MaskUpdateMI = nullptr;
933
934 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
935 const MachineOperand &Op = MI.getOperand(0);
936 int64_t KillVal = MI.getOperand(1).getImm();
937 MachineInstr *ComputeKilledMaskMI = nullptr;
938 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
939 Register TmpReg;
940
941 // Is this a static or dynamic kill?
942 if (Op.isImm()) {
943 if (Op.getImm() == KillVal) {
944 // Static: all active lanes are killed
945 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
946 .addReg(LiveMaskReg)
947 .addReg(Exec);
948 } else {
949 // Static: kill does nothing
950 MachineInstr *NewTerm = nullptr;
951 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
952 LIS->RemoveMachineInstrFromMaps(MI);
953 } else {
954 assert(MBB.succ_size() == 1);
955 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
956 .addMBB(*MBB.succ_begin());
957 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
958 }
959 MBB.remove(&MI);
960 return NewTerm;
961 }
962 } else {
963 if (!KillVal) {
964 // Op represents live lanes after kill,
965 // so exec mask needs to be factored in.
966 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
967 ComputeKilledMaskMI =
968 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
969 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
970 .addReg(LiveMaskReg)
971 .addReg(TmpReg);
972 } else {
973 // Op represents lanes to kill
974 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
975 .addReg(LiveMaskReg)
976 .add(Op);
977 }
978 }
979
980 // State of SCC represents whether any lanes are live in the mask;
981 // if SCC is 0 then no lanes will be alive anymore.
982 MachineInstr *EarlyTermMI =
983 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
984
985 // If we got this far, some lanes are still live;
986 // update EXEC to deactivate lanes as appropriate.
987 MachineInstr *NewTerm;
988 MachineInstr *WQMMaskMI = nullptr;
989 Register LiveMaskWQM;
990 if (IsDemote) {
991 // Demote - deactivate quads with only helper lanes
992 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
993 WQMMaskMI =
994 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
995 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
996 .addReg(Exec)
997 .addReg(LiveMaskWQM);
998 } else {
999 // Kill - deactivate lanes no longer in live mask
1000 if (Op.isImm()) {
1001 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1002 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1003 } else if (!IsWQM) {
1004 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1005 .addReg(Exec)
1006 .addReg(LiveMaskReg);
1007 } else {
1008 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1009 NewTerm =
1010 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1011 }
1012 }
1013
1014 // Update live intervals
1015 LIS->RemoveMachineInstrFromMaps(MI);
1016 MBB.remove(&MI);
1017 assert(EarlyTermMI);
1018 assert(MaskUpdateMI);
1019 assert(NewTerm);
1020 if (ComputeKilledMaskMI)
1021 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1022 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1023 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1024 if (WQMMaskMI)
1025 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1026 LIS->InsertMachineInstrInMaps(*NewTerm);
1027
1028 if (CndReg) {
1029 LIS->removeInterval(CndReg);
1030 LIS->createAndComputeVirtRegInterval(CndReg);
1031 }
1032 if (TmpReg)
1033 LIS->createAndComputeVirtRegInterval(TmpReg);
1034 if (LiveMaskWQM)
1035 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1036
1037 return NewTerm;
1038}
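// Illustrative sketch (not part of this pass): the mask arithmetic performed
// by the kill lowering above, modelled on a plain 64-bit wave. S_ANDN2 clears
// the killed lanes from the live mask; a demote additionally re-expands the
// result to whole quads (S_WQM) before applying it to EXEC, so helper lanes
// of still-live quads keep running. All names are made up for the example.
struct SketchKillResult {
  unsigned long long LiveMask;
  unsigned long long Exec;
};
inline SketchKillResult sketchLowerKill(unsigned long long LiveMask,
                                        unsigned long long Exec,
                                        unsigned long long KilledLanes,
                                        bool IsDemote) {
  LiveMask &= ~KilledLanes; // S_ANDN2 LiveMask, LiveMask, KilledLanes
  unsigned long long NewExec = 0;
  if (IsDemote) {
    for (unsigned Quad = 0; Quad < 64; Quad += 4)
      if ((LiveMask >> Quad) & 0xFull) // quad still has a live lane?
        NewExec |= 0xFull << Quad;     // keep the whole quad enabled
    NewExec &= Exec;
  } else {
    NewExec = Exec & LiveMask; // plain kill: drop the dead lanes
  }
  return {LiveMask, NewExec};
}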
1039
1040// Convert a strict mode transition to a pseudo transition.
1041// This still pre-allocates registers to prevent clobbering,
1042// but avoids any EXEC mask changes.
1043void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1044 MachineInstr *Entry,
1045 MachineInstr *Exit) {
1046 assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1047 assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1048
1049 Register SaveOrig = Entry->getOperand(0).getReg();
1050
1051 MachineInstr *NewEntry =
1052 BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1053 MachineInstr *NewExit =
1054 BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1055
1056 LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1057 Exit->eraseFromParent();
1058
1059 LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1060 Entry->eraseFromParent();
1061
1062 LIS->removeInterval(SaveOrig);
1063}
1064
1065// Replace (or supplement) instructions accessing live mask.
1066// This can only happen once all the live mask registers have been created
1067// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1068void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1069 auto BII = Blocks.find(&MBB);
1070 if (BII == Blocks.end())
1071 return;
1072
1073 const BlockInfo &BI = BII->second;
1074 if (!BI.NeedsLowering)
1075 return;
1076
1077 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1078
1080 char State = BI.InitialState;
1081 MachineInstr *StrictEntry = nullptr;
1082
1083 for (MachineInstr &MI : llvm::make_early_inc_range(
1084 llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1085 char PreviousState = State;
1086
1087 if (StateTransition.count(&MI))
1088 State = StateTransition[&MI];
1089
1090 MachineInstr *SplitPoint = nullptr;
1091 switch (MI.getOpcode()) {
1092 case AMDGPU::SI_DEMOTE_I1:
1093 case AMDGPU::SI_KILL_I1_TERMINATOR:
1094 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1095 break;
1096 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1097 SplitPoint = lowerKillF32(MBB, MI);
1098 break;
1099 case AMDGPU::ENTER_STRICT_WQM:
1100 StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1101 break;
1102 case AMDGPU::EXIT_STRICT_WQM:
1103 if (State == StateWQM && StrictEntry) {
1104 // Transition WQM -> StrictWQM -> WQM detected.
1105 lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1106 }
1107 StrictEntry = nullptr;
1108 break;
1109 case AMDGPU::ENTER_STRICT_WWM:
1110 case AMDGPU::EXIT_STRICT_WWM:
1111 StrictEntry = nullptr;
1112 break;
1113 default:
1114 break;
1115 }
1116 if (SplitPoint)
1117 SplitPoints.push_back(SplitPoint);
1118 }
1119
1120 // Perform splitting after instruction scan to simplify iteration.
1121 if (!SplitPoints.empty()) {
1122 MachineBasicBlock *BB = &MBB;
1123 for (MachineInstr *MI : SplitPoints) {
1124 BB = splitBlock(BB, MI);
1125 }
1126 }
1127}
1128
1129// Return an iterator in the (inclusive) range [First, Last] at which
1130// instructions can be safely inserted, keeping in mind that some of the
1131// instructions we want to add necessarily clobber SCC.
1132MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1133 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1134 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1135 if (!SaveSCC)
1136 return PreferLast ? Last : First;
1137
1138 LiveRange &LR =
1139 LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1140 auto MBBE = MBB.end();
1141 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1142 : LIS->getMBBEndIdx(&MBB);
1143 SlotIndex LastIdx =
1144 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1145 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1146 const LiveRange::Segment *S;
1147
1148 for (;;) {
1149 S = LR.getSegmentContaining(Idx);
1150 if (!S)
1151 break;
1152
1153 if (PreferLast) {
1154 SlotIndex Next = S->start.getBaseIndex();
1155 if (Next < FirstIdx)
1156 break;
1157 Idx = Next;
1158 } else {
1159 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1160 assert(EndMI && "Segment does not end on valid instruction");
1161 auto NextI = std::next(EndMI->getIterator());
1162 if (NextI == MBB.end())
1163 break;
1164 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1165 if (Next > LastIdx)
1166 break;
1167 Idx = Next;
1168 }
1169 }
1170
1171 MachineBasicBlock::iterator MBBI;
1172
1173 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1174 MBBI = MI;
1175 else {
1176 assert(Idx == LIS->getMBBEndIdx(&MBB));
1177 MBBI = MBB.end();
1178 }
1179
1180 // Move insertion point past any operations modifying EXEC.
1181 // This assumes that the value of SCC defined by any of these operations
1182 // does not need to be preserved.
1183 while (MBBI != Last) {
1184 bool IsExecDef = false;
1185 for (const MachineOperand &MO : MBBI->all_defs()) {
1186 IsExecDef |=
1187 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1188 }
1189 if (!IsExecDef)
1190 break;
1191 MBBI++;
1192 S = nullptr;
1193 }
1194
1195 if (S)
1196 MBBI = saveSCC(MBB, MBBI);
1197
1198 return MBBI;
1199}
1200
1201void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1202 MachineBasicBlock::iterator Before,
1203 Register SaveWQM) {
1204 bool IsTerminator = Before == MBB.end();
1205 if (!IsTerminator) {
1206 auto FirstTerm = MBB.getFirstTerminator();
1207 if (FirstTerm != MBB.end()) {
1208 SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1209 SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1210 IsTerminator = BeforeIdx > FirstTermIdx;
1211 }
1212 }
1213
1214 MachineInstr *MI;
1215
1216 if (SaveWQM) {
1217 unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1218 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1219 .addReg(LiveMaskReg);
1220 } else {
1221 unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1222 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1223 .addReg(Exec)
1224 .addReg(LiveMaskReg);
1225 }
1226
1227 LIS->InsertMachineInstrInMaps(*MI);
1228 StateTransition[MI] = StateExact;
1229}
1230
1231void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1232 MachineBasicBlock::iterator Before,
1233 Register SavedWQM) {
1234 MachineInstr *MI;
1235
1236 if (SavedWQM) {
1237 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1238 .addReg(SavedWQM);
1239 } else {
1240 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1241 }
1242
1243 LIS->InsertMachineInstrInMaps(*MI);
1244 StateTransition[MI] = StateWQM;
1245}
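// Illustrative sketch (not part of this pass): the save/restore pattern that
// toExact and toWQM implement, reduced to mask arithmetic on one wave.
// Entering an Exact region saves the WQM exec and intersects EXEC with the
// live mask; leaving it copies the saved value back. Names are made up.
inline void sketchExactRegionRoundTrip(unsigned long long &Exec,
                                       unsigned long long LiveMask) {
  unsigned long long SavedWQM = Exec; // S_AND_SAVEEXEC_B64 Tmp, LiveMask
  Exec &= LiveMask;                   //   (copies EXEC, then ANDs it)
  // ... Exact instructions run here with only genuinely live lanes ...
  Exec = SavedWQM;                    // COPY EXEC, Tmp (back to WQM)
}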
1246
1247void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1248 MachineBasicBlock::iterator Before,
1249 Register SaveOrig, char StrictStateNeeded) {
1250 MachineInstr *MI;
1251 assert(SaveOrig);
1252 assert(StrictStateNeeded == StateStrictWWM ||
1253 StrictStateNeeded == StateStrictWQM);
1254
1255 if (StrictStateNeeded == StateStrictWWM) {
1256 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1257 SaveOrig)
1258 .addImm(-1);
1259 } else {
1260 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1261 SaveOrig)
1262 .addImm(-1);
1263 }
1264 LIS->InsertMachineInstrInMaps(*MI);
1265 StateTransition[MI] = StrictStateNeeded;
1266
1267 // Mark block as needing lowering so it will be checked for unnecessary transitions.
1268 auto BII = Blocks.find(&MBB);
1269 if (BII != Blocks.end())
1270 BII->second.NeedsLowering = true;
1271}
1272
1273void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1274 MachineBasicBlock::iterator Before,
1275 Register SavedOrig, char NonStrictState,
1276 char CurrentStrictState) {
1277 MachineInstr *MI;
1278
1279 assert(SavedOrig);
1280 assert(CurrentStrictState == StateStrictWWM ||
1281 CurrentStrictState == StateStrictWQM);
1282
1283 if (CurrentStrictState == StateStrictWWM) {
1284 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1285 Exec)
1286 .addReg(SavedOrig);
1287 } else {
1288 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1289 Exec)
1290 .addReg(SavedOrig);
1291 }
1292 LIS->InsertMachineInstrInMaps(*MI);
1293 StateTransition[MI] = NonStrictState;
1294}
1295
1296void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1297 auto BII = Blocks.find(&MBB);
1298 if (BII == Blocks.end())
1299 return;
1300
1301 BlockInfo &BI = BII->second;
1302
1303 // This is a non-entry block that is WQM throughout, so no need to do
1304 // anything.
1305 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1306 BI.InitialState = StateWQM;
1307 return;
1308 }
1309
1310 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1311 << ":\n");
1312
1313 Register SavedWQMReg;
1314 Register SavedNonStrictReg;
1315 bool WQMFromExec = IsEntry;
1316 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1317 char NonStrictState = 0;
1318 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1319
1320 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1321 if (IsEntry) {
1322 // Skip the instruction that saves LiveMask
1323 if (II != IE && II->getOpcode() == AMDGPU::COPY)
1324 ++II;
1325 }
1326
1327 // This stores the first instruction where it's safe to switch from WQM to
1328 // Exact or vice versa.
1329 MachineBasicBlock::iterator FirstWQM = IE;
1330
1331 // This stores the first instruction where it's safe to switch from Strict
1332 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1333 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1334 // be safe to switch to/from WQM as well.
1335 MachineBasicBlock::iterator FirstStrict = IE;
1336
1337 // Record initial state in block information.
1338 BI.InitialState = State;
1339
1340 for (;;) {
1341 MachineBasicBlock::iterator Next = II;
1342 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1343 char OutNeeds = 0;
1344
1345 if (FirstWQM == IE)
1346 FirstWQM = II;
1347
1348 if (FirstStrict == IE)
1349 FirstStrict = II;
1350
1351 // First, figure out the allowed states (Needs) based on the propagated
1352 // flags.
1353 if (II != IE) {
1354 MachineInstr &MI = *II;
1355
1356 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1357 auto III = Instructions.find(&MI);
1358 if (III != Instructions.end()) {
1359 if (III->second.Needs & StateStrictWWM)
1360 Needs = StateStrictWWM;
1361 else if (III->second.Needs & StateStrictWQM)
1362 Needs = StateStrictWQM;
1363 else if (III->second.Needs & StateWQM)
1364 Needs = StateWQM;
1365 else
1366 Needs &= ~III->second.Disabled;
1367 OutNeeds = III->second.OutNeeds;
1368 }
1369 } else {
1370 // If the instruction doesn't actually need a correct EXEC, then we can
1371 // safely leave Strict mode enabled.
1372 Needs = StateExact | StateWQM | StateStrict;
1373 }
1374
1375 // Exact mode exit can occur in terminators, but must be before branches.
1376 if (MI.isBranch() && OutNeeds == StateExact)
1377 Needs = StateExact;
1378
1379 ++Next;
1380 } else {
1381 // End of basic block
1382 if (BI.OutNeeds & StateWQM)
1383 Needs = StateWQM;
1384 else if (BI.OutNeeds == StateExact)
1385 Needs = StateExact;
1386 else
1387 Needs = StateWQM | StateExact;
1388 }
1389
1390 // Now, transition if necessary.
1391 if (!(Needs & State)) {
1392 MachineBasicBlock::iterator First;
1393 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1394 State == StateStrictWQM || Needs == StateStrictWQM) {
1395 // We must switch to or from Strict mode.
1396 First = FirstStrict;
1397 } else {
1398 // We only need to switch to/from WQM, so we can use FirstWQM.
1399 First = FirstWQM;
1400 }
1401
1402 // Whether we need to save SCC depends on start and end states.
1403 bool SaveSCC = false;
1404 switch (State) {
1405 case StateExact:
1406 case StateStrictWWM:
1407 case StateStrictWQM:
1408 // Exact/Strict -> Strict: save SCC
1409 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1410 // Exact/Strict -> Exact: no save
1411 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1412 break;
1413 case StateWQM:
1414 // WQM -> Exact/Strict: save SCC
1415 SaveSCC = !(Needs & StateWQM);
1416 break;
1417 default:
1418 llvm_unreachable("Unknown state");
1419 break;
1420 }
1421 MachineBasicBlock::iterator Before =
1422 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1423
1424 if (State & StateStrict) {
1425 assert(State == StateStrictWWM || State == StateStrictWQM);
1426 assert(SavedNonStrictReg);
1427 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1428
1429 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1430 SavedNonStrictReg = 0;
1431 State = NonStrictState;
1432 }
1433
1434 if (Needs & StateStrict) {
1435 NonStrictState = State;
1436 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1437 assert(!SavedNonStrictReg);
1438 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1439
1440 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1441 State = Needs;
1442
1443 } else {
1444 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1445 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1446 assert(!SavedWQMReg);
1447 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1448 }
1449
1450 toExact(MBB, Before, SavedWQMReg);
1451 State = StateExact;
1452 } else if (State == StateExact && (Needs & StateWQM) &&
1453 !(Needs & StateExact)) {
1454 assert(WQMFromExec == (SavedWQMReg == 0));
1455
1456 toWQM(MBB, Before, SavedWQMReg);
1457
1458 if (SavedWQMReg) {
1459 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1460 SavedWQMReg = 0;
1461 }
1462 State = StateWQM;
1463 } else {
1464 // We can get here if we transitioned from StrictWWM to a
1465 // non-StrictWWM state that already matches our needs, so we
1466 // don't need to do anything.
1467 assert(Needs & State);
1468 }
1469 }
1470 }
1471
1472 if (Needs != (StateExact | StateWQM | StateStrict)) {
1473 if (Needs != (StateExact | StateWQM))
1474 FirstWQM = IE;
1475 FirstStrict = IE;
1476 }
1477
1478 if (II == IE)
1479 break;
1480
1481 II = Next;
1482 }
1483 assert(!SavedWQMReg);
1484 assert(!SavedNonStrictReg);
1485}
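// Illustrative sketch (not part of this pass): the core decision in the loop
// above. "Needs" is the set of states the next instruction is allowed to run
// in, and a transition is emitted only when the current state falls outside
// that set. The helper below is a made-up reduction of that test.
inline bool sketchNeedsTransition(char State, char Needs) {
  return (Needs & State) == 0; // current state not allowed -> switch modes
}
// E.g. State = StateExact with Needs = StateExact | StateWQM stays put, while
// State = StateWQM with Needs = StateExact forces a WQM -> Exact switch.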
1486
1487void SIWholeQuadMode::lowerLiveMaskQueries() {
1488 for (MachineInstr *MI : LiveMaskQueries) {
1489 const DebugLoc &DL = MI->getDebugLoc();
1490 Register Dest = MI->getOperand(0).getReg();
1491
1492 MachineInstr *Copy =
1493 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1494 .addReg(LiveMaskReg);
1495
1496 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1497 MI->eraseFromParent();
1498 }
1499}
1500
1501void SIWholeQuadMode::lowerCopyInstrs() {
1502 for (MachineInstr *MI : LowerToMovInstrs) {
1503 assert(MI->getNumExplicitOperands() == 2);
1504
1505 const Register Reg = MI->getOperand(0).getReg();
1506
1507 const TargetRegisterClass *regClass =
1508 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1509 if (TRI->isVGPRClass(regClass)) {
1510 const unsigned MovOp = TII->getMovOpcode(regClass);
1511 MI->setDesc(TII->get(MovOp));
1512
1513 // Check that it already implicitly depends on exec (like all VALU movs
1514 // should do).
1515 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1516 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1517 }));
1518 } else {
1519 // Remove early-clobber and exec dependency from simple SGPR copies.
1520 // This allows some to be eliminated during/post RA.
1521 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1522 if (MI->getOperand(0).isEarlyClobber()) {
1523 LIS->removeInterval(Reg);
1524 MI->getOperand(0).setIsEarlyClobber(false);
1525 LIS->createAndComputeVirtRegInterval(Reg);
1526 }
1527 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1528 while (Index >= 0) {
1529 MI->removeOperand(Index);
1530 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1531 }
1532 MI->setDesc(TII->get(AMDGPU::COPY));
1533 LLVM_DEBUG(dbgs() << " -> " << *MI);
1534 }
1535 }
1536 for (MachineInstr *MI : LowerToCopyInstrs) {
1537 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1538 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1539 assert(MI->getNumExplicitOperands() == 3);
1540 // The only reason we should be here is that V_SET_INACTIVE has
1541 // an undef input so it is being replaced by a simple copy.
1542 // There should be a second undef source that we should remove.
1543 assert(MI->getOperand(2).isUndef());
1544 MI->removeOperand(2);
1545 MI->untieRegOperand(1);
1546 } else {
1547 assert(MI->getNumExplicitOperands() == 2);
1548 }
1549
1550 unsigned CopyOp = MI->getOperand(1).isReg()
1551 ? (unsigned)AMDGPU::COPY
1552 : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1553 *MRI, MI->getOperand(0)));
1554 MI->setDesc(TII->get(CopyOp));
1555 }
1556}
1557
1558void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1559 for (MachineInstr *MI : KillInstrs) {
1560 MachineBasicBlock *MBB = MI->getParent();
1561 MachineInstr *SplitPoint = nullptr;
1562 switch (MI->getOpcode()) {
1563 case AMDGPU::SI_DEMOTE_I1:
1564 case AMDGPU::SI_KILL_I1_TERMINATOR:
1565 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1566 break;
1567 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1568 SplitPoint = lowerKillF32(*MBB, *MI);
1569 break;
1570 default:
1571 continue;
1572 }
1573 if (SplitPoint)
1574 splitBlock(MBB, SplitPoint);
1575 }
1576}
1577
1578bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1579 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1580 << " ------------- \n");
1581 LLVM_DEBUG(MF.dump(););
1582
1583 Instructions.clear();
1584 Blocks.clear();
1585 LiveMaskQueries.clear();
1586 LowerToCopyInstrs.clear();
1587 LowerToMovInstrs.clear();
1588 KillInstrs.clear();
1589 StateTransition.clear();
1590
1591 ST = &MF.getSubtarget<GCNSubtarget>();
1592
1593 TII = ST->getInstrInfo();
1594 TRI = &TII->getRegisterInfo();
1595 MRI = &MF.getRegInfo();
1596 LIS = &getAnalysis<LiveIntervals>();
1597 MDT = &getAnalysis<MachineDominatorTree>();
1598 PDT = &getAnalysis<MachinePostDominatorTree>();
1599
1600 if (ST->isWave32()) {
1601 AndOpc = AMDGPU::S_AND_B32;
1602 AndTermOpc = AMDGPU::S_AND_B32_term;
1603 AndN2Opc = AMDGPU::S_ANDN2_B32;
1604 XorOpc = AMDGPU::S_XOR_B32;
1605 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1606 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1607 WQMOpc = AMDGPU::S_WQM_B32;
1608 Exec = AMDGPU::EXEC_LO;
1609 } else {
1610 AndOpc = AMDGPU::S_AND_B64;
1611 AndTermOpc = AMDGPU::S_AND_B64_term;
1612 AndN2Opc = AMDGPU::S_ANDN2_B64;
1613 XorOpc = AMDGPU::S_XOR_B64;
1614 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1615 AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1616 WQMOpc = AMDGPU::S_WQM_B64;
1617 Exec = AMDGPU::EXEC;
1618 }
1619
1620 const char GlobalFlags = analyzeFunction(MF);
1621 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1622
1623 LiveMaskReg = Exec;
1624
1625 // Shader is simple and does not need any state changes or complex lowering
1626 if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1627 LowerToMovInstrs.empty() && KillInstrs.empty()) {
1628 lowerLiveMaskQueries();
1629 return !LiveMaskQueries.empty();
1630 }
1631
1632 MachineBasicBlock &Entry = MF.front();
1633 MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1634
1635 // Store a copy of the original live mask when required
1636 if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1637 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1638 MachineInstr *MI =
1639 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1640 .addReg(Exec);
1641 LIS->InsertMachineInstrInMaps(*MI);
1642 }
1643
1644 LLVM_DEBUG(printInfo());
1645
1646 lowerLiveMaskQueries();
1647 lowerCopyInstrs();
1648
1649 // Shader only needs WQM
1650 if (GlobalFlags == StateWQM) {
1651 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1652 .addReg(Exec);
1653 LIS->InsertMachineInstrInMaps(*MI);
1654 lowerKillInstrs(true);
1655 } else {
1656 for (auto BII : Blocks)
1657 processBlock(*BII.first, BII.first == &Entry);
1658 // Lowering blocks causes block splitting so perform as a second pass.
1659 for (auto BII : Blocks)
1660 lowerBlock(*BII.first);
1661 }
1662
1663 // Compute live range for live mask
1664 if (LiveMaskReg != Exec)
1665 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1666
1667 // Physical registers like SCC aren't tracked by default anyway, so just
1668 // removing the ranges we computed is the simplest option for maintaining
1669 // the analysis results.
1670 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1671
1672 // If we performed any kills then recompute EXEC
1673 if (!KillInstrs.empty())
1674 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1675
1676 return true;
1677}