// Defines DEBUG_TYPE as the string "amdgpu-insert-delay-alu".
// NOTE(review): presumably the tag under which this file's dbgs() output is
// filtered; the consumer of the macro is not visible here — confirm.
23#define DEBUG_TYPE "amdgpu-insert-delay-alu"
27class AMDGPUInsertDelayAlu {
41 if (
MI.getDesc().TSFlags & VA_VDST_0)
43 if (
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
44 MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
46 if (
MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
52 static bool instructionWaitsForSGPRWrites(
const MachineInstr &
MI) {
59 for (
auto &
Op :
MI.operands()) {
// Classification of an instruction for delay tracking. The advance() routines
// count VALU and TRANS instructions separately (VALUNum / TRANSNum), and the
// per-block scan appears to inspect an instruction's explicit uses only when
// its type is not OTHER.
68 enum DelayType { VALU, TRANS, SALU, OTHER };
// Upper bound for the VALU instruction counters. advance() tests
// VALUNum >= VALU_MAX alongside cycle expiry, TRANSNumVALU is initialised and
// reset to this value, and a VALU count is only encoded into the immediate (or
// dumped) while it is still below this bound.
93 static constexpr unsigned VALU_MAX = 5;
// Upper bound for the TRANS instruction counter. advance() tests
// TRANSNum >= TRANS_MAX alongside cycle expiry and resets TRANSNum to this
// value; a TRANS count is only encoded into the immediate (as 4 + TRANSNum)
// while it is still below this bound.
97 static constexpr unsigned TRANS_MAX = 4;
// Clamp applied to the SALU cycle count when a DelayInfo is constructed
// (std::min(Cycles, SALU_CYCLES_MAX)). The encoder asserts that any non-zero
// SALUCycles it sees is strictly below this value.
101 static constexpr unsigned SALU_CYCLES_MAX = 4;
// VALU-instruction counter kept next to the TRANS state: advance() adds one
// for each VALU instruction, merge() keeps the minimum of both sides, and it
// is reset to VALU_MAX together with TRANSNum. The encoder only emits a VALU
// count when VALUNum <= TRANSNumVALU. Starts at VALU_MAX.
118 uint8_t TRANSNumVALU = VALU_MAX;
// Defaulted constructor: fields keep their in-class initializers (for example
// TRANSNumVALU starts at VALU_MAX).
124 DelayInfo() =
default;
126 DelayInfo(DelayType
Type,
unsigned Cycles) {
135 TRANSCycles = Cycles;
142 SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
148 return VALUCycles ==
RHS.VALUCycles && VALUNum ==
RHS.VALUNum &&
149 TRANSCycles ==
RHS.TRANSCycles && TRANSNum ==
RHS.TRANSNum &&
150 TRANSNumVALU ==
RHS.TRANSNumVALU && SALUCycles ==
RHS.SALUCycles;
158 VALUCycles = std::max(VALUCycles,
RHS.VALUCycles);
159 VALUNum = std::min(VALUNum,
RHS.VALUNum);
160 TRANSCycles = std::max(TRANSCycles,
RHS.TRANSCycles);
161 TRANSNum = std::min(TRANSNum,
RHS.TRANSNum);
162 TRANSNumVALU = std::min(TRANSNumVALU,
RHS.TRANSNumVALU);
163 SALUCycles = std::max(SALUCycles,
RHS.SALUCycles);
169 bool advance(DelayType
Type,
unsigned Cycles) {
172 VALUNum += (
Type == VALU);
173 if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
179 VALUCycles -= Cycles;
183 TRANSNum += (
Type == TRANS);
184 TRANSNumVALU += (
Type == VALU);
185 if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
188 TRANSNum = TRANS_MAX;
189 TRANSNumVALU = VALU_MAX;
192 TRANSCycles -= Cycles;
196 if (SALUCycles <= Cycles) {
201 SALUCycles -= Cycles;
208#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
211 dbgs() <<
" VALUCycles=" << (int)VALUCycles;
212 if (VALUNum < VALU_MAX)
213 dbgs() <<
" VALUNum=" << (int)VALUNum;
215 dbgs() <<
" TRANSCycles=" << (int)TRANSCycles;
216 if (TRANSNum < TRANS_MAX)
217 dbgs() <<
" TRANSNum=" << (int)TRANSNum;
218 if (TRANSNumVALU < VALU_MAX)
219 dbgs() <<
" TRANSNumVALU=" << (int)TRANSNumVALU;
221 dbgs() <<
" SALUCycles=" << (int)SALUCycles;
227 struct DelayState :
DenseMap<MCRegUnit, DelayInfo> {
231 for (
const auto &KV :
RHS) {
234 std::tie(It, Inserted) = insert(KV);
236 It->second.merge(KV.second);
242 void advance(DelayType
Type,
unsigned Cycles) {
244 for (
auto I = begin(),
E = end();
I !=
E;
I =
Next) {
246 if (
I->second.advance(
Type, Cycles))
251 void advanceByVALUNum(
unsigned VALUNum) {
253 for (
auto I = begin(),
E = end();
I !=
E;
I =
Next) {
255 if (
I->second.VALUNum >= VALUNum &&
I->second.VALUCycles > 0) {
261#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
264 dbgs() <<
" empty\n";
274 return A->first <
B->first;
294 if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
295 Imm |= 4 + Delay.TRANSNum;
299 if (Delay.VALUNum < DelayInfo::VALU_MAX &&
300 Delay.VALUNum <= Delay.TRANSNumVALU) {
302 Imm |= Delay.VALUNum << 7;
304 Imm |= Delay.VALUNum;
308 if (Delay.SALUCycles) {
309 assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
313 }
else if (Imm & 0xf) {
314 Imm |= (Delay.SALUCycles + 8) << 7;
316 Imm |= Delay.SALUCycles + 8;
326 if (!(Imm & 0x780) && LastDelayAlu) {
331 if (
I->getOpcode() == AMDGPU::S_SET_VGPR_MSB) {
338 if (!
I->isBundle() && !
I->isMetaInstruction())
343 unsigned LastImm =
Op.getImm();
344 assert((LastImm & ~0xf) == 0 &&
345 "Remembered an s_delay_alu with no room for another delay!");
346 LastImm |= Imm << 7 | Skip << 4;
352 auto &
MBB = *
MI.getParent();
357 return (Imm & 0x780) ? nullptr : DelayAlu;
362 for (
auto *Pred :
MBB.predecessors())
363 State.merge(BlockState[Pred]);
373 MCRegUnit LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
376 for (
auto &
MI :
MBB.instrs()) {
377 if (
MI.isBundle() ||
MI.isMetaInstruction())
381 switch (
MI.getOpcode()) {
382 case AMDGPU::SI_RETURN_TO_EPILOG:
386 DelayType
Type = getDelayType(
MI);
388 if (instructionWaitsForSGPRWrites(
MI)) {
389 auto It = State.find(LastSGPRFromVALU);
390 if (It != State.end()) {
391 DelayInfo Info = It->getSecond();
392 State.advanceByVALUNum(Info.VALUNum);
394 LastSGPRFromVALU =
static_cast<MCRegUnit
>(0);
398 if (instructionWaitsForVALU(
MI)) {
401 State = DelayState();
402 }
else if (
Type != OTHER) {
405 for (
const auto &
Op :
MI.explicit_uses()) {
410 if (
MI.getOpcode() == AMDGPU::V_WRITELANE_B32 &&
Op.isTied())
412 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg())) {
413 auto It = State.find(Unit);
414 if (It != State.end()) {
415 Delay.merge(It->second);
423 for (
const auto &
Op :
MI.defs()) {
426 LastSGPRFromVALU = *
TRI->regunits(
Reg).begin();
432 if (Emit && !
MI.isBundledWithPred()) {
435 LastDelayAlu = emitDelayAlu(
MI, Delay, LastDelayAlu);
441 for (
const auto &
Op :
MI.defs()) {
443 &
MI,
Op.getOperandNo(),
nullptr, 0);
444 for (MCRegUnit Unit :
TRI->regunits(
Op.getReg()))
456 State.advance(
Type, Cycles);
463 "Basic block state should not have changed on final pass!");
464 }
else if (DelayState &BS = BlockState[&
MBB]; State != BS) {
465 BS = std::move(State);
476 if (!ST->hasDelayAlu())
484 SII = ST->getInstrInfo();
485 TRI = ST->getRegisterInfo();
493 while (!WorkList.
empty()) {
495 bool Changed = runOnMachineBasicBlock(
MBB,
false);
525 AMDGPUInsertDelayAlu Impl;
534 if (!AMDGPUInsertDelayAlu().
run(MF))
// Out-of-class definition of the static ID member of
// AMDGPUInsertDelayAluLegacy, initialised to 0.
// NOTE(review): presumably the legacy pass-manager identity token whose
// address is exported as AMDGPUInsertDelayAluID — confirm against the header.
541char AMDGPUInsertDelayAluLegacy::ID = 0;
546 "AMDGPU Insert Delay ALU",
false,
false)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B)
Register const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Interface definition for SIInstrInfo.
This file implements a set that has insertion order iteration characteristics.
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass iff it does not add or remove basic blocks from the function or modify terminator instructions in any way.
Represents analyses that only rely on functions' control flow.
Instructions::iterator instr_iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
StringRef getName() const
getName - Return the name of the corresponding LLVM function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
bool isXDLWMMA(const MachineInstr &MI) const
static bool isSALU(const MachineInstr &MI)
const TargetSchedModel & getSchedModel() const
static bool isTRANS(const MachineInstr &MI)
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
static bool isVALU(const MachineInstr &MI)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
unsigned getMaxWavesPerEU() const
A vector that has set insertion semantics.
void insert_range(Range &&R)
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
LLVM_ABI unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx, const MachineInstr *UseMI, unsigned UseOperIdx) const
Compute operand latency based on the available machine model.
The instances of the Type class are immutable: once they are created, they are never changed.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned decodeFieldVaVdst(unsigned Encoded)
bool isSGPR(MCRegister Reg, const MCRegisterInfo *TRI)
Is Reg - scalar register.
bool isDPMACCInstruction(unsigned Opc)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool operator!=(uint64_t V1, const APInt &V2)
LLVM_ABI Printable printRegUnit(MCRegUnit Unit, const TargetRegisterInfo *TRI)
Create Printable object to print register units on a raw_ostream.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container.
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & AMDGPUInsertDelayAluID
FunctionAddr VTableAddr Next
DWARFExpression::Operation Op
LLVM_ABI Printable printMBBReference(const MachineBasicBlock &MBB)
Prints a machine basic block reference.
PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &MFAM)