#include "llvm/IR/IntrinsicsAMDGPU.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS, RHS, True, False;
    CmpInst::Predicate Pred;
  };

  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool matchRcpSqrtToRsq(
      MachineInstr &MI,
      std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  bool matchCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&SubwordBufferLoad) const;
  void applyCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&SubwordBufferLoad) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};
#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL
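// Note (explanatory, not from the upstream source): the repeated includes of
// AMDGPUGenPostLegalizeGICombiner.inc expand different slices of the
// TableGen-generated combiner. GET_GICOMBINER_DEPS pulls in the declarations
// the generated code depends on, GET_GICOMBINER_TYPES the rule-config type,
// GET_GICOMBINER_CLASS_MEMBERS the per-rule class members, and
// GET_GICOMBINER_IMPL the generated match-table executor. AMDGPUSubtarget is
// remapped to GCNSubtarget so the generated code queries the GCN subtarget.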
AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster
    // and the same code size.
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}
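// Illustrative sketch (not from the upstream source): for a constant shift
// amount of at least 32, tryCombineShiftToUnmerge splits the 64-bit value
// into 32-bit halves so that only a 32-bit shift remains, roughly:
//   %r:_(s64) = G_LSHR %x:_(s64), %c   ; constant %c >= 32
// =>
//   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x
//   %s:_(s32) = G_LSHR %hi, (%c - 32)
//   %zero:_(s32) = G_CONSTANT i32 0
//   %r:_(s64) = G_MERGE_VALUES %s, %zero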
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // The select must choose between exactly the two compared values, in
  // either order.
  return (Info.LHS == Info.True && Info.RHS == Info.False) ||
         (Info.LHS == Info.False && Info.RHS == Info.True);
}
void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with
    // NaN, so permute it based on the original compare type.
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}
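// Illustrative sketch (not from the upstream source): fmin_legacy/fmax_legacy
// return the second operand if either input is NaN, so the operand order
// chosen above encodes the NaN behavior of the original compare-and-select.
// For example, with an ordered less-than selecting its left operand:
//   %c:_(s1) = G_FCMP floatpred(olt), %x, %y
//   %r:_(s32) = G_SELECT %c, %x, %y
// =>
//   %r:_(s32) = G_AMDGPU_FMIN_LEGACY %x, %y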
bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    // The conversion only reads the low byte, so fire only if everything
    // above it is known to be zero.
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}
void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}
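// Illustrative sketch (not from the upstream source): when known-bits analysis
// proves that only the low byte of the integer source can be nonzero, the
// int-to-float conversion collapses into a single byte-convert instruction:
//   %m:_(s32) = G_CONSTANT i32 255
//   %b:_(s32) = G_AND %x, %m
//   %f:_(s32) = G_UITOFP %b
// =>
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %b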
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    MachineInstr *SqrtSrcMI = nullptr;
    mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;

  // rcp(sqrt(x)) -> rsq(x)
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x)) -> rsq(x)
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  return false;
}
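// Illustrative sketch (not from the upstream source): both nestings of the
// two intrinsics fold to a single hardware reciprocal square root:
//   %s:_(s32) = G_FSQRT %x
//   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %s
// =>
//   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x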
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  const bool IsShr =
      mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  return false;
}
void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}
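// Illustrative sketch (not from the upstream source): a constant shift of the
// source is folded into the byte index of the conversion:
//   %s:_(s32) = G_LSHR %x, 16
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
// =>
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x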
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}
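// Illustrative sketch (not from the upstream source): if the value feeding
// G_FCANONICALIZE is already canonical (for instance, the result of an FP
// arithmetic instruction), the canonicalize is redundant and every use of its
// result can be rewritten to the input register:
//   %y:_(s32) = G_FMUL %a, %b
//   %z:_(s32) = G_FCANONICALIZE %y
// =>
//   uses of %z are replaced with %y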
// Sub-word buffer loads are initially selected as buffer_load_{u8, u16}; when
// the loaded value is then sign-extended, the pair is combined into
// buffer_load_{i8, i16}.

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  Register Op0Reg = MI.getOperand(1).getReg();
  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);

  if (!MRI.hasOneNonDBGUse(Op0Reg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer
  // load instruction.
  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
}
// Combine sign extension of buffer_load_{u8, u16} into buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  // Replace the opcode of the unsigned load with its signed counterpart.
  unsigned Opc =
      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
  SubwordBufferLoad->setDesc(TII.get(Opc));
  // Update the destination register of SubwordBufferLoad with the destination
  // register of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}
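// Illustrative sketch (not from the upstream source):
//   %v:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE %rsrc, ...
//   %s:_(s32) = G_SEXT_INREG %v, 8
// =>
//   %s:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE %rsrc, ...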
class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}
char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

FunctionPass *llvm::createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
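// Usage note (assumption, not from this file): the AMDGPU target pipeline
// schedules this pass after the GlobalISel legalizer. Its effect on a single
// function can be inspected with something like:
//   llc -mtriple=amdgcn-- -global-isel \
//       -stop-after=amdgpu-postlegalizer-combiner input.ll -o -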