#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"
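// Command-line override (excerpted below): prefer the legality rules GlobalISel
// actually wants over rules kept compatible with the SelectionDAG patterns.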
49 "amdgpu-global-isel-new-legality",
50 cl::desc(
"Use GlobalISel desired legality, rather than try to use"
51 "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();
    const LLT Ty = Query.Types[TypeIdx];
    return Ty.getNumElements() % 2 != 0 &&
           EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
    const LLT Ty = Query.Types[TypeIdx];
    const int Size = Ty.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));
    const LLT Ty = Query.Types[TypeIdx];
    const unsigned EltSize = Ty.getElementType().getSizeInBits();
    assert(EltSize == 32 || EltSize == 64);
    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
    return std::pair(TypeIdx,
  const unsigned NumElems = Ty.getElementCount().getFixedValue();
  const unsigned Size = Ty.getSizeInBits();
    const LLT Ty = Query.Types[TypeIdx];
    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
  return ((ST.useRealTrue16Insts() && Size == 16) ||
          Size % 32 == 0) &&
    return EltSize == 16 || EltSize % 32 == 0;
  const int EltSize = Ty.getElementType().getSizeInBits();
  return EltSize == 32 || EltSize == 64 ||
         (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
         EltSize == 128 || EltSize == 256;
    LLT Ty = Query.Types[TypeIdx];
    const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
         (ST.useRealTrue16Insts() && Ty == S16) ||
    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;
    return ST.useDS128() ? 128 : 64;
    return IsLoad ? 512 : 128;
    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
  unsigned RegSize = Ty.getSizeInBits();
  unsigned AS = Query.Types[1].getAddressSpace();
  if (Ty.isVector() && MemSize != RegSize)
    if (IsLoad && MemSize < Size)
      MemSize = std::max(MemSize, Align);
    if (!ST.hasDwordx3LoadStores())
  if (AlignBits < MemSize) {
            Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())
    unsigned EltSize = Ty.getScalarSizeInBits();
    return EltSize != 32 && EltSize != 64;
  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();
  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
                               uint64_t AlignInBits, unsigned AddrSpace,
  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
  if (AlignInBits < RoundedSize)
      RoundedSize, AddrSpace, Align(AlignInBits / 8),
      Query.Types[1].getAddressSpace(), Opcode);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);
  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);
    const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
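// AMDGPULegalizerInfo constructor (excerpted): defines the per-address-space
// pointer LLTs and then declares the legalization rules for each generic
// opcode on this subtarget.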
  auto GetAddrSpacePtr = [&TM](unsigned AS) {
  const LLT BufferStridedPtr =
  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {
        .widenScalarToNextMultipleOf(0, 32)
  if (ST.hasMad64_32())
  if (ST.hasIntClamp()) {
                     {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S8, 2)
                     {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
                                  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
      .clampScalar(0, S16, S64);
                     { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
                       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});
  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
  auto &MinNumMaxNumIeee =
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)
      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)
  if (ST.hasVOP3PInsts())
        .legalFor(FPTypesPK16)
  if (ST.has16BitInsts()) {
  if (ST.hasFractBug()) {
  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
  FPTruncActions.scalarize(0).lower();

  if (ST.has16BitInsts()) {
  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)
        .clampMaxNumElements(0, S16, 2)
  if (ST.has16BitInsts())
  if (ST.has16BitInsts())
      .clampScalar(0, S16, S64)
      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

      .scalarSameSizeAs(1, 0)
          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});
      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
  if (ST.hasSALUFloatInsts())
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)
  if (ST.has16BitInsts())
      .clampScalar(0, S32, S32)
  if (ST.has16BitInsts())
      .widenScalarToNextPow2(1)
      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)
      .clampScalar(0, S32, S32)
      .clampScalar(0, S32, S64)

  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)
  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)
        .clampMaxNumElements(0, S16, 2)
      .widenScalarToNextPow2(0)
      .legalForCartesianProduct(AddrSpaces32, {S32})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    unsigned NumRegs = (MemSize + 31) / 32;
      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
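  // Load/store rules (excerpted): each {value type, pointer type, memory type,
  // alignment} tuple below is declared legal directly; everything else is
  // split, widened, or custom-lowered by the predicates that follow.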
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;
    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
    Actions.customIf(typeIs(1, Constant32Ptr));
          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)
          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {
            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);
            return std::pair(0, EltTy);
          return std::pair(0, EltTy);
        .widenScalarToNextPow2(0)
      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})
  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
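  // Atomic RMW rules (excerpted): integer atomics are legal on global, LDS,
  // region, and (when available) flat pointers; FP atomics are enabled per
  // subtarget feature below.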
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});

  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  auto &AtomicFMinFMax =
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
  if (ST.hasAtomicFMinFMaxF64FlatInsts())
                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
                                LocalPtr, FlatPtr, PrivatePtr,
      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
          return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);
    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
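  // G_EXTRACT_VECTOR_ELT / G_INSERT_VECTOR_ELT: the vector and element operand
  // indices differ between the two opcodes, so the rules are built generically
  // over Op.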
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&
          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,
      .clampScalar(EltTypeIdx, S32, S64)
        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];
  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)
    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {
          const LLT BigTy = Query.Types[BigTyIdx];
            return notValidElt(Query, LitTyIdx);
            return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;
          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;
          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
  FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
      .clampScalar(1, S32, S32)
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)
       G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
       G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  verify(*ST.getInstrInfo());
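// legalizeCustom dispatch (excerpted): each opcode marked custom above is
// routed to a dedicated lowering helper.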
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
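// getSegmentAperture (excerpted): reads the LDS/private aperture either from
// the aperture registers or from the implicit kernarg/queue pointer.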
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(
        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
    Register KernargPtrReg = MRI.createGenericVirtualRegister(
    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(
  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
                                    Intrinsic::amdgcn_addrspacecast_nonnull));
                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);
  unsigned SrcAS = SrcTy.getAddressSpace();
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);
    return B.buildExtract(Dst, Src, 0).getReg(0);

    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);
    auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

    Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
                              SegmentNull.getReg(0));
    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
    auto PtrLo = B.buildPtrToInt(S32, Src);
    if (AddrHiVal == 0) {
      B.buildIntToPtr(Dst, Zext);
      auto HighAddr = B.buildConstant(S32, AddrHiVal);
      B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();
  MI.eraseFromParent();

  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
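// G_FREM is lowered as fma(-trunc(x / y), y, x).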
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
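// G_FPTOSI/G_FPTOUI to 64-bit integers (excerpted): the conversion is split
// into high and low 32-bit halves via trunc/floor/fma, then merged, with the
// sign re-applied through xor/sub for the signed case.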
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(
    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
                : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
    const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));
  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =
  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);
    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);
  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))
    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
  MI.eraseFromParent();
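// Global value lowering (excerpted): PC-relative or absolute address
// materialization, LDS globals, and GOT-indirect loads.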
                                   unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(DstReg))
      MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");
    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

        GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",
      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();
    MI.eraseFromParent();
    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);
    B.buildLoad(DstReg, GOTAddr, *GOTMMO);
  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  const unsigned ValSize = ValTy.getSizeInBits();

    if (WideMemSize == ValSize) {
      MI.setMemRefs(MF, {WideMMO});

    if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);
        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

    MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)
  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                       unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

  if (Ty == F16 && !ST.has16BitInsts()) {
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);
    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);

    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);
    B.buildCopy(Dst, R);

  MI.eraseFromParent();
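// Approximate G_FLOG/G_FLOG10 lowering: log2(x) from the hardware intrinsic,
// scaled by the inverted log2 of the target base.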
                                       unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                            const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());

    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())

  return B.buildFExp2(Dst, Src, Flags);

                                               bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);

    auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
    auto Mul = B.buildFMul(Ty, X, Const, Flags);

  LLT Ty = B.getMRI()->getType(Dst);

    auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

    auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
    auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
    auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

    auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

    auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                    .addUse(ExpInput.getReg(0))

    auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
    auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
    B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

                                              unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);

    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);

  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);

  const unsigned Flags = MI.getFlags();
  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();
    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
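// G_FFLOOR for f64 is lowered as x - fract(x), with a NaN-safe clamp of the
// fract result.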
    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();

                          bool UsePartialMad64_32,
                          bool SeparateOddAlignedProducts) const {
  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
        bool IsHighest = 2 * i >= Accum.size();
                .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

          Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);
  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
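// G_CTLZ/G_CTTZ map onto the AMDGPU FFBH/FFBL pseudo-instructions.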
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)
  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

    if (UseMI->getParent() != Parent ||
        UseMI->getOpcode() != AMDGPU::G_BRCOND)

    UncondBrTarget = &*NextMBB;
    if (Next->getOpcode() != AMDGPU::G_BR)

                                       *ArgRC, B.getDebugLoc(), ArgTy);
    const unsigned Mask = Arg->getMask();

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();

  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);

    auto One = B.buildConstant(S32, 1);
    auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
    auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                  B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

      B.buildCopy(DstReg, GlobalIdXYZ);
      MI.eraseFromParent();

      B.buildCopy(DstReg, ClusterIdXYZ);
      MI.eraseFromParent();

    unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
    MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_GETREG_B32_const)
        .addImm(ClusterIdField);
    auto Zero = B.buildConstant(S32, 0);
    B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
    MI.eraseFromParent();

  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);

  if (ST.hasArchitectedSGPRs() &&
    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;
      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;
    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();
    B.buildConstant(MI.getOperand(0).getReg(), C);
    MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {
  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                           Align Alignment) const {
         "unexpected kernarg parameter type");

  MI.eraseFromParent();

  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);
  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(

  auto Mul2 = B.buildFMul(
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
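// 64-bit unsigned division/remainder (excerpted): Newton-Raphson refinement of
// a 2^32-scaled reciprocal estimate, followed by two correction steps on the
// quotient and remainder.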
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    break;
  }
  }

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
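// Signed division/remainder is lowered onto the unsigned expansion: take
// absolute values with the (x + sign) ^ sign idiom, run the unsigned div/rem,
// then restore the signs. The quotient takes the XOR of the two operand
// signs; the remainder takes the sign of the dividend.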
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)
    return false;

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
    break;
  }
  }

  if (DstDivReg) {
    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);
  }

  if (DstRemReg) {
    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);
  }

  MI.eraseFromParent();
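// Fast, inaccurate FDIV lowering: a numerator of exactly +/-1.0 folds to a
// single (possibly negated) rcp, and the general case becomes x * rcp(y).
// The 64-bit variant further down refines rcp(y) with two Newton-Raphson FMA
// steps before the final multiply.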
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
    return false;

  if (CLHS->isExactlyValue(1.0)) {
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
    MI.eraseFromParent();

  if (CLHS->isExactlyValue(-1.0)) {
    auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
    B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
        .addUse(FNeg.getReg(0))
    MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();

  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
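// f16 division is performed in f32: extend both operands, take rcp of the
// denominator, refine the quotient with MAD or FMA steps, fold the remaining
// error back in (masked to its sign and exponent bits), truncate to f16, and
// finish with amdgcn.div.fixup.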
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);

  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
  } else {
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  }

  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
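// f32 division uses the div_scale / div_fmas / div_fixup sequence. When the
// function's mode does not already preserve f32 denormals, the denormal mode
// is temporarily enabled around the FMA chain, either with S_DENORM_MODE or,
// on older subtargets, by writing the MODE register via S_SETREG.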
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)

  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
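// f64 division: scale both operands with div_scale, start from rcp, apply
// three FMA refinement steps, and combine with div_fmas and div_fixup. On
// subtargets where div_scale's condition output is unusable, the scale bit is
// recomputed manually from the high halves of the scaled operands.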
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  if (!ST.hasUsableDivScaleConditionOutput()) {
    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

    auto CmpNum = B.buildICmp(CmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
                              Scale1Unmerge.getReg(1));
    auto CmpDen = B.buildICmp(CmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
                              Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
  } else {
    Scale = DivScale1.getReg(1);
  }

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
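// G_FFREXP is lowered to the amdgcn frexp_mant / frexp_exp intrinsics. On
// subtargets with the fract bug these instructions do not give the expected
// result for infinities and NaNs, so their outputs are only used when the
// input compares as finite.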
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);

    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
  }

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
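// amdgcn.fdiv.fast: if |y| is large (compared against 2^96), the denominator
// is pre-scaled by 2^-32 so that rcp stays in range; the quotient is then
// x * rcp(y') multiplied by the same scale factor to undo the adjustment.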
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
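// f16 sqrt on subtargets without 16-bit instructions: extend to f32, use the
// amdgcn sqrt intrinsic, and truncate the result back to f16.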
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
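// f32 sqrt: inputs small enough to lose precision are scaled up by 2^32
// before the square root and the result is scaled back down by 2^-16.
// Depending on the subtarget the estimate is then corrected by one ulp using
// the residuals of sqrt(x) and its two neighbouring values, or refined via
// rsq plus an FMA-based Newton-Raphson step; zero and infinity pass through
// unchanged.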
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
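// f64 sqrt: near-denormal inputs are scaled up by 2^256 via ldexp, the
// initial estimate comes from rsq, and a handful of FMA refinement steps
// tighten it before the result is scaled back down by 2^-128. Zero and
// infinity again pass through unchanged.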
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
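// Lane-crossing intrinsics (readlane, writelane, permlane*, set.inactive, DPP
// updates) only operate on 32-bit values, so wider or oddly sized types are
// handled below by extending to 32 bits or splitting the value into 32-bit
// (or, with DPALU DPP, 64-bit) pieces, applying the lane operation to each
// piece, and reassembling the result.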
5850 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5851 IID == Intrinsic::amdgcn_permlanex16;
5852 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
5853 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
5857 auto LaneOp =
B.buildIntrinsic(IID, {VT}).addUse(Src0);
5859 case Intrinsic::amdgcn_readfirstlane:
5860 case Intrinsic::amdgcn_permlane64:
5861 return LaneOp.getReg(0);
5862 case Intrinsic::amdgcn_readlane:
5863 case Intrinsic::amdgcn_set_inactive:
5864 case Intrinsic::amdgcn_set_inactive_chain_arg:
5865 return LaneOp.addUse(Src1).getReg(0);
5866 case Intrinsic::amdgcn_writelane:
5867 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5868 case Intrinsic::amdgcn_permlane16:
5869 case Intrinsic::amdgcn_permlanex16: {
5871 int64_t Src4 =
MI.getOperand(6).getImm();
5872 int64_t Src5 =
MI.getOperand(7).getImm();
5873 return LaneOp.addUse(Src1)
5880 case Intrinsic::amdgcn_mov_dpp8:
5881 return LaneOp.addImm(
MI.getOperand(3).getImm()).getReg(0);
5882 case Intrinsic::amdgcn_update_dpp:
5883 return LaneOp.addUse(Src1)
5884 .addImm(
MI.getOperand(4).getImm())
5885 .addImm(
MI.getOperand(5).getImm())
5886 .addImm(
MI.getOperand(6).getImm())
5887 .addImm(
MI.getOperand(7).getImm())
5897 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5898 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
5899 Src1 =
MI.getOperand(3).getReg();
5900 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
5901 Src2 =
MI.getOperand(4).getReg();
5905 LLT Ty =
MRI.getType(DstReg);
5906 unsigned Size = Ty.getSizeInBits();
5908 unsigned SplitSize = 32;
5909 if (IID == Intrinsic::amdgcn_update_dpp && (
Size % 64 == 0) &&
5910 ST.hasDPALU_DPP() &&
5914 if (
Size == SplitSize) {
5920 Src0 =
B.buildAnyExt(
S32, Src0).getReg(0);
5922 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5925 if (IID == Intrinsic::amdgcn_writelane)
5928 Register LaneOpDst = createLaneOp(Src0, Src1, Src2,
S32);
5929 B.buildTrunc(DstReg, LaneOpDst);
5930 MI.eraseFromParent();
5934 if (
Size % SplitSize != 0)
5938 bool NeedsBitcast =
false;
5939 if (Ty.isVector()) {
5942 if (EltSize == SplitSize) {
5943 PartialResTy = EltTy;
5944 }
else if (EltSize == 16 || EltSize == 32) {
5945 unsigned NElem = SplitSize / EltSize;
5949 NeedsBitcast =
true;
5954 unsigned NumParts =
Size / SplitSize;
5958 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5959 Src1Parts =
B.buildUnmerge(PartialResTy, Src1);
5961 if (IID == Intrinsic::amdgcn_writelane)
5962 Src2Parts =
B.buildUnmerge(PartialResTy, Src2);
5964 for (
unsigned i = 0; i < NumParts; ++i) {
5965 Src0 = Src0Parts.
getReg(i);
5967 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
5968 Src1 = Src1Parts.
getReg(i);
5970 if (IID == Intrinsic::amdgcn_writelane)
5971 Src2 = Src2Parts.
getReg(i);
5973 PartialRes.
push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5977 B.buildBitcast(DstReg,
B.buildMergeLikeInstr(
5980 B.buildMergeLikeInstr(DstReg, PartialRes);
5982 MI.eraseFromParent();
5990 ST.getTargetLowering()->getImplicitParameterOffset(
5992 LLT DstTy =
MRI.getType(DstReg);
5995 Register KernargPtrReg =
MRI.createGenericVirtualRegister(DstTy);
6000 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6001 B.buildConstant(IdxTy,
Offset).getReg(0));
6012 Register Pointer =
MI.getOperand(2).getReg();
6014 Register NumRecords =
MI.getOperand(4).getReg();
6020 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6022 auto ExtStride =
B.buildAnyExt(
S32, Stride);
6024 if (ST.has45BitNumRecordsBufferResource()) {
6029 auto PointerInt =
B.buildPtrToInt(PtrIntTy, Pointer);
6030 auto ExtPointer =
B.buildAnyExtOrTrunc(
S64, PointerInt);
6031 auto NumRecordsLHS =
B.buildShl(
S64, NumRecords,
B.buildConstant(
S32, 57));
6032 Register LowHalf =
B.buildOr(
S64, ExtPointer, NumRecordsLHS).getReg(0);
6036 auto NumRecordsRHS =
B.buildLShr(
S64, NumRecords,
B.buildConstant(
S32, 7));
6037 auto ShiftedStride =
B.buildShl(
S32, ExtStride,
B.buildConstant(
S32, 12));
6038 auto ExtShiftedStride =
6039 B.buildMergeValues(
S64, {Zero, ShiftedStride.getReg(0)});
6040 auto ShiftedFlags =
B.buildShl(
S32, Flags,
B.buildConstant(
S32, 28));
6041 auto ExtShiftedFlags =
6042 B.buildMergeValues(
S64, {Zero, ShiftedFlags.getReg(0)});
6043 auto CombinedFields =
B.buildOr(
S64, NumRecordsRHS, ExtShiftedStride);
6045 B.buildOr(
S64, CombinedFields, ExtShiftedFlags).getReg(0);
6046 B.buildMergeValues(Result, {LowHalf, HighHalf});
6048 NumRecords =
B.buildTrunc(
S32, NumRecords).getReg(0);
6049 auto Unmerge =
B.buildUnmerge(
S32, Pointer);
6050 auto LowHalf = Unmerge.getReg(0);
6051 auto HighHalf = Unmerge.getReg(1);
6053 auto AndMask =
B.buildConstant(
S32, 0x0000ffff);
6054 auto Masked =
B.buildAnd(
S32, HighHalf, AndMask);
6055 auto ShiftConst =
B.buildConstant(
S32, 16);
6056 auto ShiftedStride =
B.buildShl(
S32, ExtStride, ShiftConst);
6057 auto NewHighHalf =
B.buildOr(
S32,
Masked, ShiftedStride);
6058 Register NewHighHalfReg = NewHighHalf.getReg(0);
6059 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6062 MI.eraseFromParent();
6079 MI.eraseFromParent();
6087 std::optional<uint32_t> KnownSize =
6089 if (KnownSize.has_value())
6090 B.buildConstant(DstReg, *KnownSize);
6108 MI.eraseFromParent();
6115 unsigned AddrSpace)
const {
6117 auto Unmerge =
B.buildUnmerge(
S32,
MI.getOperand(2).getReg());
6121 ST.hasGloballyAddressableScratch()) {
6123 B.buildInstr(AMDGPU::S_MOV_B32, {
S32},
6124 {
Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6126 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6128 Register XOR =
B.buildXor(
S32, Hi32, FlatScratchBaseHi).getReg(0);
6130 B.buildConstant(
S32, 1u << 26));
6135 MI.eraseFromParent();
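// splitBufferOffsets: split a buffer offset into an immediate that fits the
// MUBUF encoding and a leftover register part. Any overflow beyond the
// maximum immediate is folded back into the base register (or becomes the
// base when there was none).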
6145std::pair<Register, unsigned>
6159 MRI, OrigOffset,
nullptr, CheckNUW);
6162 if (
MRI.getType(BaseReg).isPointer())
6163 BaseReg =
B.buildPtrToInt(
MRI.getType(OrigOffset), BaseReg).getReg(0);
6173 unsigned Overflow = ImmOffset & ~MaxImm;
6174 ImmOffset -= Overflow;
6175 if ((int32_t)Overflow < 0) {
6176 Overflow += ImmOffset;
6180 if (Overflow != 0) {
6182 BaseReg =
B.buildConstant(
S32, Overflow).getReg(0);
6184 auto OverflowVal =
B.buildConstant(
S32, Overflow);
6185 BaseReg =
B.buildAdd(
S32, BaseReg, OverflowVal).getReg(0);
6190 BaseReg =
B.buildConstant(
S32, 0).getReg(0);
6192 return std::pair(BaseReg, ImmOffset);
6199 bool ImageStore)
const {
6202 LLT StoreVT =
MRI.getType(Reg);
6205 if (ST.hasUnpackedD16VMem()) {
6206 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6209 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6210 WideRegs.
push_back(
B.buildAnyExt(
S32, Unmerge.getReg(
I)).getReg(0));
6218 if (ImageStore && ST.hasImageStoreD16Bug()) {
6221 Reg =
B.buildBitcast(
S32, Reg).getReg(0);
6223 PackedRegs.
resize(2,
B.buildUndef(
S32).getReg(0));
6230 auto Unmerge =
B.buildUnmerge(
S16, Reg);
6231 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6233 PackedRegs.
resize(6,
B.buildUndef(
S16).getReg(0));
6241 auto Unmerge =
B.buildUnmerge(
S32, Reg);
6242 for (
int I = 0, E = Unmerge->getNumOperands() - 1;
I != E; ++
I)
6244 PackedRegs.
resize(4,
B.buildUndef(
S32).getReg(0));
6261 bool IsFormat)
const {
6263 LLT Ty =
MRI->getType(VData);
6273 VData =
B.buildBitcast(Ty, VData).getReg(0);
6281 if (Ty.isVector()) {
6282 if (Ty.getElementType() ==
S16 && Ty.getNumElements() <= 4) {
6294 bool IsFormat)
const {
6299 LLT Ty =
MRI.getType(VData);
6301 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6316 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6319 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6323 VIndex =
MI.getOperand(3).getReg();
6326 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6329 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6330 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6334 Format =
MI.getOperand(5 + OpOffset).getImm();
6338 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6344 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6345 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6346 }
else if (IsFormat) {
6347 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6348 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6352 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6355 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6358 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6363 auto MIB =
B.buildInstr(
Opc)
6374 MIB.addImm(AuxiliaryData)
6375 .addImm(HasVIndex ? -1 : 0)
6376 .addMemOperand(MMO);
6378 MI.eraseFromParent();
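// Buffer load legalization mirrors the store path above: the intrinsic is
// rewritten to the matching G_AMDGPU_BUFFER_LOAD* pseudo. TFE variants load
// an extra status dword that is unmerged from the result, small scalars are
// loaded as 32 bits and truncated, and unpacked d16 results are re-packed
// into the requested vector type.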
6384 unsigned ImmOffset,
unsigned Format,
6387 auto MIB =
B.buildInstr(
Opc)
6398 MIB.addImm(AuxiliaryData)
6399 .addImm(HasVIndex ? -1 : 0)
6400 .addMemOperand(MMO);
6406 bool IsTyped)
const {
6420 assert(
MI.getNumExplicitDefs() == 1 ||
MI.getNumExplicitDefs() == 2);
6421 bool IsTFE =
MI.getNumExplicitDefs() == 2;
6423 StatusDst =
MI.getOperand(1).getReg();
6428 Register RSrc =
MI.getOperand(2 + OpOffset).getReg();
6431 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6434 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps + OpOffset;
6437 VIndex =
MI.getOperand(3 + OpOffset).getReg();
6440 VIndex =
B.buildConstant(
S32, 0).getReg(0);
6443 Register VOffset =
MI.getOperand(3 + OpOffset).getReg();
6444 Register SOffset =
MI.getOperand(4 + OpOffset).getReg();
6448 Format =
MI.getOperand(5 + OpOffset).getImm();
6452 unsigned AuxiliaryData =
MI.getOperand(5 + OpOffset).getImm();
6455 LLT Ty =
MRI.getType(Dst);
6462 Dst =
MI.getOperand(0).getReg();
6463 B.setInsertPt(
B.getMBB(),
MI);
6470 Dst =
MI.getOperand(0).getReg();
6471 B.setInsertPt(
B.getMBB(),
MI);
6475 const bool IsD16 = IsFormat && (EltTy.
getSizeInBits() == 16);
6476 const bool Unpacked = ST.hasUnpackedD16VMem();
6486 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6487 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6488 }
else if (IsFormat) {
6492 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6494 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6495 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6500 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6501 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6504 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6505 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6508 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6509 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6515 unsigned NumValueDWords =
divideCeil(Ty.getSizeInBits(), 32);
6516 unsigned NumLoadDWords = NumValueDWords + 1;
6518 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(LoadTy);
6520 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6522 Register ExtDst =
B.getMRI()->createGenericVirtualRegister(
S32);
6523 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6524 B.buildTrunc(Dst, ExtDst);
6525 }
else if (NumValueDWords == 1) {
6526 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6529 for (
unsigned I = 0;
I != NumValueDWords; ++
I)
6530 LoadElts.
push_back(
B.getMRI()->createGenericVirtualRegister(
S32));
6532 B.buildUnmerge(LoadElts, LoadDstReg);
6534 B.buildMergeLikeInstr(Dst, LoadElts);
6537 (IsD16 && !Ty.isVector())) {
6538 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(
S32);
6540 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6541 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6542 B.buildTrunc(Dst, LoadDstReg);
6543 }
else if (Unpacked && IsD16 && Ty.isVector()) {
6545 Register LoadDstReg =
B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6547 Format, AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6548 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
6550 auto Unmerge =
B.buildUnmerge(
S32, LoadDstReg);
6552 for (
unsigned I = 0,
N = Unmerge->getNumOperands() - 1;
I !=
N; ++
I)
6553 Repack.
push_back(
B.buildTrunc(EltTy, Unmerge.getReg(
I)).getReg(0));
6554 B.buildMergeLikeInstr(Dst, Repack);
6557 AuxiliaryData, MMO, IsTyped, HasVIndex,
B);
6560 MI.eraseFromParent();
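// Map each raw/struct buffer atomic intrinsic (pointer and non-pointer forms
// alike) to its generic G_AMDGPU_BUFFER_ATOMIC_* pseudo.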
6566 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6567 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6568 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6569 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6570 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6571 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6572 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6573 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6574 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6575 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6576 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6577 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6578 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6579 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6580 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6581 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6582 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6583 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6584 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6585 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6586 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6587 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6588 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6589 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6590 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6591 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6592 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6593 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6594 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6595 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6596 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6597 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6598 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6599 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6600 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6601 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6602 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6603 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6604 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6605 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6606 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6607 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6608 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6609 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6610 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6611 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6612 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6613 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6614 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6615 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6616 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6617 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6618 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6619 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6620 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6621 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6622 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6623 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6624 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6625 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6626 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6628 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6629 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6630 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6631 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6632 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6633 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6634 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6635 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6636 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6638 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6639 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6640 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6641 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6642 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6643 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6644 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6645 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6646 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6647 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6648 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6649 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6650 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6651 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6653 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6654 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6655 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6664 const bool IsCmpSwap =
6665 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6666 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6667 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6668 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6679 CmpVal =
MI.getOperand(3).getReg();
6684 Register RSrc =
MI.getOperand(3 + OpOffset).getReg();
6685 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6688 const bool HasVIndex =
MI.getNumOperands() == NumVIndexOps;
6691 VIndex =
MI.getOperand(4 + OpOffset).getReg();
6694 VIndex =
B.buildConstant(
LLT::scalar(32), 0).getReg(0);
6697 Register VOffset =
MI.getOperand(4 + OpOffset).getReg();
6698 Register SOffset =
MI.getOperand(5 + OpOffset).getReg();
6699 unsigned AuxiliaryData =
MI.getOperand(6 + OpOffset).getImm();
6718 .addImm(AuxiliaryData)
6719 .addImm(HasVIndex ? -1 : 0)
6720 .addMemOperand(MMO);
6722 MI.eraseFromParent();
6732 bool IsA16,
bool IsG16) {
6748 (
B.getMRI()->getType(AddrReg) ==
S16)) {
6753 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6757 "Bias needs to be converted to 16 bit in A16 mode");
6759 AddrReg =
B.buildBitcast(
V2S16, AddrReg).getReg(0);
6765 if (((
I + 1) >= EndIdx) ||
6772 !
MI.getOperand(ArgOffset +
I + 1).isReg()) {
6774 B.buildBuildVector(
V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
6779 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
6790 int DimIdx,
int NumVAddrs) {
6794 for (
int I = 0;
I != NumVAddrs; ++
I) {
6796 if (
SrcOp.isReg()) {
6802 int NumAddrRegs = AddrRegs.
size();
6803 if (NumAddrRegs != 1) {
6806 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
6809 for (
int I = 1;
I != NumVAddrs; ++
I) {
6812 MI.getOperand(DimIdx +
I).setReg(AMDGPU::NoRegister);
6834 const unsigned NumDefs =
MI.getNumExplicitDefs();
6835 const unsigned ArgOffset = NumDefs + 1;
6836 bool IsTFE = NumDefs == 2;
6854 VData =
MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
6855 Ty =
MRI->getType(VData);
6858 const bool IsAtomicPacked16Bit =
6859 (BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
6860 BaseOpcode->
BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
6868 ST.hasG16() ? (BaseOpcode->
Gradients && GradTy ==
S16) : GradTy ==
S16;
6869 const bool IsA16 = AddrTy ==
S16;
6870 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() ==
S16;
6873 if (!BaseOpcode->
Atomic) {
6874 DMask =
MI.getOperand(ArgOffset + Intr->
DMaskIndex).getImm();
6877 }
else if (DMask != 0) {
6879 }
else if (!IsTFE && !BaseOpcode->
Store) {
6881 B.buildUndef(
MI.getOperand(0));
6882 MI.eraseFromParent();
6890 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
6891 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
6892 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
6893 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
6894 unsigned NewOpcode = LoadOpcode;
6895 if (BaseOpcode->
Store)
6896 NewOpcode = StoreOpcode;
6898 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
6901 MI.setDesc(
B.getTII().get(NewOpcode));
6905 if (IsTFE && DMask == 0) {
6908 MI.getOperand(ArgOffset + Intr->
DMaskIndex).setImm(DMask);
6911 if (BaseOpcode->
Atomic) {
6913 LLT Ty =
MRI->getType(VData0);
6916 if (Ty.isVector() && !IsAtomicPacked16Bit)
6923 auto Concat =
B.buildBuildVector(PackedTy, {VData0, VData1});
6924 MI.getOperand(2).setReg(
Concat.getReg(0));
6925 MI.getOperand(3).setReg(AMDGPU::NoRegister);
6929 unsigned CorrectedNumVAddrs = Intr->
NumVAddrs;
6932 if (BaseOpcode->
Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
6938 if (IsA16 && !ST.hasA16()) {
6943 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->
Sampler);
6944 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
6946 if (IsA16 || IsG16) {
6954 const bool UseNSA = ST.hasNSAEncoding() &&
6955 PackedRegs.
size() >= ST.getNSAThreshold(MF) &&
6956 (PackedRegs.
size() <= NSAMaxSize || HasPartialNSA);
6957 const bool UsePartialNSA =
6958 UseNSA && HasPartialNSA && PackedRegs.
size() > NSAMaxSize;
6960 if (UsePartialNSA) {
6964 auto Concat =
B.buildConcatVectors(
6965 PackedAddrTy,
ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
6966 PackedRegs[NSAMaxSize - 1] =
Concat.getReg(0);
6967 PackedRegs.
resize(NSAMaxSize);
6968 }
else if (!UseNSA && PackedRegs.
size() > 1) {
6970 auto Concat =
B.buildConcatVectors(PackedAddrTy, PackedRegs);
6971 PackedRegs[0] =
Concat.getReg(0);
6975 const unsigned NumPacked = PackedRegs.
size();
6978 if (!
SrcOp.isReg()) {
6988 SrcOp.setReg(AMDGPU::NoRegister);
7005 const bool UseNSA = ST.hasNSAEncoding() &&
7006 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7007 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7008 const bool UsePartialNSA =
7009 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7011 if (UsePartialNSA) {
7013 ArgOffset + Intr->
VAddrStart + NSAMaxSize - 1,
7015 }
else if (!UseNSA && Intr->
NumVAddrs > 1) {
7030 if (!Ty.isVector() || !IsD16)
7034 if (RepackedReg != VData) {
7035 MI.getOperand(1).setReg(RepackedReg);
7043 const int NumElts = Ty.
isVector() ? Ty.getNumElements() : 1;
7046 if (NumElts < DMaskLanes)
7049 if (NumElts > 4 || DMaskLanes > 4)
7059 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7060 const LLT AdjustedTy =
7076 if (IsD16 && ST.hasUnpackedD16VMem()) {
7083 unsigned RoundedElts = (AdjustedTy.
getSizeInBits() + 31) / 32;
7084 unsigned RoundedSize = 32 * RoundedElts;
7088 RegTy = !IsTFE && EltSize == 16 ?
V2S16 :
S32;
7093 if (!IsTFE && (RoundedTy == Ty || !Ty.
isVector()))
7099 B.setInsertPt(*
MI.getParent(), ++
MI.getIterator());
7103 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7104 const int ResultNumRegs = LoadResultTy.
getSizeInBits() / 32;
7106 Register NewResultReg =
MRI->createGenericVirtualRegister(LoadResultTy);
7108 MI.getOperand(0).setReg(NewResultReg);
7116 Dst1Reg =
MI.getOperand(1).getReg();
7117 if (
MRI->getType(Dst1Reg) !=
S32)
7121 MI.removeOperand(1);
7125 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7134 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7136 if (ResultNumRegs == 1) {
7138 ResultRegs[0] = NewResultReg;
7141 for (
int I = 0;
I != NumDataRegs; ++
I)
7142 ResultRegs[
I] =
MRI->createGenericVirtualRegister(RegTy);
7143 B.buildUnmerge(ResultRegs, NewResultReg);
7148 ResultRegs.
resize(NumDataRegs);
7153 if (IsD16 && !Ty.isVector()) {
7154 B.buildTrunc(DstReg, ResultRegs[0]);
7159 if (Ty ==
V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7160 B.buildBitcast(DstReg, ResultRegs[0]);
7172 if (RegTy !=
V2S16 && !ST.hasUnpackedD16VMem()) {
7174 Reg =
B.buildBitcast(
V2S16, Reg).getReg(0);
7175 }
else if (ST.hasUnpackedD16VMem()) {
7177 Reg =
B.buildTrunc(
S16, Reg).getReg(0);
7181 auto padWithUndef = [&](
LLT Ty,
int NumElts) {
7184 Register Undef =
B.buildUndef(Ty).getReg(0);
7185 for (
int I = 0;
I != NumElts; ++
I)
7190 LLT ResTy =
MRI->getType(ResultRegs[0]);
7192 padWithUndef(ResTy, NumElts - ResultRegs.
size());
7193 B.buildBuildVector(DstReg, ResultRegs);
7197 assert(!ST.hasUnpackedD16VMem() && ResTy ==
V2S16);
7198 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7204 if (ResultRegs.
size() == 1) {
7205 NewResultReg = ResultRegs[0];
7206 }
else if (ResultRegs.
size() == 2) {
7208 NewResultReg =
B.buildConcatVectors(
V4S16, ResultRegs).getReg(0);
7214 if (
MRI->getType(DstReg).getNumElements() <
7215 MRI->getType(NewResultReg).getNumElements()) {
7216 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7218 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7223 padWithUndef(ResTy, RegsToCover - ResultRegs.
size());
7224 B.buildConcatVectors(DstReg, ResultRegs);
7233 Register OrigDst =
MI.getOperand(0).getReg();
7235 LLT Ty =
B.getMRI()->getType(OrigDst);
7236 unsigned Size = Ty.getSizeInBits();
7239 if (
Size < 32 && ST.hasScalarSubwordLoads()) {
7241 Opc =
Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7242 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7245 Dst =
B.getMRI()->createGenericVirtualRegister(
LLT::scalar(32));
7247 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7256 B.setInsertPt(
B.getMBB(),
MI);
7261 B.setInsertPt(
B.getMBB(),
MI);
7267 MI.setDesc(
B.getTII().get(
Opc));
7268 MI.removeOperand(1);
7271 const unsigned MemSize = (
Size + 7) / 8;
7272 const Align MemAlign =
B.getDataLayout().getABITypeAlign(
7279 MI.addMemOperand(MF, MMO);
7280 if (Dst != OrigDst) {
7281 MI.getOperand(0).setReg(Dst);
7282 B.setInsertPt(
B.getMBB(), ++
B.getInsertPt());
7283 B.buildTrunc(OrigDst, Dst);
7305 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7306 MI.removeOperand(0);
7316 if (!ST.isTrapHandlerEnabled() ||
7320 return ST.supportsGetDoorbellID() ?
7333 MI.eraseFromParent();
7343 BuildMI(*TrapBB, TrapBB->
end(),
DL,
B.getTII().get(AMDGPU::S_ENDPGM))
7345 BuildMI(BB, &
MI,
DL,
B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7349 MI.eraseFromParent();
7358 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7365 ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), Param);
7367 Register KernargPtrReg =
MRI.createGenericVirtualRegister(
7383 Register LoadAddr =
MRI.createGenericVirtualRegister(
7385 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7388 Register Temp =
B.buildLoad(
S64, LoadAddr, *MMO).getReg(0);
7389 B.buildCopy(SGPR01, Temp);
7390 B.buildInstr(AMDGPU::S_TRAP)
7393 MI.eraseFromParent();
7404 B.buildCopy(SGPR01, LiveIn);
7405 B.buildInstr(AMDGPU::S_TRAP)
7409 MI.eraseFromParent();
7418 if (ST.hasPrivEnabledTrap2NopBug()) {
7419 ST.getInstrInfo()->insertSimulatedTrap(
MRI,
B.getMBB(),
MI,
7421 MI.eraseFromParent();
7425 B.buildInstr(AMDGPU::S_TRAP)
7427 MI.eraseFromParent();
7436 if (!ST.isTrapHandlerEnabled() ||
7440 Fn,
"debugtrap handler not supported",
MI.getDebugLoc(),
DS_Warning));
7443 B.buildInstr(AMDGPU::S_TRAP)
7447 MI.eraseFromParent();
7460 Register NodePtr =
MI.getOperand(2).getReg();
7461 Register RayExtent =
MI.getOperand(3).getReg();
7462 Register RayOrigin =
MI.getOperand(4).getReg();
7464 Register RayInvDir =
MI.getOperand(6).getReg();
7467 if (!ST.hasGFX10_AEncoding()) {
7470 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7477 const bool IsA16 =
MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7478 const bool Is64 =
MRI.getType(NodePtr).getSizeInBits() == 64;
7479 const unsigned NumVDataDwords = 4;
7480 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7481 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7483 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7485 const unsigned BaseOpcodes[2][2] = {
7486 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7487 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7488 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7492 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7493 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7494 : AMDGPU::MIMGEncGfx10NSA,
7495 NumVDataDwords, NumVAddrDwords);
7499 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7500 : AMDGPU::MIMGEncGfx10Default,
7501 NumVDataDwords, NumVAddrDwords);
7506 if (UseNSA && IsGFX11Plus) {
7508 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7509 auto Merged =
B.buildMergeLikeInstr(
7510 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7511 Ops.push_back(Merged.getReg(0));
7514 Ops.push_back(NodePtr);
7515 Ops.push_back(RayExtent);
7516 packLanes(RayOrigin);
7519 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7520 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7521 auto MergedDir =
B.buildMergeLikeInstr(
7524 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(0),
7525 UnmergeRayDir.getReg(0)}))
7528 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(1),
7529 UnmergeRayDir.getReg(1)}))
7532 S32,
B.buildMergeLikeInstr(
V2S16, {UnmergeRayInvDir.getReg(2),
7533 UnmergeRayDir.getReg(2)}))
7535 Ops.push_back(MergedDir.getReg(0));
7538 packLanes(RayInvDir);
7542 auto Unmerge =
B.buildUnmerge({
S32,
S32}, NodePtr);
7543 Ops.push_back(Unmerge.getReg(0));
7544 Ops.push_back(Unmerge.getReg(1));
7546 Ops.push_back(NodePtr);
7548 Ops.push_back(RayExtent);
7551 auto Unmerge =
B.buildUnmerge({
S32,
S32,
S32}, Src);
7552 Ops.push_back(Unmerge.getReg(0));
7553 Ops.push_back(Unmerge.getReg(1));
7554 Ops.push_back(Unmerge.getReg(2));
7557 packLanes(RayOrigin);
7559 auto UnmergeRayDir =
B.buildUnmerge({
S16,
S16,
S16}, RayDir);
7560 auto UnmergeRayInvDir =
B.buildUnmerge({
S16,
S16,
S16}, RayInvDir);
7564 B.buildMergeLikeInstr(R1,
7565 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7566 B.buildMergeLikeInstr(
7567 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7568 B.buildMergeLikeInstr(
7569 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7575 packLanes(RayInvDir);
7582 Register MergedOps =
B.buildMergeLikeInstr(OpTy,
Ops).getReg(0);
7584 Ops.push_back(MergedOps);
7587 auto MIB =
B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7596 .addImm(IsA16 ? 1 : 0)
7599 MI.eraseFromParent();
7609 Register DstOrigin =
MI.getOperand(1).getReg();
7611 Register NodePtr =
MI.getOperand(4).getReg();
7612 Register RayExtent =
MI.getOperand(5).getReg();
7613 Register InstanceMask =
MI.getOperand(6).getReg();
7614 Register RayOrigin =
MI.getOperand(7).getReg();
7616 Register Offsets =
MI.getOperand(9).getReg();
7617 Register TDescr =
MI.getOperand(10).getReg();
7619 if (!ST.hasBVHDualAndBVH8Insts()) {
7622 Fn,
"intrinsic not supported on subtarget",
MI.getDebugLoc()));
7627 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7628 const unsigned NumVDataDwords = 10;
7629 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7631 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7632 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7633 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7636 auto RayExtentInstanceMaskVec =
B.buildMergeLikeInstr(
7637 V2S32, {RayExtent,
B.buildAnyExt(
S32, InstanceMask)});
7639 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7640 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7646 .addUse(RayExtentInstanceMaskVec.getReg(0))
7653 MI.eraseFromParent();
7662 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7663 MI.eraseFromParent();
7670 if (!ST.hasArchitectedSGPRs())
7674 auto TTMP8 =
B.buildCopy(
S32,
Register(AMDGPU::TTMP8));
7675 auto LSB =
B.buildConstant(
S32, 25);
7676 auto Width =
B.buildConstant(
S32, 5);
7677 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7678 MI.eraseFromParent();
7686 unsigned Width)
const {
7689 if (!
MRI.getRegClassOrNull(DstReg))
7690 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7691 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7694 MI.eraseFromParent();
7708 if (
MRI.getType(Src) !=
S64)
7712 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7716 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {
S32},
7719 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7720 MI.eraseFromParent();
7728 if (
MRI.getType(Src) !=
S64)
7731 auto Unmerge =
B.buildUnmerge({
S32,
S32},
MI.getOperand(0));
7735 .addReg(Unmerge.getReg(0));
7739 .addReg(Unmerge.getReg(1));
7740 MI.eraseFromParent();
7752 case Intrinsic::amdgcn_if:
7753 case Intrinsic::amdgcn_else: {
7756 bool Negated =
false;
7768 std::swap(CondBrTarget, UncondBrTarget);
7770 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7771 if (IntrID == Intrinsic::amdgcn_if) {
7772 B.buildInstr(AMDGPU::SI_IF)
7775 .addMBB(UncondBrTarget);
7777 B.buildInstr(AMDGPU::SI_ELSE)
7780 .addMBB(UncondBrTarget);
7789 B.buildBr(*CondBrTarget);
7792 MRI.setRegClass(Def,
TRI->getWaveMaskRegClass());
7793 MRI.setRegClass(
Use,
TRI->getWaveMaskRegClass());
7794 MI.eraseFromParent();
7795 BrCond->eraseFromParent();
7801 case Intrinsic::amdgcn_loop: {
7804 bool Negated =
false;
7814 std::swap(CondBrTarget, UncondBrTarget);
7816 B.setInsertPt(
B.getMBB(), BrCond->getIterator());
7817 B.buildInstr(AMDGPU::SI_LOOP)
7819 .addMBB(UncondBrTarget);
7824 B.buildBr(*CondBrTarget);
7826 MI.eraseFromParent();
7827 BrCond->eraseFromParent();
7828 MRI.setRegClass(Reg,
TRI->getWaveMaskRegClass());
7834 case Intrinsic::amdgcn_addrspacecast_nonnull:
7836 case Intrinsic::amdgcn_make_buffer_rsrc:
7838 case Intrinsic::amdgcn_kernarg_segment_ptr:
7841 B.buildConstant(
MI.getOperand(0).getReg(), 0);
7842 MI.eraseFromParent();
7848 case Intrinsic::amdgcn_implicitarg_ptr:
7850 case Intrinsic::amdgcn_workitem_id_x:
7853 case Intrinsic::amdgcn_workitem_id_y:
7856 case Intrinsic::amdgcn_workitem_id_z:
7859 case Intrinsic::amdgcn_workgroup_id_x:
7864 case Intrinsic::amdgcn_workgroup_id_y:
7869 case Intrinsic::amdgcn_workgroup_id_z:
7874 case Intrinsic::amdgcn_cluster_id_x:
7875 return ST.hasClusters() &&
7878 case Intrinsic::amdgcn_cluster_id_y:
7879 return ST.hasClusters() &&
7882 case Intrinsic::amdgcn_cluster_id_z:
7883 return ST.hasClusters() &&
7886 case Intrinsic::amdgcn_cluster_workgroup_id_x:
7887 return ST.hasClusters() &&
7890 case Intrinsic::amdgcn_cluster_workgroup_id_y:
7891 return ST.hasClusters() &&
7894 case Intrinsic::amdgcn_cluster_workgroup_id_z:
7895 return ST.hasClusters() &&
7898 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
7899 return ST.hasClusters() &&
7901 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
7902 return ST.hasClusters() &&
7905 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
7906 return ST.hasClusters() &&
7909 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
7910 return ST.hasClusters() &&
7913 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
7914 return ST.hasClusters() &&
7918 case Intrinsic::amdgcn_wave_id:
7920 case Intrinsic::amdgcn_lds_kernel_id:
7923 case Intrinsic::amdgcn_dispatch_ptr:
7926 case Intrinsic::amdgcn_queue_ptr:
7929 case Intrinsic::amdgcn_implicit_buffer_ptr:
7932 case Intrinsic::amdgcn_dispatch_id:
7935 case Intrinsic::r600_read_ngroups_x:
7939 case Intrinsic::r600_read_ngroups_y:
7942 case Intrinsic::r600_read_ngroups_z:
7945 case Intrinsic::r600_read_local_size_x:
7948 case Intrinsic::r600_read_local_size_y:
7952 case Intrinsic::r600_read_local_size_z:
7955 case Intrinsic::amdgcn_fdiv_fast:
7957 case Intrinsic::amdgcn_is_shared:
7959 case Intrinsic::amdgcn_is_private:
7961 case Intrinsic::amdgcn_wavefrontsize: {
7962 B.buildConstant(
MI.getOperand(0), ST.getWavefrontSize());
7963 MI.eraseFromParent();
7966 case Intrinsic::amdgcn_s_buffer_load:
7968 case Intrinsic::amdgcn_raw_buffer_store:
7969 case Intrinsic::amdgcn_raw_ptr_buffer_store:
7970 case Intrinsic::amdgcn_struct_buffer_store:
7971 case Intrinsic::amdgcn_struct_ptr_buffer_store:
7973 case Intrinsic::amdgcn_raw_buffer_store_format:
7974 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
7975 case Intrinsic::amdgcn_struct_buffer_store_format:
7976 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
7978 case Intrinsic::amdgcn_raw_tbuffer_store:
7979 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
7980 case Intrinsic::amdgcn_struct_tbuffer_store:
7981 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
7983 case Intrinsic::amdgcn_raw_buffer_load:
7984 case Intrinsic::amdgcn_raw_ptr_buffer_load:
7985 case Intrinsic::amdgcn_raw_atomic_buffer_load:
7986 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
7987 case Intrinsic::amdgcn_struct_buffer_load:
7988 case Intrinsic::amdgcn_struct_ptr_buffer_load:
7989 case Intrinsic::amdgcn_struct_atomic_buffer_load:
7990 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
7992 case Intrinsic::amdgcn_raw_buffer_load_format:
7993 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
7994 case Intrinsic::amdgcn_struct_buffer_load_format:
7995 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
7997 case Intrinsic::amdgcn_raw_tbuffer_load:
7998 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
7999 case Intrinsic::amdgcn_struct_tbuffer_load:
8000 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8002 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8003 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8004 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8005 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8006 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8007 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8008 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8009 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8010 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8011 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8012 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8013 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8014 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8015 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8016 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8017 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8018 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8019 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8020 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8021 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8022 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8024 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8025 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8026 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8027 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8028 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8029 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8030 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8031 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8032 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8033 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8034 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8036 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8037 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8038 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8039 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8040 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8042 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8043 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8044 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8045 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8046 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8047 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8048 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8049 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8050 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8051 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8052 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8053 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8054 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8055 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8056 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8057 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8058 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8059 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8060 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8061 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8062 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8063 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8064 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8065 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8066 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8067 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8068 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8069 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8070 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8071 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8072 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8075 case Intrinsic::amdgcn_rsq_clamp:
8077 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8079 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8080 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8082 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8083 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8084 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8085 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8086 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8087 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8088 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8089 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8092 if (
MRI.getType(Index) !=
S64)
8093 MI.getOperand(5).setReg(
B.buildAnyExt(
S64, Index).getReg(0));
8096 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8097 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8098 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8099 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8100 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8101 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8102 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8103 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8106 if (
MRI.getType(Index) !=
S32)
8107 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8110 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8111 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8112 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8113 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8114 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8115 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8116 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8117 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8118 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8120 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8123 if (
MRI.getType(Index) != IdxTy)
8124 MI.getOperand(7).setReg(
B.buildAnyExt(IdxTy, Index).getReg(0));
8128 case Intrinsic::amdgcn_fmed3: {
8134 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8135 MI.removeOperand(1);
8139 case Intrinsic::amdgcn_readlane:
8140 case Intrinsic::amdgcn_writelane:
8141 case Intrinsic::amdgcn_readfirstlane:
8142 case Intrinsic::amdgcn_permlane16:
8143 case Intrinsic::amdgcn_permlanex16:
8144 case Intrinsic::amdgcn_permlane64:
8145 case Intrinsic::amdgcn_set_inactive:
8146 case Intrinsic::amdgcn_set_inactive_chain_arg:
8147 case Intrinsic::amdgcn_mov_dpp8:
8148 case Intrinsic::amdgcn_update_dpp:
8150 case Intrinsic::amdgcn_s_buffer_prefetch_data:
  case Intrinsic::amdgcn_dead: {
    // The results are never used; replace them with undef before erasing.
    for (const MachineOperand &Def : MI.defs())
      B.buildUndef(Def);
    MI.eraseFromParent();
    return true;
  }
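  // The cooperative atomic load/store intrinsics already have the memory
  // operand attached by the IRTranslator, so they lower directly to plain
  // G_LOAD/G_STORE instructions that reuse that MMO.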
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;