#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  unsigned Bits = Ty.getSizeInBits();

    const LLT Ty = Query.Types[TypeIdx];

    return Ty.getNumElements() % 2 != 0 && EltSize > 1 && EltSize < 32 &&
           Ty.getSizeInBits() % 32 != 0;

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;

    const LLT Ty = Query.Types[TypeIdx];
    return std::pair(TypeIdx,

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();
    unsigned Pieces = (Size + 63) / 64;
    unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;

    const LLT Ty = Query.Types[TypeIdx];
    const int Size = Ty.getSizeInBits();
    const int NextMul32 = (Size + 31) / 32;
    const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
    return std::make_pair(TypeIdx, LLT::scalar(MemSize));

    const LLT Ty = Query.Types[TypeIdx];
    const unsigned EltSize = Ty.getElementType().getSizeInBits();

    assert(EltSize == 32 || EltSize == 64);

    for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {

    return std::pair(TypeIdx,

  const unsigned NumElems = Ty.getElementCount().getFixedValue();

  const unsigned Size = Ty.getSizeInBits();

    const LLT Ty = Query.Types[TypeIdx];

    const LLT Ty = Query.Types[TypeIdx];
    unsigned Size = Ty.getSizeInBits();

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];

    return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&

    return EltSize == 16 || EltSize % 32 == 0;

    const int EltSize = Ty.getElementType().getSizeInBits();
    return EltSize == 32 || EltSize == 64 ||
           (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
           EltSize == 128 || EltSize == 256;

    LLT Ty = Query.Types[TypeIdx];

    const LLT QueryTy = Query.Types[TypeIdx];
  if (Ty.isPointerOrPointerVector())
    Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));

         (ST.useRealTrue16Insts() && Ty == S16) ||

    const LLT Ty = Query.Types[TypeIdx];
    return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
           Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
                                 bool IsLoad, bool IsAtomic) {
    return ST.hasFlatScratchEnabled() ? 128 : 32;

    return ST.useDS128() ? 128 : 64;

    return IsLoad ? 512 : 128;

    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;

  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned RegSize = Ty.getSizeInBits();

  unsigned AS = Query.Types[1].getAddressSpace();

  if (Ty.isVector() && MemSize != RegSize)

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
        Align(AlignBits / 8)))
  const unsigned Size = Ty.getSizeInBits();
  if (Ty.isPointerVector())

  unsigned EltSize = Ty.getScalarSizeInBits();
  return EltSize != 32 && EltSize != 64;

  const unsigned Size = Ty.getSizeInBits();
  if (Size != MemSizeInBits)
    return Size <= 32 && Ty.isVector();

  return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&

                               uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

                                Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)
    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
      GlobalPtr, ConstantPtr, FlatPtr

  const std::initializer_list<LLT> AddrSpaces32 = {
      LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {

  const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
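
  // Integer arithmetic rules: the legal scalar and vector widths for
  // G_ADD/G_SUB/G_MUL and the carry/overflow variants depend on subtarget
  // features (VOP3P, add-no-carry, scalar 64-bit add/mul, 16-bit insts).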
  if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
    if (ST.hasScalarAddSub64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
    if (ST.hasScalarSMulU64()) {
          .clampMaxNumElementsStrict(0, S16, 2)
          .clampMaxNumElementsStrict(0, S16, 2)
        .minScalarOrElt(0, S16)
  } else if (ST.has16BitInsts()) {

      .widenScalarToNextMultipleOf(0, 32)

  if (ST.hasMad64_32())

  if (ST.hasIntClamp()) {

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

                  LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})

      .clampScalar(0, S16, S64);
      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts())
      FPOpActions.legalFor({S16});
    TrigActions.customFor({S16});
    FDIVActions.customFor({S16});

  if (ST.hasPackedFP32Ops()) {
    FPOpActions.legalFor({V2S32});
    FPOpActions.clampMaxNumElementsStrict(0, S32, 2);

  auto &MinNumMaxNumIeee =

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
    MinNumMaxNumIeee.legalFor(FPTypesBase)

      {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});

  if (ST.hasVOP3PInsts()) {
    MinNumMaxNum.customFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)
        .clampScalar(0, S16, S64)
  } else if (ST.has16BitInsts()) {
    MinNumMaxNum.customFor(FPTypes16)
        .clampScalar(0, S16, S64)
    MinNumMaxNum.customFor(FPTypesBase)
        .clampScalar(0, S32, S64)

  if (ST.hasVOP3PInsts())

      .legalFor(FPTypesPK16)

  if (ST.has16BitInsts()) {

  if (ST.hasFractBug()) {

  if (ST.hasCvtPkF16F32Inst()) {
        .clampMaxNumElements(0, S16, 2);
    FPTruncActions.scalarize(0).lower();

  if (ST.has16BitInsts()) {

  if (ST.hasMadF16() && ST.hasMadMacF32Insts())
    FMad.customFor({S32, S16});
  else if (ST.hasMadMacF32Insts())
    FMad.customFor({S32});
  else if (ST.hasMadF16())
    FMad.customFor({S16});

  if (ST.has16BitInsts()) {
    FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

  if (ST.has16BitInsts())

  if (ST.has16BitInsts())

      .clampScalar(0, S16, S64)

      .clampScalar(0, S16, S64)

  if (ST.has16BitInsts()) {
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)
      .scalarSameSizeAs(1, 0)

          {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
      .legalForCartesianProduct(
          {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())

  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});
    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  if (ST.has16BitInsts())

      .clampScalar(0, S32, S32)

  if (ST.has16BitInsts())
      .widenScalarToNextPow2(1)

      .lowerFor({S1, S16})
      .widenScalarToNextPow2(1)

      .clampScalar(0, S32, S32)

      .clampScalar(0, S32, S64)
  if (ST.has16BitInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElements(0, S16, 2)

  if (ST.hasIntMinMax64()) {
        .clampMaxNumElements(0, S16, 2)

        .clampMaxNumElements(0, S16, 2)

      .widenScalarToNextPow2(0)

      .legalForCartesianProduct(AddrSpaces32, {S32})

      .legalForCartesianProduct(AddrSpaces32, {S32})

  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {
    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

    if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
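
  // Load/store rules: the legal (value type, address space, memory type,
  // alignment) combinations are enumerated below; accesses that are too wide
  // or under-aligned for their address space are narrowed or split.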
  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (DstSize > MemSize)
          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);
        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {
            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;
            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);

        .widenScalarToNextPow2(0)

      .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                 {S32, GlobalPtr, S16, 2 * 8},
                                 {S32, LocalPtr, S8, 8},
                                 {S32, LocalPtr, S16, 16},
                                 {S32, PrivatePtr, S8, 8},
                                 {S32, PrivatePtr, S16, 16},
                                 {S32, ConstantPtr, S8, 8},
                                 {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
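
  // Atomic RMW operations: which (value type, address space) pairs are legal
  // depends on subtarget features; FP atomics in particular vary widely
  // between generations.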
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics32.legalFor({{S32, FlatPtr}});

  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {

  if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
      ST.hasAtomicBufferGlobalPkAddF16Insts())
    Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
  if (ST.hasAtomicGlobalPkAddBF16Inst())
    Atomic.legalFor({{V2BF16, GlobalPtr}});
  if (ST.hasAtomicFlatPkAdd16Insts())
    Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

  auto &AtomicFMinFMax =
          .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});

  if (ST.hasAtomicFMinFMaxF32GlobalInsts())
  if (ST.hasAtomicFMinFMaxF64GlobalInsts())
    AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
  if (ST.hasAtomicFMinFMaxF32FlatInsts())
  if (ST.hasAtomicFMinFMaxF64FlatInsts())

                 {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});

                                 LocalPtr, FlatPtr, PrivatePtr,

      .clampScalar(0, S16, S64)
  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);
      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];
          return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];
          const bool isLegalVecType =
          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
          return std::pair(VecTypeIdx,

        .clampScalar(EltTypeIdx, S32, S64)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];
      if (Ty.isVector()) {

          const LLT BigTy = Query.Types[BigTyIdx];

          return notValidElt(Query, LitTyIdx);

          return notValidElt(Query, BigTyIdx);

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(
            const LLT Ty = Query.Types[LitTyIdx];
            return Ty.getSizeInBits() < 32;

          const LLT Ty = Query.Types[BigTyIdx];
          return Ty.getSizeInBits() % 16 != 0;

          const LLT &Ty = Query.Types[BigTyIdx];
          unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
          if (NewSizeInBits >= 256) {
            unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;
          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

      .clampScalar(0, S32, S64);
  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});
    SextInReg.lowerFor({{S32}, {S64}});

  FSHRActionDefs.legalFor({{S32, S32}})
      .clampMaxNumElementsStrict(0, S16, 2);
  if (ST.hasVOP3PInsts())
    FSHRActionDefs.scalarize(0).lower();

  if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

      .clampScalar(1, S32, S32)

       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinimumMaximumInsts()) {
        .legalFor(FPTypesPK16)
  } else if (ST.hasVOP3PInsts()) {
        .clampMaxNumElementsStrict(0, S16, 2)

       G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
       G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

      {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
       G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
       G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
       G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})

  verify(*ST.getInstrInfo());
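
// legalizeCustom: dispatch the operations marked custom above to their
// dedicated lowering routines based on the generic opcode.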
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINIMUMNUM:
  case TargetOpcode::G_FMAXIMUMNUM:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_CTLZ_ZERO_UNDEF:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
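
// Compute the aperture (the high 32 bits of the segment base address) used
// for LOCAL/PRIVATE address space casts: read it from the aperture registers
// when available, otherwise load it from the implicit kernarg segment or the
// queue pointer.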
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !ST.hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");

    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildCopy({Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

    Register LoadAddr = MRI.createGenericVirtualRegister(

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildObjectPtrOffset(
      B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:
  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
          Intrinsic::amdgcn_addrspacecast_nonnull));

                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned SrcAS = SrcTy.getAddressSpace();

    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

  auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
        ST.hasGloballyAddressableScratch()) {
      Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
          B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                       {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
      MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
      return B.buildIntToPtr(Dst, Sub).getReg(0);

    return B.buildExtract(Dst, Src, 0).getReg(0);

    castFlatToLocalOrPrivate(Dst);
    MI.eraseFromParent();

  auto SegmentNull = B.buildConstant(DstTy, NullVal);
  auto FlatNull = B.buildConstant(SrcTy, 0);

  auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);

  B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
  MI.eraseFromParent();

  auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

        ST.hasGloballyAddressableScratch()) {
      ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
      if (ST.isWave64()) {
        ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
          B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
      Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
          B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
            B.buildInstr(AMDGPU::S_MOV_B64, {S64},
                         {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
        MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
        return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);

    return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);

    castLocalOrPrivateToFlat(Dst);
    MI.eraseFromParent();

  Register BuildPtr = castLocalOrPrivateToFlat(DstTy);

                           SegmentNull.getReg(0));
  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
  MI.eraseFromParent();

      SrcTy.getSizeInBits() == 64) {
    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

  uint32_t AddrHiVal = Info->get32BitAddressHighBits();
  auto PtrLo = B.buildPtrToInt(S32, Src);
  if (AddrHiVal == 0) {
    B.buildIntToPtr(Dst, Zext);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});

  MI.eraseFromParent();

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);
  assert(Ty.isScalar() && Ty.getSizeInBits() == 64);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();
  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));

  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);
  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
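
// 64-bit integer-to-FP conversion is lowered by converting the two 32-bit
// halves separately: convert the high half, scale it by 2^32 with ldexp, and
// add in the converted low half (with an extra normalization step for the
// 32-bit result case).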
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
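
// 64-bit FP-to-integer conversion: truncate, split the value into high and
// low 32-bit pieces using the K0/K1 scaling constants with fmul, ffloor and
// fma, convert each piece, and reassemble (negating via the saved sign for
// the signed case).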
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

    K0 = B.buildFConstant(
    K1 = B.buildFConstant(

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});
  MI.eraseFromParent();
  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

  if (ST.hasTrigReducedRange()) {
    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                                  unsigned GAFlags) const {
      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (ST.has64BitLiterals()) {
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
      B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  if (RequiresHighHalf && ST.has64BitLiterals()) {
    if (!MRI.getRegClassOrNull(DstReg))
      MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
    B.buildInstr(AMDGPU::S_MOV_B32)

    if (!MRI.getRegClassOrNull(AddrDst))
      MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

    B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

    if (AddrDst != DstReg)
      B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

  LLT Ty = MRI.getType(DstReg);
  unsigned AS = Ty.getAddressSpace();

        GV->getName() != "llvm.amdgcn.module.lds" &&
          Fn, "local memory global used by non-kernel function",

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
    B.buildIntToPtr(DstReg, Sz);
    MI.eraseFromParent();

    MI.eraseFromParent();

  if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

  if (Ty.getSizeInBits() == 32) {
    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

    if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

  const unsigned ValSize = ValTy.getSizeInBits();

    if (WideMemSize == ValSize) {
      MI.setMemRefs(MF, {WideMMO});

    if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

    MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  switch (DefMI->getOpcode()) {
  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:

  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_FFREXP: {
    if (DefMI->getOperand(0).getReg() == Src)

  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                                  unsigned Flags) const {
  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();
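
// Small helper used by the extended-precision log/exp expansions below: emit
// x * y + z as a separate fmul/fadd pair (a "mad" without requiring fused
// behavior).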
  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

  if (Ty == F16 && !ST.has16BitInsts()) {
    auto PromoteSrc = B.buildFPExt(F32, X);
    B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

  if (ST.hasFastFMAF32()) {
    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, NewFlags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
    R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);

    auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {
    auto Fabs = B.buildFAbs(Ty, Y);
    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
                                                    unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

    if (ST.hasFastFMAF32())
      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);

      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

          ? B.buildFLog2(Ty, Src, Flags)
          : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                   RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();
                      const SrcOp &Src, unsigned Flags) {
  LLT Ty = Dst.getLLTTy(*B.getMRI());

    return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
        .addUse(Src.getReg())

  return B.buildFExp2(Dst, Src, Flags);

                                            bool IsExp10) const {
  LLT Ty = B.getMRI()->getType(X);

  auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
  auto Mul = B.buildFMul(Ty, X, Const, Flags);

  LLT Ty = B.getMRI()->getType(Dst);

    auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

    auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
    auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
    auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

    auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

    auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                    .addUse(ExpInput.getReg(0))

    auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
    auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
    B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);

                                              unsigned Flags) const {
  LLT Ty = B.getMRI()->getType(Dst);

    auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
    auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

    auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
    auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
    auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
    auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
    B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
  auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);

  auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
  auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);

  auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
  auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
  auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
  auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);

  auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
  auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);

  B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);

  const unsigned Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);

    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();
  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

  if (ST.hasFastFMAF32()) {
    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))
  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();
    ModSrc = SrcFNeg->getOperand(1).getReg();
      ModSrc = SrcFAbs->getOperand(1).getReg();
    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();
         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);
    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
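
// Wide integer multiplication: the operands are split into 32-bit parts and
// the partial products are accumulated with chains of G_AMDGPU_MAD_U64_U32,
// threading the carry bits between the even- and odd-aligned product columns.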
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);
  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;
    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();
        CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
        for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
              B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

        if (LocalAccum.size() == 1 &&
            (!UsePartialMad64_32 || !CarryIn.empty())) {
            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
              LocalAccum[0] = Mul.getReg(0);
              if (CarryIn.empty()) {
                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

        if (j0 <= DstIndex) {
          bool HaveSmallAccum = false;

          if (LocalAccum[0]) {
            if (LocalAccum.size() == 1) {
              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;
            } else if (LocalAccum[1]) {
              Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
              HaveSmallAccum = false;
              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
              HaveSmallAccum = true;

            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
            HaveSmallAccum = true;

            unsigned j1 = DstIndex - j0;
            if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                    {Src0[j0], Src1[j1], Tmp});
            Tmp = Mad.getReg(0);
            if (!HaveSmallAccum)
              CarryOut.push_back(Mad.getReg(1));
            HaveSmallAccum = false;
          } while (j0 <= DstIndex);

          auto Unmerge = B.buildUnmerge(S32, Tmp);
          LocalAccum[0] = Unmerge.getReg(0);
          if (LocalAccum.size() > 1)
            LocalAccum[1] = Unmerge.getReg(1);

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        bool IsHighest = 2 * i >= Accum.size();
                .take_front(IsHighest ? 1 : 2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],

        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(ST.hasMad64_32());
  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned Size = Ty.getSizeInBits();
  if (ST.hasVectorMulU64() && Size == 64)

  unsigned NumParts = Size / 32;

  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  LLT SrcTy = MRI.getType(Src);
  TypeSize NumBits = SrcTy.getSizeInBits();

  auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
  auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
  auto Shift = B.buildShl(S32, Extend, ShiftAmt);
  auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
  B.buildTrunc(Dst, Ctlz);
  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (UseMI->getParent() != Parent ||
      UseMI->getOpcode() != AMDGPU::G_BRCOND)

    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)

                                      *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);
  if (!ST.hasClusters()) {
    MI.eraseFromParent();

  Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
  Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);

  auto One = B.buildConstant(S32, 1);
  auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
  auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
                                B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));

    B.buildCopy(DstReg, GlobalIdXYZ);
    MI.eraseFromParent();

    B.buildCopy(DstReg, ClusterIdXYZ);
    MI.eraseFromParent();

  unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
  MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
      .addImm(ClusterIdField);
  auto Zero = B.buildConstant(S32, 0);
  B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
  MI.eraseFromParent();

  auto LoadConstant = [&](unsigned N) {
    B.buildConstant(DstReg, N);

  if (ST.hasArchitectedSGPRs() &&

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[0] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[1] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    if (HasFixedDims && ClusterDims.getDims()[2] == 1)
      return LoadConstant(0);
    Arg = &ClusterWorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[0] - 1);
    Arg = &ClusterWorkGroupMaxIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[1] - 1);
    Arg = &ClusterWorkGroupMaxIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(ClusterDims.getDims()[2] - 1);
    Arg = &ClusterWorkGroupMaxIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &ClusterWorkGroupMaxFlatID;
    ArgRC = &AMDGPU::SReg_32RegClass;

      return LoadConstant(0);

    B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);

                                               Align Alignment) const {
         "unexpected kernarg parameter type");

  MI.eraseFromParent();
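
// 32-bit unsigned division/remainder is expanded with the usual GPU
// reciprocal sequence: convert the divisor to float, take RCP_IFLAG, scale
// and convert back, refine the estimate, then apply a final
// quotient/remainder correction step.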
  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
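
// 64-bit unsigned division first builds a 64-bit reciprocal estimate from the
// two 32-bit halves of the divisor (below), then refines it with
// multiply-high based correction rounds before the final quotient and
// remainder adjustments.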
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(

  auto Mul2 = B.buildFMul(
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {
  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();
  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();
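  // Signed division/remainder is reduced to the unsigned expansion above:
  // both operands are conditionally negated via the (x + sign) ^ sign trick
  // (sign = x >> (bitwidth - 1)), the unsigned result is computed into
  // temporary registers, and the quotient/remainder signs are fixed up
  // afterwards with the same xor/sub pattern.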
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
  Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();

  auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {
  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);
  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
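  // Fast, inaccurate fdiv lowering: when approximate-math flags (or an f16
  // result type) permit it, x / y is emitted as x * rcp(y) using the hardware
  // reciprocal intrinsic, with constant numerators of 1.0 and -1.0 folded
  // directly into rcp(y) and rcp(-y) respectively.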
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))

    if (CLHS->isExactlyValue(1.0)) {
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)

      MI.eraseFromParent();

    if (CLHS->isExactlyValue(-1.0)) {
      auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
      B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
          .addUse(FNeg.getReg(0))

      MI.eraseFromParent();

  if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  B.buildFMul(Res, LHS, RCP, Flags);

  MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
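  // f16 division is performed in f32: both operands are extended, an initial
  // quotient is formed with the f32 reciprocal, a few fmad/fma refinement
  // steps are applied (fmad on targets with v_mac/v_mad f32, fma otherwise),
  // the error term is masked down to its exponent bits, and the result is
  // truncated back to f16 and post-processed with the div_fixup intrinsic.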
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);
  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))
  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
  if (ST.hasMadMacF32Insts()) {
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);

    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
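  // Full-precision f32 fdiv uses the div_scale / div_fmas / div_fixup
  // sequence: both operands are pre-scaled, a reciprocal estimate is refined
  // with a chain of FMAs, and div_fixup undoes the scaling and handles the
  // special cases. Because the FMA chain relies on denormal intermediate
  // results, the FP32 denormal mode is temporarily enabled around it (via
  // S_DENORM_MODE or S_SETREG, as in the helper above) when the function's
  // floating-point mode does not already preserve denormals.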
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))
  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

  if (!ST.hasUsableDivScaleConditionOutput()) {

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
                                Scale1Unmerge.getReg(1));
                                Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  if (ST.hasFractBug()) {
    auto Fabs = B.buildFAbs(Ty, Val);
    auto Zero = B.buildConstant(InstrExpTy, 0);
    Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
    Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
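  // amdgcn.fdiv.fast: to keep rcp(|rhs|) from overflowing or flushing, the
  // denominator is pre-multiplied by either 2^-32 or 1.0 depending on its
  // magnitude (compared against 2^96), and the same scale factor is applied
  // back to the final product.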
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();
  assert(!ST.has16BitInsts());

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))
  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
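  // f32 sqrt: small inputs (below roughly 2^-96) are scaled up by 2^32 so the
  // core approximation works on a safely normalized value. The result is then
  // either refined from the hardware sqrt with a +/-1 ulp correction step, or
  // rebuilt from v_rsq with a short chain of fma iterations, scaled back down
  // by 2^-16, and finally patched for zero/infinity inputs.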
  const unsigned Flags = MI.getFlags();

    MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
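  // f64 sqrt follows the same shape: ldexp-scale very small inputs up (the
  // exponent adjustment of +256 corresponds to multiplying by 2^256), start
  // from v_rsq, run paired Newton-Raphson iterations that refine both the
  // root estimate (SqrtS*) and the half-reciprocal estimate (SqrtH*), ldexp
  // the result back down by 2^-128, and select the original input for
  // zero/infinity.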
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);

    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
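  // Lane intrinsics (readlane, writelane, permlane*, set_inactive,
  // update_dpp, mov_dpp8, ...) only operate on 32-bit values (or 64-bit for
  // DPP on DPALU-capable targets). Narrower operands are any-extended to 32
  // bits; wider operands are unmerged into split-size pieces, the lane op is
  // applied per piece via createLaneOp, and the partial results are merged
  // (and bitcast if needed) back into the destination type.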
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;

    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_permlane64:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_set_inactive:
    case Intrinsic::amdgcn_set_inactive_chain_arg:
      return LaneOp.addUse(Src1).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
    case Intrinsic::amdgcn_permlane16:
    case Intrinsic::amdgcn_permlanex16: {
      int64_t Src4 = MI.getOperand(6).getImm();
      int64_t Src5 = MI.getOperand(7).getImm();
      return LaneOp.addUse(Src1)
    case Intrinsic::amdgcn_mov_dpp8:
      return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
    case Intrinsic::amdgcn_update_dpp:
      return LaneOp.addUse(Src1)
          .addImm(MI.getOperand(4).getImm())
          .addImm(MI.getOperand(5).getImm())
          .addImm(MI.getOperand(6).getImm())
          .addImm(MI.getOperand(7).getImm())
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    Src1 = MI.getOperand(3).getReg();
    if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
      Src2 = MI.getOperand(4).getReg();

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
      ST.hasDPALU_DPP() &&

  if (Size == SplitSize) {

    Src0 = B.buildAnyExt(S32, Src0).getReg(0);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)

    if (IID == Intrinsic::amdgcn_writelane)

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
    B.buildTrunc(DstReg, LaneOpDst);
    MI.eraseFromParent();

  if (Size % SplitSize != 0)

  bool NeedsBitcast = false;
  if (Ty.isVector()) {
    if (EltSize == SplitSize) {
      PartialResTy = EltTy;
    } else if (EltSize == 16 || EltSize == 32) {
      unsigned NElem = SplitSize / EltSize;
      NeedsBitcast = true;

  unsigned NumParts = Size / SplitSize;

  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
    Src1Parts = B.buildUnmerge(PartialResTy, Src1);

  if (IID == Intrinsic::amdgcn_writelane)
    Src2Parts = B.buildUnmerge(PartialResTy, Src2);

  for (unsigned i = 0; i < NumParts; ++i) {
    Src0 = Src0Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
      Src1 = Src1Parts.getReg(i);

    if (IID == Intrinsic::amdgcn_writelane)
      Src2 = Src2Parts.getReg(i);

    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));

    B.buildBitcast(DstReg, B.buildMergeLikeInstr(
    B.buildMergeLikeInstr(DstReg, PartialRes);

  MI.eraseFromParent();
      ST.getTargetLowering()->getImplicitParameterOffset(
  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildObjectPtrOffset(DstReg, KernargPtrReg,
                         B.buildConstant(IdxTy, Offset).getReg(0));
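  // amdgcn.make.buffer.rsrc: build the 128-bit buffer descriptor from the
  // base pointer, stride, number of records, and flags. On targets with a
  // 45-bit NumRecords field, the fields are packed into two 64-bit halves
  // with shift/or arithmetic; otherwise the stride is inserted into the high
  // half of the pointer and NumRecords/flags occupy the remaining dwords.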
  Register Pointer = MI.getOperand(2).getReg();
  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());

  auto ExtStride = B.buildAnyExt(S32, Stride);

  if (ST.has45BitNumRecordsBufferResource()) {

    auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
    auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
    auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
    Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);

    auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
    auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
    auto ExtShiftedStride =
        B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
    auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
    auto ExtShiftedFlags =
        B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
    auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
        B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
    B.buildMergeValues(Result, {LowHalf, HighHalf});

    NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
    auto Unmerge = B.buildUnmerge(S32, Pointer);
    auto LowHalf = Unmerge.getReg(0);
    auto HighHalf = Unmerge.getReg(1);

    auto AndMask = B.buildConstant(S32, 0x0000ffff);
    auto Masked = B.buildAnd(S32, HighHalf, AndMask);
    auto ShiftConst = B.buildConstant(S32, 16);
    auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
    auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
    Register NewHighHalfReg = NewHighHalf.getReg(0);
    B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});

  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

                                     unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
      ST.hasGloballyAddressableScratch()) {
        B.buildInstr(AMDGPU::S_MOV_B32, {S32},
                     {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
    MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);

    Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
        B.buildConstant(S32, 1u << 26));

  MI.eraseFromParent();
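  // Buffer addressing splits a byte offset into a register part and an
  // immediate part that must fit the MUBUF/MTBUF offset field. Any overflow
  // beyond the maximum immediate is folded back into the register operand,
  // either by adding a constant to the existing base register or by
  // materializing a constant when no base register is present.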
std::pair<Register, unsigned>

  bool CheckNUW = ST.hasGFX1250Insts();
      MRI, OrigOffset, nullptr, CheckNUW);

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;

    if (Overflow != 0) {
        BaseReg = B.buildConstant(S32, Overflow).getReg(0);
        auto OverflowVal = B.buildConstant(S32, Overflow);
        BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
                                       bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

  if (ST.hasUnpackedD16VMem()) {
    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

  if (ImageStore && ST.hasImageStoreD16Bug()) {
      Reg = B.buildBitcast(S32, Reg).getReg(0);
      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
                                                  bool IsFormat) const {

  LLT Ty = MRI->getType(VData);

    VData = B.buildBitcast(Ty, VData).getReg(0);

  if (Ty.isVector()) {
    if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
                                             bool IsFormat) const {

  LLT Ty = MRI.getType(VData);
  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(3).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                            unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
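  // Buffer load legalization: the raw/struct buffer load intrinsics are
  // rewritten to the corresponding G_AMDGPU_BUFFER_LOAD* pseudo. TFE variants
  // return an extra status dword, so the load is built into a wider temporary
  // register and then unmerged into the data and status results; D16 results
  // on unpacked-D16 targets come back one value per dword and are truncated
  // and re-merged into the requested vector type.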
                                            bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
  bool IsTFE = MI.getNumExplicitDefs() == 2;
    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
    VIndex = MI.getOperand(3 + OpOffset).getReg();
    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);
    Dst = MI.getOperand(0).getReg();
    B.setInsertPt(B.getMBB(), MI);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
  const bool Unpacked = ST.hasUnpackedD16VMem();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {
      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
    unsigned NumLoadDWords = NumValueDWords + 1;
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
      Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
      B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
      B.buildTrunc(Dst, ExtDst);
    } else if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
      B.buildUnmerge(LoadElts, LoadDstReg);
      B.buildMergeLikeInstr(Dst, LoadElts);
             (IsD16 && !Ty.isVector())) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {
    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
    B.buildMergeLikeInstr(Dst, Repack);
                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
    VIndex = MI.getOperand(4 + OpOffset).getReg();
    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
                                      bool IsA16, bool IsG16) {

      (B.getMRI()->getType(AddrReg) == S16)) {
          B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
             "Bias needs to be converted to 16 bit in A16 mode");
        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {
            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
                V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})

                                       int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {
    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {
    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {
      MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
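  // Image intrinsic legalization: the dmask is normalized, atomic data
  // operands are packed, 16-bit addresses and gradients are packed into
  // dwords when A16/G16 apply, and the vaddr operands are either left
  // separate (NSA encoding, possibly partial) or concatenated into a single
  // vector register when NSA is unavailable or unprofitable. The return
  // value is then widened to dword granularity and repacked to the type the
  // intrinsic originally promised.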
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

    VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
    Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

      ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
  const bool IsA16 = AddrTy == S16;
  const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    } else if (DMask != 0) {
    } else if (!IsTFE && !BaseOpcode->Store) {
      B.buildUndef(MI.getOperand(0));
      MI.eraseFromParent();

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = LoadOpcode;
  if (BaseOpcode->Store)
    NewOpcode = StoreOpcode;
    NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {
    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);

  if (BaseOpcode->Atomic) {
    LLT Ty = MRI->getType(VData0);

    if (Ty.isVector() && !IsAtomicPacked16Bit)

      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {

  if (IsA16 && !ST.hasA16()) {

  const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
  const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();

  if (IsA16 || IsG16) {

    const bool UseNSA = ST.hasNSAEncoding() &&
                        PackedRegs.size() >= ST.getNSAThreshold(MF) &&
                        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {

      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {
      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();
      if (!SrcOp.isReg()) {
        SrcOp.setReg(AMDGPU::NoRegister);

    const bool UseNSA = ST.hasNSAEncoding() &&
                        CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
                        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {
                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
    } else if (!UseNSA && Intr->NumVAddrs > 1) {
  if (!Ty.isVector() || !IsD16)

  if (RepackedReg != VData) {
    MI.getOperand(1).setReg(RepackedReg);

  const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;

  if (NumElts < DMaskLanes)

  if (NumElts > 4 || DMaskLanes > 4)

  const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
  const LLT AdjustedTy =

  if (IsD16 && ST.hasUnpackedD16VMem()) {

  unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
  unsigned RoundedSize = 32 * RoundedElts;

    RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

  if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

  B.setInsertPt(*MI.getParent(), ++MI.getIterator());

  const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
  const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

  Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
  MI.getOperand(0).setReg(NewResultReg);

    Dst1Reg = MI.getOperand(1).getReg();
    if (MRI->getType(Dst1Reg) != S32)
    MI.removeOperand(1);
      B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

  const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

  if (ResultNumRegs == 1) {
    ResultRegs[0] = NewResultReg;
    for (int I = 0; I != NumDataRegs; ++I)
      ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
    B.buildUnmerge(ResultRegs, NewResultReg);
    ResultRegs.resize(NumDataRegs);

  if (IsD16 && !Ty.isVector()) {
    B.buildTrunc(DstReg, ResultRegs[0]);

  if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
    B.buildBitcast(DstReg, ResultRegs[0]);

  if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
      Reg = B.buildBitcast(V2S16, Reg).getReg(0);
  } else if (ST.hasUnpackedD16VMem()) {
      Reg = B.buildTrunc(S16, Reg).getReg(0);

  auto padWithUndef = [&](LLT Ty, int NumElts) {
    for (int I = 0; I != NumElts; ++I)

  LLT ResTy = MRI->getType(ResultRegs[0]);
    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);

  assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
  const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;

  if (ResultRegs.size() == 1) {
    NewResultReg = ResultRegs[0];
  } else if (ResultRegs.size() == 2) {
    NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);

  if (MRI->getType(DstReg).getNumElements() <
      MRI->getType(NewResultReg).getNumElements()) {
    B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
    B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);
  unsigned Size = Ty.getSizeInBits();

  if (Size < 32 && ST.hasScalarSubwordLoads()) {
    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

    B.setInsertPt(B.getMBB(), MI);
    B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign = B.getDataLayout().getABITypeAlign(

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);

  MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
  MI.removeOperand(0);
  if (!ST.hasTrapHandler() ||

  return ST.supportsGetDoorbellID() ?

  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

      ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(

  Register LoadAddr = MRI.createGenericVirtualRegister(
  B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,

  Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
  B.buildCopy(SGPR01, Temp);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  if (ST.hasPrivEnabledTrap2NopBug()) {
    ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
    MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)
  MI.eraseFromParent();

  if (!ST.hasTrapHandler() ||
        Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();
  Register RayInvDir = MI.getOperand(6).getReg();

  if (!ST.hasGFX10_AEncoding()) {
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

  const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
      IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);
        IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {
      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
      Ops.push_back(Merged.getReg(0));

    Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);
    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))
              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))
      Ops.push_back(MergedDir.getReg(0));

      packLanes(RayInvDir);

      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(NodePtr);
    Ops.push_back(RayExtent);

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      Ops.push_back(Unmerge.getReg(0));
      Ops.push_back(Unmerge.getReg(1));
      Ops.push_back(Unmerge.getReg(2));

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});

      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
    Ops.push_back(MergedOps);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
                 .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  Register DstOrigin = MI.getOperand(1).getReg();
  Register NodePtr = MI.getOperand(4).getReg();
  Register RayExtent = MI.getOperand(5).getReg();
  Register InstanceMask = MI.getOperand(6).getReg();
  Register RayOrigin = MI.getOperand(7).getReg();
  Register Offsets = MI.getOperand(9).getReg();
  Register TDescr = MI.getOperand(10).getReg();

  if (!ST.hasBVHDualAndBVH8Insts()) {
        Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));

      Intrinsic::amdgcn_image_bvh8_intersect_ray;
  const unsigned NumVDataDwords = 10;
  const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
      IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
             : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
      AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

  auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
      V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});

  B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
                      : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
      .addUse(RayExtentInstanceMaskVec.getReg(0))

  MI.eraseFromParent();
  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();

  if (!ST.hasArchitectedSGPRs())

  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

                                       unsigned Width) const {

  if (!MRI.getRegClassOrNull(DstReg))
    MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_GETREG_B32_const)
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
      B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
      .addReg(Unmerge.getReg(0));
      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
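  // The structurizer's control-flow intrinsics are matched together with the
  // conditional branch that consumes their result: amdgcn.if/else/loop are
  // rewritten into SI_IF / SI_ELSE / SI_LOOP pseudos branching to the
  // original targets, the exec-mask values are constrained to the wave mask
  // register class, and the now-dead G_BRCOND and intrinsic are erased.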
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;
      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      if (IntrID == Intrinsic::amdgcn_if) {
        B.buildInstr(AMDGPU::SI_IF)
            .addMBB(UncondBrTarget);
        B.buildInstr(AMDGPU::SI_ELSE)
            .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
      MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
      MI.eraseFromParent();
      BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;
      std::swap(CondBrTarget, UncondBrTarget);

      B.setInsertPt(B.getMBB(), BrCond->getIterator());
      B.buildInstr(AMDGPU::SI_LOOP)
          .addMBB(UncondBrTarget);

        B.buildBr(*CondBrTarget);

      MI.eraseFromParent();
      BrCond->eraseFromParent();
      MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:
  case Intrinsic::amdgcn_make_buffer_rsrc:
  case Intrinsic::amdgcn_kernarg_segment_ptr:
      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();

  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_cluster_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return ST.hasClusters() &&
  case Intrinsic::amdgcn_wave_id:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr:
  case Intrinsic::amdgcn_implicit_buffer_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::r600_read_ngroups_x:
  case Intrinsic::r600_read_ngroups_y:
  case Intrinsic::r600_read_ngroups_z:
  case Intrinsic::r600_read_local_size_x:
  case Intrinsic::r600_read_local_size_y:
  case Intrinsic::r600_read_local_size_z:
  case Intrinsic::amdgcn_fdiv_fast:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
  case Intrinsic::amdgcn_wavefrontsize: {
    B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
    MI.eraseFromParent();
  case Intrinsic::amdgcn_s_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (MRI.getType(Index) != S64)
      MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0));
8106 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8107 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8108 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8109 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8110 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8111 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8112 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8113 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8116 if (
MRI.getType(Index) !=
S32)
8117 MI.getOperand(5).setReg(
B.buildAnyExt(
S32, Index).getReg(0));
8120 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8121 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8122 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8123 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8124 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8125 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8126 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8127 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8128 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8130 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8133 if (
MRI.getType(Index) != IdxTy)
8134 MI.getOperand(7).setReg(
B.buildAnyExt(IdxTy, Index).getReg(0));
8138 case Intrinsic::amdgcn_fmed3: {
8144 MI.setDesc(
B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8145 MI.removeOperand(1);
8149 case Intrinsic::amdgcn_readlane:
8150 case Intrinsic::amdgcn_writelane:
8151 case Intrinsic::amdgcn_readfirstlane:
8152 case Intrinsic::amdgcn_permlane16:
8153 case Intrinsic::amdgcn_permlanex16:
8154 case Intrinsic::amdgcn_permlane64:
8155 case Intrinsic::amdgcn_set_inactive:
8156 case Intrinsic::amdgcn_set_inactive_chain_arg:
8157 case Intrinsic::amdgcn_mov_dpp8:
8158 case Intrinsic::amdgcn_update_dpp:
8160 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8162 case Intrinsic::amdgcn_dead: {
8166 MI.eraseFromParent();
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;
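  // Sketch of the lowering above (names are illustrative): the intrinsic
  // collapses to a plain G_LOAD that reuses the original MachineMemOperand,
  // i.e. roughly
  //
  //   %val = <cooperative.atomic.load intrinsic> %ptr
  // becomes
  //   %val = G_LOAD %ptr :: (the original MachineMemOperand)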
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
    assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
    B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
    MI.eraseFromParent();
    return true;